{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 470.0,
      "completions/mean_length": 276.6875,
      "completions/mean_terminated_length": 184.60870361328125,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.0008,
      "grad_norm": 6.258511543273926,
      "kl": 0.0003216266632080078,
      "learning_rate": 0.0,
      "loss": -0.1947,
      "num_tokens": 15006.0,
      "reward": 0.03416593745350838,
      "reward_std": 0.019731489941477776,
      "rewards/bleu_reward_func/mean": 0.03416593745350838,
      "rewards/bleu_reward_func/std": 0.030305052176117897,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 424.0,
      "completions/mean_length": 313.625,
      "completions/mean_terminated_length": 159.3333282470703,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.0016,
      "grad_norm": 4.826038360595703,
      "kl": 0.0002695322036743164,
      "learning_rate": 1.5873015873015872e-08,
      "loss": -0.1342,
      "num_tokens": 28018.0,
      "reward": 0.0247221440076828,
      "reward_std": 0.01764773763716221,
      "rewards/bleu_reward_func/mean": 0.0247221440076828,
      "rewards/bleu_reward_func/std": 0.03452335670590401,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 382.78125,
      "completions/mean_terminated_length": 268.76470947265625,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "epoch": 0.0024,
      "grad_norm": 2.2084195613861084,
      "kl": 0.0002834796905517578,
      "learning_rate": 3.1746031746031744e-08,
      "loss": 0.0891,
      "num_tokens": 43059.0,
      "reward": 0.022482896223664284,
      "reward_std": 0.009540551342070103,
      "rewards/bleu_reward_func/mean": 0.022482896223664284,
      "rewards/bleu_reward_func/std": 0.014530075713992119,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 182.59375,
      "completions/mean_terminated_length": 148.51724243164062,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.0032,
      "grad_norm": 3.871168613433838,
      "kl": 0.00033736228942871094,
      "learning_rate": 4.7619047619047613e-08,
      "loss": -0.1353,
      "num_tokens": 50982.0,
      "reward": 0.04917050898075104,
      "reward_std": 0.023263752460479736,
      "rewards/bleu_reward_func/mean": 0.04917050898075104,
      "rewards/bleu_reward_func/std": 0.036324601620435715,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 466.0,
      "completions/mean_length": 235.53125,
      "completions/mean_terminated_length": 206.9310302734375,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.004,
      "grad_norm": 3.321488380432129,
      "kl": 0.00027179718017578125,
      "learning_rate": 6.349206349206349e-08,
      "loss": -0.1569,
      "num_tokens": 62191.0,
      "reward": 0.03882071375846863,
      "reward_std": 0.02520540915429592,
      "rewards/bleu_reward_func/mean": 0.03882071375846863,
      "rewards/bleu_reward_func/std": 0.034420739859342575,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 274.5,
      "completions/mean_terminated_length": 219.69232177734375,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.0048,
      "grad_norm": 2.86037015914917,
      "kl": 0.0002570152282714844,
      "learning_rate": 7.936507936507936e-08,
      "loss": 0.0153,
      "num_tokens": 73551.0,
      "reward": 0.03558861464262009,
      "reward_std": 0.019850196316838264,
      "rewards/bleu_reward_func/mean": 0.03558861464262009,
      "rewards/bleu_reward_func/std": 0.02189255878329277,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 372.0,
      "completions/mean_length": 276.9375,
      "completions/mean_terminated_length": 184.95652770996094,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.0056,
      "grad_norm": 2.263695478439331,
      "kl": 0.0002677440643310547,
      "learning_rate": 9.523809523809523e-08,
      "loss": 0.11,
      "num_tokens": 85237.0,
      "reward": 0.017801083624362946,
      "reward_std": 0.008379511535167694,
      "rewards/bleu_reward_func/mean": 0.017801083624362946,
      "rewards/bleu_reward_func/std": 0.015220904722809792,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 399.6875,
      "completions/mean_terminated_length": 300.5882263183594,
      "completions/min_length": 89.0,
      "completions/min_terminated_length": 89.0,
      "epoch": 0.0064,
      "grad_norm": 2.1360983848571777,
      "kl": 0.00027871131896972656,
      "learning_rate": 1.111111111111111e-07,
      "loss": -0.0919,
      "num_tokens": 100099.0,
      "reward": 0.04303022474050522,
      "reward_std": 0.020876675844192505,
      "rewards/bleu_reward_func/mean": 0.04303022474050522,
      "rewards/bleu_reward_func/std": 0.04522383213043213,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 359.96875,
      "completions/mean_terminated_length": 317.3999938964844,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "epoch": 0.0072,
      "grad_norm": 2.249929904937744,
      "kl": 0.00033664703369140625,
      "learning_rate": 1.2698412698412698e-07,
      "loss": 0.001,
      "num_tokens": 116226.0,
      "reward": 0.03631145507097244,
      "reward_std": 0.019055547192692757,
      "rewards/bleu_reward_func/mean": 0.03631145507097244,
      "rewards/bleu_reward_func/std": 0.024671798571944237,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 402.65625,
      "completions/mean_terminated_length": 293.3125,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "epoch": 0.008,
      "grad_norm": 2.0875115394592285,
      "kl": 0.00031948089599609375,
      "learning_rate": 1.4285714285714285e-07,
      "loss": -0.1079,
      "num_tokens": 132767.0,
      "reward": 0.03503231331706047,
      "reward_std": 0.015032317489385605,
      "rewards/bleu_reward_func/mean": 0.03503231331706047,
      "rewards/bleu_reward_func/std": 0.053626786917448044,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 434.0,
      "completions/mean_length": 427.8125,
      "completions/mean_terminated_length": 212.6666717529297,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.0088,
      "grad_norm": 2.3419556617736816,
      "kl": 0.00034618377685546875,
      "learning_rate": 1.5873015873015872e-07,
      "loss": -0.1937,
      "num_tokens": 150273.0,
      "reward": 0.023869339376688004,
      "reward_std": 0.010186510160565376,
      "rewards/bleu_reward_func/mean": 0.023869339376688004,
      "rewards/bleu_reward_func/std": 0.024761516600847244,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 180.3125,
      "completions/mean_terminated_length": 146.0,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.0096,
      "grad_norm": 4.572720527648926,
      "kl": 0.00017344951629638672,
      "learning_rate": 1.7460317460317458e-07,
      "loss": -0.1778,
      "num_tokens": 158243.0,
      "reward": 0.06084510311484337,
      "reward_std": 0.031551554799079895,
      "rewards/bleu_reward_func/mean": 0.06084510311484337,
      "rewards/bleu_reward_func/std": 0.04978843033313751,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 504.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 244.0625,
      "completions/mean_terminated_length": 244.0625,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.0104,
      "grad_norm": 3.3186678886413574,
      "kl": 0.00041675567626953125,
      "learning_rate": 1.9047619047619045e-07,
      "loss": 0.2379,
      "num_tokens": 168149.0,
      "reward": 0.021276462823152542,
      "reward_std": 0.00621379679068923,
      "rewards/bleu_reward_func/mean": 0.021276462823152542,
      "rewards/bleu_reward_func/std": 0.008626200258731842,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 332.5,
      "completions/mean_terminated_length": 272.66668701171875,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "epoch": 0.0112,
      "grad_norm": 2.2437744140625,
      "kl": 0.00031495094299316406,
      "learning_rate": 2.0634920634920632e-07,
      "loss": 0.09,
      "num_tokens": 180917.0,
      "reward": 0.022824838757514954,
      "reward_std": 0.014015388675034046,
      "rewards/bleu_reward_func/mean": 0.022824838757514954,
      "rewards/bleu_reward_func/std": 0.018382087349891663,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 274.90625,
      "completions/mean_terminated_length": 208.51998901367188,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.012,
      "grad_norm": 2.874119997024536,
      "kl": 0.00016677379608154297,
      "learning_rate": 2.222222222222222e-07,
      "loss": -0.0669,
      "num_tokens": 194290.0,
      "reward": 0.13837847113609314,
      "reward_std": 0.10791029036045074,
      "rewards/bleu_reward_func/mean": 0.13837847113609314,
      "rewards/bleu_reward_func/std": 0.16033971309661865,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 465.0,
      "completions/max_terminated_length": 465.0,
      "completions/mean_length": 179.8125,
      "completions/mean_terminated_length": 179.8125,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.0128,
      "grad_norm": 15.92119312286377,
      "kl": 0.00028777122497558594,
      "learning_rate": 2.3809523809523806e-07,
      "loss": 0.0326,
      "num_tokens": 205636.0,
      "reward": 0.050816405564546585,
      "reward_std": 0.02174009010195732,
      "rewards/bleu_reward_func/mean": 0.050816405564546585,
      "rewards/bleu_reward_func/std": 0.03758488595485687,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 257.25,
      "completions/mean_terminated_length": 210.07408142089844,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.0136,
      "grad_norm": 3.077911615371704,
      "kl": 0.00032806396484375,
      "learning_rate": 2.5396825396825396e-07,
      "loss": -0.079,
      "num_tokens": 216124.0,
      "reward": 0.03544948250055313,
      "reward_std": 0.023141874000430107,
      "rewards/bleu_reward_func/mean": 0.03544948250055313,
      "rewards/bleu_reward_func/std": 0.02584005706012249,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 304.25,
      "completions/mean_terminated_length": 179.60000610351562,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.0144,
      "grad_norm": 9.573840141296387,
      "kl": 0.00026106834411621094,
      "learning_rate": 2.698412698412698e-07,
      "loss": 0.1542,
      "num_tokens": 229060.0,
      "reward": 0.09650908410549164,
      "reward_std": 0.06372307240962982,
      "rewards/bleu_reward_func/mean": 0.09650908410549164,
      "rewards/bleu_reward_func/std": 0.13649234175682068,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 471.0,
      "completions/mean_length": 212.3125,
      "completions/mean_terminated_length": 202.64515686035156,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.0152,
      "grad_norm": 3.6799280643463135,
      "kl": 0.00029969215393066406,
      "learning_rate": 2.857142857142857e-07,
      "loss": -0.258,
      "num_tokens": 238062.0,
      "reward": 0.03407663479447365,
      "reward_std": 0.018440743908286095,
      "rewards/bleu_reward_func/mean": 0.03407663479447365,
      "rewards/bleu_reward_func/std": 0.020070552825927734,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 498.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 278.59375,
      "completions/mean_terminated_length": 278.59375,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.016,
      "grad_norm": 2.5249006748199463,
      "kl": 0.0003578662872314453,
      "learning_rate": 3.0158730158730156e-07,
      "loss": -0.2122,
      "num_tokens": 248801.0,
      "reward": 0.04736609756946564,
      "reward_std": 0.03644244372844696,
      "rewards/bleu_reward_func/mean": 0.04736609756946564,
      "rewards/bleu_reward_func/std": 0.058418869972229004,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 467.0,
      "completions/mean_length": 254.4375,
      "completions/mean_terminated_length": 227.79310607910156,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.0168,
      "grad_norm": 3.26560640335083,
      "kl": 0.000385284423828125,
      "learning_rate": 3.1746031746031743e-07,
      "loss": -0.0564,
      "num_tokens": 259415.0,
      "reward": 0.03675752133131027,
      "reward_std": 0.012391982600092888,
      "rewards/bleu_reward_func/mean": 0.03675752133131027,
      "rewards/bleu_reward_func/std": 0.01883433386683464,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 432.0,
      "completions/mean_length": 263.5625,
      "completions/mean_terminated_length": 150.63636779785156,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.0176,
      "grad_norm": 3.0202367305755615,
      "kl": 0.00021600723266601562,
      "learning_rate": 3.333333333333333e-07,
      "loss": 0.0991,
      "num_tokens": 270649.0,
      "reward": 0.046073149889707565,
      "reward_std": 0.035358842462301254,
      "rewards/bleu_reward_func/mean": 0.046073149889707565,
      "rewards/bleu_reward_func/std": 0.0658104419708252,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 376.90625,
      "completions/mean_terminated_length": 331.875,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.0184,
      "grad_norm": 2.1112754344940186,
      "kl": 0.00023984909057617188,
      "learning_rate": 3.4920634920634917e-07,
      "loss": 0.0604,
      "num_tokens": 285446.0,
      "reward": 0.06469863653182983,
      "reward_std": 0.052234675735235214,
      "rewards/bleu_reward_func/mean": 0.06469863653182983,
      "rewards/bleu_reward_func/std": 0.1052434891462326,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 262.875,
      "completions/mean_terminated_length": 227.2857208251953,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.0192,
      "grad_norm": 5.028998374938965,
      "kl": 0.00035119056701660156,
      "learning_rate": 3.6507936507936504e-07,
      "loss": -0.0787,
      "num_tokens": 297450.0,
      "reward": 0.06576113402843475,
      "reward_std": 0.02428753674030304,
      "rewards/bleu_reward_func/mean": 0.06576113402843475,
      "rewards/bleu_reward_func/std": 0.06561829149723053,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 251.375,
      "completions/mean_terminated_length": 214.1428680419922,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.02,
      "grad_norm": 4.391946315765381,
      "kl": 0.0003197193145751953,
      "learning_rate": 3.809523809523809e-07,
      "loss": -0.1142,
      "num_tokens": 309646.0,
      "reward": 0.03192237764596939,
      "reward_std": 0.018320664763450623,
      "rewards/bleu_reward_func/mean": 0.03192237764596939,
      "rewards/bleu_reward_func/std": 0.020625513046979904,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 322.0,
      "completions/mean_length": 199.3125,
      "completions/mean_terminated_length": 76.9565200805664,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.0208,
      "grad_norm": 4.005726337432861,
      "kl": 0.0002747774124145508,
      "learning_rate": 3.968253968253968e-07,
      "loss": -0.0165,
      "num_tokens": 319608.0,
      "reward": 0.04598322883248329,
      "reward_std": 0.03894542530179024,
      "rewards/bleu_reward_func/mean": 0.04598322883248329,
      "rewards/bleu_reward_func/std": 0.0656987875699997,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 445.1875,
      "completions/mean_terminated_length": 378.375,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 0.0216,
      "grad_norm": 1.7314307689666748,
      "kl": 0.00025200843811035156,
      "learning_rate": 4.1269841269841265e-07,
      "loss": 0.0145,
      "num_tokens": 339982.0,
      "reward": 0.03940076753497124,
      "reward_std": 0.01753806695342064,
      "rewards/bleu_reward_func/mean": 0.03940076753497124,
      "rewards/bleu_reward_func/std": 0.024869710206985474,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 380.0,
      "completions/mean_length": 176.78125,
      "completions/mean_terminated_length": 165.96774291992188,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.0224,
      "grad_norm": 3.5480945110321045,
      "kl": 0.0003466606140136719,
      "learning_rate": 4.285714285714285e-07,
      "loss": 0.3461,
      "num_tokens": 350399.0,
      "reward": 0.032109640538692474,
      "reward_std": 0.01587669923901558,
      "rewards/bleu_reward_func/mean": 0.032109640538692474,
      "rewards/bleu_reward_func/std": 0.021649450063705444,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 422.0,
      "completions/mean_length": 286.28125,
      "completions/mean_terminated_length": 131.84210205078125,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.0232,
      "grad_norm": 3.5640323162078857,
      "kl": 0.00038433074951171875,
      "learning_rate": 4.444444444444444e-07,
      "loss": -0.3725,
      "num_tokens": 366616.0,
      "reward": 0.032555509358644485,
      "reward_std": 0.02120809443295002,
      "rewards/bleu_reward_func/mean": 0.032555509358644485,
      "rewards/bleu_reward_func/std": 0.0240026768296957,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 470.0,
      "completions/mean_length": 316.5625,
      "completions/mean_terminated_length": 251.4166717529297,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.024,
      "grad_norm": 2.3427391052246094,
      "kl": 0.0003643035888671875,
      "learning_rate": 4.6031746031746025e-07,
      "loss": 0.1076,
      "num_tokens": 379194.0,
      "reward": 0.05133620649576187,
      "reward_std": 0.03176493942737579,
      "rewards/bleu_reward_func/mean": 0.05133620649576187,
      "rewards/bleu_reward_func/std": 0.05137130245566368,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 459.0,
      "completions/mean_length": 317.5625,
      "completions/mean_terminated_length": 166.3333282470703,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.0248,
      "grad_norm": 2.8795974254608154,
      "kl": 0.00021916627883911133,
      "learning_rate": 4.761904761904761e-07,
      "loss": -0.0397,
      "num_tokens": 393332.0,
      "reward": 0.02336902543902397,
      "reward_std": 0.016461046412587166,
      "rewards/bleu_reward_func/mean": 0.02336902543902397,
      "rewards/bleu_reward_func/std": 0.01688769832253456,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 273.5625,
      "completions/mean_terminated_length": 180.26087951660156,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.0256,
      "grad_norm": 4.866830348968506,
      "kl": 0.0003495216369628906,
      "learning_rate": 4.92063492063492e-07,
      "loss": -0.1723,
      "num_tokens": 406286.0,
      "reward": 0.0500735342502594,
      "reward_std": 0.03149079158902168,
      "rewards/bleu_reward_func/mean": 0.0500735342502594,
      "rewards/bleu_reward_func/std": 0.042752303183078766,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 428.0,
      "completions/mean_length": 199.90625,
      "completions/mean_terminated_length": 155.32144165039062,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.0264,
      "grad_norm": 3.734315872192383,
      "kl": 0.00031948089599609375,
      "learning_rate": 5.079365079365079e-07,
      "loss": 0.1723,
      "num_tokens": 417019.0,
      "reward": 0.04127680882811546,
      "reward_std": 0.026887936517596245,
      "rewards/bleu_reward_func/mean": 0.04127680882811546,
      "rewards/bleu_reward_func/std": 0.028591442853212357,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 238.8125,
      "completions/mean_terminated_length": 162.3199920654297,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.0272,
      "grad_norm": 3.5805702209472656,
      "kl": 0.0003371238708496094,
      "learning_rate": 5.238095238095238e-07,
      "loss": -0.1221,
      "num_tokens": 430517.0,
      "reward": 0.038475487381219864,
      "reward_std": 0.023987405002117157,
      "rewards/bleu_reward_func/mean": 0.038475487381219864,
      "rewards/bleu_reward_func/std": 0.03382611274719238,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 320.15625,
      "completions/mean_terminated_length": 245.0869598388672,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.028,
      "grad_norm": 2.8223023414611816,
      "kl": 0.0003685951232910156,
      "learning_rate": 5.396825396825396e-07,
      "loss": -0.1863,
      "num_tokens": 444386.0,
      "reward": 0.044519804418087006,
      "reward_std": 0.020455416291952133,
      "rewards/bleu_reward_func/mean": 0.044519804418087006,
      "rewards/bleu_reward_func/std": 0.02401871234178543,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 444.0,
      "completions/mean_length": 235.21875,
      "completions/mean_terminated_length": 109.40909576416016,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.0288,
      "grad_norm": 4.932171821594238,
      "kl": 0.0004143714904785156,
      "learning_rate": 5.555555555555555e-07,
      "loss": -0.0821,
      "num_tokens": 454729.0,
      "reward": 0.051868241280317307,
      "reward_std": 0.03919130563735962,
      "rewards/bleu_reward_func/mean": 0.051868241280317307,
      "rewards/bleu_reward_func/std": 0.04572274535894394,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 240.1875,
      "completions/mean_terminated_length": 231.41934204101562,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.0296,
      "grad_norm": 2.3933703899383545,
      "kl": 0.0003342628479003906,
      "learning_rate": 5.714285714285714e-07,
      "loss": 0.0184,
      "num_tokens": 464367.0,
      "reward": 0.033591024577617645,
      "reward_std": 0.012664815410971642,
      "rewards/bleu_reward_func/mean": 0.033591024577617645,
      "rewards/bleu_reward_func/std": 0.01647285185754299,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 239.0,
      "completions/mean_length": 186.21875,
      "completions/mean_terminated_length": 77.625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.0304,
      "grad_norm": 4.726412296295166,
      "kl": 0.00037977099418640137,
      "learning_rate": 5.873015873015873e-07,
      "loss": -0.1188,
      "num_tokens": 475974.0,
      "reward": 0.11781854927539825,
      "reward_std": 0.07036956399679184,
      "rewards/bleu_reward_func/mean": 0.11781854927539825,
      "rewards/bleu_reward_func/std": 0.17497578263282776,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 433.0,
      "completions/mean_length": 325.34375,
      "completions/mean_terminated_length": 263.125,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "epoch": 0.0312,
      "grad_norm": 2.215810537338257,
      "kl": 0.0004220008850097656,
      "learning_rate": 6.031746031746031e-07,
      "loss": -0.0677,
      "num_tokens": 488921.0,
      "reward": 0.02974233217537403,
      "reward_std": 0.0150698097422719,
      "rewards/bleu_reward_func/mean": 0.02974233217537403,
      "rewards/bleu_reward_func/std": 0.016928359866142273,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 419.0,
      "completions/mean_length": 209.84375,
      "completions/mean_terminated_length": 140.11538696289062,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.032,
      "grad_norm": 7.524960041046143,
      "kl": 0.0004963874816894531,
      "learning_rate": 6.19047619047619e-07,
      "loss": -0.3064,
      "num_tokens": 499980.0,
      "reward": 0.03384634852409363,
      "reward_std": 0.025826433673501015,
      "rewards/bleu_reward_func/mean": 0.03384634852409363,
      "rewards/bleu_reward_func/std": 0.026973972097039223,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 367.0625,
      "completions/mean_terminated_length": 318.75,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.0328,
      "grad_norm": 2.123147487640381,
      "kl": 0.0004987716674804688,
      "learning_rate": 6.349206349206349e-07,
      "loss": -0.032,
      "num_tokens": 513846.0,
      "reward": 0.02649177610874176,
      "reward_std": 0.01194241177290678,
      "rewards/bleu_reward_func/mean": 0.02649177610874176,
      "rewards/bleu_reward_func/std": 0.01315221842378378,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 382.0,
      "completions/max_terminated_length": 382.0,
      "completions/mean_length": 177.59375,
      "completions/mean_terminated_length": 177.59375,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.0336,
      "grad_norm": 2.9975976943969727,
      "kl": 0.0005216598510742188,
      "learning_rate": 6.507936507936507e-07,
      "loss": -0.0346,
      "num_tokens": 524625.0,
      "reward": 0.07642016559839249,
      "reward_std": 0.062030211091041565,
      "rewards/bleu_reward_func/mean": 0.07642016559839249,
      "rewards/bleu_reward_func/std": 0.08820059895515442,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 226.03125,
      "completions/mean_terminated_length": 130.70834350585938,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.0344,
      "grad_norm": 4.617770671844482,
      "kl": 0.0005788803100585938,
      "learning_rate": 6.666666666666666e-07,
      "loss": -0.1183,
      "num_tokens": 534186.0,
      "reward": 0.022412922233343124,
      "reward_std": 0.028917275369167328,
      "rewards/bleu_reward_func/mean": 0.022412922233343124,
      "rewards/bleu_reward_func/std": 0.030789699405431747,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 472.0,
      "completions/mean_length": 333.625,
      "completions/mean_terminated_length": 283.67999267578125,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "epoch": 0.0352,
      "grad_norm": 2.284592390060425,
      "kl": 0.0004913806915283203,
      "learning_rate": 6.825396825396826e-07,
      "loss": -0.0931,
      "num_tokens": 547598.0,
      "reward": 0.04416097328066826,
      "reward_std": 0.02134130708873272,
      "rewards/bleu_reward_func/mean": 0.04416097328066826,
      "rewards/bleu_reward_func/std": 0.02967960573732853,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 352.3125,
      "completions/mean_terminated_length": 228.11111450195312,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.036,
      "grad_norm": 1.9085440635681152,
      "kl": 0.0005497932434082031,
      "learning_rate": 6.984126984126983e-07,
      "loss": -0.1154,
      "num_tokens": 562848.0,
      "reward": 0.07549206912517548,
      "reward_std": 0.030871842056512833,
      "rewards/bleu_reward_func/mean": 0.07549206912517548,
      "rewards/bleu_reward_func/std": 0.07412120699882507,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 380.875,
      "completions/mean_terminated_length": 302.20001220703125,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "epoch": 0.0368,
      "grad_norm": 2.130894660949707,
      "kl": 0.0005145072937011719,
      "learning_rate": 7.142857142857143e-07,
      "loss": -0.0617,
      "num_tokens": 578060.0,
      "reward": 0.12002657353878021,
      "reward_std": 0.08936386555433273,
      "rewards/bleu_reward_func/mean": 0.12002657353878021,
      "rewards/bleu_reward_func/std": 0.16516655683517456,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 119.21875,
      "completions/mean_terminated_length": 78.5862045288086,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.0376,
      "grad_norm": 5.716607093811035,
      "kl": 0.000782012939453125,
      "learning_rate": 7.301587301587301e-07,
      "loss": 0.0529,
      "num_tokens": 588707.0,
      "reward": 0.0403611958026886,
      "reward_std": 0.03214065358042717,
      "rewards/bleu_reward_func/mean": 0.0403611958026886,
      "rewards/bleu_reward_func/std": 0.053722139447927475,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 254.125,
      "completions/mean_terminated_length": 206.37037658691406,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.0384,
      "grad_norm": 3.3116095066070557,
      "kl": 0.0006084442138671875,
      "learning_rate": 7.46031746031746e-07,
      "loss": -0.177,
      "num_tokens": 598639.0,
      "reward": 0.04110237956047058,
      "reward_std": 0.030773304402828217,
      "rewards/bleu_reward_func/mean": 0.04110237956047058,
      "rewards/bleu_reward_func/std": 0.0406816266477108,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 338.34375,
      "completions/mean_terminated_length": 234.15000915527344,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "epoch": 0.0392,
      "grad_norm": 2.5459675788879395,
      "kl": 0.0008707046508789062,
      "learning_rate": 7.619047619047618e-07,
      "loss": 0.1096,
      "num_tokens": 611754.0,
      "reward": 0.039899833500385284,
      "reward_std": 0.016289234161376953,
      "rewards/bleu_reward_func/mean": 0.039899833500385284,
      "rewards/bleu_reward_func/std": 0.03525659814476967,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 410.0,
      "completions/mean_length": 265.8125,
      "completions/mean_terminated_length": 230.6428680419922,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.04,
      "grad_norm": 2.73764967918396,
      "kl": 0.0009059906005859375,
      "learning_rate": 7.777777777777778e-07,
      "loss": 0.151,
      "num_tokens": 622884.0,
      "reward": 0.04385654628276825,
      "reward_std": 0.02457226999104023,
      "rewards/bleu_reward_func/mean": 0.04385654628276825,
      "rewards/bleu_reward_func/std": 0.03391377627849579,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 324.09375,
      "completions/mean_terminated_length": 211.35000610351562,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.0408,
      "grad_norm": 3.26839017868042,
      "kl": 0.0012226104736328125,
      "learning_rate": 7.936507936507936e-07,
      "loss": 0.0831,
      "num_tokens": 635439.0,
      "reward": 0.052643824368715286,
      "reward_std": 0.03537372499704361,
      "rewards/bleu_reward_func/mean": 0.052643824368715286,
      "rewards/bleu_reward_func/std": 0.06656704843044281,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 286.875,
      "completions/mean_terminated_length": 223.83999633789062,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.0416,
      "grad_norm": 4.038628578186035,
      "kl": 0.00116729736328125,
      "learning_rate": 8.095238095238095e-07,
      "loss": -0.0229,
      "num_tokens": 648083.0,
      "reward": 0.05057225376367569,
      "reward_std": 0.05185646191239357,
      "rewards/bleu_reward_func/mean": 0.05057225376367569,
      "rewards/bleu_reward_func/std": 0.07916928082704544,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 394.0,
      "completions/mean_length": 121.09375,
      "completions/mean_terminated_length": 108.48387145996094,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0424,
      "grad_norm": 11.901754379272461,
      "kl": 0.0016069412231445312,
      "learning_rate": 8.253968253968253e-07,
      "loss": -0.193,
      "num_tokens": 654558.0,
      "reward": 0.02811940386891365,
      "reward_std": 0.017252802848815918,
      "rewards/bleu_reward_func/mean": 0.02811940386891365,
      "rewards/bleu_reward_func/std": 0.020083896815776825,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 229.625,
      "completions/mean_terminated_length": 177.3333282470703,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "epoch": 0.0432,
      "grad_norm": 4.099213600158691,
      "kl": 0.0010061264038085938,
      "learning_rate": 8.412698412698413e-07,
      "loss": 0.1683,
      "num_tokens": 666394.0,
      "reward": 0.020335812121629715,
      "reward_std": 0.008468281477689743,
      "rewards/bleu_reward_func/mean": 0.020335812121629715,
      "rewards/bleu_reward_func/std": 0.017663516104221344,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 414.0,
      "completions/mean_length": 182.59375,
      "completions/mean_terminated_length": 148.51724243164062,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.044,
      "grad_norm": 5.222848892211914,
      "kl": 0.00174713134765625,
      "learning_rate": 8.57142857142857e-07,
      "loss": -0.0074,
      "num_tokens": 675413.0,
      "reward": 0.08503767848014832,
      "reward_std": 0.06149422377347946,
      "rewards/bleu_reward_func/mean": 0.08503767848014832,
      "rewards/bleu_reward_func/std": 0.06967518478631973,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 311.8125,
      "completions/mean_terminated_length": 135.1764678955078,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.0448,
      "grad_norm": 3.82554030418396,
      "kl": 0.001544952392578125,
      "learning_rate": 8.73015873015873e-07,
      "loss": 0.1187,
      "num_tokens": 688055.0,
      "reward": 0.021008048206567764,
      "reward_std": 0.006345948204398155,
      "rewards/bleu_reward_func/mean": 0.021008048206567764,
      "rewards/bleu_reward_func/std": 0.015550477430224419,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 445.0,
      "completions/mean_length": 384.90625,
      "completions/mean_terminated_length": 272.76470947265625,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "epoch": 0.0456,
      "grad_norm": 2.4369232654571533,
      "kl": 0.001430511474609375,
      "learning_rate": 8.888888888888888e-07,
      "loss": -0.1333,
      "num_tokens": 704084.0,
      "reward": 0.03840646147727966,
      "reward_std": 0.01636688783764839,
      "rewards/bleu_reward_func/mean": 0.03840646147727966,
      "rewards/bleu_reward_func/std": 0.020463639870285988,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 299.5625,
      "completions/mean_terminated_length": 203.0,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.0464,
      "grad_norm": 2.234159231185913,
      "kl": 0.0017108917236328125,
      "learning_rate": 9.047619047619047e-07,
      "loss": -0.0062,
      "num_tokens": 716110.0,
      "reward": 0.03707782179117203,
      "reward_std": 0.03189729154109955,
      "rewards/bleu_reward_func/mean": 0.03707782179117203,
      "rewards/bleu_reward_func/std": 0.03396334871649742,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 310.125,
      "completions/mean_terminated_length": 242.83334350585938,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.0472,
      "grad_norm": 2.620929002761841,
      "kl": 0.0015125274658203125,
      "learning_rate": 9.206349206349205e-07,
      "loss": -0.1756,
      "num_tokens": 728050.0,
      "reward": 0.04129425063729286,
      "reward_std": 0.02301635593175888,
      "rewards/bleu_reward_func/mean": 0.04129425063729286,
      "rewards/bleu_reward_func/std": 0.043909139931201935,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 345.90625,
      "completions/mean_terminated_length": 290.54168701171875,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.048,
      "grad_norm": 2.1176917552948,
      "kl": 0.0021114349365234375,
      "learning_rate": 9.365079365079365e-07,
      "loss": 0.1678,
      "num_tokens": 741791.0,
      "reward": 0.023899374529719353,
      "reward_std": 0.011497007682919502,
      "rewards/bleu_reward_func/mean": 0.023899374529719353,
      "rewards/bleu_reward_func/std": 0.015334444120526314,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 316.90625,
      "completions/mean_terminated_length": 280.77777099609375,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.0488,
      "grad_norm": 2.236198902130127,
      "kl": 0.0020008087158203125,
      "learning_rate": 9.523809523809522e-07,
      "loss": 0.1585,
      "num_tokens": 754820.0,
      "reward": 0.03658726438879967,
      "reward_std": 0.014603394083678722,
      "rewards/bleu_reward_func/mean": 0.03658726438879967,
      "rewards/bleu_reward_func/std": 0.0265581663697958,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 260.8125,
      "completions/mean_terminated_length": 214.29629516601562,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.0496,
      "grad_norm": 3.063876152038574,
      "kl": 0.0018711090087890625,
      "learning_rate": 9.682539682539682e-07,
      "loss": 0.3903,
      "num_tokens": 765190.0,
      "reward": 0.03036242537200451,
      "reward_std": 0.02141759917140007,
      "rewards/bleu_reward_func/mean": 0.03036242537200451,
      "rewards/bleu_reward_func/std": 0.027455288916826248,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 368.375,
      "completions/mean_terminated_length": 293.1428527832031,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "epoch": 0.0504,
      "grad_norm": 2.7348315715789795,
      "kl": 0.0015745162963867188,
      "learning_rate": 9.84126984126984e-07,
      "loss": -0.0601,
      "num_tokens": 780034.0,
      "reward": 0.04812411963939667,
      "reward_std": 0.022147245705127716,
      "rewards/bleu_reward_func/mean": 0.04812411963939667,
      "rewards/bleu_reward_func/std": 0.04721507802605629,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 339.25,
      "completions/mean_terminated_length": 204.88888549804688,
      "completions/min_length": 44.0,
      "completions/min_terminated_length": 44.0,
      "epoch": 0.0512,
      "grad_norm": 2.7344889640808105,
      "kl": 0.0023365020751953125,
      "learning_rate": 1e-06,
      "loss": 0.1724,
      "num_tokens": 793210.0,
      "reward": 0.02877359464764595,
      "reward_std": 0.010511023923754692,
      "rewards/bleu_reward_func/mean": 0.02877359464764595,
      "rewards/bleu_reward_func/std": 0.011428051628172398,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 318.15625,
      "completions/mean_terminated_length": 290.46429443359375,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "epoch": 0.052,
      "grad_norm": 2.4814369678497314,
      "kl": 0.001934051513671875,
      "learning_rate": 1e-06,
      "loss": -0.0405,
      "num_tokens": 806207.0,
      "reward": 0.05107945576310158,
      "reward_std": 0.03433047980070114,
      "rewards/bleu_reward_func/mean": 0.05107945576310158,
      "rewards/bleu_reward_func/std": 0.05961597338318825,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 369.0,
      "completions/mean_length": 321.09375,
      "completions/mean_terminated_length": 190.4736785888672,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.0528,
      "grad_norm": 2.776719808578491,
      "kl": 0.0016841888427734375,
      "learning_rate": 1e-06,
      "loss": 0.0672,
      "num_tokens": 818946.0,
      "reward": 0.026194388046860695,
      "reward_std": 0.029626624658703804,
      "rewards/bleu_reward_func/mean": 0.026194388046860695,
      "rewards/bleu_reward_func/std": 0.047487590461969376,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 416.84375,
      "completions/mean_terminated_length": 309.0000305175781,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "epoch": 0.0536,
      "grad_norm": 2.1129209995269775,
      "kl": 0.0015621185302734375,
      "learning_rate": 1e-06,
      "loss": -0.0238,
      "num_tokens": 836741.0,
      "reward": 0.02843387797474861,
      "reward_std": 0.010509947314858437,
      "rewards/bleu_reward_func/mean": 0.02843387797474861,
      "rewards/bleu_reward_func/std": 0.018165679648518562,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 442.0625,
      "completions/mean_terminated_length": 325.5,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.0544,
      "grad_norm": 1.8852394819259644,
      "kl": 0.0014791488647460938,
      "learning_rate": 1e-06,
      "loss": -0.086,
      "num_tokens": 853783.0,
      "reward": 0.028096213936805725,
      "reward_std": 0.008382029831409454,
      "rewards/bleu_reward_func/mean": 0.028096213936805725,
      "rewards/bleu_reward_func/std": 0.01885552704334259,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 318.125,
      "completions/mean_terminated_length": 290.4285888671875,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "epoch": 0.0552,
      "grad_norm": 2.9050652980804443,
      "kl": 0.0017156600952148438,
      "learning_rate": 1e-06,
      "loss": -0.0435,
      "num_tokens": 865659.0,
      "reward": 0.03590589016675949,
      "reward_std": 0.017201866954565048,
      "rewards/bleu_reward_func/mean": 0.03590589016675949,
      "rewards/bleu_reward_func/std": 0.033825989812612534,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 300.0,
      "completions/mean_terminated_length": 269.71429443359375,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.056,
      "grad_norm": 2.2988243103027344,
      "kl": 0.001689910888671875,
      "learning_rate": 1e-06,
      "loss": -0.118,
      "num_tokens": 877371.0,
      "reward": 0.04953785985708237,
      "reward_std": 0.04012230038642883,
      "rewards/bleu_reward_func/mean": 0.04953785985708237,
      "rewards/bleu_reward_func/std": 0.05754861235618591,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 334.625,
      "completions/mean_terminated_length": 254.0,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.0568,
      "grad_norm": 2.465021848678589,
      "kl": 0.00246429443359375,
      "learning_rate": 1e-06,
      "loss": -0.0057,
      "num_tokens": 891055.0,
      "reward": 0.030064472928643227,
      "reward_std": 0.019575169309973717,
      "rewards/bleu_reward_func/mean": 0.030064472928643227,
      "rewards/bleu_reward_func/std": 0.023523783311247826,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 430.6875,
      "completions/mean_terminated_length": 381.8999938964844,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "epoch": 0.0576,
      "grad_norm": 2.359802722930908,
      "kl": 0.002536773681640625,
      "learning_rate": 1e-06,
      "loss": -0.0236,
      "num_tokens": 907301.0,
      "reward": 0.043774448335170746,
      "reward_std": 0.01555405743420124,
      "rewards/bleu_reward_func/mean": 0.043774448335170746,
      "rewards/bleu_reward_func/std": 0.034126028418540955,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 270.09375,
      "completions/mean_terminated_length": 189.45834350585938,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.0584,
      "grad_norm": 3.2106242179870605,
      "kl": 0.0029931068420410156,
      "learning_rate": 1e-06,
      "loss": -0.1018,
      "num_tokens": 919472.0,
      "reward": 0.019871417433023453,
      "reward_std": 0.015124676749110222,
      "rewards/bleu_reward_func/mean": 0.019871417433023453,
      "rewards/bleu_reward_func/std": 0.020977023988962173,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 433.0,
      "completions/mean_length": 234.5625,
      "completions/mean_terminated_length": 170.53846740722656,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.0592,
      "grad_norm": 5.234200954437256,
      "kl": 0.0031566619873046875,
      "learning_rate": 1e-06,
      "loss": -0.3525,
      "num_tokens": 929602.0,
      "reward": 0.03137686848640442,
      "reward_std": 0.02154741995036602,
      "rewards/bleu_reward_func/mean": 0.03137686848640442,
      "rewards/bleu_reward_func/std": 0.029113655909895897,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 301.25,
      "completions/mean_terminated_length": 262.22222900390625,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.06,
      "grad_norm": 2.501121759414673,
      "kl": 0.002643585205078125,
      "learning_rate": 1e-06,
      "loss": -0.1282,
      "num_tokens": 942298.0,
      "reward": 0.026925798505544662,
      "reward_std": 0.012626022100448608,
      "rewards/bleu_reward_func/mean": 0.026925798505544662,
      "rewards/bleu_reward_func/std": 0.014885513111948967,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 472.625,
      "completions/mean_terminated_length": 354.5,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.0608,
      "grad_norm": 1.793560266494751,
      "kl": 0.0019183158874511719,
      "learning_rate": 1e-06,
      "loss": 0.019,
      "num_tokens": 962526.0,
      "reward": 0.05564770847558975,
      "reward_std": 0.048538610339164734,
      "rewards/bleu_reward_func/mean": 0.05564770847558975,
      "rewards/bleu_reward_func/std": 0.06820879131555557,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 494.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 179.0625,
      "completions/mean_terminated_length": 179.0625,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.0616,
      "grad_norm": 5.047986030578613,
      "kl": 0.003528594970703125,
      "learning_rate": 1e-06,
      "loss": -0.2638,
      "num_tokens": 970776.0,
      "reward": 0.06285654753446579,
      "reward_std": 0.03820263221859932,
      "rewards/bleu_reward_func/mean": 0.06285654753446579,
      "rewards/bleu_reward_func/std": 0.05687018483877182,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 429.0,
      "completions/mean_length": 202.71875,
      "completions/mean_terminated_length": 170.72413635253906,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.0624,
      "grad_norm": 3.0843141078948975,
      "kl": 0.0020785927772521973,
      "learning_rate": 1e-06,
      "loss": 0.2207,
      "num_tokens": 981399.0,
      "reward": 0.05065721645951271,
      "reward_std": 0.03324894234538078,
      "rewards/bleu_reward_func/mean": 0.05065721645951271,
      "rewards/bleu_reward_func/std": 0.05357068404555321,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 355.34375,
      "completions/mean_terminated_length": 273.28570556640625,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.0632,
      "grad_norm": 2.630469799041748,
      "kl": 0.00506591796875,
      "learning_rate": 1e-06,
      "loss": -0.0981,
      "num_tokens": 995754.0,
      "reward": 0.059061747044324875,
      "reward_std": 0.030806895345449448,
      "rewards/bleu_reward_func/mean": 0.059061747044324875,
      "rewards/bleu_reward_func/std": 0.05187317356467247,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 300.3125,
      "completions/mean_terminated_length": 189.42857360839844,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.064,
      "grad_norm": 3.2961418628692627,
      "kl": 0.0030832290649414062,
      "learning_rate": 1e-06,
      "loss": -0.0105,
      "num_tokens": 1013100.0,
      "reward": 0.05653802305459976,
      "reward_std": 0.017924563959240913,
      "rewards/bleu_reward_func/mean": 0.05653802305459976,
      "rewards/bleu_reward_func/std": 0.05833124369382858,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 451.0,
      "completions/mean_length": 341.40625,
      "completions/mean_terminated_length": 239.0500030517578,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.0648,
      "grad_norm": 2.498276710510254,
      "kl": 0.0046234130859375,
      "learning_rate": 1e-06,
      "loss": -0.2798,
      "num_tokens": 1026945.0,
      "reward": 0.040751807391643524,
      "reward_std": 0.018808823078870773,
      "rewards/bleu_reward_func/mean": 0.040751807391643524,
      "rewards/bleu_reward_func/std": 0.02410094253718853,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 440.0,
      "completions/mean_length": 324.6875,
      "completions/mean_terminated_length": 262.25,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.0656,
      "grad_norm": 2.130357265472412,
      "kl": 0.002826690673828125,
      "learning_rate": 1e-06,
      "loss": 0.0411,
      "num_tokens": 1042063.0,
      "reward": 0.036513280123472214,
      "reward_std": 0.01892837882041931,
      "rewards/bleu_reward_func/mean": 0.036513280123472214,
      "rewards/bleu_reward_func/std": 0.03590543195605278,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 301.78125,
      "completions/mean_terminated_length": 271.75,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.0664,
      "grad_norm": 2.751675605773926,
      "kl": 0.003963470458984375,
      "learning_rate": 1e-06,
      "loss": 0.1396,
      "num_tokens": 1053920.0,
      "reward": 0.0392126627266407,
      "reward_std": 0.011845908127725124,
      "rewards/bleu_reward_func/mean": 0.0392126627266407,
      "rewards/bleu_reward_func/std": 0.02583330124616623,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 261.875,
      "completions/mean_terminated_length": 236.0,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.0672,
      "grad_norm": 2.6925158500671387,
      "kl": 0.00390625,
      "learning_rate": 1e-06,
      "loss": 0.0117,
      "num_tokens": 1065300.0,
      "reward": 0.029011068865656853,
      "reward_std": 0.016968993470072746,
      "rewards/bleu_reward_func/mean": 0.029011068865656853,
      "rewards/bleu_reward_func/std": 0.02045534923672676,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 447.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 226.59375,
      "completions/mean_terminated_length": 226.59375,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.068,
      "grad_norm": 3.6709952354431152,
      "kl": 0.00417327880859375,
      "learning_rate": 1e-06,
      "loss": 0.1037,
      "num_tokens": 1075647.0,
      "reward": 0.06442218273878098,
      "reward_std": 0.028383802622556686,
      "rewards/bleu_reward_func/mean": 0.06442218273878098,
      "rewards/bleu_reward_func/std": 0.04861043021082878,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 437.0,
      "completions/mean_length": 324.125,
      "completions/mean_terminated_length": 261.5,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.0688,
      "grad_norm": 2.570126533508301,
      "kl": 0.0053558349609375,
      "learning_rate": 1e-06,
      "loss": 0.041,
      "num_tokens": 1089091.0,
      "reward": 0.05085538700222969,
      "reward_std": 0.01834620162844658,
      "rewards/bleu_reward_func/mean": 0.05085538700222969,
      "rewards/bleu_reward_func/std": 0.031752023845911026,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 495.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 180.6875,
      "completions/mean_terminated_length": 180.6875,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.0696,
      "grad_norm": 6.8046674728393555,
      "kl": 0.004756927490234375,
      "learning_rate": 1e-06,
      "loss": -0.2613,
      "num_tokens": 1097313.0,
      "reward": 0.04579862207174301,
      "reward_std": 0.027461305260658264,
      "rewards/bleu_reward_func/mean": 0.04579862207174301,
      "rewards/bleu_reward_func/std": 0.04285133630037308,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 352.3125,
      "completions/mean_terminated_length": 228.11111450195312,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.0704,
      "grad_norm": 3.2189218997955322,
      "kl": 0.0033960342407226562,
      "learning_rate": 1e-06,
      "loss": 0.02,
      "num_tokens": 1115235.0,
      "reward": 0.053727827966213226,
      "reward_std": 0.04648362472653389,
      "rewards/bleu_reward_func/mean": 0.053727827966213226,
      "rewards/bleu_reward_func/std": 0.07603882998228073,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 296.03125,
      "completions/mean_terminated_length": 148.26315307617188,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.0712,
      "grad_norm": 3.394224166870117,
      "kl": 0.00843048095703125,
      "learning_rate": 1e-06,
      "loss": -0.0242,
      "num_tokens": 1127868.0,
      "reward": 0.04170762002468109,
      "reward_std": 0.015014639124274254,
      "rewards/bleu_reward_func/mean": 0.04170762002468109,
      "rewards/bleu_reward_func/std": 0.028102483600378036,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 275.71875,
      "completions/mean_terminated_length": 221.19232177734375,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.072,
      "grad_norm": 3.0822086334228516,
      "kl": 0.00803375244140625,
      "learning_rate": 1e-06,
      "loss": -0.142,
      "num_tokens": 1139531.0,
      "reward": 0.07560917735099792,
      "reward_std": 0.045496731996536255,
      "rewards/bleu_reward_func/mean": 0.07560917735099792,
      "rewards/bleu_reward_func/std": 0.0789395347237587,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 440.0,
      "completions/mean_length": 403.53125,
      "completions/mean_terminated_length": 295.0625,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "epoch": 0.0728,
      "grad_norm": 2.45961594581604,
      "kl": 0.00630950927734375,
      "learning_rate": 1e-06,
      "loss": -0.0524,
      "num_tokens": 1155996.0,
      "reward": 0.03898419439792633,
      "reward_std": 0.01788502372801304,
      "rewards/bleu_reward_func/mean": 0.03898419439792633,
      "rewards/bleu_reward_func/std": 0.022303381934762,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 456.0,
      "completions/mean_length": 377.3125,
      "completions/mean_terminated_length": 242.625,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.0736,
      "grad_norm": 1.5934685468673706,
      "kl": 0.00415802001953125,
      "learning_rate": 1e-06,
      "loss": 0.1459,
      "num_tokens": 1172846.0,
      "reward": 0.08643854409456253,
      "reward_std": 0.0729157105088234,
      "rewards/bleu_reward_func/mean": 0.08643854409456253,
      "rewards/bleu_reward_func/std": 0.12770512700080872,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 332.90625,
      "completions/mean_terminated_length": 282.7599792480469,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.0744,
      "grad_norm": 2.671640157699585,
      "kl": 0.00707244873046875,
      "learning_rate": 1e-06,
      "loss": -0.0502,
      "num_tokens": 1185467.0,
      "reward": 0.05207536742091179,
      "reward_std": 0.02466990053653717,
      "rewards/bleu_reward_func/mean": 0.05207536742091179,
      "rewards/bleu_reward_func/std": 0.03447216376662254,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 464.0,
      "completions/mean_length": 295.6875,
      "completions/mean_terminated_length": 273.3103332519531,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "epoch": 0.0752,
      "grad_norm": 2.9223239421844482,
      "kl": 0.006927490234375,
      "learning_rate": 1e-06,
      "loss": -0.1365,
      "num_tokens": 1198065.0,
      "reward": 0.052932240068912506,
      "reward_std": 0.01840699091553688,
      "rewards/bleu_reward_func/mean": 0.052932240068912506,
      "rewards/bleu_reward_func/std": 0.039161618798971176,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 445.90625,
      "completions/mean_terminated_length": 349.3077087402344,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 0.076,
      "grad_norm": 1.8899106979370117,
      "kl": 0.0072784423828125,
      "learning_rate": 1e-06,
      "loss": -0.0753,
      "num_tokens": 1215878.0,
      "reward": 0.03316285461187363,
      "reward_std": 0.01574653573334217,
      "rewards/bleu_reward_func/mean": 0.03316285461187363,
      "rewards/bleu_reward_func/std": 0.01957116089761257,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 204.375,
      "completions/mean_terminated_length": 183.86666870117188,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.0768,
      "grad_norm": 4.009552478790283,
      "kl": 0.005107879638671875,
      "learning_rate": 1e-06,
      "loss": 0.077,
      "num_tokens": 1227954.0,
      "reward": 0.1502164751291275,
      "reward_std": 0.09705069661140442,
      "rewards/bleu_reward_func/mean": 0.1502164751291275,
      "rewards/bleu_reward_func/std": 0.23583538830280304,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 415.0,
      "completions/mean_length": 258.75,
      "completions/mean_terminated_length": 200.3076934814453,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.0776,
      "grad_norm": 2.948690891265869,
      "kl": 0.00766754150390625,
      "learning_rate": 1e-06,
      "loss": -0.1655,
      "num_tokens": 1240202.0,
      "reward": 0.0456559993326664,
      "reward_std": 0.033094413578510284,
      "rewards/bleu_reward_func/mean": 0.0456559993326664,
      "rewards/bleu_reward_func/std": 0.03354474529623985,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 462.0,
      "completions/mean_length": 333.5625,
      "completions/mean_terminated_length": 240.09524536132812,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.0784,
      "grad_norm": 2.532744884490967,
      "kl": 0.008470535278320312,
      "learning_rate": 1e-06,
      "loss": 0.0968,
      "num_tokens": 1255796.0,
      "reward": 0.0555446520447731,
      "reward_std": 0.03220447525382042,
      "rewards/bleu_reward_func/mean": 0.0555446520447731,
      "rewards/bleu_reward_func/std": 0.05794409662485123,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 487.0,
      "completions/mean_length": 293.8125,
      "completions/mean_terminated_length": 262.64288330078125,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.0792,
      "grad_norm": 10.322442054748535,
      "kl": 0.00716400146484375,
      "learning_rate": 1e-06,
      "loss": -0.0058,
      "num_tokens": 1270534.0,
      "reward": 0.06366641819477081,
      "reward_std": 0.04360166937112808,
      "rewards/bleu_reward_func/mean": 0.06366641819477081,
      "rewards/bleu_reward_func/std": 0.09251260757446289,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 305.21875,
      "completions/mean_terminated_length": 291.433349609375,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.08,
      "grad_norm": 2.6224372386932373,
      "kl": 0.0085296630859375,
      "learning_rate": 1e-06,
      "loss": -0.0406,
      "num_tokens": 1282741.0,
      "reward": 0.04561196267604828,
      "reward_std": 0.032932039350271225,
      "rewards/bleu_reward_func/mean": 0.04561196267604828,
      "rewards/bleu_reward_func/std": 0.06322391331195831,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 282.0625,
      "completions/mean_terminated_length": 192.0869598388672,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.0808,
      "grad_norm": 3.268648147583008,
      "kl": 0.01361083984375,
      "learning_rate": 1e-06,
      "loss": -0.1683,
      "num_tokens": 1294135.0,
      "reward": 0.05667008087038994,
      "reward_std": 0.030532341450452805,
      "rewards/bleu_reward_func/mean": 0.05667008087038994,
      "rewards/bleu_reward_func/std": 0.05759035050868988,
      "step": 101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 333.8125,
      "completions/mean_terminated_length": 300.8148193359375,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "epoch": 0.0816,
      "grad_norm": 2.907650947570801,
      "kl": 0.01136016845703125,
      "learning_rate": 1e-06,
      "loss": 0.0913,
      "num_tokens": 1306993.0,
      "reward": 0.03263912349939346,
      "reward_std": 0.009969690814614296,
      "rewards/bleu_reward_func/mean": 0.03263912349939346,
      "rewards/bleu_reward_func/std": 0.014208728447556496,
      "step": 102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 440.0,
      "completions/mean_length": 220.0625,
      "completions/mean_terminated_length": 166.0,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.0824,
      "grad_norm": 4.579634666442871,
      "kl": 0.0120391845703125,
      "learning_rate": 1e-06,
      "loss": -0.0133,
      "num_tokens": 1317267.0,
      "reward": 0.033596813678741455,
      "reward_std": 0.015902765095233917,
      "rewards/bleu_reward_func/mean": 0.033596813678741455,
      "rewards/bleu_reward_func/std": 0.01867716945707798,
      "step": 103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 454.0,
      "completions/mean_length": 355.375,
      "completions/mean_terminated_length": 198.75,
      "completions/min_length": 44.0,
      "completions/min_terminated_length": 44.0,
      "epoch": 0.0832,
      "grad_norm": 2.706808090209961,
      "kl": 0.013458251953125,
      "learning_rate": 1e-06,
      "loss": -0.1151,
      "num_tokens": 1334423.0,
      "reward": 0.05682121962308884,
      "reward_std": 0.0347580686211586,
      "rewards/bleu_reward_func/mean": 0.05682121962308884,
      "rewards/bleu_reward_func/std": 0.055416397750377655,
      "step": 104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 487.0,
      "completions/mean_length": 490.84375,
      "completions/mean_terminated_length": 427.375,
      "completions/min_length": 270.0,
      "completions/min_terminated_length": 270.0,
      "epoch": 0.084,
      "grad_norm": 1.9632360935211182,
      "kl": 0.00994873046875,
      "learning_rate": 1e-06,
      "loss": -0.0389,
      "num_tokens": 1352930.0,
      "reward": 0.03201688453555107,
      "reward_std": 0.00869814120233059,
      "rewards/bleu_reward_func/mean": 0.03201688453555107,
      "rewards/bleu_reward_func/std": 0.013300970196723938,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 456.0,
      "completions/mean_length": 426.5,
      "completions/mean_terminated_length": 301.5384826660156,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "epoch": 0.0848,
      "grad_norm": 1.8980785608291626,
      "kl": 0.00910186767578125,
      "learning_rate": 1e-06,
      "loss": 0.1299,
      "num_tokens": 1370418.0,
      "reward": 0.024408889934420586,
      "reward_std": 0.016656802967190742,
      "rewards/bleu_reward_func/mean": 0.024408889934420586,
      "rewards/bleu_reward_func/std": 0.026626311242580414,
      "step": 106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 448.09375,
      "completions/mean_terminated_length": 384.1875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "epoch": 0.0856,
      "grad_norm": 2.022083044052124,
      "kl": 0.01107025146484375,
      "learning_rate": 1e-06,
      "loss": -0.0435,
      "num_tokens": 1387805.0,
      "reward": 0.03671019896864891,
      "reward_std": 0.015504223294556141,
      "rewards/bleu_reward_func/mean": 0.03671019896864891,
      "rewards/bleu_reward_func/std": 0.028309425339102745,
      "step": 107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 356.46875,
      "completions/mean_terminated_length": 200.9375,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.0864,
      "grad_norm": 2.7162857055664062,
      "kl": 0.01024627685546875,
      "learning_rate": 1e-06,
      "loss": -0.1255,
      "num_tokens": 1402212.0,
      "reward": 0.03878065198659897,
      "reward_std": 0.02206358313560486,
      "rewards/bleu_reward_func/mean": 0.03878065198659897,
      "rewards/bleu_reward_func/std": 0.029185639694333076,
      "step": 108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 355.3125,
      "completions/mean_terminated_length": 303.0833435058594,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "epoch": 0.0872,
      "grad_norm": 2.625575304031372,
      "kl": 0.01190185546875,
      "learning_rate": 1e-06,
      "loss": -0.0505,
      "num_tokens": 1416102.0,
      "reward": 0.04079345613718033,
      "reward_std": 0.021366603672504425,
      "rewards/bleu_reward_func/mean": 0.04079345613718033,
      "rewards/bleu_reward_func/std": 0.03239119052886963,
      "step": 109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 487.0,
      "completions/mean_length": 333.46875,
      "completions/mean_terminated_length": 263.60870361328125,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "epoch": 0.088,
      "grad_norm": 2.462629556655884,
      "kl": 0.0115509033203125,
      "learning_rate": 1e-06,
      "loss": 0.0333,
      "num_tokens": 1429517.0,
      "reward": 0.04992126300930977,
      "reward_std": 0.026332605630159378,
      "rewards/bleu_reward_func/mean": 0.04992126300930977,
      "rewards/bleu_reward_func/std": 0.0346212200820446,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 310.46875,
      "completions/mean_terminated_length": 189.5500030517578,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.0888,
      "grad_norm": 4.681856155395508,
      "kl": 0.0270538330078125,
      "learning_rate": 1e-06,
      "loss": 0.2249,
      "num_tokens": 1443588.0,
      "reward": 0.061340004205703735,
      "reward_std": 0.020015515387058258,
      "rewards/bleu_reward_func/mean": 0.061340004205703735,
      "rewards/bleu_reward_func/std": 0.031175505369901657,
      "step": 111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 376.4375,
      "completions/mean_terminated_length": 240.875,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.0896,
      "grad_norm": 2.6015677452087402,
      "kl": 0.00934600830078125,
      "learning_rate": 1e-06,
      "loss": -0.0971,
      "num_tokens": 1459066.0,
      "reward": 0.03436078503727913,
      "reward_std": 0.01115034706890583,
      "rewards/bleu_reward_func/mean": 0.03436078503727913,
      "rewards/bleu_reward_func/std": 0.022014673799276352,
      "step": 112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 414.25,
      "completions/mean_terminated_length": 347.3684387207031,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "epoch": 0.0904,
      "grad_norm": 2.417910099029541,
      "kl": 0.0102996826171875,
      "learning_rate": 1e-06,
      "loss": 0.0133,
      "num_tokens": 1476074.0,
      "reward": 0.0862937867641449,
      "reward_std": 0.033545784652233124,
      "rewards/bleu_reward_func/mean": 0.0862937867641449,
      "rewards/bleu_reward_func/std": 0.04349099099636078,
      "step": 113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 417.0625,
      "completions/mean_terminated_length": 295.0,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.0912,
      "grad_norm": 2.263538360595703,
      "kl": 0.0116119384765625,
      "learning_rate": 1e-06,
      "loss": 0.0398,
      "num_tokens": 1491932.0,
      "reward": 0.04719041287899017,
      "reward_std": 0.024131447076797485,
      "rewards/bleu_reward_func/mean": 0.04719041287899017,
      "rewards/bleu_reward_func/std": 0.02794639579951763,
      "step": 114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 334.0,
      "completions/mean_length": 314.59375,
      "completions/mean_terminated_length": 140.41175842285156,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.092,
      "grad_norm": 2.4896774291992188,
      "kl": 0.0186920166015625,
      "learning_rate": 1e-06,
      "loss": -0.1274,
      "num_tokens": 1508711.0,
      "reward": 0.06531796604394913,
      "reward_std": 0.05683267116546631,
      "rewards/bleu_reward_func/mean": 0.06531796604394913,
      "rewards/bleu_reward_func/std": 0.08694739639759064,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 389.15625,
      "completions/mean_terminated_length": 249.933349609375,
      "completions/min_length": 81.0,
      "completions/min_terminated_length": 81.0,
      "epoch": 0.0928,
      "grad_norm": 2.473355531692505,
      "kl": 0.0122833251953125,
      "learning_rate": 1e-06,
      "loss": -0.1519,
      "num_tokens": 1524220.0,
      "reward": 0.03568326681852341,
      "reward_std": 0.013284995220601559,
      "rewards/bleu_reward_func/mean": 0.03568326681852341,
      "rewards/bleu_reward_func/std": 0.018093997612595558,
      "step": 116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 414.0,
      "completions/mean_length": 235.03125,
      "completions/mean_terminated_length": 171.11538696289062,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.0936,
      "grad_norm": 3.366828680038452,
      "kl": 0.009637832641601562,
      "learning_rate": 1e-06,
      "loss": 0.0635,
      "num_tokens": 1538085.0,
      "reward": 0.057241007685661316,
      "reward_std": 0.02658858895301819,
      "rewards/bleu_reward_func/mean": 0.057241007685661316,
      "rewards/bleu_reward_func/std": 0.05006576329469681,
      "step": 117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 471.0,
      "completions/mean_length": 320.90625,
      "completions/mean_terminated_length": 190.15789794921875,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.0944,
      "grad_norm": 2.6779184341430664,
      "kl": 0.012842178344726562,
      "learning_rate": 1e-06,
      "loss": -0.1215,
      "num_tokens": 1552642.0,
      "reward": 0.11695751547813416,
      "reward_std": 0.03736204653978348,
      "rewards/bleu_reward_func/mean": 0.11695751547813416,
      "rewards/bleu_reward_func/std": 0.09987916797399521,
      "step": 118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 309.46875,
      "completions/mean_terminated_length": 252.75999450683594,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.0952,
      "grad_norm": 3.17213773727417,
      "kl": 0.0166168212890625,
      "learning_rate": 1e-06,
      "loss": 0.081,
      "num_tokens": 1567041.0,
      "reward": 0.06574233621358871,
      "reward_std": 0.04479731619358063,
      "rewards/bleu_reward_func/mean": 0.06574233621358871,
      "rewards/bleu_reward_func/std": 0.06972567737102509,
      "step": 119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 348.53125,
      "completions/mean_terminated_length": 185.0625,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.096,
      "grad_norm": 3.1326615810394287,
      "kl": 0.00875091552734375,
      "learning_rate": 1e-06,
      "loss": 0.0663,
      "num_tokens": 1585914.0,
      "reward": 0.02676137164235115,
      "reward_std": 0.007912165485322475,
      "rewards/bleu_reward_func/mean": 0.02676137164235115,
      "rewards/bleu_reward_func/std": 0.019907211884856224,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 312.53125,
      "completions/mean_terminated_length": 246.0416717529297,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.0968,
      "grad_norm": 2.3142240047454834,
      "kl": 0.0168609619140625,
      "learning_rate": 1e-06,
      "loss": 0.0616,
      "num_tokens": 1599083.0,
      "reward": 0.0773664191365242,
      "reward_std": 0.03548593446612358,
      "rewards/bleu_reward_func/mean": 0.0773664191365242,
      "rewards/bleu_reward_func/std": 0.09226932376623154,
      "step": 121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 352.625,
      "completions/mean_terminated_length": 290.2608642578125,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.0976,
      "grad_norm": 3.595508098602295,
      "kl": 0.01422119140625,
      "learning_rate": 1e-06,
      "loss": 0.1431,
      "num_tokens": 1612983.0,
      "reward": 0.04203544184565544,
      "reward_std": 0.013445420190691948,
      "rewards/bleu_reward_func/mean": 0.04203544184565544,
      "rewards/bleu_reward_func/std": 0.01539506297558546,
      "step": 122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 318.96875,
      "completions/mean_terminated_length": 243.43478393554688,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.0984,
      "grad_norm": 2.913456678390503,
      "kl": 0.0119171142578125,
      "learning_rate": 1e-06,
      "loss": -0.0963,
      "num_tokens": 1626678.0,
      "reward": 0.03706140071153641,
      "reward_std": 0.0105556296184659,
      "rewards/bleu_reward_func/mean": 0.03706140071153641,
      "rewards/bleu_reward_func/std": 0.015232140198349953,
      "step": 123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 313.0625,
      "completions/mean_terminated_length": 246.75,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.0992,
      "grad_norm": 2.5508322715759277,
      "kl": 0.0186309814453125,
      "learning_rate": 1e-06,
      "loss": 0.1845,
      "num_tokens": 1643632.0,
      "reward": 0.02773209474980831,
      "reward_std": 0.006956611294299364,
      "rewards/bleu_reward_func/mean": 0.02773209474980831,
      "rewards/bleu_reward_func/std": 0.012230291962623596,
      "step": 124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 423.0,
      "completions/mean_length": 314.25,
      "completions/mean_terminated_length": 248.33334350585938,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.1,
      "grad_norm": 2.6516916751861572,
      "kl": 0.013763427734375,
      "learning_rate": 1e-06,
      "loss": -0.1173,
      "num_tokens": 1655768.0,
      "reward": 0.052763912826776505,
      "reward_std": 0.02353248931467533,
      "rewards/bleu_reward_func/mean": 0.052763912826776505,
      "rewards/bleu_reward_func/std": 0.03502753749489784,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 409.0,
      "completions/mean_length": 372.5625,
      "completions/mean_terminated_length": 249.5294189453125,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.1008,
      "grad_norm": 2.2216272354125977,
      "kl": 0.0171661376953125,
      "learning_rate": 1e-06,
      "loss": 0.0064,
      "num_tokens": 1672106.0,
      "reward": 0.035765521228313446,
      "reward_std": 0.009950447827577591,
      "rewards/bleu_reward_func/mean": 0.035765521228313446,
      "rewards/bleu_reward_func/std": 0.022508379071950912,
      "step": 126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 410.09375,
      "completions/mean_terminated_length": 279.0714416503906,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.1016,
      "grad_norm": 2.5408473014831543,
      "kl": 0.0154571533203125,
      "learning_rate": 1e-06,
      "loss": -0.2255,
      "num_tokens": 1689573.0,
      "reward": 0.035822078585624695,
      "reward_std": 0.01573784276843071,
      "rewards/bleu_reward_func/mean": 0.035822078585624695,
      "rewards/bleu_reward_func/std": 0.02253863401710987,
      "step": 127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 298.8125,
      "completions/mean_terminated_length": 284.6000061035156,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.1024,
      "grad_norm": 3.248319387435913,
      "kl": 0.0136260986328125,
      "learning_rate": 1e-06,
      "loss": -0.0412,
      "num_tokens": 1701151.0,
      "reward": 0.058197036385536194,
      "reward_std": 0.017663825303316116,
      "rewards/bleu_reward_func/mean": 0.058197036385536194,
      "rewards/bleu_reward_func/std": 0.04830459877848625,
      "step": 128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 409.8125,
      "completions/mean_terminated_length": 348.5,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "epoch": 0.1032,
      "grad_norm": 2.0520575046539307,
      "kl": 0.015594482421875,
      "learning_rate": 1e-06,
      "loss": -0.0989,
      "num_tokens": 1717193.0,
      "reward": 0.07774099707603455,
      "reward_std": 0.024711469188332558,
      "rewards/bleu_reward_func/mean": 0.07774099707603455,
      "rewards/bleu_reward_func/std": 0.0407242514193058,
      "step": 129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 329.5625,
      "completions/mean_terminated_length": 303.5,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "epoch": 0.104,
      "grad_norm": 2.5769379138946533,
      "kl": 0.0155029296875,
      "learning_rate": 1e-06,
      "loss": 0.1015,
      "num_tokens": 1730843.0,
      "reward": 0.04991535469889641,
      "reward_std": 0.017646994441747665,
      "rewards/bleu_reward_func/mean": 0.04991535469889641,
      "rewards/bleu_reward_func/std": 0.048128433525562286,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 260.875,
      "completions/mean_terminated_length": 177.1666717529297,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "epoch": 0.1048,
      "grad_norm": 3.685297727584839,
      "kl": 0.01507568359375,
      "learning_rate": 1e-06,
      "loss": 0.0444,
      "num_tokens": 1743639.0,
      "reward": 0.0769617035984993,
      "reward_std": 0.030974943190813065,
      "rewards/bleu_reward_func/mean": 0.0769617035984993,
      "rewards/bleu_reward_func/std": 0.09881884604692459,
      "step": 131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 479.0,
      "completions/mean_length": 359.375,
      "completions/mean_terminated_length": 299.6521911621094,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "epoch": 0.1056,
      "grad_norm": 2.244091272354126,
      "kl": 0.01470947265625,
      "learning_rate": 1e-06,
      "loss": -0.0148,
      "num_tokens": 1758339.0,
      "reward": 0.02497226372361183,
      "reward_std": 0.006721612066030502,
      "rewards/bleu_reward_func/mean": 0.02497226372361183,
      "rewards/bleu_reward_func/std": 0.011903750710189342,
      "step": 132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 345.9375,
      "completions/mean_terminated_length": 322.21429443359375,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "epoch": 0.1064,
      "grad_norm": 2.510582208633423,
      "kl": 0.0145721435546875,
      "learning_rate": 1e-06,
      "loss": -0.0027,
      "num_tokens": 1771673.0,
      "reward": 0.032875481992959976,
      "reward_std": 0.010129079222679138,
      "rewards/bleu_reward_func/mean": 0.032875481992959976,
      "rewards/bleu_reward_func/std": 0.01497406605631113,
      "step": 133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 452.0,
      "completions/mean_length": 307.0,
      "completions/mean_terminated_length": 213.8181915283203,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.1072,
      "grad_norm": 2.790792942047119,
      "kl": 0.0141448974609375,
      "learning_rate": 1e-06,
      "loss": -0.0355,
      "num_tokens": 1784081.0,
      "reward": 0.019524620845913887,
      "reward_std": 0.0064018769189715385,
      "rewards/bleu_reward_func/mean": 0.019524620845913887,
      "rewards/bleu_reward_func/std": 0.011706347577273846,
      "step": 134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 442.0,
      "completions/max_terminated_length": 442.0,
      "completions/mean_length": 204.34375,
      "completions/mean_terminated_length": 204.34375,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.108,
      "grad_norm": 2.8634583950042725,
      "kl": 0.0163726806640625,
      "learning_rate": 1e-06,
      "loss": 0.1028,
      "num_tokens": 1793868.0,
      "reward": 0.08782406896352768,
      "reward_std": 0.05941709131002426,
      "rewards/bleu_reward_func/mean": 0.08782406896352768,
      "rewards/bleu_reward_func/std": 0.1270333081483841,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 495.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 229.625,
      "completions/mean_terminated_length": 229.625,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.1088,
      "grad_norm": 3.192974090576172,
      "kl": 0.0136566162109375,
      "learning_rate": 1e-06,
      "loss": -0.0969,
      "num_tokens": 1803704.0,
      "reward": 0.04424320533871651,
      "reward_std": 0.017299409955739975,
      "rewards/bleu_reward_func/mean": 0.04424320533871651,
      "rewards/bleu_reward_func/std": 0.03912116214632988,
      "step": 136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 316.9375,
      "completions/mean_terminated_length": 183.4736785888672,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.1096,
      "grad_norm": 2.307054042816162,
      "kl": 0.018157958984375,
      "learning_rate": 1e-06,
      "loss": -0.0291,
      "num_tokens": 1817166.0,
      "reward": 0.03945375978946686,
      "reward_std": 0.025181055068969727,
      "rewards/bleu_reward_func/mean": 0.03945375978946686,
      "rewards/bleu_reward_func/std": 0.03327018395066261,
      "step": 137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 442.0,
      "completions/mean_length": 210.1875,
      "completions/mean_terminated_length": 178.96551513671875,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.1104,
      "grad_norm": 5.996477127075195,
      "kl": 0.0174713134765625,
      "learning_rate": 1e-06,
      "loss": 0.0224,
      "num_tokens": 1828812.0,
      "reward": 0.03139394521713257,
      "reward_std": 0.011995144188404083,
      "rewards/bleu_reward_func/mean": 0.03139394521713257,
      "rewards/bleu_reward_func/std": 0.01749689131975174,
      "step": 138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 398.6875,
      "completions/mean_terminated_length": 285.375,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.1112,
      "grad_norm": 2.496495246887207,
      "kl": 0.0198974609375,
      "learning_rate": 1e-06,
      "loss": 0.0135,
      "num_tokens": 1845378.0,
      "reward": 0.04385095834732056,
      "reward_std": 0.012373005039989948,
      "rewards/bleu_reward_func/mean": 0.04385095834732056,
      "rewards/bleu_reward_func/std": 0.023492127656936646,
      "step": 139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 440.0,
      "completions/mean_length": 407.1875,
      "completions/mean_terminated_length": 325.6666564941406,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "epoch": 0.112,
      "grad_norm": 2.1035640239715576,
      "kl": 0.013671875,
      "learning_rate": 1e-06,
      "loss": 0.0472,
      "num_tokens": 1861792.0,
      "reward": 0.06062568724155426,
      "reward_std": 0.02712031453847885,
      "rewards/bleu_reward_func/mean": 0.06062568724155426,
      "rewards/bleu_reward_func/std": 0.038938842713832855,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 469.0,
      "completions/mean_length": 337.40625,
      "completions/mean_terminated_length": 279.2083435058594,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "epoch": 0.1128,
      "grad_norm": 2.4870097637176514,
      "kl": 0.0177764892578125,
      "learning_rate": 1e-06,
      "loss": -0.042,
      "num_tokens": 1875197.0,
      "reward": 0.07142765074968338,
      "reward_std": 0.025316152721643448,
      "rewards/bleu_reward_func/mean": 0.07142765074968338,
      "rewards/bleu_reward_func/std": 0.054619304835796356,
      "step": 141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 296.34375,
      "completions/mean_terminated_length": 256.40740966796875,
      "completions/min_length": 85.0,
      "completions/min_terminated_length": 85.0,
      "epoch": 0.1136,
      "grad_norm": 3.088263750076294,
      "kl": 0.025543212890625,
      "learning_rate": 1e-06,
      "loss": 0.0453,
      "num_tokens": 1888160.0,
      "reward": 0.06487879157066345,
      "reward_std": 0.019724037498235703,
      "rewards/bleu_reward_func/mean": 0.06487879157066345,
      "rewards/bleu_reward_func/std": 0.045981332659721375,
      "step": 142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 118.0,
      "completions/mean_length": 387.8125,
      "completions/mean_terminated_length": 70.44444274902344,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.1144,
      "grad_norm": 3.4739062786102295,
      "kl": 0.023529052734375,
      "learning_rate": 1e-06,
      "loss": 0.0193,
      "num_tokens": 1904546.0,
      "reward": 0.016506584361195564,
      "reward_std": 0.010010890662670135,
      "rewards/bleu_reward_func/mean": 0.016506584361195564,
      "rewards/bleu_reward_func/std": 0.014175361953675747,
      "step": 143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 439.0,
      "completions/mean_length": 446.65625,
      "completions/mean_terminated_length": 337.75,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "epoch": 0.1152,
      "grad_norm": 2.3656280040740967,
      "kl": 0.01031494140625,
      "learning_rate": 1e-06,
      "loss": -0.0996,
      "num_tokens": 1923767.0,
      "reward": 0.08849923312664032,
      "reward_std": 0.05801050364971161,
      "rewards/bleu_reward_func/mean": 0.08849923312664032,
      "rewards/bleu_reward_func/std": 0.10124781727790833,
      "step": 144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 327.125,
      "completions/mean_terminated_length": 275.3599853515625,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.116,
      "grad_norm": 2.725642681121826,
      "kl": 0.022552490234375,
      "learning_rate": 1e-06,
      "loss": 0.0032,
      "num_tokens": 1940155.0,
      "reward": 0.04029117524623871,
      "reward_std": 0.023508241400122643,
      "rewards/bleu_reward_func/mean": 0.04029117524623871,
      "rewards/bleu_reward_func/std": 0.037789031863212585,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 302.0625,
      "completions/mean_terminated_length": 219.9130401611328,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.1168,
      "grad_norm": 3.752535104751587,
      "kl": 0.0152587890625,
      "learning_rate": 1e-06,
      "loss": -0.0033,
      "num_tokens": 1951805.0,
      "reward": 0.03404291719198227,
      "reward_std": 0.018146470189094543,
      "rewards/bleu_reward_func/mean": 0.03404291719198227,
      "rewards/bleu_reward_func/std": 0.023315640166401863,
      "step": 146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 411.6875,
      "completions/mean_terminated_length": 282.71429443359375,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.1176,
      "grad_norm": 2.323500156402588,
      "kl": 0.0173492431640625,
      "learning_rate": 1e-06,
      "loss": 0.0301,
      "num_tokens": 1967539.0,
      "reward": 0.06629061698913574,
      "reward_std": 0.01657968759536743,
      "rewards/bleu_reward_func/mean": 0.06629061698913574,
      "rewards/bleu_reward_func/std": 0.07008767873048782,
      "step": 147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 478.0,
      "completions/mean_length": 300.6875,
      "completions/mean_terminated_length": 230.25,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 65.0,
      "epoch": 0.1184,
      "grad_norm": 2.4731006622314453,
      "kl": 0.022491455078125,
      "learning_rate": 1e-06,
      "loss": -0.0209,
      "num_tokens": 1980961.0,
      "reward": 0.05584581196308136,
      "reward_std": 0.016303110867738724,
      "rewards/bleu_reward_func/mean": 0.05584581196308136,
      "rewards/bleu_reward_func/std": 0.05100340396165848,
      "step": 148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 473.0,
      "completions/mean_length": 299.84375,
      "completions/mean_terminated_length": 240.4399871826172,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "epoch": 0.1192,
      "grad_norm": 2.91982364654541,
      "kl": 0.020263671875,
      "learning_rate": 1e-06,
      "loss": -0.0718,
      "num_tokens": 1993940.0,
      "reward": 0.048127830028533936,
      "reward_std": 0.01851847395300865,
      "rewards/bleu_reward_func/mean": 0.048127830028533936,
      "rewards/bleu_reward_func/std": 0.03433293104171753,
      "step": 149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 335.46875,
      "completions/mean_terminated_length": 276.625,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "epoch": 0.12,
      "grad_norm": 3.051020383834839,
      "kl": 0.01885986328125,
      "learning_rate": 1e-06,
      "loss": 0.0288,
      "num_tokens": 2007011.0,
      "reward": 0.04020792990922928,
      "reward_std": 0.008897930383682251,
      "rewards/bleu_reward_func/mean": 0.04020792990922928,
      "rewards/bleu_reward_func/std": 0.018972909078001976,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 366.625,
      "completions/mean_terminated_length": 300.54547119140625,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.1208,
      "grad_norm": 3.5532686710357666,
      "kl": 0.0150604248046875,
      "learning_rate": 1e-06,
      "loss": 0.1151,
      "num_tokens": 2025423.0,
      "reward": 0.05799319967627525,
      "reward_std": 0.03025471605360508,
      "rewards/bleu_reward_func/mean": 0.05799319967627525,
      "rewards/bleu_reward_func/std": 0.04186660796403885,
      "step": 151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 457.0625,
      "completions/mean_terminated_length": 402.125,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "epoch": 0.1216,
      "grad_norm": 2.035194158554077,
      "kl": 0.0230865478515625,
      "learning_rate": 1e-06,
      "loss": 0.0452,
      "num_tokens": 2043697.0,
      "reward": 0.06312853842973709,
      "reward_std": 0.014973493292927742,
      "rewards/bleu_reward_func/mean": 0.06312853842973709,
      "rewards/bleu_reward_func/std": 0.038368310779333115,
      "step": 152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 418.21875,
      "completions/mean_terminated_length": 381.5217590332031,
      "completions/min_length": 214.0,
      "completions/min_terminated_length": 214.0,
      "epoch": 0.1224,
      "grad_norm": 2.344296455383301,
      "kl": 0.0169830322265625,
      "learning_rate": 1e-06,
      "loss": 0.0044,
      "num_tokens": 2060936.0,
      "reward": 0.05173652246594429,
      "reward_std": 0.024875259026885033,
      "rewards/bleu_reward_func/mean": 0.05173652246594429,
      "rewards/bleu_reward_func/std": 0.027617480605840683,
      "step": 153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 387.96875,
      "completions/mean_terminated_length": 313.5500183105469,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 163.0,
      "epoch": 0.1232,
      "grad_norm": 2.4385931491851807,
      "kl": 0.020843505859375,
      "learning_rate": 1e-06,
      "loss": 0.0303,
      "num_tokens": 2076247.0,
      "reward": 0.09360536932945251,
      "reward_std": 0.023839600384235382,
      "rewards/bleu_reward_func/mean": 0.09360536932945251,
      "rewards/bleu_reward_func/std": 0.04197891801595688,
      "step": 154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 487.0,
      "completions/mean_length": 347.5625,
      "completions/mean_terminated_length": 292.75,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "epoch": 0.124,
      "grad_norm": 3.0927393436431885,
      "kl": 0.020721435546875,
      "learning_rate": 1e-06,
      "loss": -0.1683,
      "num_tokens": 2090385.0,
      "reward": 0.06330172717571259,
      "reward_std": 0.0384925901889801,
      "rewards/bleu_reward_func/mean": 0.06330172717571259,
      "rewards/bleu_reward_func/std": 0.05383189022541046,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 424.0,
      "completions/mean_length": 318.4375,
      "completions/mean_terminated_length": 253.9166717529297,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.1248,
      "grad_norm": 3.673290252685547,
      "kl": 0.02178955078125,
      "learning_rate": 1e-06,
      "loss": -0.0271,
      "num_tokens": 2104047.0,
      "reward": 0.05628419667482376,
      "reward_std": 0.024199776351451874,
      "rewards/bleu_reward_func/mean": 0.05628419667482376,
      "rewards/bleu_reward_func/std": 0.03944230079650879,
      "step": 156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 444.0,
      "completions/mean_length": 317.28125,
      "completions/mean_terminated_length": 200.4499969482422,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.1256,
      "grad_norm": 2.7515523433685303,
      "kl": 0.01515960693359375,
      "learning_rate": 1e-06,
      "loss": 0.0605,
      "num_tokens": 2116984.0,
      "reward": 0.03348308056592941,
      "reward_std": 0.02000669576227665,
      "rewards/bleu_reward_func/mean": 0.03348308056592941,
      "rewards/bleu_reward_func/std": 0.04217757657170296,
      "step": 157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 352.90625,
      "completions/mean_terminated_length": 336.4482727050781,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.1264,
      "grad_norm": 2.3758599758148193,
      "kl": 0.01812744140625,
      "learning_rate": 1e-06,
      "loss": 0.0262,
      "num_tokens": 2130381.0,
      "reward": 0.0528571754693985,
      "reward_std": 0.015917008742690086,
      "rewards/bleu_reward_func/mean": 0.0528571754693985,
      "rewards/bleu_reward_func/std": 0.03905298560857773,
      "step": 158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 428.53125,
      "completions/mean_terminated_length": 345.0625,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.1272,
      "grad_norm": 2.2279083728790283,
      "kl": 0.0218353271484375,
      "learning_rate": 1e-06,
      "loss": -0.0695,
      "num_tokens": 2146966.0,
      "reward": 0.07023762166500092,
      "reward_std": 0.022503603249788284,
      "rewards/bleu_reward_func/mean": 0.07023762166500092,
      "rewards/bleu_reward_func/std": 0.04653822258114815,
      "step": 159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 467.0,
      "completions/mean_length": 335.40625,
      "completions/mean_terminated_length": 276.54168701171875,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "epoch": 0.128,
      "grad_norm": 2.385481357574463,
      "kl": 0.01656341552734375,
      "learning_rate": 1e-06,
      "loss": 0.0463,
      "num_tokens": 2159947.0,
      "reward": 0.030797000974416733,
      "reward_std": 0.010636158287525177,
      "rewards/bleu_reward_func/mean": 0.030797000974416733,
      "rewards/bleu_reward_func/std": 0.012442766688764095,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 319.1875,
      "completions/mean_terminated_length": 283.4814758300781,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "epoch": 0.1288,
      "grad_norm": 2.474431037902832,
      "kl": 0.01513671875,
      "learning_rate": 1e-06,
      "loss": 0.099,
      "num_tokens": 2173321.0,
      "reward": 0.05447715148329735,
      "reward_std": 0.016968414187431335,
      "rewards/bleu_reward_func/mean": 0.05447715148329735,
      "rewards/bleu_reward_func/std": 0.04606984928250313,
      "step": 161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 283.40625,
      "completions/mean_terminated_length": 276.0322570800781,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.1296,
      "grad_norm": 2.662762403488159,
      "kl": 0.0158233642578125,
      "learning_rate": 1e-06,
      "loss": -0.0236,
      "num_tokens": 2185702.0,
      "reward": 0.029800117015838623,
      "reward_std": 0.011480635963380337,
      "rewards/bleu_reward_func/mean": 0.029800117015838623,
      "rewards/bleu_reward_func/std": 0.013534092344343662,
      "step": 162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 201.625,
      "completions/mean_terminated_length": 169.51724243164062,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.1304,
      "grad_norm": 3.084282398223877,
      "kl": 0.02069091796875,
      "learning_rate": 1e-06,
      "loss": -0.0555,
      "num_tokens": 2194330.0,
      "reward": 0.07677525281906128,
      "reward_std": 0.03891972452402115,
      "rewards/bleu_reward_func/mean": 0.07677525281906128,
      "rewards/bleu_reward_func/std": 0.08827344328165054,
      "step": 163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 437.0,
      "completions/mean_length": 281.3125,
      "completions/mean_terminated_length": 204.4166717529297,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.1312,
      "grad_norm": 3.4201643466949463,
      "kl": 0.019195556640625,
      "learning_rate": 1e-06,
      "loss": 0.077,
      "num_tokens": 2205572.0,
      "reward": 0.04335915669798851,
      "reward_std": 0.011742215603590012,
      "rewards/bleu_reward_func/mean": 0.04335915669798851,
      "rewards/bleu_reward_func/std": 0.02501273900270462,
      "step": 164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 394.0,
      "completions/max_terminated_length": 394.0,
      "completions/mean_length": 200.0625,
      "completions/mean_terminated_length": 200.0625,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.132,
      "grad_norm": 4.607831954956055,
      "kl": 0.0267181396484375,
      "learning_rate": 1e-06,
      "loss": -0.2349,
      "num_tokens": 2214158.0,
      "reward": 0.023556701838970184,
      "reward_std": 0.017645370215177536,
      "rewards/bleu_reward_func/mean": 0.023556701838970184,
      "rewards/bleu_reward_func/std": 0.026666147634387016,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 345.84375,
      "completions/mean_terminated_length": 334.7666931152344,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "epoch": 0.1328,
      "grad_norm": 2.454258918762207,
      "kl": 0.0163421630859375,
      "learning_rate": 1e-06,
      "loss": -0.0247,
      "num_tokens": 2227585.0,
      "reward": 0.04457944631576538,
      "reward_std": 0.015946604311466217,
      "rewards/bleu_reward_func/mean": 0.04457944631576538,
      "rewards/bleu_reward_func/std": 0.027206508442759514,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 366.84375,
      "completions/mean_terminated_length": 339.96295166015625,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.1336,
      "grad_norm": 2.361379384994507,
      "kl": 0.015533447265625,
      "learning_rate": 1e-06,
      "loss": -0.0269,
      "num_tokens": 2242860.0,
      "reward": 0.03885602205991745,
      "reward_std": 0.016611171886324883,
      "rewards/bleu_reward_func/mean": 0.03885602205991745,
      "rewards/bleu_reward_func/std": 0.02544359117746353,
      "step": 167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 438.0,
      "completions/max_terminated_length": 438.0,
      "completions/mean_length": 179.0625,
      "completions/mean_terminated_length": 179.0625,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.1344,
      "grad_norm": 3.451462507247925,
      "kl": 0.0236968994140625,
      "learning_rate": 1e-06,
      "loss": 0.0097,
      "num_tokens": 2251926.0,
      "reward": 0.08403593301773071,
      "reward_std": 0.0387713760137558,
      "rewards/bleu_reward_func/mean": 0.08403593301773071,
      "rewards/bleu_reward_func/std": 0.058938704431056976,
      "step": 168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 418.0,
      "completions/max_terminated_length": 418.0,
      "completions/mean_length": 250.9375,
      "completions/mean_terminated_length": 250.9375,
      "completions/min_length": 81.0,
      "completions/min_terminated_length": 81.0,
      "epoch": 0.1352,
      "grad_norm": 3.0590903759002686,
      "kl": 0.018585205078125,
      "learning_rate": 1e-06,
      "loss": 0.0594,
      "num_tokens": 2262204.0,
      "reward": 0.1359768509864807,
      "reward_std": 0.030772076919674873,
      "rewards/bleu_reward_func/mean": 0.1359768509864807,
      "rewards/bleu_reward_func/std": 0.11267537623643875,
      "step": 169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 282.75,
      "completions/mean_terminated_length": 275.3548278808594,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "epoch": 0.136,
      "grad_norm": 2.741846799850464,
      "kl": 0.0178985595703125,
      "learning_rate": 1e-06,
      "loss": -0.0933,
      "num_tokens": 2273812.0,
      "reward": 0.06083029881119728,
      "reward_std": 0.046626534312963486,
      "rewards/bleu_reward_func/mean": 0.06083029881119728,
      "rewards/bleu_reward_func/std": 0.09518548846244812,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 481.84375,
      "completions/mean_terminated_length": 437.7692565917969,
      "completions/min_length": 363.0,
      "completions/min_terminated_length": 363.0,
      "epoch": 0.1368,
      "grad_norm": 2.0689496994018555,
      "kl": 0.02508544921875,
      "learning_rate": 1e-06,
      "loss": -0.0084,
      "num_tokens": 2293239.0,
      "reward": 0.02994382753968239,
      "reward_std": 0.006383362226188183,
      "rewards/bleu_reward_func/mean": 0.02994382753968239,
      "rewards/bleu_reward_func/std": 0.013090057298541069,
      "step": 171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 473.0,
      "completions/mean_length": 439.8125,
      "completions/mean_terminated_length": 358.0000305175781,
      "completions/min_length": 226.0,
      "completions/min_terminated_length": 226.0,
      "epoch": 0.1376,
      "grad_norm": 2.1585867404937744,
      "kl": 0.020751953125,
      "learning_rate": 1e-06,
      "loss": -0.0719,
      "num_tokens": 2311881.0,
      "reward": 0.06273654103279114,
      "reward_std": 0.016566328704357147,
      "rewards/bleu_reward_func/mean": 0.06273654103279114,
      "rewards/bleu_reward_func/std": 0.062433164566755295,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 353.25,
      "completions/mean_terminated_length": 281.0909118652344,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 0.1384,
      "grad_norm": 3.0912060737609863,
      "kl": 0.026611328125,
      "learning_rate": 1e-06,
      "loss": 0.0097,
      "num_tokens": 2329057.0,
      "reward": 0.04469408839941025,
      "reward_std": 0.013722876086831093,
      "rewards/bleu_reward_func/mean": 0.04469408839941025,
      "rewards/bleu_reward_func/std": 0.039968349039554596,
      "step": 173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 417.84375,
      "completions/mean_terminated_length": 361.3500061035156,
      "completions/min_length": 264.0,
      "completions/min_terminated_length": 264.0,
      "epoch": 0.1392,
      "grad_norm": 2.3754167556762695,
      "kl": 0.0189361572265625,
      "learning_rate": 1e-06,
      "loss": -0.0304,
      "num_tokens": 2345180.0,
      "reward": 0.07604481279850006,
      "reward_std": 0.01629452034831047,
      "rewards/bleu_reward_func/mean": 0.07604481279850006,
      "rewards/bleu_reward_func/std": 0.07586659491062164,
      "step": 174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 431.0,
      "completions/max_terminated_length": 431.0,
      "completions/mean_length": 165.6875,
      "completions/mean_terminated_length": 165.6875,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.14,
      "grad_norm": 5.492390155792236,
      "kl": 0.02996826171875,
      "learning_rate": 1e-06,
      "loss": 0.1718,
      "num_tokens": 2354442.0,
      "reward": 0.03355713561177254,
      "reward_std": 0.017250124365091324,
      "rewards/bleu_reward_func/mean": 0.03355713561177254,
      "rewards/bleu_reward_func/std": 0.020392760634422302,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 426.0,
      "completions/mean_length": 331.40625,
      "completions/mean_terminated_length": 289.73077392578125,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.1408,
      "grad_norm": 2.441545009613037,
      "kl": 0.02288818359375,
      "learning_rate": 1e-06,
      "loss": -0.0219,
      "num_tokens": 2368495.0,
      "reward": 0.03965570032596588,
      "reward_std": 0.01631091721355915,
      "rewards/bleu_reward_func/mean": 0.03965570032596588,
      "rewards/bleu_reward_func/std": 0.02324427105486393,
      "step": 176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 383.6875,
      "completions/mean_terminated_length": 354.0769348144531,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "epoch": 0.1416,
      "grad_norm": 2.1311628818511963,
      "kl": 0.0158843994140625,
      "learning_rate": 1e-06,
      "loss": -0.0019,
      "num_tokens": 2382621.0,
      "reward": 0.06360460817813873,
      "reward_std": 0.035029761493206024,
      "rewards/bleu_reward_func/mean": 0.06360460817813873,
      "rewards/bleu_reward_func/std": 0.052434373646974564,
      "step": 177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 451.0,
      "completions/mean_length": 298.0625,
      "completions/mean_terminated_length": 238.1599884033203,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "epoch": 0.1424,
      "grad_norm": 2.817237138748169,
      "kl": 0.01739501953125,
      "learning_rate": 1e-06,
      "loss": -0.0277,
      "num_tokens": 2396607.0,
      "reward": 0.08697853982448578,
      "reward_std": 0.02260083705186844,
      "rewards/bleu_reward_func/mean": 0.08697853982448578,
      "rewards/bleu_reward_func/std": 0.08185648173093796,
      "step": 178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 388.0,
      "completions/mean_length": 279.375,
      "completions/mean_terminated_length": 188.3478240966797,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.1432,
      "grad_norm": 2.8254637718200684,
      "kl": 0.018768310546875,
      "learning_rate": 1e-06,
      "loss": 0.1222,
      "num_tokens": 2408243.0,
      "reward": 0.029145658016204834,
      "reward_std": 0.011095807887613773,
      "rewards/bleu_reward_func/mean": 0.029145658016204834,
      "rewards/bleu_reward_func/std": 0.021273698657751083,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 337.46875,
      "completions/mean_terminated_length": 246.04762268066406,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.144,
      "grad_norm": 3.114673376083374,
      "kl": 0.02655029296875,
      "learning_rate": 1e-06,
      "loss": 0.2521,
      "num_tokens": 2422274.0,
      "reward": 0.03623339533805847,
      "reward_std": 0.023222438991069794,
      "rewards/bleu_reward_func/mean": 0.03623339533805847,
      "rewards/bleu_reward_func/std": 0.03441086784005165,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 413.0,
      "completions/max_terminated_length": 413.0,
      "completions/mean_length": 164.53125,
      "completions/mean_terminated_length": 164.53125,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.1448,
      "grad_norm": 4.906919479370117,
      "kl": 0.0315399169921875,
      "learning_rate": 1e-06,
      "loss": 0.0263,
      "num_tokens": 2430475.0,
      "reward": 0.06976894289255142,
      "reward_std": 0.027743544429540634,
      "rewards/bleu_reward_func/mean": 0.06976894289255142,
      "rewards/bleu_reward_func/std": 0.06131015717983246,
      "step": 181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 487.0,
      "completions/mean_length": 244.28125,
      "completions/mean_terminated_length": 182.5,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.1456,
      "grad_norm": 4.8848371505737305,
      "kl": 0.01995849609375,
      "learning_rate": 1e-06,
      "loss": 0.0627,
      "num_tokens": 2442788.0,
      "reward": 0.1191171407699585,
      "reward_std": 0.042263854295015335,
      "rewards/bleu_reward_func/mean": 0.1191171407699585,
      "rewards/bleu_reward_func/std": 0.10318046808242798,
      "step": 182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 321.875,
      "completions/mean_terminated_length": 247.478271484375,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "epoch": 0.1464,
      "grad_norm": 2.6523852348327637,
      "kl": 0.01776123046875,
      "learning_rate": 1e-06,
      "loss": 0.0565,
      "num_tokens": 2455648.0,
      "reward": 0.04977214336395264,
      "reward_std": 0.016601046547293663,
      "rewards/bleu_reward_func/mean": 0.04977214336395264,
      "rewards/bleu_reward_func/std": 0.0421786792576313,
      "step": 183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 370.09375,
      "completions/mean_terminated_length": 349.8214416503906,
      "completions/min_length": 229.0,
      "completions/min_terminated_length": 229.0,
      "epoch": 0.1472,
      "grad_norm": 2.1785075664520264,
      "kl": 0.018157958984375,
      "learning_rate": 1e-06,
      "loss": -0.0337,
      "num_tokens": 2470147.0,
      "reward": 0.10260511934757233,
      "reward_std": 0.01860986091196537,
      "rewards/bleu_reward_func/mean": 0.10260511934757233,
      "rewards/bleu_reward_func/std": 0.10457844287157059,
      "step": 184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 487.0,
      "completions/mean_length": 346.71875,
      "completions/mean_terminated_length": 323.1071472167969,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "epoch": 0.148,
      "grad_norm": 2.3310489654541016,
      "kl": 0.0196685791015625,
      "learning_rate": 1e-06,
      "loss": 0.0781,
      "num_tokens": 2485410.0,
      "reward": 0.049770474433898926,
      "reward_std": 0.022042104974389076,
      "rewards/bleu_reward_func/mean": 0.049770474433898926,
      "rewards/bleu_reward_func/std": 0.06963635981082916,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 330.71875,
      "completions/mean_terminated_length": 270.29168701171875,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.1488,
      "grad_norm": 2.7905642986297607,
      "kl": 0.0135955810546875,
      "learning_rate": 1e-06,
      "loss": -0.1012,
      "num_tokens": 2498737.0,
      "reward": 0.060851939022541046,
      "reward_std": 0.04031149670481682,
      "rewards/bleu_reward_func/mean": 0.060851939022541046,
      "rewards/bleu_reward_func/std": 0.06394880264997482,
      "step": 186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 356.8125,
      "completions/mean_terminated_length": 305.0833435058594,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "epoch": 0.1496,
      "grad_norm": 2.6979751586914062,
      "kl": 0.020355224609375,
      "learning_rate": 1e-06,
      "loss": 0.0853,
      "num_tokens": 2512435.0,
      "reward": 0.025785673409700394,
      "reward_std": 0.008767616003751755,
      "rewards/bleu_reward_func/mean": 0.025785673409700394,
      "rewards/bleu_reward_func/std": 0.016393397003412247,
      "step": 187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 315.8125,
      "completions/mean_terminated_length": 309.4838562011719,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "epoch": 0.1504,
      "grad_norm": 2.6376216411590576,
      "kl": 0.0171051025390625,
      "learning_rate": 1e-06,
      "loss": 0.1299,
      "num_tokens": 2524845.0,
      "reward": 0.061950668692588806,
      "reward_std": 0.029896268621087074,
      "rewards/bleu_reward_func/mean": 0.061950668692588806,
      "rewards/bleu_reward_func/std": 0.0432080440223217,
      "step": 188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 236.9375,
      "completions/mean_terminated_length": 129.30435180664062,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.1512,
      "grad_norm": 5.452763557434082,
      "kl": 0.0180816650390625,
      "learning_rate": 1e-06,
      "loss": 0.0144,
      "num_tokens": 2535291.0,
      "reward": 0.06512497365474701,
      "reward_std": 0.021872583776712418,
      "rewards/bleu_reward_func/mean": 0.06512497365474701,
      "rewards/bleu_reward_func/std": 0.05072392150759697,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 361.5625,
      "completions/mean_terminated_length": 319.44000244140625,
      "completions/min_length": 81.0,
      "completions/min_terminated_length": 81.0,
      "epoch": 0.152,
      "grad_norm": 2.5568687915802,
      "kl": 0.021331787109375,
      "learning_rate": 1e-06,
      "loss": -0.078,
      "num_tokens": 2549005.0,
      "reward": 0.03104579634964466,
      "reward_std": 0.014428281225264072,
      "rewards/bleu_reward_func/mean": 0.03104579634964466,
      "rewards/bleu_reward_func/std": 0.023532235994935036,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 288.53125,
      "completions/mean_terminated_length": 247.1481475830078,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.1528,
      "grad_norm": 3.435012102127075,
      "kl": 0.019561767578125,
      "learning_rate": 1e-06,
      "loss": 0.1562,
      "num_tokens": 2560494.0,
      "reward": 0.028866298496723175,
      "reward_std": 0.013667687773704529,
      "rewards/bleu_reward_func/mean": 0.028866298496723175,
      "rewards/bleu_reward_func/std": 0.014043555594980717,
      "step": 191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 442.0,
      "completions/mean_length": 364.53125,
      "completions/mean_terminated_length": 297.5,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "epoch": 0.1536,
      "grad_norm": 2.173910617828369,
      "kl": 0.0178375244140625,
      "learning_rate": 1e-06,
      "loss": 0.0114,
      "num_tokens": 2575791.0,
      "reward": 0.03910418599843979,
      "reward_std": 0.013818096369504929,
      "rewards/bleu_reward_func/mean": 0.03910418599843979,
      "rewards/bleu_reward_func/std": 0.014301484450697899,
      "step": 192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 470.0,
      "completions/mean_length": 236.0625,
      "completions/mean_terminated_length": 207.51724243164062,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.1544,
      "grad_norm": 4.14415979385376,
      "kl": 0.0251617431640625,
      "learning_rate": 1e-06,
      "loss": 0.0366,
      "num_tokens": 2586057.0,
      "reward": 0.07592535018920898,
      "reward_std": 0.04757307469844818,
      "rewards/bleu_reward_func/mean": 0.07592535018920898,
      "rewards/bleu_reward_func/std": 0.10841362178325653,
      "step": 193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 478.0,
      "completions/mean_length": 453.375,
      "completions/mean_terminated_length": 303.5555725097656,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.1552,
      "grad_norm": 2.0680158138275146,
      "kl": 0.0290679931640625,
      "learning_rate": 1e-06,
      "loss": 0.0241,
      "num_tokens": 2606325.0,
      "reward": 0.031244732439517975,
      "reward_std": 0.01845286600291729,
      "rewards/bleu_reward_func/mean": 0.031244732439517975,
      "rewards/bleu_reward_func/std": 0.03221140056848526,
      "step": 194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 307.53125,
      "completions/mean_terminated_length": 214.59091186523438,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.156,
      "grad_norm": 4.693017482757568,
      "kl": 0.03253173828125,
      "learning_rate": 1e-06,
      "loss": -0.081,
      "num_tokens": 2618206.0,
      "reward": 0.043432123959064484,
      "reward_std": 0.020170938223600388,
      "rewards/bleu_reward_func/mean": 0.043432123959064484,
      "rewards/bleu_reward_func/std": 0.024602122604846954,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 511.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 255.125,
      "completions/mean_terminated_length": 255.125,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "epoch": 0.1568,
      "grad_norm": 3.0103824138641357,
      "kl": 0.01934814453125,
      "learning_rate": 1e-06,
      "loss": -0.0563,
      "num_tokens": 2628394.0,
      "reward": 0.05487871170043945,
      "reward_std": 0.022487737238407135,
      "rewards/bleu_reward_func/mean": 0.05487871170043945,
      "rewards/bleu_reward_func/std": 0.04914577677845955,
      "step": 196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 424.0,
      "completions/mean_length": 300.5,
      "completions/mean_terminated_length": 217.7391357421875,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.1576,
      "grad_norm": 3.266918659210205,
      "kl": 0.0201568603515625,
      "learning_rate": 1e-06,
      "loss": 0.0096,
      "num_tokens": 2642122.0,
      "reward": 0.10000570863485336,
      "reward_std": 0.027525175362825394,
      "rewards/bleu_reward_func/mean": 0.10000570863485336,
      "rewards/bleu_reward_func/std": 0.06606002897024155,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 322.4375,
      "completions/mean_terminated_length": 269.3599853515625,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "epoch": 0.1584,
      "grad_norm": 2.755803108215332,
      "kl": 0.0229949951171875,
      "learning_rate": 1e-06,
      "loss": -0.1242,
      "num_tokens": 2654848.0,
      "reward": 0.04044795408844948,
      "reward_std": 0.017633788287639618,
      "rewards/bleu_reward_func/mean": 0.04044795408844948,
      "rewards/bleu_reward_func/std": 0.02563118189573288,
      "step": 198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 289.90625,
      "completions/mean_terminated_length": 248.7777862548828,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.1592,
      "grad_norm": 3.5435428619384766,
      "kl": 0.014862060546875,
      "learning_rate": 1e-06,
      "loss": -0.0525,
      "num_tokens": 2666789.0,
      "reward": 0.18700216710567474,
      "reward_std": 0.06094446778297424,
      "rewards/bleu_reward_func/mean": 0.18700216710567474,
      "rewards/bleu_reward_func/std": 0.12359358370304108,
      "step": 199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 417.0,
      "completions/mean_length": 435.21875,
      "completions/mean_terminated_length": 239.0,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "epoch": 0.16,
      "grad_norm": 2.0920848846435547,
      "kl": 0.0169677734375,
      "learning_rate": 1e-06,
      "loss": -0.0518,
      "num_tokens": 2683212.0,
      "reward": 0.04426024854183197,
      "reward_std": 0.022208159789443016,
      "rewards/bleu_reward_func/mean": 0.04426024854183197,
      "rewards/bleu_reward_func/std": 0.03876553475856781,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 478.0,
      "completions/mean_length": 234.34375,
      "completions/mean_terminated_length": 205.6206817626953,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.1608,
      "grad_norm": 3.487138032913208,
      "kl": 0.01708984375,
      "learning_rate": 1e-06,
      "loss": -0.0069,
      "num_tokens": 2693463.0,
      "reward": 0.05370340123772621,
      "reward_std": 0.03217038884758949,
      "rewards/bleu_reward_func/mean": 0.05370340123772621,
      "rewards/bleu_reward_func/std": 0.0736880749464035,
      "step": 201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 429.0,
      "completions/max_terminated_length": 429.0,
      "completions/mean_length": 243.65625,
      "completions/mean_terminated_length": 243.65625,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.1616,
      "grad_norm": 3.3060476779937744,
      "kl": 0.022186279296875,
      "learning_rate": 1e-06,
      "loss": -0.0214,
      "num_tokens": 2707276.0,
      "reward": 0.052934836596250534,
      "reward_std": 0.0255296491086483,
      "rewards/bleu_reward_func/mean": 0.052934836596250534,
      "rewards/bleu_reward_func/std": 0.04204695671796799,
      "step": 202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 469.0,
      "completions/mean_length": 365.71875,
      "completions/mean_terminated_length": 236.64706420898438,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.1624,
      "grad_norm": 2.4367294311523438,
      "kl": 0.0296630859375,
      "learning_rate": 1e-06,
      "loss": 0.0339,
      "num_tokens": 2724531.0,
      "reward": 0.024516377598047256,
      "reward_std": 0.00745509285479784,
      "rewards/bleu_reward_func/mean": 0.024516377598047256,
      "rewards/bleu_reward_func/std": 0.017017923295497894,
      "step": 203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 472.9375,
      "completions/mean_terminated_length": 398.3636474609375,
      "completions/min_length": 278.0,
      "completions/min_terminated_length": 278.0,
      "epoch": 0.1632,
      "grad_norm": 1.9994412660598755,
      "kl": 0.021209716796875,
      "learning_rate": 1e-06,
      "loss": -0.0193,
      "num_tokens": 2744433.0,
      "reward": 0.015427513048052788,
      "reward_std": 0.0040624747052788734,
      "rewards/bleu_reward_func/mean": 0.015427513048052788,
      "rewards/bleu_reward_func/std": 0.012431508861482143,
      "step": 204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 330.21875,
      "completions/mean_terminated_length": 188.8333282470703,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "epoch": 0.164,
      "grad_norm": 3.1568257808685303,
      "kl": 0.01837158203125,
      "learning_rate": 1e-06,
      "loss": -0.0931,
      "num_tokens": 2757264.0,
      "reward": 0.07403382658958435,
      "reward_std": 0.02855892851948738,
      "rewards/bleu_reward_func/mean": 0.07403382658958435,
      "rewards/bleu_reward_func/std": 0.056942496448755264,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 428.0,
      "completions/mean_length": 385.90625,
      "completions/mean_terminated_length": 259.8125,
      "completions/min_length": 81.0,
      "completions/min_terminated_length": 81.0,
      "epoch": 0.1648,
      "grad_norm": 2.4639594554901123,
      "kl": 0.022552490234375,
      "learning_rate": 1e-06,
      "loss": 0.096,
      "num_tokens": 2773037.0,
      "reward": 0.05201449990272522,
      "reward_std": 0.012545755133032799,
      "rewards/bleu_reward_func/mean": 0.05201449990272522,
      "rewards/bleu_reward_func/std": 0.03619503602385521,
      "step": 206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 462.0,
      "completions/mean_length": 214.53125,
      "completions/mean_terminated_length": 204.9354705810547,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.1656,
      "grad_norm": 7.2300615310668945,
      "kl": 0.0164642333984375,
      "learning_rate": 1e-06,
      "loss": -0.0894,
      "num_tokens": 2782534.0,
      "reward": 0.05154382437467575,
      "reward_std": 0.02355325222015381,
      "rewards/bleu_reward_func/mean": 0.05154382437467575,
      "rewards/bleu_reward_func/std": 0.03372048959136009,
      "step": 207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 465.0,
      "completions/mean_length": 247.4375,
      "completions/mean_terminated_length": 173.36000061035156,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.1664,
      "grad_norm": 4.72393274307251,
      "kl": 0.02191162109375,
      "learning_rate": 1e-06,
      "loss": 0.0464,
      "num_tokens": 2795668.0,
      "reward": 0.08009414374828339,
      "reward_std": 0.04780849814414978,
      "rewards/bleu_reward_func/mean": 0.08009414374828339,
      "rewards/bleu_reward_func/std": 0.11779887974262238,
      "step": 208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 477.0,
      "completions/mean_length": 351.59375,
      "completions/mean_terminated_length": 306.67999267578125,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.1672,
      "grad_norm": 3.04226016998291,
      "kl": 0.0213470458984375,
      "learning_rate": 1e-06,
      "loss": 0.1197,
      "num_tokens": 2809247.0,
      "reward": 0.07256356626749039,
      "reward_std": 0.018727965652942657,
      "rewards/bleu_reward_func/mean": 0.07256356626749039,
      "rewards/bleu_reward_func/std": 0.03669372946023941,
      "step": 209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 473.0,
      "completions/mean_length": 225.25,
      "completions/mean_terminated_length": 184.2857208251953,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.168,
      "grad_norm": 4.737229824066162,
      "kl": 0.02362060546875,
      "learning_rate": 1e-06,
      "loss": 0.1422,
      "num_tokens": 2818399.0,
      "reward": 0.035233426839113235,
      "reward_std": 0.01539241336286068,
      "rewards/bleu_reward_func/mean": 0.035233426839113235,
      "rewards/bleu_reward_func/std": 0.018226031213998795,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 450.0,
      "completions/mean_length": 352.84375,
      "completions/mean_terminated_length": 243.94737243652344,
      "completions/min_length": 82.0,
      "completions/min_terminated_length": 82.0,
      "epoch": 0.1688,
      "grad_norm": 3.0926690101623535,
      "kl": 0.017730712890625,
      "learning_rate": 1e-06,
      "loss": -0.0648,
      "num_tokens": 2833410.0,
      "reward": 0.05755448341369629,
      "reward_std": 0.025512943044304848,
      "rewards/bleu_reward_func/mean": 0.05755448341369629,
      "rewards/bleu_reward_func/std": 0.09508957713842392,
      "step": 211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 330.6875,
      "completions/mean_terminated_length": 288.8461608886719,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 73.0,
      "epoch": 0.1696,
      "grad_norm": 2.5703189373016357,
      "kl": 0.01244354248046875,
      "learning_rate": 1e-06,
      "loss": 0.0727,
      "num_tokens": 2848008.0,
      "reward": 0.1348918080329895,
      "reward_std": 0.10722550749778748,
      "rewards/bleu_reward_func/mean": 0.1348918080329895,
      "rewards/bleu_reward_func/std": 0.20862343907356262,
      "step": 212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 367.0,
      "completions/max_terminated_length": 367.0,
      "completions/mean_length": 187.375,
      "completions/mean_terminated_length": 187.375,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.1704,
      "grad_norm": 3.83489990234375,
      "kl": 0.0211029052734375,
      "learning_rate": 1e-06,
      "loss": -0.1678,
      "num_tokens": 2856124.0,
      "reward": 0.08521190285682678,
      "reward_std": 0.04054812341928482,
      "rewards/bleu_reward_func/mean": 0.08521190285682678,
      "rewards/bleu_reward_func/std": 0.050891559571027756,
      "step": 213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 325.4375,
      "completions/mean_terminated_length": 273.1999816894531,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "epoch": 0.1712,
      "grad_norm": 2.281848192214966,
      "kl": 0.0146026611328125,
      "learning_rate": 1e-06,
      "loss": 0.0584,
      "num_tokens": 2868610.0,
      "reward": 0.03156504034996033,
      "reward_std": 0.00848651397973299,
      "rewards/bleu_reward_func/mean": 0.03156504034996033,
      "rewards/bleu_reward_func/std": 0.024025922641158104,
      "step": 214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 463.0,
      "completions/mean_length": 391.125,
      "completions/mean_terminated_length": 254.1333465576172,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.172,
      "grad_norm": 2.294666290283203,
      "kl": 0.0182952880859375,
      "learning_rate": 1e-06,
      "loss": -0.023,
      "num_tokens": 2884318.0,
      "reward": 0.06922988593578339,
      "reward_std": 0.021780148148536682,
      "rewards/bleu_reward_func/mean": 0.06922988593578339,
      "rewards/bleu_reward_func/std": 0.060424305498600006,
      "step": 215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 319.6875,
      "completions/mean_terminated_length": 292.21429443359375,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.1728,
      "grad_norm": 2.571230411529541,
      "kl": 0.016632080078125,
      "learning_rate": 1e-06,
      "loss": 0.1612,
      "num_tokens": 2896860.0,
      "reward": 0.09209141135215759,
      "reward_std": 0.04961652681231499,
      "rewards/bleu_reward_func/mean": 0.09209141135215759,
      "rewards/bleu_reward_func/std": 0.0983605682849884,
      "step": 216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 201.0625,
      "completions/mean_terminated_length": 156.6428680419922,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.1736,
      "grad_norm": 4.5812458992004395,
      "kl": 0.0264892578125,
      "learning_rate": 1e-06,
      "loss": -0.0075,
      "num_tokens": 2910246.0,
      "reward": 0.03037147969007492,
      "reward_std": 0.01431269571185112,
      "rewards/bleu_reward_func/mean": 0.03037147969007492,
      "rewards/bleu_reward_func/std": 0.018613692373037338,
      "step": 217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 312.5,
      "completions/mean_terminated_length": 176.0,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.1744,
      "grad_norm": 2.713181734085083,
      "kl": 0.0229949951171875,
      "learning_rate": 1e-06,
      "loss": -0.0948,
      "num_tokens": 2923078.0,
      "reward": 0.026773083955049515,
      "reward_std": 0.013973203487694263,
      "rewards/bleu_reward_func/mean": 0.026773083955049515,
      "rewards/bleu_reward_func/std": 0.018786389380693436,
      "step": 218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 379.125,
      "completions/mean_terminated_length": 246.25,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 78.0,
      "epoch": 0.1752,
      "grad_norm": 2.995640516281128,
      "kl": 0.01776123046875,
      "learning_rate": 1e-06,
      "loss": -0.0257,
      "num_tokens": 2938594.0,
      "reward": 0.046795397996902466,
      "reward_std": 0.032065290957689285,
      "rewards/bleu_reward_func/mean": 0.046795397996902466,
      "rewards/bleu_reward_func/std": 0.036055758595466614,
      "step": 219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 256.875,
      "completions/mean_terminated_length": 239.86668395996094,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.176,
      "grad_norm": 3.257596492767334,
      "kl": 0.020233154296875,
      "learning_rate": 1e-06,
      "loss": -0.1987,
      "num_tokens": 2950486.0,
      "reward": 0.04207265004515648,
      "reward_std": 0.015948571264743805,
      "rewards/bleu_reward_func/mean": 0.04207265004515648,
      "rewards/bleu_reward_func/std": 0.02378344163298607,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 403.0,
      "completions/mean_length": 303.3125,
      "completions/mean_terminated_length": 221.6521759033203,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.1768,
      "grad_norm": 2.1766207218170166,
      "kl": 0.01788330078125,
      "learning_rate": 1e-06,
      "loss": -0.0255,
      "num_tokens": 2963320.0,
      "reward": 0.02319950982928276,
      "reward_std": 0.02059568464756012,
      "rewards/bleu_reward_func/mean": 0.02319950982928276,
      "rewards/bleu_reward_func/std": 0.024123726412653923,
      "step": 221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 476.0,
      "completions/mean_length": 249.3125,
      "completions/mean_terminated_length": 231.80001831054688,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "epoch": 0.1776,
      "grad_norm": 3.291715621948242,
      "kl": 0.018707275390625,
      "learning_rate": 1e-06,
      "loss": -0.0471,
      "num_tokens": 2973706.0,
      "reward": 0.03604161739349365,
      "reward_std": 0.0192702729254961,
      "rewards/bleu_reward_func/mean": 0.03604161739349365,
      "rewards/bleu_reward_func/std": 0.03049510158598423,
      "step": 222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 505.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 253.03125,
      "completions/mean_terminated_length": 253.03125,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.1784,
      "grad_norm": 4.5163798332214355,
      "kl": 0.0269317626953125,
      "learning_rate": 1e-06,
      "loss": 0.1078,
      "num_tokens": 2984139.0,
      "reward": 0.08039151877164841,
      "reward_std": 0.03706767037510872,
      "rewards/bleu_reward_func/mean": 0.08039151877164841,
      "rewards/bleu_reward_func/std": 0.08994851261377335,
      "step": 223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 314.8125,
      "completions/mean_terminated_length": 249.08334350585938,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.1792,
      "grad_norm": 2.842648506164551,
      "kl": 0.0249176025390625,
      "learning_rate": 1e-06,
      "loss": 0.0148,
      "num_tokens": 2997189.0,
      "reward": 0.05913674458861351,
      "reward_std": 0.020169682800769806,
      "rewards/bleu_reward_func/mean": 0.05913674458861351,
      "rewards/bleu_reward_func/std": 0.03278661519289017,
      "step": 224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 325.4375,
      "completions/mean_terminated_length": 114.00000762939453,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.18,
      "grad_norm": 3.618607521057129,
      "kl": 0.026611328125,
      "learning_rate": 1e-06,
      "loss": -0.0644,
      "num_tokens": 3012395.0,
      "reward": 0.06164587289094925,
      "reward_std": 0.038472697138786316,
      "rewards/bleu_reward_func/mean": 0.06164587289094925,
      "rewards/bleu_reward_func/std": 0.07648678123950958,
      "step": 225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 495.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 192.96875,
      "completions/mean_terminated_length": 192.96875,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.1808,
      "grad_norm": 4.200687408447266,
      "kl": 0.02935791015625,
      "learning_rate": 1e-06,
      "loss": 0.2838,
      "num_tokens": 3020850.0,
      "reward": 0.06510348618030548,
      "reward_std": 0.03152220696210861,
      "rewards/bleu_reward_func/mean": 0.06510348618030548,
      "rewards/bleu_reward_func/std": 0.05109791085124016,
      "step": 226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 339.53125,
      "completions/mean_terminated_length": 299.73077392578125,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "epoch": 0.1816,
      "grad_norm": 2.5614731311798096,
      "kl": 0.020538330078125,
      "learning_rate": 1e-06,
      "loss": -0.0955,
      "num_tokens": 3034523.0,
      "reward": 0.02740243449807167,
      "reward_std": 0.013225449249148369,
      "rewards/bleu_reward_func/mean": 0.02740243449807167,
      "rewards/bleu_reward_func/std": 0.022882074117660522,
      "step": 227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 458.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 195.5625,
      "completions/mean_terminated_length": 195.5625,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.1824,
      "grad_norm": 3.117429494857788,
      "kl": 0.027130126953125,
      "learning_rate": 1e-06,
      "loss": 0.0859,
      "num_tokens": 3046133.0,
      "reward": 0.04862482473254204,
      "reward_std": 0.032265372574329376,
      "rewards/bleu_reward_func/mean": 0.04862482473254204,
      "rewards/bleu_reward_func/std": 0.050319138914346695,
      "step": 228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 477.0,
      "completions/mean_length": 256.40625,
      "completions/mean_terminated_length": 239.36668395996094,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.1832,
      "grad_norm": 2.8993618488311768,
      "kl": 0.0225982666015625,
      "learning_rate": 1e-06,
      "loss": -0.021,
      "num_tokens": 3058066.0,
      "reward": 0.07180735468864441,
      "reward_std": 0.03843909874558449,
      "rewards/bleu_reward_func/mean": 0.07180735468864441,
      "rewards/bleu_reward_func/std": 0.09140986949205399,
      "step": 229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 271.0625,
      "completions/mean_terminated_length": 126.5,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.184,
      "grad_norm": 7.003058910369873,
      "kl": 0.0565185546875,
      "learning_rate": 1e-06,
      "loss": -0.0465,
      "num_tokens": 3070092.0,
      "reward": 0.05519847571849823,
      "reward_std": 0.015686171129345894,
      "rewards/bleu_reward_func/mean": 0.05519847571849823,
      "rewards/bleu_reward_func/std": 0.02879628911614418,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 334.15625,
      "completions/mean_terminated_length": 315.75860595703125,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.1848,
      "grad_norm": 2.597579002380371,
      "kl": 0.021514892578125,
      "learning_rate": 1e-06,
      "loss": -0.011,
      "num_tokens": 3083105.0,
      "reward": 0.057620543986558914,
      "reward_std": 0.02059962786734104,
      "rewards/bleu_reward_func/mean": 0.057620543986558914,
      "rewards/bleu_reward_func/std": 0.03736231103539467,
      "step": 231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 198.03125,
      "completions/mean_terminated_length": 177.10000610351562,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.1856,
      "grad_norm": 3.682512044906616,
      "kl": 0.02569580078125,
      "learning_rate": 1e-06,
      "loss": 0.2759,
      "num_tokens": 3091698.0,
      "reward": 0.052232254296541214,
      "reward_std": 0.035380616784095764,
      "rewards/bleu_reward_func/mean": 0.052232254296541214,
      "rewards/bleu_reward_func/std": 0.06341397017240524,
      "step": 232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 316.3125,
      "completions/mean_terminated_length": 310.0,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "epoch": 0.1864,
      "grad_norm": 2.698293685913086,
      "kl": 0.0152130126953125,
      "learning_rate": 1e-06,
      "loss": 0.1178,
      "num_tokens": 3104156.0,
      "reward": 0.0916142389178276,
      "reward_std": 0.04101229086518288,
      "rewards/bleu_reward_func/mean": 0.0916142389178276,
      "rewards/bleu_reward_func/std": 0.10220352560281754,
      "step": 233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 365.0,
      "completions/mean_length": 232.4375,
      "completions/mean_terminated_length": 203.51724243164062,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.1872,
      "grad_norm": 3.543837785720825,
      "kl": 0.023590087890625,
      "learning_rate": 1e-06,
      "loss": 0.0425,
      "num_tokens": 3113882.0,
      "reward": 0.035266127437353134,
      "reward_std": 0.011640656739473343,
      "rewards/bleu_reward_func/mean": 0.035266127437353134,
      "rewards/bleu_reward_func/std": 0.022689029574394226,
      "step": 234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 340.8125,
      "completions/mean_terminated_length": 335.2903137207031,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "epoch": 0.188,
      "grad_norm": 2.4419240951538086,
      "kl": 0.0162506103515625,
      "learning_rate": 1e-06,
      "loss": -0.0465,
      "num_tokens": 3129372.0,
      "reward": 0.08140328526496887,
      "reward_std": 0.03153820335865021,
      "rewards/bleu_reward_func/mean": 0.08140328526496887,
      "rewards/bleu_reward_func/std": 0.09040741622447968,
      "step": 235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 426.0,
      "completions/mean_length": 149.8125,
      "completions/mean_terminated_length": 125.66667175292969,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.1888,
      "grad_norm": 5.230119705200195,
      "kl": 0.0335540771484375,
      "learning_rate": 1e-06,
      "loss": -0.1208,
      "num_tokens": 3137742.0,
      "reward": 0.08085089921951294,
      "reward_std": 0.06161949411034584,
      "rewards/bleu_reward_func/mean": 0.08085089921951294,
      "rewards/bleu_reward_func/std": 0.08157042413949966,
      "step": 236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 499.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 218.96875,
      "completions/mean_terminated_length": 218.96875,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.1896,
      "grad_norm": 3.527757406234741,
      "kl": 0.0172576904296875,
      "learning_rate": 1e-06,
      "loss": -0.0976,
      "num_tokens": 3147781.0,
      "reward": 0.10917741060256958,
      "reward_std": 0.04233718663454056,
      "rewards/bleu_reward_func/mean": 0.10917741060256958,
      "rewards/bleu_reward_func/std": 0.09499745815992355,
      "step": 237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 418.53125,
      "completions/mean_terminated_length": 376.04547119140625,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "epoch": 0.1904,
      "grad_norm": 2.3092899322509766,
      "kl": 0.0175323486328125,
      "learning_rate": 1e-06,
      "loss": 0.0596,
      "num_tokens": 3163734.0,
      "reward": 0.0354929119348526,
      "reward_std": 0.016294876113533974,
      "rewards/bleu_reward_func/mean": 0.0354929119348526,
      "rewards/bleu_reward_func/std": 0.01924656331539154,
      "step": 238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 296.5,
      "completions/mean_terminated_length": 282.13336181640625,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.1912,
      "grad_norm": 2.64257550239563,
      "kl": 0.020233154296875,
      "learning_rate": 1e-06,
      "loss": -0.0343,
      "num_tokens": 3176118.0,
      "reward": 0.05447524040937424,
      "reward_std": 0.01689964160323143,
      "rewards/bleu_reward_func/mean": 0.05447524040937424,
      "rewards/bleu_reward_func/std": 0.03051072731614113,
      "step": 239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 311.71875,
      "completions/mean_terminated_length": 291.0,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.192,
      "grad_norm": 2.7584307193756104,
      "kl": 0.020416259765625,
      "learning_rate": 1e-06,
      "loss": -0.1264,
      "num_tokens": 3190469.0,
      "reward": 0.13415664434432983,
      "reward_std": 0.0733218789100647,
      "rewards/bleu_reward_func/mean": 0.13415664434432983,
      "rewards/bleu_reward_func/std": 0.11462453752756119,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 289.5,
      "completions/mean_terminated_length": 238.1538543701172,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.1928,
      "grad_norm": 3.149303913116455,
      "kl": 0.0215301513671875,
      "learning_rate": 1e-06,
      "loss": 0.0126,
      "num_tokens": 3202589.0,
      "reward": 0.10199414938688278,
      "reward_std": 0.044677168130874634,
      "rewards/bleu_reward_func/mean": 0.10199414938688278,
      "rewards/bleu_reward_func/std": 0.14810438454151154,
      "step": 241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 391.21875,
      "completions/mean_terminated_length": 318.75,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "epoch": 0.1936,
      "grad_norm": 2.327310800552368,
      "kl": 0.0197906494140625,
      "learning_rate": 1e-06,
      "loss": -0.0446,
      "num_tokens": 3220500.0,
      "reward": 0.04337170720100403,
      "reward_std": 0.020304495468735695,
      "rewards/bleu_reward_func/mean": 0.04337170720100403,
      "rewards/bleu_reward_func/std": 0.0399162657558918,
      "step": 242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 465.0,
      "completions/mean_length": 404.375,
      "completions/mean_terminated_length": 309.4117736816406,
      "completions/min_length": 165.0,
      "completions/min_terminated_length": 165.0,
      "epoch": 0.1944,
      "grad_norm": 2.4237332344055176,
      "kl": 0.023651123046875,
      "learning_rate": 1e-06,
      "loss": -0.0035,
      "num_tokens": 3236000.0,
      "reward": 0.059977754950523376,
      "reward_std": 0.022927038371562958,
      "rewards/bleu_reward_func/mean": 0.059977754950523376,
      "rewards/bleu_reward_func/std": 0.032850753515958786,
      "step": 243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 456.0,
      "completions/mean_length": 352.84375,
      "completions/mean_terminated_length": 308.2799987792969,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "epoch": 0.1952,
      "grad_norm": 2.4127137660980225,
      "kl": 0.01862335205078125,
      "learning_rate": 1e-06,
      "loss": -0.0088,
      "num_tokens": 3252163.0,
      "reward": 0.13270705938339233,
      "reward_std": 0.03011954203248024,
      "rewards/bleu_reward_func/mean": 0.13270705938339233,
      "rewards/bleu_reward_func/std": 0.10800201445817947,
      "step": 244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 438.0,
      "completions/mean_length": 257.0625,
      "completions/mean_terminated_length": 248.8386993408203,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.196,
      "grad_norm": 4.602512359619141,
      "kl": 0.02532958984375,
      "learning_rate": 1e-06,
      "loss": 0.0806,
      "num_tokens": 3266605.0,
      "reward": 0.03341788053512573,
      "reward_std": 0.013418522663414478,
      "rewards/bleu_reward_func/mean": 0.03341788053512573,
      "rewards/bleu_reward_func/std": 0.021642079576849937,
      "step": 245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 353.09375,
      "completions/mean_terminated_length": 257.75,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.1968,
      "grad_norm": 2.3705356121063232,
      "kl": 0.02313232421875,
      "learning_rate": 1e-06,
      "loss": 0.03,
      "num_tokens": 3280688.0,
      "reward": 0.06873345375061035,
      "reward_std": 0.040343694388866425,
      "rewards/bleu_reward_func/mean": 0.06873345375061035,
      "rewards/bleu_reward_func/std": 0.056226469576358795,
      "step": 246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 370.4375,
      "completions/mean_terminated_length": 315.0434875488281,
      "completions/min_length": 224.0,
      "completions/min_terminated_length": 224.0,
      "epoch": 0.1976,
      "grad_norm": 2.6424918174743652,
      "kl": 0.025970458984375,
      "learning_rate": 1e-06,
      "loss": -0.044,
      "num_tokens": 3295534.0,
      "reward": 0.04273587465286255,
      "reward_std": 0.014303158968687057,
      "rewards/bleu_reward_func/mean": 0.04273587465286255,
      "rewards/bleu_reward_func/std": 0.021357977762818336,
      "step": 247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 466.0,
      "completions/mean_length": 382.0625,
      "completions/mean_terminated_length": 252.125,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.1984,
      "grad_norm": 2.337956190109253,
      "kl": 0.0235137939453125,
      "learning_rate": 1e-06,
      "loss": -0.0092,
      "num_tokens": 3310872.0,
      "reward": 0.037085238844156265,
      "reward_std": 0.0252089761197567,
      "rewards/bleu_reward_func/mean": 0.037085238844156265,
      "rewards/bleu_reward_func/std": 0.034677691757678986,
      "step": 248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 459.0,
      "completions/mean_length": 253.9375,
      "completions/mean_terminated_length": 206.1481475830078,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.1992,
      "grad_norm": 3.5819602012634277,
      "kl": 0.0323333740234375,
      "learning_rate": 1e-06,
      "loss": 0.0387,
      "num_tokens": 3322966.0,
      "reward": 0.06450790166854858,
      "reward_std": 0.022195765748620033,
      "rewards/bleu_reward_func/mean": 0.06450790166854858,
      "rewards/bleu_reward_func/std": 0.054868634790182114,
      "step": 249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 360.875,
      "completions/mean_terminated_length": 281.71429443359375,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.2,
      "grad_norm": 2.814183473587036,
      "kl": 0.0179443359375,
      "learning_rate": 1e-06,
      "loss": -0.1752,
      "num_tokens": 3337362.0,
      "reward": 0.039325565099716187,
      "reward_std": 0.025641005486249924,
      "rewards/bleu_reward_func/mean": 0.039325565099716187,
      "rewards/bleu_reward_func/std": 0.04151046276092529,
      "step": 250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 475.0,
      "completions/mean_length": 344.65625,
      "completions/mean_terminated_length": 279.1739196777344,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "epoch": 0.2008,
      "grad_norm": 2.49495005607605,
      "kl": 0.01953125,
      "learning_rate": 1e-06,
      "loss": 0.0217,
      "num_tokens": 3352271.0,
      "reward": 0.04269051179289818,
      "reward_std": 0.020738966763019562,
      "rewards/bleu_reward_func/mean": 0.04269051179289818,
      "rewards/bleu_reward_func/std": 0.02881108783185482,
      "step": 251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 333.28125,
      "completions/mean_terminated_length": 252.0454559326172,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.2016,
      "grad_norm": 4.844610214233398,
      "kl": 0.0460662841796875,
      "learning_rate": 1e-06,
      "loss": 0.0077,
      "num_tokens": 3366952.0,
      "reward": 0.02587553858757019,
      "reward_std": 0.01308115478605032,
      "rewards/bleu_reward_func/mean": 0.02587553858757019,
      "rewards/bleu_reward_func/std": 0.02540062554180622,
      "step": 252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 136.1875,
      "completions/mean_terminated_length": 124.06451416015625,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.2024,
      "grad_norm": 8.286824226379395,
      "kl": 0.0217132568359375,
      "learning_rate": 1e-06,
      "loss": -0.2455,
      "num_tokens": 3373646.0,
      "reward": 0.03720610588788986,
      "reward_std": 0.02497956156730652,
      "rewards/bleu_reward_func/mean": 0.03720610588788986,
      "rewards/bleu_reward_func/std": 0.04735667258501053,
      "step": 253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 433.0,
      "completions/max_terminated_length": 433.0,
      "completions/mean_length": 273.28125,
      "completions/mean_terminated_length": 273.28125,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "epoch": 0.2032,
      "grad_norm": 3.4898288249969482,
      "kl": 0.0250244140625,
      "learning_rate": 1e-06,
      "loss": -0.0109,
      "num_tokens": 3386335.0,
      "reward": 0.07081638276576996,
      "reward_std": 0.028427409008145332,
      "rewards/bleu_reward_func/mean": 0.07081638276576996,
      "rewards/bleu_reward_func/std": 0.05091365799307823,
      "step": 254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 507.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 295.15625,
      "completions/mean_terminated_length": 295.15625,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.204,
      "grad_norm": 2.3208115100860596,
      "kl": 0.0214996337890625,
      "learning_rate": 1e-06,
      "loss": -0.1527,
      "num_tokens": 3397868.0,
      "reward": 0.09917673468589783,
      "reward_std": 0.0416448600590229,
      "rewards/bleu_reward_func/mean": 0.09917673468589783,
      "rewards/bleu_reward_func/std": 0.08467306196689606,
      "step": 255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 395.0,
      "completions/mean_length": 210.375,
      "completions/mean_terminated_length": 200.64515686035156,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "epoch": 0.2048,
      "grad_norm": 3.7243247032165527,
      "kl": 0.02008056640625,
      "learning_rate": 1e-06,
      "loss": 0.0121,
      "num_tokens": 3406896.0,
      "reward": 0.09600116312503815,
      "reward_std": 0.04207791015505791,
      "rewards/bleu_reward_func/mean": 0.09600116312503815,
      "rewards/bleu_reward_func/std": 0.11218695342540741,
      "step": 256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 416.78125,
      "completions/mean_terminated_length": 359.6499938964844,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "epoch": 0.2056,
      "grad_norm": 2.2771170139312744,
      "kl": 0.0246124267578125,
      "learning_rate": 1e-06,
      "loss": -0.0175,
      "num_tokens": 3423553.0,
      "reward": 0.09614823013544083,
      "reward_std": 0.024631768465042114,
      "rewards/bleu_reward_func/mean": 0.09614823013544083,
      "rewards/bleu_reward_func/std": 0.05439407005906105,
      "step": 257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 347.0,
      "completions/mean_terminated_length": 218.6666717529297,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.2064,
      "grad_norm": 3.226184844970703,
      "kl": 0.030609130859375,
      "learning_rate": 1e-06,
      "loss": -0.0879,
      "num_tokens": 3439921.0,
      "reward": 0.04686765745282173,
      "reward_std": 0.020355040207505226,
      "rewards/bleu_reward_func/mean": 0.04686765745282173,
      "rewards/bleu_reward_func/std": 0.0473078228533268,
      "step": 258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 297.75,
      "completions/mean_terminated_length": 258.0740661621094,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.2072,
      "grad_norm": 2.7204718589782715,
      "kl": 0.02374267578125,
      "learning_rate": 1e-06,
      "loss": 0.2345,
      "num_tokens": 3451953.0,
      "reward": 0.07326146960258484,
      "reward_std": 0.055320855230093,
      "rewards/bleu_reward_func/mean": 0.07326146960258484,
      "rewards/bleu_reward_func/std": 0.10083870589733124,
      "step": 259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 359.0,
      "completions/max_terminated_length": 359.0,
      "completions/mean_length": 168.03125,
      "completions/mean_terminated_length": 168.03125,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.208,
      "grad_norm": 4.883479595184326,
      "kl": 0.07550048828125,
      "learning_rate": 1e-06,
      "loss": -0.1051,
      "num_tokens": 3462906.0,
      "reward": 0.06956590712070465,
      "reward_std": 0.03378972038626671,
      "rewards/bleu_reward_func/mean": 0.06956590712070465,
      "rewards/bleu_reward_func/std": 0.05879009887576103,
      "step": 260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 383.0,
      "completions/mean_terminated_length": 315.4285888671875,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "epoch": 0.2088,
      "grad_norm": 2.163914918899536,
      "kl": 0.022429943084716797,
      "learning_rate": 1e-06,
      "loss": 0.0093,
      "num_tokens": 3480722.0,
      "reward": 0.12491203844547272,
      "reward_std": 0.10957963019609451,
      "rewards/bleu_reward_func/mean": 0.12491203844547272,
      "rewards/bleu_reward_func/std": 0.22631219029426575,
      "step": 261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 435.0,
      "completions/mean_length": 217.03125,
      "completions/mean_terminated_length": 186.51724243164062,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.2096,
      "grad_norm": 4.656850814819336,
      "kl": 0.046905517578125,
      "learning_rate": 1e-06,
      "loss": 0.1128,
      "num_tokens": 3490139.0,
      "reward": 0.03650316223502159,
      "reward_std": 0.01834101229906082,
      "rewards/bleu_reward_func/mean": 0.03650316223502159,
      "rewards/bleu_reward_func/std": 0.019939929246902466,
      "step": 262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 170.46875,
      "completions/mean_terminated_length": 121.67857360839844,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.2104,
      "grad_norm": 4.333711624145508,
      "kl": 0.0684814453125,
      "learning_rate": 1e-06,
      "loss": -0.0531,
      "num_tokens": 3500210.0,
      "reward": 0.12142158299684525,
      "reward_std": 0.051336318254470825,
      "rewards/bleu_reward_func/mean": 0.12142158299684525,
      "rewards/bleu_reward_func/std": 0.11394964903593063,
      "step": 263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 445.0,
      "completions/mean_length": 313.09375,
      "completions/mean_terminated_length": 222.68182373046875,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.2112,
      "grad_norm": 4.1456618309021,
      "kl": 0.040740966796875,
      "learning_rate": 1e-06,
      "loss": -0.0505,
      "num_tokens": 3512237.0,
      "reward": 0.04106439650058746,
      "reward_std": 0.010877052322030067,
      "rewards/bleu_reward_func/mean": 0.04106439650058746,
      "rewards/bleu_reward_func/std": 0.029797548428177834,
      "step": 264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 325.78125,
      "completions/mean_terminated_length": 273.6399841308594,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "epoch": 0.212,
      "grad_norm": 2.98254656791687,
      "kl": 0.0294189453125,
      "learning_rate": 1e-06,
      "loss": -0.0504,
      "num_tokens": 3525790.0,
      "reward": 0.0902123674750328,
      "reward_std": 0.02512788400053978,
      "rewards/bleu_reward_func/mean": 0.0902123674750328,
      "rewards/bleu_reward_func/std": 0.09073200821876526,
      "step": 265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 459.65625,
      "completions/mean_terminated_length": 325.8888854980469,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.2128,
      "grad_norm": 2.1296770572662354,
      "kl": 0.0257110595703125,
      "learning_rate": 1e-06,
      "loss": 0.0355,
      "num_tokens": 3543467.0,
      "reward": 0.0487542450428009,
      "reward_std": 0.01123578380793333,
      "rewards/bleu_reward_func/mean": 0.0487542450428009,
      "rewards/bleu_reward_func/std": 0.0231720469892025,
      "step": 266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 466.0,
      "completions/mean_length": 248.03125,
      "completions/mean_terminated_length": 187.11538696289062,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.2136,
      "grad_norm": 8.958330154418945,
      "kl": 0.0360107421875,
      "learning_rate": 1e-06,
      "loss": 0.1156,
      "num_tokens": 3553948.0,
      "reward": 0.07910416275262833,
      "reward_std": 0.015652041882276535,
      "rewards/bleu_reward_func/mean": 0.07910416275262833,
      "rewards/bleu_reward_func/std": 0.07359592616558075,
      "step": 267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 471.0,
      "completions/mean_length": 257.96875,
      "completions/mean_terminated_length": 231.6896514892578,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "epoch": 0.2144,
      "grad_norm": 2.6220574378967285,
      "kl": 0.030303955078125,
      "learning_rate": 1e-06,
      "loss": 0.1163,
      "num_tokens": 3566659.0,
      "reward": 0.04102238267660141,
      "reward_std": 0.01749418117105961,
      "rewards/bleu_reward_func/mean": 0.04102238267660141,
      "rewards/bleu_reward_func/std": 0.02520221658051014,
      "step": 268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 410.0,
      "completions/mean_length": 300.5625,
      "completions/mean_terminated_length": 230.08334350585938,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.2152,
      "grad_norm": 3.4121954441070557,
      "kl": 0.01523590087890625,
      "learning_rate": 1e-06,
      "loss": -0.0559,
      "num_tokens": 3579605.0,
      "reward": 0.05740036815404892,
      "reward_std": 0.023826539516448975,
      "rewards/bleu_reward_func/mean": 0.05740036815404892,
      "rewards/bleu_reward_func/std": 0.0372060090303421,
      "step": 269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 360.9375,
      "completions/mean_terminated_length": 227.64706420898438,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.216,
      "grad_norm": 2.156470775604248,
      "kl": 0.031585693359375,
      "learning_rate": 1e-06,
      "loss": -0.2085,
      "num_tokens": 3595707.0,
      "reward": 0.03448399901390076,
      "reward_std": 0.01828095316886902,
      "rewards/bleu_reward_func/mean": 0.03448399901390076,
      "rewards/bleu_reward_func/std": 0.02368060126900673,
      "step": 270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 370.0,
      "completions/mean_length": 352.5625,
      "completions/mean_terminated_length": 211.88235473632812,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "epoch": 0.2168,
      "grad_norm": 2.4653913974761963,
      "kl": 0.027069091796875,
      "learning_rate": 1e-06,
      "loss": -0.0934,
      "num_tokens": 3611005.0,
      "reward": 0.032711900770664215,
      "reward_std": 0.010354666039347649,
      "rewards/bleu_reward_func/mean": 0.032711900770664215,
      "rewards/bleu_reward_func/std": 0.013180587440729141,
      "step": 271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 373.0,
      "completions/mean_length": 233.1875,
      "completions/mean_terminated_length": 155.1199951171875,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.2176,
      "grad_norm": 3.7216508388519287,
      "kl": 0.039306640625,
      "learning_rate": 1e-06,
      "loss": 0.0428,
      "num_tokens": 3623131.0,
      "reward": 0.025905201211571693,
      "reward_std": 0.017245225608348846,
      "rewards/bleu_reward_func/mean": 0.025905201211571693,
      "rewards/bleu_reward_func/std": 0.02249467745423317,
      "step": 272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 362.0625,
      "completions/mean_terminated_length": 229.76470947265625,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.2184,
      "grad_norm": 2.705111265182495,
      "kl": 0.039459228515625,
      "learning_rate": 1e-06,
      "loss": -0.0603,
      "num_tokens": 3641261.0,
      "reward": 0.04823341965675354,
      "reward_std": 0.017576558515429497,
      "rewards/bleu_reward_func/mean": 0.04823341965675354,
      "rewards/bleu_reward_func/std": 0.03309940919280052,
      "step": 273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 384.0,
      "completions/mean_length": 273.0625,
      "completions/mean_terminated_length": 179.56521606445312,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.2192,
      "grad_norm": 7.897398471832275,
      "kl": 0.030792236328125,
      "learning_rate": 1e-06,
      "loss": -0.1464,
      "num_tokens": 3653487.0,
      "reward": 0.02417801320552826,
      "reward_std": 0.012017752975225449,
      "rewards/bleu_reward_func/mean": 0.02417801320552826,
      "rewards/bleu_reward_func/std": 0.018230969086289406,
      "step": 274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 321.28125,
      "completions/mean_terminated_length": 246.6521759033203,
      "completions/min_length": 82.0,
      "completions/min_terminated_length": 82.0,
      "epoch": 0.22,
      "grad_norm": 3.6866021156311035,
      "kl": 0.028167724609375,
      "learning_rate": 1e-06,
      "loss": 0.0214,
      "num_tokens": 3667096.0,
      "reward": 0.10455590486526489,
      "reward_std": 0.04352106153964996,
      "rewards/bleu_reward_func/mean": 0.10455590486526489,
      "rewards/bleu_reward_func/std": 0.07865530997514725,
      "step": 275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 236.375,
      "completions/mean_terminated_length": 218.00001525878906,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "epoch": 0.2208,
      "grad_norm": 3.3426926136016846,
      "kl": 0.02593994140625,
      "learning_rate": 1e-06,
      "loss": 0.0467,
      "num_tokens": 3677572.0,
      "reward": 0.07310892641544342,
      "reward_std": 0.049130503088235855,
      "rewards/bleu_reward_func/mean": 0.07310892641544342,
      "rewards/bleu_reward_func/std": 0.0947326272726059,
      "step": 276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 443.9375,
      "completions/mean_terminated_length": 408.2857360839844,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "epoch": 0.2216,
      "grad_norm": 2.185582399368286,
      "kl": 0.0277252197265625,
      "learning_rate": 1e-06,
      "loss": 0.0697,
      "num_tokens": 3693906.0,
      "reward": 0.0204878319054842,
      "reward_std": 0.0076804393902421,
      "rewards/bleu_reward_func/mean": 0.0204878319054842,
      "rewards/bleu_reward_func/std": 0.010448083281517029,
      "step": 277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 499.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 179.8125,
      "completions/mean_terminated_length": 179.8125,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.2224,
      "grad_norm": 3.5973546504974365,
      "kl": 0.03302001953125,
      "learning_rate": 1e-06,
      "loss": 0.1902,
      "num_tokens": 3701988.0,
      "reward": 0.05479752644896507,
      "reward_std": 0.02715984173119068,
      "rewards/bleu_reward_func/mean": 0.05479752644896507,
      "rewards/bleu_reward_func/std": 0.046242304146289825,
      "step": 278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 416.0,
      "completions/mean_length": 390.65625,
      "completions/mean_terminated_length": 213.3076934814453,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.2232,
      "grad_norm": 2.381040096282959,
      "kl": 0.029388427734375,
      "learning_rate": 1e-06,
      "loss": 0.0447,
      "num_tokens": 3717433.0,
      "reward": 0.0374862439930439,
      "reward_std": 0.013264529407024384,
      "rewards/bleu_reward_func/mean": 0.0374862439930439,
      "rewards/bleu_reward_func/std": 0.027201348915696144,
      "step": 279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 332.8125,
      "completions/mean_terminated_length": 262.6956481933594,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "epoch": 0.224,
      "grad_norm": 2.9090490341186523,
      "kl": 0.032562255859375,
      "learning_rate": 1e-06,
      "loss": 0.0125,
      "num_tokens": 3730611.0,
      "reward": 0.04209320247173309,
      "reward_std": 0.012306570075452328,
      "rewards/bleu_reward_func/mean": 0.04209320247173309,
      "rewards/bleu_reward_func/std": 0.03222496807575226,
      "step": 280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 478.0,
      "completions/mean_length": 223.0625,
      "completions/mean_terminated_length": 181.7857208251953,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.2248,
      "grad_norm": 3.2262182235717773,
      "kl": 0.03692626953125,
      "learning_rate": 1e-06,
      "loss": -0.0013,
      "num_tokens": 3739557.0,
      "reward": 0.028793197125196457,
      "reward_std": 0.009944088757038116,
      "rewards/bleu_reward_func/mean": 0.028793197125196457,
      "rewards/bleu_reward_func/std": 0.011341102421283722,
      "step": 281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 429.0,
      "completions/mean_length": 435.8125,
      "completions/mean_terminated_length": 268.20001220703125,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.2256,
      "grad_norm": 2.236760139465332,
      "kl": 0.032806396484375,
      "learning_rate": 1e-06,
      "loss": -0.086,
      "num_tokens": 3760695.0,
      "reward": 0.044973913580179214,
      "reward_std": 0.010784904472529888,
      "rewards/bleu_reward_func/mean": 0.044973913580179214,
      "rewards/bleu_reward_func/std": 0.014913774095475674,
      "step": 282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 345.84375,
      "completions/mean_terminated_length": 299.32000732421875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "epoch": 0.2264,
      "grad_norm": 2.4806909561157227,
      "kl": 0.031829833984375,
      "learning_rate": 1e-06,
      "loss": 0.0659,
      "num_tokens": 3773914.0,
      "reward": 0.04734322056174278,
      "reward_std": 0.01934235356748104,
      "rewards/bleu_reward_func/mean": 0.04734322056174278,
      "rewards/bleu_reward_func/std": 0.030831577256321907,
      "step": 283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 478.0,
      "completions/mean_length": 315.5625,
      "completions/mean_terminated_length": 162.7777862548828,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.2272,
      "grad_norm": 3.51826810836792,
      "kl": 0.039093017578125,
      "learning_rate": 1e-06,
      "loss": 0.0564,
      "num_tokens": 3787564.0,
      "reward": 0.05522763729095459,
      "reward_std": 0.02003159187734127,
      "rewards/bleu_reward_func/mean": 0.05522763729095459,
      "rewards/bleu_reward_func/std": 0.046319931745529175,
      "step": 284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 411.59375,
      "completions/mean_terminated_length": 282.5,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "epoch": 0.228,
      "grad_norm": 2.7587406635284424,
      "kl": 0.032806396484375,
      "learning_rate": 1e-06,
      "loss": 0.0267,
      "num_tokens": 3803919.0,
      "reward": 0.033719129860401154,
      "reward_std": 0.00807010754942894,
      "rewards/bleu_reward_func/mean": 0.033719129860401154,
      "rewards/bleu_reward_func/std": 0.013236233964562416,
      "step": 285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 300.0,
      "completions/max_terminated_length": 300.0,
      "completions/mean_length": 170.625,
      "completions/mean_terminated_length": 170.625,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.2288,
      "grad_norm": 3.637995481491089,
      "kl": 0.0235595703125,
      "learning_rate": 1e-06,
      "loss": 0.0381,
      "num_tokens": 3812531.0,
      "reward": 0.06265231966972351,
      "reward_std": 0.023745257407426834,
      "rewards/bleu_reward_func/mean": 0.06265231966972351,
      "rewards/bleu_reward_func/std": 0.04604180529713631,
      "step": 286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 351.6875,
      "completions/mean_terminated_length": 267.71429443359375,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 73.0,
      "epoch": 0.2296,
      "grad_norm": 2.835132122039795,
      "kl": 0.03070068359375,
      "learning_rate": 1e-06,
      "loss": -0.0183,
      "num_tokens": 3826105.0,
      "reward": 0.042010486125946045,
      "reward_std": 0.02037208527326584,
      "rewards/bleu_reward_func/mean": 0.042010486125946045,
      "rewards/bleu_reward_func/std": 0.025461316108703613,
      "step": 287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 473.0,
      "completions/mean_length": 286.28125,
      "completions/mean_terminated_length": 234.19232177734375,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.2304,
      "grad_norm": 3.922866106033325,
      "kl": 0.030029296875,
      "learning_rate": 1e-06,
      "loss": 0.1685,
      "num_tokens": 3839346.0,
      "reward": 0.016506824642419815,
      "reward_std": 0.003976969514042139,
      "rewards/bleu_reward_func/mean": 0.016506824642419815,
      "rewards/bleu_reward_func/std": 0.008394270204007626,
      "step": 288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 464.0,
      "completions/mean_length": 425.875,
      "completions/mean_terminated_length": 339.75,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "epoch": 0.2312,
      "grad_norm": 2.1530535221099854,
      "kl": 0.031036376953125,
      "learning_rate": 1e-06,
      "loss": -0.0435,
      "num_tokens": 3856542.0,
      "reward": 0.04274771362543106,
      "reward_std": 0.015802588313817978,
      "rewards/bleu_reward_func/mean": 0.04274771362543106,
      "rewards/bleu_reward_func/std": 0.018804650753736496,
      "step": 289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 264.90625,
      "completions/mean_terminated_length": 182.5416717529297,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.232,
      "grad_norm": 4.459621429443359,
      "kl": 0.03009033203125,
      "learning_rate": 1e-06,
      "loss": 0.0053,
      "num_tokens": 3869307.0,
      "reward": 0.09130969643592834,
      "reward_std": 0.04283912479877472,
      "rewards/bleu_reward_func/mean": 0.09130969643592834,
      "rewards/bleu_reward_func/std": 0.08340450376272202,
      "step": 290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 450.0,
      "completions/mean_length": 303.34375,
      "completions/mean_terminated_length": 233.7916717529297,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "epoch": 0.2328,
      "grad_norm": 2.758756160736084,
      "kl": 0.036041259765625,
      "learning_rate": 1e-06,
      "loss": -0.0283,
      "num_tokens": 3881046.0,
      "reward": 0.03966425359249115,
      "reward_std": 0.01337943784892559,
      "rewards/bleu_reward_func/mean": 0.03966425359249115,
      "rewards/bleu_reward_func/std": 0.020967189222574234,
      "step": 291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 289.5,
      "completions/mean_terminated_length": 215.33334350585938,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.2336,
      "grad_norm": 4.92682409286499,
      "kl": 0.0416259765625,
      "learning_rate": 1e-06,
      "loss": 0.0587,
      "num_tokens": 3893494.0,
      "reward": 0.07078565657138824,
      "reward_std": 0.025623325258493423,
      "rewards/bleu_reward_func/mean": 0.07078565657138824,
      "rewards/bleu_reward_func/std": 0.04824245721101761,
      "step": 292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 373.34375,
      "completions/mean_terminated_length": 278.47369384765625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "epoch": 0.2344,
      "grad_norm": 2.9965121746063232,
      "kl": 0.0308837890625,
      "learning_rate": 1e-06,
      "loss": 0.0633,
      "num_tokens": 3908305.0,
      "reward": 0.04714567959308624,
      "reward_std": 0.011470139026641846,
      "rewards/bleu_reward_func/mean": 0.04714567959308624,
      "rewards/bleu_reward_func/std": 0.028922023251652718,
      "step": 293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 363.625,
      "completions/mean_terminated_length": 285.9047546386719,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.2352,
      "grad_norm": 2.949458122253418,
      "kl": 0.034149169921875,
      "learning_rate": 1e-06,
      "loss": 0.1081,
      "num_tokens": 3924405.0,
      "reward": 0.04587670788168907,
      "reward_std": 0.02025657892227173,
      "rewards/bleu_reward_func/mean": 0.04587670788168907,
      "rewards/bleu_reward_func/std": 0.029254309833049774,
      "step": 294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 403.65625,
      "completions/mean_terminated_length": 319.3888854980469,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "epoch": 0.236,
      "grad_norm": 2.505204200744629,
      "kl": 0.0325927734375,
      "learning_rate": 1e-06,
      "loss": -0.0542,
      "num_tokens": 3939506.0,
      "reward": 0.03485488519072533,
      "reward_std": 0.014378003776073456,
      "rewards/bleu_reward_func/mean": 0.03485488519072533,
      "rewards/bleu_reward_func/std": 0.01973474584519863,
      "step": 295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 309.125,
      "completions/mean_terminated_length": 216.9091033935547,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.2368,
      "grad_norm": 6.216697692871094,
      "kl": 0.031768798828125,
      "learning_rate": 1e-06,
      "loss": 0.059,
      "num_tokens": 3952214.0,
      "reward": 0.07422341406345367,
      "reward_std": 0.01819428987801075,
      "rewards/bleu_reward_func/mean": 0.07422341406345367,
      "rewards/bleu_reward_func/std": 0.07971075177192688,
      "step": 296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 367.0,
      "completions/mean_length": 449.65625,
      "completions/mean_terminated_length": 262.625,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "epoch": 0.2376,
      "grad_norm": 2.2672359943389893,
      "kl": 0.028839111328125,
      "learning_rate": 1e-06,
      "loss": 0.0007,
      "num_tokens": 3971571.0,
      "reward": 0.05678567662835121,
      "reward_std": 0.020236749202013016,
      "rewards/bleu_reward_func/mean": 0.05678567662835121,
      "rewards/bleu_reward_func/std": 0.0485292449593544,
      "step": 297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 305.53125,
      "completions/mean_terminated_length": 257.8846130371094,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.2384,
      "grad_norm": 3.060343027114868,
      "kl": 0.036529541015625,
      "learning_rate": 1e-06,
      "loss": 0.0017,
      "num_tokens": 3984268.0,
      "reward": 0.09549540281295776,
      "reward_std": 0.02425481379032135,
      "rewards/bleu_reward_func/mean": 0.09549540281295776,
      "rewards/bleu_reward_func/std": 0.10033179074525833,
      "step": 298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 467.0,
      "completions/mean_length": 359.90625,
      "completions/mean_terminated_length": 255.84210205078125,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "epoch": 0.2392,
      "grad_norm": 2.7860310077667236,
      "kl": 0.024871826171875,
      "learning_rate": 1e-06,
      "loss": 0.0637,
      "num_tokens": 3998681.0,
      "reward": 0.02910490334033966,
      "reward_std": 0.008882608264684677,
      "rewards/bleu_reward_func/mean": 0.02910490334033966,
      "rewards/bleu_reward_func/std": 0.012598116882145405,
      "step": 299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 253.84375,
      "completions/mean_terminated_length": 236.6333465576172,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.24,
      "grad_norm": 3.2227437496185303,
      "kl": 0.035919189453125,
      "learning_rate": 1e-06,
      "loss": 0.0523,
      "num_tokens": 4008996.0,
      "reward": 0.03897559642791748,
      "reward_std": 0.011807247996330261,
      "rewards/bleu_reward_func/mean": 0.03897559642791748,
      "rewards/bleu_reward_func/std": 0.014171565882861614,
      "step": 300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 341.0,
      "completions/mean_length": 264.65625,
      "completions/mean_terminated_length": 182.20834350585938,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.2408,
      "grad_norm": 3.2822182178497314,
      "kl": 0.02972412109375,
      "learning_rate": 1e-06,
      "loss": -0.0655,
      "num_tokens": 4021769.0,
      "reward": 0.12274128198623657,
      "reward_std": 0.06403186917304993,
      "rewards/bleu_reward_func/mean": 0.12274128198623657,
      "rewards/bleu_reward_func/std": 0.09856269508600235,
      "step": 301
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 467.09375,
      "completions/mean_terminated_length": 416.20001220703125,
      "completions/min_length": 283.0,
      "completions/min_terminated_length": 283.0,
      "epoch": 0.2416,
      "grad_norm": 2.018660306930542,
      "kl": 0.03399658203125,
      "learning_rate": 1e-06,
      "loss": -0.0454,
      "num_tokens": 4040132.0,
      "reward": 0.034933868795633316,
      "reward_std": 0.007452279329299927,
      "rewards/bleu_reward_func/mean": 0.034933868795633316,
      "rewards/bleu_reward_func/std": 0.02128129079937935,
      "step": 302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 392.46875,
      "completions/mean_terminated_length": 257.0,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.2424,
      "grad_norm": 2.8577029705047607,
      "kl": 0.0274200439453125,
      "learning_rate": 1e-06,
      "loss": -0.0111,
      "num_tokens": 4056587.0,
      "reward": 0.02288379706442356,
      "reward_std": 0.006262771785259247,
      "rewards/bleu_reward_func/mean": 0.02288379706442356,
      "rewards/bleu_reward_func/std": 0.014597552828490734,
      "step": 303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 332.0,
      "completions/mean_length": 271.75,
      "completions/mean_terminated_length": 191.6666717529297,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 0.2432,
      "grad_norm": 5.046367168426514,
      "kl": 0.02838134765625,
      "learning_rate": 1e-06,
      "loss": 0.0364,
      "num_tokens": 4067563.0,
      "reward": 0.02564316801726818,
      "reward_std": 0.011585518717765808,
      "rewards/bleu_reward_func/mean": 0.02564316801726818,
      "rewards/bleu_reward_func/std": 0.023773526772856712,
      "step": 304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 294.0,
      "completions/mean_length": 271.96875,
      "completions/mean_terminated_length": 191.95834350585938,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "epoch": 0.244,
      "grad_norm": 3.2017996311187744,
      "kl": 0.0457763671875,
      "learning_rate": 1e-06,
      "loss": 0.0443,
      "num_tokens": 4080074.0,
      "reward": 0.048807431012392044,
      "reward_std": 0.01481578778475523,
      "rewards/bleu_reward_func/mean": 0.048807431012392044,
      "rewards/bleu_reward_func/std": 0.025570906698703766,
      "step": 305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 452.0,
      "completions/max_terminated_length": 452.0,
      "completions/mean_length": 265.21875,
      "completions/mean_terminated_length": 265.21875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "epoch": 0.2448,
      "grad_norm": 2.6399717330932617,
      "kl": 0.025665283203125,
      "learning_rate": 1e-06,
      "loss": -0.0796,
      "num_tokens": 4091201.0,
      "reward": 0.04743821173906326,
      "reward_std": 0.01849541999399662,
      "rewards/bleu_reward_func/mean": 0.04743821173906326,
      "rewards/bleu_reward_func/std": 0.02645285800099373,
      "step": 306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 356.0,
      "completions/mean_length": 277.875,
      "completions/mean_terminated_length": 186.26087951660156,
      "completions/min_length": 81.0,
      "completions/min_terminated_length": 81.0,
      "epoch": 0.2456,
      "grad_norm": 2.4667458534240723,
      "kl": 0.029388427734375,
      "learning_rate": 1e-06,
      "loss": 0.1769,
      "num_tokens": 4102253.0,
      "reward": 0.07918344438076019,
      "reward_std": 0.03603646531701088,
      "rewards/bleu_reward_func/mean": 0.07918344438076019,
      "rewards/bleu_reward_func/std": 0.09297043830156326,
      "step": 307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 378.0,
      "completions/mean_length": 198.4375,
      "completions/mean_terminated_length": 177.53334045410156,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.2464,
      "grad_norm": 2.9031858444213867,
      "kl": 0.0194854736328125,
      "learning_rate": 1e-06,
      "loss": -0.0419,
      "num_tokens": 4111451.0,
      "reward": 0.05468355864286423,
      "reward_std": 0.02622107043862343,
      "rewards/bleu_reward_func/mean": 0.05468355864286423,
      "rewards/bleu_reward_func/std": 0.03912202641367912,
      "step": 308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 294.65625,
      "completions/mean_terminated_length": 287.6451416015625,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "epoch": 0.2472,
      "grad_norm": 2.6340668201446533,
      "kl": 0.0333251953125,
      "learning_rate": 1e-06,
      "loss": 0.0643,
      "num_tokens": 4122936.0,
      "reward": 0.07767876982688904,
      "reward_std": 0.017796212807297707,
      "rewards/bleu_reward_func/mean": 0.07767876982688904,
      "rewards/bleu_reward_func/std": 0.08556719124317169,
      "step": 309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 463.0,
      "completions/max_terminated_length": 463.0,
      "completions/mean_length": 252.40625,
      "completions/mean_terminated_length": 252.40625,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.248,
      "grad_norm": 3.0190749168395996,
      "kl": 0.0158843994140625,
      "learning_rate": 1e-06,
      "loss": -0.0605,
      "num_tokens": 4136381.0,
      "reward": 0.10290344059467316,
      "reward_std": 0.03325870633125305,
      "rewards/bleu_reward_func/mean": 0.10290344059467316,
      "rewards/bleu_reward_func/std": 0.06130888685584068,
      "step": 310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 395.0,
      "completions/mean_length": 229.46875,
      "completions/mean_terminated_length": 210.6333465576172,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "epoch": 0.2488,
      "grad_norm": 3.156555652618408,
      "kl": 0.024261474609375,
      "learning_rate": 1e-06,
      "loss": 0.0582,
      "num_tokens": 4146276.0,
      "reward": 0.074183389544487,
      "reward_std": 0.024614207446575165,
      "rewards/bleu_reward_func/mean": 0.074183389544487,
      "rewards/bleu_reward_func/std": 0.06735430657863617,
      "step": 311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 342.3125,
      "completions/mean_terminated_length": 294.79998779296875,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.2496,
      "grad_norm": 3.10429310798645,
      "kl": 0.02587890625,
      "learning_rate": 1e-06,
      "loss": 0.144,
      "num_tokens": 4160590.0,
      "reward": 0.05624938756227493,
      "reward_std": 0.01750083453953266,
      "rewards/bleu_reward_func/mean": 0.05624938756227493,
      "rewards/bleu_reward_func/std": 0.03703475371003151,
      "step": 312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 327.625,
      "completions/mean_terminated_length": 217.0,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.2504,
      "grad_norm": 3.401366949081421,
      "kl": 0.04010009765625,
      "learning_rate": 1e-06,
      "loss": -0.1329,
      "num_tokens": 4175810.0,
      "reward": 0.06421037018299103,
      "reward_std": 0.023138659074902534,
      "rewards/bleu_reward_func/mean": 0.06421037018299103,
      "rewards/bleu_reward_func/std": 0.02672930806875229,
      "step": 313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 287.46875,
      "completions/mean_terminated_length": 255.3928680419922,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.2512,
      "grad_norm": 2.5841426849365234,
      "kl": 0.020111083984375,
      "learning_rate": 1e-06,
      "loss": 0.1266,
      "num_tokens": 4187337.0,
      "reward": 0.06008291244506836,
      "reward_std": 0.03618035838007927,
      "rewards/bleu_reward_func/mean": 0.06008291244506836,
      "rewards/bleu_reward_func/std": 0.07106407731771469,
      "step": 314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 312.0,
      "completions/mean_length": 265.46875,
      "completions/mean_terminated_length": 183.2916717529297,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.252,
      "grad_norm": 3.229022741317749,
      "kl": 0.03179931640625,
      "learning_rate": 1e-06,
      "loss": -0.0984,
      "num_tokens": 4201208.0,
      "reward": 0.07605750858783722,
      "reward_std": 0.02687455154955387,
      "rewards/bleu_reward_func/mean": 0.07605750858783722,
      "rewards/bleu_reward_func/std": 0.046578504145145416,
      "step": 315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 360.25,
      "completions/mean_terminated_length": 300.86956787109375,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "epoch": 0.2528,
      "grad_norm": 2.501115560531616,
      "kl": 0.030731201171875,
      "learning_rate": 1e-06,
      "loss": -0.0733,
      "num_tokens": 4218152.0,
      "reward": 0.15790392458438873,
      "reward_std": 0.05716419219970703,
      "rewards/bleu_reward_func/mean": 0.15790392458438873,
      "rewards/bleu_reward_func/std": 0.18658459186553955,
      "step": 316
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 318.125,
      "completions/mean_terminated_length": 253.5,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.2536,
      "grad_norm": 3.5820846557617188,
      "kl": 0.0333251953125,
      "learning_rate": 1e-06,
      "loss": 0.0424,
      "num_tokens": 4231732.0,
      "reward": 0.08669077605009079,
      "reward_std": 0.023732244968414307,
      "rewards/bleu_reward_func/mean": 0.08669077605009079,
      "rewards/bleu_reward_func/std": 0.10351579636335373,
      "step": 317
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 252.09375,
      "completions/mean_terminated_length": 179.3199920654297,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.2544,
      "grad_norm": 4.425368785858154,
      "kl": 0.032501220703125,
      "learning_rate": 1e-06,
      "loss": -0.0886,
      "num_tokens": 4242311.0,
      "reward": 0.03214521333575249,
      "reward_std": 0.02095024473965168,
      "rewards/bleu_reward_func/mean": 0.03214521333575249,
      "rewards/bleu_reward_func/std": 0.03977763652801514,
      "step": 318
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 464.0,
      "completions/mean_length": 260.34375,
      "completions/mean_terminated_length": 202.2692413330078,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.2552,
      "grad_norm": 3.3167836666107178,
      "kl": 0.029937744140625,
      "learning_rate": 1e-06,
      "loss": 0.0631,
      "num_tokens": 4254082.0,
      "reward": 0.03737305477261543,
      "reward_std": 0.012050272896885872,
      "rewards/bleu_reward_func/mean": 0.03737305477261543,
      "rewards/bleu_reward_func/std": 0.028553711250424385,
      "step": 319
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 317.40625,
      "completions/mean_terminated_length": 262.91998291015625,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 73.0,
      "epoch": 0.256,
      "grad_norm": 2.909987688064575,
      "kl": 0.031707763671875,
      "learning_rate": 1e-06,
      "loss": -0.0386,
      "num_tokens": 4266695.0,
      "reward": 0.039175860583782196,
      "reward_std": 0.009842153638601303,
      "rewards/bleu_reward_func/mean": 0.039175860583782196,
      "rewards/bleu_reward_func/std": 0.017322974279522896,
      "step": 320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 287.28125,
      "completions/mean_terminated_length": 212.375,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.2568,
      "grad_norm": 3.3348236083984375,
      "kl": 0.025726318359375,
      "learning_rate": 1e-06,
      "loss": 0.0681,
      "num_tokens": 4278376.0,
      "reward": 0.05241217091679573,
      "reward_std": 0.015721352770924568,
      "rewards/bleu_reward_func/mean": 0.05241217091679573,
      "rewards/bleu_reward_func/std": 0.03155159205198288,
      "step": 321
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 217.65625,
      "completions/mean_terminated_length": 119.54167175292969,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "epoch": 0.2576,
      "grad_norm": 5.488730430603027,
      "kl": 0.05377197265625,
      "learning_rate": 1e-06,
      "loss": 0.0444,
      "num_tokens": 4287813.0,
      "reward": 0.09449569880962372,
      "reward_std": 0.05701503902673721,
      "rewards/bleu_reward_func/mean": 0.09449569880962372,
      "rewards/bleu_reward_func/std": 0.09492365270853043,
      "step": 322
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 441.4375,
      "completions/mean_terminated_length": 323.8333435058594,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.2584,
      "grad_norm": 2.3454339504241943,
      "kl": 0.023223876953125,
      "learning_rate": 1e-06,
      "loss": 0.0284,
      "num_tokens": 4306675.0,
      "reward": 0.0388575978577137,
      "reward_std": 0.012517341412603855,
      "rewards/bleu_reward_func/mean": 0.0388575978577137,
      "rewards/bleu_reward_func/std": 0.05242987349629402,
      "step": 323
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 278.90625,
      "completions/mean_terminated_length": 263.3666687011719,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "epoch": 0.2592,
      "grad_norm": 2.8751518726348877,
      "kl": 0.025543212890625,
      "learning_rate": 1e-06,
      "loss": 0.0622,
      "num_tokens": 4317360.0,
      "reward": 0.032819002866744995,
      "reward_std": 0.010129611939191818,
      "rewards/bleu_reward_func/mean": 0.032819002866744995,
      "rewards/bleu_reward_func/std": 0.025955382734537125,
      "step": 324
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 332.84375,
      "completions/mean_terminated_length": 262.7391357421875,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.26,
      "grad_norm": 7.335513591766357,
      "kl": 0.0263671875,
      "learning_rate": 1e-06,
      "loss": -0.0788,
      "num_tokens": 4333659.0,
      "reward": 0.08996531367301941,
      "reward_std": 0.03278956562280655,
      "rewards/bleu_reward_func/mean": 0.08996531367301941,
      "rewards/bleu_reward_func/std": 0.11964567005634308,
      "step": 325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 293.90625,
      "completions/mean_terminated_length": 279.3666687011719,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "epoch": 0.2608,
      "grad_norm": 2.7281105518341064,
      "kl": 0.02679443359375,
      "learning_rate": 1e-06,
      "loss": 0.0164,
      "num_tokens": 4346760.0,
      "reward": 0.035310667008161545,
      "reward_std": 0.01689826510846615,
      "rewards/bleu_reward_func/mean": 0.035310667008161545,
      "rewards/bleu_reward_func/std": 0.024357853457331657,
      "step": 326
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 416.0,
      "completions/mean_length": 395.4375,
      "completions/mean_terminated_length": 263.3333435058594,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "epoch": 0.2616,
      "grad_norm": 2.143420457839966,
      "kl": 0.0240478515625,
      "learning_rate": 1e-06,
      "loss": 0.0552,
      "num_tokens": 4362278.0,
      "reward": 0.026128560304641724,
      "reward_std": 0.00915272906422615,
      "rewards/bleu_reward_func/mean": 0.026128560304641724,
      "rewards/bleu_reward_func/std": 0.016462024301290512,
      "step": 327
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 276.34375,
      "completions/mean_terminated_length": 268.7419128417969,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.2624,
      "grad_norm": 2.9250733852386475,
      "kl": 0.02899169921875,
      "learning_rate": 1e-06,
      "loss": -0.0632,
      "num_tokens": 4376457.0,
      "reward": 0.06787262856960297,
      "reward_std": 0.02760476991534233,
      "rewards/bleu_reward_func/mean": 0.06787262856960297,
      "rewards/bleu_reward_func/std": 0.04441879689693451,
      "step": 328
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 316.46875,
      "completions/mean_terminated_length": 310.1612854003906,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "epoch": 0.2632,
      "grad_norm": 2.7614059448242188,
      "kl": 0.0227508544921875,
      "learning_rate": 1e-06,
      "loss": 0.052,
      "num_tokens": 4388400.0,
      "reward": 0.10147911310195923,
      "reward_std": 0.01994110643863678,
      "rewards/bleu_reward_func/mean": 0.10147911310195923,
      "rewards/bleu_reward_func/std": 0.09049764275550842,
      "step": 329
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 405.5625,
      "completions/mean_terminated_length": 322.77777099609375,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "epoch": 0.264,
      "grad_norm": 2.2795681953430176,
      "kl": 0.02447509765625,
      "learning_rate": 1e-06,
      "loss": 0.0386,
      "num_tokens": 4403378.0,
      "reward": 0.035395894199609756,
      "reward_std": 0.01003876980394125,
      "rewards/bleu_reward_func/mean": 0.035395894199609756,
      "rewards/bleu_reward_func/std": 0.026000048965215683,
      "step": 330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 278.125,
      "completions/mean_terminated_length": 253.9310302734375,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.2648,
      "grad_norm": 2.9069783687591553,
      "kl": 0.03643798828125,
      "learning_rate": 1e-06,
      "loss": 0.035,
      "num_tokens": 4414550.0,
      "reward": 0.06586553156375885,
      "reward_std": 0.02944871410727501,
      "rewards/bleu_reward_func/mean": 0.06586553156375885,
      "rewards/bleu_reward_func/std": 0.051484063267707825,
      "step": 331
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 503.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 245.78125,
      "completions/mean_terminated_length": 245.78125,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "epoch": 0.2656,
      "grad_norm": 2.919708490371704,
      "kl": 0.022796630859375,
      "learning_rate": 1e-06,
      "loss": 0.114,
      "num_tokens": 4424263.0,
      "reward": 0.05288301408290863,
      "reward_std": 0.03779301792383194,
      "rewards/bleu_reward_func/mean": 0.05288301408290863,
      "rewards/bleu_reward_func/std": 0.048032332211732864,
      "step": 332
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 457.0,
      "completions/mean_length": 332.15625,
      "completions/mean_terminated_length": 326.3548278808594,
      "completions/min_length": 171.0,
      "completions/min_terminated_length": 171.0,
      "epoch": 0.2664,
      "grad_norm": 2.5168418884277344,
      "kl": 0.0219573974609375,
      "learning_rate": 1e-06,
      "loss": 0.0112,
      "num_tokens": 4437548.0,
      "reward": 0.07673460245132446,
      "reward_std": 0.024972733110189438,
      "rewards/bleu_reward_func/mean": 0.07673460245132446,
      "rewards/bleu_reward_func/std": 0.06732524931430817,
      "step": 333
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 194.34375,
      "completions/mean_terminated_length": 184.09677124023438,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.2672,
      "grad_norm": 3.8905692100524902,
      "kl": 0.03387451171875,
      "learning_rate": 1e-06,
      "loss": -0.0208,
      "num_tokens": 4449471.0,
      "reward": 0.03636704757809639,
      "reward_std": 0.015638206154108047,
      "rewards/bleu_reward_func/mean": 0.03636704757809639,
      "rewards/bleu_reward_func/std": 0.01746034063398838,
      "step": 334
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 334.34375,
      "completions/mean_terminated_length": 275.125,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.268,
      "grad_norm": 2.877241373062134,
      "kl": 0.031646728515625,
      "learning_rate": 1e-06,
      "loss": -0.0796,
      "num_tokens": 4465066.0,
      "reward": 0.04503689333796501,
      "reward_std": 0.019507717341184616,
      "rewards/bleu_reward_func/mean": 0.04503689333796501,
      "rewards/bleu_reward_func/std": 0.03148737922310829,
      "step": 335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 393.0,
      "completions/mean_length": 361.46875,
      "completions/mean_terminated_length": 244.38888549804688,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "epoch": 0.2688,
      "grad_norm": 2.3143179416656494,
      "kl": 0.023193359375,
      "learning_rate": 1e-06,
      "loss": -0.051,
      "num_tokens": 4479641.0,
      "reward": 0.04236375913023949,
      "reward_std": 0.01834411546587944,
      "rewards/bleu_reward_func/mean": 0.04236375913023949,
      "rewards/bleu_reward_func/std": 0.023316362872719765,
      "step": 336
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 462.0,
      "completions/mean_length": 280.6875,
      "completions/mean_terminated_length": 237.8518524169922,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.2696,
      "grad_norm": 2.4942591190338135,
      "kl": 0.0158843994140625,
      "learning_rate": 1e-06,
      "loss": -0.0403,
      "num_tokens": 4490735.0,
      "reward": 0.0699649378657341,
      "reward_std": 0.025273269042372704,
      "rewards/bleu_reward_func/mean": 0.0699649378657341,
      "rewards/bleu_reward_func/std": 0.07529071718454361,
      "step": 337
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 368.1875,
      "completions/mean_terminated_length": 311.9130554199219,
      "completions/min_length": 152.0,
      "completions/min_terminated_length": 152.0,
      "epoch": 0.2704,
      "grad_norm": 2.073747158050537,
      "kl": 0.0158843994140625,
      "learning_rate": 1e-06,
      "loss": 0.0201,
      "num_tokens": 4505757.0,
      "reward": 0.06272565573453903,
      "reward_std": 0.04652194678783417,
      "rewards/bleu_reward_func/mean": 0.06272565573453903,
      "rewards/bleu_reward_func/std": 0.09588578343391418,
      "step": 338
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 301.90625,
      "completions/mean_terminated_length": 231.875,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.2712,
      "grad_norm": 3.1780126094818115,
      "kl": 0.0195159912109375,
      "learning_rate": 1e-06,
      "loss": -0.1707,
      "num_tokens": 4523066.0,
      "reward": 0.029990248382091522,
      "reward_std": 0.01466078869998455,
      "rewards/bleu_reward_func/mean": 0.029990248382091522,
      "rewards/bleu_reward_func/std": 0.028150422498583794,
      "step": 339
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 358.5,
      "completions/mean_terminated_length": 239.11111450195312,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.272,
      "grad_norm": 2.1125426292419434,
      "kl": 0.0198822021484375,
      "learning_rate": 1e-06,
      "loss": -0.0078,
      "num_tokens": 4540362.0,
      "reward": 0.14181923866271973,
      "reward_std": 0.10672705620527267,
      "rewards/bleu_reward_func/mean": 0.14181923866271973,
      "rewards/bleu_reward_func/std": 0.26110920310020447,
      "step": 340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 247.5,
      "completions/mean_terminated_length": 220.13792419433594,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.2728,
      "grad_norm": 4.835620403289795,
      "kl": 0.0215606689453125,
      "learning_rate": 1e-06,
      "loss": 0.3826,
      "num_tokens": 4551714.0,
      "reward": 0.16201910376548767,
      "reward_std": 0.08808144181966782,
      "rewards/bleu_reward_func/mean": 0.16201910376548767,
      "rewards/bleu_reward_func/std": 0.2542886435985565,
      "step": 341
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 242.84375,
      "completions/mean_terminated_length": 215.0,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.2736,
      "grad_norm": 2.835961103439331,
      "kl": 0.0259246826171875,
      "learning_rate": 1e-06,
      "loss": -0.0701,
      "num_tokens": 4561693.0,
      "reward": 0.04708701744675636,
      "reward_std": 0.015518728643655777,
      "rewards/bleu_reward_func/mean": 0.04708701744675636,
      "rewards/bleu_reward_func/std": 0.048917315900325775,
      "step": 342
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 106.0,
      "completions/mean_length": 282.78125,
      "completions/mean_terminated_length": 53.5625,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.2744,
      "grad_norm": 5.41905403137207,
      "kl": 0.092864990234375,
      "learning_rate": 1e-06,
      "loss": -0.0831,
      "num_tokens": 4574398.0,
      "reward": 0.054219573736190796,
      "reward_std": 0.02302156388759613,
      "rewards/bleu_reward_func/mean": 0.054219573736190796,
      "rewards/bleu_reward_func/std": 0.04756642505526543,
      "step": 343
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 401.09375,
      "completions/mean_terminated_length": 290.1875,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "epoch": 0.2752,
      "grad_norm": 2.166447401046753,
      "kl": 0.0235137939453125,
      "learning_rate": 1e-06,
      "loss": 0.0272,
      "num_tokens": 4590585.0,
      "reward": 0.036001622676849365,
      "reward_std": 0.02016858570277691,
      "rewards/bleu_reward_func/mean": 0.036001622676849365,
      "rewards/bleu_reward_func/std": 0.03957979381084442,
      "step": 344
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 391.0,
      "completions/max_terminated_length": 391.0,
      "completions/mean_length": 157.0625,
      "completions/mean_terminated_length": 157.0625,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.276,
      "grad_norm": 4.106038570404053,
      "kl": 0.029022216796875,
      "learning_rate": 1e-06,
      "loss": -0.0406,
      "num_tokens": 4600707.0,
      "reward": 0.05328774452209473,
      "reward_std": 0.018984105437994003,
      "rewards/bleu_reward_func/mean": 0.05328774452209473,
      "rewards/bleu_reward_func/std": 0.035300616174936295,
      "step": 345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 328.84375,
      "completions/mean_terminated_length": 267.79168701171875,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "epoch": 0.2768,
      "grad_norm": 2.932767391204834,
      "kl": 0.028656005859375,
      "learning_rate": 1e-06,
      "loss": 0.0407,
      "num_tokens": 4613366.0,
      "reward": 0.032469406723976135,
      "reward_std": 0.008480279706418514,
      "rewards/bleu_reward_func/mean": 0.032469406723976135,
      "rewards/bleu_reward_func/std": 0.018695853650569916,
      "step": 346
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 464.0,
      "completions/mean_length": 336.125,
      "completions/mean_terminated_length": 267.3043518066406,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "epoch": 0.2776,
      "grad_norm": 2.633439064025879,
      "kl": 0.023101806640625,
      "learning_rate": 1e-06,
      "loss": -0.0152,
      "num_tokens": 4627018.0,
      "reward": 0.04361742362380028,
      "reward_std": 0.02035902440547943,
      "rewards/bleu_reward_func/mean": 0.04361742362380028,
      "rewards/bleu_reward_func/std": 0.021847298368811607,
      "step": 347
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 293.46875,
      "completions/mean_terminated_length": 207.95652770996094,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.2784,
      "grad_norm": 2.943420886993408,
      "kl": 0.027130126953125,
      "learning_rate": 1e-06,
      "loss": 0.007,
      "num_tokens": 4640505.0,
      "reward": 0.21496786177158356,
      "reward_std": 0.06119208037853241,
      "rewards/bleu_reward_func/mean": 0.21496786177158356,
      "rewards/bleu_reward_func/std": 0.28509706258773804,
      "step": 348
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 359.0,
      "completions/mean_length": 216.59375,
      "completions/mean_terminated_length": 196.90000915527344,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.2792,
      "grad_norm": 2.763566255569458,
      "kl": 0.01910400390625,
      "learning_rate": 1e-06,
      "loss": 0.0424,
      "num_tokens": 4649468.0,
      "reward": 0.07204422354698181,
      "reward_std": 0.04057842493057251,
      "rewards/bleu_reward_func/mean": 0.07204422354698181,
      "rewards/bleu_reward_func/std": 0.06618453562259674,
      "step": 349
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 314.34375,
      "completions/mean_terminated_length": 259.0,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 54.0,
      "epoch": 0.28,
      "grad_norm": 2.9624099731445312,
      "kl": 0.021331787109375,
      "learning_rate": 1e-06,
      "loss": 0.0837,
      "num_tokens": 4665599.0,
      "reward": 0.06650006771087646,
      "reward_std": 0.04054763540625572,
      "rewards/bleu_reward_func/mean": 0.06650006771087646,
      "rewards/bleu_reward_func/std": 0.05784449353814125,
      "step": 350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 302.375,
      "completions/mean_terminated_length": 220.3478240966797,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.2808,
      "grad_norm": 3.327723741531372,
      "kl": 0.02691650390625,
      "learning_rate": 1e-06,
      "loss": -0.0126,
      "num_tokens": 4679795.0,
      "reward": 0.04633244499564171,
      "reward_std": 0.0104750357568264,
      "rewards/bleu_reward_func/mean": 0.04633244499564171,
      "rewards/bleu_reward_func/std": 0.0204442348331213,
      "step": 351
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 476.0,
      "completions/mean_length": 357.96875,
      "completions/mean_terminated_length": 238.1666717529297,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.2816,
      "grad_norm": 3.8369596004486084,
      "kl": 0.0295562744140625,
      "learning_rate": 1e-06,
      "loss": -0.0252,
      "num_tokens": 4693930.0,
      "reward": 0.0545232817530632,
      "reward_std": 0.017153870314359665,
      "rewards/bleu_reward_func/mean": 0.0545232817530632,
      "rewards/bleu_reward_func/std": 0.05576448515057564,
      "step": 352
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 326.03125,
      "completions/mean_terminated_length": 264.04168701171875,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.2824,
      "grad_norm": 2.7297415733337402,
      "kl": 0.025543212890625,
      "learning_rate": 1e-06,
      "loss": 0.1881,
      "num_tokens": 4707739.0,
      "reward": 0.04129674285650253,
      "reward_std": 0.041807621717453,
      "rewards/bleu_reward_func/mean": 0.04129674285650253,
      "rewards/bleu_reward_func/std": 0.05771995335817337,
      "step": 353
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 315.40625,
      "completions/mean_terminated_length": 279.0,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.2832,
      "grad_norm": 3.3393967151641846,
      "kl": 0.026458740234375,
      "learning_rate": 1e-06,
      "loss": -0.0207,
      "num_tokens": 4720072.0,
      "reward": 0.0364898145198822,
      "reward_std": 0.009451567195355892,
      "rewards/bleu_reward_func/mean": 0.0364898145198822,
      "rewards/bleu_reward_func/std": 0.030448051169514656,
      "step": 354
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 282.25,
      "completions/mean_terminated_length": 205.6666717529297,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.284,
      "grad_norm": 3.096372365951538,
      "kl": 0.029876708984375,
      "learning_rate": 1e-06,
      "loss": -0.0442,
      "num_tokens": 4731840.0,
      "reward": 0.03462470322847366,
      "reward_std": 0.024994423612952232,
      "rewards/bleu_reward_func/mean": 0.03462470322847366,
      "rewards/bleu_reward_func/std": 0.03290289640426636,
      "step": 355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 430.0,
      "completions/mean_length": 298.90625,
      "completions/mean_terminated_length": 284.70001220703125,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "epoch": 0.2848,
      "grad_norm": 3.161959409713745,
      "kl": 0.022979736328125,
      "learning_rate": 1e-06,
      "loss": -0.1124,
      "num_tokens": 4743517.0,
      "reward": 0.04477345570921898,
      "reward_std": 0.014152650721371174,
      "rewards/bleu_reward_func/mean": 0.04477345570921898,
      "rewards/bleu_reward_func/std": 0.023930195719003677,
      "step": 356
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 433.0,
      "completions/mean_length": 276.34375,
      "completions/mean_terminated_length": 260.63336181640625,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "epoch": 0.2856,
      "grad_norm": 2.8648571968078613,
      "kl": 0.0195159912109375,
      "learning_rate": 1e-06,
      "loss": -0.1692,
      "num_tokens": 4755056.0,
      "reward": 0.0786188468337059,
      "reward_std": 0.036090441048145294,
      "rewards/bleu_reward_func/mean": 0.0786188468337059,
      "rewards/bleu_reward_func/std": 0.04780329018831253,
      "step": 357
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 298.28125,
      "completions/mean_terminated_length": 227.0416717529297,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.2864,
      "grad_norm": 3.4011240005493164,
      "kl": 0.047210693359375,
      "learning_rate": 1e-06,
      "loss": 0.0091,
      "num_tokens": 4766817.0,
      "reward": 0.05945078283548355,
      "reward_std": 0.01760122738778591,
      "rewards/bleu_reward_func/mean": 0.05945078283548355,
      "rewards/bleu_reward_func/std": 0.030062546953558922,
      "step": 358
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 290.75,
      "completions/mean_terminated_length": 228.79998779296875,
      "completions/min_length": 82.0,
      "completions/min_terminated_length": 82.0,
      "epoch": 0.2872,
      "grad_norm": 2.69232439994812,
      "kl": 0.03240966796875,
      "learning_rate": 1e-06,
      "loss": 0.1015,
      "num_tokens": 4778625.0,
      "reward": 0.04637129232287407,
      "reward_std": 0.017768511548638344,
      "rewards/bleu_reward_func/mean": 0.04637129232287407,
      "rewards/bleu_reward_func/std": 0.024475086480379105,
      "step": 359
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 482.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 196.8125,
      "completions/mean_terminated_length": 196.8125,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.288,
      "grad_norm": 5.5619120597839355,
      "kl": 0.035400390625,
      "learning_rate": 1e-06,
      "loss": -0.1212,
      "num_tokens": 4787059.0,
      "reward": 0.057328179478645325,
      "reward_std": 0.024165252223610878,
      "rewards/bleu_reward_func/mean": 0.057328179478645325,
      "rewards/bleu_reward_func/std": 0.04007139429450035,
      "step": 360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 461.0,
      "completions/max_terminated_length": 461.0,
      "completions/mean_length": 247.90625,
      "completions/mean_terminated_length": 247.90625,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.2888,
      "grad_norm": 3.4733636379241943,
      "kl": 0.0301513671875,
      "learning_rate": 1e-06,
      "loss": 0.105,
      "num_tokens": 4797280.0,
      "reward": 0.0336172953248024,
      "reward_std": 0.02327040769159794,
      "rewards/bleu_reward_func/mean": 0.0336172953248024,
      "rewards/bleu_reward_func/std": 0.03187631443142891,
      "step": 361
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 255.28125,
      "completions/mean_terminated_length": 238.1666717529297,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.2896,
      "grad_norm": 3.501699686050415,
      "kl": 0.0192413330078125,
      "learning_rate": 1e-06,
      "loss": -0.0395,
      "num_tokens": 4807833.0,
      "reward": 0.10404136776924133,
      "reward_std": 0.03069019690155983,
      "rewards/bleu_reward_func/mean": 0.10404136776924133,
      "rewards/bleu_reward_func/std": 0.11612053960561752,
      "step": 362
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 446.0,
      "completions/mean_length": 311.28125,
      "completions/mean_terminated_length": 190.85000610351562,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "epoch": 0.2904,
      "grad_norm": 3.420729160308838,
      "kl": 0.029693603515625,
      "learning_rate": 1e-06,
      "loss": -0.1242,
      "num_tokens": 4821250.0,
      "reward": 0.03283509612083435,
      "reward_std": 0.00878961943089962,
      "rewards/bleu_reward_func/mean": 0.03283509612083435,
      "rewards/bleu_reward_func/std": 0.03208829462528229,
      "step": 363
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 410.65625,
      "completions/mean_terminated_length": 357.5714416503906,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "epoch": 0.2912,
      "grad_norm": 2.394613742828369,
      "kl": 0.020355224609375,
      "learning_rate": 1e-06,
      "loss": 0.0723,
      "num_tokens": 4838031.0,
      "reward": 0.03294292837381363,
      "reward_std": 0.0140132587403059,
      "rewards/bleu_reward_func/mean": 0.03294292837381363,
      "rewards/bleu_reward_func/std": 0.036045484244823456,
      "step": 364
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 478.0,
      "completions/mean_length": 212.4375,
      "completions/mean_terminated_length": 202.77418518066406,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.292,
      "grad_norm": 4.775505542755127,
      "kl": 0.021697998046875,
      "learning_rate": 1e-06,
      "loss": -0.0461,
      "num_tokens": 4850365.0,
      "reward": 0.09430208802223206,
      "reward_std": 0.06408664584159851,
      "rewards/bleu_reward_func/mean": 0.09430208802223206,
      "rewards/bleu_reward_func/std": 0.11308187246322632,
      "step": 365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 391.0,
      "completions/max_terminated_length": 391.0,
      "completions/mean_length": 203.0625,
      "completions/mean_terminated_length": 203.0625,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.2928,
      "grad_norm": 3.6602747440338135,
      "kl": 0.029144287109375,
      "learning_rate": 1e-06,
      "loss": 0.1312,
      "num_tokens": 4859159.0,
      "reward": 0.09555967152118683,
      "reward_std": 0.028682151809334755,
      "rewards/bleu_reward_func/mean": 0.09555967152118683,
      "rewards/bleu_reward_func/std": 0.07242386043071747,
      "step": 366
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 392.46875,
      "completions/mean_terminated_length": 287.0,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 54.0,
      "epoch": 0.2936,
      "grad_norm": 2.596052408218384,
      "kl": 0.0323486328125,
      "learning_rate": 1e-06,
      "loss": -0.0453,
      "num_tokens": 4875438.0,
      "reward": 0.06572610139846802,
      "reward_std": 0.019221346825361252,
      "rewards/bleu_reward_func/mean": 0.06572610139846802,
      "rewards/bleu_reward_func/std": 0.05693361535668373,
      "step": 367
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 330.90625,
      "completions/mean_terminated_length": 260.0434875488281,
      "completions/min_length": 82.0,
      "completions/min_terminated_length": 82.0,
      "epoch": 0.2944,
      "grad_norm": 2.464620590209961,
      "kl": 0.027130126953125,
      "learning_rate": 1e-06,
      "loss": -0.0032,
      "num_tokens": 4889683.0,
      "reward": 0.051844000816345215,
      "reward_std": 0.016486987471580505,
      "rewards/bleu_reward_func/mean": 0.051844000816345215,
      "rewards/bleu_reward_func/std": 0.017129171639680862,
      "step": 368
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 382.0,
      "completions/mean_length": 303.25,
      "completions/mean_terminated_length": 233.6666717529297,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "epoch": 0.2952,
      "grad_norm": 2.683809280395508,
      "kl": 0.0198974609375,
      "learning_rate": 1e-06,
      "loss": 0.0027,
      "num_tokens": 4901515.0,
      "reward": 0.04742293432354927,
      "reward_std": 0.022541342303156853,
      "rewards/bleu_reward_func/mean": 0.04742293432354927,
      "rewards/bleu_reward_func/std": 0.03928080573678017,
      "step": 369
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 276.46875,
      "completions/mean_terminated_length": 210.51998901367188,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.296,
      "grad_norm": 2.637234926223755,
      "kl": 0.0212860107421875,
      "learning_rate": 1e-06,
      "loss": -0.187,
      "num_tokens": 4913330.0,
      "reward": 0.0685403048992157,
      "reward_std": 0.03239838033914566,
      "rewards/bleu_reward_func/mean": 0.0685403048992157,
      "rewards/bleu_reward_func/std": 0.09188274294137955,
      "step": 370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 315.78125,
      "completions/mean_terminated_length": 213.0,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.2968,
      "grad_norm": 4.868023872375488,
      "kl": 0.026947021484375,
      "learning_rate": 1e-06,
      "loss": 0.0314,
      "num_tokens": 4926643.0,
      "reward": 0.11580727994441986,
      "reward_std": 0.05246927589178085,
      "rewards/bleu_reward_func/mean": 0.11580727994441986,
      "rewards/bleu_reward_func/std": 0.12015223503112793,
      "step": 371
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 281.25,
      "completions/mean_terminated_length": 77.64705657958984,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.2976,
      "grad_norm": 5.435993194580078,
      "kl": 0.03948974609375,
      "learning_rate": 1e-06,
      "loss": 0.0457,
      "num_tokens": 4940059.0,
      "reward": 0.021783608943223953,
      "reward_std": 0.0037742627318948507,
      "rewards/bleu_reward_func/mean": 0.021783608943223953,
      "rewards/bleu_reward_func/std": 0.016994595527648926,
      "step": 372
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 361.21875,
      "completions/mean_terminated_length": 310.9583435058594,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.2984,
      "grad_norm": 3.6274783611297607,
      "kl": 0.02838134765625,
      "learning_rate": 1e-06,
      "loss": 0.1372,
      "num_tokens": 4954882.0,
      "reward": 0.033363357186317444,
      "reward_std": 0.010118735022842884,
      "rewards/bleu_reward_func/mean": 0.033363357186317444,
      "rewards/bleu_reward_func/std": 0.01759323477745056,
      "step": 373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 268.46875,
      "completions/mean_terminated_length": 212.2692413330078,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.2992,
      "grad_norm": 3.4594333171844482,
      "kl": 0.040863037109375,
      "learning_rate": 1e-06,
      "loss": -0.03,
      "num_tokens": 4965705.0,
      "reward": 0.06493013352155685,
      "reward_std": 0.024484504014253616,
      "rewards/bleu_reward_func/mean": 0.06493013352155685,
      "rewards/bleu_reward_func/std": 0.05946136638522148,
      "step": 374
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 356.1875,
      "completions/mean_terminated_length": 312.55999755859375,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "epoch": 0.3,
      "grad_norm": 2.657532215118408,
      "kl": 0.0255126953125,
      "learning_rate": 1e-06,
      "loss": -0.1039,
      "num_tokens": 4982839.0,
      "reward": 0.09929439425468445,
      "reward_std": 0.02464054897427559,
      "rewards/bleu_reward_func/mean": 0.09929439425468445,
      "rewards/bleu_reward_func/std": 0.12181542813777924,
      "step": 375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 298.53125,
      "completions/mean_terminated_length": 238.75999450683594,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.3008,
      "grad_norm": 2.7878925800323486,
      "kl": 0.030426025390625,
      "learning_rate": 1e-06,
      "loss": -0.1121,
      "num_tokens": 4994976.0,
      "reward": 0.09792232513427734,
      "reward_std": 0.03112916275858879,
      "rewards/bleu_reward_func/mean": 0.09792232513427734,
      "rewards/bleu_reward_func/std": 0.10117895156145096,
      "step": 376
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 246.875,
      "completions/mean_terminated_length": 209.00001525878906,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "epoch": 0.3016,
      "grad_norm": 3.0961310863494873,
      "kl": 0.039215087890625,
      "learning_rate": 1e-06,
      "loss": -0.0131,
      "num_tokens": 5005532.0,
      "reward": 0.04401427507400513,
      "reward_std": 0.011130438186228275,
      "rewards/bleu_reward_func/mean": 0.04401427507400513,
      "rewards/bleu_reward_func/std": 0.026963340118527412,
      "step": 377
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 432.0,
      "completions/mean_length": 352.96875,
      "completions/mean_terminated_length": 269.6666564941406,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "epoch": 0.3024,
      "grad_norm": 2.8589773178100586,
      "kl": 0.029449462890625,
      "learning_rate": 1e-06,
      "loss": 0.0124,
      "num_tokens": 5019211.0,
      "reward": 0.05802150070667267,
      "reward_std": 0.021435074508190155,
      "rewards/bleu_reward_func/mean": 0.05802150070667267,
      "rewards/bleu_reward_func/std": 0.03692251443862915,
      "step": 378
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 480.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 236.9375,
      "completions/mean_terminated_length": 236.9375,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.3032,
      "grad_norm": 3.043027877807617,
      "kl": 0.02935791015625,
      "learning_rate": 1e-06,
      "loss": -0.0605,
      "num_tokens": 5029297.0,
      "reward": 0.027532659471035004,
      "reward_std": 0.012727165594696999,
      "rewards/bleu_reward_func/mean": 0.027532659471035004,
      "rewards/bleu_reward_func/std": 0.016757391393184662,
      "step": 379
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 404.0,
      "completions/mean_length": 190.5625,
      "completions/mean_terminated_length": 180.19354248046875,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.304,
      "grad_norm": 3.4946486949920654,
      "kl": 0.0279083251953125,
      "learning_rate": 1e-06,
      "loss": 0.1935,
      "num_tokens": 5039483.0,
      "reward": 0.060268811881542206,
      "reward_std": 0.03754986822605133,
      "rewards/bleu_reward_func/mean": 0.060268811881542206,
      "rewards/bleu_reward_func/std": 0.038724955171346664,
      "step": 380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 457.0,
      "completions/mean_length": 221.03125,
      "completions/mean_terminated_length": 190.9310302734375,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.3048,
      "grad_norm": 3.8986661434173584,
      "kl": 0.034942626953125,
      "learning_rate": 1e-06,
      "loss": 0.0139,
      "num_tokens": 5049020.0,
      "reward": 0.034335315227508545,
      "reward_std": 0.00831439159810543,
      "rewards/bleu_reward_func/mean": 0.034335315227508545,
      "rewards/bleu_reward_func/std": 0.01297684945166111,
      "step": 381
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 281.0,
      "completions/mean_terminated_length": 265.6000061035156,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.3056,
      "grad_norm": 3.340994119644165,
      "kl": 0.034881591796875,
      "learning_rate": 1e-06,
      "loss": 0.1244,
      "num_tokens": 5059932.0,
      "reward": 0.031757794320583344,
      "reward_std": 0.00996050052344799,
      "rewards/bleu_reward_func/mean": 0.031757794320583344,
      "rewards/bleu_reward_func/std": 0.014244799502193928,
      "step": 382
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 312.96875,
      "completions/mean_terminated_length": 235.0869598388672,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.3064,
      "grad_norm": 3.1813881397247314,
      "kl": 0.02911376953125,
      "learning_rate": 1e-06,
      "loss": 0.0916,
      "num_tokens": 5072299.0,
      "reward": 0.02868136763572693,
      "reward_std": 0.011929353699088097,
      "rewards/bleu_reward_func/mean": 0.02868136763572693,
      "rewards/bleu_reward_func/std": 0.018595216795802116,
      "step": 383
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 290.03125,
      "completions/mean_terminated_length": 203.17391967773438,
      "completions/min_length": 82.0,
      "completions/min_terminated_length": 82.0,
      "epoch": 0.3072,
      "grad_norm": 2.8743865489959717,
      "kl": 0.027252197265625,
      "learning_rate": 1e-06,
      "loss": 0.0136,
      "num_tokens": 5085700.0,
      "reward": 0.06972040981054306,
      "reward_std": 0.030980605632066727,
      "rewards/bleu_reward_func/mean": 0.06972040981054306,
      "rewards/bleu_reward_func/std": 0.060256555676460266,
      "step": 384
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 469.0,
      "completions/mean_length": 296.78125,
      "completions/mean_terminated_length": 225.0416717529297,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.308,
      "grad_norm": 2.9356751441955566,
      "kl": 0.02740478515625,
      "learning_rate": 1e-06,
      "loss": -0.0112,
      "num_tokens": 5097589.0,
      "reward": 0.04450830817222595,
      "reward_std": 0.02866341546177864,
      "rewards/bleu_reward_func/mean": 0.04450830817222595,
      "rewards/bleu_reward_func/std": 0.04176363721489906,
      "step": 385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 382.125,
      "completions/mean_terminated_length": 304.20001220703125,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.3088,
      "grad_norm": 2.262878656387329,
      "kl": 0.0264129638671875,
      "learning_rate": 1e-06,
      "loss": -0.0724,
      "num_tokens": 5113993.0,
      "reward": 0.12424831092357635,
      "reward_std": 0.04111553356051445,
      "rewards/bleu_reward_func/mean": 0.12424831092357635,
      "rewards/bleu_reward_func/std": 0.15211449563503265,
      "step": 386
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 464.0,
      "completions/mean_length": 362.09375,
      "completions/mean_terminated_length": 293.9545593261719,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.3096,
      "grad_norm": 2.2440779209136963,
      "kl": 0.0277099609375,
      "learning_rate": 1e-06,
      "loss": -0.015,
      "num_tokens": 5128876.0,
      "reward": 0.04977039247751236,
      "reward_std": 0.01728152297437191,
      "rewards/bleu_reward_func/mean": 0.04977039247751236,
      "rewards/bleu_reward_func/std": 0.029223492369055748,
      "step": 387
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 372.0,
      "completions/mean_length": 276.4375,
      "completions/mean_terminated_length": 197.9166717529297,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.3104,
      "grad_norm": 2.516524076461792,
      "kl": 0.027374267578125,
      "learning_rate": 1e-06,
      "loss": -0.0645,
      "num_tokens": 5140674.0,
      "reward": 0.146553173661232,
      "reward_std": 0.04176880046725273,
      "rewards/bleu_reward_func/mean": 0.146553173661232,
      "rewards/bleu_reward_func/std": 0.09359844774007797,
      "step": 388
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 324.96875,
      "completions/mean_terminated_length": 305.6206970214844,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.3112,
      "grad_norm": 2.948312759399414,
      "kl": 0.02374267578125,
      "learning_rate": 1e-06,
      "loss": 0.0621,
      "num_tokens": 5155745.0,
      "reward": 0.106197290122509,
      "reward_std": 0.06629303842782974,
      "rewards/bleu_reward_func/mean": 0.106197290122509,
      "rewards/bleu_reward_func/std": 0.11851444095373154,
      "step": 389
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 298.5625,
      "completions/mean_terminated_length": 152.5263214111328,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.312,
      "grad_norm": 14.583303451538086,
      "kl": 0.05859375,
      "learning_rate": 1e-06,
      "loss": -0.0482,
      "num_tokens": 5168939.0,
      "reward": 0.07266208529472351,
      "reward_std": 0.019275350496172905,
      "rewards/bleu_reward_func/mean": 0.07266208529472351,
      "rewards/bleu_reward_func/std": 0.11420844495296478,
      "step": 390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 424.0,
      "completions/mean_length": 265.125,
      "completions/mean_terminated_length": 257.1612854003906,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "epoch": 0.3128,
      "grad_norm": 2.5951454639434814,
      "kl": 0.025299072265625,
      "learning_rate": 1e-06,
      "loss": 0.0048,
      "num_tokens": 5179439.0,
      "reward": 0.0554918497800827,
      "reward_std": 0.020207270979881287,
      "rewards/bleu_reward_func/mean": 0.0554918497800827,
      "rewards/bleu_reward_func/std": 0.052480507642030716,
      "step": 391
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 452.09375,
      "completions/mean_terminated_length": 375.0714416503906,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 0.3136,
      "grad_norm": 1.9732576608657837,
      "kl": 0.02559661865234375,
      "learning_rate": 1e-06,
      "loss": -0.0084,
      "num_tokens": 5196810.0,
      "reward": 0.07723353058099747,
      "reward_std": 0.021651268005371094,
      "rewards/bleu_reward_func/mean": 0.07723353058099747,
      "rewards/bleu_reward_func/std": 0.06710720807313919,
      "step": 392
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 392.0,
      "completions/mean_terminated_length": 237.71429443359375,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 65.0,
      "epoch": 0.3144,
      "grad_norm": 2.6798901557922363,
      "kl": 0.03570556640625,
      "learning_rate": 1e-06,
      "loss": -0.0554,
      "num_tokens": 5212434.0,
      "reward": 0.06295132637023926,
      "reward_std": 0.022763650864362717,
      "rewards/bleu_reward_func/mean": 0.06295132637023926,
      "rewards/bleu_reward_func/std": 0.03463296964764595,
      "step": 393
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 472.0,
      "completions/mean_length": 456.6875,
      "completions/mean_terminated_length": 385.5714416503906,
      "completions/min_length": 289.0,
      "completions/min_terminated_length": 289.0,
      "epoch": 0.3152,
      "grad_norm": 2.0554585456848145,
      "kl": 0.029510498046875,
      "learning_rate": 1e-06,
      "loss": -0.0209,
      "num_tokens": 5230848.0,
      "reward": 0.04525914788246155,
      "reward_std": 0.007387248799204826,
      "rewards/bleu_reward_func/mean": 0.04525914788246155,
      "rewards/bleu_reward_func/std": 0.02194630168378353,
      "step": 394
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 383.0,
      "completions/mean_length": 155.40625,
      "completions/mean_terminated_length": 143.90322875976562,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.316,
      "grad_norm": 3.7899491786956787,
      "kl": 0.037841796875,
      "learning_rate": 1e-06,
      "loss": -0.177,
      "num_tokens": 5238805.0,
      "reward": 0.039721157401800156,
      "reward_std": 0.03929724916815758,
      "rewards/bleu_reward_func/mean": 0.039721157401800156,
      "rewards/bleu_reward_func/std": 0.05673614889383316,
      "step": 395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 430.0,
      "completions/mean_length": 358.65625,
      "completions/mean_terminated_length": 298.6521911621094,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "epoch": 0.3168,
      "grad_norm": 2.6228115558624268,
      "kl": 0.01983642578125,
      "learning_rate": 1e-06,
      "loss": -0.0173,
      "num_tokens": 5256122.0,
      "reward": 0.0510735958814621,
      "reward_std": 0.016432739794254303,
      "rewards/bleu_reward_func/mean": 0.0510735958814621,
      "rewards/bleu_reward_func/std": 0.0460241362452507,
      "step": 396
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 339.0,
      "completions/max_terminated_length": 339.0,
      "completions/mean_length": 125.5625,
      "completions/mean_terminated_length": 125.5625,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.3176,
      "grad_norm": 14.036137580871582,
      "kl": 0.1956787109375,
      "learning_rate": 1e-06,
      "loss": -0.0332,
      "num_tokens": 5265292.0,
      "reward": 0.09528109431266785,
      "reward_std": 0.0211674515157938,
      "rewards/bleu_reward_func/mean": 0.09528109431266785,
      "rewards/bleu_reward_func/std": 0.07186417281627655,
      "step": 397
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 466.0,
      "completions/max_terminated_length": 466.0,
      "completions/mean_length": 261.75,
      "completions/mean_terminated_length": 261.75,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "epoch": 0.3184,
      "grad_norm": 3.5530920028686523,
      "kl": 0.02069091796875,
      "learning_rate": 1e-06,
      "loss": -0.2157,
      "num_tokens": 5275676.0,
      "reward": 0.05174801126122475,
      "reward_std": 0.020869575440883636,
      "rewards/bleu_reward_func/mean": 0.05174801126122475,
      "rewards/bleu_reward_func/std": 0.04189353436231613,
      "step": 398
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 234.75,
      "completions/mean_terminated_length": 157.1199951171875,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.3192,
      "grad_norm": 3.5558056831359863,
      "kl": 0.0399169921875,
      "learning_rate": 1e-06,
      "loss": 0.1006,
      "num_tokens": 5286300.0,
      "reward": 0.04154960438609123,
      "reward_std": 0.019385188817977905,
      "rewards/bleu_reward_func/mean": 0.04154960438609123,
      "rewards/bleu_reward_func/std": 0.028244102373719215,
      "step": 399
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 502.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 255.625,
      "completions/mean_terminated_length": 255.625,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.32,
      "grad_norm": 3.1600592136383057,
      "kl": 0.031707763671875,
      "learning_rate": 1e-06,
      "loss": -0.0721,
      "num_tokens": 5296840.0,
      "reward": 0.06132878363132477,
      "reward_std": 0.017585109919309616,
      "rewards/bleu_reward_func/mean": 0.06132878363132477,
      "rewards/bleu_reward_func/std": 0.03360173851251602,
      "step": 400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 69.0,
      "completions/mean_length": 378.65625,
      "completions/mean_terminated_length": 37.88888931274414,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.3208,
      "grad_norm": 4.9900360107421875,
      "kl": 0.033050537109375,
      "learning_rate": 1e-06,
      "loss": -0.0658,
      "num_tokens": 5313149.0,
      "reward": 0.025677043944597244,
      "reward_std": 0.0111201461404562,
      "rewards/bleu_reward_func/mean": 0.025677043944597244,
      "rewards/bleu_reward_func/std": 0.02006489410996437,
      "step": 401
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 297.375,
      "completions/mean_terminated_length": 108.0,
      "completions/min_length": 44.0,
      "completions/min_terminated_length": 44.0,
      "epoch": 0.3216,
      "grad_norm": 3.406717538833618,
      "kl": 0.040863037109375,
      "learning_rate": 1e-06,
      "loss": -0.0749,
      "num_tokens": 5325945.0,
      "reward": 0.045327264815568924,
      "reward_std": 0.02512126788496971,
      "rewards/bleu_reward_func/mean": 0.045327264815568924,
      "rewards/bleu_reward_func/std": 0.03467832878232002,
      "step": 402
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 296.5625,
      "completions/mean_terminated_length": 236.239990234375,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.3224,
      "grad_norm": 3.1772117614746094,
      "kl": 0.032318115234375,
      "learning_rate": 1e-06,
      "loss": 0.0194,
      "num_tokens": 5338475.0,
      "reward": 0.044717058539390564,
      "reward_std": 0.014735497534275055,
      "rewards/bleu_reward_func/mean": 0.044717058539390564,
      "rewards/bleu_reward_func/std": 0.025379199534654617,
      "step": 403
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 248.46875,
      "completions/mean_terminated_length": 239.9677276611328,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.3232,
      "grad_norm": 4.004060745239258,
      "kl": 0.04180908203125,
      "learning_rate": 1e-06,
      "loss": -0.093,
      "num_tokens": 5349122.0,
      "reward": 0.06194135546684265,
      "reward_std": 0.02376762218773365,
      "rewards/bleu_reward_func/mean": 0.06194135546684265,
      "rewards/bleu_reward_func/std": 0.02893226593732834,
      "step": 404
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 412.0,
      "completions/mean_length": 164.25,
      "completions/mean_terminated_length": 153.03225708007812,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.324,
      "grad_norm": 4.68143892288208,
      "kl": 0.0418701171875,
      "learning_rate": 1e-06,
      "loss": 0.0933,
      "num_tokens": 5356842.0,
      "reward": 0.17849373817443848,
      "reward_std": 0.10065864771604538,
      "rewards/bleu_reward_func/mean": 0.17849373817443848,
      "rewards/bleu_reward_func/std": 0.25785142183303833,
      "step": 405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 394.03125,
      "completions/mean_terminated_length": 354.7083435058594,
      "completions/min_length": 191.0,
      "completions/min_terminated_length": 191.0,
      "epoch": 0.3248,
      "grad_norm": 2.360607862472534,
      "kl": 0.026702880859375,
      "learning_rate": 1e-06,
      "loss": -0.0347,
      "num_tokens": 5371827.0,
      "reward": 0.08791603147983551,
      "reward_std": 0.02064087614417076,
      "rewards/bleu_reward_func/mean": 0.08791603147983551,
      "rewards/bleu_reward_func/std": 0.08002207428216934,
      "step": 406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 469.0,
      "completions/mean_length": 407.75,
      "completions/mean_terminated_length": 208.72727966308594,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "epoch": 0.3256,
      "grad_norm": 2.7047009468078613,
      "kl": 0.034332275390625,
      "learning_rate": 1e-06,
      "loss": 0.0369,
      "num_tokens": 5388667.0,
      "reward": 0.035750459879636765,
      "reward_std": 0.00714261457324028,
      "rewards/bleu_reward_func/mean": 0.035750459879636765,
      "rewards/bleu_reward_func/std": 0.02296554110944271,
      "step": 407
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 337.78125,
      "completions/mean_terminated_length": 279.7083435058594,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.3264,
      "grad_norm": 2.637406349182129,
      "kl": 0.036590576171875,
      "learning_rate": 1e-06,
      "loss": 0.0154,
      "num_tokens": 5401732.0,
      "reward": 0.029655063524842262,
      "reward_std": 0.010500041767954826,
      "rewards/bleu_reward_func/mean": 0.029655063524842262,
      "rewards/bleu_reward_func/std": 0.012400495819747448,
      "step": 408
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 406.71875,
      "completions/mean_terminated_length": 351.5714416503906,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 0.3272,
      "grad_norm": 2.2207653522491455,
      "kl": 0.026458740234375,
      "learning_rate": 1e-06,
      "loss": 0.0013,
      "num_tokens": 5419075.0,
      "reward": 0.06239059194922447,
      "reward_std": 0.018355626612901688,
      "rewards/bleu_reward_func/mean": 0.06239059194922447,
      "rewards/bleu_reward_func/std": 0.03472558781504631,
      "step": 409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 330.625,
      "completions/mean_terminated_length": 304.71429443359375,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.328,
      "grad_norm": 2.539473295211792,
      "kl": 0.03277587890625,
      "learning_rate": 1e-06,
      "loss": -0.0873,
      "num_tokens": 5431623.0,
      "reward": 0.06792166829109192,
      "reward_std": 0.02972714975476265,
      "rewards/bleu_reward_func/mean": 0.06792166829109192,
      "rewards/bleu_reward_func/std": 0.04709053412079811,
      "step": 410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 221.0,
      "completions/mean_length": 327.4375,
      "completions/mean_terminated_length": 142.875,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.3288,
      "grad_norm": 3.064293384552002,
      "kl": 0.0247650146484375,
      "learning_rate": 1e-06,
      "loss": -0.0849,
      "num_tokens": 5445821.0,
      "reward": 0.027155395597219467,
      "reward_std": 0.012970471754670143,
      "rewards/bleu_reward_func/mean": 0.027155395597219467,
      "rewards/bleu_reward_func/std": 0.016487330198287964,
      "step": 411
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 323.96875,
      "completions/mean_terminated_length": 250.3913116455078,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.3296,
      "grad_norm": 3.3556032180786133,
      "kl": 0.026641845703125,
      "learning_rate": 1e-06,
      "loss": 0.0394,
      "num_tokens": 5459084.0,
      "reward": 0.05504102632403374,
      "reward_std": 0.01868896186351776,
      "rewards/bleu_reward_func/mean": 0.05504102632403374,
      "rewards/bleu_reward_func/std": 0.032063912600278854,
      "step": 412
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 293.34375,
      "completions/mean_terminated_length": 262.1071472167969,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "epoch": 0.3304,
      "grad_norm": 2.61944842338562,
      "kl": 0.027618408203125,
      "learning_rate": 1e-06,
      "loss": -0.0423,
      "num_tokens": 5471703.0,
      "reward": 0.04594315215945244,
      "reward_std": 0.016052130609750748,
      "rewards/bleu_reward_func/mean": 0.04594315215945244,
      "rewards/bleu_reward_func/std": 0.030643180012702942,
      "step": 413
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 381.40625,
      "completions/mean_terminated_length": 292.0526428222656,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "epoch": 0.3312,
      "grad_norm": 2.4337165355682373,
      "kl": 0.0261993408203125,
      "learning_rate": 1e-06,
      "loss": 0.0127,
      "num_tokens": 5486460.0,
      "reward": 0.03960081934928894,
      "reward_std": 0.010423287749290466,
      "rewards/bleu_reward_func/mean": 0.03960081934928894,
      "rewards/bleu_reward_func/std": 0.018299689516425133,
      "step": 414
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 415.25,
      "completions/mean_terminated_length": 329.8823547363281,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "epoch": 0.332,
      "grad_norm": 2.1501145362854004,
      "kl": 0.034088134765625,
      "learning_rate": 1e-06,
      "loss": -0.0909,
      "num_tokens": 5503620.0,
      "reward": 0.051469504833221436,
      "reward_std": 0.01700912043452263,
      "rewards/bleu_reward_func/mean": 0.051469504833221436,
      "rewards/bleu_reward_func/std": 0.03233012557029724,
      "step": 415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 471.0,
      "completions/mean_length": 339.90625,
      "completions/mean_terminated_length": 300.19232177734375,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "epoch": 0.3328,
      "grad_norm": 2.446009397506714,
      "kl": 0.027069091796875,
      "learning_rate": 1e-06,
      "loss": 0.0161,
      "num_tokens": 5518281.0,
      "reward": 0.04302642494440079,
      "reward_std": 0.018161989748477936,
      "rewards/bleu_reward_func/mean": 0.04302642494440079,
      "rewards/bleu_reward_func/std": 0.037101197987794876,
      "step": 416
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 456.0,
      "completions/mean_length": 329.46875,
      "completions/mean_terminated_length": 268.625,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "epoch": 0.3336,
      "grad_norm": 2.420654058456421,
      "kl": 0.030914306640625,
      "learning_rate": 1e-06,
      "loss": -0.1055,
      "num_tokens": 5532568.0,
      "reward": 0.04830830916762352,
      "reward_std": 0.015328258275985718,
      "rewards/bleu_reward_func/mean": 0.04830830916762352,
      "rewards/bleu_reward_func/std": 0.02932472713291645,
      "step": 417
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 312.78125,
      "completions/mean_terminated_length": 246.375,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.3344,
      "grad_norm": 2.4366395473480225,
      "kl": 0.03277587890625,
      "learning_rate": 1e-06,
      "loss": -0.063,
      "num_tokens": 5548425.0,
      "reward": 0.028444793075323105,
      "reward_std": 0.009239492937922478,
      "rewards/bleu_reward_func/mean": 0.028444793075323105,
      "rewards/bleu_reward_func/std": 0.024574536830186844,
      "step": 418
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 336.5625,
      "completions/mean_terminated_length": 324.8666687011719,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.3352,
      "grad_norm": 2.1961772441864014,
      "kl": 0.028564453125,
      "learning_rate": 1e-06,
      "loss": -0.0136,
      "num_tokens": 5561371.0,
      "reward": 0.03844983130693436,
      "reward_std": 0.01347007229924202,
      "rewards/bleu_reward_func/mean": 0.03844983130693436,
      "rewards/bleu_reward_func/std": 0.02470613457262516,
      "step": 419
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 435.0,
      "completions/mean_length": 283.21875,
      "completions/mean_terminated_length": 240.8518524169922,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.336,
      "grad_norm": 2.4404196739196777,
      "kl": 0.02447509765625,
      "learning_rate": 1e-06,
      "loss": 0.1708,
      "num_tokens": 5576154.0,
      "reward": 0.12900003790855408,
      "reward_std": 0.05478304252028465,
      "rewards/bleu_reward_func/mean": 0.12900003790855408,
      "rewards/bleu_reward_func/std": 0.08241026103496552,
      "step": 420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 288.65625,
      "completions/mean_terminated_length": 247.29629516601562,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.3368,
      "grad_norm": 2.4558486938476562,
      "kl": 0.02593994140625,
      "learning_rate": 1e-06,
      "loss": 0.133,
      "num_tokens": 5588247.0,
      "reward": 0.04614394158124924,
      "reward_std": 0.022616572678089142,
      "rewards/bleu_reward_func/mean": 0.04614394158124924,
      "rewards/bleu_reward_func/std": 0.042861953377723694,
      "step": 421
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 451.0,
      "completions/mean_length": 405.34375,
      "completions/mean_terminated_length": 298.6875,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.3376,
      "grad_norm": 2.181236505508423,
      "kl": 0.035614013671875,
      "learning_rate": 1e-06,
      "loss": 0.1833,
      "num_tokens": 5604570.0,
      "reward": 0.029749825596809387,
      "reward_std": 0.019359689205884933,
      "rewards/bleu_reward_func/mean": 0.029749825596809387,
      "rewards/bleu_reward_func/std": 0.0279875285923481,
      "step": 422
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 362.21875,
      "completions/mean_terminated_length": 312.29168701171875,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "epoch": 0.3384,
      "grad_norm": 2.532691240310669,
      "kl": 0.029205322265625,
      "learning_rate": 1e-06,
      "loss": 0.0322,
      "num_tokens": 5618353.0,
      "reward": 0.07113789021968842,
      "reward_std": 0.01926705427467823,
      "rewards/bleu_reward_func/mean": 0.07113789021968842,
      "rewards/bleu_reward_func/std": 0.07943008095026016,
      "step": 423
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 272.71875,
      "completions/mean_terminated_length": 256.7666931152344,
      "completions/min_length": 89.0,
      "completions/min_terminated_length": 89.0,
      "epoch": 0.3392,
      "grad_norm": 3.0738492012023926,
      "kl": 0.032440185546875,
      "learning_rate": 1e-06,
      "loss": 0.0292,
      "num_tokens": 5629712.0,
      "reward": 0.0503704771399498,
      "reward_std": 0.021814901381731033,
      "rewards/bleu_reward_func/mean": 0.0503704771399498,
      "rewards/bleu_reward_func/std": 0.05399306118488312,
      "step": 424
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 359.0,
      "completions/mean_length": 316.59375,
      "completions/mean_terminated_length": 240.13043212890625,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "epoch": 0.34,
      "grad_norm": 2.2919762134552,
      "kl": 0.03045654296875,
      "learning_rate": 1e-06,
      "loss": -0.0064,
      "num_tokens": 5642171.0,
      "reward": 0.04583510756492615,
      "reward_std": 0.016485266387462616,
      "rewards/bleu_reward_func/mean": 0.04583510756492615,
      "rewards/bleu_reward_func/std": 0.033508144319057465,
      "step": 425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 453.0,
      "completions/max_terminated_length": 453.0,
      "completions/mean_length": 293.15625,
      "completions/mean_terminated_length": 293.15625,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.3408,
      "grad_norm": 2.3206934928894043,
      "kl": 0.027252197265625,
      "learning_rate": 1e-06,
      "loss": 0.0463,
      "num_tokens": 5653728.0,
      "reward": 0.020433904603123665,
      "reward_std": 0.011948324739933014,
      "rewards/bleu_reward_func/mean": 0.020433904603123665,
      "rewards/bleu_reward_func/std": 0.019608385860919952,
      "step": 426
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 471.0,
      "completions/mean_length": 266.75,
      "completions/mean_terminated_length": 210.1538543701172,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "epoch": 0.3416,
      "grad_norm": 2.8593051433563232,
      "kl": 0.040802001953125,
      "learning_rate": 1e-06,
      "loss": -0.0247,
      "num_tokens": 5664072.0,
      "reward": 0.03389505296945572,
      "reward_std": 0.013023952953517437,
      "rewards/bleu_reward_func/mean": 0.03389505296945572,
      "rewards/bleu_reward_func/std": 0.02550988271832466,
      "step": 427
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 302.5625,
      "completions/mean_terminated_length": 232.75,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.3424,
      "grad_norm": 2.5821361541748047,
      "kl": 0.0328369140625,
      "learning_rate": 1e-06,
      "loss": -0.0366,
      "num_tokens": 5678370.0,
      "reward": 0.05251599848270416,
      "reward_std": 0.01706322655081749,
      "rewards/bleu_reward_func/mean": 0.05251599848270416,
      "rewards/bleu_reward_func/std": 0.0254330113530159,
      "step": 428
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 458.53125,
      "completions/mean_terminated_length": 426.45001220703125,
      "completions/min_length": 290.0,
      "completions/min_terminated_length": 290.0,
      "epoch": 0.3432,
      "grad_norm": 1.9138927459716797,
      "kl": 0.02752685546875,
      "learning_rate": 1e-06,
      "loss": -0.0187,
      "num_tokens": 5696027.0,
      "reward": 0.0476941354572773,
      "reward_std": 0.01937401294708252,
      "rewards/bleu_reward_func/mean": 0.0476941354572773,
      "rewards/bleu_reward_func/std": 0.030997809022665024,
      "step": 429
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 252.4375,
      "completions/mean_terminated_length": 165.9166717529297,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.344,
      "grad_norm": 4.442004203796387,
      "kl": 0.035736083984375,
      "learning_rate": 1e-06,
      "loss": -0.0416,
      "num_tokens": 5706201.0,
      "reward": 0.09095358103513718,
      "reward_std": 0.04099667817354202,
      "rewards/bleu_reward_func/mean": 0.09095358103513718,
      "rewards/bleu_reward_func/std": 0.08905672281980515,
      "step": 430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 489.5625,
      "completions/mean_terminated_length": 422.25,
      "completions/min_length": 279.0,
      "completions/min_terminated_length": 279.0,
      "epoch": 0.3448,
      "grad_norm": 1.909276008605957,
      "kl": 0.035797119140625,
      "learning_rate": 1e-06,
      "loss": 0.0172,
      "num_tokens": 5726163.0,
      "reward": 0.041696012020111084,
      "reward_std": 0.019856570288538933,
      "rewards/bleu_reward_func/mean": 0.041696012020111084,
      "rewards/bleu_reward_func/std": 0.03666767105460167,
      "step": 431
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 397.15625,
      "completions/mean_terminated_length": 337.0,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "epoch": 0.3456,
      "grad_norm": 2.4237301349639893,
      "kl": 0.036224365234375,
      "learning_rate": 1e-06,
      "loss": 0.0294,
      "num_tokens": 5745016.0,
      "reward": 0.038987092673778534,
      "reward_std": 0.013197116553783417,
      "rewards/bleu_reward_func/mean": 0.038987092673778534,
      "rewards/bleu_reward_func/std": 0.03411531820893288,
      "step": 432
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 225.28125,
      "completions/mean_terminated_length": 195.6206817626953,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.3464,
      "grad_norm": 4.905090808868408,
      "kl": 0.045806884765625,
      "learning_rate": 1e-06,
      "loss": 0.1137,
      "num_tokens": 5754873.0,
      "reward": 0.04227697476744652,
      "reward_std": 0.012464666739106178,
      "rewards/bleu_reward_func/mean": 0.04227697476744652,
      "rewards/bleu_reward_func/std": 0.02255011536180973,
      "step": 433
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 439.0,
      "completions/mean_length": 390.65625,
      "completions/mean_terminated_length": 269.3125,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "epoch": 0.3472,
      "grad_norm": 2.905046224594116,
      "kl": 0.0345458984375,
      "learning_rate": 1e-06,
      "loss": 0.0644,
      "num_tokens": 5772358.0,
      "reward": 0.05010952055454254,
      "reward_std": 0.026643291115760803,
      "rewards/bleu_reward_func/mean": 0.05010952055454254,
      "rewards/bleu_reward_func/std": 0.04131516441702843,
      "step": 434
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 437.0,
      "completions/mean_length": 389.78125,
      "completions/mean_terminated_length": 316.45001220703125,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.348,
      "grad_norm": 2.3226656913757324,
      "kl": 0.032135009765625,
      "learning_rate": 1e-06,
      "loss": 0.0247,
      "num_tokens": 5787815.0,
      "reward": 0.04076055437326431,
      "reward_std": 0.009578779339790344,
      "rewards/bleu_reward_func/mean": 0.04076055437326431,
      "rewards/bleu_reward_func/std": 0.018154015764594078,
      "step": 435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 389.65625,
      "completions/mean_terminated_length": 355.3999938964844,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 215.0,
      "epoch": 0.3488,
      "grad_norm": 2.4872429370880127,
      "kl": 0.027496337890625,
      "learning_rate": 1e-06,
      "loss": -0.0609,
      "num_tokens": 5802556.0,
      "reward": 0.09538507461547852,
      "reward_std": 0.02605431340634823,
      "rewards/bleu_reward_func/mean": 0.09538507461547852,
      "rewards/bleu_reward_func/std": 0.060399290174245834,
      "step": 436
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 479.0,
      "completions/mean_length": 281.0625,
      "completions/mean_terminated_length": 204.08334350585938,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.3496,
      "grad_norm": 3.9208173751831055,
      "kl": 0.0377197265625,
      "learning_rate": 1e-06,
      "loss": 0.0755,
      "num_tokens": 5814790.0,
      "reward": 0.07319147884845734,
      "reward_std": 0.021372804418206215,
      "rewards/bleu_reward_func/mean": 0.07319147884845734,
      "rewards/bleu_reward_func/std": 0.06475379317998886,
      "step": 437
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 273.375,
      "completions/mean_terminated_length": 265.6773986816406,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "epoch": 0.3504,
      "grad_norm": 2.6346755027770996,
      "kl": 0.025146484375,
      "learning_rate": 1e-06,
      "loss": -0.0409,
      "num_tokens": 5827610.0,
      "reward": 0.04968114197254181,
      "reward_std": 0.01877327263355255,
      "rewards/bleu_reward_func/mean": 0.04968114197254181,
      "rewards/bleu_reward_func/std": 0.0327991247177124,
      "step": 438
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 343.5625,
      "completions/mean_terminated_length": 326.137939453125,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.3512,
      "grad_norm": 2.0653529167175293,
      "kl": 0.02227783203125,
      "learning_rate": 1e-06,
      "loss": 0.0021,
      "num_tokens": 5844580.0,
      "reward": 0.04412662982940674,
      "reward_std": 0.03156070411205292,
      "rewards/bleu_reward_func/mean": 0.04412662982940674,
      "rewards/bleu_reward_func/std": 0.039357323199510574,
      "step": 439
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 387.90625,
      "completions/mean_terminated_length": 339.34783935546875,
      "completions/min_length": 143.0,
      "completions/min_terminated_length": 143.0,
      "epoch": 0.352,
      "grad_norm": 2.318570852279663,
      "kl": 0.037872314453125,
      "learning_rate": 1e-06,
      "loss": -0.0107,
      "num_tokens": 5860249.0,
      "reward": 0.04714466631412506,
      "reward_std": 0.008974202908575535,
      "rewards/bleu_reward_func/mean": 0.04714466631412506,
      "rewards/bleu_reward_func/std": 0.06077088788151741,
      "step": 440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 457.0,
      "completions/mean_length": 221.6875,
      "completions/mean_terminated_length": 124.91667175292969,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.3528,
      "grad_norm": 4.25236177444458,
      "kl": 0.05303955078125,
      "learning_rate": 1e-06,
      "loss": -0.211,
      "num_tokens": 5870535.0,
      "reward": 0.04983676224946976,
      "reward_std": 0.0235724039375782,
      "rewards/bleu_reward_func/mean": 0.04983676224946976,
      "rewards/bleu_reward_func/std": 0.05665838345885277,
      "step": 441
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 178.6875,
      "completions/mean_terminated_length": 167.93548583984375,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.3536,
      "grad_norm": 4.9184794425964355,
      "kl": 0.047027587890625,
      "learning_rate": 1e-06,
      "loss": 0.1442,
      "num_tokens": 5878733.0,
      "reward": 0.07976078987121582,
      "reward_std": 0.03809776157140732,
      "rewards/bleu_reward_func/mean": 0.07976078987121582,
      "rewards/bleu_reward_func/std": 0.08007866889238358,
      "step": 442
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 465.0,
      "completions/mean_length": 318.40625,
      "completions/mean_terminated_length": 253.875,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 0.3544,
      "grad_norm": 3.3139231204986572,
      "kl": 0.028900146484375,
      "learning_rate": 1e-06,
      "loss": -0.0201,
      "num_tokens": 5891178.0,
      "reward": 0.16382896900177002,
      "reward_std": 0.028719859197735786,
      "rewards/bleu_reward_func/mean": 0.16382896900177002,
      "rewards/bleu_reward_func/std": 0.2198071926832199,
      "step": 443
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 358.8125,
      "completions/mean_terminated_length": 330.4444580078125,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "epoch": 0.3552,
      "grad_norm": 2.494014024734497,
      "kl": 0.0315093994140625,
      "learning_rate": 1e-06,
      "loss": 0.0693,
      "num_tokens": 5905684.0,
      "reward": 0.15048904716968536,
      "reward_std": 0.041027601808309555,
      "rewards/bleu_reward_func/mean": 0.15048904716968536,
      "rewards/bleu_reward_func/std": 0.21759282052516937,
      "step": 444
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 464.0,
      "completions/mean_length": 305.0,
      "completions/mean_terminated_length": 266.6666564941406,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "epoch": 0.356,
      "grad_norm": 2.9205825328826904,
      "kl": 0.031890869140625,
      "learning_rate": 1e-06,
      "loss": -0.0476,
      "num_tokens": 5917428.0,
      "reward": 0.03617691248655319,
      "reward_std": 0.013296255841851234,
      "rewards/bleu_reward_func/mean": 0.03617691248655319,
      "rewards/bleu_reward_func/std": 0.01859820820391178,
      "step": 445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 263.28125,
      "completions/mean_terminated_length": 227.75001525878906,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.3568,
      "grad_norm": 5.170389652252197,
      "kl": 0.03790283203125,
      "learning_rate": 1e-06,
      "loss": -0.0332,
      "num_tokens": 5929189.0,
      "reward": 0.043621551245450974,
      "reward_std": 0.01647448167204857,
      "rewards/bleu_reward_func/mean": 0.043621551245450974,
      "rewards/bleu_reward_func/std": 0.026312116533517838,
      "step": 446
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 487.0,
      "completions/mean_length": 391.03125,
      "completions/mean_terminated_length": 284.29412841796875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "epoch": 0.3576,
      "grad_norm": 2.2997524738311768,
      "kl": 0.0362548828125,
      "learning_rate": 1e-06,
      "loss": -0.0569,
      "num_tokens": 5945750.0,
      "reward": 0.03494875133037567,
      "reward_std": 0.00850139930844307,
      "rewards/bleu_reward_func/mean": 0.03494875133037567,
      "rewards/bleu_reward_func/std": 0.02439236082136631,
      "step": 447
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 219.78125,
      "completions/mean_terminated_length": 189.55172729492188,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.3584,
      "grad_norm": 3.48850154876709,
      "kl": 0.028778076171875,
      "learning_rate": 1e-06,
      "loss": -0.0647,
      "num_tokens": 5954879.0,
      "reward": 0.08465160429477692,
      "reward_std": 0.0695774257183075,
      "rewards/bleu_reward_func/mean": 0.08465160429477692,
      "rewards/bleu_reward_func/std": 0.11551004648208618,
      "step": 448
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 444.0,
      "completions/mean_length": 338.8125,
      "completions/mean_terminated_length": 220.3157958984375,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.3592,
      "grad_norm": 3.661801338195801,
      "kl": 0.0262603759765625,
      "learning_rate": 1e-06,
      "loss": -0.0604,
      "num_tokens": 5969169.0,
      "reward": 0.09182567894458771,
      "reward_std": 0.03519277274608612,
      "rewards/bleu_reward_func/mean": 0.09182567894458771,
      "rewards/bleu_reward_func/std": 0.08222125470638275,
      "step": 449
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 351.8125,
      "completions/mean_terminated_length": 314.8461608886719,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.36,
      "grad_norm": 2.7594478130340576,
      "kl": 0.02459716796875,
      "learning_rate": 1e-06,
      "loss": -0.1806,
      "num_tokens": 5982683.0,
      "reward": 0.07073526084423065,
      "reward_std": 0.04453439265489578,
      "rewards/bleu_reward_func/mean": 0.07073526084423065,
      "rewards/bleu_reward_func/std": 0.049988992512226105,
      "step": 450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 439.0,
      "completions/mean_length": 234.6875,
      "completions/mean_terminated_length": 142.25,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.3608,
      "grad_norm": 6.336760520935059,
      "kl": 0.08734130859375,
      "learning_rate": 1e-06,
      "loss": -0.0034,
      "num_tokens": 5995337.0,
      "reward": 0.14889724552631378,
      "reward_std": 0.030763918533921242,
      "rewards/bleu_reward_func/mean": 0.14889724552631378,
      "rewards/bleu_reward_func/std": 0.18464770913124084,
      "step": 451
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 417.0,
      "completions/max_terminated_length": 417.0,
      "completions/mean_length": 270.75,
      "completions/mean_terminated_length": 270.75,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "epoch": 0.3616,
      "grad_norm": 2.630225419998169,
      "kl": 0.03399658203125,
      "learning_rate": 1e-06,
      "loss": 0.014,
      "num_tokens": 6006921.0,
      "reward": 0.02614082582294941,
      "reward_std": 0.010051444172859192,
      "rewards/bleu_reward_func/mean": 0.02614082582294941,
      "rewards/bleu_reward_func/std": 0.011256206780672073,
      "step": 452
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 494.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 248.375,
      "completions/mean_terminated_length": 248.375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.3624,
      "grad_norm": 3.011857032775879,
      "kl": 0.0301513671875,
      "learning_rate": 1e-06,
      "loss": 0.1245,
      "num_tokens": 6019309.0,
      "reward": 0.06672752648591995,
      "reward_std": 0.02101920172572136,
      "rewards/bleu_reward_func/mean": 0.06672752648591995,
      "rewards/bleu_reward_func/std": 0.053218722343444824,
      "step": 453
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 357.375,
      "completions/mean_terminated_length": 328.7407531738281,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "epoch": 0.3632,
      "grad_norm": 2.4943957328796387,
      "kl": 0.0428466796875,
      "learning_rate": 1e-06,
      "loss": -0.1298,
      "num_tokens": 6032993.0,
      "reward": 0.04359011352062225,
      "reward_std": 0.016948901116847992,
      "rewards/bleu_reward_func/mean": 0.04359011352062225,
      "rewards/bleu_reward_func/std": 0.025716470554471016,
      "step": 454
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 462.0,
      "completions/mean_length": 197.15625,
      "completions/mean_terminated_length": 176.1666717529297,
      "completions/min_length": 44.0,
      "completions/min_terminated_length": 44.0,
      "epoch": 0.364,
      "grad_norm": 3.7868881225585938,
      "kl": 0.051300048828125,
      "learning_rate": 1e-06,
      "loss": 0.0982,
      "num_tokens": 6041718.0,
      "reward": 0.03436025232076645,
      "reward_std": 0.00970546342432499,
      "rewards/bleu_reward_func/mean": 0.03436025232076645,
      "rewards/bleu_reward_func/std": 0.01754005253314972,
      "step": 455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 279.1875,
      "completions/mean_terminated_length": 214.0,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "epoch": 0.3648,
      "grad_norm": 3.222365617752075,
      "kl": 0.051025390625,
      "learning_rate": 1e-06,
      "loss": 0.1112,
      "num_tokens": 6054284.0,
      "reward": 0.0742957666516304,
      "reward_std": 0.015302993357181549,
      "rewards/bleu_reward_func/mean": 0.0742957666516304,
      "rewards/bleu_reward_func/std": 0.061175521463155746,
      "step": 456
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 260.78125,
      "completions/mean_terminated_length": 234.79310607910156,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.3656,
      "grad_norm": 3.3111205101013184,
      "kl": 0.04364013671875,
      "learning_rate": 1e-06,
      "loss": 0.0182,
      "num_tokens": 6064853.0,
      "reward": 0.04622993618249893,
      "reward_std": 0.0203024260699749,
      "rewards/bleu_reward_func/mean": 0.04622993618249893,
      "rewards/bleu_reward_func/std": 0.04259706288576126,
      "step": 457
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 410.0,
      "completions/mean_length": 366.5,
      "completions/mean_terminated_length": 266.9473571777344,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "epoch": 0.3664,
      "grad_norm": 2.6107733249664307,
      "kl": 0.0300750732421875,
      "learning_rate": 1e-06,
      "loss": 0.0746,
      "num_tokens": 6081629.0,
      "reward": 0.05567412078380585,
      "reward_std": 0.017643755301833153,
      "rewards/bleu_reward_func/mean": 0.05567412078380585,
      "rewards/bleu_reward_func/std": 0.029336081817746162,
      "step": 458
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 476.0,
      "completions/mean_length": 423.6875,
      "completions/mean_terminated_length": 370.70001220703125,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 54.0,
      "epoch": 0.3672,
      "grad_norm": 1.9285842180252075,
      "kl": 0.0279541015625,
      "learning_rate": 1e-06,
      "loss": -0.0551,
      "num_tokens": 6099291.0,
      "reward": 0.05514095351099968,
      "reward_std": 0.02625124529004097,
      "rewards/bleu_reward_func/mean": 0.05514095351099968,
      "rewards/bleu_reward_func/std": 0.034680720418691635,
      "step": 459
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 338.125,
      "completions/mean_terminated_length": 247.04762268066406,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.368,
      "grad_norm": 3.4013428688049316,
      "kl": 0.038787841796875,
      "learning_rate": 1e-06,
      "loss": 0.0224,
      "num_tokens": 6113607.0,
      "reward": 0.04680419713258743,
      "reward_std": 0.01677008531987667,
      "rewards/bleu_reward_func/mean": 0.04680419713258743,
      "rewards/bleu_reward_func/std": 0.02986827678978443,
      "step": 460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 419.0,
      "completions/mean_terminated_length": 382.60870361328125,
      "completions/min_length": 228.0,
      "completions/min_terminated_length": 228.0,
      "epoch": 0.3688,
      "grad_norm": 2.1877553462982178,
      "kl": 0.0406494140625,
      "learning_rate": 1e-06,
      "loss": 0.0481,
      "num_tokens": 6129815.0,
      "reward": 0.061673715710639954,
      "reward_std": 0.01531613152474165,
      "rewards/bleu_reward_func/mean": 0.061673715710639954,
      "rewards/bleu_reward_func/std": 0.04928870499134064,
      "step": 461
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 408.25,
      "completions/mean_terminated_length": 373.66668701171875,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 182.0,
      "epoch": 0.3696,
      "grad_norm": 2.0608067512512207,
      "kl": 0.028839111328125,
      "learning_rate": 1e-06,
      "loss": -0.0268,
      "num_tokens": 6145599.0,
      "reward": 0.09434099495410919,
      "reward_std": 0.017724918201565742,
      "rewards/bleu_reward_func/mean": 0.09434099495410919,
      "rewards/bleu_reward_func/std": 0.09700965881347656,
      "step": 462
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 428.0,
      "completions/mean_length": 267.96875,
      "completions/mean_terminated_length": 211.6538543701172,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.3704,
      "grad_norm": 4.569107532501221,
      "kl": 0.036895751953125,
      "learning_rate": 1e-06,
      "loss": -0.1095,
      "num_tokens": 6158022.0,
      "reward": 0.03472236171364784,
      "reward_std": 0.011135936714708805,
      "rewards/bleu_reward_func/mean": 0.03472236171364784,
      "rewards/bleu_reward_func/std": 0.019644495099782944,
      "step": 463
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 379.0,
      "completions/max_terminated_length": 379.0,
      "completions/mean_length": 171.03125,
      "completions/mean_terminated_length": 171.03125,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.3712,
      "grad_norm": 6.893502235412598,
      "kl": 0.048492431640625,
      "learning_rate": 1e-06,
      "loss": -0.075,
      "num_tokens": 6165759.0,
      "reward": 0.06419667601585388,
      "reward_std": 0.027115123346447945,
      "rewards/bleu_reward_func/mean": 0.06419667601585388,
      "rewards/bleu_reward_func/std": 0.0568375438451767,
      "step": 464
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 438.0,
      "completions/max_terminated_length": 438.0,
      "completions/mean_length": 213.875,
      "completions/mean_terminated_length": 213.875,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.372,
      "grad_norm": 3.312126874923706,
      "kl": 0.0325927734375,
      "learning_rate": 1e-06,
      "loss": 0.0366,
      "num_tokens": 6174859.0,
      "reward": 0.07219819724559784,
      "reward_std": 0.021119076758623123,
      "rewards/bleu_reward_func/mean": 0.07219819724559784,
      "rewards/bleu_reward_func/std": 0.05505922809243202,
      "step": 465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 298.75,
      "completions/mean_terminated_length": 268.2857360839844,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "epoch": 0.3728,
      "grad_norm": 2.6578316688537598,
      "kl": 0.03704833984375,
      "learning_rate": 1e-06,
      "loss": -0.0174,
      "num_tokens": 6187539.0,
      "reward": 0.054259613156318665,
      "reward_std": 0.028212059289216995,
      "rewards/bleu_reward_func/mean": 0.054259613156318665,
      "rewards/bleu_reward_func/std": 0.04007524251937866,
      "step": 466
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 426.0,
      "completions/max_terminated_length": 426.0,
      "completions/mean_length": 190.625,
      "completions/mean_terminated_length": 190.625,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.3736,
      "grad_norm": 4.327213764190674,
      "kl": 0.059539794921875,
      "learning_rate": 1e-06,
      "loss": -0.0598,
      "num_tokens": 6196935.0,
      "reward": 0.04006721451878548,
      "reward_std": 0.015936415642499924,
      "rewards/bleu_reward_func/mean": 0.04006721451878548,
      "rewards/bleu_reward_func/std": 0.0206410214304924,
      "step": 467
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 358.875,
      "completions/mean_terminated_length": 316.0,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "epoch": 0.3744,
      "grad_norm": 2.417494535446167,
      "kl": 0.03228759765625,
      "learning_rate": 1e-06,
      "loss": 0.0124,
      "num_tokens": 6210563.0,
      "reward": 0.04092847555875778,
      "reward_std": 0.019996026530861855,
      "rewards/bleu_reward_func/mean": 0.04092847555875778,
      "rewards/bleu_reward_func/std": 0.03578585386276245,
      "step": 468
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 401.75,
      "completions/mean_terminated_length": 370.8800048828125,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.3752,
      "grad_norm": 2.1943817138671875,
      "kl": 0.03948974609375,
      "learning_rate": 1e-06,
      "loss": -0.0128,
      "num_tokens": 6225899.0,
      "reward": 0.05942702293395996,
      "reward_std": 0.018910693004727364,
      "rewards/bleu_reward_func/mean": 0.05942702293395996,
      "rewards/bleu_reward_func/std": 0.03152807429432869,
      "step": 469
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 339.84375,
      "completions/mean_terminated_length": 272.478271484375,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.376,
      "grad_norm": 2.7955689430236816,
      "kl": 0.03955078125,
      "learning_rate": 1e-06,
      "loss": -0.0509,
      "num_tokens": 6239326.0,
      "reward": 0.025974374264478683,
      "reward_std": 0.011392309330403805,
      "rewards/bleu_reward_func/mean": 0.025974374264478683,
      "rewards/bleu_reward_func/std": 0.018641771748661995,
      "step": 470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 445.0,
      "completions/max_terminated_length": 445.0,
      "completions/mean_length": 217.90625,
      "completions/mean_terminated_length": 217.90625,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "epoch": 0.3768,
      "grad_norm": 4.0323405265808105,
      "kl": 0.0279541015625,
      "learning_rate": 1e-06,
      "loss": 0.0632,
      "num_tokens": 6250795.0,
      "reward": 0.09103134274482727,
      "reward_std": 0.027077559381723404,
      "rewards/bleu_reward_func/mean": 0.09103134274482727,
      "rewards/bleu_reward_func/std": 0.08849667012691498,
      "step": 471
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 393.28125,
      "completions/mean_terminated_length": 381.0,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "epoch": 0.3776,
      "grad_norm": 2.2373785972595215,
      "kl": 0.03131103515625,
      "learning_rate": 1e-06,
      "loss": -0.0229,
      "num_tokens": 6265660.0,
      "reward": 0.05581410974264145,
      "reward_std": 0.02045728638768196,
      "rewards/bleu_reward_func/mean": 0.05581410974264145,
      "rewards/bleu_reward_func/std": 0.03612606227397919,
      "step": 472
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 449.0,
      "completions/mean_length": 347.0,
      "completions/mean_terminated_length": 282.4347839355469,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "epoch": 0.3784,
      "grad_norm": 2.50510573387146,
      "kl": 0.0333251953125,
      "learning_rate": 1e-06,
      "loss": -0.1003,
      "num_tokens": 6281332.0,
      "reward": 0.09204696118831635,
      "reward_std": 0.03634490817785263,
      "rewards/bleu_reward_func/mean": 0.09204696118831635,
      "rewards/bleu_reward_func/std": 0.10213056951761246,
      "step": 473
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 398.46875,
      "completions/mean_terminated_length": 346.8636474609375,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "epoch": 0.3792,
      "grad_norm": 2.1126277446746826,
      "kl": 0.0368194580078125,
      "learning_rate": 1e-06,
      "loss": -0.0118,
      "num_tokens": 6297131.0,
      "reward": 0.07596694678068161,
      "reward_std": 0.023722348734736443,
      "rewards/bleu_reward_func/mean": 0.07596694678068161,
      "rewards/bleu_reward_func/std": 0.06731286644935608,
      "step": 474
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 296.59375,
      "completions/mean_terminated_length": 274.3103332519531,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.38,
      "grad_norm": 2.5251946449279785,
      "kl": 0.03680419921875,
      "learning_rate": 1e-06,
      "loss": 0.0783,
      "num_tokens": 6310086.0,
      "reward": 0.036592863500118256,
      "reward_std": 0.023251082748174667,
      "rewards/bleu_reward_func/mean": 0.036592863500118256,
      "rewards/bleu_reward_func/std": 0.03332400694489479,
      "step": 475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 488.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 257.59375,
      "completions/mean_terminated_length": 257.59375,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "epoch": 0.3808,
      "grad_norm": 3.1550424098968506,
      "kl": 0.0521240234375,
      "learning_rate": 1e-06,
      "loss": -0.0125,
      "num_tokens": 6320209.0,
      "reward": 0.03460177034139633,
      "reward_std": 0.013276169076561928,
      "rewards/bleu_reward_func/mean": 0.03460177034139633,
      "rewards/bleu_reward_func/std": 0.021203402429819107,
      "step": 476
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 355.25,
      "completions/mean_terminated_length": 326.22222900390625,
      "completions/min_length": 195.0,
      "completions/min_terminated_length": 195.0,
      "epoch": 0.3816,
      "grad_norm": 2.3512442111968994,
      "kl": 0.033203125,
      "learning_rate": 1e-06,
      "loss": -0.0415,
      "num_tokens": 6334825.0,
      "reward": 0.03142130374908447,
      "reward_std": 0.008906159549951553,
      "rewards/bleu_reward_func/mean": 0.03142130374908447,
      "rewards/bleu_reward_func/std": 0.013355448842048645,
      "step": 477
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 278.0,
      "completions/mean_terminated_length": 234.6666717529297,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.3824,
      "grad_norm": 3.489647150039673,
      "kl": 0.04071044921875,
      "learning_rate": 1e-06,
      "loss": -0.1996,
      "num_tokens": 6346761.0,
      "reward": 0.05901729688048363,
      "reward_std": 0.03193315863609314,
      "rewards/bleu_reward_func/mean": 0.05901729688048363,
      "rewards/bleu_reward_func/std": 0.06475787609815598,
      "step": 478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 458.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 263.34375,
      "completions/mean_terminated_length": 263.34375,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "epoch": 0.3832,
      "grad_norm": 2.6905431747436523,
      "kl": 0.04022216796875,
      "learning_rate": 1e-06,
      "loss": -0.0777,
      "num_tokens": 6357628.0,
      "reward": 0.029398879036307335,
      "reward_std": 0.014063382521271706,
      "rewards/bleu_reward_func/mean": 0.029398879036307335,
      "rewards/bleu_reward_func/std": 0.01639050990343094,
      "step": 479
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 405.0,
      "completions/mean_length": 265.375,
      "completions/mean_terminated_length": 257.4193420410156,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.384,
      "grad_norm": 3.13386607170105,
      "kl": 0.046142578125,
      "learning_rate": 1e-06,
      "loss": 0.0544,
      "num_tokens": 6367960.0,
      "reward": 0.03670423477888107,
      "reward_std": 0.009904170408844948,
      "rewards/bleu_reward_func/mean": 0.03670423477888107,
      "rewards/bleu_reward_func/std": 0.026974406093358994,
      "step": 480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 392.53125,
      "completions/mean_terminated_length": 193.4166717529297,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.3848,
      "grad_norm": 3.0782554149627686,
      "kl": 0.059326171875,
      "learning_rate": 1e-06,
      "loss": -0.1164,
      "num_tokens": 6384225.0,
      "reward": 0.06590355932712555,
      "reward_std": 0.018399305641651154,
      "rewards/bleu_reward_func/mean": 0.06590355932712555,
      "rewards/bleu_reward_func/std": 0.03893038630485535,
      "step": 481
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 349.0,
      "completions/mean_length": 328.4375,
      "completions/mean_terminated_length": 144.875,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.3856,
      "grad_norm": 3.397596836090088,
      "kl": 0.03509521484375,
      "learning_rate": 1e-06,
      "loss": -0.0348,
      "num_tokens": 6398527.0,
      "reward": 0.1248546689748764,
      "reward_std": 0.06577208638191223,
      "rewards/bleu_reward_func/mean": 0.1248546689748764,
      "rewards/bleu_reward_func/std": 0.17896804213523865,
      "step": 482
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 248.8125,
      "completions/mean_terminated_length": 200.07408142089844,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.3864,
      "grad_norm": 5.109568119049072,
      "kl": 0.071258544921875,
      "learning_rate": 1e-06,
      "loss": 0.048,
      "num_tokens": 6408897.0,
      "reward": 0.04945829138159752,
      "reward_std": 0.028692957013845444,
      "rewards/bleu_reward_func/mean": 0.04945829138159752,
      "rewards/bleu_reward_func/std": 0.049232520163059235,
      "step": 483
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 362.0,
      "completions/mean_length": 198.5,
      "completions/mean_terminated_length": 188.3870849609375,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.3872,
      "grad_norm": 3.791825532913208,
      "kl": 0.04644775390625,
      "learning_rate": 1e-06,
      "loss": 0.0976,
      "num_tokens": 6417281.0,
      "reward": 0.042633987963199615,
      "reward_std": 0.012612289749085903,
      "rewards/bleu_reward_func/mean": 0.042633987963199615,
      "rewards/bleu_reward_func/std": 0.01618749275803566,
      "step": 484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 481.375,
      "completions/mean_terminated_length": 422.90911865234375,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "epoch": 0.388,
      "grad_norm": 2.041027069091797,
      "kl": 0.0343017578125,
      "learning_rate": 1e-06,
      "loss": 0.0043,
      "num_tokens": 6438757.0,
      "reward": 0.07603560388088226,
      "reward_std": 0.02007678709924221,
      "rewards/bleu_reward_func/mean": 0.07603560388088226,
      "rewards/bleu_reward_func/std": 0.03578682988882065,
      "step": 485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 288.53125,
      "completions/mean_terminated_length": 247.1481475830078,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.3888,
      "grad_norm": 2.764733076095581,
      "kl": 0.0421142578125,
      "learning_rate": 1e-06,
      "loss": -0.0224,
      "num_tokens": 6450254.0,
      "reward": 0.04120934009552002,
      "reward_std": 0.020221907645463943,
      "rewards/bleu_reward_func/mean": 0.04120934009552002,
      "rewards/bleu_reward_func/std": 0.039144545793533325,
      "step": 486
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 282.8125,
      "completions/mean_terminated_length": 218.63999938964844,
      "completions/min_length": 85.0,
      "completions/min_terminated_length": 85.0,
      "epoch": 0.3896,
      "grad_norm": 3.3871254920959473,
      "kl": 0.041168212890625,
      "learning_rate": 1e-06,
      "loss": 0.0849,
      "num_tokens": 6461760.0,
      "reward": 0.03640275448560715,
      "reward_std": 0.011773956939578056,
      "rewards/bleu_reward_func/mean": 0.03640275448560715,
      "rewards/bleu_reward_func/std": 0.016311539337038994,
      "step": 487
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 434.0625,
      "completions/mean_terminated_length": 356.125,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "epoch": 0.3904,
      "grad_norm": 2.419614553451538,
      "kl": 0.037078857421875,
      "learning_rate": 1e-06,
      "loss": 0.0212,
      "num_tokens": 6479426.0,
      "reward": 0.07108810544013977,
      "reward_std": 0.01458063255995512,
      "rewards/bleu_reward_func/mean": 0.07108810544013977,
      "rewards/bleu_reward_func/std": 0.04704602435231209,
      "step": 488
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 333.0,
      "completions/max_terminated_length": 333.0,
      "completions/mean_length": 158.0625,
      "completions/mean_terminated_length": 158.0625,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.3912,
      "grad_norm": 4.360542297363281,
      "kl": 0.085205078125,
      "learning_rate": 1e-06,
      "loss": 0.1004,
      "num_tokens": 6486812.0,
      "reward": 0.08444488793611526,
      "reward_std": 0.028125371783971786,
      "rewards/bleu_reward_func/mean": 0.08444488793611526,
      "rewards/bleu_reward_func/std": 0.06024865806102753,
      "step": 489
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 484.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 228.125,
      "completions/mean_terminated_length": 228.125,
      "completions/min_length": 85.0,
      "completions/min_terminated_length": 85.0,
      "epoch": 0.392,
      "grad_norm": 3.349405288696289,
      "kl": 0.049224853515625,
      "learning_rate": 1e-06,
      "loss": 0.0236,
      "num_tokens": 6496928.0,
      "reward": 0.0794411301612854,
      "reward_std": 0.015294745564460754,
      "rewards/bleu_reward_func/mean": 0.0794411301612854,
      "rewards/bleu_reward_func/std": 0.071323461830616,
      "step": 490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 411.0,
      "completions/mean_length": 391.34375,
      "completions/mean_terminated_length": 236.21429443359375,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.3928,
      "grad_norm": 2.2550203800201416,
      "kl": 0.03802490234375,
      "learning_rate": 1e-06,
      "loss": -0.0734,
      "num_tokens": 6517467.0,
      "reward": 0.10378064960241318,
      "reward_std": 0.05355631560087204,
      "rewards/bleu_reward_func/mean": 0.10378064960241318,
      "rewards/bleu_reward_func/std": 0.08197584748268127,
      "step": 491
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 316.25,
      "completions/mean_terminated_length": 261.44000244140625,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.3936,
      "grad_norm": 2.498426914215088,
      "kl": 0.03643798828125,
      "learning_rate": 1e-06,
      "loss": -0.0782,
      "num_tokens": 6531899.0,
      "reward": 0.038530509918928146,
      "reward_std": 0.029034338891506195,
      "rewards/bleu_reward_func/mean": 0.038530509918928146,
      "rewards/bleu_reward_func/std": 0.04517725482583046,
      "step": 492
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 346.0,
      "completions/mean_terminated_length": 216.88888549804688,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.3944,
      "grad_norm": 3.5450828075408936,
      "kl": 0.05267333984375,
      "learning_rate": 1e-06,
      "loss": 0.0445,
      "num_tokens": 6546123.0,
      "reward": 0.03807983547449112,
      "reward_std": 0.012768654152750969,
      "rewards/bleu_reward_func/mean": 0.03807983547449112,
      "rewards/bleu_reward_func/std": 0.019427087157964706,
      "step": 493
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 478.0,
      "completions/mean_length": 164.375,
      "completions/mean_terminated_length": 128.41378784179688,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.3952,
      "grad_norm": 7.575675010681152,
      "kl": 0.067230224609375,
      "learning_rate": 1e-06,
      "loss": 0.0558,
      "num_tokens": 6553823.0,
      "reward": 0.036415085196495056,
      "reward_std": 0.015921277925372124,
      "rewards/bleu_reward_func/mean": 0.036415085196495056,
      "rewards/bleu_reward_func/std": 0.03126350790262222,
      "step": 494
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 306.53125,
      "completions/mean_terminated_length": 259.1153869628906,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "epoch": 0.396,
      "grad_norm": 2.6021785736083984,
      "kl": 0.034881591796875,
      "learning_rate": 1e-06,
      "loss": 0.0336,
      "num_tokens": 6570000.0,
      "reward": 0.038223300129175186,
      "reward_std": 0.014713780023157597,
      "rewards/bleu_reward_func/mean": 0.038223300129175186,
      "rewards/bleu_reward_func/std": 0.01743900217115879,
      "step": 495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 362.25,
      "completions/mean_terminated_length": 283.8095397949219,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "epoch": 0.3968,
      "grad_norm": 2.632026433944702,
      "kl": 0.0277099609375,
      "learning_rate": 1e-06,
      "loss": 0.0039,
      "num_tokens": 6586440.0,
      "reward": 0.05412636697292328,
      "reward_std": 0.02242736518383026,
      "rewards/bleu_reward_func/mean": 0.05412636697292328,
      "rewards/bleu_reward_func/std": 0.03469071537256241,
      "step": 496
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 381.9375,
      "completions/mean_terminated_length": 338.5833435058594,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "epoch": 0.3976,
      "grad_norm": 2.3472609519958496,
      "kl": 0.031829833984375,
      "learning_rate": 1e-06,
      "loss": -0.0066,
      "num_tokens": 6601902.0,
      "reward": 0.05178507789969444,
      "reward_std": 0.015618492849171162,
      "rewards/bleu_reward_func/mean": 0.05178507789969444,
      "rewards/bleu_reward_func/std": 0.020302964374423027,
      "step": 497
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 328.28125,
      "completions/mean_terminated_length": 256.39129638671875,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "epoch": 0.3984,
      "grad_norm": 3.0667872428894043,
      "kl": 0.04681396484375,
      "learning_rate": 1e-06,
      "loss": 0.0113,
      "num_tokens": 6614759.0,
      "reward": 0.02746613696217537,
      "reward_std": 0.006646636873483658,
      "rewards/bleu_reward_func/mean": 0.02746613696217537,
      "rewards/bleu_reward_func/std": 0.016086198389530182,
      "step": 498
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 467.0,
      "completions/mean_length": 188.5,
      "completions/mean_terminated_length": 155.03448486328125,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.3992,
      "grad_norm": 5.149783134460449,
      "kl": 0.07086181640625,
      "learning_rate": 1e-06,
      "loss": 0.0702,
      "num_tokens": 6622999.0,
      "reward": 0.06750474870204926,
      "reward_std": 0.014433549717068672,
      "rewards/bleu_reward_func/mean": 0.06750474870204926,
      "rewards/bleu_reward_func/std": 0.05492382496595383,
      "step": 499
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 432.0,
      "completions/mean_length": 385.0625,
      "completions/mean_terminated_length": 273.058837890625,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "epoch": 0.4,
      "grad_norm": 2.2382819652557373,
      "kl": 0.035736083984375,
      "learning_rate": 1e-06,
      "loss": -0.0591,
      "num_tokens": 6637913.0,
      "reward": 0.07065648585557938,
      "reward_std": 0.03134492412209511,
      "rewards/bleu_reward_func/mean": 0.07065648585557938,
      "rewards/bleu_reward_func/std": 0.05255349352955818,
      "step": 500
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 386.0625,
      "completions/mean_terminated_length": 320.0952453613281,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "epoch": 0.4008,
      "grad_norm": 2.6071560382843018,
      "kl": 0.02655029296875,
      "learning_rate": 1e-06,
      "loss": 0.0223,
      "num_tokens": 6653755.0,
      "reward": 0.03733908385038376,
      "reward_std": 0.021098248660564423,
      "rewards/bleu_reward_func/mean": 0.03733908385038376,
      "rewards/bleu_reward_func/std": 0.03734488785266876,
      "step": 501
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 267.09375,
      "completions/mean_terminated_length": 198.51998901367188,
      "completions/min_length": 7.0,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.4016,
      "grad_norm": 4.854734420776367,
      "kl": 0.04248046875,
      "learning_rate": 1e-06,
      "loss": -0.0771,
      "num_tokens": 6665286.0,
      "reward": 0.21961082518100739,
      "reward_std": 0.09568939357995987,
      "rewards/bleu_reward_func/mean": 0.21961082518100739,
      "rewards/bleu_reward_func/std": 0.3429633677005768,
      "step": 502
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 275.46875,
      "completions/mean_terminated_length": 267.8387145996094,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.4024,
      "grad_norm": 3.6082074642181396,
      "kl": 0.033843994140625,
      "learning_rate": 1e-06,
      "loss": -0.04,
      "num_tokens": 6676325.0,
      "reward": 0.0696449875831604,
      "reward_std": 0.045844756066799164,
      "rewards/bleu_reward_func/mean": 0.0696449875831604,
      "rewards/bleu_reward_func/std": 0.051705408841371536,
      "step": 503
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 325.0,
      "completions/mean_terminated_length": 240.0,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.4032,
      "grad_norm": 4.330098628997803,
      "kl": 0.04473876953125,
      "learning_rate": 1e-06,
      "loss": -0.1325,
      "num_tokens": 6689805.0,
      "reward": 0.03944069519639015,
      "reward_std": 0.01668594963848591,
      "rewards/bleu_reward_func/mean": 0.03944069519639015,
      "rewards/bleu_reward_func/std": 0.017823999747633934,
      "step": 504
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 423.0,
      "completions/mean_length": 250.4375,
      "completions/mean_terminated_length": 163.25,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "epoch": 0.404,
      "grad_norm": 3.294508218765259,
      "kl": 0.05078125,
      "learning_rate": 1e-06,
      "loss": 0.1719,
      "num_tokens": 6700619.0,
      "reward": 0.0427066832780838,
      "reward_std": 0.018565086647868156,
      "rewards/bleu_reward_func/mean": 0.0427066832780838,
      "rewards/bleu_reward_func/std": 0.02504456229507923,
      "step": 505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 429.0,
      "completions/mean_length": 367.6875,
      "completions/mean_terminated_length": 292.0952453613281,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "epoch": 0.4048,
      "grad_norm": 2.24147629737854,
      "kl": 0.034454345703125,
      "learning_rate": 1e-06,
      "loss": 0.1088,
      "num_tokens": 6717257.0,
      "reward": 0.034352123737335205,
      "reward_std": 0.007905229926109314,
      "rewards/bleu_reward_func/mean": 0.034352123737335205,
      "rewards/bleu_reward_func/std": 0.028700148686766624,
      "step": 506
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 417.5,
      "completions/mean_terminated_length": 360.8000183105469,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "epoch": 0.4056,
      "grad_norm": 2.1727077960968018,
      "kl": 0.032928466796875,
      "learning_rate": 1e-06,
      "loss": -0.0452,
      "num_tokens": 6733721.0,
      "reward": 0.04472000151872635,
      "reward_std": 0.012819021940231323,
      "rewards/bleu_reward_func/mean": 0.04472000151872635,
      "rewards/bleu_reward_func/std": 0.020674971863627434,
      "step": 507
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 348.46875,
      "completions/mean_terminated_length": 262.8095397949219,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.4064,
      "grad_norm": 7.316678524017334,
      "kl": 0.035491943359375,
      "learning_rate": 1e-06,
      "loss": -0.0005,
      "num_tokens": 6749720.0,
      "reward": 0.06846344470977783,
      "reward_std": 0.022212965413928032,
      "rewards/bleu_reward_func/mean": 0.06846344470977783,
      "rewards/bleu_reward_func/std": 0.07043828815221786,
      "step": 508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 492.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 237.84375,
      "completions/mean_terminated_length": 237.84375,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.4072,
      "grad_norm": 3.6807727813720703,
      "kl": 0.03271484375,
      "learning_rate": 1e-06,
      "loss": -0.0019,
      "num_tokens": 6759187.0,
      "reward": 0.044801339507102966,
      "reward_std": 0.01820746809244156,
      "rewards/bleu_reward_func/mean": 0.044801339507102966,
      "rewards/bleu_reward_func/std": 0.02651328034698963,
      "step": 509
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 306.0,
      "completions/mean_length": 235.0625,
      "completions/mean_terminated_length": 142.75,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.408,
      "grad_norm": 3.6571717262268066,
      "kl": 0.0440673828125,
      "learning_rate": 1e-06,
      "loss": -0.0074,
      "num_tokens": 6770805.0,
      "reward": 0.043380383402109146,
      "reward_std": 0.025585712864995003,
      "rewards/bleu_reward_func/mean": 0.043380383402109146,
      "rewards/bleu_reward_func/std": 0.04197373613715172,
      "step": 510
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 398.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 164.65625,
      "completions/mean_terminated_length": 164.65625,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.4088,
      "grad_norm": 4.0850830078125,
      "kl": 0.0361785888671875,
      "learning_rate": 1e-06,
      "loss": -0.2266,
      "num_tokens": 6778194.0,
      "reward": 0.1240055114030838,
      "reward_std": 0.038006868213415146,
      "rewards/bleu_reward_func/mean": 0.1240055114030838,
      "rewards/bleu_reward_func/std": 0.1415473371744156,
      "step": 511
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 360.71875,
      "completions/mean_terminated_length": 318.3599853515625,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "epoch": 0.4096,
      "grad_norm": 2.39262056350708,
      "kl": 0.03729248046875,
      "learning_rate": 1e-06,
      "loss": 0.1221,
      "num_tokens": 6792945.0,
      "reward": 0.026915479451417923,
      "reward_std": 0.008309369906783104,
      "rewards/bleu_reward_func/mean": 0.026915479451417923,
      "rewards/bleu_reward_func/std": 0.01223670318722725,
      "step": 512
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 475.0,
      "completions/mean_length": 424.9375,
      "completions/mean_terminated_length": 326.2666931152344,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 54.0,
      "epoch": 0.4104,
      "grad_norm": 2.1014010906219482,
      "kl": 0.03375244140625,
      "learning_rate": 1e-06,
      "loss": -0.0362,
      "num_tokens": 6809839.0,
      "reward": 0.038692403584718704,
      "reward_std": 0.012931729666888714,
      "rewards/bleu_reward_func/mean": 0.038692403584718704,
      "rewards/bleu_reward_func/std": 0.03173365071415901,
      "step": 513
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 423.09375,
      "completions/mean_terminated_length": 388.3043518066406,
      "completions/min_length": 251.0,
      "completions/min_terminated_length": 251.0,
      "epoch": 0.4112,
      "grad_norm": 2.0441877841949463,
      "kl": 0.03924560546875,
      "learning_rate": 1e-06,
      "loss": 0.0653,
      "num_tokens": 6826258.0,
      "reward": 0.033866725862026215,
      "reward_std": 0.004912001546472311,
      "rewards/bleu_reward_func/mean": 0.033866725862026215,
      "rewards/bleu_reward_func/std": 0.03187695890665054,
      "step": 514
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 265.875,
      "completions/mean_terminated_length": 196.95999145507812,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.412,
      "grad_norm": 4.132297039031982,
      "kl": 0.05572509765625,
      "learning_rate": 1e-06,
      "loss": 0.0553,
      "num_tokens": 6838686.0,
      "reward": 0.038207922130823135,
      "reward_std": 0.007710086181759834,
      "rewards/bleu_reward_func/mean": 0.038207922130823135,
      "rewards/bleu_reward_func/std": 0.01941581815481186,
      "step": 515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 369.375,
      "completions/mean_terminated_length": 321.8333435058594,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 111.0,
      "epoch": 0.4128,
      "grad_norm": 2.4867780208587646,
      "kl": 0.033538818359375,
      "learning_rate": 1e-06,
      "loss": -0.0815,
      "num_tokens": 6853018.0,
      "reward": 0.041043445467948914,
      "reward_std": 0.015324940904974937,
      "rewards/bleu_reward_func/mean": 0.041043445467948914,
      "rewards/bleu_reward_func/std": 0.028135672211647034,
      "step": 516
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 446.0,
      "completions/max_terminated_length": 446.0,
      "completions/mean_length": 158.375,
      "completions/mean_terminated_length": 158.375,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.4136,
      "grad_norm": 5.145825386047363,
      "kl": 0.0673828125,
      "learning_rate": 1e-06,
      "loss": -0.0586,
      "num_tokens": 6862070.0,
      "reward": 0.049210622906684875,
      "reward_std": 0.017335664480924606,
      "rewards/bleu_reward_func/mean": 0.049210622906684875,
      "rewards/bleu_reward_func/std": 0.02346484549343586,
      "step": 517
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 397.34375,
      "completions/mean_terminated_length": 296.1764831542969,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "epoch": 0.4144,
      "grad_norm": 3.008666515350342,
      "kl": 0.0364837646484375,
      "learning_rate": 1e-06,
      "loss": -0.1012,
      "num_tokens": 6880193.0,
      "reward": 0.038832880556583405,
      "reward_std": 0.012767771258950233,
      "rewards/bleu_reward_func/mean": 0.038832880556583405,
      "rewards/bleu_reward_func/std": 0.03091544844210148,
      "step": 518
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 439.0,
      "completions/mean_length": 260.6875,
      "completions/mean_terminated_length": 190.3199920654297,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.4152,
      "grad_norm": 2.8610570430755615,
      "kl": 0.051849365234375,
      "learning_rate": 1e-06,
      "loss": -0.021,
      "num_tokens": 6891911.0,
      "reward": 0.03679898753762245,
      "reward_std": 0.017421672120690346,
      "rewards/bleu_reward_func/mean": 0.03679898753762245,
      "rewards/bleu_reward_func/std": 0.030264802277088165,
      "step": 519
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 448.09375,
      "completions/mean_terminated_length": 256.375,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.416,
      "grad_norm": 2.3569674491882324,
      "kl": 0.040191650390625,
      "learning_rate": 1e-06,
      "loss": 0.16,
      "num_tokens": 6909442.0,
      "reward": 0.04537253454327583,
      "reward_std": 0.026473576202988625,
      "rewards/bleu_reward_func/mean": 0.04537253454327583,
      "rewards/bleu_reward_func/std": 0.03027988225221634,
      "step": 520
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 399.28125,
      "completions/mean_terminated_length": 299.8235168457031,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "epoch": 0.4168,
      "grad_norm": 2.596334934234619,
      "kl": 0.047119140625,
      "learning_rate": 1e-06,
      "loss": -0.0005,
      "num_tokens": 6925683.0,
      "reward": 0.04248907417058945,
      "reward_std": 0.008885795250535011,
      "rewards/bleu_reward_func/mean": 0.04248907417058945,
      "rewards/bleu_reward_func/std": 0.027150554582476616,
      "step": 521
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 396.0,
      "completions/mean_length": 224.5,
      "completions/mean_terminated_length": 215.22579956054688,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.4176,
      "grad_norm": 4.284977912902832,
      "kl": 0.056060791015625,
      "learning_rate": 1e-06,
      "loss": 0.0544,
      "num_tokens": 6934955.0,
      "reward": 0.031697362661361694,
      "reward_std": 0.01221911795437336,
      "rewards/bleu_reward_func/mean": 0.031697362661361694,
      "rewards/bleu_reward_func/std": 0.014145019464194775,
      "step": 522
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 260.1875,
      "completions/mean_terminated_length": 243.40000915527344,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "epoch": 0.4184,
      "grad_norm": 2.888882637023926,
      "kl": 0.05804443359375,
      "learning_rate": 1e-06,
      "loss": -0.0438,
      "num_tokens": 6945593.0,
      "reward": 0.041664689779281616,
      "reward_std": 0.013896044343709946,
      "rewards/bleu_reward_func/mean": 0.041664689779281616,
      "rewards/bleu_reward_func/std": 0.024034207686781883,
      "step": 523
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 349.59375,
      "completions/mean_terminated_length": 286.0434875488281,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "epoch": 0.4192,
      "grad_norm": 3.398181676864624,
      "kl": 0.03240966796875,
      "learning_rate": 1e-06,
      "loss": -0.1082,
      "num_tokens": 6959404.0,
      "reward": 0.07310491800308228,
      "reward_std": 0.0231708325445652,
      "rewards/bleu_reward_func/mean": 0.07310491800308228,
      "rewards/bleu_reward_func/std": 0.04921337589621544,
      "step": 524
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 317.5,
      "completions/mean_terminated_length": 297.3793029785156,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.42,
      "grad_norm": 3.1394002437591553,
      "kl": 0.049713134765625,
      "learning_rate": 1e-06,
      "loss": -0.0133,
      "num_tokens": 6971580.0,
      "reward": 0.07620484381914139,
      "reward_std": 0.029079508036375046,
      "rewards/bleu_reward_func/mean": 0.07620484381914139,
      "rewards/bleu_reward_func/std": 0.055587053298950195,
      "step": 525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 297.6875,
      "completions/mean_terminated_length": 226.25,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.4208,
      "grad_norm": 15.365631103515625,
      "kl": 0.191741943359375,
      "learning_rate": 1e-06,
      "loss": -0.006,
      "num_tokens": 6985578.0,
      "reward": 0.0780831128358841,
      "reward_std": 0.018097946420311928,
      "rewards/bleu_reward_func/mean": 0.0780831128358841,
      "rewards/bleu_reward_func/std": 0.09850489348173141,
      "step": 526
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 369.125,
      "completions/mean_terminated_length": 348.71429443359375,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.4216,
      "grad_norm": 2.0799665451049805,
      "kl": 0.03955078125,
      "learning_rate": 1e-06,
      "loss": 0.0467,
      "num_tokens": 7000926.0,
      "reward": 0.05712277069687843,
      "reward_std": 0.02389085479080677,
      "rewards/bleu_reward_func/mean": 0.05712277069687843,
      "rewards/bleu_reward_func/std": 0.04602767527103424,
      "step": 527
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 278.0,
      "completions/mean_length": 234.40625,
      "completions/mean_terminated_length": 141.875,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.4224,
      "grad_norm": 3.6912519931793213,
      "kl": 0.0577392578125,
      "learning_rate": 1e-06,
      "loss": -0.0821,
      "num_tokens": 7011011.0,
      "reward": 0.02966947853565216,
      "reward_std": 0.009855142794549465,
      "rewards/bleu_reward_func/mean": 0.02966947853565216,
      "rewards/bleu_reward_func/std": 0.012489824555814266,
      "step": 528
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 345.40625,
      "completions/mean_terminated_length": 334.3000183105469,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "epoch": 0.4232,
      "grad_norm": 2.465789794921875,
      "kl": 0.032958984375,
      "learning_rate": 1e-06,
      "loss": 0.1264,
      "num_tokens": 7026168.0,
      "reward": 0.10139614343643188,
      "reward_std": 0.04301796853542328,
      "rewards/bleu_reward_func/mean": 0.10139614343643188,
      "rewards/bleu_reward_func/std": 0.12598717212677002,
      "step": 529
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 243.78125,
      "completions/mean_terminated_length": 205.46429443359375,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "epoch": 0.424,
      "grad_norm": 4.074453830718994,
      "kl": 0.038360595703125,
      "learning_rate": 1e-06,
      "loss": 0.004,
      "num_tokens": 7036849.0,
      "reward": 0.07757672667503357,
      "reward_std": 0.02031567506492138,
      "rewards/bleu_reward_func/mean": 0.07757672667503357,
      "rewards/bleu_reward_func/std": 0.06997023522853851,
      "step": 530
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 218.25,
      "completions/mean_terminated_length": 187.86207580566406,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.4248,
      "grad_norm": 3.842729091644287,
      "kl": 0.041717529296875,
      "learning_rate": 1e-06,
      "loss": -0.0248,
      "num_tokens": 7046225.0,
      "reward": 0.03865049406886101,
      "reward_std": 0.01612503081560135,
      "rewards/bleu_reward_func/mean": 0.03865049406886101,
      "rewards/bleu_reward_func/std": 0.02242407016456127,
      "step": 531
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 461.375,
      "completions/mean_terminated_length": 387.3846435546875,
      "completions/min_length": 244.0,
      "completions/min_terminated_length": 244.0,
      "epoch": 0.4256,
      "grad_norm": 2.2066099643707275,
      "kl": 0.036346435546875,
      "learning_rate": 1e-06,
      "loss": 0.0395,
      "num_tokens": 7065165.0,
      "reward": 0.036675065755844116,
      "reward_std": 0.009393100626766682,
      "rewards/bleu_reward_func/mean": 0.036675065755844116,
      "rewards/bleu_reward_func/std": 0.021476779133081436,
      "step": 532
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 325.5625,
      "completions/mean_terminated_length": 263.41668701171875,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 73.0,
      "epoch": 0.4264,
      "grad_norm": 3.2754671573638916,
      "kl": 0.045806884765625,
      "learning_rate": 1e-06,
      "loss": -0.0558,
      "num_tokens": 7078359.0,
      "reward": 0.06816762685775757,
      "reward_std": 0.016703680157661438,
      "rewards/bleu_reward_func/mean": 0.06816762685775757,
      "rewards/bleu_reward_func/std": 0.04132222384214401,
      "step": 533
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 383.0625,
      "completions/mean_terminated_length": 294.84210205078125,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "epoch": 0.4272,
      "grad_norm": 2.0327775478363037,
      "kl": 0.0302734375,
      "learning_rate": 1e-06,
      "loss": -0.0255,
      "num_tokens": 7092673.0,
      "reward": 0.10351169109344482,
      "reward_std": 0.02997823804616928,
      "rewards/bleu_reward_func/mean": 0.10351169109344482,
      "rewards/bleu_reward_func/std": 0.1509554237127304,
      "step": 534
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 368.0,
      "completions/mean_length": 302.125,
      "completions/mean_terminated_length": 232.1666717529297,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.428,
      "grad_norm": 3.195488929748535,
      "kl": 0.041168212890625,
      "learning_rate": 1e-06,
      "loss": -0.1486,
      "num_tokens": 7104781.0,
      "reward": 0.04248940944671631,
      "reward_std": 0.01792888715863228,
      "rewards/bleu_reward_func/mean": 0.04248940944671631,
      "rewards/bleu_reward_func/std": 0.02808193489909172,
      "step": 535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 461.0,
      "completions/mean_length": 341.84375,
      "completions/mean_terminated_length": 275.2608642578125,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.4288,
      "grad_norm": 2.6887435913085938,
      "kl": 0.03656005859375,
      "learning_rate": 1e-06,
      "loss": -0.1183,
      "num_tokens": 7118416.0,
      "reward": 0.07263948023319244,
      "reward_std": 0.02492811344563961,
      "rewards/bleu_reward_func/mean": 0.07263948023319244,
      "rewards/bleu_reward_func/std": 0.089384526014328,
      "step": 536
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 420.0,
      "completions/mean_length": 388.6875,
      "completions/mean_terminated_length": 265.375,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "epoch": 0.4296,
      "grad_norm": 2.2471821308135986,
      "kl": 0.033721923828125,
      "learning_rate": 1e-06,
      "loss": 0.0593,
      "num_tokens": 7133742.0,
      "reward": 0.01961388997733593,
      "reward_std": 0.005338278133422136,
      "rewards/bleu_reward_func/mean": 0.01961388997733593,
      "rewards/bleu_reward_func/std": 0.009376008063554764,
      "step": 537
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 438.0,
      "completions/mean_length": 268.25,
      "completions/mean_terminated_length": 223.11111450195312,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.4304,
      "grad_norm": 4.410032749176025,
      "kl": 0.03692626953125,
      "learning_rate": 1e-06,
      "loss": -0.0016,
      "num_tokens": 7149422.0,
      "reward": 0.05439123511314392,
      "reward_std": 0.02193494513630867,
      "rewards/bleu_reward_func/mean": 0.05439123511314392,
      "rewards/bleu_reward_func/std": 0.05751095712184906,
      "step": 538
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 237.0,
      "completions/mean_length": 314.5,
      "completions/mean_terminated_length": 117.0,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.4312,
      "grad_norm": 3.9528393745422363,
      "kl": 0.04327392578125,
      "learning_rate": 1e-06,
      "loss": 0.0183,
      "num_tokens": 7165182.0,
      "reward": 0.11979202926158905,
      "reward_std": 0.029252737760543823,
      "rewards/bleu_reward_func/mean": 0.11979202926158905,
      "rewards/bleu_reward_func/std": 0.05838814005255699,
      "step": 539
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 348.46875,
      "completions/mean_terminated_length": 310.73077392578125,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.432,
      "grad_norm": 2.450528383255005,
      "kl": 0.037353515625,
      "learning_rate": 1e-06,
      "loss": -0.0731,
      "num_tokens": 7178621.0,
      "reward": 0.07015785574913025,
      "reward_std": 0.013684559613466263,
      "rewards/bleu_reward_func/mean": 0.07015785574913025,
      "rewards/bleu_reward_func/std": 0.08590352535247803,
      "step": 540
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 248.15625,
      "completions/mean_terminated_length": 199.29629516601562,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.4328,
      "grad_norm": 4.424651145935059,
      "kl": 0.05096435546875,
      "learning_rate": 1e-06,
      "loss": -0.1833,
      "num_tokens": 7189602.0,
      "reward": 0.05476554483175278,
      "reward_std": 0.02339433878660202,
      "rewards/bleu_reward_func/mean": 0.05476554483175278,
      "rewards/bleu_reward_func/std": 0.03689594194293022,
      "step": 541
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 439.0,
      "completions/max_terminated_length": 439.0,
      "completions/mean_length": 225.46875,
      "completions/mean_terminated_length": 225.46875,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.4336,
      "grad_norm": 4.021416664123535,
      "kl": 0.05731201171875,
      "learning_rate": 1e-06,
      "loss": -0.0843,
      "num_tokens": 7199641.0,
      "reward": 0.057144373655319214,
      "reward_std": 0.01742716133594513,
      "rewards/bleu_reward_func/mean": 0.057144373655319214,
      "rewards/bleu_reward_func/std": 0.041288405656814575,
      "step": 542
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 400.3125,
      "completions/mean_terminated_length": 301.76470947265625,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "epoch": 0.4344,
      "grad_norm": 2.6051321029663086,
      "kl": 0.033721923828125,
      "learning_rate": 1e-06,
      "loss": -0.054,
      "num_tokens": 7214963.0,
      "reward": 0.045022256672382355,
      "reward_std": 0.013825424946844578,
      "rewards/bleu_reward_func/mean": 0.045022256672382355,
      "rewards/bleu_reward_func/std": 0.021449485793709755,
      "step": 543
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 245.75,
      "completions/mean_terminated_length": 171.1999969482422,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.4352,
      "grad_norm": 5.376564979553223,
      "kl": 0.0401611328125,
      "learning_rate": 1e-06,
      "loss": 0.0519,
      "num_tokens": 7225603.0,
      "reward": 0.25259390473365784,
      "reward_std": 0.10001413524150848,
      "rewards/bleu_reward_func/mean": 0.25259390473365784,
      "rewards/bleu_reward_func/std": 0.2557314336299896,
      "step": 544
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 380.8125,
      "completions/mean_terminated_length": 278.77777099609375,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.436,
      "grad_norm": 3.3095123767852783,
      "kl": 0.03887939453125,
      "learning_rate": 1e-06,
      "loss": -0.0544,
      "num_tokens": 7241525.0,
      "reward": 0.05672682821750641,
      "reward_std": 0.021520383656024933,
      "rewards/bleu_reward_func/mean": 0.05672682821750641,
      "rewards/bleu_reward_func/std": 0.04318132996559143,
      "step": 545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 356.0,
      "completions/max_terminated_length": 356.0,
      "completions/mean_length": 145.125,
      "completions/mean_terminated_length": 145.125,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.4368,
      "grad_norm": 4.073734760284424,
      "kl": 0.0557861328125,
      "learning_rate": 1e-06,
      "loss": -0.1033,
      "num_tokens": 7255145.0,
      "reward": 0.09350405633449554,
      "reward_std": 0.03147149458527565,
      "rewards/bleu_reward_func/mean": 0.09350405633449554,
      "rewards/bleu_reward_func/std": 0.10825508832931519,
      "step": 546
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 333.125,
      "completions/mean_terminated_length": 291.8461608886719,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.4376,
      "grad_norm": 3.2068068981170654,
      "kl": 0.03753662109375,
      "learning_rate": 1e-06,
      "loss": -0.1119,
      "num_tokens": 7269005.0,
      "reward": 0.03155931830406189,
      "reward_std": 0.018099911510944366,
      "rewards/bleu_reward_func/mean": 0.03155931830406189,
      "rewards/bleu_reward_func/std": 0.02396266907453537,
      "step": 547
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 327.8125,
      "completions/mean_terminated_length": 301.5,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.4384,
      "grad_norm": 3.2204573154449463,
      "kl": 0.041412353515625,
      "learning_rate": 1e-06,
      "loss": -0.0929,
      "num_tokens": 7285751.0,
      "reward": 0.031415294855833054,
      "reward_std": 0.011055306531488895,
      "rewards/bleu_reward_func/mean": 0.031415294855833054,
      "rewards/bleu_reward_func/std": 0.025729909539222717,
      "step": 548
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 378.0,
      "completions/mean_length": 274.90625,
      "completions/mean_terminated_length": 182.13043212890625,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.4392,
      "grad_norm": 4.724465847015381,
      "kl": 0.041778564453125,
      "learning_rate": 1e-06,
      "loss": 0.132,
      "num_tokens": 7297668.0,
      "reward": 0.11313501000404358,
      "reward_std": 0.036550864577293396,
      "rewards/bleu_reward_func/mean": 0.11313501000404358,
      "rewards/bleu_reward_func/std": 0.048100944608449936,
      "step": 549
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 461.0,
      "completions/mean_length": 273.28125,
      "completions/mean_terminated_length": 193.70834350585938,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.44,
      "grad_norm": 3.449267625808716,
      "kl": 0.0501708984375,
      "learning_rate": 1e-06,
      "loss": -0.0538,
      "num_tokens": 7308973.0,
      "reward": 0.0663086399435997,
      "reward_std": 0.025801170617341995,
      "rewards/bleu_reward_func/mean": 0.0663086399435997,
      "rewards/bleu_reward_func/std": 0.05170860141515732,
      "step": 550
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 315.0625,
      "completions/mean_terminated_length": 249.4166717529297,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.4408,
      "grad_norm": 2.530473232269287,
      "kl": 0.04833984375,
      "learning_rate": 1e-06,
      "loss": 0.1201,
      "num_tokens": 7322271.0,
      "reward": 0.07075276970863342,
      "reward_std": 0.02442948892712593,
      "rewards/bleu_reward_func/mean": 0.07075276970863342,
      "rewards/bleu_reward_func/std": 0.027710873633623123,
      "step": 551
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 299.0,
      "completions/mean_length": 342.28125,
      "completions/mean_terminated_length": 172.5625,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.4416,
      "grad_norm": 3.0482726097106934,
      "kl": 0.04998779296875,
      "learning_rate": 1e-06,
      "loss": -0.0208,
      "num_tokens": 7336592.0,
      "reward": 0.03471437096595764,
      "reward_std": 0.006943271495401859,
      "rewards/bleu_reward_func/mean": 0.03471437096595764,
      "rewards/bleu_reward_func/std": 0.018991071730852127,
      "step": 552
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 266.75,
      "completions/mean_terminated_length": 198.0800018310547,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.4424,
      "grad_norm": 4.837971210479736,
      "kl": 0.05224609375,
      "learning_rate": 1e-06,
      "loss": -0.027,
      "num_tokens": 7353232.0,
      "reward": 0.09570033848285675,
      "reward_std": 0.023528877645730972,
      "rewards/bleu_reward_func/mean": 0.09570033848285675,
      "rewards/bleu_reward_func/std": 0.0557950884103775,
      "step": 553
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 229.15625,
      "completions/mean_terminated_length": 188.75001525878906,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.4432,
      "grad_norm": 6.145666122436523,
      "kl": 0.04083251953125,
      "learning_rate": 1e-06,
      "loss": 0.1587,
      "num_tokens": 7363293.0,
      "reward": 0.15402229130268097,
      "reward_std": 0.060593266040086746,
      "rewards/bleu_reward_func/mean": 0.15402229130268097,
      "rewards/bleu_reward_func/std": 0.21718958020210266,
      "step": 554
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 431.96875,
      "completions/mean_terminated_length": 329.0714416503906,
      "completions/min_length": 209.0,
      "completions/min_terminated_length": 209.0,
      "epoch": 0.444,
      "grad_norm": 2.2111520767211914,
      "kl": 0.04241943359375,
      "learning_rate": 1e-06,
      "loss": -0.0774,
      "num_tokens": 7381116.0,
      "reward": 0.09687276929616928,
      "reward_std": 0.020684881135821342,
      "rewards/bleu_reward_func/mean": 0.09687276929616928,
      "rewards/bleu_reward_func/std": 0.0794241800904274,
      "step": 555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 290.5,
      "completions/mean_terminated_length": 283.3548278808594,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "epoch": 0.4448,
      "grad_norm": 3.065490484237671,
      "kl": 0.03900146484375,
      "learning_rate": 1e-06,
      "loss": -0.1863,
      "num_tokens": 7392580.0,
      "reward": 0.07492414861917496,
      "reward_std": 0.026103414595127106,
      "rewards/bleu_reward_func/mean": 0.07492414861917496,
      "rewards/bleu_reward_func/std": 0.0615268349647522,
      "step": 556
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 288.875,
      "completions/mean_terminated_length": 281.6773986816406,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.4456,
      "grad_norm": 3.6267192363739014,
      "kl": 0.03515625,
      "learning_rate": 1e-06,
      "loss": -0.1178,
      "num_tokens": 7407360.0,
      "reward": 0.051946789026260376,
      "reward_std": 0.01748417690396309,
      "rewards/bleu_reward_func/mean": 0.051946789026260376,
      "rewards/bleu_reward_func/std": 0.05551343038678169,
      "step": 557
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 256.0,
      "completions/mean_length": 358.4375,
      "completions/mean_terminated_length": 204.875,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "epoch": 0.4464,
      "grad_norm": 2.4198813438415527,
      "kl": 0.0382080078125,
      "learning_rate": 1e-06,
      "loss": 0.0175,
      "num_tokens": 7423862.0,
      "reward": 0.04432545229792595,
      "reward_std": 0.016445258632302284,
      "rewards/bleu_reward_func/mean": 0.04432545229792595,
      "rewards/bleu_reward_func/std": 0.027678146958351135,
      "step": 558
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 478.0,
      "completions/mean_length": 431.375,
      "completions/mean_terminated_length": 360.23529052734375,
      "completions/min_length": 262.0,
      "completions/min_terminated_length": 262.0,
      "epoch": 0.4472,
      "grad_norm": 2.404078722000122,
      "kl": 0.039306640625,
      "learning_rate": 1e-06,
      "loss": -0.0133,
      "num_tokens": 7440074.0,
      "reward": 0.03491047024726868,
      "reward_std": 0.017125248908996582,
      "rewards/bleu_reward_func/mean": 0.03491047024726868,
      "rewards/bleu_reward_func/std": 0.03506564348936081,
      "step": 559
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 376.21875,
      "completions/mean_terminated_length": 240.4375,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.448,
      "grad_norm": 3.0429000854492188,
      "kl": 0.054931640625,
      "learning_rate": 1e-06,
      "loss": -0.1107,
      "num_tokens": 7454921.0,
      "reward": 0.07136739790439606,
      "reward_std": 0.015603035688400269,
      "rewards/bleu_reward_func/mean": 0.07136739790439606,
      "rewards/bleu_reward_func/std": 0.06946107745170593,
      "step": 560
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 480.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 266.6875,
      "completions/mean_terminated_length": 266.6875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "epoch": 0.4488,
      "grad_norm": 2.670884609222412,
      "kl": 0.0489501953125,
      "learning_rate": 1e-06,
      "loss": 0.0048,
      "num_tokens": 7465999.0,
      "reward": 0.06271904706954956,
      "reward_std": 0.02406102605164051,
      "rewards/bleu_reward_func/mean": 0.06271904706954956,
      "rewards/bleu_reward_func/std": 0.04184677079319954,
      "step": 561
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 365.0,
      "completions/mean_length": 403.25,
      "completions/mean_terminated_length": 244.30770874023438,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.4496,
      "grad_norm": 2.7331299781799316,
      "kl": 0.03823089599609375,
      "learning_rate": 1e-06,
      "loss": 0.1477,
      "num_tokens": 7481983.0,
      "reward": 0.059584274888038635,
      "reward_std": 0.0577334426343441,
      "rewards/bleu_reward_func/mean": 0.059584274888038635,
      "rewards/bleu_reward_func/std": 0.1189492866396904,
      "step": 562
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 412.15625,
      "completions/mean_terminated_length": 393.6666564941406,
      "completions/min_length": 285.0,
      "completions/min_terminated_length": 285.0,
      "epoch": 0.4504,
      "grad_norm": 2.2429537773132324,
      "kl": 0.037750244140625,
      "learning_rate": 1e-06,
      "loss": 0.013,
      "num_tokens": 7497380.0,
      "reward": 0.061556171625852585,
      "reward_std": 0.018194040283560753,
      "rewards/bleu_reward_func/mean": 0.061556171625852585,
      "rewards/bleu_reward_func/std": 0.044673360884189606,
      "step": 563
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 418.75,
      "completions/mean_terminated_length": 346.22222900390625,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "epoch": 0.4512,
      "grad_norm": 2.5125789642333984,
      "kl": 0.0450439453125,
      "learning_rate": 1e-06,
      "loss": 0.0744,
      "num_tokens": 7514652.0,
      "reward": 0.09208458662033081,
      "reward_std": 0.031232329085469246,
      "rewards/bleu_reward_func/mean": 0.09208458662033081,
      "rewards/bleu_reward_func/std": 0.10837023705244064,
      "step": 564
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 323.96875,
      "completions/mean_terminated_length": 261.29168701171875,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "epoch": 0.452,
      "grad_norm": 2.9214463233947754,
      "kl": 0.048583984375,
      "learning_rate": 1e-06,
      "loss": -0.0177,
      "num_tokens": 7527531.0,
      "reward": 0.0327458456158638,
      "reward_std": 0.007936608046293259,
      "rewards/bleu_reward_func/mean": 0.0327458456158638,
      "rewards/bleu_reward_func/std": 0.023914847522974014,
      "step": 565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 380.90625,
      "completions/mean_terminated_length": 312.23809814453125,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "epoch": 0.4528,
      "grad_norm": 2.2859840393066406,
      "kl": 0.045196533203125,
      "learning_rate": 1e-06,
      "loss": -0.0492,
      "num_tokens": 7542848.0,
      "reward": 0.07620274275541306,
      "reward_std": 0.027277415618300438,
      "rewards/bleu_reward_func/mean": 0.07620274275541306,
      "rewards/bleu_reward_func/std": 0.043575797230005264,
      "step": 566
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 295.84375,
      "completions/mean_terminated_length": 288.8709716796875,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "epoch": 0.4536,
      "grad_norm": 2.6817574501037598,
      "kl": 0.035888671875,
      "learning_rate": 1e-06,
      "loss": 0.0675,
      "num_tokens": 7554347.0,
      "reward": 0.04382229968905449,
      "reward_std": 0.016639089211821556,
      "rewards/bleu_reward_func/mean": 0.04382229968905449,
      "rewards/bleu_reward_func/std": 0.048808448016643524,
      "step": 567
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 396.875,
      "completions/mean_terminated_length": 143.60000610351562,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.4544,
      "grad_norm": 3.5614659786224365,
      "kl": 0.03765869140625,
      "learning_rate": 1e-06,
      "loss": -0.1146,
      "num_tokens": 7571287.0,
      "reward": 0.047724343836307526,
      "reward_std": 0.025788918137550354,
      "rewards/bleu_reward_func/mean": 0.047724343836307526,
      "rewards/bleu_reward_func/std": 0.0531436912715435,
      "step": 568
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 324.8125,
      "completions/mean_terminated_length": 281.6153869628906,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "epoch": 0.4552,
      "grad_norm": 2.7155921459198,
      "kl": 0.031982421875,
      "learning_rate": 1e-06,
      "loss": 0.0584,
      "num_tokens": 7583833.0,
      "reward": 0.10536953061819077,
      "reward_std": 0.01935265213251114,
      "rewards/bleu_reward_func/mean": 0.10536953061819077,
      "rewards/bleu_reward_func/std": 0.1391269862651825,
      "step": 569
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 389.125,
      "completions/mean_terminated_length": 280.70587158203125,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.456,
      "grad_norm": 3.139817953109741,
      "kl": 0.045562744140625,
      "learning_rate": 1e-06,
      "loss": 0.1345,
      "num_tokens": 7600669.0,
      "reward": 0.03381893038749695,
      "reward_std": 0.016742901876568794,
      "rewards/bleu_reward_func/mean": 0.03381893038749695,
      "rewards/bleu_reward_func/std": 0.02772444114089012,
      "step": 570
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 336.28125,
      "completions/mean_terminated_length": 244.23809814453125,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "epoch": 0.4568,
      "grad_norm": 3.0243892669677734,
      "kl": 0.043701171875,
      "learning_rate": 1e-06,
      "loss": 0.066,
      "num_tokens": 7613814.0,
      "reward": 0.04240579158067703,
      "reward_std": 0.01026132982224226,
      "rewards/bleu_reward_func/mean": 0.04240579158067703,
      "rewards/bleu_reward_func/std": 0.03058365173637867,
      "step": 571
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 452.0,
      "completions/mean_length": 208.8125,
      "completions/mean_terminated_length": 199.03225708007812,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.4576,
      "grad_norm": 5.602510929107666,
      "kl": 0.04998779296875,
      "learning_rate": 1e-06,
      "loss": 0.0459,
      "num_tokens": 7623040.0,
      "reward": 0.08647982776165009,
      "reward_std": 0.03401945158839226,
      "rewards/bleu_reward_func/mean": 0.08647982776165009,
      "rewards/bleu_reward_func/std": 0.06481946259737015,
      "step": 572
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 394.9375,
      "completions/mean_terminated_length": 314.84210205078125,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "epoch": 0.4584,
      "grad_norm": 2.113441228866577,
      "kl": 0.0284423828125,
      "learning_rate": 1e-06,
      "loss": -0.1758,
      "num_tokens": 7641758.0,
      "reward": 0.05996987968683243,
      "reward_std": 0.03625640273094177,
      "rewards/bleu_reward_func/mean": 0.05996987968683243,
      "rewards/bleu_reward_func/std": 0.07776294648647308,
      "step": 573
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 469.0,
      "completions/mean_length": 338.125,
      "completions/mean_terminated_length": 289.44000244140625,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "epoch": 0.4592,
      "grad_norm": 3.0386271476745605,
      "kl": 0.03668212890625,
      "learning_rate": 1e-06,
      "loss": 0.0939,
      "num_tokens": 7655066.0,
      "reward": 0.04174087196588516,
      "reward_std": 0.01017804816365242,
      "rewards/bleu_reward_func/mean": 0.04174087196588516,
      "rewards/bleu_reward_func/std": 0.011822175234556198,
      "step": 574
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 479.0,
      "completions/mean_length": 361.4375,
      "completions/mean_terminated_length": 258.4210510253906,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.46,
      "grad_norm": 3.434180498123169,
      "kl": 0.035400390625,
      "learning_rate": 1e-06,
      "loss": 0.0801,
      "num_tokens": 7670080.0,
      "reward": 0.061414189636707306,
      "reward_std": 0.017353273928165436,
      "rewards/bleu_reward_func/mean": 0.061414189636707306,
      "rewards/bleu_reward_func/std": 0.06352873891592026,
      "step": 575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 359.25,
      "completions/mean_terminated_length": 289.81817626953125,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "epoch": 0.4608,
      "grad_norm": 3.759580612182617,
      "kl": 0.046783447265625,
      "learning_rate": 1e-06,
      "loss": 0.0683,
      "num_tokens": 7684952.0,
      "reward": 0.03925281763076782,
      "reward_std": 0.01383259054273367,
      "rewards/bleu_reward_func/mean": 0.03925281763076782,
      "rewards/bleu_reward_func/std": 0.026625417172908783,
      "step": 576
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 467.0,
      "completions/mean_length": 463.9375,
      "completions/mean_terminated_length": 341.1111145019531,
      "completions/min_length": 223.0,
      "completions/min_terminated_length": 223.0,
      "epoch": 0.4616,
      "grad_norm": 2.7189910411834717,
      "kl": 0.037567138671875,
      "learning_rate": 1e-06,
      "loss": 0.0284,
      "num_tokens": 7704214.0,
      "reward": 0.05996118485927582,
      "reward_std": 0.011609978042542934,
      "rewards/bleu_reward_func/mean": 0.05996118485927582,
      "rewards/bleu_reward_func/std": 0.016379138454794884,
      "step": 577
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 316.84375,
      "completions/mean_terminated_length": 296.6551818847656,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "epoch": 0.4624,
      "grad_norm": 3.2235851287841797,
      "kl": 0.0435791015625,
      "learning_rate": 1e-06,
      "loss": 0.0329,
      "num_tokens": 7718913.0,
      "reward": 0.046588048338890076,
      "reward_std": 0.01663251593708992,
      "rewards/bleu_reward_func/mean": 0.046588048338890076,
      "rewards/bleu_reward_func/std": 0.027757668867707253,
      "step": 578
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 347.71875,
      "completions/mean_terminated_length": 219.94444274902344,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.4632,
      "grad_norm": 7.463287353515625,
      "kl": 0.03564453125,
      "learning_rate": 1e-06,
      "loss": -0.2157,
      "num_tokens": 7732912.0,
      "reward": 0.046244870871305466,
      "reward_std": 0.029473457485437393,
      "rewards/bleu_reward_func/mean": 0.046244870871305466,
      "rewards/bleu_reward_func/std": 0.047696422785520554,
      "step": 579
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 329.21875,
      "completions/mean_terminated_length": 295.370361328125,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.464,
      "grad_norm": 3.48012375831604,
      "kl": 0.058197021484375,
      "learning_rate": 1e-06,
      "loss": -0.0137,
      "num_tokens": 7746103.0,
      "reward": 0.08398178219795227,
      "reward_std": 0.02495785802602768,
      "rewards/bleu_reward_func/mean": 0.08398178219795227,
      "rewards/bleu_reward_func/std": 0.042412761598825455,
      "step": 580
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 448.0,
      "completions/mean_length": 266.21875,
      "completions/mean_terminated_length": 118.75,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.4648,
      "grad_norm": 4.121673107147217,
      "kl": 0.0460205078125,
      "learning_rate": 1e-06,
      "loss": 0.0792,
      "num_tokens": 7756350.0,
      "reward": 0.05314105004072189,
      "reward_std": 0.025027906522154808,
      "rewards/bleu_reward_func/mean": 0.05314105004072189,
      "rewards/bleu_reward_func/std": 0.0441046804189682,
      "step": 581
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 473.0,
      "completions/mean_length": 312.0,
      "completions/mean_terminated_length": 233.7391357421875,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.4656,
      "grad_norm": 4.499547004699707,
      "kl": 0.037322998046875,
      "learning_rate": 1e-06,
      "loss": -0.037,
      "num_tokens": 7769326.0,
      "reward": 0.03417757526040077,
      "reward_std": 0.014021034352481365,
      "rewards/bleu_reward_func/mean": 0.03417757526040077,
      "rewards/bleu_reward_func/std": 0.020486222580075264,
      "step": 582
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 211.9375,
      "completions/mean_terminated_length": 191.933349609375,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.4664,
      "grad_norm": 3.6834604740142822,
      "kl": 0.05157470703125,
      "learning_rate": 1e-06,
      "loss": -0.0825,
      "num_tokens": 7781164.0,
      "reward": 0.03835342079401016,
      "reward_std": 0.012080431915819645,
      "rewards/bleu_reward_func/mean": 0.03835342079401016,
      "rewards/bleu_reward_func/std": 0.03473048284649849,
      "step": 583
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 377.78125,
      "completions/mean_terminated_length": 340.1999816894531,
      "completions/min_length": 225.0,
      "completions/min_terminated_length": 225.0,
      "epoch": 0.4672,
      "grad_norm": 2.79353666305542,
      "kl": 0.0355224609375,
      "learning_rate": 1e-06,
      "loss": 0.0312,
      "num_tokens": 7796253.0,
      "reward": 0.053808994591236115,
      "reward_std": 0.011477467603981495,
      "rewards/bleu_reward_func/mean": 0.053808994591236115,
      "rewards/bleu_reward_func/std": 0.026024699211120605,
      "step": 584
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 308.5625,
      "completions/mean_terminated_length": 287.5172424316406,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.468,
      "grad_norm": 2.750420570373535,
      "kl": 0.04071044921875,
      "learning_rate": 1e-06,
      "loss": -0.1319,
      "num_tokens": 7809199.0,
      "reward": 0.033281125128269196,
      "reward_std": 0.0192633755505085,
      "rewards/bleu_reward_func/mean": 0.033281125128269196,
      "rewards/bleu_reward_func/std": 0.02813081629574299,
      "step": 585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 257.0,
      "completions/mean_length": 424.75,
      "completions/mean_terminated_length": 163.0,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.4688,
      "grad_norm": 3.282240152359009,
      "kl": 0.05096435546875,
      "learning_rate": 1e-06,
      "loss": -0.057,
      "num_tokens": 7825903.0,
      "reward": 0.03255011513829231,
      "reward_std": 0.009569083340466022,
      "rewards/bleu_reward_func/mean": 0.03255011513829231,
      "rewards/bleu_reward_func/std": 0.025185568258166313,
      "step": 586
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 283.75,
      "completions/mean_terminated_length": 268.5333557128906,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "epoch": 0.4696,
      "grad_norm": 3.6260647773742676,
      "kl": 0.03955078125,
      "learning_rate": 1e-06,
      "loss": -0.18,
      "num_tokens": 7837039.0,
      "reward": 0.0632857158780098,
      "reward_std": 0.03276119753718376,
      "rewards/bleu_reward_func/mean": 0.0632857158780098,
      "rewards/bleu_reward_func/std": 0.049753375351428986,
      "step": 587
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 500.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 310.625,
      "completions/mean_terminated_length": 310.625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "epoch": 0.4704,
      "grad_norm": 2.673442840576172,
      "kl": 0.037353515625,
      "learning_rate": 1e-06,
      "loss": 0.0266,
      "num_tokens": 7848931.0,
      "reward": 0.04075375944375992,
      "reward_std": 0.01151657197624445,
      "rewards/bleu_reward_func/mean": 0.04075375944375992,
      "rewards/bleu_reward_func/std": 0.02330603078007698,
      "step": 588
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 306.9375,
      "completions/mean_terminated_length": 277.64288330078125,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.4712,
      "grad_norm": 4.481053829193115,
      "kl": 0.046234130859375,
      "learning_rate": 1e-06,
      "loss": 0.0059,
      "num_tokens": 7862305.0,
      "reward": 0.05103903263807297,
      "reward_std": 0.01719430461525917,
      "rewards/bleu_reward_func/mean": 0.05103903263807297,
      "rewards/bleu_reward_func/std": 0.05009397119283676,
      "step": 589
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 282.0625,
      "completions/mean_terminated_length": 239.48147583007812,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "epoch": 0.472,
      "grad_norm": 2.702883243560791,
      "kl": 0.038818359375,
      "learning_rate": 1e-06,
      "loss": 0.0339,
      "num_tokens": 7873635.0,
      "reward": 0.043863605707883835,
      "reward_std": 0.01219463162124157,
      "rewards/bleu_reward_func/mean": 0.043863605707883835,
      "rewards/bleu_reward_func/std": 0.02272559143602848,
      "step": 590
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 380.5625,
      "completions/mean_terminated_length": 311.71429443359375,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "epoch": 0.4728,
      "grad_norm": 3.459629535675049,
      "kl": 0.030670166015625,
      "learning_rate": 1e-06,
      "loss": 0.0118,
      "num_tokens": 7888205.0,
      "reward": 0.05964861810207367,
      "reward_std": 0.014393117278814316,
      "rewards/bleu_reward_func/mean": 0.05964861810207367,
      "rewards/bleu_reward_func/std": 0.030775554478168488,
      "step": 591
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 208.875,
      "completions/mean_terminated_length": 152.74073791503906,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.4736,
      "grad_norm": 3.997481346130371,
      "kl": 0.053131103515625,
      "learning_rate": 1e-06,
      "loss": -0.0981,
      "num_tokens": 7896961.0,
      "reward": 0.09768746048212051,
      "reward_std": 0.06965920329093933,
      "rewards/bleu_reward_func/mean": 0.09768746048212051,
      "rewards/bleu_reward_func/std": 0.09702237695455551,
      "step": 592
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 365.0,
      "completions/mean_terminated_length": 264.4210510253906,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "epoch": 0.4744,
      "grad_norm": 2.9996702671051025,
      "kl": 0.04754638671875,
      "learning_rate": 1e-06,
      "loss": -0.0128,
      "num_tokens": 7911913.0,
      "reward": 0.03393930196762085,
      "reward_std": 0.011044314131140709,
      "rewards/bleu_reward_func/mean": 0.03393930196762085,
      "rewards/bleu_reward_func/std": 0.02244512550532818,
      "step": 593
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 398.4375,
      "completions/mean_terminated_length": 360.5833435058594,
      "completions/min_length": 220.0,
      "completions/min_terminated_length": 220.0,
      "epoch": 0.4752,
      "grad_norm": 2.3302993774414062,
      "kl": 0.027374267578125,
      "learning_rate": 1e-06,
      "loss": 0.003,
      "num_tokens": 7926615.0,
      "reward": 0.055141009390354156,
      "reward_std": 0.03131604939699173,
      "rewards/bleu_reward_func/mean": 0.055141009390354156,
      "rewards/bleu_reward_func/std": 0.03603653982281685,
      "step": 594
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 462.0,
      "completions/mean_length": 345.96875,
      "completions/mean_terminated_length": 290.625,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "epoch": 0.476,
      "grad_norm": 2.3898162841796875,
      "kl": 0.03863525390625,
      "learning_rate": 1e-06,
      "loss": -0.0262,
      "num_tokens": 7942582.0,
      "reward": 0.09485931694507599,
      "reward_std": 0.0281388983130455,
      "rewards/bleu_reward_func/mean": 0.09485931694507599,
      "rewards/bleu_reward_func/std": 0.09958592057228088,
      "step": 595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 447.4375,
      "completions/mean_terminated_length": 390.4705810546875,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "epoch": 0.4768,
      "grad_norm": 2.3441829681396484,
      "kl": 0.0261077880859375,
      "learning_rate": 1e-06,
      "loss": -0.0278,
      "num_tokens": 7960692.0,
      "reward": 0.0386398509144783,
      "reward_std": 0.015945829451084137,
      "rewards/bleu_reward_func/mean": 0.0386398509144783,
      "rewards/bleu_reward_func/std": 0.030894169583916664,
      "step": 596
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 430.0,
      "completions/max_terminated_length": 430.0,
      "completions/mean_length": 231.21875,
      "completions/mean_terminated_length": 231.21875,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "epoch": 0.4776,
      "grad_norm": 3.6054701805114746,
      "kl": 0.04876708984375,
      "learning_rate": 1e-06,
      "loss": 0.1195,
      "num_tokens": 7971027.0,
      "reward": 0.060304559767246246,
      "reward_std": 0.019354872405529022,
      "rewards/bleu_reward_func/mean": 0.060304559767246246,
      "rewards/bleu_reward_func/std": 0.02801518701016903,
      "step": 597
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 447.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 247.40625,
      "completions/mean_terminated_length": 247.40625,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "epoch": 0.4784,
      "grad_norm": 3.131502628326416,
      "kl": 0.036651611328125,
      "learning_rate": 1e-06,
      "loss": 0.0377,
      "num_tokens": 7981976.0,
      "reward": 0.04672680422663689,
      "reward_std": 0.014762789011001587,
      "rewards/bleu_reward_func/mean": 0.04672680422663689,
      "rewards/bleu_reward_func/std": 0.02195819653570652,
      "step": 598
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 417.25,
      "completions/mean_terminated_length": 352.4210510253906,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "epoch": 0.4792,
      "grad_norm": 2.603997230529785,
      "kl": 0.0496826171875,
      "learning_rate": 1e-06,
      "loss": -0.1255,
      "num_tokens": 7998472.0,
      "reward": 0.027579082176089287,
      "reward_std": 0.01969665102660656,
      "rewards/bleu_reward_func/mean": 0.027579082176089287,
      "rewards/bleu_reward_func/std": 0.03603406250476837,
      "step": 599
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 455.0,
      "completions/mean_length": 328.78125,
      "completions/mean_terminated_length": 257.08697509765625,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.48,
      "grad_norm": 4.4895172119140625,
      "kl": 0.05499267578125,
      "learning_rate": 1e-06,
      "loss": 0.0563,
      "num_tokens": 8015513.0,
      "reward": 0.12187319993972778,
      "reward_std": 0.024902716279029846,
      "rewards/bleu_reward_func/mean": 0.12187319993972778,
      "rewards/bleu_reward_func/std": 0.04940319433808327,
      "step": 600
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 402.21875,
      "completions/mean_terminated_length": 305.3529357910156,
      "completions/min_length": 79.0,
      "completions/min_terminated_length": 79.0,
      "epoch": 0.4808,
      "grad_norm": 2.3858461380004883,
      "kl": 0.04559326171875,
      "learning_rate": 1e-06,
      "loss": -0.146,
      "num_tokens": 8031040.0,
      "reward": 0.05429249256849289,
      "reward_std": 0.025205722078680992,
      "rewards/bleu_reward_func/mean": 0.05429249256849289,
      "rewards/bleu_reward_func/std": 0.040760673582553864,
      "step": 601
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 402.625,
      "completions/mean_terminated_length": 317.5555725097656,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.4816,
      "grad_norm": 2.570845127105713,
      "kl": 0.04290771484375,
      "learning_rate": 1e-06,
      "loss": -0.1161,
      "num_tokens": 8047092.0,
      "reward": 0.03673902899026871,
      "reward_std": 0.02603769302368164,
      "rewards/bleu_reward_func/mean": 0.03673902899026871,
      "rewards/bleu_reward_func/std": 0.03560277447104454,
      "step": 602
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 347.21875,
      "completions/mean_terminated_length": 272.31817626953125,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "epoch": 0.4824,
      "grad_norm": 2.776517391204834,
      "kl": 0.038055419921875,
      "learning_rate": 1e-06,
      "loss": -0.0265,
      "num_tokens": 8061299.0,
      "reward": 0.058416951447725296,
      "reward_std": 0.016790183261036873,
      "rewards/bleu_reward_func/mean": 0.058416951447725296,
      "rewards/bleu_reward_func/std": 0.025730496272444725,
      "step": 603
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 392.0,
      "completions/max_terminated_length": 392.0,
      "completions/mean_length": 223.1875,
      "completions/mean_terminated_length": 223.1875,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "epoch": 0.4832,
      "grad_norm": 3.1524572372436523,
      "kl": 0.0537109375,
      "learning_rate": 1e-06,
      "loss": 0.0024,
      "num_tokens": 8070425.0,
      "reward": 0.05830112844705582,
      "reward_std": 0.019749192520976067,
      "rewards/bleu_reward_func/mean": 0.05830112844705582,
      "rewards/bleu_reward_func/std": 0.06983724236488342,
      "step": 604
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 315.5625,
      "completions/mean_terminated_length": 181.15789794921875,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.484,
      "grad_norm": 2.831850051879883,
      "kl": 0.048583984375,
      "learning_rate": 1e-06,
      "loss": 0.1295,
      "num_tokens": 8084955.0,
      "reward": 0.08599106967449188,
      "reward_std": 0.029317699372768402,
      "rewards/bleu_reward_func/mean": 0.08599106967449188,
      "rewards/bleu_reward_func/std": 0.08069133013486862,
      "step": 605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 328.96875,
      "completions/mean_terminated_length": 277.7200012207031,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.4848,
      "grad_norm": 2.704080820083618,
      "kl": 0.034210205078125,
      "learning_rate": 1e-06,
      "loss": -0.0324,
      "num_tokens": 8101450.0,
      "reward": 0.12661004066467285,
      "reward_std": 0.03845934569835663,
      "rewards/bleu_reward_func/mean": 0.12661004066467285,
      "rewards/bleu_reward_func/std": 0.16385024785995483,
      "step": 606
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 420.875,
      "completions/mean_terminated_length": 395.3599853515625,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "epoch": 0.4856,
      "grad_norm": 2.152388095855713,
      "kl": 0.030975341796875,
      "learning_rate": 1e-06,
      "loss": -0.0631,
      "num_tokens": 8119342.0,
      "reward": 0.05611906573176384,
      "reward_std": 0.01729283295571804,
      "rewards/bleu_reward_func/mean": 0.05611906573176384,
      "rewards/bleu_reward_func/std": 0.03917882218956947,
      "step": 607
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 371.25,
      "completions/mean_terminated_length": 338.7692565917969,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "epoch": 0.4864,
      "grad_norm": 2.3123111724853516,
      "kl": 0.037353515625,
      "learning_rate": 1e-06,
      "loss": -0.0253,
      "num_tokens": 8133462.0,
      "reward": 0.046634070575237274,
      "reward_std": 0.016718275845050812,
      "rewards/bleu_reward_func/mean": 0.046634070575237274,
      "rewards/bleu_reward_func/std": 0.04330248758196831,
      "step": 608
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 313.4375,
      "completions/mean_terminated_length": 247.25,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "epoch": 0.4872,
      "grad_norm": 3.9397592544555664,
      "kl": 0.04949951171875,
      "learning_rate": 1e-06,
      "loss": 0.0353,
      "num_tokens": 8149140.0,
      "reward": 0.11079287528991699,
      "reward_std": 0.02652319148182869,
      "rewards/bleu_reward_func/mean": 0.11079287528991699,
      "rewards/bleu_reward_func/std": 0.04020438715815544,
      "step": 609
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 338.0,
      "completions/mean_length": 275.75,
      "completions/mean_terminated_length": 197.0,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.488,
      "grad_norm": 3.4938271045684814,
      "kl": 0.0595703125,
      "learning_rate": 1e-06,
      "loss": -0.0852,
      "num_tokens": 8162356.0,
      "reward": 0.0434708371758461,
      "reward_std": 0.008759420365095139,
      "rewards/bleu_reward_func/mean": 0.0434708371758461,
      "rewards/bleu_reward_func/std": 0.01517016626894474,
      "step": 610
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 506.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 327.65625,
      "completions/mean_terminated_length": 327.65625,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "epoch": 0.4888,
      "grad_norm": 2.629668951034546,
      "kl": 0.05108642578125,
      "learning_rate": 1e-06,
      "loss": 0.1184,
      "num_tokens": 8175073.0,
      "reward": 0.06240731105208397,
      "reward_std": 0.019540153443813324,
      "rewards/bleu_reward_func/mean": 0.06240731105208397,
      "rewards/bleu_reward_func/std": 0.04322398081421852,
      "step": 611
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 369.78125,
      "completions/mean_terminated_length": 360.3000183105469,
      "completions/min_length": 207.0,
      "completions/min_terminated_length": 207.0,
      "epoch": 0.4896,
      "grad_norm": 2.4656405448913574,
      "kl": 0.04461669921875,
      "learning_rate": 1e-06,
      "loss": 0.1145,
      "num_tokens": 8189442.0,
      "reward": 0.017378607764840126,
      "reward_std": 0.005311779212206602,
      "rewards/bleu_reward_func/mean": 0.017378607764840126,
      "rewards/bleu_reward_func/std": 0.012686874717473984,
      "step": 612
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 286.375,
      "completions/mean_terminated_length": 254.1428680419922,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.4904,
      "grad_norm": 4.021340370178223,
      "kl": 0.05572509765625,
      "learning_rate": 1e-06,
      "loss": -0.0739,
      "num_tokens": 8200734.0,
      "reward": 0.0681406781077385,
      "reward_std": 0.02279684692621231,
      "rewards/bleu_reward_func/mean": 0.0681406781077385,
      "rewards/bleu_reward_func/std": 0.0697670578956604,
      "step": 613
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 249.03125,
      "completions/mean_terminated_length": 221.8275909423828,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.4912,
      "grad_norm": 6.891125679016113,
      "kl": 0.041168212890625,
      "learning_rate": 1e-06,
      "loss": 0.0114,
      "num_tokens": 8213271.0,
      "reward": 0.10955880582332611,
      "reward_std": 0.05131708085536957,
      "rewards/bleu_reward_func/mean": 0.10955880582332611,
      "rewards/bleu_reward_func/std": 0.11350703984498978,
      "step": 614
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 461.0,
      "completions/max_terminated_length": 461.0,
      "completions/mean_length": 193.09375,
      "completions/mean_terminated_length": 193.09375,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.492,
      "grad_norm": 5.938106536865234,
      "kl": 0.0616455078125,
      "learning_rate": 1e-06,
      "loss": 0.0362,
      "num_tokens": 8222730.0,
      "reward": 0.06254906952381134,
      "reward_std": 0.013879001140594482,
      "rewards/bleu_reward_func/mean": 0.06254906952381134,
      "rewards/bleu_reward_func/std": 0.03822917118668556,
      "step": 615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 403.0625,
      "completions/mean_terminated_length": 279.6000061035156,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.4928,
      "grad_norm": 2.2214114665985107,
      "kl": 0.044952392578125,
      "learning_rate": 1e-06,
      "loss": 0.1107,
      "num_tokens": 8238876.0,
      "reward": 0.0595497228205204,
      "reward_std": 0.02531789056956768,
      "rewards/bleu_reward_func/mean": 0.0595497228205204,
      "rewards/bleu_reward_func/std": 0.04042090103030205,
      "step": 616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 370.3125,
      "completions/mean_terminated_length": 314.86956787109375,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.4936,
      "grad_norm": 2.4716479778289795,
      "kl": 0.04425048828125,
      "learning_rate": 1e-06,
      "loss": -0.1187,
      "num_tokens": 8252774.0,
      "reward": 0.040819909423589706,
      "reward_std": 0.01310974545776844,
      "rewards/bleu_reward_func/mean": 0.040819909423589706,
      "rewards/bleu_reward_func/std": 0.014063726179301739,
      "step": 617
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 303.21875,
      "completions/mean_terminated_length": 193.85714721679688,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 54.0,
      "epoch": 0.4944,
      "grad_norm": 3.001490592956543,
      "kl": 0.0645751953125,
      "learning_rate": 1e-06,
      "loss": -0.084,
      "num_tokens": 8268237.0,
      "reward": 0.06573346257209778,
      "reward_std": 0.016953492537140846,
      "rewards/bleu_reward_func/mean": 0.06573346257209778,
      "rewards/bleu_reward_func/std": 0.029022369533777237,
      "step": 618
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 290.21875,
      "completions/mean_terminated_length": 189.4091033935547,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.4952,
      "grad_norm": 3.4469478130340576,
      "kl": 0.056976318359375,
      "learning_rate": 1e-06,
      "loss": 0.0337,
      "num_tokens": 8279260.0,
      "reward": 0.11688727140426636,
      "reward_std": 0.05538788437843323,
      "rewards/bleu_reward_func/mean": 0.11688727140426636,
      "rewards/bleu_reward_func/std": 0.15909381210803986,
      "step": 619
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 415.0,
      "completions/max_terminated_length": 415.0,
      "completions/mean_length": 253.125,
      "completions/mean_terminated_length": 253.125,
      "completions/min_length": 82.0,
      "completions/min_terminated_length": 82.0,
      "epoch": 0.496,
      "grad_norm": 3.6850404739379883,
      "kl": 0.0391845703125,
      "learning_rate": 1e-06,
      "loss": -0.0111,
      "num_tokens": 8289320.0,
      "reward": 0.08325017243623734,
      "reward_std": 0.038202736526727676,
      "rewards/bleu_reward_func/mean": 0.08325017243623734,
      "rewards/bleu_reward_func/std": 0.09062850475311279,
      "step": 620
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 479.0,
      "completions/mean_length": 210.125,
      "completions/mean_terminated_length": 178.89654541015625,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.4968,
      "grad_norm": 3.800476551055908,
      "kl": 0.067138671875,
      "learning_rate": 1e-06,
      "loss": -0.0907,
      "num_tokens": 8297828.0,
      "reward": 0.03796212375164032,
      "reward_std": 0.013833219185471535,
      "rewards/bleu_reward_func/mean": 0.03796212375164032,
      "rewards/bleu_reward_func/std": 0.017871350049972534,
      "step": 621
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 430.0,
      "completions/mean_length": 381.5625,
      "completions/mean_terminated_length": 233.7333526611328,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.4976,
      "grad_norm": 2.6759557723999023,
      "kl": 0.0574951171875,
      "learning_rate": 1e-06,
      "loss": -0.0515,
      "num_tokens": 8315566.0,
      "reward": 0.03240504860877991,
      "reward_std": 0.015285526402294636,
      "rewards/bleu_reward_func/mean": 0.03240504860877991,
      "rewards/bleu_reward_func/std": 0.038935378193855286,
      "step": 622
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 389.0,
      "completions/mean_terminated_length": 266.0,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "epoch": 0.4984,
      "grad_norm": 2.6809816360473633,
      "kl": 0.0592498779296875,
      "learning_rate": 1e-06,
      "loss": -0.1443,
      "num_tokens": 8332166.0,
      "reward": 0.09420234709978104,
      "reward_std": 0.040369659662246704,
      "rewards/bleu_reward_func/mean": 0.09420234709978104,
      "rewards/bleu_reward_func/std": 0.13240835070610046,
      "step": 623
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 418.0,
      "completions/mean_length": 251.125,
      "completions/mean_terminated_length": 202.8148193359375,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.4992,
      "grad_norm": 3.325798273086548,
      "kl": 0.039554595947265625,
      "learning_rate": 1e-06,
      "loss": 0.1752,
      "num_tokens": 8343354.0,
      "reward": 0.03703116998076439,
      "reward_std": 0.019214333966374397,
      "rewards/bleu_reward_func/mean": 0.03703116998076439,
      "rewards/bleu_reward_func/std": 0.023005735129117966,
      "step": 624
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 323.1875,
      "completions/mean_terminated_length": 270.32000732421875,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.5,
      "grad_norm": 3.1225502490997314,
      "kl": 0.0413818359375,
      "learning_rate": 1e-06,
      "loss": -0.04,
      "num_tokens": 8358024.0,
      "reward": 0.02892148494720459,
      "reward_std": 0.01050527486950159,
      "rewards/bleu_reward_func/mean": 0.02892148494720459,
      "rewards/bleu_reward_func/std": 0.016583766788244247,
      "step": 625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 433.0,
      "completions/mean_length": 312.8125,
      "completions/mean_terminated_length": 246.4166717529297,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.5008,
      "grad_norm": 4.3559088706970215,
      "kl": 0.0572509765625,
      "learning_rate": 1e-06,
      "loss": -0.1897,
      "num_tokens": 8370506.0,
      "reward": 0.06611833721399307,
      "reward_std": 0.023439275100827217,
      "rewards/bleu_reward_func/mean": 0.06611833721399307,
      "rewards/bleu_reward_func/std": 0.04732180014252663,
      "step": 626
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 455.0,
      "completions/max_terminated_length": 455.0,
      "completions/mean_length": 230.84375,
      "completions/mean_terminated_length": 230.84375,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.5016,
      "grad_norm": 6.250845909118652,
      "kl": 0.0501708984375,
      "learning_rate": 1e-06,
      "loss": -0.0384,
      "num_tokens": 8380765.0,
      "reward": 0.20678502321243286,
      "reward_std": 0.03768392652273178,
      "rewards/bleu_reward_func/mean": 0.20678502321243286,
      "rewards/bleu_reward_func/std": 0.2695082426071167,
      "step": 627
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 416.0625,
      "completions/mean_terminated_length": 358.5,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "epoch": 0.5024,
      "grad_norm": 2.0310866832733154,
      "kl": 0.037506103515625,
      "learning_rate": 1e-06,
      "loss": 0.0731,
      "num_tokens": 8396487.0,
      "reward": 0.06099681928753853,
      "reward_std": 0.019012173637747765,
      "rewards/bleu_reward_func/mean": 0.06099681928753853,
      "rewards/bleu_reward_func/std": 0.08027222752571106,
      "step": 628
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 283.9375,
      "completions/mean_terminated_length": 180.27273559570312,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.5032,
      "grad_norm": 3.856144666671753,
      "kl": 0.045654296875,
      "learning_rate": 1e-06,
      "loss": 0.0224,
      "num_tokens": 8407765.0,
      "reward": 0.05136800557374954,
      "reward_std": 0.014663058333098888,
      "rewards/bleu_reward_func/mean": 0.05136800557374954,
      "rewards/bleu_reward_func/std": 0.04050833731889725,
      "step": 629
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 352.34375,
      "completions/mean_terminated_length": 279.7727355957031,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "epoch": 0.504,
      "grad_norm": 2.120601177215576,
      "kl": 0.030643463134765625,
      "learning_rate": 1e-06,
      "loss": -0.0581,
      "num_tokens": 8424784.0,
      "reward": 0.18435360491275787,
      "reward_std": 0.07129880785942078,
      "rewards/bleu_reward_func/mean": 0.18435360491275787,
      "rewards/bleu_reward_func/std": 0.28386473655700684,
      "step": 630
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 447.75,
      "completions/mean_terminated_length": 409.20001220703125,
      "completions/min_length": 285.0,
      "completions/min_terminated_length": 285.0,
      "epoch": 0.5048,
      "grad_norm": 2.139927625656128,
      "kl": 0.04376220703125,
      "learning_rate": 1e-06,
      "loss": -0.0121,
      "num_tokens": 8443384.0,
      "reward": 0.03744620829820633,
      "reward_std": 0.011771570891141891,
      "rewards/bleu_reward_func/mean": 0.03744620829820633,
      "rewards/bleu_reward_func/std": 0.024914991110563278,
      "step": 631
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 433.0,
      "completions/max_terminated_length": 433.0,
      "completions/mean_length": 152.46875,
      "completions/mean_terminated_length": 152.46875,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.5056,
      "grad_norm": 4.558698654174805,
      "kl": 0.079833984375,
      "learning_rate": 1e-06,
      "loss": -0.0267,
      "num_tokens": 8451391.0,
      "reward": 0.06995508074760437,
      "reward_std": 0.03034752979874611,
      "rewards/bleu_reward_func/mean": 0.06995508074760437,
      "rewards/bleu_reward_func/std": 0.0674305334687233,
      "step": 632
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 341.34375,
      "completions/mean_terminated_length": 293.55999755859375,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 175.0,
      "epoch": 0.5064,
      "grad_norm": 2.3992624282836914,
      "kl": 0.045166015625,
      "learning_rate": 1e-06,
      "loss": -0.0131,
      "num_tokens": 8465602.0,
      "reward": 0.032830677926540375,
      "reward_std": 0.007615429349243641,
      "rewards/bleu_reward_func/mean": 0.032830677926540375,
      "rewards/bleu_reward_func/std": 0.017384473234415054,
      "step": 633
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 265.1875,
      "completions/mean_terminated_length": 219.48147583007812,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.5072,
      "grad_norm": 3.78257155418396,
      "kl": 0.044921875,
      "learning_rate": 1e-06,
      "loss": -0.0649,
      "num_tokens": 8476480.0,
      "reward": 0.08779050409793854,
      "reward_std": 0.03017434850335121,
      "rewards/bleu_reward_func/mean": 0.08779050409793854,
      "rewards/bleu_reward_func/std": 0.09867992997169495,
      "step": 634
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 340.28125,
      "completions/mean_terminated_length": 283.04168701171875,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "epoch": 0.508,
      "grad_norm": 2.275812864303589,
      "kl": 0.0479736328125,
      "learning_rate": 1e-06,
      "loss": -0.0002,
      "num_tokens": 8490385.0,
      "reward": 0.027935819700360298,
      "reward_std": 0.01714843139052391,
      "rewards/bleu_reward_func/mean": 0.027935819700360298,
      "rewards/bleu_reward_func/std": 0.02698652818799019,
      "step": 635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 339.8125,
      "completions/mean_terminated_length": 236.5,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "epoch": 0.5088,
      "grad_norm": 3.2245848178863525,
      "kl": 0.0477447509765625,
      "learning_rate": 1e-06,
      "loss": 0.1014,
      "num_tokens": 8505507.0,
      "reward": 0.12299371510744095,
      "reward_std": 0.08378162980079651,
      "rewards/bleu_reward_func/mean": 0.12299371510744095,
      "rewards/bleu_reward_func/std": 0.10894149541854858,
      "step": 636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 348.375,
      "completions/mean_terminated_length": 318.0740661621094,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "epoch": 0.5096,
      "grad_norm": 2.773343086242676,
      "kl": 0.036895751953125,
      "learning_rate": 1e-06,
      "loss": -0.1357,
      "num_tokens": 8521959.0,
      "reward": 0.05392155051231384,
      "reward_std": 0.021032003685832024,
      "rewards/bleu_reward_func/mean": 0.05392155051231384,
      "rewards/bleu_reward_func/std": 0.03237828612327576,
      "step": 637
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 471.0,
      "completions/mean_length": 373.21875,
      "completions/mean_terminated_length": 300.5238037109375,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "epoch": 0.5104,
      "grad_norm": 2.5911452770233154,
      "kl": 0.0361328125,
      "learning_rate": 1e-06,
      "loss": 0.0505,
      "num_tokens": 8535774.0,
      "reward": 0.07769744098186493,
      "reward_std": 0.01766272261738777,
      "rewards/bleu_reward_func/mean": 0.07769744098186493,
      "rewards/bleu_reward_func/std": 0.04564467817544937,
      "step": 638
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 479.0,
      "completions/mean_length": 336.5625,
      "completions/mean_terminated_length": 278.0833435058594,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.5112,
      "grad_norm": 2.6450626850128174,
      "kl": 0.04058837890625,
      "learning_rate": 1e-06,
      "loss": -0.112,
      "num_tokens": 8550680.0,
      "reward": 0.039511341601610184,
      "reward_std": 0.020058486610651016,
      "rewards/bleu_reward_func/mean": 0.039511341601610184,
      "rewards/bleu_reward_func/std": 0.03323696553707123,
      "step": 639
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 338.59375,
      "completions/mean_terminated_length": 219.94737243652344,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.512,
      "grad_norm": 3.6534550189971924,
      "kl": 0.0657958984375,
      "learning_rate": 1e-06,
      "loss": -0.0021,
      "num_tokens": 8564075.0,
      "reward": 0.04427193105220795,
      "reward_std": 0.009654231369495392,
      "rewards/bleu_reward_func/mean": 0.04427193105220795,
      "rewards/bleu_reward_func/std": 0.016233008354902267,
      "step": 640
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 387.90625,
      "completions/mean_terminated_length": 278.4117736816406,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "epoch": 0.5128,
      "grad_norm": 2.5884809494018555,
      "kl": 0.0670166015625,
      "learning_rate": 1e-06,
      "loss": -0.022,
      "num_tokens": 8579848.0,
      "reward": 0.04640874266624451,
      "reward_std": 0.009371737949550152,
      "rewards/bleu_reward_func/mean": 0.04640874266624451,
      "rewards/bleu_reward_func/std": 0.014582473784685135,
      "step": 641
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 420.34375,
      "completions/mean_terminated_length": 328.6875,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "epoch": 0.5136,
      "grad_norm": 2.2449564933776855,
      "kl": 0.05255126953125,
      "learning_rate": 1e-06,
      "loss": -0.0035,
      "num_tokens": 8597587.0,
      "reward": 0.027716750279068947,
      "reward_std": 0.00882766768336296,
      "rewards/bleu_reward_func/mean": 0.027716750279068947,
      "rewards/bleu_reward_func/std": 0.018463805317878723,
      "step": 642
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 458.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 274.0,
      "completions/mean_terminated_length": 274.0,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.5144,
      "grad_norm": 2.397284507751465,
      "kl": 0.0361328125,
      "learning_rate": 1e-06,
      "loss": 0.1852,
      "num_tokens": 8608219.0,
      "reward": 0.06662235409021378,
      "reward_std": 0.021093640476465225,
      "rewards/bleu_reward_func/mean": 0.06662235409021378,
      "rewards/bleu_reward_func/std": 0.04868040978908539,
      "step": 643
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 421.625,
      "completions/mean_terminated_length": 249.09091186523438,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "epoch": 0.5152,
      "grad_norm": 2.3074114322662354,
      "kl": 0.05316162109375,
      "learning_rate": 1e-06,
      "loss": 0.0663,
      "num_tokens": 8626695.0,
      "reward": 0.022337350994348526,
      "reward_std": 0.005283673293888569,
      "rewards/bleu_reward_func/mean": 0.022337350994348526,
      "rewards/bleu_reward_func/std": 0.014813877642154694,
      "step": 644
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 320.34375,
      "completions/mean_terminated_length": 266.67999267578125,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "epoch": 0.516,
      "grad_norm": 3.265336275100708,
      "kl": 0.041290283203125,
      "learning_rate": 1e-06,
      "loss": -0.0401,
      "num_tokens": 8640778.0,
      "reward": 0.08885028958320618,
      "reward_std": 0.04409442096948624,
      "rewards/bleu_reward_func/mean": 0.08885028958320618,
      "rewards/bleu_reward_func/std": 0.05580241233110428,
      "step": 645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 402.625,
      "completions/mean_terminated_length": 123.11111450195312,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.5168,
      "grad_norm": 4.289402008056641,
      "kl": 0.09149169921875,
      "learning_rate": 1e-06,
      "loss": 0.0137,
      "num_tokens": 8656158.0,
      "reward": 0.04943261295557022,
      "reward_std": 0.01372533105313778,
      "rewards/bleu_reward_func/mean": 0.04943261295557022,
      "rewards/bleu_reward_func/std": 0.022848060354590416,
      "step": 646
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 458.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 233.4375,
      "completions/mean_terminated_length": 233.4375,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.5176,
      "grad_norm": 2.6133716106414795,
      "kl": 0.04571533203125,
      "learning_rate": 1e-06,
      "loss": -0.0194,
      "num_tokens": 8666388.0,
      "reward": 0.06988528370857239,
      "reward_std": 0.0369546078145504,
      "rewards/bleu_reward_func/mean": 0.06988528370857239,
      "rewards/bleu_reward_func/std": 0.07029401510953903,
      "step": 647
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 475.0,
      "completions/mean_length": 386.09375,
      "completions/mean_terminated_length": 288.1666564941406,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "epoch": 0.5184,
      "grad_norm": 2.8481228351593018,
      "kl": 0.03912353515625,
      "learning_rate": 1e-06,
      "loss": -0.0154,
      "num_tokens": 8680743.0,
      "reward": 0.04563836753368378,
      "reward_std": 0.013341530226171017,
      "rewards/bleu_reward_func/mean": 0.04563836753368378,
      "rewards/bleu_reward_func/std": 0.028745615854859352,
      "step": 648
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 240.90625,
      "completions/mean_terminated_length": 150.5416717529297,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.5192,
      "grad_norm": 8.067500114440918,
      "kl": 0.084716796875,
      "learning_rate": 1e-06,
      "loss": 0.048,
      "num_tokens": 8695428.0,
      "reward": 0.18488597869873047,
      "reward_std": 0.028848692774772644,
      "rewards/bleu_reward_func/mean": 0.18488597869873047,
      "rewards/bleu_reward_func/std": 0.1518058031797409,
      "step": 649
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 429.0,
      "completions/mean_length": 306.09375,
      "completions/mean_terminated_length": 237.45834350585938,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.52,
      "grad_norm": 2.86417818069458,
      "kl": 0.0567626953125,
      "learning_rate": 1e-06,
      "loss": -0.0099,
      "num_tokens": 8708903.0,
      "reward": 0.07543742656707764,
      "reward_std": 0.024170244112610817,
      "rewards/bleu_reward_func/mean": 0.07543742656707764,
      "rewards/bleu_reward_func/std": 0.058858614414930344,
      "step": 650
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 476.0,
      "completions/mean_length": 454.09375,
      "completions/mean_terminated_length": 326.70001220703125,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "epoch": 0.5208,
      "grad_norm": 2.209012985229492,
      "kl": 0.04949951171875,
      "learning_rate": 1e-06,
      "loss": 0.0536,
      "num_tokens": 8726226.0,
      "reward": 0.03396552428603172,
      "reward_std": 0.01698872074484825,
      "rewards/bleu_reward_func/mean": 0.03396552428603172,
      "rewards/bleu_reward_func/std": 0.024311579763889313,
      "step": 651
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 353.0,
      "completions/mean_terminated_length": 257.6000061035156,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.5216,
      "grad_norm": 5.167045593261719,
      "kl": 0.037353515625,
      "learning_rate": 1e-06,
      "loss": 0.0297,
      "num_tokens": 8740538.0,
      "reward": 0.07770118117332458,
      "reward_std": 0.031651660799980164,
      "rewards/bleu_reward_func/mean": 0.07770118117332458,
      "rewards/bleu_reward_func/std": 0.062497008591890335,
      "step": 652
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 466.0,
      "completions/mean_length": 418.25,
      "completions/mean_terminated_length": 354.1052551269531,
      "completions/min_length": 232.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 0.5224,
      "grad_norm": 2.1188549995422363,
      "kl": 0.04437255859375,
      "learning_rate": 1e-06,
      "loss": -0.0311,
      "num_tokens": 8756394.0,
      "reward": 0.07865491509437561,
      "reward_std": 0.03826368600130081,
      "rewards/bleu_reward_func/mean": 0.07865491509437561,
      "rewards/bleu_reward_func/std": 0.06751979142427444,
      "step": 653
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 267.25,
      "completions/mean_terminated_length": 259.3548278808594,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.5232,
      "grad_norm": 2.7834532260894775,
      "kl": 0.045166015625,
      "learning_rate": 1e-06,
      "loss": -0.1165,
      "num_tokens": 8767866.0,
      "reward": 0.12012386322021484,
      "reward_std": 0.05065811797976494,
      "rewards/bleu_reward_func/mean": 0.12012386322021484,
      "rewards/bleu_reward_func/std": 0.08707272261381149,
      "step": 654
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 347.0625,
      "completions/mean_terminated_length": 282.5217590332031,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 0.524,
      "grad_norm": 3.0647759437561035,
      "kl": 0.04437255859375,
      "learning_rate": 1e-06,
      "loss": -0.0115,
      "num_tokens": 8783660.0,
      "reward": 0.09437389671802521,
      "reward_std": 0.03334784880280495,
      "rewards/bleu_reward_func/mean": 0.09437389671802521,
      "rewards/bleu_reward_func/std": 0.07559803873300552,
      "step": 655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 415.96875,
      "completions/mean_terminated_length": 319.9375,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "epoch": 0.5248,
      "grad_norm": 2.300849199295044,
      "kl": 0.0504150390625,
      "learning_rate": 1e-06,
      "loss": 0.0585,
      "num_tokens": 8800083.0,
      "reward": 0.05515716224908829,
      "reward_std": 0.01902184821665287,
      "rewards/bleu_reward_func/mean": 0.05515716224908829,
      "rewards/bleu_reward_func/std": 0.030146554112434387,
      "step": 656
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 291.28125,
      "completions/mean_terminated_length": 291.28125,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "epoch": 0.5256,
      "grad_norm": 2.5877864360809326,
      "kl": 0.041748046875,
      "learning_rate": 1e-06,
      "loss": 0.0736,
      "num_tokens": 8811428.0,
      "reward": 0.11595845222473145,
      "reward_std": 0.06343421339988708,
      "rewards/bleu_reward_func/mean": 0.11595845222473145,
      "rewards/bleu_reward_func/std": 0.1822866052389145,
      "step": 657
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 440.0,
      "completions/mean_length": 308.84375,
      "completions/mean_terminated_length": 216.5,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.5264,
      "grad_norm": 4.225070953369141,
      "kl": 0.042755126953125,
      "learning_rate": 1e-06,
      "loss": 0.0062,
      "num_tokens": 8823455.0,
      "reward": 0.08148862421512604,
      "reward_std": 0.021023821085691452,
      "rewards/bleu_reward_func/mean": 0.08148862421512604,
      "rewards/bleu_reward_func/std": 0.07011328637599945,
      "step": 658
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 470.0,
      "completions/mean_length": 318.59375,
      "completions/mean_terminated_length": 254.125,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "epoch": 0.5272,
      "grad_norm": 2.809264659881592,
      "kl": 0.03753662109375,
      "learning_rate": 1e-06,
      "loss": 0.076,
      "num_tokens": 8835834.0,
      "reward": 0.04945838451385498,
      "reward_std": 0.02451205439865589,
      "rewards/bleu_reward_func/mean": 0.04945838451385498,
      "rewards/bleu_reward_func/std": 0.03474752977490425,
      "step": 659
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 469.0,
      "completions/mean_length": 323.53125,
      "completions/mean_terminated_length": 304.03448486328125,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "epoch": 0.528,
      "grad_norm": 2.119414806365967,
      "kl": 0.034088134765625,
      "learning_rate": 1e-06,
      "loss": 0.0428,
      "num_tokens": 8848443.0,
      "reward": 0.07028196007013321,
      "reward_std": 0.017643148079514503,
      "rewards/bleu_reward_func/mean": 0.07028196007013321,
      "rewards/bleu_reward_func/std": 0.05406918004155159,
      "step": 660
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 448.9375,
      "completions/mean_terminated_length": 328.54547119140625,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "epoch": 0.5288,
      "grad_norm": 2.035613536834717,
      "kl": 0.05206298828125,
      "learning_rate": 1e-06,
      "loss": -0.0222,
      "num_tokens": 8866841.0,
      "reward": 0.05816391110420227,
      "reward_std": 0.016069550067186356,
      "rewards/bleu_reward_func/mean": 0.05816391110420227,
      "rewards/bleu_reward_func/std": 0.020034752786159515,
      "step": 661
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 261.40625,
      "completions/mean_terminated_length": 244.70001220703125,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.5296,
      "grad_norm": 3.209188461303711,
      "kl": 0.043365478515625,
      "learning_rate": 1e-06,
      "loss": -0.0163,
      "num_tokens": 8877422.0,
      "reward": 0.04772093892097473,
      "reward_std": 0.016260413452982903,
      "rewards/bleu_reward_func/mean": 0.04772093892097473,
      "rewards/bleu_reward_func/std": 0.024178825318813324,
      "step": 662
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 310.65625,
      "completions/mean_terminated_length": 289.82757568359375,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "epoch": 0.5304,
      "grad_norm": 2.511366367340088,
      "kl": 0.0345458984375,
      "learning_rate": 1e-06,
      "loss": 0.0756,
      "num_tokens": 8890523.0,
      "reward": 0.09435027837753296,
      "reward_std": 0.029953738674521446,
      "rewards/bleu_reward_func/mean": 0.09435027837753296,
      "rewards/bleu_reward_func/std": 0.07095350325107574,
      "step": 663
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 265.03125,
      "completions/mean_terminated_length": 182.70834350585938,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.5312,
      "grad_norm": 4.535026550292969,
      "kl": 0.050811767578125,
      "learning_rate": 1e-06,
      "loss": -0.0718,
      "num_tokens": 8902308.0,
      "reward": 0.07631168514490128,
      "reward_std": 0.018188592046499252,
      "rewards/bleu_reward_func/mean": 0.07631168514490128,
      "rewards/bleu_reward_func/std": 0.04229210317134857,
      "step": 664
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 223.5,
      "completions/mean_terminated_length": 214.19354248046875,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.532,
      "grad_norm": 3.217503070831299,
      "kl": 0.04931640625,
      "learning_rate": 1e-06,
      "loss": 0.1727,
      "num_tokens": 8911500.0,
      "reward": 0.023189637809991837,
      "reward_std": 0.007045770063996315,
      "rewards/bleu_reward_func/mean": 0.023189637809991837,
      "rewards/bleu_reward_func/std": 0.011228468269109726,
      "step": 665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 475.0,
      "completions/mean_length": 292.59375,
      "completions/mean_terminated_length": 261.25,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.5328,
      "grad_norm": 9.476320266723633,
      "kl": 0.213714599609375,
      "learning_rate": 1e-06,
      "loss": -0.016,
      "num_tokens": 8925455.0,
      "reward": 0.08436296880245209,
      "reward_std": 0.021188655868172646,
      "rewards/bleu_reward_func/mean": 0.08436296880245209,
      "rewards/bleu_reward_func/std": 0.07301143556833267,
      "step": 666
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 245.25,
      "completions/mean_terminated_length": 156.33334350585938,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.5336,
      "grad_norm": 5.552678108215332,
      "kl": 0.04876708984375,
      "learning_rate": 1e-06,
      "loss": 0.0796,
      "num_tokens": 8939039.0,
      "reward": 0.09512823820114136,
      "reward_std": 0.03876760974526405,
      "rewards/bleu_reward_func/mean": 0.09512823820114136,
      "rewards/bleu_reward_func/std": 0.13977199792861938,
      "step": 667
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 383.0,
      "completions/mean_length": 239.75,
      "completions/mean_terminated_length": 163.51998901367188,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.5344,
      "grad_norm": 2.919515371322632,
      "kl": 0.02215576171875,
      "learning_rate": 1e-06,
      "loss": 0.0769,
      "num_tokens": 8949271.0,
      "reward": 0.1288418173789978,
      "reward_std": 0.06042589992284775,
      "rewards/bleu_reward_func/mean": 0.1288418173789978,
      "rewards/bleu_reward_func/std": 0.10018094629049301,
      "step": 668
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 292.5625,
      "completions/mean_terminated_length": 261.21429443359375,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "epoch": 0.5352,
      "grad_norm": 2.719533920288086,
      "kl": 0.03509521484375,
      "learning_rate": 1e-06,
      "loss": -0.029,
      "num_tokens": 8960585.0,
      "reward": 0.11158549785614014,
      "reward_std": 0.025866547599434853,
      "rewards/bleu_reward_func/mean": 0.11158549785614014,
      "rewards/bleu_reward_func/std": 0.06140602380037308,
      "step": 669
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 331.34375,
      "completions/mean_terminated_length": 271.125,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.536,
      "grad_norm": 2.3229901790618896,
      "kl": 0.03875732421875,
      "learning_rate": 1e-06,
      "loss": 0.1602,
      "num_tokens": 8975308.0,
      "reward": 0.07134771347045898,
      "reward_std": 0.013178054243326187,
      "rewards/bleu_reward_func/mean": 0.07134771347045898,
      "rewards/bleu_reward_func/std": 0.049940213561058044,
      "step": 670
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 418.25,
      "completions/mean_terminated_length": 354.1052551269531,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "epoch": 0.5368,
      "grad_norm": 2.3134210109710693,
      "kl": 0.04217529296875,
      "learning_rate": 1e-06,
      "loss": -0.0344,
      "num_tokens": 8992764.0,
      "reward": 0.050522781908512115,
      "reward_std": 0.01361394114792347,
      "rewards/bleu_reward_func/mean": 0.050522781908512115,
      "rewards/bleu_reward_func/std": 0.020486921072006226,
      "step": 671
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 501.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 202.96875,
      "completions/mean_terminated_length": 202.96875,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.5376,
      "grad_norm": 3.8227100372314453,
      "kl": 0.0574951171875,
      "learning_rate": 1e-06,
      "loss": -0.0995,
      "num_tokens": 9001603.0,
      "reward": 0.06377962231636047,
      "reward_std": 0.026187829673290253,
      "rewards/bleu_reward_func/mean": 0.06377962231636047,
      "rewards/bleu_reward_func/std": 0.03809577226638794,
      "step": 672
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 441.0,
      "completions/mean_length": 350.8125,
      "completions/mean_terminated_length": 189.625,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.5384,
      "grad_norm": 4.19647216796875,
      "kl": 0.037567138671875,
      "learning_rate": 1e-06,
      "loss": 0.021,
      "num_tokens": 9017317.0,
      "reward": 0.06122228503227234,
      "reward_std": 0.029229629784822464,
      "rewards/bleu_reward_func/mean": 0.06122228503227234,
      "rewards/bleu_reward_func/std": 0.0862286314368248,
      "step": 673
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 498.0,
      "completions/mean_terminated_length": 400.0,
      "completions/min_length": 312.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 0.5392,
      "grad_norm": 1.9404997825622559,
      "kl": 0.040435791015625,
      "learning_rate": 1e-06,
      "loss": -0.017,
      "num_tokens": 9037933.0,
      "reward": 0.04427298903465271,
      "reward_std": 0.006757338996976614,
      "rewards/bleu_reward_func/mean": 0.04427298903465271,
      "rewards/bleu_reward_func/std": 0.030673207715153694,
      "step": 674
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 406.875,
      "completions/mean_terminated_length": 359.0909118652344,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "epoch": 0.54,
      "grad_norm": 2.2146005630493164,
      "kl": 0.033935546875,
      "learning_rate": 1e-06,
      "loss": -0.0065,
      "num_tokens": 9053473.0,
      "reward": 0.06295045465230942,
      "reward_std": 0.017118435353040695,
      "rewards/bleu_reward_func/mean": 0.06295045465230942,
      "rewards/bleu_reward_func/std": 0.053336694836616516,
      "step": 675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 315.53125,
      "completions/mean_terminated_length": 226.22727966308594,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.5408,
      "grad_norm": 2.9350626468658447,
      "kl": 0.052520751953125,
      "learning_rate": 1e-06,
      "loss": 0.1251,
      "num_tokens": 9066026.0,
      "reward": 0.028412725776433945,
      "reward_std": 0.00589718297123909,
      "rewards/bleu_reward_func/mean": 0.028412725776433945,
      "rewards/bleu_reward_func/std": 0.00858322810381651,
      "step": 676
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 488.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 214.40625,
      "completions/mean_terminated_length": 214.40625,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.5416,
      "grad_norm": 2.6705992221832275,
      "kl": 0.0361328125,
      "learning_rate": 1e-06,
      "loss": -0.0716,
      "num_tokens": 9075087.0,
      "reward": 0.04127844423055649,
      "reward_std": 0.01752633973956108,
      "rewards/bleu_reward_func/mean": 0.04127844423055649,
      "rewards/bleu_reward_func/std": 0.024528201669454575,
      "step": 677
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 424.90625,
      "completions/mean_terminated_length": 348.058837890625,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 0.5424,
      "grad_norm": 2.137740135192871,
      "kl": 0.033782958984375,
      "learning_rate": 1e-06,
      "loss": 0.0089,
      "num_tokens": 9094100.0,
      "reward": 0.10619800537824631,
      "reward_std": 0.02650071680545807,
      "rewards/bleu_reward_func/mean": 0.10619800537824631,
      "rewards/bleu_reward_func/std": 0.11194527894258499,
      "step": 678
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 362.625,
      "completions/mean_terminated_length": 170.57144165039062,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.5432,
      "grad_norm": 4.145079612731934,
      "kl": 0.09063720703125,
      "learning_rate": 1e-06,
      "loss": -0.0814,
      "num_tokens": 9108312.0,
      "reward": 0.03859299048781395,
      "reward_std": 0.013083922676742077,
      "rewards/bleu_reward_func/mean": 0.03859299048781395,
      "rewards/bleu_reward_func/std": 0.027552325278520584,
      "step": 679
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 329.15625,
      "completions/mean_terminated_length": 323.258056640625,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "epoch": 0.544,
      "grad_norm": 2.8421475887298584,
      "kl": 0.03363037109375,
      "learning_rate": 1e-06,
      "loss": -0.0938,
      "num_tokens": 9122117.0,
      "reward": 0.07394321262836456,
      "reward_std": 0.027200117707252502,
      "rewards/bleu_reward_func/mean": 0.07394321262836456,
      "rewards/bleu_reward_func/std": 0.05345241352915764,
      "step": 680
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 356.375,
      "completions/mean_terminated_length": 295.478271484375,
      "completions/min_length": 79.0,
      "completions/min_terminated_length": 79.0,
      "epoch": 0.5448,
      "grad_norm": 2.2800376415252686,
      "kl": 0.03594970703125,
      "learning_rate": 1e-06,
      "loss": -0.0034,
      "num_tokens": 9135601.0,
      "reward": 0.034722380340099335,
      "reward_std": 0.011629019863903522,
      "rewards/bleu_reward_func/mean": 0.034722380340099335,
      "rewards/bleu_reward_func/std": 0.02644946798682213,
      "step": 681
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 483.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 178.71875,
      "completions/mean_terminated_length": 178.71875,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.5456,
      "grad_norm": 3.663381338119507,
      "kl": 0.030303955078125,
      "learning_rate": 1e-06,
      "loss": 0.0411,
      "num_tokens": 9142976.0,
      "reward": 0.08912401646375656,
      "reward_std": 0.03632171079516411,
      "rewards/bleu_reward_func/mean": 0.08912401646375656,
      "rewards/bleu_reward_func/std": 0.05631924048066139,
      "step": 682
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 314.21875,
      "completions/mean_terminated_length": 248.2916717529297,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.5464,
      "grad_norm": 4.255805492401123,
      "kl": 0.0494384765625,
      "learning_rate": 1e-06,
      "loss": -0.0512,
      "num_tokens": 9157071.0,
      "reward": 0.05893013998866081,
      "reward_std": 0.019025936722755432,
      "rewards/bleu_reward_func/mean": 0.05893013998866081,
      "rewards/bleu_reward_func/std": 0.05139394477009773,
      "step": 683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 437.0,
      "completions/mean_length": 243.625,
      "completions/mean_terminated_length": 225.7333526611328,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "epoch": 0.5472,
      "grad_norm": 3.2021656036376953,
      "kl": 0.0478515625,
      "learning_rate": 1e-06,
      "loss": 0.105,
      "num_tokens": 9166795.0,
      "reward": 0.026314986869692802,
      "reward_std": 0.007651232648640871,
      "rewards/bleu_reward_func/mean": 0.026314986869692802,
      "rewards/bleu_reward_func/std": 0.010862961411476135,
      "step": 684
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 389.0,
      "completions/max_terminated_length": 389.0,
      "completions/mean_length": 165.25,
      "completions/mean_terminated_length": 165.25,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.548,
      "grad_norm": 3.6673038005828857,
      "kl": 0.075836181640625,
      "learning_rate": 1e-06,
      "loss": -0.1709,
      "num_tokens": 9174915.0,
      "reward": 0.0813303291797638,
      "reward_std": 0.025265149772167206,
      "rewards/bleu_reward_func/mean": 0.0813303291797638,
      "rewards/bleu_reward_func/std": 0.07939371466636658,
      "step": 685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 455.0,
      "completions/mean_length": 238.96875,
      "completions/mean_terminated_length": 188.40740966796875,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.5488,
      "grad_norm": 5.579142093658447,
      "kl": 0.0555419921875,
      "learning_rate": 1e-06,
      "loss": 0.0005,
      "num_tokens": 9185234.0,
      "reward": 0.05107392370700836,
      "reward_std": 0.02069205418229103,
      "rewards/bleu_reward_func/mean": 0.05107392370700836,
      "rewards/bleu_reward_func/std": 0.047578368335962296,
      "step": 686
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 451.0,
      "completions/mean_length": 406.1875,
      "completions/mean_terminated_length": 251.53846740722656,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "epoch": 0.5496,
      "grad_norm": 2.2841057777404785,
      "kl": 0.0462646484375,
      "learning_rate": 1e-06,
      "loss": -0.0739,
      "num_tokens": 9201024.0,
      "reward": 0.03643956780433655,
      "reward_std": 0.008713691495358944,
      "rewards/bleu_reward_func/mean": 0.03643956780433655,
      "rewards/bleu_reward_func/std": 0.024110866710543633,
      "step": 687
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 270.4375,
      "completions/mean_terminated_length": 235.9285888671875,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.5504,
      "grad_norm": 3.174736738204956,
      "kl": 0.043487548828125,
      "learning_rate": 1e-06,
      "loss": -0.1688,
      "num_tokens": 9212678.0,
      "reward": 0.08469453454017639,
      "reward_std": 0.03412746265530586,
      "rewards/bleu_reward_func/mean": 0.08469453454017639,
      "rewards/bleu_reward_func/std": 0.057441964745521545,
      "step": 688
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 412.0,
      "completions/mean_length": 311.0,
      "completions/mean_terminated_length": 244.0,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "epoch": 0.5512,
      "grad_norm": 2.86037540435791,
      "kl": 0.03607177734375,
      "learning_rate": 1e-06,
      "loss": -0.0022,
      "num_tokens": 9226670.0,
      "reward": 0.10889802128076553,
      "reward_std": 0.0333615243434906,
      "rewards/bleu_reward_func/mean": 0.10889802128076553,
      "rewards/bleu_reward_func/std": 0.11031165719032288,
      "step": 689
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 426.3125,
      "completions/mean_terminated_length": 340.625,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "epoch": 0.552,
      "grad_norm": 2.6019811630249023,
      "kl": 0.0339202880859375,
      "learning_rate": 1e-06,
      "loss": -0.0429,
      "num_tokens": 9244608.0,
      "reward": 0.05243753641843796,
      "reward_std": 0.016431640833616257,
      "rewards/bleu_reward_func/mean": 0.05243753641843796,
      "rewards/bleu_reward_func/std": 0.033674199134111404,
      "step": 690
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 274.71875,
      "completions/mean_terminated_length": 258.9000244140625,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.5528,
      "grad_norm": 2.8404016494750977,
      "kl": 0.0394287109375,
      "learning_rate": 1e-06,
      "loss": -0.0131,
      "num_tokens": 9256231.0,
      "reward": 0.03480883315205574,
      "reward_std": 0.022311776876449585,
      "rewards/bleu_reward_func/mean": 0.03480883315205574,
      "rewards/bleu_reward_func/std": 0.04648389294743538,
      "step": 691
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 464.40625,
      "completions/mean_terminated_length": 410.4666748046875,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "epoch": 0.5536,
      "grad_norm": 1.8518298864364624,
      "kl": 0.03631591796875,
      "learning_rate": 1e-06,
      "loss": 0.0071,
      "num_tokens": 9274868.0,
      "reward": 0.02014119178056717,
      "reward_std": 0.006333409808576107,
      "rewards/bleu_reward_func/mean": 0.02014119178056717,
      "rewards/bleu_reward_func/std": 0.016668220981955528,
      "step": 692
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 466.0,
      "completions/max_terminated_length": 466.0,
      "completions/mean_length": 214.875,
      "completions/mean_terminated_length": 214.875,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.5544,
      "grad_norm": 8.611943244934082,
      "kl": 0.066192626953125,
      "learning_rate": 1e-06,
      "loss": 0.0955,
      "num_tokens": 9285320.0,
      "reward": 0.051611416041851044,
      "reward_std": 0.010077946819365025,
      "rewards/bleu_reward_func/mean": 0.051611416041851044,
      "rewards/bleu_reward_func/std": 0.023184411227703094,
      "step": 693
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 324.46875,
      "completions/mean_terminated_length": 261.9583435058594,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.5552,
      "grad_norm": 14.116987228393555,
      "kl": 0.0401611328125,
      "learning_rate": 1e-06,
      "loss": 0.0856,
      "num_tokens": 9300647.0,
      "reward": 0.11092260479927063,
      "reward_std": 0.034637127071619034,
      "rewards/bleu_reward_func/mean": 0.11092260479927063,
      "rewards/bleu_reward_func/std": 0.056095682084560394,
      "step": 694
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 304.4375,
      "completions/mean_terminated_length": 282.96551513671875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "epoch": 0.556,
      "grad_norm": 2.5600430965423584,
      "kl": 0.04901123046875,
      "learning_rate": 1e-06,
      "loss": -0.0664,
      "num_tokens": 9312669.0,
      "reward": 0.027583984658122063,
      "reward_std": 0.00833278801292181,
      "rewards/bleu_reward_func/mean": 0.027583984658122063,
      "rewards/bleu_reward_func/std": 0.011630790308117867,
      "step": 695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 339.03125,
      "completions/mean_terminated_length": 235.25,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.5568,
      "grad_norm": 3.3769948482513428,
      "kl": 0.05389404296875,
      "learning_rate": 1e-06,
      "loss": -0.0098,
      "num_tokens": 9327814.0,
      "reward": 0.028040939942002296,
      "reward_std": 0.010371413081884384,
      "rewards/bleu_reward_func/mean": 0.028040939942002296,
      "rewards/bleu_reward_func/std": 0.01842903159558773,
      "step": 696
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 472.0,
      "completions/mean_length": 324.40625,
      "completions/mean_terminated_length": 261.875,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "epoch": 0.5576,
      "grad_norm": 2.9584405422210693,
      "kl": 0.051727294921875,
      "learning_rate": 1e-06,
      "loss": -0.027,
      "num_tokens": 9341091.0,
      "reward": 0.0529029443860054,
      "reward_std": 0.015040645375847816,
      "rewards/bleu_reward_func/mean": 0.0529029443860054,
      "rewards/bleu_reward_func/std": 0.03654506802558899,
      "step": 697
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 470.0,
      "completions/mean_length": 378.75,
      "completions/mean_terminated_length": 334.3333435058594,
      "completions/min_length": 234.0,
      "completions/min_terminated_length": 234.0,
      "epoch": 0.5584,
      "grad_norm": 2.1786551475524902,
      "kl": 0.037689208984375,
      "learning_rate": 1e-06,
      "loss": -0.032,
      "num_tokens": 9356283.0,
      "reward": 0.03368465229868889,
      "reward_std": 0.009372582659125328,
      "rewards/bleu_reward_func/mean": 0.03368465229868889,
      "rewards/bleu_reward_func/std": 0.026379503309726715,
      "step": 698
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 216.0,
      "completions/mean_length": 238.71875,
      "completions/mean_terminated_length": 131.78260803222656,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.5592,
      "grad_norm": 4.376157760620117,
      "kl": 0.07623291015625,
      "learning_rate": 1e-06,
      "loss": 0.028,
      "num_tokens": 9367506.0,
      "reward": 0.05952916666865349,
      "reward_std": 0.023435667157173157,
      "rewards/bleu_reward_func/mean": 0.05952916666865349,
      "rewards/bleu_reward_func/std": 0.04434419795870781,
      "step": 699
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 359.0,
      "completions/mean_length": 274.0,
      "completions/mean_terminated_length": 194.6666717529297,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 73.0,
      "epoch": 0.56,
      "grad_norm": 2.882861852645874,
      "kl": 0.053955078125,
      "learning_rate": 1e-06,
      "loss": 0.0267,
      "num_tokens": 9379354.0,
      "reward": 0.04700922966003418,
      "reward_std": 0.017730262130498886,
      "rewards/bleu_reward_func/mean": 0.04700922966003418,
      "rewards/bleu_reward_func/std": 0.04125557094812393,
      "step": 700
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 447.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 155.53125,
      "completions/mean_terminated_length": 155.53125,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.5608,
      "grad_norm": 3.9251868724823,
      "kl": 0.0543212890625,
      "learning_rate": 1e-06,
      "loss": -0.0544,
      "num_tokens": 9386403.0,
      "reward": 0.04936821013689041,
      "reward_std": 0.03505264222621918,
      "rewards/bleu_reward_func/mean": 0.04936821013689041,
      "rewards/bleu_reward_func/std": 0.03844968229532242,
      "step": 701
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 399.3125,
      "completions/mean_terminated_length": 348.0909118652344,
      "completions/min_length": 232.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 0.5616,
      "grad_norm": 2.3044822216033936,
      "kl": 0.046051025390625,
      "learning_rate": 1e-06,
      "loss": 0.0413,
      "num_tokens": 9404461.0,
      "reward": 0.05913422256708145,
      "reward_std": 0.014526767656207085,
      "rewards/bleu_reward_func/mean": 0.05913422256708145,
      "rewards/bleu_reward_func/std": 0.02571999840438366,
      "step": 702
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 288.0,
      "completions/mean_length": 227.96875,
      "completions/mean_terminated_length": 133.2916717529297,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.5624,
      "grad_norm": 3.9835569858551025,
      "kl": 0.08758544921875,
      "learning_rate": 1e-06,
      "loss": -0.0113,
      "num_tokens": 9414196.0,
      "reward": 0.04630535468459129,
      "reward_std": 0.02081681415438652,
      "rewards/bleu_reward_func/mean": 0.04630535468459129,
      "rewards/bleu_reward_func/std": 0.029882358387112617,
      "step": 703
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 246.25,
      "completions/mean_terminated_length": 237.6774139404297,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.5632,
      "grad_norm": 3.185335874557495,
      "kl": 0.029571533203125,
      "learning_rate": 1e-06,
      "loss": 0.0604,
      "num_tokens": 9424820.0,
      "reward": 0.08910296112298965,
      "reward_std": 0.030036643147468567,
      "rewards/bleu_reward_func/mean": 0.08910296112298965,
      "rewards/bleu_reward_func/std": 0.04086713492870331,
      "step": 704
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 408.0,
      "completions/max_terminated_length": 408.0,
      "completions/mean_length": 161.34375,
      "completions/mean_terminated_length": 161.34375,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.564,
      "grad_norm": 3.8254852294921875,
      "kl": 0.072540283203125,
      "learning_rate": 1e-06,
      "loss": -0.1746,
      "num_tokens": 9432663.0,
      "reward": 0.027168624103069305,
      "reward_std": 0.008981114253401756,
      "rewards/bleu_reward_func/mean": 0.027168624103069305,
      "rewards/bleu_reward_func/std": 0.02017930895090103,
      "step": 705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 466.0,
      "completions/mean_length": 292.75,
      "completions/mean_terminated_length": 242.1538543701172,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.5648,
      "grad_norm": 2.8090875148773193,
      "kl": 0.036773681640625,
      "learning_rate": 1e-06,
      "loss": 0.0482,
      "num_tokens": 9444167.0,
      "reward": 0.06258943676948547,
      "reward_std": 0.02587122470140457,
      "rewards/bleu_reward_func/mean": 0.06258943676948547,
      "rewards/bleu_reward_func/std": 0.07109640538692474,
      "step": 706
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 341.5,
      "completions/mean_terminated_length": 293.7599792480469,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.5656,
      "grad_norm": 2.612130880355835,
      "kl": 0.06427001953125,
      "learning_rate": 1e-06,
      "loss": -0.0579,
      "num_tokens": 9457983.0,
      "reward": 0.06597445905208588,
      "reward_std": 0.028306953608989716,
      "rewards/bleu_reward_func/mean": 0.06597445905208588,
      "rewards/bleu_reward_func/std": 0.05735038220882416,
      "step": 707
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 479.0,
      "completions/mean_length": 325.1875,
      "completions/mean_terminated_length": 252.0869598388672,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.5664,
      "grad_norm": 2.9376416206359863,
      "kl": 0.040283203125,
      "learning_rate": 1e-06,
      "loss": 0.0036,
      "num_tokens": 9470925.0,
      "reward": 0.08510507643222809,
      "reward_std": 0.024027425795793533,
      "rewards/bleu_reward_func/mean": 0.08510507643222809,
      "rewards/bleu_reward_func/std": 0.06637418270111084,
      "step": 708
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 328.4375,
      "completions/mean_terminated_length": 302.21429443359375,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "epoch": 0.5672,
      "grad_norm": 2.756030321121216,
      "kl": 0.034027099609375,
      "learning_rate": 1e-06,
      "loss": -0.0527,
      "num_tokens": 9483395.0,
      "reward": 0.05016753822565079,
      "reward_std": 0.020536717027425766,
      "rewards/bleu_reward_func/mean": 0.05016753822565079,
      "rewards/bleu_reward_func/std": 0.04033496230840683,
      "step": 709
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 428.0,
      "completions/mean_length": 307.1875,
      "completions/mean_terminated_length": 184.3000030517578,
      "completions/min_length": 82.0,
      "completions/min_terminated_length": 82.0,
      "epoch": 0.568,
      "grad_norm": 2.9602859020233154,
      "kl": 0.07080078125,
      "learning_rate": 1e-06,
      "loss": 0.0387,
      "num_tokens": 9495689.0,
      "reward": 0.03180449828505516,
      "reward_std": 0.02256343513727188,
      "rewards/bleu_reward_func/mean": 0.03180449828505516,
      "rewards/bleu_reward_func/std": 0.02739291824400425,
      "step": 710
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 282.15625,
      "completions/mean_terminated_length": 229.11538696289062,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "epoch": 0.5688,
      "grad_norm": 2.9386565685272217,
      "kl": 0.0604248046875,
      "learning_rate": 1e-06,
      "loss": -0.0003,
      "num_tokens": 9509142.0,
      "reward": 0.026789426803588867,
      "reward_std": 0.007120601832866669,
      "rewards/bleu_reward_func/mean": 0.026789426803588867,
      "rewards/bleu_reward_func/std": 0.01533250231295824,
      "step": 711
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 445.1875,
      "completions/mean_terminated_length": 410.19049072265625,
      "completions/min_length": 308.0,
      "completions/min_terminated_length": 308.0,
      "epoch": 0.5696,
      "grad_norm": 2.129762887954712,
      "kl": 0.0325927734375,
      "learning_rate": 1e-06,
      "loss": -0.0566,
      "num_tokens": 9525972.0,
      "reward": 0.0978296622633934,
      "reward_std": 0.022149382159113884,
      "rewards/bleu_reward_func/mean": 0.0978296622633934,
      "rewards/bleu_reward_func/std": 0.08168322592973709,
      "step": 712
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 366.1875,
      "completions/mean_terminated_length": 317.5833435058594,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "epoch": 0.5704,
      "grad_norm": 2.2154335975646973,
      "kl": 0.04345703125,
      "learning_rate": 1e-06,
      "loss": 0.0888,
      "num_tokens": 9541162.0,
      "reward": 0.03301853686571121,
      "reward_std": 0.018747247755527496,
      "rewards/bleu_reward_func/mean": 0.03301853686571121,
      "rewards/bleu_reward_func/std": 0.03410644084215164,
      "step": 713
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 332.90625,
      "completions/mean_terminated_length": 225.4499969482422,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "epoch": 0.5712,
      "grad_norm": 3.655517101287842,
      "kl": 0.034454345703125,
      "learning_rate": 1e-06,
      "loss": -0.0415,
      "num_tokens": 9554111.0,
      "reward": 0.06881336867809296,
      "reward_std": 0.02799813821911812,
      "rewards/bleu_reward_func/mean": 0.06881336867809296,
      "rewards/bleu_reward_func/std": 0.046132639050483704,
      "step": 714
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 375.0,
      "completions/mean_length": 323.53125,
      "completions/mean_terminated_length": 194.57894897460938,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.572,
      "grad_norm": 2.6926376819610596,
      "kl": 0.0515899658203125,
      "learning_rate": 1e-06,
      "loss": 0.0721,
      "num_tokens": 9569472.0,
      "reward": 0.03615172579884529,
      "reward_std": 0.010936561971902847,
      "rewards/bleu_reward_func/mean": 0.03615172579884529,
      "rewards/bleu_reward_func/std": 0.02190208248794079,
      "step": 715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 469.0,
      "completions/mean_length": 369.78125,
      "completions/mean_terminated_length": 259.1666564941406,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "epoch": 0.5728,
      "grad_norm": 2.659458875656128,
      "kl": 0.05145263671875,
      "learning_rate": 1e-06,
      "loss": 0.0228,
      "num_tokens": 9586745.0,
      "reward": 0.04501129686832428,
      "reward_std": 0.022404177114367485,
      "rewards/bleu_reward_func/mean": 0.04501129686832428,
      "rewards/bleu_reward_func/std": 0.03303966298699379,
      "step": 716
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 301.5,
      "completions/mean_terminated_length": 262.5185241699219,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "epoch": 0.5736,
      "grad_norm": 2.87153697013855,
      "kl": 0.045135498046875,
      "learning_rate": 1e-06,
      "loss": -0.0035,
      "num_tokens": 9598977.0,
      "reward": 0.08759818971157074,
      "reward_std": 0.016007091850042343,
      "rewards/bleu_reward_func/mean": 0.08759818971157074,
      "rewards/bleu_reward_func/std": 0.058541323989629745,
      "step": 717
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 465.0,
      "completions/mean_length": 376.84375,
      "completions/mean_terminated_length": 331.79168701171875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "epoch": 0.5744,
      "grad_norm": 2.2236804962158203,
      "kl": 0.04248046875,
      "learning_rate": 1e-06,
      "loss": -0.0317,
      "num_tokens": 9613732.0,
      "reward": 0.027184750884771347,
      "reward_std": 0.008963186293840408,
      "rewards/bleu_reward_func/mean": 0.027184750884771347,
      "rewards/bleu_reward_func/std": 0.012044839560985565,
      "step": 718
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 305.0,
      "completions/mean_length": 404.875,
      "completions/mean_terminated_length": 131.11111450195312,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.5752,
      "grad_norm": 3.107280731201172,
      "kl": 0.05364990234375,
      "learning_rate": 1e-06,
      "loss": -0.1847,
      "num_tokens": 9632200.0,
      "reward": 0.022721879184246063,
      "reward_std": 0.015824276953935623,
      "rewards/bleu_reward_func/mean": 0.022721879184246063,
      "rewards/bleu_reward_func/std": 0.026122624054551125,
      "step": 719
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 464.0,
      "completions/mean_length": 302.71875,
      "completions/mean_terminated_length": 288.7666931152344,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.576,
      "grad_norm": 2.6198558807373047,
      "kl": 0.03948974609375,
      "learning_rate": 1e-06,
      "loss": 0.1811,
      "num_tokens": 9644919.0,
      "reward": 0.028920229524374008,
      "reward_std": 0.009729383513331413,
      "rewards/bleu_reward_func/mean": 0.028920229524374008,
      "rewards/bleu_reward_func/std": 0.012061752378940582,
      "step": 720
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 413.0,
      "completions/mean_length": 334.40625,
      "completions/mean_terminated_length": 275.2083435058594,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "epoch": 0.5768,
      "grad_norm": 2.3346035480499268,
      "kl": 0.03521728515625,
      "learning_rate": 1e-06,
      "loss": -0.0052,
      "num_tokens": 9657428.0,
      "reward": 0.033784326165914536,
      "reward_std": 0.012217823415994644,
      "rewards/bleu_reward_func/mean": 0.033784326165914536,
      "rewards/bleu_reward_func/std": 0.017847878858447075,
      "step": 721
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 392.6875,
      "completions/mean_terminated_length": 352.91668701171875,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.5776,
      "grad_norm": 2.2907447814941406,
      "kl": 0.0361328125,
      "learning_rate": 1e-06,
      "loss": -0.0447,
      "num_tokens": 9673434.0,
      "reward": 0.05538846179842949,
      "reward_std": 0.027156081050634384,
      "rewards/bleu_reward_func/mean": 0.05538846179842949,
      "rewards/bleu_reward_func/std": 0.0433816984295845,
      "step": 722
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 323.65625,
      "completions/mean_terminated_length": 304.17242431640625,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "epoch": 0.5784,
      "grad_norm": 2.55709171295166,
      "kl": 0.039825439453125,
      "learning_rate": 1e-06,
      "loss": -0.0556,
      "num_tokens": 9685775.0,
      "reward": 0.08060881495475769,
      "reward_std": 0.02005528286099434,
      "rewards/bleu_reward_func/mean": 0.08060881495475769,
      "rewards/bleu_reward_func/std": 0.07852939516305923,
      "step": 723
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 381.1875,
      "completions/mean_terminated_length": 351.0,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "epoch": 0.5792,
      "grad_norm": 2.155306816101074,
      "kl": 0.03192138671875,
      "learning_rate": 1e-06,
      "loss": 0.0164,
      "num_tokens": 9700613.0,
      "reward": 0.1121407151222229,
      "reward_std": 0.025965237990021706,
      "rewards/bleu_reward_func/mean": 0.1121407151222229,
      "rewards/bleu_reward_func/std": 0.14134716987609863,
      "step": 724
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 324.78125,
      "completions/mean_terminated_length": 262.375,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "epoch": 0.58,
      "grad_norm": 2.3479228019714355,
      "kl": 0.04144287109375,
      "learning_rate": 1e-06,
      "loss": 0.0594,
      "num_tokens": 9713542.0,
      "reward": 0.025968845933675766,
      "reward_std": 0.008134625852108002,
      "rewards/bleu_reward_func/mean": 0.025968845933675766,
      "rewards/bleu_reward_func/std": 0.01379316858947277,
      "step": 725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 426.0,
      "completions/mean_length": 347.03125,
      "completions/mean_terminated_length": 234.15789794921875,
      "completions/min_length": 167.0,
      "completions/min_terminated_length": 167.0,
      "epoch": 0.5808,
      "grad_norm": 2.790693759918213,
      "kl": 0.0469970703125,
      "learning_rate": 1e-06,
      "loss": -0.0013,
      "num_tokens": 9727183.0,
      "reward": 0.07316627353429794,
      "reward_std": 0.02672586776316166,
      "rewards/bleu_reward_func/mean": 0.07316627353429794,
      "rewards/bleu_reward_func/std": 0.04870554059743881,
      "step": 726
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 473.0,
      "completions/mean_length": 413.65625,
      "completions/mean_terminated_length": 269.923095703125,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "epoch": 0.5816,
      "grad_norm": 2.371492624282837,
      "kl": 0.038482666015625,
      "learning_rate": 1e-06,
      "loss": 0.0874,
      "num_tokens": 9743332.0,
      "reward": 0.028866440057754517,
      "reward_std": 0.009991827420890331,
      "rewards/bleu_reward_func/mean": 0.028866440057754517,
      "rewards/bleu_reward_func/std": 0.017279163002967834,
      "step": 727
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 20.0,
      "completions/mean_length": 389.0,
      "completions/mean_terminated_length": 20.0,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.5824,
      "grad_norm": 1.6963756084442139,
      "kl": 0.03743934631347656,
      "learning_rate": 1e-06,
      "loss": 0.0015,
      "num_tokens": 9760908.0,
      "reward": 0.26194876432418823,
      "reward_std": 0.0028098882175982,
      "rewards/bleu_reward_func/mean": 0.26194876432418823,
      "rewards/bleu_reward_func/std": 0.43297266960144043,
      "step": 728
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 479.0,
      "completions/mean_length": 372.53125,
      "completions/mean_terminated_length": 317.9565124511719,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.5832,
      "grad_norm": 2.238243579864502,
      "kl": 0.041015625,
      "learning_rate": 1e-06,
      "loss": -0.0603,
      "num_tokens": 9775317.0,
      "reward": 0.05312762036919594,
      "reward_std": 0.021938100457191467,
      "rewards/bleu_reward_func/mean": 0.05312762036919594,
      "rewards/bleu_reward_func/std": 0.026401638984680176,
      "step": 729
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 463.0,
      "completions/mean_length": 273.78125,
      "completions/mean_terminated_length": 249.13792419433594,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 144.0,
      "epoch": 0.584,
      "grad_norm": 2.6824791431427,
      "kl": 0.040802001953125,
      "learning_rate": 1e-06,
      "loss": 0.1689,
      "num_tokens": 9785942.0,
      "reward": 0.043269768357276917,
      "reward_std": 0.01920248009264469,
      "rewards/bleu_reward_func/mean": 0.043269768357276917,
      "rewards/bleu_reward_func/std": 0.036198996007442474,
      "step": 730
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 391.96875,
      "completions/mean_terminated_length": 369.7407531738281,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "epoch": 0.5848,
      "grad_norm": 2.225390672683716,
      "kl": 0.030548095703125,
      "learning_rate": 1e-06,
      "loss": -0.0489,
      "num_tokens": 9801301.0,
      "reward": 0.043691135942935944,
      "reward_std": 0.016106728464365005,
      "rewards/bleu_reward_func/mean": 0.043691135942935944,
      "rewards/bleu_reward_func/std": 0.03660671412944794,
      "step": 731
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 452.0,
      "completions/mean_length": 316.09375,
      "completions/mean_terminated_length": 239.43478393554688,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.5856,
      "grad_norm": 3.061321258544922,
      "kl": 0.066162109375,
      "learning_rate": 1e-06,
      "loss": 0.0924,
      "num_tokens": 9815992.0,
      "reward": 0.04043514281511307,
      "reward_std": 0.02053326927125454,
      "rewards/bleu_reward_func/mean": 0.04043514281511307,
      "rewards/bleu_reward_func/std": 0.04115479812026024,
      "step": 732
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 383.8125,
      "completions/mean_terminated_length": 375.2666931152344,
      "completions/min_length": 269.0,
      "completions/min_terminated_length": 269.0,
      "epoch": 0.5864,
      "grad_norm": 1.9717614650726318,
      "kl": 0.0321044921875,
      "learning_rate": 1e-06,
      "loss": 0.0407,
      "num_tokens": 9830522.0,
      "reward": 0.026746738702058792,
      "reward_std": 0.006414106115698814,
      "rewards/bleu_reward_func/mean": 0.026746738702058792,
      "rewards/bleu_reward_func/std": 0.0077305627055466175,
      "step": 733
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 331.0,
      "completions/mean_terminated_length": 248.72727966308594,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.5872,
      "grad_norm": 2.053102970123291,
      "kl": 0.0360870361328125,
      "learning_rate": 1e-06,
      "loss": 0.0197,
      "num_tokens": 9843674.0,
      "reward": 0.05009941756725311,
      "reward_std": 0.027085162699222565,
      "rewards/bleu_reward_func/mean": 0.05009941756725311,
      "rewards/bleu_reward_func/std": 0.03603508323431015,
      "step": 734
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 456.0,
      "completions/mean_length": 378.34375,
      "completions/mean_terminated_length": 333.79168701171875,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.588,
      "grad_norm": 2.1135923862457275,
      "kl": 0.0374755859375,
      "learning_rate": 1e-06,
      "loss": 0.042,
      "num_tokens": 9858789.0,
      "reward": 0.025262653827667236,
      "reward_std": 0.009363568387925625,
      "rewards/bleu_reward_func/mean": 0.025262653827667236,
      "rewards/bleu_reward_func/std": 0.015334555879235268,
      "step": 735
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 290.09375,
      "completions/mean_terminated_length": 249.0,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.5888,
      "grad_norm": 3.513441562652588,
      "kl": 0.0484619140625,
      "learning_rate": 1e-06,
      "loss": 0.2233,
      "num_tokens": 9870368.0,
      "reward": 0.031309109181165695,
      "reward_std": 0.012565305456519127,
      "rewards/bleu_reward_func/mean": 0.031309109181165695,
      "rewards/bleu_reward_func/std": 0.019024599343538284,
      "step": 736
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 430.0,
      "completions/max_terminated_length": 430.0,
      "completions/mean_length": 203.96875,
      "completions/mean_terminated_length": 203.96875,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.5896,
      "grad_norm": 3.6366395950317383,
      "kl": 0.043121337890625,
      "learning_rate": 1e-06,
      "loss": -0.1022,
      "num_tokens": 9878959.0,
      "reward": 0.10009264945983887,
      "reward_std": 0.04424141347408295,
      "rewards/bleu_reward_func/mean": 0.10009264945983887,
      "rewards/bleu_reward_func/std": 0.07444142550230026,
      "step": 737
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 432.0,
      "completions/mean_length": 298.0625,
      "completions/mean_terminated_length": 200.8181915283203,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.5904,
      "grad_norm": 3.4232091903686523,
      "kl": 0.05059814453125,
      "learning_rate": 1e-06,
      "loss": -0.007,
      "num_tokens": 9893561.0,
      "reward": 0.039665337651968,
      "reward_std": 0.0123225636780262,
      "rewards/bleu_reward_func/mean": 0.039665337651968,
      "rewards/bleu_reward_func/std": 0.024130748584866524,
      "step": 738
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 336.125,
      "completions/mean_terminated_length": 244.0,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.5912,
      "grad_norm": 2.8178460597991943,
      "kl": 0.047271728515625,
      "learning_rate": 1e-06,
      "loss": -0.1451,
      "num_tokens": 9909901.0,
      "reward": 0.04546702653169632,
      "reward_std": 0.021167172119021416,
      "rewards/bleu_reward_func/mean": 0.04546702653169632,
      "rewards/bleu_reward_func/std": 0.04271375387907028,
      "step": 739
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 451.0,
      "completions/mean_length": 292.9375,
      "completions/mean_terminated_length": 285.8709716796875,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.592,
      "grad_norm": 2.625126600265503,
      "kl": 0.0390625,
      "learning_rate": 1e-06,
      "loss": -0.2049,
      "num_tokens": 9922867.0,
      "reward": 0.09222639352083206,
      "reward_std": 0.03261449933052063,
      "rewards/bleu_reward_func/mean": 0.09222639352083206,
      "rewards/bleu_reward_func/std": 0.09164327383041382,
      "step": 740
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 249.96875,
      "completions/mean_terminated_length": 222.862060546875,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.5928,
      "grad_norm": 3.74029803276062,
      "kl": 0.059112548828125,
      "learning_rate": 1e-06,
      "loss": -0.1066,
      "num_tokens": 9935442.0,
      "reward": 0.252871572971344,
      "reward_std": 0.0454796738922596,
      "rewards/bleu_reward_func/mean": 0.252871572971344,
      "rewards/bleu_reward_func/std": 0.29931873083114624,
      "step": 741
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 442.03125,
      "completions/mean_terminated_length": 352.0714416503906,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "epoch": 0.5936,
      "grad_norm": 1.9116039276123047,
      "kl": 0.04791259765625,
      "learning_rate": 1e-06,
      "loss": -0.012,
      "num_tokens": 9953635.0,
      "reward": 0.03428558260202408,
      "reward_std": 0.0095596294850111,
      "rewards/bleu_reward_func/mean": 0.03428558260202408,
      "rewards/bleu_reward_func/std": 0.03276081383228302,
      "step": 742
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 285.71875,
      "completions/mean_terminated_length": 243.8148193359375,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.5944,
      "grad_norm": 3.1582913398742676,
      "kl": 0.052490234375,
      "learning_rate": 1e-06,
      "loss": 0.0456,
      "num_tokens": 9965906.0,
      "reward": 0.048588261008262634,
      "reward_std": 0.017030086368322372,
      "rewards/bleu_reward_func/mean": 0.048588261008262634,
      "rewards/bleu_reward_func/std": 0.027793744578957558,
      "step": 743
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 349.625,
      "completions/mean_terminated_length": 264.5714416503906,
      "completions/min_length": 81.0,
      "completions/min_terminated_length": 81.0,
      "epoch": 0.5952,
      "grad_norm": 2.2140707969665527,
      "kl": 0.041290283203125,
      "learning_rate": 1e-06,
      "loss": 0.0686,
      "num_tokens": 9982022.0,
      "reward": 0.045479342341423035,
      "reward_std": 0.021971603855490685,
      "rewards/bleu_reward_func/mean": 0.045479342341423035,
      "rewards/bleu_reward_func/std": 0.037209052592515945,
      "step": 744
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 487.0,
      "completions/mean_length": 265.53125,
      "completions/mean_terminated_length": 257.58062744140625,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.596,
      "grad_norm": 5.462935447692871,
      "kl": 0.0552978515625,
      "learning_rate": 1e-06,
      "loss": -0.0597,
      "num_tokens": 9994487.0,
      "reward": 0.03378114849328995,
      "reward_std": 0.012287897989153862,
      "rewards/bleu_reward_func/mean": 0.03378114849328995,
      "rewards/bleu_reward_func/std": 0.01468950230628252,
      "step": 745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 396.78125,
      "completions/mean_terminated_length": 307.1666564941406,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.5968,
      "grad_norm": 2.1518917083740234,
      "kl": 0.041534423828125,
      "learning_rate": 1e-06,
      "loss": 0.0918,
      "num_tokens": 10009392.0,
      "reward": 0.025878749787807465,
      "reward_std": 0.015285806730389595,
      "rewards/bleu_reward_func/mean": 0.025878749787807465,
      "rewards/bleu_reward_func/std": 0.031384509056806564,
      "step": 746
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 257.59375,
      "completions/mean_terminated_length": 240.6333465576172,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.5976,
      "grad_norm": 3.0447263717651367,
      "kl": 0.0303955078125,
      "learning_rate": 1e-06,
      "loss": -0.1925,
      "num_tokens": 10021539.0,
      "reward": 0.10353910177946091,
      "reward_std": 0.06533479690551758,
      "rewards/bleu_reward_func/mean": 0.10353910177946091,
      "rewards/bleu_reward_func/std": 0.12369221448898315,
      "step": 747
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 254.5,
      "completions/mean_terminated_length": 237.33334350585938,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.5984,
      "grad_norm": 2.9500083923339844,
      "kl": 0.03375244140625,
      "learning_rate": 1e-06,
      "loss": 0.0413,
      "num_tokens": 10032043.0,
      "reward": 0.09129080176353455,
      "reward_std": 0.04000641033053398,
      "rewards/bleu_reward_func/mean": 0.09129080176353455,
      "rewards/bleu_reward_func/std": 0.06342270225286484,
      "step": 748
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 416.0,
      "completions/max_terminated_length": 416.0,
      "completions/mean_length": 180.875,
      "completions/mean_terminated_length": 180.875,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "epoch": 0.5992,
      "grad_norm": 3.592421293258667,
      "kl": 0.0577392578125,
      "learning_rate": 1e-06,
      "loss": -0.0851,
      "num_tokens": 10039823.0,
      "reward": 0.038300834596157074,
      "reward_std": 0.012440194375813007,
      "rewards/bleu_reward_func/mean": 0.038300834596157074,
      "rewards/bleu_reward_func/std": 0.021254317834973335,
      "step": 749
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 387.0,
      "completions/mean_length": 280.125,
      "completions/mean_terminated_length": 226.61538696289062,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.6,
      "grad_norm": 2.9240732192993164,
      "kl": 0.046600341796875,
      "learning_rate": 1e-06,
      "loss": 0.0647,
      "num_tokens": 10051187.0,
      "reward": 0.05779760330915451,
      "reward_std": 0.014689221978187561,
      "rewards/bleu_reward_func/mean": 0.05779760330915451,
      "rewards/bleu_reward_func/std": 0.04038412868976593,
      "step": 750
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 463.0,
      "completions/mean_length": 299.40625,
      "completions/mean_terminated_length": 250.34616088867188,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.6008,
      "grad_norm": 2.695838689804077,
      "kl": 0.03765869140625,
      "learning_rate": 1e-06,
      "loss": 0.0816,
      "num_tokens": 10064256.0,
      "reward": 0.056154537945985794,
      "reward_std": 0.04283731430768967,
      "rewards/bleu_reward_func/mean": 0.056154537945985794,
      "rewards/bleu_reward_func/std": 0.06427828222513199,
      "step": 751
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 235.4375,
      "completions/mean_terminated_length": 195.92857360839844,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "epoch": 0.6016,
      "grad_norm": 3.5030364990234375,
      "kl": 0.054962158203125,
      "learning_rate": 1e-06,
      "loss": 0.1026,
      "num_tokens": 10074310.0,
      "reward": 0.0592612624168396,
      "reward_std": 0.038583509624004364,
      "rewards/bleu_reward_func/mean": 0.0592612624168396,
      "rewards/bleu_reward_func/std": 0.06044392287731171,
      "step": 752
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 403.0,
      "completions/max_terminated_length": 403.0,
      "completions/mean_length": 177.59375,
      "completions/mean_terminated_length": 177.59375,
      "completions/min_length": 81.0,
      "completions/min_terminated_length": 81.0,
      "epoch": 0.6024,
      "grad_norm": 4.249257564544678,
      "kl": 0.03570556640625,
      "learning_rate": 1e-06,
      "loss": 0.0027,
      "num_tokens": 10082169.0,
      "reward": 0.06106291711330414,
      "reward_std": 0.03034021332859993,
      "rewards/bleu_reward_func/mean": 0.06106291711330414,
      "rewards/bleu_reward_func/std": 0.055352307856082916,
      "step": 753
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 423.0,
      "completions/mean_length": 305.40625,
      "completions/mean_terminated_length": 144.72222900390625,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.6032,
      "grad_norm": 3.6615188121795654,
      "kl": 0.04107666015625,
      "learning_rate": 1e-06,
      "loss": -0.0413,
      "num_tokens": 10096142.0,
      "reward": 0.07839120924472809,
      "reward_std": 0.027994198724627495,
      "rewards/bleu_reward_func/mean": 0.07839120924472809,
      "rewards/bleu_reward_func/std": 0.08647292107343674,
      "step": 754
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 413.0,
      "completions/mean_length": 247.5625,
      "completions/mean_terminated_length": 173.51998901367188,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.604,
      "grad_norm": 3.6631929874420166,
      "kl": 0.05218505859375,
      "learning_rate": 1e-06,
      "loss": -0.1032,
      "num_tokens": 10106736.0,
      "reward": 0.03967122733592987,
      "reward_std": 0.01704089716076851,
      "rewards/bleu_reward_func/mean": 0.03967122733592987,
      "rewards/bleu_reward_func/std": 0.024564094841480255,
      "step": 755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 461.0,
      "completions/mean_length": 382.75,
      "completions/mean_terminated_length": 332.1739196777344,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "epoch": 0.6048,
      "grad_norm": 2.315969944000244,
      "kl": 0.0330810546875,
      "learning_rate": 1e-06,
      "loss": 0.0998,
      "num_tokens": 10123032.0,
      "reward": 0.039958953857421875,
      "reward_std": 0.010448317043483257,
      "rewards/bleu_reward_func/mean": 0.039958953857421875,
      "rewards/bleu_reward_func/std": 0.03418637439608574,
      "step": 756
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 299.09375,
      "completions/mean_terminated_length": 215.78260803222656,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.6056,
      "grad_norm": 2.656048059463501,
      "kl": 0.02557373046875,
      "learning_rate": 1e-06,
      "loss": -0.0165,
      "num_tokens": 10136523.0,
      "reward": 0.16351552307605743,
      "reward_std": 0.1012059897184372,
      "rewards/bleu_reward_func/mean": 0.16351552307605743,
      "rewards/bleu_reward_func/std": 0.3160014748573303,
      "step": 757
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 439.0,
      "completions/mean_length": 184.5,
      "completions/mean_terminated_length": 162.6666717529297,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.6064,
      "grad_norm": 6.093653202056885,
      "kl": 0.0682373046875,
      "learning_rate": 1e-06,
      "loss": 0.0402,
      "num_tokens": 10144755.0,
      "reward": 0.04594259709119797,
      "reward_std": 0.014256558381021023,
      "rewards/bleu_reward_func/mean": 0.04594259709119797,
      "rewards/bleu_reward_func/std": 0.021436382085084915,
      "step": 758
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 388.03125,
      "completions/mean_terminated_length": 303.2105407714844,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "epoch": 0.6072,
      "grad_norm": 2.354794979095459,
      "kl": 0.0377197265625,
      "learning_rate": 1e-06,
      "loss": -0.0584,
      "num_tokens": 10162964.0,
      "reward": 0.08822646737098694,
      "reward_std": 0.03655345365405083,
      "rewards/bleu_reward_func/mean": 0.08822646737098694,
      "rewards/bleu_reward_func/std": 0.04937893897294998,
      "step": 759
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 265.1875,
      "completions/mean_terminated_length": 219.48147583007812,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "epoch": 0.608,
      "grad_norm": 3.1614513397216797,
      "kl": 0.0482177734375,
      "learning_rate": 1e-06,
      "loss": -0.1552,
      "num_tokens": 10174554.0,
      "reward": 0.09878893941640854,
      "reward_std": 0.045050833374261856,
      "rewards/bleu_reward_func/mean": 0.09878893941640854,
      "rewards/bleu_reward_func/std": 0.09232120960950851,
      "step": 760
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 479.0,
      "completions/mean_length": 347.84375,
      "completions/mean_terminated_length": 301.8800048828125,
      "completions/min_length": 153.0,
      "completions/min_terminated_length": 153.0,
      "epoch": 0.6088,
      "grad_norm": 2.764409065246582,
      "kl": 0.0374755859375,
      "learning_rate": 1e-06,
      "loss": 0.0588,
      "num_tokens": 10188061.0,
      "reward": 0.0298735611140728,
      "reward_std": 0.012367211282253265,
      "rewards/bleu_reward_func/mean": 0.0298735611140728,
      "rewards/bleu_reward_func/std": 0.019031813368201256,
      "step": 761
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 456.0,
      "completions/mean_length": 300.15625,
      "completions/mean_terminated_length": 203.8636474609375,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.6096,
      "grad_norm": 4.376463413238525,
      "kl": 0.0523681640625,
      "learning_rate": 1e-06,
      "loss": -0.0498,
      "num_tokens": 10200210.0,
      "reward": 0.04680792987346649,
      "reward_std": 0.012458568438887596,
      "rewards/bleu_reward_func/mean": 0.04680792987346649,
      "rewards/bleu_reward_func/std": 0.015480151399970055,
      "step": 762
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 425.0,
      "completions/max_terminated_length": 425.0,
      "completions/mean_length": 133.9375,
      "completions/mean_terminated_length": 133.9375,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.6104,
      "grad_norm": 5.940901756286621,
      "kl": 0.05828857421875,
      "learning_rate": 1e-06,
      "loss": 0.0172,
      "num_tokens": 10207136.0,
      "reward": 0.04009559005498886,
      "reward_std": 0.016614696010947227,
      "rewards/bleu_reward_func/mean": 0.04009559005498886,
      "rewards/bleu_reward_func/std": 0.03101743757724762,
      "step": 763
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 444.0,
      "completions/mean_length": 326.84375,
      "completions/mean_terminated_length": 275.0,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 131.0,
      "epoch": 0.6112,
      "grad_norm": 2.4283318519592285,
      "kl": 0.04022216796875,
      "learning_rate": 1e-06,
      "loss": -0.0041,
      "num_tokens": 10219763.0,
      "reward": 0.034885473549366,
      "reward_std": 0.011106548830866814,
      "rewards/bleu_reward_func/mean": 0.034885473549366,
      "rewards/bleu_reward_func/std": 0.01220767293125391,
      "step": 764
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 333.0,
      "completions/mean_length": 242.0,
      "completions/mean_terminated_length": 152.0,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "epoch": 0.612,
      "grad_norm": 4.512617111206055,
      "kl": 0.057891845703125,
      "learning_rate": 1e-06,
      "loss": -0.0146,
      "num_tokens": 10233067.0,
      "reward": 0.04029272869229317,
      "reward_std": 0.010187342762947083,
      "rewards/bleu_reward_func/mean": 0.04029272869229317,
      "rewards/bleu_reward_func/std": 0.01465767901390791,
      "step": 765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 380.4375,
      "completions/mean_terminated_length": 328.95654296875,
      "completions/min_length": 147.0,
      "completions/min_terminated_length": 147.0,
      "epoch": 0.6128,
      "grad_norm": 2.1606087684631348,
      "kl": 0.029327392578125,
      "learning_rate": 1e-06,
      "loss": 0.1129,
      "num_tokens": 10250633.0,
      "reward": 0.04106716811656952,
      "reward_std": 0.020106343552470207,
      "rewards/bleu_reward_func/mean": 0.04106716811656952,
      "rewards/bleu_reward_func/std": 0.023741254583001137,
      "step": 766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 438.0,
      "completions/mean_length": 432.6875,
      "completions/mean_terminated_length": 353.375,
      "completions/min_length": 272.0,
      "completions/min_terminated_length": 272.0,
      "epoch": 0.6136,
      "grad_norm": 2.2307021617889404,
      "kl": 0.044036865234375,
      "learning_rate": 1e-06,
      "loss": 0.0143,
      "num_tokens": 10269047.0,
      "reward": 0.030770011246204376,
      "reward_std": 0.007664786651730537,
      "rewards/bleu_reward_func/mean": 0.030770011246204376,
      "rewards/bleu_reward_func/std": 0.013669944368302822,
      "step": 767
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 456.28125,
      "completions/mean_terminated_length": 374.8461608886719,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "epoch": 0.6144,
      "grad_norm": 2.2227320671081543,
      "kl": 0.050994873046875,
      "learning_rate": 1e-06,
      "loss": -0.0417,
      "num_tokens": 10288664.0,
      "reward": 0.05277414619922638,
      "reward_std": 0.01249920204281807,
      "rewards/bleu_reward_func/mean": 0.05277414619922638,
      "rewards/bleu_reward_func/std": 0.031958963721990585,
      "step": 768
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 222.21875,
      "completions/mean_terminated_length": 212.87095642089844,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.6152,
      "grad_norm": 4.83256196975708,
      "kl": 0.0826416015625,
      "learning_rate": 1e-06,
      "loss": 0.0396,
      "num_tokens": 10299135.0,
      "reward": 0.044097334146499634,
      "reward_std": 0.019661743193864822,
      "rewards/bleu_reward_func/mean": 0.044097334146499634,
      "rewards/bleu_reward_func/std": 0.025644388049840927,
      "step": 769
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 349.5,
      "completions/mean_terminated_length": 332.6896667480469,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "epoch": 0.616,
      "grad_norm": 2.3625354766845703,
      "kl": 0.036376953125,
      "learning_rate": 1e-06,
      "loss": -0.0849,
      "num_tokens": 10312383.0,
      "reward": 0.03365849331021309,
      "reward_std": 0.009999667294323444,
      "rewards/bleu_reward_func/mean": 0.03365849331021309,
      "rewards/bleu_reward_func/std": 0.0307177621871233,
      "step": 770
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 472.0,
      "completions/mean_length": 341.3125,
      "completions/mean_terminated_length": 293.5199890136719,
      "completions/min_length": 79.0,
      "completions/min_terminated_length": 79.0,
      "epoch": 0.6168,
      "grad_norm": 2.5119853019714355,
      "kl": 0.0298614501953125,
      "learning_rate": 1e-06,
      "loss": -0.1266,
      "num_tokens": 10326233.0,
      "reward": 0.06748858094215393,
      "reward_std": 0.056034184992313385,
      "rewards/bleu_reward_func/mean": 0.06748858094215393,
      "rewards/bleu_reward_func/std": 0.06614639610052109,
      "step": 771
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 451.0,
      "completions/mean_length": 199.03125,
      "completions/mean_terminated_length": 188.93548583984375,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.6176,
      "grad_norm": 3.840604066848755,
      "kl": 0.0477294921875,
      "learning_rate": 1e-06,
      "loss": -0.0144,
      "num_tokens": 10338010.0,
      "reward": 0.05809897184371948,
      "reward_std": 0.03548566997051239,
      "rewards/bleu_reward_func/mean": 0.05809897184371948,
      "rewards/bleu_reward_func/std": 0.03829097002744675,
      "step": 772
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 445.0,
      "completions/max_terminated_length": 445.0,
      "completions/mean_length": 268.5625,
      "completions/mean_terminated_length": 268.5625,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "epoch": 0.6184,
      "grad_norm": 3.002624273300171,
      "kl": 0.033477783203125,
      "learning_rate": 1e-06,
      "loss": -0.0311,
      "num_tokens": 10348676.0,
      "reward": 0.04606045410037041,
      "reward_std": 0.013474350795149803,
      "rewards/bleu_reward_func/mean": 0.04606045410037041,
      "rewards/bleu_reward_func/std": 0.03948834538459778,
      "step": 773
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 259.625,
      "completions/mean_terminated_length": 175.5,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.6192,
      "grad_norm": 4.124053955078125,
      "kl": 0.055450439453125,
      "learning_rate": 1e-06,
      "loss": -0.0785,
      "num_tokens": 10361736.0,
      "reward": 0.05262724682688713,
      "reward_std": 0.00995348859578371,
      "rewards/bleu_reward_func/mean": 0.05262724682688713,
      "rewards/bleu_reward_func/std": 0.04580436274409294,
      "step": 774
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 483.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 268.09375,
      "completions/mean_terminated_length": 268.09375,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.62,
      "grad_norm": 3.3182435035705566,
      "kl": 0.05712890625,
      "learning_rate": 1e-06,
      "loss": 0.0194,
      "num_tokens": 10372547.0,
      "reward": 0.03659249097108841,
      "reward_std": 0.01361929066479206,
      "rewards/bleu_reward_func/mean": 0.03659249097108841,
      "rewards/bleu_reward_func/std": 0.017074862495064735,
      "step": 775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 399.0625,
      "completions/mean_terminated_length": 339.9047546386719,
      "completions/min_length": 187.0,
      "completions/min_terminated_length": 187.0,
      "epoch": 0.6208,
      "grad_norm": 1.9342231750488281,
      "kl": 0.035369873046875,
      "learning_rate": 1e-06,
      "loss": 0.0046,
      "num_tokens": 10388957.0,
      "reward": 0.09792732447385788,
      "reward_std": 0.02487635612487793,
      "rewards/bleu_reward_func/mean": 0.09792732447385788,
      "rewards/bleu_reward_func/std": 0.0962534099817276,
      "step": 776
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 466.0,
      "completions/max_terminated_length": 466.0,
      "completions/mean_length": 167.1875,
      "completions/mean_terminated_length": 167.1875,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.6216,
      "grad_norm": 6.700768947601318,
      "kl": 0.04681396484375,
      "learning_rate": 1e-06,
      "loss": -0.0026,
      "num_tokens": 10397779.0,
      "reward": 0.032624099403619766,
      "reward_std": 0.011605742387473583,
      "rewards/bleu_reward_func/mean": 0.032624099403619766,
      "rewards/bleu_reward_func/std": 0.020308885723352432,
      "step": 777
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 385.75,
      "completions/mean_terminated_length": 319.6190490722656,
      "completions/min_length": 85.0,
      "completions/min_terminated_length": 85.0,
      "epoch": 0.6224,
      "grad_norm": 2.1677191257476807,
      "kl": 0.051605224609375,
      "learning_rate": 1e-06,
      "loss": 0.0461,
      "num_tokens": 10414483.0,
      "reward": 0.08826855570077896,
      "reward_std": 0.02615802362561226,
      "rewards/bleu_reward_func/mean": 0.08826855570077896,
      "rewards/bleu_reward_func/std": 0.1053905114531517,
      "step": 778
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 252.5625,
      "completions/mean_terminated_length": 179.9199981689453,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.6232,
      "grad_norm": 6.194201469421387,
      "kl": 0.04010009765625,
      "learning_rate": 1e-06,
      "loss": -0.0265,
      "num_tokens": 10425557.0,
      "reward": 0.02823360078036785,
      "reward_std": 0.03581786900758743,
      "rewards/bleu_reward_func/mean": 0.02823360078036785,
      "rewards/bleu_reward_func/std": 0.04058787599205971,
      "step": 779
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 423.0,
      "completions/mean_length": 226.40625,
      "completions/mean_terminated_length": 131.20834350585938,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.624,
      "grad_norm": 5.036434173583984,
      "kl": 0.050079345703125,
      "learning_rate": 1e-06,
      "loss": 0.1302,
      "num_tokens": 10436914.0,
      "reward": 0.044641509652137756,
      "reward_std": 0.016347650438547134,
      "rewards/bleu_reward_func/mean": 0.044641509652137756,
      "rewards/bleu_reward_func/std": 0.03360149264335632,
      "step": 780
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 368.0,
      "completions/mean_length": 204.0,
      "completions/mean_terminated_length": 194.06451416015625,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "epoch": 0.6248,
      "grad_norm": 2.847588062286377,
      "kl": 0.033935546875,
      "learning_rate": 1e-06,
      "loss": 0.0755,
      "num_tokens": 10448866.0,
      "reward": 0.07604652643203735,
      "reward_std": 0.022845547646284103,
      "rewards/bleu_reward_func/mean": 0.07604652643203735,
      "rewards/bleu_reward_func/std": 0.052399422973394394,
      "step": 781
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 433.0,
      "completions/max_terminated_length": 433.0,
      "completions/mean_length": 202.4375,
      "completions/mean_terminated_length": 202.4375,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.6256,
      "grad_norm": 3.90671968460083,
      "kl": 0.033660888671875,
      "learning_rate": 1e-06,
      "loss": 0.0688,
      "num_tokens": 10462408.0,
      "reward": 0.05024778097867966,
      "reward_std": 0.02265213616192341,
      "rewards/bleu_reward_func/mean": 0.05024778097867966,
      "rewards/bleu_reward_func/std": 0.03187122941017151,
      "step": 782
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 444.0,
      "completions/mean_length": 276.78125,
      "completions/mean_terminated_length": 198.375,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.6264,
      "grad_norm": 3.0001416206359863,
      "kl": 0.04388427734375,
      "learning_rate": 1e-06,
      "loss": -0.0108,
      "num_tokens": 10473897.0,
      "reward": 0.03825919330120087,
      "reward_std": 0.013500811532139778,
      "rewards/bleu_reward_func/mean": 0.03825919330120087,
      "rewards/bleu_reward_func/std": 0.02070869877934456,
      "step": 783
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 444.0,
      "completions/mean_length": 254.8125,
      "completions/mean_terminated_length": 195.4615478515625,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 54.0,
      "epoch": 0.6272,
      "grad_norm": 2.65230131149292,
      "kl": 0.03281402587890625,
      "learning_rate": 1e-06,
      "loss": -0.1691,
      "num_tokens": 10485235.0,
      "reward": 0.08533032983541489,
      "reward_std": 0.05742814019322395,
      "rewards/bleu_reward_func/mean": 0.08533032983541489,
      "rewards/bleu_reward_func/std": 0.106187604367733,
      "step": 784
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 319.90625,
      "completions/mean_terminated_length": 292.46429443359375,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.628,
      "grad_norm": 2.5516910552978516,
      "kl": 0.042205810546875,
      "learning_rate": 1e-06,
      "loss": -0.1961,
      "num_tokens": 10502464.0,
      "reward": 0.047782108187675476,
      "reward_std": 0.032827217131853104,
      "rewards/bleu_reward_func/mean": 0.047782108187675476,
      "rewards/bleu_reward_func/std": 0.04888693243265152,
      "step": 785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 302.4375,
      "completions/mean_terminated_length": 207.18182373046875,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.6288,
      "grad_norm": 3.678699493408203,
      "kl": 0.05059814453125,
      "learning_rate": 1e-06,
      "loss": -0.2486,
      "num_tokens": 10514454.0,
      "reward": 0.0350123755633831,
      "reward_std": 0.019910816103219986,
      "rewards/bleu_reward_func/mean": 0.0350123755633831,
      "rewards/bleu_reward_func/std": 0.03280069679021835,
      "step": 786
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 441.0,
      "completions/mean_length": 230.03125,
      "completions/mean_terminated_length": 220.9354705810547,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "epoch": 0.6296,
      "grad_norm": 3.5091724395751953,
      "kl": 0.060211181640625,
      "learning_rate": 1e-06,
      "loss": -0.035,
      "num_tokens": 10526559.0,
      "reward": 0.0770467221736908,
      "reward_std": 0.025734489783644676,
      "rewards/bleu_reward_func/mean": 0.0770467221736908,
      "rewards/bleu_reward_func/std": 0.06628144532442093,
      "step": 787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 315.65625,
      "completions/mean_terminated_length": 238.8260955810547,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.6304,
      "grad_norm": 2.624885082244873,
      "kl": 0.030364990234375,
      "learning_rate": 1e-06,
      "loss": 0.0859,
      "num_tokens": 10538644.0,
      "reward": 0.031655922532081604,
      "reward_std": 0.012592853978276253,
      "rewards/bleu_reward_func/mean": 0.031655922532081604,
      "rewards/bleu_reward_func/std": 0.015495400875806808,
      "step": 788
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 320.40625,
      "completions/mean_terminated_length": 256.54168701171875,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.6312,
      "grad_norm": 7.265035152435303,
      "kl": 0.045196533203125,
      "learning_rate": 1e-06,
      "loss": -0.0586,
      "num_tokens": 10551681.0,
      "reward": 0.11770100891590118,
      "reward_std": 0.048164598643779755,
      "rewards/bleu_reward_func/mean": 0.11770100891590118,
      "rewards/bleu_reward_func/std": 0.10264891386032104,
      "step": 789
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 352.0,
      "completions/mean_length": 350.5,
      "completions/mean_terminated_length": 189.0,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.632,
      "grad_norm": 2.7487597465515137,
      "kl": 0.0435791015625,
      "learning_rate": 1e-06,
      "loss": -0.0856,
      "num_tokens": 10567657.0,
      "reward": 0.030746258795261383,
      "reward_std": 0.012422507628798485,
      "rewards/bleu_reward_func/mean": 0.030746258795261383,
      "rewards/bleu_reward_func/std": 0.0163208469748497,
      "step": 790
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 420.0,
      "completions/mean_length": 312.5,
      "completions/mean_terminated_length": 246.0,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.6328,
      "grad_norm": 5.9650187492370605,
      "kl": 0.049163818359375,
      "learning_rate": 1e-06,
      "loss": -0.1018,
      "num_tokens": 10580481.0,
      "reward": 0.1085551381111145,
      "reward_std": 0.03489597514271736,
      "rewards/bleu_reward_func/mean": 0.1085551381111145,
      "rewards/bleu_reward_func/std": 0.07419593632221222,
      "step": 791
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 274.34375,
      "completions/mean_terminated_length": 258.5,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.6336,
      "grad_norm": 2.4724159240722656,
      "kl": 0.034820556640625,
      "learning_rate": 1e-06,
      "loss": -0.0042,
      "num_tokens": 10591612.0,
      "reward": 0.048242103308439255,
      "reward_std": 0.018077358603477478,
      "rewards/bleu_reward_func/mean": 0.048242103308439255,
      "rewards/bleu_reward_func/std": 0.038909122347831726,
      "step": 792
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 399.28125,
      "completions/mean_terminated_length": 355.1739196777344,
      "completions/min_length": 178.0,
      "completions/min_terminated_length": 178.0,
      "epoch": 0.6344,
      "grad_norm": 2.54805588722229,
      "kl": 0.025634765625,
      "learning_rate": 1e-06,
      "loss": -0.085,
      "num_tokens": 10606909.0,
      "reward": 0.05102061852812767,
      "reward_std": 0.021770458668470383,
      "rewards/bleu_reward_func/mean": 0.05102061852812767,
      "rewards/bleu_reward_func/std": 0.025904852896928787,
      "step": 793
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 353.25,
      "completions/mean_terminated_length": 336.82757568359375,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "epoch": 0.6352,
      "grad_norm": 2.3026504516601562,
      "kl": 0.040191650390625,
      "learning_rate": 1e-06,
      "loss": -0.0445,
      "num_tokens": 10622421.0,
      "reward": 0.044227294623851776,
      "reward_std": 0.01541022676974535,
      "rewards/bleu_reward_func/mean": 0.044227294623851776,
      "rewards/bleu_reward_func/std": 0.027547884732484818,
      "step": 794
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 400.59375,
      "completions/mean_terminated_length": 289.1875,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 146.0,
      "epoch": 0.636,
      "grad_norm": 2.2095108032226562,
      "kl": 0.049224853515625,
      "learning_rate": 1e-06,
      "loss": -0.0347,
      "num_tokens": 10639072.0,
      "reward": 0.04881560057401657,
      "reward_std": 0.02202250249683857,
      "rewards/bleu_reward_func/mean": 0.04881560057401657,
      "rewards/bleu_reward_func/std": 0.053479425609111786,
      "step": 795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 443.0,
      "completions/mean_length": 348.40625,
      "completions/mean_terminated_length": 204.05882263183594,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.6368,
      "grad_norm": 2.7905285358428955,
      "kl": 0.04681396484375,
      "learning_rate": 1e-06,
      "loss": 0.1886,
      "num_tokens": 10652597.0,
      "reward": 0.08770878612995148,
      "reward_std": 0.03572450950741768,
      "rewards/bleu_reward_func/mean": 0.08770878612995148,
      "rewards/bleu_reward_func/std": 0.07477344572544098,
      "step": 796
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 477.6875,
      "completions/mean_terminated_length": 420.5,
      "completions/min_length": 358.0,
      "completions/min_terminated_length": 358.0,
      "epoch": 0.6376,
      "grad_norm": 2.1412551403045654,
      "kl": 0.04559326171875,
      "learning_rate": 1e-06,
      "loss": 0.0094,
      "num_tokens": 10670643.0,
      "reward": 0.07936374843120575,
      "reward_std": 0.015750272199511528,
      "rewards/bleu_reward_func/mean": 0.07936374843120575,
      "rewards/bleu_reward_func/std": 0.044607013463974,
      "step": 797
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 369.5625,
      "completions/mean_terminated_length": 258.77777099609375,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.6384,
      "grad_norm": 7.5998334884643555,
      "kl": 0.0509033203125,
      "learning_rate": 1e-06,
      "loss": 0.0505,
      "num_tokens": 10688821.0,
      "reward": 0.053146444261074066,
      "reward_std": 0.020619917660951614,
      "rewards/bleu_reward_func/mean": 0.053146444261074066,
      "rewards/bleu_reward_func/std": 0.03774289786815643,
      "step": 798
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 314.0,
      "completions/mean_length": 388.0,
      "completions/mean_terminated_length": 264.0,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "epoch": 0.6392,
      "grad_norm": 2.1621346473693848,
      "kl": 0.03448486328125,
      "learning_rate": 1e-06,
      "loss": -0.0252,
      "num_tokens": 10707373.0,
      "reward": 0.01923811435699463,
      "reward_std": 0.0031193974427878857,
      "rewards/bleu_reward_func/mean": 0.01923811435699463,
      "rewards/bleu_reward_func/std": 0.02292207069694996,
      "step": 799
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 430.6875,
      "completions/mean_terminated_length": 375.0526428222656,
      "completions/min_length": 258.0,
      "completions/min_terminated_length": 258.0,
      "epoch": 0.64,
      "grad_norm": 2.0542428493499756,
      "kl": 0.038909912109375,
      "learning_rate": 1e-06,
      "loss": -0.0029,
      "num_tokens": 10724091.0,
      "reward": 0.04182392358779907,
      "reward_std": 0.018276244401931763,
      "rewards/bleu_reward_func/mean": 0.04182392358779907,
      "rewards/bleu_reward_func/std": 0.023004500195384026,
      "step": 800
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 292.4375,
      "completions/mean_terminated_length": 219.25,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.6408,
      "grad_norm": 3.799773931503296,
      "kl": 0.049468994140625,
      "learning_rate": 1e-06,
      "loss": -0.1183,
      "num_tokens": 10736089.0,
      "reward": 0.05615938454866409,
      "reward_std": 0.019352678209543228,
      "rewards/bleu_reward_func/mean": 0.05615938454866409,
      "rewards/bleu_reward_func/std": 0.04468993842601776,
      "step": 801
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 253.09375,
      "completions/mean_terminated_length": 235.83334350585938,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.6416,
      "grad_norm": 2.5247280597686768,
      "kl": 0.024139404296875,
      "learning_rate": 1e-06,
      "loss": -0.1418,
      "num_tokens": 10748996.0,
      "reward": 0.1489913910627365,
      "reward_std": 0.06708651781082153,
      "rewards/bleu_reward_func/mean": 0.1489913910627365,
      "rewards/bleu_reward_func/std": 0.24455946683883667,
      "step": 802
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 443.0,
      "completions/mean_length": 262.65625,
      "completions/mean_terminated_length": 192.83999633789062,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.6424,
      "grad_norm": 3.614666223526001,
      "kl": 0.07672119140625,
      "learning_rate": 1e-06,
      "loss": -0.063,
      "num_tokens": 10759945.0,
      "reward": 0.08040255308151245,
      "reward_std": 0.023083828389644623,
      "rewards/bleu_reward_func/mean": 0.08040255308151245,
      "rewards/bleu_reward_func/std": 0.05763205140829086,
      "step": 803
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 379.0,
      "completions/mean_terminated_length": 309.3333435058594,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "epoch": 0.6432,
      "grad_norm": 2.4461886882781982,
      "kl": 0.05419921875,
      "learning_rate": 1e-06,
      "loss": -0.0328,
      "num_tokens": 10777153.0,
      "reward": 0.08907453715801239,
      "reward_std": 0.022760968655347824,
      "rewards/bleu_reward_func/mean": 0.08907453715801239,
      "rewards/bleu_reward_func/std": 0.06350069493055344,
      "step": 804
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 374.28125,
      "completions/mean_terminated_length": 302.1428527832031,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 78.0,
      "epoch": 0.644,
      "grad_norm": 2.2589452266693115,
      "kl": 0.0455322265625,
      "learning_rate": 1e-06,
      "loss": 0.1583,
      "num_tokens": 10791714.0,
      "reward": 0.04732927680015564,
      "reward_std": 0.01938834972679615,
      "rewards/bleu_reward_func/mean": 0.04732927680015564,
      "rewards/bleu_reward_func/std": 0.04413124546408653,
      "step": 805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 410.3125,
      "completions/mean_terminated_length": 240.83334350585938,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "epoch": 0.6448,
      "grad_norm": 2.670647382736206,
      "kl": 0.07098388671875,
      "learning_rate": 1e-06,
      "loss": -0.0396,
      "num_tokens": 10809420.0,
      "reward": 0.04357248917222023,
      "reward_std": 0.008856004104018211,
      "rewards/bleu_reward_func/mean": 0.04357248917222023,
      "rewards/bleu_reward_func/std": 0.0222486425191164,
      "step": 806
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 200.65625,
      "completions/mean_terminated_length": 190.61289978027344,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.6456,
      "grad_norm": 5.75560998916626,
      "kl": 0.06439208984375,
      "learning_rate": 1e-06,
      "loss": 0.1668,
      "num_tokens": 10819033.0,
      "reward": 0.054602716118097305,
      "reward_std": 0.0195465050637722,
      "rewards/bleu_reward_func/mean": 0.054602716118097305,
      "rewards/bleu_reward_func/std": 0.03817151114344597,
      "step": 807
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 408.75,
      "completions/mean_terminated_length": 257.8461608886719,
      "completions/min_length": 85.0,
      "completions/min_terminated_length": 85.0,
      "epoch": 0.6464,
      "grad_norm": 2.3174612522125244,
      "kl": 0.0399169921875,
      "learning_rate": 1e-06,
      "loss": -0.0048,
      "num_tokens": 10834729.0,
      "reward": 0.07070265710353851,
      "reward_std": 0.02076653018593788,
      "rewards/bleu_reward_func/mean": 0.07070265710353851,
      "rewards/bleu_reward_func/std": 0.04173728823661804,
      "step": 808
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 389.53125,
      "completions/mean_terminated_length": 385.58062744140625,
      "completions/min_length": 297.0,
      "completions/min_terminated_length": 297.0,
      "epoch": 0.6472,
      "grad_norm": 2.186995267868042,
      "kl": 0.0399169921875,
      "learning_rate": 1e-06,
      "loss": 0.0048,
      "num_tokens": 10852794.0,
      "reward": 0.07276012748479843,
      "reward_std": 0.019968077540397644,
      "rewards/bleu_reward_func/mean": 0.07276012748479843,
      "rewards/bleu_reward_func/std": 0.02828538604080677,
      "step": 809
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 431.0,
      "completions/mean_length": 334.09375,
      "completions/mean_terminated_length": 253.22727966308594,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.648,
      "grad_norm": 2.8079397678375244,
      "kl": 0.055694580078125,
      "learning_rate": 1e-06,
      "loss": -0.1421,
      "num_tokens": 10866021.0,
      "reward": 0.05294843763113022,
      "reward_std": 0.02259877324104309,
      "rewards/bleu_reward_func/mean": 0.05294843763113022,
      "rewards/bleu_reward_func/std": 0.03158554434776306,
      "step": 810
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 364.0625,
      "completions/mean_terminated_length": 306.1739196777344,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "epoch": 0.6488,
      "grad_norm": 2.486483335494995,
      "kl": 0.038848876953125,
      "learning_rate": 1e-06,
      "loss": 0.0101,
      "num_tokens": 10879623.0,
      "reward": 0.04237189143896103,
      "reward_std": 0.022582165896892548,
      "rewards/bleu_reward_func/mean": 0.04237189143896103,
      "rewards/bleu_reward_func/std": 0.0278725977987051,
      "step": 811
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 326.96875,
      "completions/mean_terminated_length": 254.56521606445312,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.6496,
      "grad_norm": 2.8792357444763184,
      "kl": 0.0413818359375,
      "learning_rate": 1e-06,
      "loss": -0.0378,
      "num_tokens": 10892558.0,
      "reward": 0.04466039687395096,
      "reward_std": 0.01942962221801281,
      "rewards/bleu_reward_func/mean": 0.04466039687395096,
      "rewards/bleu_reward_func/std": 0.036387860774993896,
      "step": 812
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 334.875,
      "completions/mean_terminated_length": 302.0740661621094,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.6504,
      "grad_norm": 2.8896868228912354,
      "kl": 0.05438232421875,
      "learning_rate": 1e-06,
      "loss": 0.0963,
      "num_tokens": 10905386.0,
      "reward": 0.08064363896846771,
      "reward_std": 0.026876429095864296,
      "rewards/bleu_reward_func/mean": 0.08064363896846771,
      "rewards/bleu_reward_func/std": 0.07220647484064102,
      "step": 813
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 472.0,
      "completions/mean_length": 418.3125,
      "completions/mean_terminated_length": 335.6470642089844,
      "completions/min_length": 239.0,
      "completions/min_terminated_length": 239.0,
      "epoch": 0.6512,
      "grad_norm": 2.1156792640686035,
      "kl": 0.041168212890625,
      "learning_rate": 1e-06,
      "loss": -0.0214,
      "num_tokens": 10922348.0,
      "reward": 0.05280526727437973,
      "reward_std": 0.020577870309352875,
      "rewards/bleu_reward_func/mean": 0.05280526727437973,
      "rewards/bleu_reward_func/std": 0.04993457347154617,
      "step": 814
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 377.28125,
      "completions/mean_terminated_length": 324.5652160644531,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "epoch": 0.652,
      "grad_norm": 2.4333136081695557,
      "kl": 0.05401611328125,
      "learning_rate": 1e-06,
      "loss": -0.0442,
      "num_tokens": 10937101.0,
      "reward": 0.042016930878162384,
      "reward_std": 0.013686037622392178,
      "rewards/bleu_reward_func/mean": 0.042016930878162384,
      "rewards/bleu_reward_func/std": 0.023568252101540565,
      "step": 815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 327.4375,
      "completions/mean_terminated_length": 275.7599792480469,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "epoch": 0.6528,
      "grad_norm": 2.788355827331543,
      "kl": 0.050811767578125,
      "learning_rate": 1e-06,
      "loss": -0.1467,
      "num_tokens": 10950227.0,
      "reward": 0.047510623931884766,
      "reward_std": 0.02448885142803192,
      "rewards/bleu_reward_func/mean": 0.047510623931884766,
      "rewards/bleu_reward_func/std": 0.054734427481889725,
      "step": 816
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 343.71875,
      "completions/mean_terminated_length": 212.8333282470703,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.6536,
      "grad_norm": 3.3152077198028564,
      "kl": 0.0623779296875,
      "learning_rate": 1e-06,
      "loss": 0.1212,
      "num_tokens": 10964714.0,
      "reward": 0.03251378983259201,
      "reward_std": 0.008201804012060165,
      "rewards/bleu_reward_func/mean": 0.03251378983259201,
      "rewards/bleu_reward_func/std": 0.02223658747971058,
      "step": 817
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 211.125,
      "completions/mean_terminated_length": 191.06668090820312,
      "completions/min_length": 82.0,
      "completions/min_terminated_length": 82.0,
      "epoch": 0.6544,
      "grad_norm": 3.372784376144409,
      "kl": 0.07244873046875,
      "learning_rate": 1e-06,
      "loss": -0.0801,
      "num_tokens": 10975582.0,
      "reward": 0.07603277266025543,
      "reward_std": 0.024979114532470703,
      "rewards/bleu_reward_func/mean": 0.07603277266025543,
      "rewards/bleu_reward_func/std": 0.07992418855428696,
      "step": 818
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 334.0,
      "completions/mean_length": 253.15625,
      "completions/mean_terminated_length": 166.875,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.6552,
      "grad_norm": 4.9590744972229,
      "kl": 0.06866455078125,
      "learning_rate": 1e-06,
      "loss": 0.1993,
      "num_tokens": 10985955.0,
      "reward": 0.07587097585201263,
      "reward_std": 0.032769832760095596,
      "rewards/bleu_reward_func/mean": 0.07587097585201263,
      "rewards/bleu_reward_func/std": 0.09669305384159088,
      "step": 819
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 361.96875,
      "completions/mean_terminated_length": 303.2608642578125,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "epoch": 0.656,
      "grad_norm": 2.3579721450805664,
      "kl": 0.036773681640625,
      "learning_rate": 1e-06,
      "loss": 0.0023,
      "num_tokens": 11001882.0,
      "reward": 0.1000506579875946,
      "reward_std": 0.01921307109296322,
      "rewards/bleu_reward_func/mean": 0.1000506579875946,
      "rewards/bleu_reward_func/std": 0.12420003116130829,
      "step": 820
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 364.375,
      "completions/mean_terminated_length": 323.03997802734375,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "epoch": 0.6568,
      "grad_norm": 3.5548665523529053,
      "kl": 0.0386962890625,
      "learning_rate": 1e-06,
      "loss": 0.0001,
      "num_tokens": 11017006.0,
      "reward": 0.1209985613822937,
      "reward_std": 0.04797535389661789,
      "rewards/bleu_reward_func/mean": 0.1209985613822937,
      "rewards/bleu_reward_func/std": 0.10385487228631973,
      "step": 821
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 419.0,
      "completions/mean_length": 357.34375,
      "completions/mean_terminated_length": 287.04547119140625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "epoch": 0.6576,
      "grad_norm": 2.376903772354126,
      "kl": 0.0343017578125,
      "learning_rate": 1e-06,
      "loss": 0.1205,
      "num_tokens": 11031105.0,
      "reward": 0.043021492660045624,
      "reward_std": 0.020202111452817917,
      "rewards/bleu_reward_func/mean": 0.043021492660045624,
      "rewards/bleu_reward_func/std": 0.03094971366226673,
      "step": 822
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 454.0,
      "completions/mean_length": 295.6875,
      "completions/mean_terminated_length": 197.3636474609375,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.6584,
      "grad_norm": 3.0443296432495117,
      "kl": 0.049591064453125,
      "learning_rate": 1e-06,
      "loss": -0.0138,
      "num_tokens": 11047383.0,
      "reward": 0.09186838567256927,
      "reward_std": 0.024521898478269577,
      "rewards/bleu_reward_func/mean": 0.09186838567256927,
      "rewards/bleu_reward_func/std": 0.1259845644235611,
      "step": 823
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 504.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 226.6875,
      "completions/mean_terminated_length": 226.6875,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.6592,
      "grad_norm": 6.048206329345703,
      "kl": 0.04132080078125,
      "learning_rate": 1e-06,
      "loss": 0.1452,
      "num_tokens": 11057229.0,
      "reward": 0.12500673532485962,
      "reward_std": 0.05105290934443474,
      "rewards/bleu_reward_func/mean": 0.12500673532485962,
      "rewards/bleu_reward_func/std": 0.12394329905509949,
      "step": 824
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 395.625,
      "completions/mean_terminated_length": 263.73333740234375,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "epoch": 0.66,
      "grad_norm": 2.4559149742126465,
      "kl": 0.06475830078125,
      "learning_rate": 1e-06,
      "loss": 0.0041,
      "num_tokens": 11074705.0,
      "reward": 0.1022477000951767,
      "reward_std": 0.03607521206140518,
      "rewards/bleu_reward_func/mean": 0.1022477000951767,
      "rewards/bleu_reward_func/std": 0.06924700736999512,
      "step": 825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 304.78125,
      "completions/mean_terminated_length": 266.40740966796875,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 0.6608,
      "grad_norm": 3.020120620727539,
      "kl": 0.05322265625,
      "learning_rate": 1e-06,
      "loss": 0.0953,
      "num_tokens": 11087218.0,
      "reward": 0.02280343510210514,
      "reward_std": 0.004782763309776783,
      "rewards/bleu_reward_func/mean": 0.02280343510210514,
      "rewards/bleu_reward_func/std": 0.005676077678799629,
      "step": 826
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 288.90625,
      "completions/mean_terminated_length": 281.70965576171875,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "epoch": 0.6616,
      "grad_norm": 2.7771506309509277,
      "kl": 0.03759765625,
      "learning_rate": 1e-06,
      "loss": -0.1367,
      "num_tokens": 11099039.0,
      "reward": 0.0848507508635521,
      "reward_std": 0.03843146190047264,
      "rewards/bleu_reward_func/mean": 0.0848507508635521,
      "rewards/bleu_reward_func/std": 0.05654771253466606,
      "step": 827
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 283.96875,
      "completions/mean_terminated_length": 241.74073791503906,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.6624,
      "grad_norm": 2.895529270172119,
      "kl": 0.060211181640625,
      "learning_rate": 1e-06,
      "loss": -0.0433,
      "num_tokens": 11110862.0,
      "reward": 0.029147926717996597,
      "reward_std": 0.008030948229134083,
      "rewards/bleu_reward_func/mean": 0.029147926717996597,
      "rewards/bleu_reward_func/std": 0.012568029575049877,
      "step": 828
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 339.15625,
      "completions/mean_terminated_length": 290.7599792480469,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.6632,
      "grad_norm": 2.2857818603515625,
      "kl": 0.0377197265625,
      "learning_rate": 1e-06,
      "loss": 0.1356,
      "num_tokens": 11124107.0,
      "reward": 0.024584250524640083,
      "reward_std": 0.011461092159152031,
      "rewards/bleu_reward_func/mean": 0.024584250524640083,
      "rewards/bleu_reward_func/std": 0.014021635986864567,
      "step": 829
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 339.46875,
      "completions/mean_terminated_length": 261.04547119140625,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 0.664,
      "grad_norm": 3.047325372695923,
      "kl": 0.07183837890625,
      "learning_rate": 1e-06,
      "loss": -0.0554,
      "num_tokens": 11138034.0,
      "reward": 0.03661057725548744,
      "reward_std": 0.009148099459707737,
      "rewards/bleu_reward_func/mean": 0.03661057725548744,
      "rewards/bleu_reward_func/std": 0.01764591969549656,
      "step": 830
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 340.0625,
      "completions/mean_terminated_length": 272.7826232910156,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.6648,
      "grad_norm": 2.7614407539367676,
      "kl": 0.03033447265625,
      "learning_rate": 1e-06,
      "loss": -0.0483,
      "num_tokens": 11151476.0,
      "reward": 0.05648787319660187,
      "reward_std": 0.035596661269664764,
      "rewards/bleu_reward_func/mean": 0.05648787319660187,
      "rewards/bleu_reward_func/std": 0.055284641683101654,
      "step": 831
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 288.59375,
      "completions/mean_terminated_length": 273.70001220703125,
      "completions/min_length": 79.0,
      "completions/min_terminated_length": 79.0,
      "epoch": 0.6656,
      "grad_norm": 2.4882915019989014,
      "kl": 0.05047607421875,
      "learning_rate": 1e-06,
      "loss": 0.0126,
      "num_tokens": 11165719.0,
      "reward": 0.07529251277446747,
      "reward_std": 0.032291531562805176,
      "rewards/bleu_reward_func/mean": 0.07529251277446747,
      "rewards/bleu_reward_func/std": 0.05323096737265587,
      "step": 832
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 407.15625,
      "completions/mean_terminated_length": 366.13043212890625,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "epoch": 0.6664,
      "grad_norm": 1.9872773885726929,
      "kl": 0.0447998046875,
      "learning_rate": 1e-06,
      "loss": 0.0506,
      "num_tokens": 11181668.0,
      "reward": 0.06329767405986786,
      "reward_std": 0.016374491155147552,
      "rewards/bleu_reward_func/mean": 0.06329767405986786,
      "rewards/bleu_reward_func/std": 0.04609806835651398,
      "step": 833
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 286.03125,
      "completions/mean_terminated_length": 270.9666748046875,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.6672,
      "grad_norm": 2.7774431705474854,
      "kl": 0.045074462890625,
      "learning_rate": 1e-06,
      "loss": 0.0337,
      "num_tokens": 11193869.0,
      "reward": 0.044695161283016205,
      "reward_std": 0.01957538165152073,
      "rewards/bleu_reward_func/mean": 0.044695161283016205,
      "rewards/bleu_reward_func/std": 0.02945699170231819,
      "step": 834
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 258.09375,
      "completions/mean_terminated_length": 249.90321350097656,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "epoch": 0.668,
      "grad_norm": 2.96142315864563,
      "kl": 0.040771484375,
      "learning_rate": 1e-06,
      "loss": 0.3593,
      "num_tokens": 11207264.0,
      "reward": 0.055455561727285385,
      "reward_std": 0.029085490852594376,
      "rewards/bleu_reward_func/mean": 0.055455561727285385,
      "rewards/bleu_reward_func/std": 0.06734622269868851,
      "step": 835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 446.0,
      "completions/mean_length": 368.5625,
      "completions/mean_terminated_length": 225.125,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.6688,
      "grad_norm": 2.931868076324463,
      "kl": 0.0482177734375,
      "learning_rate": 1e-06,
      "loss": -0.0531,
      "num_tokens": 11224506.0,
      "reward": 0.04490472376346588,
      "reward_std": 0.011960483156144619,
      "rewards/bleu_reward_func/mean": 0.04490472376346588,
      "rewards/bleu_reward_func/std": 0.015119385905563831,
      "step": 836
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 426.0,
      "completions/mean_length": 351.0,
      "completions/mean_terminated_length": 266.6666564941406,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "epoch": 0.6696,
      "grad_norm": 2.4483470916748047,
      "kl": 0.0392913818359375,
      "learning_rate": 1e-06,
      "loss": -0.0262,
      "num_tokens": 11241138.0,
      "reward": 0.055649228394031525,
      "reward_std": 0.028573956340551376,
      "rewards/bleu_reward_func/mean": 0.055649228394031525,
      "rewards/bleu_reward_func/std": 0.05205146595835686,
      "step": 837
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 406.9375,
      "completions/mean_terminated_length": 359.18182373046875,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "epoch": 0.6704,
      "grad_norm": 2.2513904571533203,
      "kl": 0.04888916015625,
      "learning_rate": 1e-06,
      "loss": 0.0047,
      "num_tokens": 11256696.0,
      "reward": 0.06957369297742844,
      "reward_std": 0.016341013833880424,
      "rewards/bleu_reward_func/mean": 0.06957369297742844,
      "rewards/bleu_reward_func/std": 0.047397319227457047,
      "step": 838
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 359.65625,
      "completions/mean_terminated_length": 187.00001525878906,
      "completions/min_length": 18.0,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.6712,
      "grad_norm": 4.712099075317383,
      "kl": 0.1260986328125,
      "learning_rate": 1e-06,
      "loss": 0.0522,
      "num_tokens": 11271373.0,
      "reward": 0.0421738438308239,
      "reward_std": 0.012205126695334911,
      "rewards/bleu_reward_func/mean": 0.0421738438308239,
      "rewards/bleu_reward_func/std": 0.017471130937337875,
      "step": 839
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 199.875,
      "completions/mean_terminated_length": 179.06668090820312,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.672,
      "grad_norm": 5.982988357543945,
      "kl": 0.0614013671875,
      "learning_rate": 1e-06,
      "loss": 0.015,
      "num_tokens": 11280041.0,
      "reward": 0.03582533821463585,
      "reward_std": 0.008306249044835567,
      "rewards/bleu_reward_func/mean": 0.03582533821463585,
      "rewards/bleu_reward_func/std": 0.010333981364965439,
      "step": 840
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 478.0,
      "completions/mean_length": 400.71875,
      "completions/mean_terminated_length": 363.625,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "epoch": 0.6728,
      "grad_norm": 2.2374308109283447,
      "kl": 0.04034423828125,
      "learning_rate": 1e-06,
      "loss": -0.0348,
      "num_tokens": 11297040.0,
      "reward": 0.020916707813739777,
      "reward_std": 0.005250955931842327,
      "rewards/bleu_reward_func/mean": 0.020916707813739777,
      "rewards/bleu_reward_func/std": 0.012848958373069763,
      "step": 841
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 389.71875,
      "completions/mean_terminated_length": 316.3500061035156,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "epoch": 0.6736,
      "grad_norm": 2.3556578159332275,
      "kl": 0.041046142578125,
      "learning_rate": 1e-06,
      "loss": 0.0674,
      "num_tokens": 11313111.0,
      "reward": 0.05335354059934616,
      "reward_std": 0.018081864342093468,
      "rewards/bleu_reward_func/mean": 0.05335354059934616,
      "rewards/bleu_reward_func/std": 0.039343688637018204,
      "step": 842
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 428.0,
      "completions/mean_length": 418.0,
      "completions/mean_terminated_length": 311.4666748046875,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 211.0,
      "epoch": 0.6744,
      "grad_norm": 2.2276229858398438,
      "kl": 0.05731201171875,
      "learning_rate": 1e-06,
      "loss": -0.06,
      "num_tokens": 11333615.0,
      "reward": 0.04341096431016922,
      "reward_std": 0.010301424190402031,
      "rewards/bleu_reward_func/mean": 0.04341096431016922,
      "rewards/bleu_reward_func/std": 0.028370829299092293,
      "step": 843
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 414.0,
      "completions/mean_length": 398.25,
      "completions/mean_terminated_length": 252.00001525878906,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "epoch": 0.6752,
      "grad_norm": 4.02369499206543,
      "kl": 0.051849365234375,
      "learning_rate": 1e-06,
      "loss": -0.1083,
      "num_tokens": 11351647.0,
      "reward": 0.04887228459119797,
      "reward_std": 0.01692984066903591,
      "rewards/bleu_reward_func/mean": 0.04887228459119797,
      "rewards/bleu_reward_func/std": 0.04280164837837219,
      "step": 844
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 417.0,
      "completions/mean_length": 241.84375,
      "completions/mean_terminated_length": 233.1290283203125,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "epoch": 0.676,
      "grad_norm": 2.695026397705078,
      "kl": 0.045989990234375,
      "learning_rate": 1e-06,
      "loss": 0.0799,
      "num_tokens": 11362234.0,
      "reward": 0.045725200325250626,
      "reward_std": 0.020237425342202187,
      "rewards/bleu_reward_func/mean": 0.045725200325250626,
      "rewards/bleu_reward_func/std": 0.02247374691069126,
      "step": 845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 336.0,
      "completions/mean_length": 337.78125,
      "completions/mean_terminated_length": 163.5625,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.6768,
      "grad_norm": 2.6178877353668213,
      "kl": 0.0535888671875,
      "learning_rate": 1e-06,
      "loss": -0.0213,
      "num_tokens": 11378195.0,
      "reward": 0.07340110093355179,
      "reward_std": 0.027613390237092972,
      "rewards/bleu_reward_func/mean": 0.07340110093355179,
      "rewards/bleu_reward_func/std": 0.06718737632036209,
      "step": 846
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 471.0,
      "completions/max_terminated_length": 471.0,
      "completions/mean_length": 315.125,
      "completions/mean_terminated_length": 315.125,
      "completions/min_length": 181.0,
      "completions/min_terminated_length": 181.0,
      "epoch": 0.6776,
      "grad_norm": 2.427767753601074,
      "kl": 0.04351806640625,
      "learning_rate": 1e-06,
      "loss": 0.0628,
      "num_tokens": 11390599.0,
      "reward": 0.11289885640144348,
      "reward_std": 0.016688670963048935,
      "rewards/bleu_reward_func/mean": 0.11289885640144348,
      "rewards/bleu_reward_func/std": 0.10308769345283508,
      "step": 847
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 357.65625,
      "completions/mean_terminated_length": 276.8095397949219,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.6784,
      "grad_norm": 2.0324172973632812,
      "kl": 0.04730224609375,
      "learning_rate": 1e-06,
      "loss": 0.0362,
      "num_tokens": 11404364.0,
      "reward": 0.09633086621761322,
      "reward_std": 0.05087493732571602,
      "rewards/bleu_reward_func/mean": 0.09633086621761322,
      "rewards/bleu_reward_func/std": 0.15251684188842773,
      "step": 848
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 483.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 173.59375,
      "completions/mean_terminated_length": 173.59375,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.6792,
      "grad_norm": 3.7329139709472656,
      "kl": 0.0618896484375,
      "learning_rate": 1e-06,
      "loss": 0.0479,
      "num_tokens": 11412271.0,
      "reward": 0.07129530608654022,
      "reward_std": 0.03777293860912323,
      "rewards/bleu_reward_func/mean": 0.07129530608654022,
      "rewards/bleu_reward_func/std": 0.07965229451656342,
      "step": 849
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 319.0,
      "completions/mean_length": 351.46875,
      "completions/mean_terminated_length": 190.9375,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.68,
      "grad_norm": 2.7004735469818115,
      "kl": 0.0643310546875,
      "learning_rate": 1e-06,
      "loss": 0.0879,
      "num_tokens": 11427262.0,
      "reward": 0.10036734491586685,
      "reward_std": 0.018841760233044624,
      "rewards/bleu_reward_func/mean": 0.10036734491586685,
      "rewards/bleu_reward_func/std": 0.1509210765361786,
      "step": 850
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 301.78125,
      "completions/mean_terminated_length": 231.70834350585938,
      "completions/min_length": 81.0,
      "completions/min_terminated_length": 81.0,
      "epoch": 0.6808,
      "grad_norm": 2.8924641609191895,
      "kl": 0.0609130859375,
      "learning_rate": 1e-06,
      "loss": -0.0404,
      "num_tokens": 11439663.0,
      "reward": 0.02212350070476532,
      "reward_std": 0.007656387519091368,
      "rewards/bleu_reward_func/mean": 0.02212350070476532,
      "rewards/bleu_reward_func/std": 0.017611265182495117,
      "step": 851
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 504.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 168.875,
      "completions/mean_terminated_length": 168.875,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.6816,
      "grad_norm": 4.338345050811768,
      "kl": 0.031097412109375,
      "learning_rate": 1e-06,
      "loss": -0.1581,
      "num_tokens": 11448411.0,
      "reward": 0.055831264704465866,
      "reward_std": 0.034439653158187866,
      "rewards/bleu_reward_func/mean": 0.055831264704465866,
      "rewards/bleu_reward_func/std": 0.03716801106929779,
      "step": 852
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 432.0,
      "completions/mean_length": 241.03125,
      "completions/mean_terminated_length": 213.0,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.6824,
      "grad_norm": 2.851404905319214,
      "kl": 0.0327606201171875,
      "learning_rate": 1e-06,
      "loss": 0.0359,
      "num_tokens": 11458036.0,
      "reward": 0.06256793439388275,
      "reward_std": 0.02849128656089306,
      "rewards/bleu_reward_func/mean": 0.06256793439388275,
      "rewards/bleu_reward_func/std": 0.0491851307451725,
      "step": 853
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 456.0,
      "completions/mean_length": 343.21875,
      "completions/mean_terminated_length": 266.5,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "epoch": 0.6832,
      "grad_norm": 2.9721109867095947,
      "kl": 0.03997802734375,
      "learning_rate": 1e-06,
      "loss": -0.0055,
      "num_tokens": 11471459.0,
      "reward": 0.03509419411420822,
      "reward_std": 0.007715485990047455,
      "rewards/bleu_reward_func/mean": 0.03509419411420822,
      "rewards/bleu_reward_func/std": 0.02039431221783161,
      "step": 854
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 383.65625,
      "completions/mean_terminated_length": 316.4285888671875,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "epoch": 0.684,
      "grad_norm": 2.379918336868286,
      "kl": 0.039520263671875,
      "learning_rate": 1e-06,
      "loss": 0.1156,
      "num_tokens": 11486432.0,
      "reward": 0.017331784591078758,
      "reward_std": 0.008001319132745266,
      "rewards/bleu_reward_func/mean": 0.017331784591078758,
      "rewards/bleu_reward_func/std": 0.009670063853263855,
      "step": 855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 480.375,
      "completions/mean_terminated_length": 309.6000061035156,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "epoch": 0.6848,
      "grad_norm": 2.0767323970794678,
      "kl": 0.05438232421875,
      "learning_rate": 1e-06,
      "loss": -0.0073,
      "num_tokens": 11507884.0,
      "reward": 0.030316852033138275,
      "reward_std": 0.01650041714310646,
      "rewards/bleu_reward_func/mean": 0.030316852033138275,
      "rewards/bleu_reward_func/std": 0.028973711654543877,
      "step": 856
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 365.0,
      "completions/mean_length": 260.28125,
      "completions/mean_terminated_length": 189.8000030517578,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "epoch": 0.6856,
      "grad_norm": 3.4007835388183594,
      "kl": 0.040740966796875,
      "learning_rate": 1e-06,
      "loss": -0.0284,
      "num_tokens": 11520933.0,
      "reward": 0.051799893379211426,
      "reward_std": 0.018662042915821075,
      "rewards/bleu_reward_func/mean": 0.051799893379211426,
      "rewards/bleu_reward_func/std": 0.03477845713496208,
      "step": 857
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 413.0,
      "completions/max_terminated_length": 413.0,
      "completions/mean_length": 205.84375,
      "completions/mean_terminated_length": 205.84375,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.6864,
      "grad_norm": 3.2630794048309326,
      "kl": 0.057403564453125,
      "learning_rate": 1e-06,
      "loss": 0.0015,
      "num_tokens": 11529584.0,
      "reward": 0.05795145779848099,
      "reward_std": 0.02359882742166519,
      "rewards/bleu_reward_func/mean": 0.05795145779848099,
      "rewards/bleu_reward_func/std": 0.046813130378723145,
      "step": 858
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 479.65625,
      "completions/mean_terminated_length": 397.0,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.6872,
      "grad_norm": 2.053889274597168,
      "kl": 0.036376953125,
      "learning_rate": 1e-06,
      "loss": 0.011,
      "num_tokens": 11547757.0,
      "reward": 0.03168204054236412,
      "reward_std": 0.007757972460240126,
      "rewards/bleu_reward_func/mean": 0.03168204054236412,
      "rewards/bleu_reward_func/std": 0.019959628582000732,
      "step": 859
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 429.65625,
      "completions/mean_terminated_length": 373.3157958984375,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "epoch": 0.688,
      "grad_norm": 2.033658504486084,
      "kl": 0.05596923828125,
      "learning_rate": 1e-06,
      "loss": -0.0154,
      "num_tokens": 11565514.0,
      "reward": 0.04394299536943436,
      "reward_std": 0.01149829663336277,
      "rewards/bleu_reward_func/mean": 0.04394299536943436,
      "rewards/bleu_reward_func/std": 0.034712888300418854,
      "step": 860
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 304.6875,
      "completions/mean_terminated_length": 180.3000030517578,
      "completions/min_length": 44.0,
      "completions/min_terminated_length": 44.0,
      "epoch": 0.6888,
      "grad_norm": 3.552960157394409,
      "kl": 0.054290771484375,
      "learning_rate": 1e-06,
      "loss": -0.0313,
      "num_tokens": 11579096.0,
      "reward": 0.06726164370775223,
      "reward_std": 0.022827234119176865,
      "rewards/bleu_reward_func/mean": 0.06726164370775223,
      "rewards/bleu_reward_func/std": 0.07642409950494766,
      "step": 861
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 438.09375,
      "completions/mean_terminated_length": 364.1875,
      "completions/min_length": 264.0,
      "completions/min_terminated_length": 264.0,
      "epoch": 0.6896,
      "grad_norm": 2.134580135345459,
      "kl": 0.04888916015625,
      "learning_rate": 1e-06,
      "loss": -0.0561,
      "num_tokens": 11596747.0,
      "reward": 0.04212401062250137,
      "reward_std": 0.010989276692271233,
      "rewards/bleu_reward_func/mean": 0.04212401062250137,
      "rewards/bleu_reward_func/std": 0.011919494718313217,
      "step": 862
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 446.0,
      "completions/mean_length": 346.4375,
      "completions/mean_terminated_length": 291.25,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "epoch": 0.6904,
      "grad_norm": 2.5982649326324463,
      "kl": 0.0455322265625,
      "learning_rate": 1e-06,
      "loss": -0.0463,
      "num_tokens": 11614025.0,
      "reward": 0.09341640025377274,
      "reward_std": 0.045992907136678696,
      "rewards/bleu_reward_func/mean": 0.09341640025377274,
      "rewards/bleu_reward_func/std": 0.08428861200809479,
      "step": 863
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 326.3125,
      "completions/mean_terminated_length": 291.9259338378906,
      "completions/min_length": 137.0,
      "completions/min_terminated_length": 137.0,
      "epoch": 0.6912,
      "grad_norm": 2.6331946849823,
      "kl": 0.034912109375,
      "learning_rate": 1e-06,
      "loss": -0.0157,
      "num_tokens": 11626787.0,
      "reward": 0.0679611936211586,
      "reward_std": 0.025558585301041603,
      "rewards/bleu_reward_func/mean": 0.0679611936211586,
      "rewards/bleu_reward_func/std": 0.041840873658657074,
      "step": 864
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 470.0,
      "completions/mean_length": 280.96875,
      "completions/mean_terminated_length": 227.6538543701172,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.692,
      "grad_norm": 6.058828830718994,
      "kl": 0.0814361572265625,
      "learning_rate": 1e-06,
      "loss": -0.0091,
      "num_tokens": 11640106.0,
      "reward": 0.1249103844165802,
      "reward_std": 0.021891392767429352,
      "rewards/bleu_reward_func/mean": 0.1249103844165802,
      "rewards/bleu_reward_func/std": 0.08676422387361526,
      "step": 865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 475.0,
      "completions/mean_length": 382.15625,
      "completions/mean_terminated_length": 281.1666564941406,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "epoch": 0.6928,
      "grad_norm": 2.0802950859069824,
      "kl": 0.0343017578125,
      "learning_rate": 1e-06,
      "loss": -0.072,
      "num_tokens": 11658143.0,
      "reward": 0.1035676896572113,
      "reward_std": 0.04491373896598816,
      "rewards/bleu_reward_func/mean": 0.1035676896572113,
      "rewards/bleu_reward_func/std": 0.08027082681655884,
      "step": 866
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 430.0,
      "completions/max_terminated_length": 430.0,
      "completions/mean_length": 247.9375,
      "completions/mean_terminated_length": 247.9375,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "epoch": 0.6936,
      "grad_norm": 2.882784605026245,
      "kl": 0.061248779296875,
      "learning_rate": 1e-06,
      "loss": 0.1637,
      "num_tokens": 11668869.0,
      "reward": 0.03542046248912811,
      "reward_std": 0.010823436081409454,
      "rewards/bleu_reward_func/mean": 0.03542046248912811,
      "rewards/bleu_reward_func/std": 0.02019406110048294,
      "step": 867
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 487.0,
      "completions/mean_length": 431.28125,
      "completions/mean_terminated_length": 253.6999969482422,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "epoch": 0.6944,
      "grad_norm": 2.128084182739258,
      "kl": 0.040283203125,
      "learning_rate": 1e-06,
      "loss": -0.0763,
      "num_tokens": 11685502.0,
      "reward": 0.02058091014623642,
      "reward_std": 0.005482015199959278,
      "rewards/bleu_reward_func/mean": 0.02058091014623642,
      "rewards/bleu_reward_func/std": 0.01129836868494749,
      "step": 868
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 265.78125,
      "completions/mean_terminated_length": 249.36668395996094,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.6952,
      "grad_norm": 8.264237403869629,
      "kl": 0.040863037109375,
      "learning_rate": 1e-06,
      "loss": 0.0395,
      "num_tokens": 11696007.0,
      "reward": 0.04947113245725632,
      "reward_std": 0.010559817776083946,
      "rewards/bleu_reward_func/mean": 0.04947113245725632,
      "rewards/bleu_reward_func/std": 0.020097067579627037,
      "step": 869
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 443.0,
      "completions/mean_length": 282.5625,
      "completions/mean_terminated_length": 192.78260803222656,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.696,
      "grad_norm": 4.883482456207275,
      "kl": 0.053009033203125,
      "learning_rate": 1e-06,
      "loss": -0.0779,
      "num_tokens": 11709769.0,
      "reward": 0.05990312993526459,
      "reward_std": 0.015603918582201004,
      "rewards/bleu_reward_func/mean": 0.05990312993526459,
      "rewards/bleu_reward_func/std": 0.0239902064204216,
      "step": 870
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 320.0,
      "completions/mean_length": 335.125,
      "completions/mean_terminated_length": 179.05882263183594,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "epoch": 0.6968,
      "grad_norm": 2.609515905380249,
      "kl": 0.04803466796875,
      "learning_rate": 1e-06,
      "loss": 0.0612,
      "num_tokens": 11723349.0,
      "reward": 0.03629864379763603,
      "reward_std": 0.01367709320038557,
      "rewards/bleu_reward_func/mean": 0.03629864379763603,
      "rewards/bleu_reward_func/std": 0.018606344237923622,
      "step": 871
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 268.46875,
      "completions/mean_terminated_length": 173.17391967773438,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.6976,
      "grad_norm": 6.626194000244141,
      "kl": 0.10137939453125,
      "learning_rate": 1e-06,
      "loss": -0.1281,
      "num_tokens": 11734708.0,
      "reward": 0.1171385869383812,
      "reward_std": 0.06305581331253052,
      "rewards/bleu_reward_func/mean": 0.1171385869383812,
      "rewards/bleu_reward_func/std": 0.18313851952552795,
      "step": 872
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 294.71875,
      "completions/mean_terminated_length": 222.2916717529297,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.6984,
      "grad_norm": 2.969879388809204,
      "kl": 0.05072021484375,
      "learning_rate": 1e-06,
      "loss": 0.0654,
      "num_tokens": 11746683.0,
      "reward": 0.09640492498874664,
      "reward_std": 0.04729197546839714,
      "rewards/bleu_reward_func/mean": 0.09640492498874664,
      "rewards/bleu_reward_func/std": 0.10373617708683014,
      "step": 873
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 476.0,
      "completions/max_terminated_length": 476.0,
      "completions/mean_length": 241.09375,
      "completions/mean_terminated_length": 241.09375,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.6992,
      "grad_norm": 4.217690944671631,
      "kl": 0.05010986328125,
      "learning_rate": 1e-06,
      "loss": -0.1491,
      "num_tokens": 11758790.0,
      "reward": 0.08641447871923447,
      "reward_std": 0.028802432119846344,
      "rewards/bleu_reward_func/mean": 0.08641447871923447,
      "rewards/bleu_reward_func/std": 0.04867434501647949,
      "step": 874
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 340.46875,
      "completions/mean_terminated_length": 273.34783935546875,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "epoch": 0.7,
      "grad_norm": 2.67362904548645,
      "kl": 0.039642333984375,
      "learning_rate": 1e-06,
      "loss": 0.0822,
      "num_tokens": 11772093.0,
      "reward": 0.05074232816696167,
      "reward_std": 0.023221492767333984,
      "rewards/bleu_reward_func/mean": 0.05074232816696167,
      "rewards/bleu_reward_func/std": 0.03728149086236954,
      "step": 875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 199.28125,
      "completions/mean_terminated_length": 178.433349609375,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.7008,
      "grad_norm": 14.30516529083252,
      "kl": 0.17584228515625,
      "learning_rate": 1e-06,
      "loss": 0.1759,
      "num_tokens": 11782310.0,
      "reward": 0.11937517672777176,
      "reward_std": 0.021323315799236298,
      "rewards/bleu_reward_func/mean": 0.11937517672777176,
      "rewards/bleu_reward_func/std": 0.15308451652526855,
      "step": 876
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 350.1875,
      "completions/mean_terminated_length": 304.8800048828125,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "epoch": 0.7016,
      "grad_norm": 2.588109254837036,
      "kl": 0.044189453125,
      "learning_rate": 1e-06,
      "loss": 0.0225,
      "num_tokens": 11796252.0,
      "reward": 0.05437985807657242,
      "reward_std": 0.017482522875070572,
      "rewards/bleu_reward_func/mean": 0.05437985807657242,
      "rewards/bleu_reward_func/std": 0.028886273503303528,
      "step": 877
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 501.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 344.3125,
      "completions/mean_terminated_length": 344.3125,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "epoch": 0.7024,
      "grad_norm": 3.1232547760009766,
      "kl": 0.033905029296875,
      "learning_rate": 1e-06,
      "loss": 0.0186,
      "num_tokens": 11810094.0,
      "reward": 0.07601115107536316,
      "reward_std": 0.022839991375803947,
      "rewards/bleu_reward_func/mean": 0.07601115107536316,
      "rewards/bleu_reward_func/std": 0.0427204929292202,
      "step": 878
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 382.4375,
      "completions/mean_terminated_length": 339.25,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "epoch": 0.7032,
      "grad_norm": 2.555143117904663,
      "kl": 0.05218505859375,
      "learning_rate": 1e-06,
      "loss": 0.0152,
      "num_tokens": 11825772.0,
      "reward": 0.02484015002846718,
      "reward_std": 0.009351451881229877,
      "rewards/bleu_reward_func/mean": 0.02484015002846718,
      "rewards/bleu_reward_func/std": 0.015805674716830254,
      "step": 879
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 474.625,
      "completions/mean_terminated_length": 412.3333435058594,
      "completions/min_length": 274.0,
      "completions/min_terminated_length": 274.0,
      "epoch": 0.704,
      "grad_norm": 2.2161693572998047,
      "kl": 0.0496826171875,
      "learning_rate": 1e-06,
      "loss": -0.0232,
      "num_tokens": 11843736.0,
      "reward": 0.03902929276227951,
      "reward_std": 0.009834162890911102,
      "rewards/bleu_reward_func/mean": 0.03902929276227951,
      "rewards/bleu_reward_func/std": 0.03208939731121063,
      "step": 880
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 486.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 255.03125,
      "completions/mean_terminated_length": 255.03125,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.7048,
      "grad_norm": 3.189509391784668,
      "kl": 0.038238525390625,
      "learning_rate": 1e-06,
      "loss": -0.2021,
      "num_tokens": 11853825.0,
      "reward": 0.06721623241901398,
      "reward_std": 0.04335642606019974,
      "rewards/bleu_reward_func/mean": 0.06721623241901398,
      "rewards/bleu_reward_func/std": 0.06198061630129814,
      "step": 881
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 335.375,
      "completions/mean_terminated_length": 229.40000915527344,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.7056,
      "grad_norm": 2.6181299686431885,
      "kl": 0.030914306640625,
      "learning_rate": 1e-06,
      "loss": 0.0489,
      "num_tokens": 11869869.0,
      "reward": 0.036412715911865234,
      "reward_std": 0.017522014677524567,
      "rewards/bleu_reward_func/mean": 0.036412715911865234,
      "rewards/bleu_reward_func/std": 0.02936912514269352,
      "step": 882
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 30.0,
      "completions/mean_length": 389.875,
      "completions/mean_terminated_length": 23.5,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.7064,
      "grad_norm": 6.321810245513916,
      "kl": 0.11798095703125,
      "learning_rate": 1e-06,
      "loss": -0.0217,
      "num_tokens": 11885249.0,
      "reward": 0.039120785892009735,
      "reward_std": 0.005084656178951263,
      "rewards/bleu_reward_func/mean": 0.039120785892009735,
      "rewards/bleu_reward_func/std": 0.021427100524306297,
      "step": 883
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 466.0,
      "completions/mean_length": 335.0,
      "completions/mean_terminated_length": 265.7391357421875,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "epoch": 0.7072,
      "grad_norm": 3.0909547805786133,
      "kl": 0.0372314453125,
      "learning_rate": 1e-06,
      "loss": -0.1258,
      "num_tokens": 11899921.0,
      "reward": 0.06846681982278824,
      "reward_std": 0.024172725155949593,
      "rewards/bleu_reward_func/mean": 0.06846681982278824,
      "rewards/bleu_reward_func/std": 0.04641611874103546,
      "step": 884
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 479.0,
      "completions/mean_length": 345.0,
      "completions/mean_terminated_length": 298.239990234375,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "epoch": 0.708,
      "grad_norm": 2.417402744293213,
      "kl": 0.027801513671875,
      "learning_rate": 1e-06,
      "loss": 0.1077,
      "num_tokens": 11914009.0,
      "reward": 0.0673043429851532,
      "reward_std": 0.031290117651224136,
      "rewards/bleu_reward_func/mean": 0.0673043429851532,
      "rewards/bleu_reward_func/std": 0.06805533170700073,
      "step": 885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 391.0,
      "completions/mean_length": 244.4375,
      "completions/mean_terminated_length": 155.25,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.7088,
      "grad_norm": 3.7891881465911865,
      "kl": 0.030517578125,
      "learning_rate": 1e-06,
      "loss": 0.0371,
      "num_tokens": 11924071.0,
      "reward": 0.09851931035518646,
      "reward_std": 0.02925381436944008,
      "rewards/bleu_reward_func/mean": 0.09851931035518646,
      "rewards/bleu_reward_func/std": 0.061319418251514435,
      "step": 886
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 462.0,
      "completions/mean_length": 431.71875,
      "completions/mean_terminated_length": 278.4545593261719,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "epoch": 0.7096,
      "grad_norm": 2.5203967094421387,
      "kl": 0.043731689453125,
      "learning_rate": 1e-06,
      "loss": -0.0583,
      "num_tokens": 11945294.0,
      "reward": 0.07713477313518524,
      "reward_std": 0.015437297523021698,
      "rewards/bleu_reward_func/mean": 0.07713477313518524,
      "rewards/bleu_reward_func/std": 0.035572707653045654,
      "step": 887
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 426.0,
      "completions/mean_length": 280.3125,
      "completions/mean_terminated_length": 203.08334350585938,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "epoch": 0.7104,
      "grad_norm": 3.726912021636963,
      "kl": 0.0765380859375,
      "learning_rate": 1e-06,
      "loss": 0.1117,
      "num_tokens": 11957336.0,
      "reward": 0.06727063655853271,
      "reward_std": 0.029535435140132904,
      "rewards/bleu_reward_func/mean": 0.06727063655853271,
      "rewards/bleu_reward_func/std": 0.03691576421260834,
      "step": 888
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 244.625,
      "completions/mean_terminated_length": 236.0,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.7112,
      "grad_norm": 3.1454975605010986,
      "kl": 0.0346832275390625,
      "learning_rate": 1e-06,
      "loss": -0.1414,
      "num_tokens": 11967484.0,
      "reward": 0.05579820275306702,
      "reward_std": 0.0413711853325367,
      "rewards/bleu_reward_func/mean": 0.05579820275306702,
      "rewards/bleu_reward_func/std": 0.058547936379909515,
      "step": 889
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 401.40625,
      "completions/mean_terminated_length": 358.13043212890625,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "epoch": 0.712,
      "grad_norm": 2.106522798538208,
      "kl": 0.04827880859375,
      "learning_rate": 1e-06,
      "loss": 0.1188,
      "num_tokens": 11984513.0,
      "reward": 0.0451212078332901,
      "reward_std": 0.021773334592580795,
      "rewards/bleu_reward_func/mean": 0.0451212078332901,
      "rewards/bleu_reward_func/std": 0.028573498129844666,
      "step": 890
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 408.875,
      "completions/mean_terminated_length": 328.6666564941406,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.7128,
      "grad_norm": 2.2036218643188477,
      "kl": 0.05535888671875,
      "learning_rate": 1e-06,
      "loss": 0.0686,
      "num_tokens": 12000717.0,
      "reward": 0.0747871994972229,
      "reward_std": 0.020796824246644974,
      "rewards/bleu_reward_func/mean": 0.0747871994972229,
      "rewards/bleu_reward_func/std": 0.039392877370119095,
      "step": 891
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 346.125,
      "completions/mean_terminated_length": 281.2174072265625,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.7136,
      "grad_norm": 2.303058624267578,
      "kl": 0.04833984375,
      "learning_rate": 1e-06,
      "loss": -0.0624,
      "num_tokens": 12014489.0,
      "reward": 0.08780650794506073,
      "reward_std": 0.030875790864229202,
      "rewards/bleu_reward_func/mean": 0.08780650794506073,
      "rewards/bleu_reward_func/std": 0.06451728194952011,
      "step": 892
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 241.125,
      "completions/mean_terminated_length": 165.27999877929688,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.7144,
      "grad_norm": 3.9183602333068848,
      "kl": 0.0474853515625,
      "learning_rate": 1e-06,
      "loss": 0.083,
      "num_tokens": 12025885.0,
      "reward": 0.0884522795677185,
      "reward_std": 0.03579477593302727,
      "rewards/bleu_reward_func/mean": 0.0884522795677185,
      "rewards/bleu_reward_func/std": 0.09096165746450424,
      "step": 893
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 344.59375,
      "completions/mean_terminated_length": 279.08697509765625,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.7152,
      "grad_norm": 2.937997579574585,
      "kl": 0.0556640625,
      "learning_rate": 1e-06,
      "loss": 0.2263,
      "num_tokens": 12041160.0,
      "reward": 0.035102441906929016,
      "reward_std": 0.016423923894762993,
      "rewards/bleu_reward_func/mean": 0.035102441906929016,
      "rewards/bleu_reward_func/std": 0.03288643807172775,
      "step": 894
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 403.0,
      "completions/mean_terminated_length": 387.4285888671875,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.716,
      "grad_norm": 2.2048861980438232,
      "kl": 0.03814697265625,
      "learning_rate": 1e-06,
      "loss": -0.0469,
      "num_tokens": 12056088.0,
      "reward": 0.02945767343044281,
      "reward_std": 0.014292486011981964,
      "rewards/bleu_reward_func/mean": 0.02945767343044281,
      "rewards/bleu_reward_func/std": 0.015402048826217651,
      "step": 895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 236.46875,
      "completions/mean_terminated_length": 207.96551513671875,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.7168,
      "grad_norm": 3.6095573902130127,
      "kl": 0.0482177734375,
      "learning_rate": 1e-06,
      "loss": 0.1104,
      "num_tokens": 12065823.0,
      "reward": 0.04199256747961044,
      "reward_std": 0.015113498084247112,
      "rewards/bleu_reward_func/mean": 0.04199256747961044,
      "rewards/bleu_reward_func/std": 0.031457021832466125,
      "step": 896
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 470.0,
      "completions/mean_length": 301.8125,
      "completions/mean_terminated_length": 242.95999145507812,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.7176,
      "grad_norm": 4.166423320770264,
      "kl": 0.056304931640625,
      "learning_rate": 1e-06,
      "loss": -0.1865,
      "num_tokens": 12078721.0,
      "reward": 0.06818559765815735,
      "reward_std": 0.049522291868925095,
      "rewards/bleu_reward_func/mean": 0.06818559765815735,
      "rewards/bleu_reward_func/std": 0.11108224838972092,
      "step": 897
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 483.0,
      "completions/mean_length": 318.9375,
      "completions/mean_terminated_length": 254.58334350585938,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "epoch": 0.7184,
      "grad_norm": 3.5479068756103516,
      "kl": 0.06591796875,
      "learning_rate": 1e-06,
      "loss": 0.0512,
      "num_tokens": 12094239.0,
      "reward": 0.16995030641555786,
      "reward_std": 0.02772948332130909,
      "rewards/bleu_reward_func/mean": 0.16995030641555786,
      "rewards/bleu_reward_func/std": 0.19171921908855438,
      "step": 898
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 189.34375,
      "completions/mean_terminated_length": 81.79167175292969,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.7192,
      "grad_norm": 7.562760829925537,
      "kl": 0.04559326171875,
      "learning_rate": 1e-06,
      "loss": -0.0977,
      "num_tokens": 12103242.0,
      "reward": 0.06262214481830597,
      "reward_std": 0.01612667180597782,
      "rewards/bleu_reward_func/mean": 0.06262214481830597,
      "rewards/bleu_reward_func/std": 0.051322340965270996,
      "step": 899
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 288.5,
      "completions/mean_terminated_length": 265.3793029785156,
      "completions/min_length": 79.0,
      "completions/min_terminated_length": 79.0,
      "epoch": 0.72,
      "grad_norm": 3.002432107925415,
      "kl": 0.036834716796875,
      "learning_rate": 1e-06,
      "loss": -0.081,
      "num_tokens": 12114562.0,
      "reward": 0.08370120078325272,
      "reward_std": 0.027285337448120117,
      "rewards/bleu_reward_func/mean": 0.08370120078325272,
      "rewards/bleu_reward_func/std": 0.06422236561775208,
      "step": 900
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 413.4375,
      "completions/mean_terminated_length": 346.0,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.7208,
      "grad_norm": 1.9330230951309204,
      "kl": 0.041168212890625,
      "learning_rate": 1e-06,
      "loss": 0.076,
      "num_tokens": 12131424.0,
      "reward": 0.07572037726640701,
      "reward_std": 0.026293717324733734,
      "rewards/bleu_reward_func/mean": 0.07572037726640701,
      "rewards/bleu_reward_func/std": 0.055959172546863556,
      "step": 901
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 323.6875,
      "completions/mean_terminated_length": 250.0,
      "completions/min_length": 82.0,
      "completions/min_terminated_length": 82.0,
      "epoch": 0.7216,
      "grad_norm": 2.5331993103027344,
      "kl": 0.06146240234375,
      "learning_rate": 1e-06,
      "loss": 0.0994,
      "num_tokens": 12145606.0,
      "reward": 0.02315894514322281,
      "reward_std": 0.015285233967006207,
      "rewards/bleu_reward_func/mean": 0.02315894514322281,
      "rewards/bleu_reward_func/std": 0.024109287187457085,
      "step": 902
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 473.0,
      "completions/mean_length": 440.03125,
      "completions/mean_terminated_length": 368.0625,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 0.7224,
      "grad_norm": 2.0079493522644043,
      "kl": 0.05462646484375,
      "learning_rate": 1e-06,
      "loss": 0.0609,
      "num_tokens": 12162431.0,
      "reward": 0.028865192085504532,
      "reward_std": 0.00568732712417841,
      "rewards/bleu_reward_func/mean": 0.028865192085504532,
      "rewards/bleu_reward_func/std": 0.019452739506959915,
      "step": 903
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 274.5,
      "completions/mean_terminated_length": 258.66668701171875,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "epoch": 0.7232,
      "grad_norm": 3.427934169769287,
      "kl": 0.059814453125,
      "learning_rate": 1e-06,
      "loss": 0.0583,
      "num_tokens": 12173223.0,
      "reward": 0.08930553495883942,
      "reward_std": 0.0346166156232357,
      "rewards/bleu_reward_func/mean": 0.08930553495883942,
      "rewards/bleu_reward_func/std": 0.06352285295724869,
      "step": 904
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 412.8125,
      "completions/mean_terminated_length": 325.29412841796875,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "epoch": 0.724,
      "grad_norm": 2.114323854446411,
      "kl": 0.04071044921875,
      "learning_rate": 1e-06,
      "loss": 0.0525,
      "num_tokens": 12189569.0,
      "reward": 0.0793527215719223,
      "reward_std": 0.0372716560959816,
      "rewards/bleu_reward_func/mean": 0.0793527215719223,
      "rewards/bleu_reward_func/std": 0.08270096778869629,
      "step": 905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 493.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 269.40625,
      "completions/mean_terminated_length": 269.40625,
      "completions/min_length": 91.0,
      "completions/min_terminated_length": 91.0,
      "epoch": 0.7248,
      "grad_norm": 3.237521171569824,
      "kl": 0.05322265625,
      "learning_rate": 1e-06,
      "loss": 0.1096,
      "num_tokens": 12200398.0,
      "reward": 0.03743357956409454,
      "reward_std": 0.011607276275753975,
      "rewards/bleu_reward_func/mean": 0.03743357956409454,
      "rewards/bleu_reward_func/std": 0.013176437467336655,
      "step": 906
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 454.0,
      "completions/mean_length": 323.46875,
      "completions/mean_terminated_length": 260.625,
      "completions/min_length": 89.0,
      "completions/min_terminated_length": 89.0,
      "epoch": 0.7256,
      "grad_norm": 3.23765230178833,
      "kl": 0.05694580078125,
      "learning_rate": 1e-06,
      "loss": -0.0564,
      "num_tokens": 12214461.0,
      "reward": 0.023959729820489883,
      "reward_std": 0.009282315149903297,
      "rewards/bleu_reward_func/mean": 0.023959729820489883,
      "rewards/bleu_reward_func/std": 0.013883906416594982,
      "step": 907
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 438.0,
      "completions/mean_length": 469.96875,
      "completions/mean_terminated_length": 343.875,
      "completions/min_length": 278.0,
      "completions/min_terminated_length": 278.0,
      "epoch": 0.7264,
      "grad_norm": 2.196124315261841,
      "kl": 0.041290283203125,
      "learning_rate": 1e-06,
      "loss": 0.0084,
      "num_tokens": 12233852.0,
      "reward": 0.0776321142911911,
      "reward_std": 0.021009789779782295,
      "rewards/bleu_reward_func/mean": 0.0776321142911911,
      "rewards/bleu_reward_func/std": 0.0949036255478859,
      "step": 908
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 306.34375,
      "completions/mean_terminated_length": 237.7916717529297,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "epoch": 0.7272,
      "grad_norm": 2.8371667861938477,
      "kl": 0.06854248046875,
      "learning_rate": 1e-06,
      "loss": -0.1164,
      "num_tokens": 12246503.0,
      "reward": 0.03938760608434677,
      "reward_std": 0.012361581437289715,
      "rewards/bleu_reward_func/mean": 0.03938760608434677,
      "rewards/bleu_reward_func/std": 0.028173107653856277,
      "step": 909
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 270.03125,
      "completions/mean_terminated_length": 245.0,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.728,
      "grad_norm": 6.242187023162842,
      "kl": 0.067474365234375,
      "learning_rate": 1e-06,
      "loss": 0.1375,
      "num_tokens": 12258464.0,
      "reward": 0.03787752240896225,
      "reward_std": 0.0169361662119627,
      "rewards/bleu_reward_func/mean": 0.03787752240896225,
      "rewards/bleu_reward_func/std": 0.03063538856804371,
      "step": 910
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 357.0,
      "completions/max_terminated_length": 357.0,
      "completions/mean_length": 177.0,
      "completions/mean_terminated_length": 177.0,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.7288,
      "grad_norm": 3.455866575241089,
      "kl": 0.04833984375,
      "learning_rate": 1e-06,
      "loss": -0.3367,
      "num_tokens": 12267968.0,
      "reward": 0.14032083749771118,
      "reward_std": 0.15820267796516418,
      "rewards/bleu_reward_func/mean": 0.14032083749771118,
      "rewards/bleu_reward_func/std": 0.23967435956001282,
      "step": 911
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 472.53125,
      "completions/mean_terminated_length": 385.70001220703125,
      "completions/min_length": 266.0,
      "completions/min_terminated_length": 266.0,
      "epoch": 0.7296,
      "grad_norm": 2.2268266677856445,
      "kl": 0.04461669921875,
      "learning_rate": 1e-06,
      "loss": 0.0363,
      "num_tokens": 12286913.0,
      "reward": 0.03583249822258949,
      "reward_std": 0.014493023976683617,
      "rewards/bleu_reward_func/mean": 0.03583249822258949,
      "rewards/bleu_reward_func/std": 0.02188328467309475,
      "step": 912
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 374.4375,
      "completions/mean_terminated_length": 320.60870361328125,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 150.0,
      "epoch": 0.7304,
      "grad_norm": 2.3796823024749756,
      "kl": 0.042236328125,
      "learning_rate": 1e-06,
      "loss": -0.0413,
      "num_tokens": 12301247.0,
      "reward": 0.06036565452814102,
      "reward_std": 0.017455367371439934,
      "rewards/bleu_reward_func/mean": 0.06036565452814102,
      "rewards/bleu_reward_func/std": 0.06808813661336899,
      "step": 913
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 325.90625,
      "completions/mean_terminated_length": 319.9032287597656,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.7312,
      "grad_norm": 2.5203535556793213,
      "kl": 0.03826904296875,
      "learning_rate": 1e-06,
      "loss": -0.0426,
      "num_tokens": 12314348.0,
      "reward": 0.037568479776382446,
      "reward_std": 0.017504602670669556,
      "rewards/bleu_reward_func/mean": 0.037568479776382446,
      "rewards/bleu_reward_func/std": 0.025791103020310402,
      "step": 914
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 222.15625,
      "completions/mean_terminated_length": 212.8064422607422,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.732,
      "grad_norm": 4.8649139404296875,
      "kl": 0.05670166015625,
      "learning_rate": 1e-06,
      "loss": -0.0406,
      "num_tokens": 12324497.0,
      "reward": 0.047999829053878784,
      "reward_std": 0.024475431069731712,
      "rewards/bleu_reward_func/mean": 0.047999829053878784,
      "rewards/bleu_reward_func/std": 0.031057659536600113,
      "step": 915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 436.0,
      "completions/mean_length": 320.84375,
      "completions/mean_terminated_length": 257.125,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "epoch": 0.7328,
      "grad_norm": 2.876401662826538,
      "kl": 0.03778076171875,
      "learning_rate": 1e-06,
      "loss": -0.0378,
      "num_tokens": 12337788.0,
      "reward": 0.04696403443813324,
      "reward_std": 0.02812850847840309,
      "rewards/bleu_reward_func/mean": 0.04696403443813324,
      "rewards/bleu_reward_func/std": 0.06415504217147827,
      "step": 916
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 407.03125,
      "completions/mean_terminated_length": 272.0714416503906,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.7336,
      "grad_norm": 2.623408079147339,
      "kl": 0.0589599609375,
      "learning_rate": 1e-06,
      "loss": -0.0659,
      "num_tokens": 12354373.0,
      "reward": 0.056746140122413635,
      "reward_std": 0.01886637695133686,
      "rewards/bleu_reward_func/mean": 0.056746140122413635,
      "rewards/bleu_reward_func/std": 0.03354233503341675,
      "step": 917
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 257.0,
      "completions/mean_length": 338.71875,
      "completions/mean_terminated_length": 165.4375,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "epoch": 0.7344,
      "grad_norm": 2.7854504585266113,
      "kl": 0.0494384765625,
      "learning_rate": 1e-06,
      "loss": 0.0484,
      "num_tokens": 12368292.0,
      "reward": 0.04390311986207962,
      "reward_std": 0.016904659569263458,
      "rewards/bleu_reward_func/mean": 0.04390311986207962,
      "rewards/bleu_reward_func/std": 0.039624132215976715,
      "step": 918
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 415.0,
      "completions/mean_length": 186.65625,
      "completions/mean_terminated_length": 164.9666748046875,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.7352,
      "grad_norm": 6.439005374908447,
      "kl": 0.073272705078125,
      "learning_rate": 1e-06,
      "loss": -0.0798,
      "num_tokens": 12377241.0,
      "reward": 0.05543770641088486,
      "reward_std": 0.02084982395172119,
      "rewards/bleu_reward_func/mean": 0.05543770641088486,
      "rewards/bleu_reward_func/std": 0.03484691306948662,
      "step": 919
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 357.15625,
      "completions/mean_terminated_length": 220.5294189453125,
      "completions/min_length": 89.0,
      "completions/min_terminated_length": 89.0,
      "epoch": 0.736,
      "grad_norm": 2.5606613159179688,
      "kl": 0.04241943359375,
      "learning_rate": 1e-06,
      "loss": -0.0456,
      "num_tokens": 12391358.0,
      "reward": 0.03371516987681389,
      "reward_std": 0.011735515668988228,
      "rewards/bleu_reward_func/mean": 0.03371516987681389,
      "rewards/bleu_reward_func/std": 0.018063481897115707,
      "step": 920
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 294.625,
      "completions/mean_terminated_length": 287.6128845214844,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "epoch": 0.7368,
      "grad_norm": 2.889098644256592,
      "kl": 0.041656494140625,
      "learning_rate": 1e-06,
      "loss": 0.1173,
      "num_tokens": 12402634.0,
      "reward": 0.05923088267445564,
      "reward_std": 0.029831603169441223,
      "rewards/bleu_reward_func/mean": 0.05923088267445564,
      "rewards/bleu_reward_func/std": 0.04481290653347969,
      "step": 921
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 241.3125,
      "completions/mean_terminated_length": 191.1851806640625,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.7376,
      "grad_norm": 4.053657531738281,
      "kl": 0.046722412109375,
      "learning_rate": 1e-06,
      "loss": -0.046,
      "num_tokens": 12412556.0,
      "reward": 0.06542409956455231,
      "reward_std": 0.025028303265571594,
      "rewards/bleu_reward_func/mean": 0.06542409956455231,
      "rewards/bleu_reward_func/std": 0.05025548115372658,
      "step": 922
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 451.0,
      "completions/mean_length": 329.5,
      "completions/mean_terminated_length": 268.66668701171875,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "epoch": 0.7384,
      "grad_norm": 2.5536530017852783,
      "kl": 0.0421142578125,
      "learning_rate": 1e-06,
      "loss": -0.0712,
      "num_tokens": 12425148.0,
      "reward": 0.04631096124649048,
      "reward_std": 0.024599246680736542,
      "rewards/bleu_reward_func/mean": 0.04631096124649048,
      "rewards/bleu_reward_func/std": 0.04204652085900307,
      "step": 923
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 350.40625,
      "completions/mean_terminated_length": 313.1153869628906,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.7392,
      "grad_norm": 3.109853982925415,
      "kl": 0.04412841796875,
      "learning_rate": 1e-06,
      "loss": -0.0658,
      "num_tokens": 12438361.0,
      "reward": 0.032508477568626404,
      "reward_std": 0.016699161380529404,
      "rewards/bleu_reward_func/mean": 0.032508477568626404,
      "rewards/bleu_reward_func/std": 0.028412526473402977,
      "step": 924
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 275.59375,
      "completions/mean_terminated_length": 267.9677429199219,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 73.0,
      "epoch": 0.74,
      "grad_norm": 2.6314737796783447,
      "kl": 0.06317138671875,
      "learning_rate": 1e-06,
      "loss": 0.1181,
      "num_tokens": 12449460.0,
      "reward": 0.03400625288486481,
      "reward_std": 0.011289702728390694,
      "rewards/bleu_reward_func/mean": 0.03400625288486481,
      "rewards/bleu_reward_func/std": 0.01916448399424553,
      "step": 925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 310.5,
      "completions/mean_terminated_length": 231.6521759033203,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "epoch": 0.7408,
      "grad_norm": 2.9635672569274902,
      "kl": 0.030517578125,
      "learning_rate": 1e-06,
      "loss": 0.1214,
      "num_tokens": 12461476.0,
      "reward": 0.07918738573789597,
      "reward_std": 0.06687770783901215,
      "rewards/bleu_reward_func/mean": 0.07918738573789597,
      "rewards/bleu_reward_func/std": 0.1105826124548912,
      "step": 926
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 364.875,
      "completions/mean_terminated_length": 198.1333465576172,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.7416,
      "grad_norm": 5.0117106437683105,
      "kl": 0.0650634765625,
      "learning_rate": 1e-06,
      "loss": -0.0201,
      "num_tokens": 12478872.0,
      "reward": 0.10731503367424011,
      "reward_std": 0.03286542743444443,
      "rewards/bleu_reward_func/mean": 0.10731503367424011,
      "rewards/bleu_reward_func/std": 0.059237416833639145,
      "step": 927
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 285.75,
      "completions/mean_terminated_length": 262.3448181152344,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.7424,
      "grad_norm": 3.3980448246002197,
      "kl": 0.042205810546875,
      "learning_rate": 1e-06,
      "loss": -0.025,
      "num_tokens": 12490048.0,
      "reward": 0.0708259865641594,
      "reward_std": 0.02675773948431015,
      "rewards/bleu_reward_func/mean": 0.0708259865641594,
      "rewards/bleu_reward_func/std": 0.04802871122956276,
      "step": 928
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 284.53125,
      "completions/mean_terminated_length": 252.0357208251953,
      "completions/min_length": 70.0,
      "completions/min_terminated_length": 70.0,
      "epoch": 0.7432,
      "grad_norm": 2.9506585597991943,
      "kl": 0.0633544921875,
      "learning_rate": 1e-06,
      "loss": -0.0753,
      "num_tokens": 12501129.0,
      "reward": 0.10486802458763123,
      "reward_std": 0.022330686450004578,
      "rewards/bleu_reward_func/mean": 0.10486802458763123,
      "rewards/bleu_reward_func/std": 0.090861976146698,
      "step": 929
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 341.9375,
      "completions/mean_terminated_length": 252.85714721679688,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "epoch": 0.744,
      "grad_norm": 2.826270580291748,
      "kl": 0.04290771484375,
      "learning_rate": 1e-06,
      "loss": -0.0349,
      "num_tokens": 12516175.0,
      "reward": 0.06202811375260353,
      "reward_std": 0.030784565955400467,
      "rewards/bleu_reward_func/mean": 0.06202811375260353,
      "rewards/bleu_reward_func/std": 0.0687461644411087,
      "step": 930
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 242.90625,
      "completions/mean_terminated_length": 215.0689697265625,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.7448,
      "grad_norm": 4.026130676269531,
      "kl": 0.0509033203125,
      "learning_rate": 1e-06,
      "loss": -0.1079,
      "num_tokens": 12528500.0,
      "reward": 0.0759795531630516,
      "reward_std": 0.07113184779882431,
      "rewards/bleu_reward_func/mean": 0.0759795531630516,
      "rewards/bleu_reward_func/std": 0.14475159347057343,
      "step": 931
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 480.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 188.21875,
      "completions/mean_terminated_length": 188.21875,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.7456,
      "grad_norm": 6.228883266448975,
      "kl": 0.0828857421875,
      "learning_rate": 1e-06,
      "loss": -0.0074,
      "num_tokens": 12538371.0,
      "reward": 0.048906613141298294,
      "reward_std": 0.01954766921699047,
      "rewards/bleu_reward_func/mean": 0.048906613141298294,
      "rewards/bleu_reward_func/std": 0.059886980801820755,
      "step": 932
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 320.5,
      "completions/mean_terminated_length": 266.8800048828125,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 73.0,
      "epoch": 0.7464,
      "grad_norm": 2.9069249629974365,
      "kl": 0.0599365234375,
      "learning_rate": 1e-06,
      "loss": 0.0547,
      "num_tokens": 12550459.0,
      "reward": 0.027797240763902664,
      "reward_std": 0.00949520617723465,
      "rewards/bleu_reward_func/mean": 0.027797240763902664,
      "rewards/bleu_reward_func/std": 0.023359699174761772,
      "step": 933
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 280.5625,
      "completions/mean_terminated_length": 273.0967712402344,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.7472,
      "grad_norm": 3.0222465991973877,
      "kl": 0.09490966796875,
      "learning_rate": 1e-06,
      "loss": 0.0435,
      "num_tokens": 12561077.0,
      "reward": 0.05467883497476578,
      "reward_std": 0.01736966334283352,
      "rewards/bleu_reward_func/mean": 0.05467883497476578,
      "rewards/bleu_reward_func/std": 0.05791114643216133,
      "step": 934
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 473.59375,
      "completions/mean_terminated_length": 389.1000061035156,
      "completions/min_length": 242.0,
      "completions/min_terminated_length": 242.0,
      "epoch": 0.748,
      "grad_norm": 2.246959924697876,
      "kl": 0.0468292236328125,
      "learning_rate": 1e-06,
      "loss": 0.0044,
      "num_tokens": 12579592.0,
      "reward": 0.059155356138944626,
      "reward_std": 0.012122802436351776,
      "rewards/bleu_reward_func/mean": 0.059155356138944626,
      "rewards/bleu_reward_func/std": 0.03615579754114151,
      "step": 935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 443.0,
      "completions/max_terminated_length": 443.0,
      "completions/mean_length": 175.96875,
      "completions/mean_terminated_length": 175.96875,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.7488,
      "grad_norm": 3.9639289379119873,
      "kl": 0.05889892578125,
      "learning_rate": 1e-06,
      "loss": 0.1749,
      "num_tokens": 12587911.0,
      "reward": 0.04553116112947464,
      "reward_std": 0.027312763035297394,
      "rewards/bleu_reward_func/mean": 0.04553116112947464,
      "rewards/bleu_reward_func/std": 0.045869771391153336,
      "step": 936
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 298.21875,
      "completions/mean_terminated_length": 291.32257080078125,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.7496,
      "grad_norm": 3.3128387928009033,
      "kl": 0.05743408203125,
      "learning_rate": 1e-06,
      "loss": 0.048,
      "num_tokens": 12601302.0,
      "reward": 0.02700314298272133,
      "reward_std": 0.011166905984282494,
      "rewards/bleu_reward_func/mean": 0.02700314298272133,
      "rewards/bleu_reward_func/std": 0.015710683539509773,
      "step": 937
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 286.0625,
      "completions/mean_terminated_length": 210.75,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.7504,
      "grad_norm": 5.0102081298828125,
      "kl": 0.067474365234375,
      "learning_rate": 1e-06,
      "loss": 0.0025,
      "num_tokens": 12615776.0,
      "reward": 0.052227430045604706,
      "reward_std": 0.014771172776818275,
      "rewards/bleu_reward_func/mean": 0.052227430045604706,
      "rewards/bleu_reward_func/std": 0.019430244341492653,
      "step": 938
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 404.21875,
      "completions/mean_terminated_length": 347.76190185546875,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.7512,
      "grad_norm": 2.042522430419922,
      "kl": 0.04541015625,
      "learning_rate": 1e-06,
      "loss": 0.0251,
      "num_tokens": 12631239.0,
      "reward": 0.05530402064323425,
      "reward_std": 0.019676920026540756,
      "rewards/bleu_reward_func/mean": 0.05530402064323425,
      "rewards/bleu_reward_func/std": 0.027803683653473854,
      "step": 939
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 295.5,
      "completions/mean_terminated_length": 104.47058868408203,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.752,
      "grad_norm": 4.662267208099365,
      "kl": 0.09735107421875,
      "learning_rate": 1e-06,
      "loss": -0.0136,
      "num_tokens": 12643503.0,
      "reward": 0.05965786427259445,
      "reward_std": 0.027585718780755997,
      "rewards/bleu_reward_func/mean": 0.05965786427259445,
      "rewards/bleu_reward_func/std": 0.0370444729924202,
      "step": 940
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 303.625,
      "completions/mean_terminated_length": 273.8571472167969,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "epoch": 0.7528,
      "grad_norm": 2.7425622940063477,
      "kl": 0.05535888671875,
      "learning_rate": 1e-06,
      "loss": 0.0114,
      "num_tokens": 12655531.0,
      "reward": 0.04901716113090515,
      "reward_std": 0.00994904711842537,
      "rewards/bleu_reward_func/mean": 0.04901716113090515,
      "rewards/bleu_reward_func/std": 0.040510956197977066,
      "step": 941
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 348.3125,
      "completions/mean_terminated_length": 262.5714416503906,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "epoch": 0.7536,
      "grad_norm": 2.3559534549713135,
      "kl": 0.04656982421875,
      "learning_rate": 1e-06,
      "loss": -0.0641,
      "num_tokens": 12670781.0,
      "reward": 0.03677675127983093,
      "reward_std": 0.009964315220713615,
      "rewards/bleu_reward_func/mean": 0.03677675127983093,
      "rewards/bleu_reward_func/std": 0.019602550193667412,
      "step": 942
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 412.0,
      "completions/mean_length": 229.03125,
      "completions/mean_terminated_length": 163.73077392578125,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.7544,
      "grad_norm": 3.2343432903289795,
      "kl": 0.071380615234375,
      "learning_rate": 1e-06,
      "loss": -0.0938,
      "num_tokens": 12683318.0,
      "reward": 0.03962566331028938,
      "reward_std": 0.011244509369134903,
      "rewards/bleu_reward_func/mean": 0.03962566331028938,
      "rewards/bleu_reward_func/std": 0.01717083901166916,
      "step": 943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 476.0,
      "completions/mean_length": 315.84375,
      "completions/mean_terminated_length": 142.76470947265625,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.7552,
      "grad_norm": 3.105497360229492,
      "kl": 0.10089111328125,
      "learning_rate": 1e-06,
      "loss": 0.0565,
      "num_tokens": 12696873.0,
      "reward": 0.046081312000751495,
      "reward_std": 0.007257817313075066,
      "rewards/bleu_reward_func/mean": 0.046081312000751495,
      "rewards/bleu_reward_func/std": 0.023574350401759148,
      "step": 944
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 460.375,
      "completions/mean_terminated_length": 394.0000305175781,
      "completions/min_length": 304.0,
      "completions/min_terminated_length": 304.0,
      "epoch": 0.756,
      "grad_norm": 2.2013721466064453,
      "kl": 0.062255859375,
      "learning_rate": 1e-06,
      "loss": -0.0391,
      "num_tokens": 12715285.0,
      "reward": 0.07103259861469269,
      "reward_std": 0.017729321494698524,
      "rewards/bleu_reward_func/mean": 0.07103259861469269,
      "rewards/bleu_reward_func/std": 0.03985920175909996,
      "step": 945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 280.65625,
      "completions/mean_terminated_length": 203.5416717529297,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "epoch": 0.7568,
      "grad_norm": 5.288199424743652,
      "kl": 0.08099365234375,
      "learning_rate": 1e-06,
      "loss": 0.041,
      "num_tokens": 12730138.0,
      "reward": 0.02993028610944748,
      "reward_std": 0.00784086249768734,
      "rewards/bleu_reward_func/mean": 0.02993028610944748,
      "rewards/bleu_reward_func/std": 0.01985708624124527,
      "step": 946
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 404.0,
      "completions/mean_length": 383.3125,
      "completions/mean_terminated_length": 168.83334350585938,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.7576,
      "grad_norm": 3.2791404724121094,
      "kl": 0.07684326171875,
      "learning_rate": 1e-06,
      "loss": -0.0277,
      "num_tokens": 12744788.0,
      "reward": 0.041376739740371704,
      "reward_std": 0.009136617183685303,
      "rewards/bleu_reward_func/mean": 0.041376739740371704,
      "rewards/bleu_reward_func/std": 0.034718483686447144,
      "step": 947
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 306.40625,
      "completions/mean_terminated_length": 268.3333435058594,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "epoch": 0.7584,
      "grad_norm": 2.8197238445281982,
      "kl": 0.037139892578125,
      "learning_rate": 1e-06,
      "loss": -0.0318,
      "num_tokens": 12757025.0,
      "reward": 0.06905034184455872,
      "reward_std": 0.024921495467424393,
      "rewards/bleu_reward_func/mean": 0.06905034184455872,
      "rewards/bleu_reward_func/std": 0.059942930936813354,
      "step": 948
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 278.21875,
      "completions/mean_terminated_length": 212.75999450683594,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.7592,
      "grad_norm": 4.428009510040283,
      "kl": 0.064208984375,
      "learning_rate": 1e-06,
      "loss": 0.1261,
      "num_tokens": 12771512.0,
      "reward": 0.08314824104309082,
      "reward_std": 0.028866248205304146,
      "rewards/bleu_reward_func/mean": 0.08314824104309082,
      "rewards/bleu_reward_func/std": 0.07680681347846985,
      "step": 949
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 471.0,
      "completions/mean_length": 303.3125,
      "completions/mean_terminated_length": 289.4000244140625,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.76,
      "grad_norm": 3.076261043548584,
      "kl": 0.0711669921875,
      "learning_rate": 1e-06,
      "loss": 0.0991,
      "num_tokens": 12783722.0,
      "reward": 0.0795808807015419,
      "reward_std": 0.02329542487859726,
      "rewards/bleu_reward_func/mean": 0.0795808807015419,
      "rewards/bleu_reward_func/std": 0.055487681180238724,
      "step": 950
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 441.65625,
      "completions/mean_terminated_length": 399.45001220703125,
      "completions/min_length": 254.0,
      "completions/min_terminated_length": 254.0,
      "epoch": 0.7608,
      "grad_norm": 2.099597454071045,
      "kl": 0.042877197265625,
      "learning_rate": 1e-06,
      "loss": 0.0511,
      "num_tokens": 12800943.0,
      "reward": 0.04137660562992096,
      "reward_std": 0.01036759465932846,
      "rewards/bleu_reward_func/mean": 0.04137660562992096,
      "rewards/bleu_reward_func/std": 0.016977576538920403,
      "step": 951
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 418.0,
      "completions/mean_length": 216.3125,
      "completions/mean_terminated_length": 196.60000610351562,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.7616,
      "grad_norm": 3.2768120765686035,
      "kl": 0.04909515380859375,
      "learning_rate": 1e-06,
      "loss": 0.0627,
      "num_tokens": 12812129.0,
      "reward": 0.0957229807972908,
      "reward_std": 0.04901205375790596,
      "rewards/bleu_reward_func/mean": 0.0957229807972908,
      "rewards/bleu_reward_func/std": 0.13014653325080872,
      "step": 952
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 507.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 316.53125,
      "completions/mean_terminated_length": 316.53125,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.7624,
      "grad_norm": 2.4262685775756836,
      "kl": 0.03955078125,
      "learning_rate": 1e-06,
      "loss": -0.044,
      "num_tokens": 12824258.0,
      "reward": 0.06452548503875732,
      "reward_std": 0.02723013609647751,
      "rewards/bleu_reward_func/mean": 0.06452548503875732,
      "rewards/bleu_reward_func/std": 0.05586642026901245,
      "step": 953
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 381.0,
      "completions/mean_terminated_length": 344.32000732421875,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.7632,
      "grad_norm": 1.949605107307434,
      "kl": 0.03692626953125,
      "learning_rate": 1e-06,
      "loss": -0.1027,
      "num_tokens": 12838722.0,
      "reward": 0.05391934886574745,
      "reward_std": 0.022434931248426437,
      "rewards/bleu_reward_func/mean": 0.05391934886574745,
      "rewards/bleu_reward_func/std": 0.04070979356765747,
      "step": 954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 455.0,
      "completions/mean_length": 252.34375,
      "completions/mean_terminated_length": 243.9677276611328,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.764,
      "grad_norm": 2.8324923515319824,
      "kl": 0.046630859375,
      "learning_rate": 1e-06,
      "loss": -0.0475,
      "num_tokens": 12849053.0,
      "reward": 0.07158366590738297,
      "reward_std": 0.04349514842033386,
      "rewards/bleu_reward_func/mean": 0.07158366590738297,
      "rewards/bleu_reward_func/std": 0.07585739344358444,
      "step": 955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 469.0,
      "completions/mean_length": 298.46875,
      "completions/mean_terminated_length": 132.38888549804688,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.7648,
      "grad_norm": 5.58326530456543,
      "kl": 0.057861328125,
      "learning_rate": 1e-06,
      "loss": 0.1246,
      "num_tokens": 12863756.0,
      "reward": 0.03694935888051987,
      "reward_std": 0.015439806506037712,
      "rewards/bleu_reward_func/mean": 0.03694935888051987,
      "rewards/bleu_reward_func/std": 0.02632940374314785,
      "step": 956
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 367.34375,
      "completions/mean_terminated_length": 254.8333282470703,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.7656,
      "grad_norm": 3.1758320331573486,
      "kl": 0.0777130126953125,
      "learning_rate": 1e-06,
      "loss": 0.0586,
      "num_tokens": 12877663.0,
      "reward": 0.03734767809510231,
      "reward_std": 0.018983110785484314,
      "rewards/bleu_reward_func/mean": 0.03734767809510231,
      "rewards/bleu_reward_func/std": 0.026389576494693756,
      "step": 957
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 156.90625,
      "completions/mean_terminated_length": 145.4516143798828,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.7664,
      "grad_norm": 4.952297210693359,
      "kl": 0.08062744140625,
      "learning_rate": 1e-06,
      "loss": -0.0385,
      "num_tokens": 12885020.0,
      "reward": 0.029940243810415268,
      "reward_std": 0.010614018887281418,
      "rewards/bleu_reward_func/mean": 0.029940243810415268,
      "rewards/bleu_reward_func/std": 0.018806666135787964,
      "step": 958
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 462.0,
      "completions/mean_length": 378.75,
      "completions/mean_terminated_length": 308.952392578125,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "epoch": 0.7672,
      "grad_norm": 2.2859702110290527,
      "kl": 0.04290771484375,
      "learning_rate": 1e-06,
      "loss": 0.0048,
      "num_tokens": 12900348.0,
      "reward": 0.028122084215283394,
      "reward_std": 0.008207820355892181,
      "rewards/bleu_reward_func/mean": 0.028122084215283394,
      "rewards/bleu_reward_func/std": 0.016654757782816887,
      "step": 959
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 448.0,
      "completions/mean_length": 254.71875,
      "completions/mean_terminated_length": 237.56668090820312,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.768,
      "grad_norm": 3.055973768234253,
      "kl": 0.04168701171875,
      "learning_rate": 1e-06,
      "loss": -0.056,
      "num_tokens": 12912875.0,
      "reward": 0.04096674174070358,
      "reward_std": 0.01361355185508728,
      "rewards/bleu_reward_func/mean": 0.04096674174070358,
      "rewards/bleu_reward_func/std": 0.023293767124414444,
      "step": 960
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 504.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 243.5,
      "completions/mean_terminated_length": 243.5,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.7688,
      "grad_norm": 3.707246780395508,
      "kl": 0.040069580078125,
      "learning_rate": 1e-06,
      "loss": -0.0225,
      "num_tokens": 12923339.0,
      "reward": 0.09101220965385437,
      "reward_std": 0.03245137259364128,
      "rewards/bleu_reward_func/mean": 0.09101220965385437,
      "rewards/bleu_reward_func/std": 0.052579786628484726,
      "step": 961
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 339.0,
      "completions/mean_length": 235.5625,
      "completions/mean_terminated_length": 143.4166717529297,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 54.0,
      "epoch": 0.7696,
      "grad_norm": 4.457637310028076,
      "kl": 0.109130859375,
      "learning_rate": 1e-06,
      "loss": 0.0719,
      "num_tokens": 12936781.0,
      "reward": 0.08339278399944305,
      "reward_std": 0.03469950705766678,
      "rewards/bleu_reward_func/mean": 0.08339278399944305,
      "rewards/bleu_reward_func/std": 0.08822762966156006,
      "step": 962
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 379.0,
      "completions/max_terminated_length": 379.0,
      "completions/mean_length": 173.65625,
      "completions/mean_terminated_length": 173.65625,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.7704,
      "grad_norm": 4.618581295013428,
      "kl": 0.072509765625,
      "learning_rate": 1e-06,
      "loss": -0.0867,
      "num_tokens": 12944306.0,
      "reward": 0.03851824253797531,
      "reward_std": 0.029345914721488953,
      "rewards/bleu_reward_func/mean": 0.03851824253797531,
      "rewards/bleu_reward_func/std": 0.04824645444750786,
      "step": 963
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 354.375,
      "completions/mean_terminated_length": 331.8571472167969,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "epoch": 0.7712,
      "grad_norm": 2.1267480850219727,
      "kl": 0.046844482421875,
      "learning_rate": 1e-06,
      "loss": 0.1329,
      "num_tokens": 12957422.0,
      "reward": 0.03777143731713295,
      "reward_std": 0.019737938418984413,
      "rewards/bleu_reward_func/mean": 0.03777143731713295,
      "rewards/bleu_reward_func/std": 0.039779361337423325,
      "step": 964
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 259.96875,
      "completions/mean_terminated_length": 189.39999389648438,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.772,
      "grad_norm": 5.85866641998291,
      "kl": 0.081817626953125,
      "learning_rate": 1e-06,
      "loss": 0.0736,
      "num_tokens": 12972021.0,
      "reward": 0.07170456647872925,
      "reward_std": 0.020086858421564102,
      "rewards/bleu_reward_func/mean": 0.07170456647872925,
      "rewards/bleu_reward_func/std": 0.04547082632780075,
      "step": 965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 408.0,
      "completions/mean_length": 342.09375,
      "completions/mean_terminated_length": 294.5199890136719,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "epoch": 0.7728,
      "grad_norm": 2.059349298477173,
      "kl": 0.028564453125,
      "learning_rate": 1e-06,
      "loss": 0.0158,
      "num_tokens": 12985888.0,
      "reward": 0.06088118255138397,
      "reward_std": 0.01464940793812275,
      "rewards/bleu_reward_func/mean": 0.06088118255138397,
      "rewards/bleu_reward_func/std": 0.031058233231306076,
      "step": 966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 400.0,
      "completions/mean_length": 233.78125,
      "completions/mean_terminated_length": 215.23333740234375,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "epoch": 0.7736,
      "grad_norm": 2.824509620666504,
      "kl": 0.04962158203125,
      "learning_rate": 1e-06,
      "loss": 0.0278,
      "num_tokens": 12995185.0,
      "reward": 0.058438584208488464,
      "reward_std": 0.017195967957377434,
      "rewards/bleu_reward_func/mean": 0.058438584208488464,
      "rewards/bleu_reward_func/std": 0.0505751296877861,
      "step": 967
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 342.1875,
      "completions/mean_terminated_length": 275.7391357421875,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.7744,
      "grad_norm": 4.1130452156066895,
      "kl": 0.0518798828125,
      "learning_rate": 1e-06,
      "loss": -0.2853,
      "num_tokens": 13009615.0,
      "reward": 0.050040245056152344,
      "reward_std": 0.01541995070874691,
      "rewards/bleu_reward_func/mean": 0.050040245056152344,
      "rewards/bleu_reward_func/std": 0.0558185949921608,
      "step": 968
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 450.0,
      "completions/mean_length": 203.75,
      "completions/mean_terminated_length": 183.20001220703125,
      "completions/min_length": 38.0,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.7752,
      "grad_norm": 4.958765506744385,
      "kl": 0.0552978515625,
      "learning_rate": 1e-06,
      "loss": -0.0387,
      "num_tokens": 13020095.0,
      "reward": 0.06027444452047348,
      "reward_std": 0.027696281671524048,
      "rewards/bleu_reward_func/mean": 0.06027444452047348,
      "rewards/bleu_reward_func/std": 0.03125971183180809,
      "step": 969
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 331.8125,
      "completions/mean_terminated_length": 223.6999969482422,
      "completions/min_length": 81.0,
      "completions/min_terminated_length": 81.0,
      "epoch": 0.776,
      "grad_norm": 3.1290132999420166,
      "kl": 0.0631103515625,
      "learning_rate": 1e-06,
      "loss": 0.0774,
      "num_tokens": 13032841.0,
      "reward": 0.041490666568279266,
      "reward_std": 0.023433692753314972,
      "rewards/bleu_reward_func/mean": 0.041490666568279266,
      "rewards/bleu_reward_func/std": 0.030176030471920967,
      "step": 970
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 231.125,
      "completions/mean_terminated_length": 212.40000915527344,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.7768,
      "grad_norm": 5.468991279602051,
      "kl": 0.04461669921875,
      "learning_rate": 1e-06,
      "loss": 0.0587,
      "num_tokens": 13042621.0,
      "reward": 0.038487743586301804,
      "reward_std": 0.011531597934663296,
      "rewards/bleu_reward_func/mean": 0.038487743586301804,
      "rewards/bleu_reward_func/std": 0.01654433086514473,
      "step": 971
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 479.0,
      "completions/mean_length": 274.09375,
      "completions/mean_terminated_length": 240.10714721679688,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.7776,
      "grad_norm": 3.751850128173828,
      "kl": 0.084228515625,
      "learning_rate": 1e-06,
      "loss": 0.0013,
      "num_tokens": 13053616.0,
      "reward": 0.03094501793384552,
      "reward_std": 0.012867014855146408,
      "rewards/bleu_reward_func/mean": 0.03094501793384552,
      "rewards/bleu_reward_func/std": 0.01749509572982788,
      "step": 972
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 313.8125,
      "completions/mean_terminated_length": 285.5,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.7784,
      "grad_norm": 4.073342800140381,
      "kl": 0.043365478515625,
      "learning_rate": 1e-06,
      "loss": 0.0175,
      "num_tokens": 13069146.0,
      "reward": 0.07553990185260773,
      "reward_std": 0.018323319032788277,
      "rewards/bleu_reward_func/mean": 0.07553990185260773,
      "rewards/bleu_reward_func/std": 0.05046038329601288,
      "step": 973
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 287.15625,
      "completions/mean_terminated_length": 245.51852416992188,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.7792,
      "grad_norm": 4.537550449371338,
      "kl": 0.14056396484375,
      "learning_rate": 1e-06,
      "loss": -0.0867,
      "num_tokens": 13081991.0,
      "reward": 0.03124011494219303,
      "reward_std": 0.010205641388893127,
      "rewards/bleu_reward_func/mean": 0.03124011494219303,
      "rewards/bleu_reward_func/std": 0.019145779311656952,
      "step": 974
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 242.28125,
      "completions/mean_terminated_length": 214.37930297851562,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.78,
      "grad_norm": 4.508155822753906,
      "kl": 0.08660888671875,
      "learning_rate": 1e-06,
      "loss": 0.0179,
      "num_tokens": 13092056.0,
      "reward": 0.054113149642944336,
      "reward_std": 0.026252152398228645,
      "rewards/bleu_reward_func/mean": 0.054113149642944336,
      "rewards/bleu_reward_func/std": 0.0647139772772789,
      "step": 975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 284.25,
      "completions/mean_terminated_length": 251.71429443359375,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.7808,
      "grad_norm": 3.1219375133514404,
      "kl": 0.06097412109375,
      "learning_rate": 1e-06,
      "loss": 0.1548,
      "num_tokens": 13104032.0,
      "reward": 0.050552576780319214,
      "reward_std": 0.014210234396159649,
      "rewards/bleu_reward_func/mean": 0.050552576780319214,
      "rewards/bleu_reward_func/std": 0.04330144450068474,
      "step": 976
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 194.8125,
      "completions/mean_terminated_length": 173.6666717529297,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.7816,
      "grad_norm": 6.059234619140625,
      "kl": 0.1226806640625,
      "learning_rate": 1e-06,
      "loss": -0.1676,
      "num_tokens": 13114154.0,
      "reward": 0.07722032815217972,
      "reward_std": 0.03570058196783066,
      "rewards/bleu_reward_func/mean": 0.07722032815217972,
      "rewards/bleu_reward_func/std": 0.06760042905807495,
      "step": 977
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 207.125,
      "completions/mean_terminated_length": 175.58621215820312,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.7824,
      "grad_norm": 4.949698448181152,
      "kl": 0.11578369140625,
      "learning_rate": 1e-06,
      "loss": -0.1415,
      "num_tokens": 13123174.0,
      "reward": 0.09893815964460373,
      "reward_std": 0.022481422871351242,
      "rewards/bleu_reward_func/mean": 0.09893815964460373,
      "rewards/bleu_reward_func/std": 0.05471767112612724,
      "step": 978
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 480.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 265.34375,
      "completions/mean_terminated_length": 265.34375,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.7832,
      "grad_norm": 5.226635932922363,
      "kl": 0.05767822265625,
      "learning_rate": 1e-06,
      "loss": -0.1474,
      "num_tokens": 13134937.0,
      "reward": 0.05351312458515167,
      "reward_std": 0.021641388535499573,
      "rewards/bleu_reward_func/mean": 0.05351312458515167,
      "rewards/bleu_reward_func/std": 0.03878382593393326,
      "step": 979
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 381.8125,
      "completions/mean_terminated_length": 357.7037048339844,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "epoch": 0.784,
      "grad_norm": 2.6123292446136475,
      "kl": 0.04351806640625,
      "learning_rate": 1e-06,
      "loss": -0.0909,
      "num_tokens": 13149627.0,
      "reward": 0.04280403256416321,
      "reward_std": 0.014645563438534737,
      "rewards/bleu_reward_func/mean": 0.04280403256416321,
      "rewards/bleu_reward_func/std": 0.024342091754078865,
      "step": 980
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 384.5625,
      "completions/mean_terminated_length": 355.15386962890625,
      "completions/min_length": 243.0,
      "completions/min_terminated_length": 243.0,
      "epoch": 0.7848,
      "grad_norm": 2.536220073699951,
      "kl": 0.04449462890625,
      "learning_rate": 1e-06,
      "loss": -0.0598,
      "num_tokens": 13167829.0,
      "reward": 0.04370192438364029,
      "reward_std": 0.022699596360325813,
      "rewards/bleu_reward_func/mean": 0.04370192438364029,
      "rewards/bleu_reward_func/std": 0.029828311875462532,
      "step": 981
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 339.0625,
      "completions/mean_terminated_length": 271.39129638671875,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.7856,
      "grad_norm": 2.4252421855926514,
      "kl": 0.05010986328125,
      "learning_rate": 1e-06,
      "loss": -0.081,
      "num_tokens": 13182295.0,
      "reward": 0.07624062150716782,
      "reward_std": 0.023894930258393288,
      "rewards/bleu_reward_func/mean": 0.07624062150716782,
      "rewards/bleu_reward_func/std": 0.059954702854156494,
      "step": 982
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 379.0,
      "completions/mean_length": 154.65625,
      "completions/mean_terminated_length": 143.1290283203125,
      "completions/min_length": 22.0,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.7864,
      "grad_norm": 5.228031158447266,
      "kl": 0.062255859375,
      "learning_rate": 1e-06,
      "loss": 0.039,
      "num_tokens": 13190772.0,
      "reward": 0.06501435488462448,
      "reward_std": 0.021548718214035034,
      "rewards/bleu_reward_func/mean": 0.06501435488462448,
      "rewards/bleu_reward_func/std": 0.06341297179460526,
      "step": 983
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 390.125,
      "completions/mean_terminated_length": 295.3333435058594,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "epoch": 0.7872,
      "grad_norm": 2.599104642868042,
      "kl": 0.04437255859375,
      "learning_rate": 1e-06,
      "loss": -0.0018,
      "num_tokens": 13209592.0,
      "reward": 0.08236557990312576,
      "reward_std": 0.036718301475048065,
      "rewards/bleu_reward_func/mean": 0.08236557990312576,
      "rewards/bleu_reward_func/std": 0.07425292581319809,
      "step": 984
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 462.0,
      "completions/mean_length": 209.71875,
      "completions/mean_terminated_length": 108.95833587646484,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "epoch": 0.788,
      "grad_norm": 5.494588375091553,
      "kl": 0.076995849609375,
      "learning_rate": 1e-06,
      "loss": -0.3202,
      "num_tokens": 13221839.0,
      "reward": 0.12329405546188354,
      "reward_std": 0.04771920293569565,
      "rewards/bleu_reward_func/mean": 0.12329405546188354,
      "rewards/bleu_reward_func/std": 0.12109994888305664,
      "step": 985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 368.0625,
      "completions/mean_terminated_length": 269.5789489746094,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.7888,
      "grad_norm": 2.7720680236816406,
      "kl": 0.08441162109375,
      "learning_rate": 1e-06,
      "loss": -0.0384,
      "num_tokens": 13236369.0,
      "reward": 0.05234910920262337,
      "reward_std": 0.015475506894290447,
      "rewards/bleu_reward_func/mean": 0.05234910920262337,
      "rewards/bleu_reward_func/std": 0.03180435299873352,
      "step": 986
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 443.0,
      "completions/mean_length": 356.59375,
      "completions/mean_terminated_length": 263.3500061035156,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "epoch": 0.7896,
      "grad_norm": 2.421665668487549,
      "kl": 0.05206298828125,
      "learning_rate": 1e-06,
      "loss": 0.1374,
      "num_tokens": 13250052.0,
      "reward": 0.032731182873249054,
      "reward_std": 0.012113362550735474,
      "rewards/bleu_reward_func/mean": 0.032731182873249054,
      "rewards/bleu_reward_func/std": 0.020152967423200607,
      "step": 987
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 244.0,
      "completions/max_terminated_length": 244.0,
      "completions/mean_length": 131.03125,
      "completions/mean_terminated_length": 131.03125,
      "completions/min_length": 45.0,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.7904,
      "grad_norm": 3.4968388080596924,
      "kl": 0.0459136962890625,
      "learning_rate": 1e-06,
      "loss": -0.1296,
      "num_tokens": 13258461.0,
      "reward": 0.03249422460794449,
      "reward_std": 0.014297829940915108,
      "rewards/bleu_reward_func/mean": 0.03249422460794449,
      "rewards/bleu_reward_func/std": 0.041689660400152206,
      "step": 988
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 377.21875,
      "completions/mean_terminated_length": 352.2592468261719,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "epoch": 0.7912,
      "grad_norm": 2.379380702972412,
      "kl": 0.04193115234375,
      "learning_rate": 1e-06,
      "loss": 0.0994,
      "num_tokens": 13273484.0,
      "reward": 0.026992671191692352,
      "reward_std": 0.006863096728920937,
      "rewards/bleu_reward_func/mean": 0.026992671191692352,
      "rewards/bleu_reward_func/std": 0.01573537290096283,
      "step": 989
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 274.40625,
      "completions/mean_terminated_length": 207.87998962402344,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "epoch": 0.792,
      "grad_norm": 2.755162477493286,
      "kl": 0.0350341796875,
      "learning_rate": 1e-06,
      "loss": 0.0889,
      "num_tokens": 13284617.0,
      "reward": 0.0448073148727417,
      "reward_std": 0.02365909144282341,
      "rewards/bleu_reward_func/mean": 0.0448073148727417,
      "rewards/bleu_reward_func/std": 0.03250681236386299,
      "step": 990
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 234.75,
      "completions/mean_terminated_length": 183.40740966796875,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.7928,
      "grad_norm": 3.7924673557281494,
      "kl": 0.0814208984375,
      "learning_rate": 1e-06,
      "loss": 0.051,
      "num_tokens": 13294065.0,
      "reward": 0.025965671986341476,
      "reward_std": 0.009640274569392204,
      "rewards/bleu_reward_func/mean": 0.025965671986341476,
      "rewards/bleu_reward_func/std": 0.013380059972405434,
      "step": 991
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 254.09375,
      "completions/mean_terminated_length": 181.87998962402344,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.7936,
      "grad_norm": 4.138406753540039,
      "kl": 0.06060791015625,
      "learning_rate": 1e-06,
      "loss": 0.1006,
      "num_tokens": 13307548.0,
      "reward": 0.06567200273275375,
      "reward_std": 0.028147000819444656,
      "rewards/bleu_reward_func/mean": 0.06567200273275375,
      "rewards/bleu_reward_func/std": 0.05875537171959877,
      "step": 992
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 442.0,
      "completions/mean_length": 366.71875,
      "completions/mean_terminated_length": 221.4375,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.7944,
      "grad_norm": 2.8539927005767822,
      "kl": 0.0716552734375,
      "learning_rate": 1e-06,
      "loss": 0.0213,
      "num_tokens": 13324051.0,
      "reward": 0.05763605982065201,
      "reward_std": 0.019663766026496887,
      "rewards/bleu_reward_func/mean": 0.05763605982065201,
      "rewards/bleu_reward_func/std": 0.023355742916464806,
      "step": 993
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 467.0,
      "completions/mean_length": 347.25,
      "completions/mean_terminated_length": 201.88235473632812,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.7952,
      "grad_norm": 2.3459670543670654,
      "kl": 0.05255126953125,
      "learning_rate": 1e-06,
      "loss": 0.0511,
      "num_tokens": 13341395.0,
      "reward": 0.034985754638910294,
      "reward_std": 0.017113730311393738,
      "rewards/bleu_reward_func/mean": 0.034985754638910294,
      "rewards/bleu_reward_func/std": 0.029299011453986168,
      "step": 994
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 436.0,
      "completions/mean_length": 342.8125,
      "completions/mean_terminated_length": 265.9090881347656,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "epoch": 0.796,
      "grad_norm": 2.3377294540405273,
      "kl": 0.037078857421875,
      "learning_rate": 1e-06,
      "loss": 0.1455,
      "num_tokens": 13355133.0,
      "reward": 0.0256805419921875,
      "reward_std": 0.011619502678513527,
      "rewards/bleu_reward_func/mean": 0.0256805419921875,
      "rewards/bleu_reward_func/std": 0.021871058270335197,
      "step": 995
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 352.0,
      "completions/mean_length": 239.15625,
      "completions/mean_terminated_length": 148.20834350585938,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.7968,
      "grad_norm": 3.7179551124572754,
      "kl": 0.059417724609375,
      "learning_rate": 1e-06,
      "loss": 0.1004,
      "num_tokens": 13369122.0,
      "reward": 0.14195622503757477,
      "reward_std": 0.09248249232769012,
      "rewards/bleu_reward_func/mean": 0.14195622503757477,
      "rewards/bleu_reward_func/std": 0.22318032383918762,
      "step": 996
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 352.0,
      "completions/mean_terminated_length": 329.14288330078125,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "epoch": 0.7976,
      "grad_norm": 2.396754026412964,
      "kl": 0.0418701171875,
      "learning_rate": 1e-06,
      "loss": 0.0533,
      "num_tokens": 13383586.0,
      "reward": 0.056401364505290985,
      "reward_std": 0.02100227400660515,
      "rewards/bleu_reward_func/mean": 0.056401364505290985,
      "rewards/bleu_reward_func/std": 0.06413192301988602,
      "step": 997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 437.0,
      "completions/max_terminated_length": 437.0,
      "completions/mean_length": 275.53125,
      "completions/mean_terminated_length": 275.53125,
      "completions/min_length": 93.0,
      "completions/min_terminated_length": 93.0,
      "epoch": 0.7984,
      "grad_norm": 3.08534836769104,
      "kl": 0.057586669921875,
      "learning_rate": 1e-06,
      "loss": 0.0777,
      "num_tokens": 13394491.0,
      "reward": 0.07327760756015778,
      "reward_std": 0.024326477199792862,
      "rewards/bleu_reward_func/mean": 0.07327760756015778,
      "rewards/bleu_reward_func/std": 0.053993549197912216,
      "step": 998
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 446.0,
      "completions/mean_length": 409.625,
      "completions/mean_terminated_length": 260.0,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 103.0,
      "epoch": 0.7992,
      "grad_norm": 2.469346046447754,
      "kl": 0.06561279296875,
      "learning_rate": 1e-06,
      "loss": 0.0564,
      "num_tokens": 13410879.0,
      "reward": 0.026840589940547943,
      "reward_std": 0.007271309848874807,
      "rewards/bleu_reward_func/mean": 0.026840589940547943,
      "rewards/bleu_reward_func/std": 0.018895737826824188,
      "step": 999
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 491.0,
      "completions/mean_length": 336.96875,
      "completions/mean_terminated_length": 296.5769348144531,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.8,
      "grad_norm": 2.4286251068115234,
      "kl": 0.0396728515625,
      "learning_rate": 1e-06,
      "loss": -0.0733,
      "num_tokens": 13426686.0,
      "reward": 0.03406568616628647,
      "reward_std": 0.017840351909399033,
      "rewards/bleu_reward_func/mean": 0.03406568616628647,
      "rewards/bleu_reward_func/std": 0.03109470196068287,
      "step": 1000
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 347.28125,
      "completions/mean_terminated_length": 330.2413635253906,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.8008,
      "grad_norm": 3.0312039852142334,
      "kl": 0.04400634765625,
      "learning_rate": 1e-06,
      "loss": -0.0108,
      "num_tokens": 13439727.0,
      "reward": 0.07976769655942917,
      "reward_std": 0.024406295269727707,
      "rewards/bleu_reward_func/mean": 0.07976769655942917,
      "rewards/bleu_reward_func/std": 0.07874192297458649,
      "step": 1001
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 371.375,
      "completions/mean_terminated_length": 338.923095703125,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 145.0,
      "epoch": 0.8016,
      "grad_norm": 2.266893148422241,
      "kl": 0.04248046875,
      "learning_rate": 1e-06,
      "loss": 0.0417,
      "num_tokens": 13453755.0,
      "reward": 0.030901743099093437,
      "reward_std": 0.010838410817086697,
      "rewards/bleu_reward_func/mean": 0.030901743099093437,
      "rewards/bleu_reward_func/std": 0.015698978677392006,
      "step": 1002
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 321.8125,
      "completions/mean_terminated_length": 222.1904754638672,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "epoch": 0.8024,
      "grad_norm": 2.922273874282837,
      "kl": 0.0587158203125,
      "learning_rate": 1e-06,
      "loss": 0.0059,
      "num_tokens": 13466341.0,
      "reward": 0.029490074142813683,
      "reward_std": 0.009350080043077469,
      "rewards/bleu_reward_func/mean": 0.029490074142813683,
      "rewards/bleu_reward_func/std": 0.011806486174464226,
      "step": 1003
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 454.53125,
      "completions/mean_terminated_length": 358.75,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.8032,
      "grad_norm": 1.9764541387557983,
      "kl": 0.07159423828125,
      "learning_rate": 1e-06,
      "loss": -0.0171,
      "num_tokens": 13485782.0,
      "reward": 0.04661983996629715,
      "reward_std": 0.012564010918140411,
      "rewards/bleu_reward_func/mean": 0.04661983996629715,
      "rewards/bleu_reward_func/std": 0.0264245867729187,
      "step": 1004
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 455.0,
      "completions/mean_length": 325.6875,
      "completions/mean_terminated_length": 180.7777862548828,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.804,
      "grad_norm": 3.197694778442383,
      "kl": 0.05792236328125,
      "learning_rate": 1e-06,
      "loss": -0.0419,
      "num_tokens": 13499588.0,
      "reward": 0.09644008427858353,
      "reward_std": 0.03086179867386818,
      "rewards/bleu_reward_func/mean": 0.09644008427858353,
      "rewards/bleu_reward_func/std": 0.08714324980974197,
      "step": 1005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 185.96875,
      "completions/mean_terminated_length": 175.4516143798828,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.8048,
      "grad_norm": 4.771821975708008,
      "kl": 0.08587646484375,
      "learning_rate": 1e-06,
      "loss": 0.0805,
      "num_tokens": 13508259.0,
      "reward": 0.07439538836479187,
      "reward_std": 0.041163403540849686,
      "rewards/bleu_reward_func/mean": 0.07439538836479187,
      "rewards/bleu_reward_func/std": 0.08994947373867035,
      "step": 1006
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 423.0,
      "completions/mean_length": 197.96875,
      "completions/mean_terminated_length": 139.8148193359375,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.8056,
      "grad_norm": 3.3728678226470947,
      "kl": 0.0677490234375,
      "learning_rate": 1e-06,
      "loss": -0.0892,
      "num_tokens": 13516818.0,
      "reward": 0.0446418896317482,
      "reward_std": 0.016185201704502106,
      "rewards/bleu_reward_func/mean": 0.0446418896317482,
      "rewards/bleu_reward_func/std": 0.028570961207151413,
      "step": 1007
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 427.0,
      "completions/max_terminated_length": 427.0,
      "completions/mean_length": 216.4375,
      "completions/mean_terminated_length": 216.4375,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.8064,
      "grad_norm": 3.718839645385742,
      "kl": 0.03741455078125,
      "learning_rate": 1e-06,
      "loss": -0.0024,
      "num_tokens": 13526016.0,
      "reward": 0.10925551503896713,
      "reward_std": 0.03529820218682289,
      "rewards/bleu_reward_func/mean": 0.10925551503896713,
      "rewards/bleu_reward_func/std": 0.1309969276189804,
      "step": 1008
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 250.71875,
      "completions/mean_terminated_length": 163.625,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.8072,
      "grad_norm": 5.933348655700684,
      "kl": 0.0469512939453125,
      "learning_rate": 1e-06,
      "loss": -0.0945,
      "num_tokens": 13541023.0,
      "reward": 0.1345641016960144,
      "reward_std": 0.04520343244075775,
      "rewards/bleu_reward_func/mean": 0.1345641016960144,
      "rewards/bleu_reward_func/std": 0.17300467193126678,
      "step": 1009
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 433.0,
      "completions/mean_length": 323.9375,
      "completions/mean_terminated_length": 238.45455932617188,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.808,
      "grad_norm": 2.280043601989746,
      "kl": 0.0390625,
      "learning_rate": 1e-06,
      "loss": 0.0644,
      "num_tokens": 13556133.0,
      "reward": 0.05336067080497742,
      "reward_std": 0.015810808166861534,
      "rewards/bleu_reward_func/mean": 0.05336067080497742,
      "rewards/bleu_reward_func/std": 0.055556174367666245,
      "step": 1010
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 493.0,
      "completions/mean_length": 376.0,
      "completions/mean_terminated_length": 256.0,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 72.0,
      "epoch": 0.8088,
      "grad_norm": 2.4357681274414062,
      "kl": 0.06976318359375,
      "learning_rate": 1e-06,
      "loss": -0.0255,
      "num_tokens": 13571037.0,
      "reward": 0.054675132036209106,
      "reward_std": 0.013293720781803131,
      "rewards/bleu_reward_func/mean": 0.054675132036209106,
      "rewards/bleu_reward_func/std": 0.07392200827598572,
      "step": 1011
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 434.0,
      "completions/mean_length": 420.3125,
      "completions/mean_terminated_length": 245.27273559570312,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 134.0,
      "epoch": 0.8096,
      "grad_norm": 2.6430299282073975,
      "kl": 0.05645751953125,
      "learning_rate": 1e-06,
      "loss": -0.0093,
      "num_tokens": 13588983.0,
      "reward": 0.051457397639751434,
      "reward_std": 0.013515422120690346,
      "rewards/bleu_reward_func/mean": 0.051457397639751434,
      "rewards/bleu_reward_func/std": 0.03232685476541519,
      "step": 1012
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 237.03125,
      "completions/mean_terminated_length": 208.58621215820312,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.8104,
      "grad_norm": 3.8462376594543457,
      "kl": 0.0496826171875,
      "learning_rate": 1e-06,
      "loss": -0.0937,
      "num_tokens": 13599568.0,
      "reward": 0.039942845702171326,
      "reward_std": 0.011215153150260448,
      "rewards/bleu_reward_func/mean": 0.039942845702171326,
      "rewards/bleu_reward_func/std": 0.044292986392974854,
      "step": 1013
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 468.0,
      "completions/max_terminated_length": 468.0,
      "completions/mean_length": 246.8125,
      "completions/mean_terminated_length": 246.8125,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.8112,
      "grad_norm": 2.7778167724609375,
      "kl": 0.0450439453125,
      "learning_rate": 1e-06,
      "loss": 0.0687,
      "num_tokens": 13609586.0,
      "reward": 0.03794855996966362,
      "reward_std": 0.013286858797073364,
      "rewards/bleu_reward_func/mean": 0.03794855996966362,
      "rewards/bleu_reward_func/std": 0.018705854192376137,
      "step": 1014
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 405.15625,
      "completions/mean_terminated_length": 310.8823547363281,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.812,
      "grad_norm": 2.940082311630249,
      "kl": 0.052734375,
      "learning_rate": 1e-06,
      "loss": -0.1224,
      "num_tokens": 13628087.0,
      "reward": 0.02557062916457653,
      "reward_std": 0.013271758332848549,
      "rewards/bleu_reward_func/mean": 0.02557062916457653,
      "rewards/bleu_reward_func/std": 0.022851891815662384,
      "step": 1015
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 470.0,
      "completions/mean_length": 378.09375,
      "completions/mean_terminated_length": 340.6000061035156,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.8128,
      "grad_norm": 1.974778175354004,
      "kl": 0.0416259765625,
      "learning_rate": 1e-06,
      "loss": 0.0703,
      "num_tokens": 13642746.0,
      "reward": 0.0754852443933487,
      "reward_std": 0.04545840620994568,
      "rewards/bleu_reward_func/mean": 0.0754852443933487,
      "rewards/bleu_reward_func/std": 0.08230523765087128,
      "step": 1016
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 295.78125,
      "completions/mean_terminated_length": 255.74073791503906,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.8136,
      "grad_norm": 6.134253978729248,
      "kl": 0.17449951171875,
      "learning_rate": 1e-06,
      "loss": 0.0521,
      "num_tokens": 13656819.0,
      "reward": 0.10997641086578369,
      "reward_std": 0.030572956427931786,
      "rewards/bleu_reward_func/mean": 0.10997641086578369,
      "rewards/bleu_reward_func/std": 0.10363367944955826,
      "step": 1017
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 368.0,
      "completions/mean_length": 326.40625,
      "completions/mean_terminated_length": 162.64706420898438,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.8144,
      "grad_norm": 3.2220001220703125,
      "kl": 0.09747314453125,
      "learning_rate": 1e-06,
      "loss": 0.1033,
      "num_tokens": 13669280.0,
      "reward": 0.03305242210626602,
      "reward_std": 0.009479910135269165,
      "rewards/bleu_reward_func/mean": 0.03305242210626602,
      "rewards/bleu_reward_func/std": 0.018387850373983383,
      "step": 1018
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 277.0,
      "completions/mean_length": 217.21875,
      "completions/mean_terminated_length": 118.95833587646484,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.8152,
      "grad_norm": 4.264994144439697,
      "kl": 0.08660888671875,
      "learning_rate": 1e-06,
      "loss": 0.1815,
      "num_tokens": 13679071.0,
      "reward": 0.07369013130664825,
      "reward_std": 0.030868127942085266,
      "rewards/bleu_reward_func/mean": 0.07369013130664825,
      "rewards/bleu_reward_func/std": 0.08622196316719055,
      "step": 1019
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 347.78125,
      "completions/mean_terminated_length": 330.7930908203125,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "epoch": 0.816,
      "grad_norm": 2.2914328575134277,
      "kl": 0.04351806640625,
      "learning_rate": 1e-06,
      "loss": -0.0436,
      "num_tokens": 13692432.0,
      "reward": 0.05755352973937988,
      "reward_std": 0.015490137040615082,
      "rewards/bleu_reward_func/mean": 0.05755352973937988,
      "rewards/bleu_reward_func/std": 0.05329318344593048,
      "step": 1020
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 350.375,
      "completions/mean_terminated_length": 276.9090881347656,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 78.0,
      "epoch": 0.8168,
      "grad_norm": 3.4762771129608154,
      "kl": 0.06256103515625,
      "learning_rate": 1e-06,
      "loss": -0.0528,
      "num_tokens": 13706084.0,
      "reward": 0.07274037599563599,
      "reward_std": 0.02579139545559883,
      "rewards/bleu_reward_func/mean": 0.07274037599563599,
      "rewards/bleu_reward_func/std": 0.05641184002161026,
      "step": 1021
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 378.125,
      "completions/mean_terminated_length": 359.0000305175781,
      "completions/min_length": 193.0,
      "completions/min_terminated_length": 193.0,
      "epoch": 0.8176,
      "grad_norm": 2.312500476837158,
      "kl": 0.04180908203125,
      "learning_rate": 1e-06,
      "loss": -0.0063,
      "num_tokens": 13720832.0,
      "reward": 0.04683421924710274,
      "reward_std": 0.016781536862254143,
      "rewards/bleu_reward_func/mean": 0.04683421924710274,
      "rewards/bleu_reward_func/std": 0.033053260296583176,
      "step": 1022
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 385.15625,
      "completions/mean_terminated_length": 273.23529052734375,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 78.0,
      "epoch": 0.8184,
      "grad_norm": 2.3284387588500977,
      "kl": 0.061279296875,
      "learning_rate": 1e-06,
      "loss": 0.0868,
      "num_tokens": 13736573.0,
      "reward": 0.01643741875886917,
      "reward_std": 0.00393636105582118,
      "rewards/bleu_reward_func/mean": 0.01643741875886917,
      "rewards/bleu_reward_func/std": 0.011089936830103397,
      "step": 1023
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 294.75,
      "completions/mean_terminated_length": 244.61538696289062,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.8192,
      "grad_norm": 4.2773542404174805,
      "kl": 0.080963134765625,
      "learning_rate": 1e-06,
      "loss": -0.0395,
      "num_tokens": 13748445.0,
      "reward": 0.09083592146635056,
      "reward_std": 0.024321725592017174,
      "rewards/bleu_reward_func/mean": 0.09083592146635056,
      "rewards/bleu_reward_func/std": 0.07333005964756012,
      "step": 1024
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 423.75,
      "completions/mean_terminated_length": 276.66668701171875,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 141.0,
      "epoch": 0.82,
      "grad_norm": 2.270141363143921,
      "kl": 0.05609130859375,
      "learning_rate": 1e-06,
      "loss": 0.048,
      "num_tokens": 13766077.0,
      "reward": 0.02606740966439247,
      "reward_std": 0.010008657351136208,
      "rewards/bleu_reward_func/mean": 0.02606740966439247,
      "rewards/bleu_reward_func/std": 0.024353953078389168,
      "step": 1025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 447.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 172.625,
      "completions/mean_terminated_length": 172.625,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.8208,
      "grad_norm": 3.84765887260437,
      "kl": 0.060302734375,
      "learning_rate": 1e-06,
      "loss": 0.0361,
      "num_tokens": 13777121.0,
      "reward": 0.06873498857021332,
      "reward_std": 0.022971976548433304,
      "rewards/bleu_reward_func/mean": 0.06873498857021332,
      "rewards/bleu_reward_func/std": 0.0848066657781601,
      "step": 1026
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 414.0,
      "completions/max_terminated_length": 414.0,
      "completions/mean_length": 192.71875,
      "completions/mean_terminated_length": 192.71875,
      "completions/min_length": 44.0,
      "completions/min_terminated_length": 44.0,
      "epoch": 0.8216,
      "grad_norm": 5.911830902099609,
      "kl": 0.13067626953125,
      "learning_rate": 1e-06,
      "loss": 0.2607,
      "num_tokens": 13785552.0,
      "reward": 0.12106100469827652,
      "reward_std": 0.032082945108413696,
      "rewards/bleu_reward_func/mean": 0.12106100469827652,
      "rewards/bleu_reward_func/std": 0.17073681950569153,
      "step": 1027
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 266.34375,
      "completions/mean_terminated_length": 249.9666748046875,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.8224,
      "grad_norm": 3.98777437210083,
      "kl": 0.05194091796875,
      "learning_rate": 1e-06,
      "loss": -0.0154,
      "num_tokens": 13798923.0,
      "reward": 0.03594356030225754,
      "reward_std": 0.02046639285981655,
      "rewards/bleu_reward_func/mean": 0.03594356030225754,
      "rewards/bleu_reward_func/std": 0.030679911375045776,
      "step": 1028
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 476.0,
      "completions/mean_length": 263.5,
      "completions/mean_terminated_length": 237.79310607910156,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "epoch": 0.8232,
      "grad_norm": 3.6039505004882812,
      "kl": 0.06573486328125,
      "learning_rate": 1e-06,
      "loss": -0.0975,
      "num_tokens": 13809459.0,
      "reward": 0.034439343959093094,
      "reward_std": 0.014079989865422249,
      "rewards/bleu_reward_func/mean": 0.034439343959093094,
      "rewards/bleu_reward_func/std": 0.025304077193140984,
      "step": 1029
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 476.0625,
      "completions/mean_terminated_length": 368.25,
      "completions/min_length": 226.0,
      "completions/min_terminated_length": 226.0,
      "epoch": 0.824,
      "grad_norm": 2.245523691177368,
      "kl": 0.057952880859375,
      "learning_rate": 1e-06,
      "loss": -0.0427,
      "num_tokens": 13829117.0,
      "reward": 0.05782981216907501,
      "reward_std": 0.01562406774610281,
      "rewards/bleu_reward_func/mean": 0.05782981216907501,
      "rewards/bleu_reward_func/std": 0.036193206906318665,
      "step": 1030
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 396.0,
      "completions/mean_length": 297.75,
      "completions/mean_terminated_length": 213.9130401611328,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "epoch": 0.8248,
      "grad_norm": 2.7590172290802,
      "kl": 0.05572509765625,
      "learning_rate": 1e-06,
      "loss": 0.1533,
      "num_tokens": 13841837.0,
      "reward": 0.15948085486888885,
      "reward_std": 0.051790352910757065,
      "rewards/bleu_reward_func/mean": 0.15948085486888885,
      "rewards/bleu_reward_func/std": 0.1838230937719345,
      "step": 1031
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 496.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 241.0625,
      "completions/mean_terminated_length": 241.0625,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "epoch": 0.8256,
      "grad_norm": 2.9878034591674805,
      "kl": 0.0655517578125,
      "learning_rate": 1e-06,
      "loss": 0.0115,
      "num_tokens": 13851447.0,
      "reward": 0.05960501357913017,
      "reward_std": 0.018924448639154434,
      "rewards/bleu_reward_func/mean": 0.05960501357913017,
      "rewards/bleu_reward_func/std": 0.03717625513672829,
      "step": 1032
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 408.0,
      "completions/mean_length": 267.90625,
      "completions/mean_terminated_length": 186.5416717529297,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 78.0,
      "epoch": 0.8264,
      "grad_norm": 3.7042665481567383,
      "kl": 0.095947265625,
      "learning_rate": 1e-06,
      "loss": -0.1878,
      "num_tokens": 13862444.0,
      "reward": 0.027990631759166718,
      "reward_std": 0.010054003447294235,
      "rewards/bleu_reward_func/mean": 0.027990631759166718,
      "rewards/bleu_reward_func/std": 0.024867286905646324,
      "step": 1033
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 303.875,
      "completions/mean_terminated_length": 274.14288330078125,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.8272,
      "grad_norm": 3.176471710205078,
      "kl": 0.04852294921875,
      "learning_rate": 1e-06,
      "loss": 0.0688,
      "num_tokens": 13874104.0,
      "reward": 0.028669901192188263,
      "reward_std": 0.009087910875678062,
      "rewards/bleu_reward_func/mean": 0.028669901192188263,
      "rewards/bleu_reward_func/std": 0.012309697456657887,
      "step": 1034
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 280.34375,
      "completions/mean_terminated_length": 264.9000244140625,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "epoch": 0.828,
      "grad_norm": 2.438011407852173,
      "kl": 0.048095703125,
      "learning_rate": 1e-06,
      "loss": -0.0052,
      "num_tokens": 13885019.0,
      "reward": 0.07248373329639435,
      "reward_std": 0.022065263241529465,
      "rewards/bleu_reward_func/mean": 0.07248373329639435,
      "rewards/bleu_reward_func/std": 0.060632698237895966,
      "step": 1035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 453.8125,
      "completions/mean_terminated_length": 325.8000183105469,
      "completions/min_length": 257.0,
      "completions/min_terminated_length": 257.0,
      "epoch": 0.8288,
      "grad_norm": 2.2002475261688232,
      "kl": 0.05474853515625,
      "learning_rate": 1e-06,
      "loss": -0.0041,
      "num_tokens": 13903325.0,
      "reward": 0.04414498060941696,
      "reward_std": 0.01269291341304779,
      "rewards/bleu_reward_func/mean": 0.04414498060941696,
      "rewards/bleu_reward_func/std": 0.020714420825242996,
      "step": 1036
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 467.0,
      "completions/mean_length": 281.1875,
      "completions/mean_terminated_length": 238.44444274902344,
      "completions/min_length": 63.0,
      "completions/min_terminated_length": 63.0,
      "epoch": 0.8296,
      "grad_norm": 2.660327434539795,
      "kl": 0.04754638671875,
      "learning_rate": 1e-06,
      "loss": -0.0327,
      "num_tokens": 13917459.0,
      "reward": 0.04724155738949776,
      "reward_std": 0.019107088446617126,
      "rewards/bleu_reward_func/mean": 0.04724155738949776,
      "rewards/bleu_reward_func/std": 0.03912588581442833,
      "step": 1037
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 199.21875,
      "completions/mean_terminated_length": 189.1290283203125,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "epoch": 0.8304,
      "grad_norm": 3.1837692260742188,
      "kl": 0.076416015625,
      "learning_rate": 1e-06,
      "loss": 0.0188,
      "num_tokens": 13926098.0,
      "reward": 0.09293580800294876,
      "reward_std": 0.031207388266921043,
      "rewards/bleu_reward_func/mean": 0.09293580800294876,
      "rewards/bleu_reward_func/std": 0.11434992402791977,
      "step": 1038
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 437.0,
      "completions/mean_length": 222.53125,
      "completions/mean_terminated_length": 213.19354248046875,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.8312,
      "grad_norm": 2.744227409362793,
      "kl": 0.044677734375,
      "learning_rate": 1e-06,
      "loss": 0.0984,
      "num_tokens": 13937107.0,
      "reward": 0.14024724066257477,
      "reward_std": 0.020312879234552383,
      "rewards/bleu_reward_func/mean": 0.14024724066257477,
      "rewards/bleu_reward_func/std": 0.18536008894443512,
      "step": 1039
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 413.0625,
      "completions/mean_terminated_length": 368.0909118652344,
      "completions/min_length": 265.0,
      "completions/min_terminated_length": 265.0,
      "epoch": 0.832,
      "grad_norm": 2.2251086235046387,
      "kl": 0.04925537109375,
      "learning_rate": 1e-06,
      "loss": -0.0266,
      "num_tokens": 13952421.0,
      "reward": 0.019466448575258255,
      "reward_std": 0.004886062350124121,
      "rewards/bleu_reward_func/mean": 0.019466448575258255,
      "rewards/bleu_reward_func/std": 0.010416182689368725,
      "step": 1040
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 355.4375,
      "completions/mean_terminated_length": 217.2941131591797,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.8328,
      "grad_norm": 2.832329750061035,
      "kl": 0.039093017578125,
      "learning_rate": 1e-06,
      "loss": 0.0415,
      "num_tokens": 13967235.0,
      "reward": 0.07604211568832397,
      "reward_std": 0.018734116107225418,
      "rewards/bleu_reward_func/mean": 0.07604211568832397,
      "rewards/bleu_reward_func/std": 0.0775146409869194,
      "step": 1041
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 452.0,
      "completions/max_terminated_length": 452.0,
      "completions/mean_length": 202.84375,
      "completions/mean_terminated_length": 202.84375,
      "completions/min_length": 85.0,
      "completions/min_terminated_length": 85.0,
      "epoch": 0.8336,
      "grad_norm": 3.4680063724517822,
      "kl": 0.1029052734375,
      "learning_rate": 1e-06,
      "loss": -0.0144,
      "num_tokens": 13975950.0,
      "reward": 0.03598593920469284,
      "reward_std": 0.01417174655944109,
      "rewards/bleu_reward_func/mean": 0.03598593920469284,
      "rewards/bleu_reward_func/std": 0.021121855825185776,
      "step": 1042
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 341.125,
      "completions/mean_terminated_length": 309.4814758300781,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 190.0,
      "epoch": 0.8344,
      "grad_norm": 2.456043243408203,
      "kl": 0.0452880859375,
      "learning_rate": 1e-06,
      "loss": -0.0375,
      "num_tokens": 13992426.0,
      "reward": 0.1423608958721161,
      "reward_std": 0.04012633115053177,
      "rewards/bleu_reward_func/mean": 0.1423608958721161,
      "rewards/bleu_reward_func/std": 0.11575107276439667,
      "step": 1043
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 346.53125,
      "completions/mean_terminated_length": 329.4137878417969,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "epoch": 0.8352,
      "grad_norm": 2.6406376361846924,
      "kl": 0.0521240234375,
      "learning_rate": 1e-06,
      "loss": -0.0763,
      "num_tokens": 14005787.0,
      "reward": 0.09989124536514282,
      "reward_std": 0.024893736466765404,
      "rewards/bleu_reward_func/mean": 0.09989124536514282,
      "rewards/bleu_reward_func/std": 0.06233161687850952,
      "step": 1044
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 381.90625,
      "completions/mean_terminated_length": 313.76190185546875,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "epoch": 0.836,
      "grad_norm": 2.2674143314361572,
      "kl": 0.05908203125,
      "learning_rate": 1e-06,
      "loss": -0.0206,
      "num_tokens": 14023792.0,
      "reward": 0.08537600189447403,
      "reward_std": 0.04241234064102173,
      "rewards/bleu_reward_func/mean": 0.08537600189447403,
      "rewards/bleu_reward_func/std": 0.07984770834445953,
      "step": 1045
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 313.4375,
      "completions/mean_terminated_length": 235.7391357421875,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.8368,
      "grad_norm": 3.6653220653533936,
      "kl": 0.0543212890625,
      "learning_rate": 1e-06,
      "loss": 0.0199,
      "num_tokens": 14036142.0,
      "reward": 0.03577762842178345,
      "reward_std": 0.014606889337301254,
      "rewards/bleu_reward_func/mean": 0.03577762842178345,
      "rewards/bleu_reward_func/std": 0.021284347400069237,
      "step": 1046
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 467.0,
      "completions/mean_length": 263.9375,
      "completions/mean_terminated_length": 218.0,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.8376,
      "grad_norm": 4.6712870597839355,
      "kl": 0.05914306640625,
      "learning_rate": 1e-06,
      "loss": 0.1302,
      "num_tokens": 14046380.0,
      "reward": 0.04743397980928421,
      "reward_std": 0.01841292530298233,
      "rewards/bleu_reward_func/mean": 0.04743397980928421,
      "rewards/bleu_reward_func/std": 0.0290218573063612,
      "step": 1047
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 204.9375,
      "completions/mean_terminated_length": 184.4666748046875,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.8384,
      "grad_norm": 8.011630058288574,
      "kl": 0.0655517578125,
      "learning_rate": 1e-06,
      "loss": -0.0986,
      "num_tokens": 14057290.0,
      "reward": 0.2640398442745209,
      "reward_std": 0.029374700039625168,
      "rewards/bleu_reward_func/mean": 0.2640398442745209,
      "rewards/bleu_reward_func/std": 0.3879520893096924,
      "step": 1048
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 365.0,
      "completions/mean_length": 252.875,
      "completions/mean_terminated_length": 193.07693481445312,
      "completions/min_length": 79.0,
      "completions/min_terminated_length": 79.0,
      "epoch": 0.8392,
      "grad_norm": 3.1190173625946045,
      "kl": 0.0576171875,
      "learning_rate": 1e-06,
      "loss": 0.01,
      "num_tokens": 14069918.0,
      "reward": 0.05034583806991577,
      "reward_std": 0.020114287734031677,
      "rewards/bleu_reward_func/mean": 0.05034583806991577,
      "rewards/bleu_reward_func/std": 0.02764110080897808,
      "step": 1049
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 455.0,
      "completions/mean_length": 270.78125,
      "completions/mean_terminated_length": 203.239990234375,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.84,
      "grad_norm": 3.3168582916259766,
      "kl": 0.039947509765625,
      "learning_rate": 1e-06,
      "loss": -0.05,
      "num_tokens": 14080375.0,
      "reward": 0.0682622641324997,
      "reward_std": 0.026746664196252823,
      "rewards/bleu_reward_func/mean": 0.0682622641324997,
      "rewards/bleu_reward_func/std": 0.04914075881242752,
      "step": 1050
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 356.15625,
      "completions/mean_terminated_length": 274.5238037109375,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "epoch": 0.8408,
      "grad_norm": 2.6723437309265137,
      "kl": 0.0855712890625,
      "learning_rate": 1e-06,
      "loss": -0.0573,
      "num_tokens": 14094404.0,
      "reward": 0.032941900193691254,
      "reward_std": 0.004675520583987236,
      "rewards/bleu_reward_func/mean": 0.032941900193691254,
      "rewards/bleu_reward_func/std": 0.03677041456103325,
      "step": 1051
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 398.8125,
      "completions/mean_terminated_length": 298.9411926269531,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "epoch": 0.8416,
      "grad_norm": 2.244532823562622,
      "kl": 0.043792724609375,
      "learning_rate": 1e-06,
      "loss": -0.0009,
      "num_tokens": 14110374.0,
      "reward": 0.03393974155187607,
      "reward_std": 0.009483535774052143,
      "rewards/bleu_reward_func/mean": 0.03393974155187607,
      "rewards/bleu_reward_func/std": 0.0170142725110054,
      "step": 1052
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 215.53125,
      "completions/mean_terminated_length": 184.86207580566406,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 65.0,
      "epoch": 0.8424,
      "grad_norm": 5.13472843170166,
      "kl": 0.13140869140625,
      "learning_rate": 1e-06,
      "loss": 0.0382,
      "num_tokens": 14121543.0,
      "reward": 0.03505343943834305,
      "reward_std": 0.007273062132298946,
      "rewards/bleu_reward_func/mean": 0.03505343943834305,
      "rewards/bleu_reward_func/std": 0.0165236946195364,
      "step": 1053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 423.0,
      "completions/mean_length": 295.65625,
      "completions/mean_terminated_length": 223.5416717529297,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 157.0,
      "epoch": 0.8432,
      "grad_norm": 3.089383125305176,
      "kl": 0.0748291015625,
      "learning_rate": 1e-06,
      "loss": 0.0492,
      "num_tokens": 14133372.0,
      "reward": 0.03953177481889725,
      "reward_std": 0.013197172433137894,
      "rewards/bleu_reward_func/mean": 0.03953177481889725,
      "rewards/bleu_reward_func/std": 0.02268371731042862,
      "step": 1054
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 444.0,
      "completions/mean_length": 360.28125,
      "completions/mean_terminated_length": 226.41175842285156,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.844,
      "grad_norm": 4.491260528564453,
      "kl": 0.08428955078125,
      "learning_rate": 1e-06,
      "loss": -0.0276,
      "num_tokens": 14148525.0,
      "reward": 0.030032211914658546,
      "reward_std": 0.010867346078157425,
      "rewards/bleu_reward_func/mean": 0.030032211914658546,
      "rewards/bleu_reward_func/std": 0.018144365400075912,
      "step": 1055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 463.0,
      "completions/mean_length": 262.84375,
      "completions/mean_terminated_length": 246.2333526611328,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.8448,
      "grad_norm": 2.9773359298706055,
      "kl": 0.046600341796875,
      "learning_rate": 1e-06,
      "loss": -0.0172,
      "num_tokens": 14159544.0,
      "reward": 0.045382194221019745,
      "reward_std": 0.021608000621199608,
      "rewards/bleu_reward_func/mean": 0.045382194221019745,
      "rewards/bleu_reward_func/std": 0.04553521052002907,
      "step": 1056
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 341.5625,
      "completions/mean_terminated_length": 293.8399963378906,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.8456,
      "grad_norm": 3.592421531677246,
      "kl": 0.092529296875,
      "learning_rate": 1e-06,
      "loss": 0.0606,
      "num_tokens": 14173122.0,
      "reward": 0.08871526271104813,
      "reward_std": 0.02755703032016754,
      "rewards/bleu_reward_func/mean": 0.08871526271104813,
      "rewards/bleu_reward_func/std": 0.10348460078239441,
      "step": 1057
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 485.0,
      "completions/mean_length": 255.0625,
      "completions/mean_terminated_length": 237.933349609375,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.8464,
      "grad_norm": 3.676213502883911,
      "kl": 0.0556640625,
      "learning_rate": 1e-06,
      "loss": -0.0521,
      "num_tokens": 14182972.0,
      "reward": 0.03509638085961342,
      "reward_std": 0.011363822966814041,
      "rewards/bleu_reward_func/mean": 0.03509638085961342,
      "rewards/bleu_reward_func/std": 0.019865239039063454,
      "step": 1058
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 274.65625,
      "completions/mean_terminated_length": 230.70370483398438,
      "completions/min_length": 44.0,
      "completions/min_terminated_length": 44.0,
      "epoch": 0.8472,
      "grad_norm": 3.6321067810058594,
      "kl": 0.053924560546875,
      "learning_rate": 1e-06,
      "loss": 0.0394,
      "num_tokens": 14197185.0,
      "reward": 0.09843635559082031,
      "reward_std": 0.03348027914762497,
      "rewards/bleu_reward_func/mean": 0.09843635559082031,
      "rewards/bleu_reward_func/std": 0.07304807007312775,
      "step": 1059
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 387.34375,
      "completions/mean_terminated_length": 330.68182373046875,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 115.0,
      "epoch": 0.848,
      "grad_norm": 2.2477974891662598,
      "kl": 0.0562744140625,
      "learning_rate": 1e-06,
      "loss": 0.0342,
      "num_tokens": 14213164.0,
      "reward": 0.050817858427762985,
      "reward_std": 0.015856031328439713,
      "rewards/bleu_reward_func/mean": 0.050817858427762985,
      "rewards/bleu_reward_func/std": 0.06451952457427979,
      "step": 1060
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 324.28125,
      "completions/mean_terminated_length": 250.8260955810547,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "epoch": 0.8488,
      "grad_norm": 2.6654868125915527,
      "kl": 0.0555419921875,
      "learning_rate": 1e-06,
      "loss": -0.072,
      "num_tokens": 14225901.0,
      "reward": 0.040826499462127686,
      "reward_std": 0.01575326919555664,
      "rewards/bleu_reward_func/mean": 0.040826499462127686,
      "rewards/bleu_reward_func/std": 0.015293111093342304,
      "step": 1061
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 484.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 207.75,
      "completions/mean_terminated_length": 207.75,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 0.8496,
      "grad_norm": 3.5825765132904053,
      "kl": 0.079986572265625,
      "learning_rate": 1e-06,
      "loss": -0.0862,
      "num_tokens": 14234253.0,
      "reward": 0.03848409280180931,
      "reward_std": 0.013242291286587715,
      "rewards/bleu_reward_func/mean": 0.03848409280180931,
      "rewards/bleu_reward_func/std": 0.01712285354733467,
      "step": 1062
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 398.34375,
      "completions/mean_terminated_length": 346.68182373046875,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.8504,
      "grad_norm": 2.3341686725616455,
      "kl": 0.06793212890625,
      "learning_rate": 1e-06,
      "loss": -0.1081,
      "num_tokens": 14249488.0,
      "reward": 0.06572327017784119,
      "reward_std": 0.025146422907710075,
      "rewards/bleu_reward_func/mean": 0.06572327017784119,
      "rewards/bleu_reward_func/std": 0.033365800976753235,
      "step": 1063
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 463.0,
      "completions/mean_length": 376.3125,
      "completions/mean_terminated_length": 338.32000732421875,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 109.0,
      "epoch": 0.8512,
      "grad_norm": 2.4101240634918213,
      "kl": 0.04913330078125,
      "learning_rate": 1e-06,
      "loss": -0.0386,
      "num_tokens": 14264370.0,
      "reward": 0.046704962849617004,
      "reward_std": 0.02123313769698143,
      "rewards/bleu_reward_func/mean": 0.046704962849617004,
      "rewards/bleu_reward_func/std": 0.03704674169421196,
      "step": 1064
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 354.03125,
      "completions/mean_terminated_length": 292.2174072265625,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.852,
      "grad_norm": 2.654803514480591,
      "kl": 0.0799560546875,
      "learning_rate": 1e-06,
      "loss": 0.0249,
      "num_tokens": 14278251.0,
      "reward": 0.036083631217479706,
      "reward_std": 0.009487833827733994,
      "rewards/bleu_reward_func/mean": 0.036083631217479706,
      "rewards/bleu_reward_func/std": 0.03182501345872879,
      "step": 1065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 475.0,
      "completions/mean_length": 343.53125,
      "completions/mean_terminated_length": 287.375,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "epoch": 0.8528,
      "grad_norm": 2.5892786979675293,
      "kl": 0.04949951171875,
      "learning_rate": 1e-06,
      "loss": 0.0723,
      "num_tokens": 14292188.0,
      "reward": 0.054016873240470886,
      "reward_std": 0.027925897389650345,
      "rewards/bleu_reward_func/mean": 0.054016873240470886,
      "rewards/bleu_reward_func/std": 0.031196916475892067,
      "step": 1066
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 477.0,
      "completions/mean_length": 261.6875,
      "completions/mean_terminated_length": 245.00001525878906,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.8536,
      "grad_norm": 5.397347450256348,
      "kl": 0.106109619140625,
      "learning_rate": 1e-06,
      "loss": -0.0081,
      "num_tokens": 14302522.0,
      "reward": 0.06117306649684906,
      "reward_std": 0.023319777101278305,
      "rewards/bleu_reward_func/mean": 0.06117306649684906,
      "rewards/bleu_reward_func/std": 0.046599678695201874,
      "step": 1067
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 471.0,
      "completions/mean_length": 302.90625,
      "completions/mean_terminated_length": 244.36000061035156,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 0.8544,
      "grad_norm": 3.079831600189209,
      "kl": 0.051025390625,
      "learning_rate": 1e-06,
      "loss": -0.0999,
      "num_tokens": 14314487.0,
      "reward": 0.06549815088510513,
      "reward_std": 0.017141040414571762,
      "rewards/bleu_reward_func/mean": 0.06549815088510513,
      "rewards/bleu_reward_func/std": 0.07451631128787994,
      "step": 1068
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 247.96875,
      "completions/mean_terminated_length": 199.07408142089844,
      "completions/min_length": 44.0,
      "completions/min_terminated_length": 44.0,
      "epoch": 0.8552,
      "grad_norm": 3.2531063556671143,
      "kl": 0.06011962890625,
      "learning_rate": 1e-06,
      "loss": -0.0422,
      "num_tokens": 14324478.0,
      "reward": 0.03668832778930664,
      "reward_std": 0.018588969483971596,
      "rewards/bleu_reward_func/mean": 0.03668832778930664,
      "rewards/bleu_reward_func/std": 0.024964287877082825,
      "step": 1069
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 440.0,
      "completions/mean_length": 288.375,
      "completions/mean_terminated_length": 213.83334350585938,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.856,
      "grad_norm": 2.7994515895843506,
      "kl": 0.037628173828125,
      "learning_rate": 1e-06,
      "loss": -0.0105,
      "num_tokens": 14337218.0,
      "reward": 0.04852975159883499,
      "reward_std": 0.011727490462362766,
      "rewards/bleu_reward_func/mean": 0.04852975159883499,
      "rewards/bleu_reward_func/std": 0.021890046074986458,
      "step": 1070
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 465.0,
      "completions/mean_length": 229.09375,
      "completions/mean_terminated_length": 163.8076934814453,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.8568,
      "grad_norm": 3.4524424076080322,
      "kl": 0.0914306640625,
      "learning_rate": 1e-06,
      "loss": 0.0403,
      "num_tokens": 14347365.0,
      "reward": 0.04209073632955551,
      "reward_std": 0.010336240753531456,
      "rewards/bleu_reward_func/mean": 0.04209073632955551,
      "rewards/bleu_reward_func/std": 0.017330747097730637,
      "step": 1071
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 443.0,
      "completions/mean_length": 325.15625,
      "completions/mean_terminated_length": 319.1290283203125,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "epoch": 0.8576,
      "grad_norm": 2.200551748275757,
      "kl": 0.04730224609375,
      "learning_rate": 1e-06,
      "loss": -0.0014,
      "num_tokens": 14361250.0,
      "reward": 0.06647847592830658,
      "reward_std": 0.02658942900598049,
      "rewards/bleu_reward_func/mean": 0.06647847592830658,
      "rewards/bleu_reward_func/std": 0.08473870158195496,
      "step": 1072
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 434.78125,
      "completions/mean_terminated_length": 306.0833435058594,
      "completions/min_length": 177.0,
      "completions/min_terminated_length": 177.0,
      "epoch": 0.8584,
      "grad_norm": 2.121492385864258,
      "kl": 0.04705810546875,
      "learning_rate": 1e-06,
      "loss": 0.0316,
      "num_tokens": 14377891.0,
      "reward": 0.027470655739307404,
      "reward_std": 0.009700166061520576,
      "rewards/bleu_reward_func/mean": 0.027470655739307404,
      "rewards/bleu_reward_func/std": 0.018717704340815544,
      "step": 1073
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 456.46875,
      "completions/mean_terminated_length": 400.9375,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "epoch": 0.8592,
      "grad_norm": 1.9872575998306274,
      "kl": 0.052825927734375,
      "learning_rate": 1e-06,
      "loss": 0.0179,
      "num_tokens": 14394938.0,
      "reward": 0.031011758372187614,
      "reward_std": 0.006797453388571739,
      "rewards/bleu_reward_func/mean": 0.031011758372187614,
      "rewards/bleu_reward_func/std": 0.013120264746248722,
      "step": 1074
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 327.1875,
      "completions/mean_terminated_length": 230.38095092773438,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.86,
      "grad_norm": 3.2750420570373535,
      "kl": 0.05621337890625,
      "learning_rate": 1e-06,
      "loss": 0.0067,
      "num_tokens": 14407512.0,
      "reward": 0.04015614092350006,
      "reward_std": 0.01684856228530407,
      "rewards/bleu_reward_func/mean": 0.04015614092350006,
      "rewards/bleu_reward_func/std": 0.024104053154587746,
      "step": 1075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 282.78125,
      "completions/mean_terminated_length": 275.3870849609375,
      "completions/min_length": 50.0,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.8608,
      "grad_norm": 2.747910499572754,
      "kl": 0.06060791015625,
      "learning_rate": 1e-06,
      "loss": -0.0831,
      "num_tokens": 14418377.0,
      "reward": 0.059567973017692566,
      "reward_std": 0.02570568397641182,
      "rewards/bleu_reward_func/mean": 0.059567973017692566,
      "rewards/bleu_reward_func/std": 0.06817185878753662,
      "step": 1076
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 284.125,
      "completions/mean_terminated_length": 268.933349609375,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.8616,
      "grad_norm": 3.2307136058807373,
      "kl": 0.0513916015625,
      "learning_rate": 1e-06,
      "loss": 0.0257,
      "num_tokens": 14429501.0,
      "reward": 0.03993712365627289,
      "reward_std": 0.016049114987254143,
      "rewards/bleu_reward_func/mean": 0.03993712365627289,
      "rewards/bleu_reward_func/std": 0.032372910529375076,
      "step": 1077
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 472.0,
      "completions/mean_length": 396.59375,
      "completions/mean_terminated_length": 327.3500061035156,
      "completions/min_length": 142.0,
      "completions/min_terminated_length": 142.0,
      "epoch": 0.8624,
      "grad_norm": 2.057788133621216,
      "kl": 0.04718017578125,
      "learning_rate": 1e-06,
      "loss": 0.06,
      "num_tokens": 14444720.0,
      "reward": 0.036224670708179474,
      "reward_std": 0.009702024050056934,
      "rewards/bleu_reward_func/mean": 0.036224670708179474,
      "rewards/bleu_reward_func/std": 0.014296884648501873,
      "step": 1078
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 267.375,
      "completions/mean_terminated_length": 222.07408142089844,
      "completions/min_length": 57.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.8632,
      "grad_norm": 3.0467591285705566,
      "kl": 0.06610107421875,
      "learning_rate": 1e-06,
      "loss": -0.018,
      "num_tokens": 14455836.0,
      "reward": 0.024699728935956955,
      "reward_std": 0.0076002031564712524,
      "rewards/bleu_reward_func/mean": 0.024699728935956955,
      "rewards/bleu_reward_func/std": 0.010115176439285278,
      "step": 1079
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 336.0,
      "completions/mean_length": 244.9375,
      "completions/mean_terminated_length": 155.9166717529297,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.864,
      "grad_norm": 4.053822994232178,
      "kl": 0.1051025390625,
      "learning_rate": 1e-06,
      "loss": -0.179,
      "num_tokens": 14466490.0,
      "reward": 0.021421607583761215,
      "reward_std": 0.008295105770230293,
      "rewards/bleu_reward_func/mean": 0.021421607583761215,
      "rewards/bleu_reward_func/std": 0.015474777668714523,
      "step": 1080
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 449.0,
      "completions/mean_length": 414.25,
      "completions/mean_terminated_length": 271.3846130371094,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 170.0,
      "epoch": 0.8648,
      "grad_norm": 2.2090256214141846,
      "kl": 0.0640869140625,
      "learning_rate": 1e-06,
      "loss": -0.0172,
      "num_tokens": 14483202.0,
      "reward": 0.03543311357498169,
      "reward_std": 0.012544544413685799,
      "rewards/bleu_reward_func/mean": 0.03543311357498169,
      "rewards/bleu_reward_func/std": 0.030554452911019325,
      "step": 1081
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 415.0,
      "completions/max_terminated_length": 415.0,
      "completions/mean_length": 139.4375,
      "completions/mean_terminated_length": 139.4375,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.8656,
      "grad_norm": 8.19969654083252,
      "kl": 0.13037109375,
      "learning_rate": 1e-06,
      "loss": 0.2538,
      "num_tokens": 14491720.0,
      "reward": 0.042763207107782364,
      "reward_std": 0.01436428539454937,
      "rewards/bleu_reward_func/mean": 0.042763207107782364,
      "rewards/bleu_reward_func/std": 0.02836323343217373,
      "step": 1082
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 434.0,
      "completions/max_terminated_length": 434.0,
      "completions/mean_length": 221.1875,
      "completions/mean_terminated_length": 221.1875,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.8664,
      "grad_norm": 4.217365741729736,
      "kl": 0.08453369140625,
      "learning_rate": 1e-06,
      "loss": -0.0391,
      "num_tokens": 14504486.0,
      "reward": 0.05494450032711029,
      "reward_std": 0.0162642952054739,
      "rewards/bleu_reward_func/mean": 0.05494450032711029,
      "rewards/bleu_reward_func/std": 0.025794783607125282,
      "step": 1083
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 328.90625,
      "completions/mean_terminated_length": 267.875,
      "completions/min_length": 46.0,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.8672,
      "grad_norm": 2.3826496601104736,
      "kl": 0.05877685546875,
      "learning_rate": 1e-06,
      "loss": -0.0386,
      "num_tokens": 14521755.0,
      "reward": 0.11050405353307724,
      "reward_std": 0.034873202443122864,
      "rewards/bleu_reward_func/mean": 0.11050405353307724,
      "rewards/bleu_reward_func/std": 0.11130403727293015,
      "step": 1084
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 432.0,
      "completions/mean_length": 252.25,
      "completions/mean_terminated_length": 165.6666717529297,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "epoch": 0.868,
      "grad_norm": 3.0554423332214355,
      "kl": 0.079742431640625,
      "learning_rate": 1e-06,
      "loss": -0.0614,
      "num_tokens": 14532419.0,
      "reward": 0.030541863292455673,
      "reward_std": 0.016133692115545273,
      "rewards/bleu_reward_func/mean": 0.030541863292455673,
      "rewards/bleu_reward_func/std": 0.020966939628124237,
      "step": 1085
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 440.0,
      "completions/max_terminated_length": 440.0,
      "completions/mean_length": 208.59375,
      "completions/mean_terminated_length": 208.59375,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.8688,
      "grad_norm": 3.5242645740509033,
      "kl": 0.05633544921875,
      "learning_rate": 1e-06,
      "loss": 0.0629,
      "num_tokens": 14544030.0,
      "reward": 0.04443598911166191,
      "reward_std": 0.021839329972863197,
      "rewards/bleu_reward_func/mean": 0.04443598911166191,
      "rewards/bleu_reward_func/std": 0.03833690285682678,
      "step": 1086
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 370.5625,
      "completions/mean_terminated_length": 315.2174072265625,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "epoch": 0.8696,
      "grad_norm": 2.691652774810791,
      "kl": 0.069580078125,
      "learning_rate": 1e-06,
      "loss": -0.061,
      "num_tokens": 14558856.0,
      "reward": 0.04129372909665108,
      "reward_std": 0.013989726081490517,
      "rewards/bleu_reward_func/mean": 0.04129372909665108,
      "rewards/bleu_reward_func/std": 0.02188796177506447,
      "step": 1087
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 229.78125,
      "completions/mean_terminated_length": 220.6774139404297,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.8704,
      "grad_norm": 4.128385543823242,
      "kl": 0.0552978515625,
      "learning_rate": 1e-06,
      "loss": -0.1428,
      "num_tokens": 14569505.0,
      "reward": 0.26207196712493896,
      "reward_std": 0.07250937074422836,
      "rewards/bleu_reward_func/mean": 0.26207196712493896,
      "rewards/bleu_reward_func/std": 0.34467241168022156,
      "step": 1088
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 434.0,
      "completions/mean_length": 381.5625,
      "completions/mean_terminated_length": 251.125,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "epoch": 0.8712,
      "grad_norm": 2.2956695556640625,
      "kl": 0.05340576171875,
      "learning_rate": 1e-06,
      "loss": 0.0395,
      "num_tokens": 14584859.0,
      "reward": 0.08875560760498047,
      "reward_std": 0.03270617872476578,
      "rewards/bleu_reward_func/mean": 0.08875560760498047,
      "rewards/bleu_reward_func/std": 0.10525688529014587,
      "step": 1089
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 432.0,
      "completions/max_terminated_length": 432.0,
      "completions/mean_length": 213.8125,
      "completions/mean_terminated_length": 213.8125,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.872,
      "grad_norm": 4.106683731079102,
      "kl": 0.087982177734375,
      "learning_rate": 1e-06,
      "loss": -0.0803,
      "num_tokens": 14596485.0,
      "reward": 0.07446331530809402,
      "reward_std": 0.02199474349617958,
      "rewards/bleu_reward_func/mean": 0.07446331530809402,
      "rewards/bleu_reward_func/std": 0.056231312453746796,
      "step": 1090
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 465.0,
      "completions/mean_length": 311.46875,
      "completions/mean_terminated_length": 265.19232177734375,
      "completions/min_length": 102.0,
      "completions/min_terminated_length": 102.0,
      "epoch": 0.8728,
      "grad_norm": 2.7658278942108154,
      "kl": 0.07470703125,
      "learning_rate": 1e-06,
      "loss": 0.009,
      "num_tokens": 14608980.0,
      "reward": 0.03502470254898071,
      "reward_std": 0.010442346334457397,
      "rewards/bleu_reward_func/mean": 0.03502470254898071,
      "rewards/bleu_reward_func/std": 0.012194231152534485,
      "step": 1091
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 166.0,
      "completions/mean_length": 308.875,
      "completions/mean_terminated_length": 105.75,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "epoch": 0.8736,
      "grad_norm": 3.983816385269165,
      "kl": 0.10113525390625,
      "learning_rate": 1e-06,
      "loss": -0.0715,
      "num_tokens": 14623104.0,
      "reward": 0.03933839499950409,
      "reward_std": 0.013284911401569843,
      "rewards/bleu_reward_func/mean": 0.03933839499950409,
      "rewards/bleu_reward_func/std": 0.01618482731282711,
      "step": 1092
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 410.84375,
      "completions/mean_terminated_length": 242.25,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.8744,
      "grad_norm": 2.227384090423584,
      "kl": 0.05706787109375,
      "learning_rate": 1e-06,
      "loss": -0.0022,
      "num_tokens": 14641235.0,
      "reward": 0.025841325521469116,
      "reward_std": 0.008813188411295414,
      "rewards/bleu_reward_func/mean": 0.025841325521469116,
      "rewards/bleu_reward_func/std": 0.01780826225876808,
      "step": 1093
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 420.0,
      "completions/max_terminated_length": 420.0,
      "completions/mean_length": 192.6875,
      "completions/mean_terminated_length": 192.6875,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 78.0,
      "epoch": 0.8752,
      "grad_norm": 5.167781352996826,
      "kl": 0.081085205078125,
      "learning_rate": 1e-06,
      "loss": -0.0051,
      "num_tokens": 14649313.0,
      "reward": 0.05482110381126404,
      "reward_std": 0.018363406881690025,
      "rewards/bleu_reward_func/mean": 0.05482110381126404,
      "rewards/bleu_reward_func/std": 0.03553561493754387,
      "step": 1094
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 504.0,
      "completions/mean_length": 439.03125,
      "completions/mean_terminated_length": 299.727294921875,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "epoch": 0.876,
      "grad_norm": 2.0647776126861572,
      "kl": 0.0677490234375,
      "learning_rate": 1e-06,
      "loss": 0.0269,
      "num_tokens": 14666842.0,
      "reward": 0.02372824400663376,
      "reward_std": 0.00758307846263051,
      "rewards/bleu_reward_func/mean": 0.02372824400663376,
      "rewards/bleu_reward_func/std": 0.015121630392968655,
      "step": 1095
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 391.0,
      "completions/mean_length": 374.40625,
      "completions/mean_terminated_length": 280.2631530761719,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "epoch": 0.8768,
      "grad_norm": 2.0318610668182373,
      "kl": 0.057373046875,
      "learning_rate": 1e-06,
      "loss": 0.0453,
      "num_tokens": 14681551.0,
      "reward": 0.01864839717745781,
      "reward_std": 0.005486675538122654,
      "rewards/bleu_reward_func/mean": 0.01864839717745781,
      "rewards/bleu_reward_func/std": 0.020449459552764893,
      "step": 1096
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 426.9375,
      "completions/mean_terminated_length": 360.77777099609375,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 154.0,
      "epoch": 0.8776,
      "grad_norm": 2.280367374420166,
      "kl": 0.0638427734375,
      "learning_rate": 1e-06,
      "loss": -0.045,
      "num_tokens": 14700125.0,
      "reward": 0.041227325797080994,
      "reward_std": 0.029045000672340393,
      "rewards/bleu_reward_func/mean": 0.041227325797080994,
      "rewards/bleu_reward_func/std": 0.05818454921245575,
      "step": 1097
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 288.71875,
      "completions/mean_terminated_length": 237.19232177734375,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.8784,
      "grad_norm": 4.109658241271973,
      "kl": 0.07611083984375,
      "learning_rate": 1e-06,
      "loss": -0.0113,
      "num_tokens": 14711996.0,
      "reward": 0.08606592565774918,
      "reward_std": 0.030430836603045464,
      "rewards/bleu_reward_func/mean": 0.08606592565774918,
      "rewards/bleu_reward_func/std": 0.06479021161794662,
      "step": 1098
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 362.0,
      "completions/max_terminated_length": 362.0,
      "completions/mean_length": 238.15625,
      "completions/mean_terminated_length": 238.15625,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "epoch": 0.8792,
      "grad_norm": 2.743206262588501,
      "kl": 0.07562255859375,
      "learning_rate": 1e-06,
      "loss": 0.0047,
      "num_tokens": 14722057.0,
      "reward": 0.04888010770082474,
      "reward_std": 0.016005024313926697,
      "rewards/bleu_reward_func/mean": 0.04888010770082474,
      "rewards/bleu_reward_func/std": 0.026392478495836258,
      "step": 1099
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 461.0,
      "completions/mean_length": 385.84375,
      "completions/mean_terminated_length": 319.76190185546875,
      "completions/min_length": 194.0,
      "completions/min_terminated_length": 194.0,
      "epoch": 0.88,
      "grad_norm": 2.339637279510498,
      "kl": 0.063323974609375,
      "learning_rate": 1e-06,
      "loss": -0.0166,
      "num_tokens": 14737660.0,
      "reward": 0.05646644905209541,
      "reward_std": 0.022537769749760628,
      "rewards/bleu_reward_func/mean": 0.05646644905209541,
      "rewards/bleu_reward_func/std": 0.03494390472769737,
      "step": 1100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 99.0,
      "completions/mean_length": 399.9375,
      "completions/mean_terminated_length": 63.75,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.8808,
      "grad_norm": 4.858819484710693,
      "kl": 0.0853271484375,
      "learning_rate": 1e-06,
      "loss": 0.0164,
      "num_tokens": 14754002.0,
      "reward": 0.08757828921079636,
      "reward_std": 0.027417277917265892,
      "rewards/bleu_reward_func/mean": 0.08757828921079636,
      "rewards/bleu_reward_func/std": 0.07575616985559464,
      "step": 1101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 467.0,
      "completions/mean_length": 381.75,
      "completions/mean_terminated_length": 266.8235168457031,
      "completions/min_length": 79.0,
      "completions/min_terminated_length": 79.0,
      "epoch": 0.8816,
      "grad_norm": 2.965717315673828,
      "kl": 0.0623779296875,
      "learning_rate": 1e-06,
      "loss": -0.0092,
      "num_tokens": 14769226.0,
      "reward": 0.07998257875442505,
      "reward_std": 0.047210924327373505,
      "rewards/bleu_reward_func/mean": 0.07998257875442505,
      "rewards/bleu_reward_func/std": 0.06257197260856628,
      "step": 1102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 469.0,
      "completions/mean_length": 368.625,
      "completions/mean_terminated_length": 282.6000061035156,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.8824,
      "grad_norm": 2.529975652694702,
      "kl": 0.05145263671875,
      "learning_rate": 1e-06,
      "loss": 0.1021,
      "num_tokens": 14783990.0,
      "reward": 0.05064859241247177,
      "reward_std": 0.030285051092505455,
      "rewards/bleu_reward_func/mean": 0.05064859241247177,
      "rewards/bleu_reward_func/std": 0.050740811973810196,
      "step": 1103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 371.0,
      "completions/mean_length": 458.46875,
      "completions/mean_terminated_length": 297.875,
      "completions/min_length": 231.0,
      "completions/min_terminated_length": 231.0,
      "epoch": 0.8832,
      "grad_norm": 2.1087520122528076,
      "kl": 0.05126953125,
      "learning_rate": 1e-06,
      "loss": 0.0163,
      "num_tokens": 14803693.0,
      "reward": 0.08669351041316986,
      "reward_std": 0.02402358688414097,
      "rewards/bleu_reward_func/mean": 0.08669351041316986,
      "rewards/bleu_reward_func/std": 0.08511854708194733,
      "step": 1104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 448.0,
      "completions/max_terminated_length": 448.0,
      "completions/mean_length": 188.1875,
      "completions/mean_terminated_length": 188.1875,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.884,
      "grad_norm": 4.063257217407227,
      "kl": 0.0767822265625,
      "learning_rate": 1e-06,
      "loss": 0.18,
      "num_tokens": 14812451.0,
      "reward": 0.05730229616165161,
      "reward_std": 0.03279360383749008,
      "rewards/bleu_reward_func/mean": 0.05730229616165161,
      "rewards/bleu_reward_func/std": 0.0810193419456482,
      "step": 1105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 391.0,
      "completions/max_terminated_length": 391.0,
      "completions/mean_length": 231.09375,
      "completions/mean_terminated_length": 231.09375,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "epoch": 0.8848,
      "grad_norm": 3.3575527667999268,
      "kl": 0.0682373046875,
      "learning_rate": 1e-06,
      "loss": -0.0423,
      "num_tokens": 14822950.0,
      "reward": 0.08444621413946152,
      "reward_std": 0.0277608260512352,
      "rewards/bleu_reward_func/mean": 0.08444621413946152,
      "rewards/bleu_reward_func/std": 0.03642307594418526,
      "step": 1106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 301.96875,
      "completions/mean_terminated_length": 295.19354248046875,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "epoch": 0.8856,
      "grad_norm": 2.9735851287841797,
      "kl": 0.06231689453125,
      "learning_rate": 1e-06,
      "loss": 0.0507,
      "num_tokens": 14835285.0,
      "reward": 0.045117855072021484,
      "reward_std": 0.0092654749751091,
      "rewards/bleu_reward_func/mean": 0.045117855072021484,
      "rewards/bleu_reward_func/std": 0.017960846424102783,
      "step": 1107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 284.6875,
      "completions/mean_terminated_length": 208.9166717529297,
      "completions/min_length": 95.0,
      "completions/min_terminated_length": 95.0,
      "epoch": 0.8864,
      "grad_norm": 2.991635322570801,
      "kl": 0.0849609375,
      "learning_rate": 1e-06,
      "loss": -0.0632,
      "num_tokens": 14847299.0,
      "reward": 0.029840070754289627,
      "reward_std": 0.011749091558158398,
      "rewards/bleu_reward_func/mean": 0.029840070754289627,
      "rewards/bleu_reward_func/std": 0.02153618633747101,
      "step": 1108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 351.3125,
      "completions/mean_terminated_length": 297.75,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "epoch": 0.8872,
      "grad_norm": 2.6633989810943604,
      "kl": 0.07733154296875,
      "learning_rate": 1e-06,
      "loss": -0.0163,
      "num_tokens": 14860997.0,
      "reward": 0.10886503756046295,
      "reward_std": 0.021869435906410217,
      "rewards/bleu_reward_func/mean": 0.10886503756046295,
      "rewards/bleu_reward_func/std": 0.0551656112074852,
      "step": 1109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 349.09375,
      "completions/mean_terminated_length": 318.9259338378906,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.888,
      "grad_norm": 2.456130027770996,
      "kl": 0.05157470703125,
      "learning_rate": 1e-06,
      "loss": 0.0138,
      "num_tokens": 14874728.0,
      "reward": 0.03805284574627876,
      "reward_std": 0.012986007146537304,
      "rewards/bleu_reward_func/mean": 0.03805284574627876,
      "rewards/bleu_reward_func/std": 0.01945861615240574,
      "step": 1110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 415.4375,
      "completions/mean_terminated_length": 364.8571472167969,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 248.0,
      "epoch": 0.8888,
      "grad_norm": 2.130664110183716,
      "kl": 0.050140380859375,
      "learning_rate": 1e-06,
      "loss": -0.0034,
      "num_tokens": 14890278.0,
      "reward": 0.0457424521446228,
      "reward_std": 0.016246361657977104,
      "rewards/bleu_reward_func/mean": 0.0457424521446228,
      "rewards/bleu_reward_func/std": 0.033452119678258896,
      "step": 1111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 511.9375,
      "completions/mean_terminated_length": 510.0,
      "completions/min_length": 510.0,
      "completions/min_terminated_length": 510.0,
      "epoch": 0.8896,
      "grad_norm": 1.8803907632827759,
      "kl": 0.067138671875,
      "learning_rate": 1e-06,
      "loss": 0.0027,
      "num_tokens": 14912388.0,
      "reward": 0.05271358788013458,
      "reward_std": 0.012337183579802513,
      "rewards/bleu_reward_func/mean": 0.05271358788013458,
      "rewards/bleu_reward_func/std": 0.03781459480524063,
      "step": 1112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 419.0,
      "completions/mean_length": 472.5625,
      "completions/mean_terminated_length": 354.25,
      "completions/min_length": 302.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 0.8904,
      "grad_norm": 2.125682830810547,
      "kl": 0.0582275390625,
      "learning_rate": 1e-06,
      "loss": 0.02,
      "num_tokens": 14931926.0,
      "reward": 0.037002939730882645,
      "reward_std": 0.006545543670654297,
      "rewards/bleu_reward_func/mean": 0.037002939730882645,
      "rewards/bleu_reward_func/std": 0.02415914461016655,
      "step": 1113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 448.0,
      "completions/mean_length": 285.5625,
      "completions/mean_terminated_length": 233.3076934814453,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.8912,
      "grad_norm": 2.972381353378296,
      "kl": 0.0584716796875,
      "learning_rate": 1e-06,
      "loss": 0.1917,
      "num_tokens": 14944224.0,
      "reward": 0.07146148383617401,
      "reward_std": 0.04880303889513016,
      "rewards/bleu_reward_func/mean": 0.07146148383617401,
      "rewards/bleu_reward_func/std": 0.07057799398899078,
      "step": 1114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 256.875,
      "completions/mean_terminated_length": 209.629638671875,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.892,
      "grad_norm": 4.273919105529785,
      "kl": 0.043212890625,
      "learning_rate": 1e-06,
      "loss": -0.1796,
      "num_tokens": 14958652.0,
      "reward": 0.05981897562742233,
      "reward_std": 0.048418186604976654,
      "rewards/bleu_reward_func/mean": 0.05981897562742233,
      "rewards/bleu_reward_func/std": 0.05630670115351677,
      "step": 1115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 301.65625,
      "completions/mean_terminated_length": 287.63336181640625,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.8928,
      "grad_norm": 3.231736421585083,
      "kl": 0.06976318359375,
      "learning_rate": 1e-06,
      "loss": 0.1316,
      "num_tokens": 14971193.0,
      "reward": 0.09151319414377213,
      "reward_std": 0.03449167683720589,
      "rewards/bleu_reward_func/mean": 0.09151319414377213,
      "rewards/bleu_reward_func/std": 0.08941276371479034,
      "step": 1116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 310.0,
      "completions/mean_terminated_length": 218.18182373046875,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.8936,
      "grad_norm": 3.243455648422241,
      "kl": 0.09100341796875,
      "learning_rate": 1e-06,
      "loss": 0.0944,
      "num_tokens": 14983321.0,
      "reward": 0.07781277596950531,
      "reward_std": 0.015596391633152962,
      "rewards/bleu_reward_func/mean": 0.07781277596950531,
      "rewards/bleu_reward_func/std": 0.08945278078317642,
      "step": 1117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 305.40625,
      "completions/mean_terminated_length": 298.7419128417969,
      "completions/min_length": 71.0,
      "completions/min_terminated_length": 71.0,
      "epoch": 0.8944,
      "grad_norm": 2.6733896732330322,
      "kl": 0.049560546875,
      "learning_rate": 1e-06,
      "loss": 0.1494,
      "num_tokens": 14995086.0,
      "reward": 0.03880076855421066,
      "reward_std": 0.01223827339708805,
      "rewards/bleu_reward_func/mean": 0.03880076855421066,
      "rewards/bleu_reward_func/std": 0.017184842377901077,
      "step": 1118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 321.0,
      "completions/mean_terminated_length": 257.3333435058594,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.8952,
      "grad_norm": 2.7894961833953857,
      "kl": 0.05950927734375,
      "learning_rate": 1e-06,
      "loss": -0.0209,
      "num_tokens": 15010286.0,
      "reward": 0.08945924043655396,
      "reward_std": 0.03418608009815216,
      "rewards/bleu_reward_func/mean": 0.08945924043655396,
      "rewards/bleu_reward_func/std": 0.06008521839976311,
      "step": 1119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 411.0,
      "completions/mean_length": 261.1875,
      "completions/mean_terminated_length": 177.58334350585938,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.896,
      "grad_norm": 2.97663950920105,
      "kl": 0.086181640625,
      "learning_rate": 1e-06,
      "loss": 0.166,
      "num_tokens": 15023892.0,
      "reward": 0.1153651624917984,
      "reward_std": 0.0873895213007927,
      "rewards/bleu_reward_func/mean": 0.1153651624917984,
      "rewards/bleu_reward_func/std": 0.11178025603294373,
      "step": 1120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 449.46875,
      "completions/mean_terminated_length": 345.25,
      "completions/min_length": 220.0,
      "completions/min_terminated_length": 220.0,
      "epoch": 0.8968,
      "grad_norm": 2.1350767612457275,
      "kl": 0.066314697265625,
      "learning_rate": 1e-06,
      "loss": 0.0029,
      "num_tokens": 15043787.0,
      "reward": 0.02139696478843689,
      "reward_std": 0.004052299074828625,
      "rewards/bleu_reward_func/mean": 0.02139696478843689,
      "rewards/bleu_reward_func/std": 0.01895984448492527,
      "step": 1121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 356.59375,
      "completions/mean_terminated_length": 285.9545593261719,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 0.8976,
      "grad_norm": 2.074108839035034,
      "kl": 0.0611572265625,
      "learning_rate": 1e-06,
      "loss": -0.0119,
      "num_tokens": 15057830.0,
      "reward": 0.04125259816646576,
      "reward_std": 0.017305299639701843,
      "rewards/bleu_reward_func/mean": 0.04125259816646576,
      "rewards/bleu_reward_func/std": 0.02421402372419834,
      "step": 1122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 442.46875,
      "completions/mean_terminated_length": 309.727294921875,
      "completions/min_length": 197.0,
      "completions/min_terminated_length": 197.0,
      "epoch": 0.8984,
      "grad_norm": 2.22540283203125,
      "kl": 0.05780029296875,
      "learning_rate": 1e-06,
      "loss": -0.0008,
      "num_tokens": 15075733.0,
      "reward": 0.06470570713281631,
      "reward_std": 0.0157428327947855,
      "rewards/bleu_reward_func/mean": 0.06470570713281631,
      "rewards/bleu_reward_func/std": 0.021143831312656403,
      "step": 1123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 510.09375,
      "completions/mean_terminated_length": 481.5,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 0.8992,
      "grad_norm": 2.060910701751709,
      "kl": 0.0660400390625,
      "learning_rate": 1e-06,
      "loss": 0.0009,
      "num_tokens": 15095424.0,
      "reward": 0.04325367882847786,
      "reward_std": 0.017079303041100502,
      "rewards/bleu_reward_func/mean": 0.04325367882847786,
      "rewards/bleu_reward_func/std": 0.062008537352085114,
      "step": 1124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 348.8125,
      "completions/mean_terminated_length": 325.5,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 192.0,
      "epoch": 0.9,
      "grad_norm": 2.4369888305664062,
      "kl": 0.05029296875,
      "learning_rate": 1e-06,
      "loss": 0.0233,
      "num_tokens": 15111818.0,
      "reward": 0.044592827558517456,
      "reward_std": 0.022017713636159897,
      "rewards/bleu_reward_func/mean": 0.044592827558517456,
      "rewards/bleu_reward_func/std": 0.042288888245821,
      "step": 1125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 304.0,
      "completions/mean_length": 239.875,
      "completions/mean_terminated_length": 163.67999267578125,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "epoch": 0.9008,
      "grad_norm": 2.867711067199707,
      "kl": 0.0728759765625,
      "learning_rate": 1e-06,
      "loss": 0.0002,
      "num_tokens": 15121334.0,
      "reward": 0.01965026557445526,
      "reward_std": 0.00810272991657257,
      "rewards/bleu_reward_func/mean": 0.01965026557445526,
      "rewards/bleu_reward_func/std": 0.010319051332771778,
      "step": 1126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 420.40625,
      "completions/mean_terminated_length": 378.7727355957031,
      "completions/min_length": 221.0,
      "completions/min_terminated_length": 221.0,
      "epoch": 0.9016,
      "grad_norm": 2.170319080352783,
      "kl": 0.039642333984375,
      "learning_rate": 1e-06,
      "loss": -0.0223,
      "num_tokens": 15138355.0,
      "reward": 0.08797721564769745,
      "reward_std": 0.021382782608270645,
      "rewards/bleu_reward_func/mean": 0.08797721564769745,
      "rewards/bleu_reward_func/std": 0.0978621318936348,
      "step": 1127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 432.3125,
      "completions/mean_terminated_length": 315.8461608886719,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.9024,
      "grad_norm": 2.282930850982666,
      "kl": 0.0628662109375,
      "learning_rate": 1e-06,
      "loss": -0.0182,
      "num_tokens": 15154437.0,
      "reward": 0.022853992879390717,
      "reward_std": 0.007267317268997431,
      "rewards/bleu_reward_func/mean": 0.022853992879390717,
      "rewards/bleu_reward_func/std": 0.010599992237985134,
      "step": 1128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 389.34375,
      "completions/mean_terminated_length": 333.5909118652344,
      "completions/min_length": 243.0,
      "completions/min_terminated_length": 243.0,
      "epoch": 0.9032,
      "grad_norm": 2.7985355854034424,
      "kl": 0.04034423828125,
      "learning_rate": 1e-06,
      "loss": 0.0193,
      "num_tokens": 15169968.0,
      "reward": 0.05068669840693474,
      "reward_std": 0.011722628958523273,
      "rewards/bleu_reward_func/mean": 0.05068669840693474,
      "rewards/bleu_reward_func/std": 0.028417525812983513,
      "step": 1129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 263.3125,
      "completions/mean_terminated_length": 237.58621215820312,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.904,
      "grad_norm": 3.5234053134918213,
      "kl": 0.0648193359375,
      "learning_rate": 1e-06,
      "loss": 0.0324,
      "num_tokens": 15180522.0,
      "reward": 0.0427127331495285,
      "reward_std": 0.018165353685617447,
      "rewards/bleu_reward_func/mean": 0.0427127331495285,
      "rewards/bleu_reward_func/std": 0.02901976928114891,
      "step": 1130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 482.0,
      "completions/mean_length": 362.46875,
      "completions/mean_terminated_length": 312.625,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "epoch": 0.9048,
      "grad_norm": 2.3100168704986572,
      "kl": 0.0533447265625,
      "learning_rate": 1e-06,
      "loss": -0.0478,
      "num_tokens": 15195017.0,
      "reward": 0.06346133351325989,
      "reward_std": 0.023072250187397003,
      "rewards/bleu_reward_func/mean": 0.06346133351325989,
      "rewards/bleu_reward_func/std": 0.043767333030700684,
      "step": 1131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 401.0,
      "completions/mean_length": 262.3125,
      "completions/mean_terminated_length": 204.69232177734375,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.9056,
      "grad_norm": 3.143620014190674,
      "kl": 0.052947998046875,
      "learning_rate": 1e-06,
      "loss": -0.0513,
      "num_tokens": 15206091.0,
      "reward": 0.0305976253002882,
      "reward_std": 0.014570488594472408,
      "rewards/bleu_reward_func/mean": 0.0305976253002882,
      "rewards/bleu_reward_func/std": 0.018198352307081223,
      "step": 1132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 443.0,
      "completions/mean_length": 328.46875,
      "completions/mean_terminated_length": 267.29168701171875,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "epoch": 0.9064,
      "grad_norm": 2.5279524326324463,
      "kl": 0.06573486328125,
      "learning_rate": 1e-06,
      "loss": 0.0373,
      "num_tokens": 15219594.0,
      "reward": 0.03247949108481407,
      "reward_std": 0.0067100487649440765,
      "rewards/bleu_reward_func/mean": 0.03247949108481407,
      "rewards/bleu_reward_func/std": 0.023386213928461075,
      "step": 1133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 503.0,
      "completions/mean_length": 256.25,
      "completions/mean_terminated_length": 239.20001220703125,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.9072,
      "grad_norm": 2.703608274459839,
      "kl": 0.06512451171875,
      "learning_rate": 1e-06,
      "loss": -0.0771,
      "num_tokens": 15230138.0,
      "reward": 0.055908337235450745,
      "reward_std": 0.02350510098040104,
      "rewards/bleu_reward_func/mean": 0.055908337235450745,
      "rewards/bleu_reward_func/std": 0.04675652086734772,
      "step": 1134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 469.0,
      "completions/mean_length": 287.21875,
      "completions/mean_terminated_length": 263.96551513671875,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.908,
      "grad_norm": 3.3396432399749756,
      "kl": 0.0732421875,
      "learning_rate": 1e-06,
      "loss": -0.1557,
      "num_tokens": 15241889.0,
      "reward": 0.06957037001848221,
      "reward_std": 0.03351438045501709,
      "rewards/bleu_reward_func/mean": 0.06957037001848221,
      "rewards/bleu_reward_func/std": 0.06069939583539963,
      "step": 1135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 342.625,
      "completions/mean_terminated_length": 286.16668701171875,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "epoch": 0.9088,
      "grad_norm": 2.8781590461730957,
      "kl": 0.0584716796875,
      "learning_rate": 1e-06,
      "loss": 0.0245,
      "num_tokens": 15259365.0,
      "reward": 0.04491988569498062,
      "reward_std": 0.012549539096653461,
      "rewards/bleu_reward_func/mean": 0.04491988569498062,
      "rewards/bleu_reward_func/std": 0.019155489280819893,
      "step": 1136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 297.71875,
      "completions/mean_terminated_length": 226.2916717529297,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.9096,
      "grad_norm": 2.8332650661468506,
      "kl": 0.073974609375,
      "learning_rate": 1e-06,
      "loss": 0.0845,
      "num_tokens": 15273084.0,
      "reward": 0.040149278938770294,
      "reward_std": 0.017163297161459923,
      "rewards/bleu_reward_func/mean": 0.040149278938770294,
      "rewards/bleu_reward_func/std": 0.028974436223506927,
      "step": 1137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 376.4375,
      "completions/mean_terminated_length": 323.39129638671875,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 73.0,
      "epoch": 0.9104,
      "grad_norm": 2.322246551513672,
      "kl": 0.0478515625,
      "learning_rate": 1e-06,
      "loss": 0.1149,
      "num_tokens": 15287562.0,
      "reward": 0.04656628519296646,
      "reward_std": 0.021638479083776474,
      "rewards/bleu_reward_func/mean": 0.04656628519296646,
      "rewards/bleu_reward_func/std": 0.033972807228565216,
      "step": 1138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 362.0,
      "completions/mean_length": 244.09375,
      "completions/mean_terminated_length": 205.82144165039062,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.9112,
      "grad_norm": 2.6419429779052734,
      "kl": 0.05712890625,
      "learning_rate": 1e-06,
      "loss": 0.0536,
      "num_tokens": 15297725.0,
      "reward": 0.07112360745668411,
      "reward_std": 0.036042600870132446,
      "rewards/bleu_reward_func/mean": 0.07112360745668411,
      "rewards/bleu_reward_func/std": 0.08656957000494003,
      "step": 1139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 405.78125,
      "completions/mean_terminated_length": 357.5,
      "completions/min_length": 207.0,
      "completions/min_terminated_length": 207.0,
      "epoch": 0.912,
      "grad_norm": 2.181318998336792,
      "kl": 0.0428466796875,
      "learning_rate": 1e-06,
      "loss": 0.0321,
      "num_tokens": 15314014.0,
      "reward": 0.051423329859972,
      "reward_std": 0.016586489975452423,
      "rewards/bleu_reward_func/mean": 0.051423329859972,
      "rewards/bleu_reward_func/std": 0.028527207672595978,
      "step": 1140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 324.78125,
      "completions/mean_terminated_length": 272.3599853515625,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "epoch": 0.9128,
      "grad_norm": 2.6057114601135254,
      "kl": 0.05413818359375,
      "learning_rate": 1e-06,
      "loss": 0.0203,
      "num_tokens": 15326527.0,
      "reward": 0.0373198464512825,
      "reward_std": 0.007190403528511524,
      "rewards/bleu_reward_func/mean": 0.0373198464512825,
      "rewards/bleu_reward_func/std": 0.011146528646349907,
      "step": 1141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 453.0,
      "completions/mean_length": 325.96875,
      "completions/mean_terminated_length": 263.9583435058594,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "epoch": 0.9136,
      "grad_norm": 2.6041626930236816,
      "kl": 0.0791015625,
      "learning_rate": 1e-06,
      "loss": -0.0076,
      "num_tokens": 15339198.0,
      "reward": 0.049278415739536285,
      "reward_std": 0.01710457354784012,
      "rewards/bleu_reward_func/mean": 0.049278415739536285,
      "rewards/bleu_reward_func/std": 0.037151776254177094,
      "step": 1142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 376.78125,
      "completions/mean_terminated_length": 202.92857360839844,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.9144,
      "grad_norm": 4.027402877807617,
      "kl": 0.0850830078125,
      "learning_rate": 1e-06,
      "loss": -0.0234,
      "num_tokens": 15357719.0,
      "reward": 0.028639614582061768,
      "reward_std": 0.007897703908383846,
      "rewards/bleu_reward_func/mean": 0.028639614582061768,
      "rewards/bleu_reward_func/std": 0.018189268186688423,
      "step": 1143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 467.0,
      "completions/mean_length": 255.9375,
      "completions/mean_terminated_length": 247.6774139404297,
      "completions/min_length": 17.0,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.9152,
      "grad_norm": 5.938324451446533,
      "kl": 0.082763671875,
      "learning_rate": 1e-06,
      "loss": 0.0391,
      "num_tokens": 15372557.0,
      "reward": 0.06757717579603195,
      "reward_std": 0.023928619921207428,
      "rewards/bleu_reward_func/mean": 0.06757717579603195,
      "rewards/bleu_reward_func/std": 0.04241359233856201,
      "step": 1144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 393.8125,
      "completions/mean_terminated_length": 366.5384826660156,
      "completions/min_length": 218.0,
      "completions/min_terminated_length": 218.0,
      "epoch": 0.916,
      "grad_norm": 2.242014169692993,
      "kl": 0.0362548828125,
      "learning_rate": 1e-06,
      "loss": -0.0631,
      "num_tokens": 15387383.0,
      "reward": 0.05110463500022888,
      "reward_std": 0.0156728345900774,
      "rewards/bleu_reward_func/mean": 0.05110463500022888,
      "rewards/bleu_reward_func/std": 0.04327816516160965,
      "step": 1145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 333.6875,
      "completions/mean_terminated_length": 292.5384826660156,
      "completions/min_length": 173.0,
      "completions/min_terminated_length": 173.0,
      "epoch": 0.9168,
      "grad_norm": 2.574808120727539,
      "kl": 0.04595947265625,
      "learning_rate": 1e-06,
      "loss": 0.0006,
      "num_tokens": 15399901.0,
      "reward": 0.04336816817522049,
      "reward_std": 0.009710687212646008,
      "rewards/bleu_reward_func/mean": 0.04336816817522049,
      "rewards/bleu_reward_func/std": 0.04393818601965904,
      "step": 1146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 444.0,
      "completions/mean_length": 284.375,
      "completions/mean_terminated_length": 220.63999938964844,
      "completions/min_length": 37.0,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.9176,
      "grad_norm": 3.7397375106811523,
      "kl": 0.055633544921875,
      "learning_rate": 1e-06,
      "loss": -0.2068,
      "num_tokens": 15415873.0,
      "reward": 0.08271771669387817,
      "reward_std": 0.03239203989505768,
      "rewards/bleu_reward_func/mean": 0.08271771669387817,
      "rewards/bleu_reward_func/std": 0.05870966985821724,
      "step": 1147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 281.375,
      "completions/mean_terminated_length": 216.79998779296875,
      "completions/min_length": 85.0,
      "completions/min_terminated_length": 85.0,
      "epoch": 0.9184,
      "grad_norm": 3.4246437549591064,
      "kl": 0.03533935546875,
      "learning_rate": 1e-06,
      "loss": 0.0401,
      "num_tokens": 15427453.0,
      "reward": 0.02845573052763939,
      "reward_std": 0.009434389881789684,
      "rewards/bleu_reward_func/mean": 0.02845573052763939,
      "rewards/bleu_reward_func/std": 0.010721604339778423,
      "step": 1148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 330.375,
      "completions/mean_terminated_length": 269.8333435058594,
      "completions/min_length": 98.0,
      "completions/min_terminated_length": 98.0,
      "epoch": 0.9192,
      "grad_norm": 2.9520516395568848,
      "kl": 0.05230712890625,
      "learning_rate": 1e-06,
      "loss": -0.0683,
      "num_tokens": 15441313.0,
      "reward": 0.09095876663923264,
      "reward_std": 0.031137729063630104,
      "rewards/bleu_reward_func/mean": 0.09095876663923264,
      "rewards/bleu_reward_func/std": 0.0966407060623169,
      "step": 1149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 379.0,
      "completions/max_terminated_length": 379.0,
      "completions/mean_length": 196.59375,
      "completions/mean_terminated_length": 196.59375,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 73.0,
      "epoch": 0.92,
      "grad_norm": 3.3121564388275146,
      "kl": 0.11737060546875,
      "learning_rate": 1e-06,
      "loss": 0.0138,
      "num_tokens": 15449564.0,
      "reward": 0.04255712777376175,
      "reward_std": 0.015855029225349426,
      "rewards/bleu_reward_func/mean": 0.04255712777376175,
      "rewards/bleu_reward_func/std": 0.018233712762594223,
      "step": 1150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 309.0,
      "completions/mean_length": 231.875,
      "completions/mean_terminated_length": 138.5,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.9208,
      "grad_norm": 3.608539342880249,
      "kl": 0.07647705078125,
      "learning_rate": 1e-06,
      "loss": 0.0542,
      "num_tokens": 15459792.0,
      "reward": 0.04274564981460571,
      "reward_std": 0.016751842573285103,
      "rewards/bleu_reward_func/mean": 0.04274564981460571,
      "rewards/bleu_reward_func/std": 0.025919852778315544,
      "step": 1151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 411.90625,
      "completions/mean_terminated_length": 311.8125,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "epoch": 0.9216,
      "grad_norm": 2.1542413234710693,
      "kl": 0.076904296875,
      "learning_rate": 1e-06,
      "loss": 0.0758,
      "num_tokens": 15475893.0,
      "reward": 0.046681758016347885,
      "reward_std": 0.014086486771702766,
      "rewards/bleu_reward_func/mean": 0.046681758016347885,
      "rewards/bleu_reward_func/std": 0.02536054514348507,
      "step": 1152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 368.0,
      "completions/max_terminated_length": 368.0,
      "completions/mean_length": 148.8125,
      "completions/mean_terminated_length": 148.8125,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.9224,
      "grad_norm": 4.492018222808838,
      "kl": 0.088714599609375,
      "learning_rate": 1e-06,
      "loss": 0.0612,
      "num_tokens": 15483447.0,
      "reward": 0.042024992406368256,
      "reward_std": 0.019391583278775215,
      "rewards/bleu_reward_func/mean": 0.042024992406368256,
      "rewards/bleu_reward_func/std": 0.029765766113996506,
      "step": 1153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 365.21875,
      "completions/mean_terminated_length": 288.3333435058594,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.9232,
      "grad_norm": 2.9250950813293457,
      "kl": 0.0654296875,
      "learning_rate": 1e-06,
      "loss": -0.0311,
      "num_tokens": 15497710.0,
      "reward": 0.0304352305829525,
      "reward_std": 0.01103723980486393,
      "rewards/bleu_reward_func/mean": 0.0304352305829525,
      "rewards/bleu_reward_func/std": 0.03708275035023689,
      "step": 1154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 364.3125,
      "completions/mean_terminated_length": 234.0,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "epoch": 0.924,
      "grad_norm": 2.453594207763672,
      "kl": 0.06170654296875,
      "learning_rate": 1e-06,
      "loss": -0.0598,
      "num_tokens": 15512032.0,
      "reward": 0.07078155130147934,
      "reward_std": 0.021277839317917824,
      "rewards/bleu_reward_func/mean": 0.07078155130147934,
      "rewards/bleu_reward_func/std": 0.05780218914151192,
      "step": 1155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 399.0,
      "completions/mean_length": 267.78125,
      "completions/mean_terminated_length": 232.8928680419922,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "epoch": 0.9248,
      "grad_norm": 2.6847267150878906,
      "kl": 0.0552978515625,
      "learning_rate": 1e-06,
      "loss": -0.0401,
      "num_tokens": 15522577.0,
      "reward": 0.051142215728759766,
      "reward_std": 0.019845107570290565,
      "rewards/bleu_reward_func/mean": 0.051142215728759766,
      "rewards/bleu_reward_func/std": 0.026456067338585854,
      "step": 1156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 308.84375,
      "completions/mean_terminated_length": 279.8214416503906,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.9256,
      "grad_norm": 4.366429805755615,
      "kl": 0.07275390625,
      "learning_rate": 1e-06,
      "loss": -0.0373,
      "num_tokens": 15534556.0,
      "reward": 0.035104669630527496,
      "reward_std": 0.011860767379403114,
      "rewards/bleu_reward_func/mean": 0.035104669630527496,
      "rewards/bleu_reward_func/std": 0.01294864621013403,
      "step": 1157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 285.65625,
      "completions/mean_terminated_length": 253.32144165039062,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.9264,
      "grad_norm": 3.2398438453674316,
      "kl": 0.0435791015625,
      "learning_rate": 1e-06,
      "loss": 0.05,
      "num_tokens": 15545873.0,
      "reward": 0.06170883774757385,
      "reward_std": 0.024604424834251404,
      "rewards/bleu_reward_func/mean": 0.06170883774757385,
      "rewards/bleu_reward_func/std": 0.025780370458960533,
      "step": 1158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 294.125,
      "completions/mean_terminated_length": 208.86956787109375,
      "completions/min_length": 33.0,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.9272,
      "grad_norm": 4.35112190246582,
      "kl": 0.0579833984375,
      "learning_rate": 1e-06,
      "loss": -0.0676,
      "num_tokens": 15559021.0,
      "reward": 0.122794009745121,
      "reward_std": 0.04096021503210068,
      "rewards/bleu_reward_func/mean": 0.122794009745121,
      "rewards/bleu_reward_func/std": 0.1473662406206131,
      "step": 1159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 413.875,
      "completions/mean_terminated_length": 327.29412841796875,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "epoch": 0.928,
      "grad_norm": 2.200023651123047,
      "kl": 0.0390625,
      "learning_rate": 1e-06,
      "loss": 0.0099,
      "num_tokens": 15575625.0,
      "reward": 0.043420542031526566,
      "reward_std": 0.015745732933282852,
      "rewards/bleu_reward_func/mean": 0.043420542031526566,
      "rewards/bleu_reward_func/std": 0.01907273940742016,
      "step": 1160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 358.0,
      "completions/mean_length": 251.3125,
      "completions/mean_terminated_length": 164.4166717529297,
      "completions/min_length": 60.0,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.9288,
      "grad_norm": 3.7651917934417725,
      "kl": 0.1016845703125,
      "learning_rate": 1e-06,
      "loss": 0.0955,
      "num_tokens": 15585923.0,
      "reward": 0.02488887310028076,
      "reward_std": 0.008125792257487774,
      "rewards/bleu_reward_func/mean": 0.02488887310028076,
      "rewards/bleu_reward_func/std": 0.02298036403954029,
      "step": 1161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 460.21875,
      "completions/mean_terminated_length": 384.5384826660156,
      "completions/min_length": 274.0,
      "completions/min_terminated_length": 274.0,
      "epoch": 0.9296,
      "grad_norm": 2.216195583343506,
      "kl": 0.05767822265625,
      "learning_rate": 1e-06,
      "loss": -0.074,
      "num_tokens": 15603114.0,
      "reward": 0.04072732850909233,
      "reward_std": 0.020597945898771286,
      "rewards/bleu_reward_func/mean": 0.04072732850909233,
      "rewards/bleu_reward_func/std": 0.03298228979110718,
      "step": 1162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 346.78125,
      "completions/mean_terminated_length": 233.73684692382812,
      "completions/min_length": 41.0,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.9304,
      "grad_norm": 3.4893603324890137,
      "kl": 0.0799560546875,
      "learning_rate": 1e-06,
      "loss": -0.0215,
      "num_tokens": 15616915.0,
      "reward": 0.06695497781038284,
      "reward_std": 0.024308744817972183,
      "rewards/bleu_reward_func/mean": 0.06695497781038284,
      "rewards/bleu_reward_func/std": 0.035929832607507706,
      "step": 1163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 331.375,
      "completions/mean_terminated_length": 271.16668701171875,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "epoch": 0.9312,
      "grad_norm": 2.481180429458618,
      "kl": 0.044219970703125,
      "learning_rate": 1e-06,
      "loss": -0.073,
      "num_tokens": 15630071.0,
      "reward": 0.030527595430612564,
      "reward_std": 0.01595548912882805,
      "rewards/bleu_reward_func/mean": 0.030527595430612564,
      "rewards/bleu_reward_func/std": 0.025926506146788597,
      "step": 1164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 404.0,
      "completions/mean_length": 379.875,
      "completions/mean_terminated_length": 277.1111145019531,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "epoch": 0.932,
      "grad_norm": 2.4039766788482666,
      "kl": 0.0494384765625,
      "learning_rate": 1e-06,
      "loss": -0.0824,
      "num_tokens": 15645003.0,
      "reward": 0.06304138153791428,
      "reward_std": 0.02535812184214592,
      "rewards/bleu_reward_func/mean": 0.06304138153791428,
      "rewards/bleu_reward_func/std": 0.06451297551393509,
      "step": 1165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 435.0,
      "completions/mean_length": 321.5,
      "completions/mean_terminated_length": 258.0,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.9328,
      "grad_norm": 3.711174964904785,
      "kl": 0.07073974609375,
      "learning_rate": 1e-06,
      "loss": 0.0119,
      "num_tokens": 15657851.0,
      "reward": 0.02661607414484024,
      "reward_std": 0.006588813848793507,
      "rewards/bleu_reward_func/mean": 0.02661607414484024,
      "rewards/bleu_reward_func/std": 0.014483918435871601,
      "step": 1166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 490.0,
      "completions/mean_length": 449.46875,
      "completions/mean_terminated_length": 421.04547119140625,
      "completions/min_length": 312.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 0.9336,
      "grad_norm": 2.1988365650177,
      "kl": 0.05462646484375,
      "learning_rate": 1e-06,
      "loss": -0.0023,
      "num_tokens": 15676234.0,
      "reward": 0.04999478906393051,
      "reward_std": 0.0120457224547863,
      "rewards/bleu_reward_func/mean": 0.04999478906393051,
      "rewards/bleu_reward_func/std": 0.04736227169632912,
      "step": 1167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 506.0,
      "completions/max_terminated_length": 506.0,
      "completions/mean_length": 249.34375,
      "completions/mean_terminated_length": 249.34375,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "epoch": 0.9344,
      "grad_norm": 2.655139207839966,
      "kl": 0.05450439453125,
      "learning_rate": 1e-06,
      "loss": 0.1106,
      "num_tokens": 15686653.0,
      "reward": 0.03939780965447426,
      "reward_std": 0.02221381478011608,
      "rewards/bleu_reward_func/mean": 0.03939780965447426,
      "rewards/bleu_reward_func/std": 0.029370024800300598,
      "step": 1168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 448.0,
      "completions/mean_length": 224.15625,
      "completions/mean_terminated_length": 214.87095642089844,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 73.0,
      "epoch": 0.9352,
      "grad_norm": 2.832167863845825,
      "kl": 0.05328369140625,
      "learning_rate": 1e-06,
      "loss": 0.1287,
      "num_tokens": 15695698.0,
      "reward": 0.07494577765464783,
      "reward_std": 0.019553756341338158,
      "rewards/bleu_reward_func/mean": 0.07494577765464783,
      "rewards/bleu_reward_func/std": 0.09201066941022873,
      "step": 1169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 473.0,
      "completions/mean_length": 266.59375,
      "completions/mean_terminated_length": 221.1481475830078,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.936,
      "grad_norm": 3.4584455490112305,
      "kl": 0.0975341796875,
      "learning_rate": 1e-06,
      "loss": -0.0885,
      "num_tokens": 15706157.0,
      "reward": 0.05156881362199783,
      "reward_std": 0.02069806307554245,
      "rewards/bleu_reward_func/mean": 0.05156881362199783,
      "rewards/bleu_reward_func/std": 0.0440862663090229,
      "step": 1170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 371.65625,
      "completions/mean_terminated_length": 287.45001220703125,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "epoch": 0.9368,
      "grad_norm": 2.4048993587493896,
      "kl": 0.04290771484375,
      "learning_rate": 1e-06,
      "loss": -0.0467,
      "num_tokens": 15720266.0,
      "reward": 0.10653826594352722,
      "reward_std": 0.0501834899187088,
      "rewards/bleu_reward_func/mean": 0.10653826594352722,
      "rewards/bleu_reward_func/std": 0.08799951523542404,
      "step": 1171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 496.0,
      "completions/mean_length": 319.5625,
      "completions/mean_terminated_length": 306.73333740234375,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "epoch": 0.9376,
      "grad_norm": 2.697983503341675,
      "kl": 0.07415771484375,
      "learning_rate": 1e-06,
      "loss": -0.015,
      "num_tokens": 15733156.0,
      "reward": 0.038380008190870285,
      "reward_std": 0.011081306263804436,
      "rewards/bleu_reward_func/mean": 0.038380008190870285,
      "rewards/bleu_reward_func/std": 0.018261730670928955,
      "step": 1172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 479.0,
      "completions/mean_length": 308.0625,
      "completions/mean_terminated_length": 149.44444274902344,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.9384,
      "grad_norm": 3.311505079269409,
      "kl": 0.06927490234375,
      "learning_rate": 1e-06,
      "loss": -0.0721,
      "num_tokens": 15750158.0,
      "reward": 0.0754072293639183,
      "reward_std": 0.028568794950842857,
      "rewards/bleu_reward_func/mean": 0.0754072293639183,
      "rewards/bleu_reward_func/std": 0.0542432963848114,
      "step": 1173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 422.625,
      "completions/mean_terminated_length": 307.71429443359375,
      "completions/min_length": 47.0,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.9392,
      "grad_norm": 3.5287482738494873,
      "kl": 0.075225830078125,
      "learning_rate": 1e-06,
      "loss": 0.2262,
      "num_tokens": 15766810.0,
      "reward": 0.058596234768629074,
      "reward_std": 0.014998164027929306,
      "rewards/bleu_reward_func/mean": 0.058596234768629074,
      "rewards/bleu_reward_func/std": 0.03898334875702858,
      "step": 1174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 456.0,
      "completions/max_terminated_length": 456.0,
      "completions/mean_length": 211.75,
      "completions/mean_terminated_length": 211.75,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 73.0,
      "epoch": 0.94,
      "grad_norm": 3.2140583992004395,
      "kl": 0.07330322265625,
      "learning_rate": 1e-06,
      "loss": 0.1292,
      "num_tokens": 15775706.0,
      "reward": 0.047663480043411255,
      "reward_std": 0.014913933351635933,
      "rewards/bleu_reward_func/mean": 0.047663480043411255,
      "rewards/bleu_reward_func/std": 0.03790759667754173,
      "step": 1175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 469.0,
      "completions/max_terminated_length": 469.0,
      "completions/mean_length": 231.75,
      "completions/mean_terminated_length": 231.75,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.9408,
      "grad_norm": 3.363187551498413,
      "kl": 0.07049560546875,
      "learning_rate": 1e-06,
      "loss": -0.0657,
      "num_tokens": 15785258.0,
      "reward": 0.07324777543544769,
      "reward_std": 0.022441495209932327,
      "rewards/bleu_reward_func/mean": 0.07324777543544769,
      "rewards/bleu_reward_func/std": 0.05588282272219658,
      "step": 1176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 391.4375,
      "completions/mean_terminated_length": 328.28570556640625,
      "completions/min_length": 82.0,
      "completions/min_terminated_length": 82.0,
      "epoch": 0.9416,
      "grad_norm": 2.2609217166900635,
      "kl": 0.066162109375,
      "learning_rate": 1e-06,
      "loss": 0.0827,
      "num_tokens": 15800352.0,
      "reward": 0.04194977134466171,
      "reward_std": 0.015291344374418259,
      "rewards/bleu_reward_func/mean": 0.04194977134466171,
      "rewards/bleu_reward_func/std": 0.035137806087732315,
      "step": 1177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 479.0,
      "completions/mean_length": 366.53125,
      "completions/mean_terminated_length": 253.38888549804688,
      "completions/min_length": 61.0,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.9424,
      "grad_norm": 2.5422301292419434,
      "kl": 0.0784912109375,
      "learning_rate": 1e-06,
      "loss": 0.0902,
      "num_tokens": 15814801.0,
      "reward": 0.024109739810228348,
      "reward_std": 0.008774411864578724,
      "rewards/bleu_reward_func/mean": 0.024109739810228348,
      "rewards/bleu_reward_func/std": 0.012804670259356499,
      "step": 1178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 299.0,
      "completions/max_terminated_length": 299.0,
      "completions/mean_length": 130.0625,
      "completions/mean_terminated_length": 130.0625,
      "completions/min_length": 31.0,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.9432,
      "grad_norm": 4.302926540374756,
      "kl": 0.14208984375,
      "learning_rate": 1e-06,
      "loss": 0.0642,
      "num_tokens": 15825731.0,
      "reward": 0.23027461767196655,
      "reward_std": 0.039070405066013336,
      "rewards/bleu_reward_func/mean": 0.23027461767196655,
      "rewards/bleu_reward_func/std": 0.2543703317642212,
      "step": 1179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 474.0,
      "completions/mean_length": 431.96875,
      "completions/mean_terminated_length": 341.2666931152344,
      "completions/min_length": 169.0,
      "completions/min_terminated_length": 169.0,
      "epoch": 0.944,
      "grad_norm": 2.821815252304077,
      "kl": 0.073486328125,
      "learning_rate": 1e-06,
      "loss": 0.084,
      "num_tokens": 15842882.0,
      "reward": 0.045671649277210236,
      "reward_std": 0.012227097526192665,
      "rewards/bleu_reward_func/mean": 0.045671649277210236,
      "rewards/bleu_reward_func/std": 0.025908511132001877,
      "step": 1180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 376.0,
      "completions/max_terminated_length": 376.0,
      "completions/mean_length": 153.5,
      "completions/mean_terminated_length": 153.5,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "epoch": 0.9448,
      "grad_norm": 3.858163595199585,
      "kl": 0.07586669921875,
      "learning_rate": 1e-06,
      "loss": 0.1228,
      "num_tokens": 15849690.0,
      "reward": 0.0736662819981575,
      "reward_std": 0.03443998470902443,
      "rewards/bleu_reward_func/mean": 0.0736662819981575,
      "rewards/bleu_reward_func/std": 0.05406653508543968,
      "step": 1181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 498.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 228.375,
      "completions/mean_terminated_length": 228.375,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "epoch": 0.9456,
      "grad_norm": 3.6348228454589844,
      "kl": 0.06280517578125,
      "learning_rate": 1e-06,
      "loss": 0.0013,
      "num_tokens": 15862254.0,
      "reward": 0.03549562767148018,
      "reward_std": 0.013098573312163353,
      "rewards/bleu_reward_func/mean": 0.03549562767148018,
      "rewards/bleu_reward_func/std": 0.016965733841061592,
      "step": 1182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 440.5,
      "completions/mean_terminated_length": 420.47998046875,
      "completions/min_length": 275.0,
      "completions/min_terminated_length": 275.0,
      "epoch": 0.9464,
      "grad_norm": 2.2529780864715576,
      "kl": 0.05780029296875,
      "learning_rate": 1e-06,
      "loss": -0.0692,
      "num_tokens": 15878750.0,
      "reward": 0.07513043284416199,
      "reward_std": 0.025522038340568542,
      "rewards/bleu_reward_func/mean": 0.07513043284416199,
      "rewards/bleu_reward_func/std": 0.06642089039087296,
      "step": 1183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 494.0,
      "completions/mean_length": 470.34375,
      "completions/mean_terminated_length": 378.70001220703125,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "epoch": 0.9472,
      "grad_norm": 1.893031358718872,
      "kl": 0.0584716796875,
      "learning_rate": 1e-06,
      "loss": -0.0729,
      "num_tokens": 15897969.0,
      "reward": 0.0447755828499794,
      "reward_std": 0.01626760885119438,
      "rewards/bleu_reward_func/mean": 0.0447755828499794,
      "rewards/bleu_reward_func/std": 0.02777096815407276,
      "step": 1184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 259.90625,
      "completions/mean_terminated_length": 233.8275909423828,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "epoch": 0.948,
      "grad_norm": 2.977755546569824,
      "kl": 0.074462890625,
      "learning_rate": 1e-06,
      "loss": -0.1044,
      "num_tokens": 15909150.0,
      "reward": 0.05368737503886223,
      "reward_std": 0.015224416740238667,
      "rewards/bleu_reward_func/mean": 0.05368737503886223,
      "rewards/bleu_reward_func/std": 0.03522547334432602,
      "step": 1185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 466.0,
      "completions/mean_length": 376.46875,
      "completions/mean_terminated_length": 256.8823547363281,
      "completions/min_length": 89.0,
      "completions/min_terminated_length": 89.0,
      "epoch": 0.9488,
      "grad_norm": 2.1552228927612305,
      "kl": 0.06488037109375,
      "learning_rate": 1e-06,
      "loss": 0.067,
      "num_tokens": 15924293.0,
      "reward": 0.023215215653181076,
      "reward_std": 0.007974323816597462,
      "rewards/bleu_reward_func/mean": 0.023215215653181076,
      "rewards/bleu_reward_func/std": 0.01594514399766922,
      "step": 1186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 453.0,
      "completions/mean_length": 432.8125,
      "completions/mean_terminated_length": 331.0,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 135.0,
      "epoch": 0.9496,
      "grad_norm": 1.988263487815857,
      "kl": 0.033172607421875,
      "learning_rate": 1e-06,
      "loss": 0.0445,
      "num_tokens": 15942727.0,
      "reward": 0.03754414618015289,
      "reward_std": 0.0140055101364851,
      "rewards/bleu_reward_func/mean": 0.03754414618015289,
      "rewards/bleu_reward_func/std": 0.019993774592876434,
      "step": 1187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 425.0,
      "completions/max_terminated_length": 425.0,
      "completions/mean_length": 240.84375,
      "completions/mean_terminated_length": 240.84375,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "epoch": 0.9504,
      "grad_norm": 3.113128900527954,
      "kl": 0.06243896484375,
      "learning_rate": 1e-06,
      "loss": -0.1811,
      "num_tokens": 15952514.0,
      "reward": 0.030695520341396332,
      "reward_std": 0.009731138125061989,
      "rewards/bleu_reward_func/mean": 0.030695520341396332,
      "rewards/bleu_reward_func/std": 0.013646015897393227,
      "step": 1188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 512.0,
      "completions/mean_length": 329.3125,
      "completions/mean_terminated_length": 317.13336181640625,
      "completions/min_length": 85.0,
      "completions/min_terminated_length": 85.0,
      "epoch": 0.9512,
      "grad_norm": 2.414848804473877,
      "kl": 0.045074462890625,
      "learning_rate": 1e-06,
      "loss": -0.0355,
      "num_tokens": 15965252.0,
      "reward": 0.04139825701713562,
      "reward_std": 0.021453116089105606,
      "rewards/bleu_reward_func/mean": 0.04139825701713562,
      "rewards/bleu_reward_func/std": 0.03966396301984787,
      "step": 1189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 467.0,
      "completions/mean_length": 280.625,
      "completions/mean_terminated_length": 203.5,
      "completions/min_length": 69.0,
      "completions/min_terminated_length": 69.0,
      "epoch": 0.952,
      "grad_norm": 3.5793888568878174,
      "kl": 0.06951904296875,
      "learning_rate": 1e-06,
      "loss": -0.0784,
      "num_tokens": 15978392.0,
      "reward": 0.038947951048612595,
      "reward_std": 0.008256456814706326,
      "rewards/bleu_reward_func/mean": 0.038947951048612595,
      "rewards/bleu_reward_func/std": 0.03032156452536583,
      "step": 1190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 469.0,
      "completions/max_terminated_length": 469.0,
      "completions/mean_length": 178.5625,
      "completions/mean_terminated_length": 178.5625,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "epoch": 0.9528,
      "grad_norm": 4.517023086547852,
      "kl": 0.08197021484375,
      "learning_rate": 1e-06,
      "loss": -0.0047,
      "num_tokens": 15987898.0,
      "reward": 0.06466805189847946,
      "reward_std": 0.027795474976301193,
      "rewards/bleu_reward_func/mean": 0.06466805189847946,
      "rewards/bleu_reward_func/std": 0.05030693858861923,
      "step": 1191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 240.625,
      "completions/mean_terminated_length": 164.63999938964844,
      "completions/min_length": 67.0,
      "completions/min_terminated_length": 67.0,
      "epoch": 0.9536,
      "grad_norm": 3.730811357498169,
      "kl": 0.07061767578125,
      "learning_rate": 1e-06,
      "loss": 0.0561,
      "num_tokens": 15998142.0,
      "reward": 0.047298163175582886,
      "reward_std": 0.01436680555343628,
      "rewards/bleu_reward_func/mean": 0.047298163175582886,
      "rewards/bleu_reward_func/std": 0.0181716401129961,
      "step": 1192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 452.0,
      "completions/mean_length": 391.46875,
      "completions/mean_terminated_length": 344.3043518066406,
      "completions/min_length": 202.0,
      "completions/min_terminated_length": 202.0,
      "epoch": 0.9544,
      "grad_norm": 2.0034618377685547,
      "kl": 0.03424072265625,
      "learning_rate": 1e-06,
      "loss": -0.0098,
      "num_tokens": 16015477.0,
      "reward": 0.10216254740953445,
      "reward_std": 0.0467507429420948,
      "rewards/bleu_reward_func/mean": 0.10216254740953445,
      "rewards/bleu_reward_func/std": 0.10135076195001602,
      "step": 1193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 423.46875,
      "completions/mean_terminated_length": 383.227294921875,
      "completions/min_length": 217.0,
      "completions/min_terminated_length": 217.0,
      "epoch": 0.9552,
      "grad_norm": 2.356437921524048,
      "kl": 0.06634521484375,
      "learning_rate": 1e-06,
      "loss": -0.0102,
      "num_tokens": 16033492.0,
      "reward": 0.05241236090660095,
      "reward_std": 0.018471311777830124,
      "rewards/bleu_reward_func/mean": 0.05241236090660095,
      "rewards/bleu_reward_func/std": 0.030719749629497528,
      "step": 1194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 458.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 283.09375,
      "completions/mean_terminated_length": 283.09375,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "epoch": 0.956,
      "grad_norm": 2.7636096477508545,
      "kl": 0.0780029296875,
      "learning_rate": 1e-06,
      "loss": -0.0413,
      "num_tokens": 16044871.0,
      "reward": 0.08429600298404694,
      "reward_std": 0.04460438713431358,
      "rewards/bleu_reward_func/mean": 0.08429600298404694,
      "rewards/bleu_reward_func/std": 0.09819035232067108,
      "step": 1195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 436.0,
      "completions/mean_length": 276.78125,
      "completions/mean_terminated_length": 210.9199981689453,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "epoch": 0.9568,
      "grad_norm": 3.776975393295288,
      "kl": 0.07073974609375,
      "learning_rate": 1e-06,
      "loss": 0.1762,
      "num_tokens": 16056568.0,
      "reward": 0.08197371661663055,
      "reward_std": 0.05241422355175018,
      "rewards/bleu_reward_func/mean": 0.08197371661663055,
      "rewards/bleu_reward_func/std": 0.07864004373550415,
      "step": 1196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 454.40625,
      "completions/mean_terminated_length": 428.227294921875,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 0.9576,
      "grad_norm": 1.9507607221603394,
      "kl": 0.0592041015625,
      "learning_rate": 1e-06,
      "loss": 0.0529,
      "num_tokens": 16074165.0,
      "reward": 0.03464844077825546,
      "reward_std": 0.008609910495579243,
      "rewards/bleu_reward_func/mean": 0.03464844077825546,
      "rewards/bleu_reward_func/std": 0.02310766465961933,
      "step": 1197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 168.84375,
      "completions/mean_terminated_length": 119.8214340209961,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.9584,
      "grad_norm": 4.090397357940674,
      "kl": 0.0902099609375,
      "learning_rate": 1e-06,
      "loss": 0.0189,
      "num_tokens": 16081696.0,
      "reward": 0.06762534379959106,
      "reward_std": 0.053265273571014404,
      "rewards/bleu_reward_func/mean": 0.06762534379959106,
      "rewards/bleu_reward_func/std": 0.08817991614341736,
      "step": 1198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 403.28125,
      "completions/mean_terminated_length": 307.3529357910156,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "epoch": 0.9592,
      "grad_norm": 2.376145839691162,
      "kl": 0.06719970703125,
      "learning_rate": 1e-06,
      "loss": 0.0666,
      "num_tokens": 16099561.0,
      "reward": 0.05755573883652687,
      "reward_std": 0.012290094047784805,
      "rewards/bleu_reward_func/mean": 0.05755573883652687,
      "rewards/bleu_reward_func/std": 0.034789226949214935,
      "step": 1199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 453.0,
      "completions/mean_length": 304.28125,
      "completions/mean_terminated_length": 282.7930908203125,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "epoch": 0.96,
      "grad_norm": 2.4257760047912598,
      "kl": 0.046630859375,
      "learning_rate": 1e-06,
      "loss": -0.0061,
      "num_tokens": 16113370.0,
      "reward": 0.1820225715637207,
      "reward_std": 0.07978139072656631,
      "rewards/bleu_reward_func/mean": 0.1820225715637207,
      "rewards/bleu_reward_func/std": 0.09627845138311386,
      "step": 1200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.90625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 456.0,
      "completions/mean_length": 485.15625,
      "completions/mean_terminated_length": 225.6666717529297,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "epoch": 0.9608,
      "grad_norm": 2.164067268371582,
      "kl": 0.06500244140625,
      "learning_rate": 1e-06,
      "loss": -0.0841,
      "num_tokens": 16133479.0,
      "reward": 0.030571069568395615,
      "reward_std": 0.009849293157458305,
      "rewards/bleu_reward_func/mean": 0.030571069568395615,
      "rewards/bleu_reward_func/std": 0.015346908010542393,
      "step": 1201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 444.25,
      "completions/mean_terminated_length": 376.5,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.9616,
      "grad_norm": 2.0394792556762695,
      "kl": 0.057525634765625,
      "learning_rate": 1e-06,
      "loss": 0.0648,
      "num_tokens": 16152527.0,
      "reward": 0.013919162563979626,
      "reward_std": 0.006773381493985653,
      "rewards/bleu_reward_func/mean": 0.013919162563979626,
      "rewards/bleu_reward_func/std": 0.010089041665196419,
      "step": 1202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 362.5,
      "completions/mean_terminated_length": 260.2105407714844,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.9624,
      "grad_norm": 3.046417474746704,
      "kl": 0.0723876953125,
      "learning_rate": 1e-06,
      "loss": 0.069,
      "num_tokens": 16166959.0,
      "reward": 0.02852245420217514,
      "reward_std": 0.007626072503626347,
      "rewards/bleu_reward_func/mean": 0.02852245420217514,
      "rewards/bleu_reward_func/std": 0.008153699338436127,
      "step": 1203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 475.0,
      "completions/mean_length": 334.09375,
      "completions/mean_terminated_length": 274.79168701171875,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.9632,
      "grad_norm": 2.7134299278259277,
      "kl": 0.06134033203125,
      "learning_rate": 1e-06,
      "loss": 0.0099,
      "num_tokens": 16181378.0,
      "reward": 0.09094207733869553,
      "reward_std": 0.016675401479005814,
      "rewards/bleu_reward_func/mean": 0.09094207733869553,
      "rewards/bleu_reward_func/std": 0.10062676668167114,
      "step": 1204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 335.25,
      "completions/mean_terminated_length": 316.96551513671875,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "epoch": 0.964,
      "grad_norm": 2.4693562984466553,
      "kl": 0.05059814453125,
      "learning_rate": 1e-06,
      "loss": 0.0418,
      "num_tokens": 16194082.0,
      "reward": 0.08112166076898575,
      "reward_std": 0.030807986855506897,
      "rewards/bleu_reward_func/mean": 0.08112166076898575,
      "rewards/bleu_reward_func/std": 0.0743534192442894,
      "step": 1205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 339.3125,
      "completions/mean_terminated_length": 307.3333435058594,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.9648,
      "grad_norm": 2.691605806350708,
      "kl": 0.0670166015625,
      "learning_rate": 1e-06,
      "loss": -0.133,
      "num_tokens": 16206900.0,
      "reward": 0.028721408918499947,
      "reward_std": 0.012371614575386047,
      "rewards/bleu_reward_func/mean": 0.028721408918499947,
      "rewards/bleu_reward_func/std": 0.013438318856060505,
      "step": 1206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 415.0,
      "completions/max_terminated_length": 415.0,
      "completions/mean_length": 300.875,
      "completions/mean_terminated_length": 300.875,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 148.0,
      "epoch": 0.9656,
      "grad_norm": 2.7378876209259033,
      "kl": 0.0537109375,
      "learning_rate": 1e-06,
      "loss": -0.0358,
      "num_tokens": 16220984.0,
      "reward": 0.07377751916646957,
      "reward_std": 0.025469692423939705,
      "rewards/bleu_reward_func/mean": 0.07377751916646957,
      "rewards/bleu_reward_func/std": 0.059645578265190125,
      "step": 1207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 246.9375,
      "completions/mean_terminated_length": 158.58334350585938,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.9664,
      "grad_norm": 3.1071829795837402,
      "kl": 0.09368896484375,
      "learning_rate": 1e-06,
      "loss": 0.1223,
      "num_tokens": 16231742.0,
      "reward": 0.07445356994867325,
      "reward_std": 0.0362502858042717,
      "rewards/bleu_reward_func/mean": 0.07445356994867325,
      "rewards/bleu_reward_func/std": 0.06206907704472542,
      "step": 1208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 484.0,
      "completions/mean_length": 358.875,
      "completions/mean_terminated_length": 316.0,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "epoch": 0.9672,
      "grad_norm": 2.5442681312561035,
      "kl": 0.05780029296875,
      "learning_rate": 1e-06,
      "loss": -0.0651,
      "num_tokens": 16248674.0,
      "reward": 0.0365150049328804,
      "reward_std": 0.0132023636251688,
      "rewards/bleu_reward_func/mean": 0.0365150049328804,
      "rewards/bleu_reward_func/std": 0.02038295939564705,
      "step": 1209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 478.0,
      "completions/mean_length": 367.65625,
      "completions/mean_terminated_length": 302.04547119140625,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "epoch": 0.968,
      "grad_norm": 2.3779213428497314,
      "kl": 0.0618896484375,
      "learning_rate": 1e-06,
      "loss": 0.035,
      "num_tokens": 16262847.0,
      "reward": 0.07498673349618912,
      "reward_std": 0.023969056084752083,
      "rewards/bleu_reward_func/mean": 0.07498673349618912,
      "rewards/bleu_reward_func/std": 0.0473894327878952,
      "step": 1210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 366.28125,
      "completions/mean_terminated_length": 309.2608642578125,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.9688,
      "grad_norm": 2.7652668952941895,
      "kl": 0.08758544921875,
      "learning_rate": 1e-06,
      "loss": 0.0972,
      "num_tokens": 16277592.0,
      "reward": 0.05950748920440674,
      "reward_std": 0.02408502995967865,
      "rewards/bleu_reward_func/mean": 0.05950748920440674,
      "rewards/bleu_reward_func/std": 0.06841208040714264,
      "step": 1211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 471.0,
      "completions/mean_length": 292.65625,
      "completions/mean_terminated_length": 219.5416717529297,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "epoch": 0.9696,
      "grad_norm": 3.904935359954834,
      "kl": 0.09576416015625,
      "learning_rate": 1e-06,
      "loss": -0.0152,
      "num_tokens": 16290237.0,
      "reward": 0.0675918310880661,
      "reward_std": 0.013060121797025204,
      "rewards/bleu_reward_func/mean": 0.0675918310880661,
      "rewards/bleu_reward_func/std": 0.05720841512084007,
      "step": 1212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 509.0,
      "completions/mean_length": 420.28125,
      "completions/mean_terminated_length": 378.5909118652344,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.9704,
      "grad_norm": 2.223933219909668,
      "kl": 0.05810546875,
      "learning_rate": 1e-06,
      "loss": -0.0617,
      "num_tokens": 16306462.0,
      "reward": 0.06224057823419571,
      "reward_std": 0.022418636828660965,
      "rewards/bleu_reward_func/mean": 0.06224057823419571,
      "rewards/bleu_reward_func/std": 0.03051481395959854,
      "step": 1213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 489.0,
      "completions/mean_length": 234.4375,
      "completions/mean_terminated_length": 183.0370330810547,
      "completions/min_length": 28.0,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.9712,
      "grad_norm": 5.2260236740112305,
      "kl": 0.071075439453125,
      "learning_rate": 1e-06,
      "loss": -0.1912,
      "num_tokens": 16316172.0,
      "reward": 0.053409986197948456,
      "reward_std": 0.025385111570358276,
      "rewards/bleu_reward_func/mean": 0.053409986197948456,
      "rewards/bleu_reward_func/std": 0.03583426773548126,
      "step": 1214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 292.53125,
      "completions/mean_terminated_length": 219.375,
      "completions/min_length": 64.0,
      "completions/min_terminated_length": 64.0,
      "epoch": 0.972,
      "grad_norm": 3.1006455421447754,
      "kl": 0.05877685546875,
      "learning_rate": 1e-06,
      "loss": -0.0024,
      "num_tokens": 16331325.0,
      "reward": 0.03592420741915703,
      "reward_std": 0.00873337872326374,
      "rewards/bleu_reward_func/mean": 0.03592420741915703,
      "rewards/bleu_reward_func/std": 0.02614421211183071,
      "step": 1215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 508.0,
      "completions/mean_length": 263.75,
      "completions/mean_terminated_length": 181.0,
      "completions/min_length": 26.0,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.9728,
      "grad_norm": 11.719066619873047,
      "kl": 0.18597412109375,
      "learning_rate": 1e-06,
      "loss": -0.1213,
      "num_tokens": 16345373.0,
      "reward": 0.04153061658143997,
      "reward_std": 0.015695935115218163,
      "rewards/bleu_reward_func/mean": 0.04153061658143997,
      "rewards/bleu_reward_func/std": 0.031689949333667755,
      "step": 1216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 495.0,
      "completions/mean_length": 378.28125,
      "completions/mean_terminated_length": 347.423095703125,
      "completions/min_length": 75.0,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.9736,
      "grad_norm": 1.9226611852645874,
      "kl": 0.072265625,
      "learning_rate": 1e-06,
      "loss": -0.0867,
      "num_tokens": 16362718.0,
      "reward": 0.03419474512338638,
      "reward_std": 0.02102738618850708,
      "rewards/bleu_reward_func/mean": 0.03419474512338638,
      "rewards/bleu_reward_func/std": 0.033396027982234955,
      "step": 1217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 411.0,
      "completions/mean_length": 249.0,
      "completions/mean_terminated_length": 240.51612854003906,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "epoch": 0.9744,
      "grad_norm": 3.4589998722076416,
      "kl": 0.07830810546875,
      "learning_rate": 1e-06,
      "loss": -0.1008,
      "num_tokens": 16372462.0,
      "reward": 0.032402701675891876,
      "reward_std": 0.013161510229110718,
      "rewards/bleu_reward_func/mean": 0.032402701675891876,
      "rewards/bleu_reward_func/std": 0.018716424703598022,
      "step": 1218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 465.0,
      "completions/mean_length": 353.625,
      "completions/mean_terminated_length": 291.6521911621094,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "epoch": 0.9752,
      "grad_norm": 2.2739009857177734,
      "kl": 0.04034423828125,
      "learning_rate": 1e-06,
      "loss": 0.0097,
      "num_tokens": 16386298.0,
      "reward": 0.11592083424329758,
      "reward_std": 0.02954115904867649,
      "rewards/bleu_reward_func/mean": 0.11592083424329758,
      "rewards/bleu_reward_func/std": 0.10508622229099274,
      "step": 1219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 406.0,
      "completions/mean_length": 187.0,
      "completions/mean_terminated_length": 176.51612854003906,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.976,
      "grad_norm": 3.4201154708862305,
      "kl": 0.0491943359375,
      "learning_rate": 1e-06,
      "loss": 0.0096,
      "num_tokens": 16396570.0,
      "reward": 0.09430958330631256,
      "reward_std": 0.04518333077430725,
      "rewards/bleu_reward_func/mean": 0.09430958330631256,
      "rewards/bleu_reward_func/std": 0.06046329066157341,
      "step": 1220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 465.0,
      "completions/max_terminated_length": 465.0,
      "completions/mean_length": 223.9375,
      "completions/mean_terminated_length": 223.9375,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.9768,
      "grad_norm": 4.284764766693115,
      "kl": 0.10467529296875,
      "learning_rate": 1e-06,
      "loss": -0.0042,
      "num_tokens": 16406816.0,
      "reward": 0.0637713298201561,
      "reward_std": 0.019082939252257347,
      "rewards/bleu_reward_func/mean": 0.0637713298201561,
      "rewards/bleu_reward_func/std": 0.027624597772955894,
      "step": 1221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 288.78125,
      "completions/mean_terminated_length": 201.43478393554688,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "epoch": 0.9776,
      "grad_norm": 3.965935468673706,
      "kl": 0.0511474609375,
      "learning_rate": 1e-06,
      "loss": -0.0813,
      "num_tokens": 16419049.0,
      "reward": 0.03097653202712536,
      "reward_std": 0.009922297671437263,
      "rewards/bleu_reward_func/mean": 0.03097653202712536,
      "rewards/bleu_reward_func/std": 0.013633164577186108,
      "step": 1222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 276.0,
      "completions/mean_length": 202.5625,
      "completions/mean_terminated_length": 115.91999816894531,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "epoch": 0.9784,
      "grad_norm": 4.116911888122559,
      "kl": 0.095611572265625,
      "learning_rate": 1e-06,
      "loss": 0.0455,
      "num_tokens": 16429107.0,
      "reward": 0.05831120163202286,
      "reward_std": 0.027516763657331467,
      "rewards/bleu_reward_func/mean": 0.05831120163202286,
      "rewards/bleu_reward_func/std": 0.04774592071771622,
      "step": 1223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 421.0,
      "completions/mean_length": 342.59375,
      "completions/mean_terminated_length": 286.125,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "epoch": 0.9792,
      "grad_norm": 2.510331630706787,
      "kl": 0.06231689453125,
      "learning_rate": 1e-06,
      "loss": -0.0519,
      "num_tokens": 16442350.0,
      "reward": 0.0445207916200161,
      "reward_std": 0.014958234503865242,
      "rewards/bleu_reward_func/mean": 0.0445207916200161,
      "rewards/bleu_reward_func/std": 0.03363141417503357,
      "step": 1224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 445.0,
      "completions/max_terminated_length": 445.0,
      "completions/mean_length": 234.84375,
      "completions/mean_terminated_length": 234.84375,
      "completions/min_length": 79.0,
      "completions/min_terminated_length": 79.0,
      "epoch": 0.98,
      "grad_norm": 3.040597915649414,
      "kl": 0.0728759765625,
      "learning_rate": 1e-06,
      "loss": 0.1173,
      "num_tokens": 16451969.0,
      "reward": 0.06547506153583527,
      "reward_std": 0.020516231656074524,
      "rewards/bleu_reward_func/mean": 0.06547506153583527,
      "rewards/bleu_reward_func/std": 0.055397652089595795,
      "step": 1225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 205.09375,
      "completions/mean_terminated_length": 195.19354248046875,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.9808,
      "grad_norm": 2.874229907989502,
      "kl": 0.039337158203125,
      "learning_rate": 1e-06,
      "loss": 0.1034,
      "num_tokens": 16461804.0,
      "reward": 0.03653764724731445,
      "reward_std": 0.022169658914208412,
      "rewards/bleu_reward_func/mean": 0.03653764724731445,
      "rewards/bleu_reward_func/std": 0.03301393613219261,
      "step": 1226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 465.0,
      "completions/mean_length": 399.1875,
      "completions/mean_terminated_length": 271.3333435058594,
      "completions/min_length": 81.0,
      "completions/min_terminated_length": 81.0,
      "epoch": 0.9816,
      "grad_norm": 3.17681622505188,
      "kl": 0.07647705078125,
      "learning_rate": 1e-06,
      "loss": 0.0085,
      "num_tokens": 16479034.0,
      "reward": 0.052174534648656845,
      "reward_std": 0.025132428854703903,
      "rewards/bleu_reward_func/mean": 0.052174534648656845,
      "rewards/bleu_reward_func/std": 0.037799958139657974,
      "step": 1227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 270.65625,
      "completions/mean_terminated_length": 245.6896514892578,
      "completions/min_length": 89.0,
      "completions/min_terminated_length": 89.0,
      "epoch": 0.9824,
      "grad_norm": 2.6783218383789062,
      "kl": 0.067138671875,
      "learning_rate": 1e-06,
      "loss": -0.0257,
      "num_tokens": 16493439.0,
      "reward": 0.0533532090485096,
      "reward_std": 0.025761138647794724,
      "rewards/bleu_reward_func/mean": 0.0533532090485096,
      "rewards/bleu_reward_func/std": 0.04583704099059105,
      "step": 1228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 307.0,
      "completions/max_terminated_length": 307.0,
      "completions/mean_length": 184.3125,
      "completions/mean_terminated_length": 184.3125,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.9832,
      "grad_norm": 3.430950403213501,
      "kl": 0.0555419921875,
      "learning_rate": 1e-06,
      "loss": 0.1023,
      "num_tokens": 16502305.0,
      "reward": 0.08854292333126068,
      "reward_std": 0.03958655521273613,
      "rewards/bleu_reward_func/mean": 0.08854292333126068,
      "rewards/bleu_reward_func/std": 0.1360902488231659,
      "step": 1229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 501.0,
      "completions/mean_length": 439.28125,
      "completions/mean_terminated_length": 356.86669921875,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.984,
      "grad_norm": 2.2083981037139893,
      "kl": 0.062896728515625,
      "learning_rate": 1e-06,
      "loss": 0.102,
      "num_tokens": 16520890.0,
      "reward": 0.1506825089454651,
      "reward_std": 0.0348081961274147,
      "rewards/bleu_reward_func/mean": 0.1506825089454651,
      "rewards/bleu_reward_func/std": 0.1484927535057068,
      "step": 1230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 448.0,
      "completions/mean_length": 326.21875,
      "completions/mean_terminated_length": 253.52174377441406,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "epoch": 0.9848,
      "grad_norm": 3.313183307647705,
      "kl": 0.1004638671875,
      "learning_rate": 1e-06,
      "loss": -0.056,
      "num_tokens": 16533841.0,
      "reward": 0.034567564725875854,
      "reward_std": 0.010901572182774544,
      "rewards/bleu_reward_func/mean": 0.034567564725875854,
      "rewards/bleu_reward_func/std": 0.017620721831917763,
      "step": 1231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 487.0,
      "completions/max_terminated_length": 487.0,
      "completions/mean_length": 225.0,
      "completions/mean_terminated_length": 225.0,
      "completions/min_length": 48.0,
      "completions/min_terminated_length": 48.0,
      "epoch": 0.9856,
      "grad_norm": 3.367202043533325,
      "kl": 0.08966064453125,
      "learning_rate": 1e-06,
      "loss": 0.0463,
      "num_tokens": 16543385.0,
      "reward": 0.07799240946769714,
      "reward_std": 0.019178325310349464,
      "rewards/bleu_reward_func/mean": 0.07799240946769714,
      "rewards/bleu_reward_func/std": 0.06506384909152985,
      "step": 1232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 500.0,
      "completions/mean_length": 377.875,
      "completions/mean_terminated_length": 273.5555725097656,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.9864,
      "grad_norm": 2.6554360389709473,
      "kl": 0.04986572265625,
      "learning_rate": 1e-06,
      "loss": 0.1209,
      "num_tokens": 16558325.0,
      "reward": 0.07152421027421951,
      "reward_std": 0.01307828351855278,
      "rewards/bleu_reward_func/mean": 0.07152421027421951,
      "rewards/bleu_reward_func/std": 0.0781111940741539,
      "step": 1233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 436.0,
      "completions/mean_length": 415.96875,
      "completions/mean_terminated_length": 319.9375,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "epoch": 0.9872,
      "grad_norm": 2.2727460861206055,
      "kl": 0.04705810546875,
      "learning_rate": 1e-06,
      "loss": 0.025,
      "num_tokens": 16575612.0,
      "reward": 0.04410577565431595,
      "reward_std": 0.009795146994292736,
      "rewards/bleu_reward_func/mean": 0.04410577565431595,
      "rewards/bleu_reward_func/std": 0.03114950843155384,
      "step": 1234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 479.0,
      "completions/mean_length": 370.21875,
      "completions/mean_terminated_length": 209.53334045410156,
      "completions/min_length": 59.0,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.988,
      "grad_norm": 3.1277663707733154,
      "kl": 0.0838623046875,
      "learning_rate": 1e-06,
      "loss": -0.0597,
      "num_tokens": 16590387.0,
      "reward": 0.0662313848733902,
      "reward_std": 0.01718086190521717,
      "rewards/bleu_reward_func/mean": 0.0662313848733902,
      "rewards/bleu_reward_func/std": 0.04029948636889458,
      "step": 1235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 511.0,
      "completions/mean_length": 378.78125,
      "completions/mean_terminated_length": 275.1666564941406,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 58.0,
      "epoch": 0.9888,
      "grad_norm": 2.5095629692077637,
      "kl": 0.08868408203125,
      "learning_rate": 1e-06,
      "loss": -0.0581,
      "num_tokens": 16605916.0,
      "reward": 0.05035623162984848,
      "reward_std": 0.01956663653254509,
      "rewards/bleu_reward_func/mean": 0.05035623162984848,
      "rewards/bleu_reward_func/std": 0.022561483085155487,
      "step": 1236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 486.0,
      "completions/mean_length": 328.6875,
      "completions/mean_terminated_length": 277.3599853515625,
      "completions/min_length": 74.0,
      "completions/min_terminated_length": 74.0,
      "epoch": 0.9896,
      "grad_norm": 2.212676525115967,
      "kl": 0.060302734375,
      "learning_rate": 1e-06,
      "loss": 0.1095,
      "num_tokens": 16619338.0,
      "reward": 0.09644582122564316,
      "reward_std": 0.0474303662776947,
      "rewards/bleu_reward_func/mean": 0.09644582122564316,
      "rewards/bleu_reward_func/std": 0.13067010045051575,
      "step": 1237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 497.0,
      "completions/mean_length": 399.34375,
      "completions/mean_terminated_length": 348.1363830566406,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "epoch": 0.9904,
      "grad_norm": 2.4537155628204346,
      "kl": 0.045166015625,
      "learning_rate": 1e-06,
      "loss": -0.0291,
      "num_tokens": 16635413.0,
      "reward": 0.07322587072849274,
      "reward_std": 0.02353905513882637,
      "rewards/bleu_reward_func/mean": 0.07322587072849274,
      "rewards/bleu_reward_func/std": 0.058683864772319794,
      "step": 1238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 498.0,
      "completions/mean_length": 357.90625,
      "completions/mean_terminated_length": 297.60870361328125,
      "completions/min_length": 53.0,
      "completions/min_terminated_length": 53.0,
      "epoch": 0.9912,
      "grad_norm": 2.812755823135376,
      "kl": 0.0718994140625,
      "learning_rate": 1e-06,
      "loss": -0.007,
      "num_tokens": 16650594.0,
      "reward": 0.04551263526082039,
      "reward_std": 0.008796430192887783,
      "rewards/bleu_reward_func/mean": 0.04551263526082039,
      "rewards/bleu_reward_func/std": 0.021256960928440094,
      "step": 1239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 437.0,
      "completions/mean_length": 378.21875,
      "completions/mean_terminated_length": 244.4375,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.992,
      "grad_norm": 2.3463385105133057,
      "kl": 0.06915283203125,
      "learning_rate": 1e-06,
      "loss": -0.0618,
      "num_tokens": 16666545.0,
      "reward": 0.03398827463388443,
      "reward_std": 0.007483157329261303,
      "rewards/bleu_reward_func/mean": 0.03398827463388443,
      "rewards/bleu_reward_func/std": 0.015963921323418617,
      "step": 1240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 472.0,
      "completions/max_terminated_length": 472.0,
      "completions/mean_length": 288.71875,
      "completions/mean_terminated_length": 288.71875,
      "completions/min_length": 106.0,
      "completions/min_terminated_length": 106.0,
      "epoch": 0.9928,
      "grad_norm": 2.5251529216766357,
      "kl": 0.0472412109375,
      "learning_rate": 1e-06,
      "loss": -0.02,
      "num_tokens": 16677928.0,
      "reward": 0.0579800084233284,
      "reward_std": 0.016186034306883812,
      "rewards/bleu_reward_func/mean": 0.0579800084233284,
      "rewards/bleu_reward_func/std": 0.046285130083560944,
      "step": 1241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 505.0,
      "completions/mean_length": 329.21875,
      "completions/mean_terminated_length": 287.0384826660156,
      "completions/min_length": 110.0,
      "completions/min_terminated_length": 110.0,
      "epoch": 0.9936,
      "grad_norm": 2.9839494228363037,
      "kl": 0.05584716796875,
      "learning_rate": 1e-06,
      "loss": -0.1253,
      "num_tokens": 16690791.0,
      "reward": 0.08833082020282745,
      "reward_std": 0.037769023329019547,
      "rewards/bleu_reward_func/mean": 0.08833082020282745,
      "rewards/bleu_reward_func/std": 0.06798525899648666,
      "step": 1242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 510.0,
      "completions/mean_length": 381.875,
      "completions/mean_terminated_length": 292.84210205078125,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "epoch": 0.9944,
      "grad_norm": 2.434730291366577,
      "kl": 0.05291748046875,
      "learning_rate": 1e-06,
      "loss": -0.0027,
      "num_tokens": 16707947.0,
      "reward": 0.08765649050474167,
      "reward_std": 0.032737139612436295,
      "rewards/bleu_reward_func/mean": 0.08765649050474167,
      "rewards/bleu_reward_func/std": 0.06498604267835617,
      "step": 1243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 416.0,
      "completions/mean_length": 223.25,
      "completions/mean_terminated_length": 213.9354705810547,
      "completions/min_length": 55.0,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.9952,
      "grad_norm": 2.878969430923462,
      "kl": 0.063812255859375,
      "learning_rate": 1e-06,
      "loss": 0.0242,
      "num_tokens": 16718507.0,
      "reward": 0.062188923358917236,
      "reward_std": 0.015940211713314056,
      "rewards/bleu_reward_func/mean": 0.062188923358917236,
      "rewards/bleu_reward_func/std": 0.06735063344240189,
      "step": 1244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 487.0,
      "completions/mean_length": 417.53125,
      "completions/mean_terminated_length": 237.18182373046875,
      "completions/min_length": 88.0,
      "completions/min_terminated_length": 88.0,
      "epoch": 0.996,
      "grad_norm": 2.7636101245880127,
      "kl": 0.08599853515625,
      "learning_rate": 1e-06,
      "loss": 0.0432,
      "num_tokens": 16735052.0,
      "reward": 0.05753006786108017,
      "reward_std": 0.015103975310921669,
      "rewards/bleu_reward_func/mean": 0.05753006786108017,
      "rewards/bleu_reward_func/std": 0.058132320642471313,
      "step": 1245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 426.90625,
      "completions/mean_terminated_length": 368.6842041015625,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "epoch": 0.9968,
      "grad_norm": 2.3084988594055176,
      "kl": 0.0635986328125,
      "learning_rate": 1e-06,
      "loss": -0.0934,
      "num_tokens": 16752049.0,
      "reward": 0.046952854841947556,
      "reward_std": 0.012242695316672325,
      "rewards/bleu_reward_func/mean": 0.046952854841947556,
      "rewards/bleu_reward_func/std": 0.018551921471953392,
      "step": 1246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 393.0,
      "completions/max_terminated_length": 393.0,
      "completions/mean_length": 236.28125,
      "completions/mean_terminated_length": 236.28125,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "epoch": 0.9976,
      "grad_norm": 2.7959964275360107,
      "kl": 0.0523681640625,
      "learning_rate": 1e-06,
      "loss": -0.0355,
      "num_tokens": 16761786.0,
      "reward": 0.04307990521192551,
      "reward_std": 0.014406087808310986,
      "rewards/bleu_reward_func/mean": 0.04307990521192551,
      "rewards/bleu_reward_func/std": 0.02000701241195202,
      "step": 1247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 320.6875,
      "completions/mean_terminated_length": 267.1199951171875,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "epoch": 0.9984,
      "grad_norm": 2.47160005569458,
      "kl": 0.0494384765625,
      "learning_rate": 1e-06,
      "loss": 0.0789,
      "num_tokens": 16774280.0,
      "reward": 0.04456353932619095,
      "reward_std": 0.015042895451188087,
      "rewards/bleu_reward_func/mean": 0.04456353932619095,
      "rewards/bleu_reward_func/std": 0.06020372360944748,
      "step": 1248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 512.0,
      "completions/max_terminated_length": 507.0,
      "completions/mean_length": 261.75,
      "completions/mean_terminated_length": 245.06668090820312,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.9992,
      "grad_norm": 2.724760055541992,
      "kl": 0.05419921875,
      "learning_rate": 1e-06,
      "loss": 0.1319,
      "num_tokens": 16785400.0,
      "reward": 0.03123091161251068,
      "reward_std": 0.020780162885785103,
      "rewards/bleu_reward_func/mean": 0.03123091161251068,
      "rewards/bleu_reward_func/std": 0.029042916372418404,
      "step": 1249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 391.0,
      "completions/max_terminated_length": 391.0,
      "completions/mean_length": 297.375,
      "completions/mean_terminated_length": 297.375,
      "completions/min_length": 235.0,
      "completions/min_terminated_length": 235.0,
      "epoch": 1.0,
      "grad_norm": 2.339179277420044,
      "kl": 0.0479736328125,
      "learning_rate": 1e-06,
      "loss": -0.0543,
      "num_tokens": 16804158.0,
      "reward": 0.0883922278881073,
      "reward_std": 0.03340703248977661,
      "rewards/bleu_reward_func/mean": 0.0883922278881073,
      "rewards/bleu_reward_func/std": 0.07894789427518845,
      "step": 1250
    }
  ],
  "logging_steps": 1,
  "max_steps": 1250,
  "num_input_tokens_seen": 16804158,
  "num_train_epochs": 1,
  "save_steps": 250,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}