diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8973 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 687, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 146.640625, + "epoch": 0.001455604075691412, + "grad_norm": 2.0520484071530207, + "kl": 0.0, + "learning_rate": 4.7619047619047613e-08, + "loss": 0.0027, + "reward": -0.6811327934265137, + "reward_std": 1.2931699752807617, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 0.4427083134651184, + "step": 1 + }, + { + "completion_length": 141.84375, + "epoch": 0.002911208151382824, + "grad_norm": 2.0555621453821775, + "kl": 0.0, + "learning_rate": 9.523809523809523e-08, + "loss": 0.0025, + "reward": -0.39337241649627686, + "reward_std": 1.4007469415664673, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 0.5158854722976685, + "step": 2 + }, + { + "completion_length": 131.109375, + "epoch": 0.004366812227074236, + "grad_norm": 2.0052335093689617, + "kl": 0.000347137451171875, + "learning_rate": 1.4285714285714285e-07, + "loss": 0.0032, + "reward": -0.4100520610809326, + "reward_std": 1.3564808368682861, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.5098437666893005, + "step": 3 + }, + { + "completion_length": 147.109375, + "epoch": 0.005822416302765648, + "grad_norm": 9.897016495998724, + "kl": 0.0006103515625, + "learning_rate": 1.9047619047619045e-07, + "loss": -0.0005, + "reward": -0.5236002206802368, + "reward_std": 1.1223150491714478, + "rewards/accuracy_reward": 0.203125, + "rewards/format_reward": 0.5694466233253479, + "step": 4 + }, + { + "completion_length": 115.609375, + "epoch": 0.00727802037845706, + "grad_norm": 2.2619632214263254, + "kl": 0.000347137451171875, + "learning_rate": 2.3809523809523806e-07, + "loss": 0.0001, + "reward": -0.4710937440395355, + "reward_std": 1.1760644912719727, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 0.48657551407814026, + "step": 5 + }, + { + "completion_length": 128.390625, + "epoch": 0.008733624454148471, + "grad_norm": 2.025201991853294, + "kl": 0.000396728515625, + "learning_rate": 2.857142857142857e-07, + "loss": -0.0004, + "reward": -0.3249348998069763, + "reward_std": 1.27274489402771, + "rewards/accuracy_reward": 0.265625, + "rewards/format_reward": 0.5628255009651184, + "step": 6 + }, + { + "completion_length": 124.640625, + "epoch": 0.010189228529839884, + "grad_norm": 3.3130345990864427, + "kl": 0.00043487548828125, + "learning_rate": 3.333333333333333e-07, + "loss": -0.0007, + "reward": -0.6294205188751221, + "reward_std": 1.305110216140747, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 0.44719398021698, + "step": 7 + }, + { + "completion_length": 125.171875, + "epoch": 0.011644832605531296, + "grad_norm": 2.2278530283663818, + "kl": 0.0003452301025390625, + "learning_rate": 3.809523809523809e-07, + "loss": -0.0019, + "reward": -0.396751344203949, + "reward_std": 1.1712387800216675, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 0.5593684911727905, + "step": 8 + }, + { + "completion_length": 142.28125, + "epoch": 0.013100436681222707, + "grad_norm": 1.9409494681196875, + "kl": 0.0004138946533203125, + "learning_rate": 4.285714285714285e-07, + "loss": 0.001, + "reward": -0.7128320336341858, + "reward_std": 0.9856235980987549, + "rewards/accuracy_reward": 0.140625, + "rewards/format_reward": 0.5527409315109253, + "step": 9 + }, + { + "completion_length": 139.53125, + "epoch": 0.01455604075691412, + "grad_norm": 4.067951440500351, + "kl": 0.000446319580078125, + "learning_rate": 4.761904761904761e-07, + "loss": 0.0003, + "reward": -0.6937109231948853, + "reward_std": 1.2031134366989136, + "rewards/accuracy_reward": 0.171875, + "rewards/format_reward": 0.483502596616745, + "step": 10 + }, + { + "completion_length": 131.96875, + "epoch": 0.01601164483260553, + "grad_norm": 1.9513622824677586, + "kl": 0.0004024505615234375, + "learning_rate": 5.238095238095238e-07, + "loss": -0.0019, + "reward": -0.44268879294395447, + "reward_std": 1.2512993812561035, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.5169466137886047, + "step": 11 + }, + { + "completion_length": 128.296875, + "epoch": 0.017467248908296942, + "grad_norm": 2.6073182015857483, + "kl": 0.0003662109375, + "learning_rate": 5.714285714285714e-07, + "loss": 0.0019, + "reward": -0.11339845508337021, + "reward_std": 1.379326581954956, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 0.5592578053474426, + "step": 12 + }, + { + "completion_length": 131.5625, + "epoch": 0.018922852983988356, + "grad_norm": 2.038224554514447, + "kl": 0.0004138946533203125, + "learning_rate": 6.19047619047619e-07, + "loss": 0.0019, + "reward": -0.5053775906562805, + "reward_std": 1.0422844886779785, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.5684505105018616, + "step": 13 + }, + { + "completion_length": 130.015625, + "epoch": 0.020378457059679767, + "grad_norm": 8.376191441689384, + "kl": 0.00080108642578125, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0024, + "reward": -0.5506510734558105, + "reward_std": 1.289644479751587, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.45373696088790894, + "step": 14 + }, + { + "completion_length": 143.125, + "epoch": 0.021834061135371178, + "grad_norm": 8.674544330550578, + "kl": 0.0009918212890625, + "learning_rate": 7.142857142857143e-07, + "loss": 0.001, + "reward": -0.517591118812561, + "reward_std": 1.3039864301681519, + "rewards/accuracy_reward": 0.265625, + "rewards/format_reward": 0.48772138357162476, + "step": 15 + }, + { + "completion_length": 152.59375, + "epoch": 0.023289665211062592, + "grad_norm": 2.5986008608286215, + "kl": 0.0004673004150390625, + "learning_rate": 7.619047619047618e-07, + "loss": 0.0034, + "reward": -0.47635418176651, + "reward_std": 1.3350398540496826, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.4758723974227905, + "step": 16 + }, + { + "completion_length": 126.203125, + "epoch": 0.024745269286754003, + "grad_norm": 4.121392021849402, + "kl": 0.00116729736328125, + "learning_rate": 8.095238095238095e-07, + "loss": -0.0004, + "reward": -0.13250651955604553, + "reward_std": 1.1838587522506714, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 0.5381835699081421, + "step": 17 + }, + { + "completion_length": 130.640625, + "epoch": 0.026200873362445413, + "grad_norm": 3.933006083601506, + "kl": 0.000957489013671875, + "learning_rate": 8.57142857142857e-07, + "loss": -0.0009, + "reward": -0.29349610209465027, + "reward_std": 1.1805915832519531, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 0.5601236820220947, + "step": 18 + }, + { + "completion_length": 134.53125, + "epoch": 0.027656477438136828, + "grad_norm": 2.7373891464842437, + "kl": 0.00140380859375, + "learning_rate": 9.047619047619047e-07, + "loss": 0.0008, + "reward": -0.2783724069595337, + "reward_std": 1.172222375869751, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 0.5397005081176758, + "step": 19 + }, + { + "completion_length": 130.90625, + "epoch": 0.02911208151382824, + "grad_norm": 2.400503043349488, + "kl": 0.000904083251953125, + "learning_rate": 9.523809523809522e-07, + "loss": -0.0003, + "reward": -0.4976627826690674, + "reward_std": 1.1664050817489624, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.4811263084411621, + "step": 20 + }, + { + "completion_length": 139.421875, + "epoch": 0.03056768558951965, + "grad_norm": 1.7123015669045267, + "kl": 0.0011749267578125, + "learning_rate": 1e-06, + "loss": 0.0013, + "reward": -0.6022005081176758, + "reward_std": 0.8686124682426453, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.5883203148841858, + "step": 21 + }, + { + "completion_length": 144.796875, + "epoch": 0.03202328966521106, + "grad_norm": 1.9937512224942535, + "kl": 0.00160980224609375, + "learning_rate": 9.99994437237857e-07, + "loss": 0.0011, + "reward": -0.14617840945720673, + "reward_std": 1.3580482006072998, + "rewards/accuracy_reward": 0.328125, + "rewards/format_reward": 0.5967773199081421, + "step": 22 + }, + { + "completion_length": 124.6875, + "epoch": 0.033478893740902474, + "grad_norm": 2.0814826578918852, + "kl": 0.00128936767578125, + "learning_rate": 9.999777490752055e-07, + "loss": 0.0029, + "reward": -0.21195964515209198, + "reward_std": 1.2764006853103638, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.5546939969062805, + "step": 23 + }, + { + "completion_length": 140.765625, + "epoch": 0.034934497816593885, + "grad_norm": 1.77589421887042, + "kl": 0.0024871826171875, + "learning_rate": 9.999499358833744e-07, + "loss": 0.0023, + "reward": -0.4434700608253479, + "reward_std": 1.2595456838607788, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 0.589902400970459, + "step": 24 + }, + { + "completion_length": 135.53125, + "epoch": 0.036390101892285295, + "grad_norm": 1.9978141335810846, + "kl": 0.0023956298828125, + "learning_rate": 9.999109982812366e-07, + "loss": 0.0004, + "reward": 0.10095702111721039, + "reward_std": 1.251359462738037, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 0.6653059720993042, + "step": 25 + }, + { + "completion_length": 143.03125, + "epoch": 0.03784570596797671, + "grad_norm": 2.05478507206434, + "kl": 0.001953125, + "learning_rate": 9.998609371351943e-07, + "loss": 0.0017, + "reward": -0.4979492127895355, + "reward_std": 1.096949815750122, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.6168684959411621, + "step": 26 + }, + { + "completion_length": 142.390625, + "epoch": 0.039301310043668124, + "grad_norm": 9.38751585986725, + "kl": 0.2451171875, + "learning_rate": 9.997997535591607e-07, + "loss": 0.0004, + "reward": -0.13885416090488434, + "reward_std": 1.2641448974609375, + "rewards/accuracy_reward": 0.328125, + "rewards/format_reward": 0.5863932371139526, + "step": 27 + }, + { + "completion_length": 122.46875, + "epoch": 0.040756914119359534, + "grad_norm": 2.02901198267971, + "kl": 0.002410888671875, + "learning_rate": 9.997274489145347e-07, + "loss": 0.0019, + "reward": 0.0855598971247673, + "reward_std": 1.314124345779419, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 0.5462110042572021, + "step": 28 + }, + { + "completion_length": 138.21875, + "epoch": 0.042212518195050945, + "grad_norm": 1.9186274674092676, + "kl": 0.003692626953125, + "learning_rate": 9.9964402481017e-07, + "loss": 0.0007, + "reward": 0.14319661259651184, + "reward_std": 1.2557392120361328, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 0.6084700226783752, + "step": 29 + }, + { + "completion_length": 133.484375, + "epoch": 0.043668122270742356, + "grad_norm": 1.8000804655795466, + "kl": 0.00665283203125, + "learning_rate": 9.995494831023408e-07, + "loss": -0.0004, + "reward": -0.0694987028837204, + "reward_std": 1.264149785041809, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.6460742354393005, + "step": 30 + }, + { + "completion_length": 132.125, + "epoch": 0.04512372634643377, + "grad_norm": 2.7510703510304624, + "kl": 0.0225830078125, + "learning_rate": 9.994438258946988e-07, + "loss": -0.0004, + "reward": -0.2966731786727905, + "reward_std": 1.048037052154541, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.6053190231323242, + "step": 31 + }, + { + "completion_length": 138.5625, + "epoch": 0.046579330422125184, + "grad_norm": 1.702872778334861, + "kl": 0.0030059814453125, + "learning_rate": 9.993270555382281e-07, + "loss": -0.0019, + "reward": 0.10822266340255737, + "reward_std": 1.0069178342819214, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.6695116758346558, + "step": 32 + }, + { + "completion_length": 153.203125, + "epoch": 0.048034934497816595, + "grad_norm": 1.6790629896240423, + "kl": 0.0033721923828125, + "learning_rate": 9.991991746311915e-07, + "loss": -0.0004, + "reward": 0.37479168176651, + "reward_std": 1.2309496402740479, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.7032031416893005, + "step": 33 + }, + { + "completion_length": 122.59375, + "epoch": 0.049490538573508006, + "grad_norm": 1.926350880972545, + "kl": 0.004180908203125, + "learning_rate": 9.99060186019073e-07, + "loss": -0.0042, + "reward": 0.2343815118074417, + "reward_std": 0.9639967083930969, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.6073763370513916, + "step": 34 + }, + { + "completion_length": 121.46875, + "epoch": 0.050946142649199416, + "grad_norm": 1.9507933134145956, + "kl": 0.006256103515625, + "learning_rate": 9.989100927945153e-07, + "loss": -0.002, + "reward": 0.36109375953674316, + "reward_std": 1.2289752960205078, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 0.6778646111488342, + "step": 35 + }, + { + "completion_length": 113.640625, + "epoch": 0.05240174672489083, + "grad_norm": 1.7649346907748196, + "kl": 0.00714111328125, + "learning_rate": 9.9874889829725e-07, + "loss": -0.0019, + "reward": -0.09406250715255737, + "reward_std": 0.98655104637146, + "rewards/accuracy_reward": 0.28125, + "rewards/format_reward": 0.6245573163032532, + "step": 36 + }, + { + "completion_length": 140.671875, + "epoch": 0.053857350800582245, + "grad_norm": 1.6599989465316414, + "kl": 0.004180908203125, + "learning_rate": 9.985766061140232e-07, + "loss": 0.0024, + "reward": -0.051347650587558746, + "reward_std": 1.1097118854522705, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 0.6869857311248779, + "step": 37 + }, + { + "completion_length": 131.609375, + "epoch": 0.055312954876273655, + "grad_norm": 1.9002315583578684, + "kl": 0.004150390625, + "learning_rate": 9.983932200785172e-07, + "loss": -0.0051, + "reward": 0.016217432916164398, + "reward_std": 1.0292843580245972, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.6955012679100037, + "step": 38 + }, + { + "completion_length": 133.46875, + "epoch": 0.056768558951965066, + "grad_norm": 1.703318134865483, + "kl": 0.005767822265625, + "learning_rate": 9.98198744271263e-07, + "loss": -0.0047, + "reward": 0.3018620014190674, + "reward_std": 1.063158392906189, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.669726550579071, + "step": 39 + }, + { + "completion_length": 106.578125, + "epoch": 0.05822416302765648, + "grad_norm": 1.9072082640759345, + "kl": 0.020751953125, + "learning_rate": 9.979931830195522e-07, + "loss": -0.0023, + "reward": -0.04225911945104599, + "reward_std": 0.9956592321395874, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 0.6362695693969727, + "step": 40 + }, + { + "completion_length": 116.96875, + "epoch": 0.05967976710334789, + "grad_norm": 1.8801906920255198, + "kl": 0.006744384765625, + "learning_rate": 9.977765408973374e-07, + "loss": -0.005, + "reward": 0.21731121838092804, + "reward_std": 0.9288095235824585, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 0.6459180116653442, + "step": 41 + }, + { + "completion_length": 118.265625, + "epoch": 0.0611353711790393, + "grad_norm": 1.9537453615984006, + "kl": 0.009765625, + "learning_rate": 9.975488227251329e-07, + "loss": -0.0017, + "reward": -0.03567056730389595, + "reward_std": 1.0284496545791626, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.645800769329071, + "step": 42 + }, + { + "completion_length": 106.71875, + "epoch": 0.06259097525473072, + "grad_norm": 2.0269318680154806, + "kl": 0.00640869140625, + "learning_rate": 9.973100335699073e-07, + "loss": -0.0026, + "reward": 0.33292967081069946, + "reward_std": 1.1819396018981934, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 0.6497005224227905, + "step": 43 + }, + { + "completion_length": 120.84375, + "epoch": 0.06404657933042213, + "grad_norm": 1.954056839145857, + "kl": 0.00958251953125, + "learning_rate": 9.970601787449696e-07, + "loss": 0.0031, + "reward": 0.3369661271572113, + "reward_std": 1.1963883638381958, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.6441406011581421, + "step": 44 + }, + { + "completion_length": 119.484375, + "epoch": 0.06550218340611354, + "grad_norm": 1.9048660482499535, + "kl": 0.007110595703125, + "learning_rate": 9.967992638098515e-07, + "loss": 0.0017, + "reward": -0.12632161378860474, + "reward_std": 0.811303973197937, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 0.6902539134025574, + "step": 45 + }, + { + "completion_length": 108.234375, + "epoch": 0.06695778748180495, + "grad_norm": 1.954242990541008, + "kl": 0.01055908203125, + "learning_rate": 9.965272945701838e-07, + "loss": 0.0053, + "reward": 0.1287955790758133, + "reward_std": 1.0276615619659424, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 0.6505273580551147, + "step": 46 + }, + { + "completion_length": 111.203125, + "epoch": 0.06841339155749636, + "grad_norm": 2.0265711266703805, + "kl": 0.0101318359375, + "learning_rate": 9.962442770775673e-07, + "loss": -0.0007, + "reward": 0.44425129890441895, + "reward_std": 1.1991455554962158, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.675071656703949, + "step": 47 + }, + { + "completion_length": 109.75, + "epoch": 0.06986899563318777, + "grad_norm": 2.071460406757796, + "kl": 0.0103759765625, + "learning_rate": 9.959502176294382e-07, + "loss": 0.0048, + "reward": -0.17327472567558289, + "reward_std": 0.8208831548690796, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 0.678769588470459, + "step": 48 + }, + { + "completion_length": 101.453125, + "epoch": 0.07132459970887918, + "grad_norm": 2.1718985191781695, + "kl": 0.021728515625, + "learning_rate": 9.956451227689277e-07, + "loss": -0.0001, + "reward": 0.15954425930976868, + "reward_std": 0.9772539734840393, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.6399348974227905, + "step": 49 + }, + { + "completion_length": 106.046875, + "epoch": 0.07278020378457059, + "grad_norm": 2.1050743827136533, + "kl": 0.01318359375, + "learning_rate": 9.953289992847158e-07, + "loss": -0.0002, + "reward": 0.5692838430404663, + "reward_std": 1.0812323093414307, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.6715234518051147, + "step": 50 + }, + { + "completion_length": 102.359375, + "epoch": 0.07423580786026202, + "grad_norm": 1.9030170365144239, + "kl": 0.0135498046875, + "learning_rate": 9.950018542108817e-07, + "loss": 0.0007, + "reward": 0.5510026216506958, + "reward_std": 1.1338927745819092, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.6648176908493042, + "step": 51 + }, + { + "completion_length": 89.515625, + "epoch": 0.07569141193595343, + "grad_norm": 2.120469098312219, + "kl": 0.0157470703125, + "learning_rate": 9.946636948267467e-07, + "loss": 0.0016, + "reward": 0.5175260901451111, + "reward_std": 1.038745641708374, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.6268749833106995, + "step": 52 + }, + { + "completion_length": 91.75, + "epoch": 0.07714701601164484, + "grad_norm": 2.086860625748496, + "kl": 0.0145263671875, + "learning_rate": 9.943145286567113e-07, + "loss": -0.0006, + "reward": 0.42378902435302734, + "reward_std": 1.124879240989685, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.6217838525772095, + "step": 53 + }, + { + "completion_length": 97.9375, + "epoch": 0.07860262008733625, + "grad_norm": 2.019967385216881, + "kl": 0.0146484375, + "learning_rate": 9.93954363470089e-07, + "loss": 0.0035, + "reward": 0.3268880248069763, + "reward_std": 1.1483426094055176, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.6423567533493042, + "step": 54 + }, + { + "completion_length": 99.625, + "epoch": 0.08005822416302766, + "grad_norm": 2.046031430253623, + "kl": 0.0186767578125, + "learning_rate": 9.935832072809327e-07, + "loss": 0.0026, + "reward": 0.30184245109558105, + "reward_std": 1.0335478782653809, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 0.6567773818969727, + "step": 55 + }, + { + "completion_length": 93.578125, + "epoch": 0.08151382823871907, + "grad_norm": 2.1224187100626404, + "kl": 0.0186767578125, + "learning_rate": 9.932010683478573e-07, + "loss": -0.0015, + "reward": 0.7045247554779053, + "reward_std": 1.007187843322754, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.6370638608932495, + "step": 56 + }, + { + "completion_length": 99.71875, + "epoch": 0.08296943231441048, + "grad_norm": 2.0226555290069905, + "kl": 0.019287109375, + "learning_rate": 9.928079551738541e-07, + "loss": 0.004, + "reward": 0.24904297292232513, + "reward_std": 1.0046416521072388, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 0.6374022960662842, + "step": 57 + }, + { + "completion_length": 92.59375, + "epoch": 0.08442503639010189, + "grad_norm": 2.031040356216584, + "kl": 0.0264892578125, + "learning_rate": 9.92403876506104e-07, + "loss": -0.0008, + "reward": 0.5398828387260437, + "reward_std": 0.9229820966720581, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.639635443687439, + "step": 58 + }, + { + "completion_length": 91.3125, + "epoch": 0.0858806404657933, + "grad_norm": 2.018669545596409, + "kl": 0.026611328125, + "learning_rate": 9.919888413357807e-07, + "loss": 0.0024, + "reward": 0.05692709982395172, + "reward_std": 1.0041440725326538, + "rewards/accuracy_reward": 0.328125, + "rewards/format_reward": 0.6213802099227905, + "step": 59 + }, + { + "completion_length": 90.390625, + "epoch": 0.08733624454148471, + "grad_norm": 2.0871119564538034, + "kl": 0.0245361328125, + "learning_rate": 9.91562858897852e-07, + "loss": 0.0021, + "reward": 0.27358072996139526, + "reward_std": 0.9488109350204468, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 0.6085677146911621, + "step": 60 + }, + { + "completion_length": 90.953125, + "epoch": 0.08879184861717612, + "grad_norm": 2.0846084451671727, + "kl": 0.022216796875, + "learning_rate": 9.91125938670874e-07, + "loss": 0.0028, + "reward": 0.9214128255844116, + "reward_std": 0.9024526476860046, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.6374154090881348, + "step": 61 + }, + { + "completion_length": 87.796875, + "epoch": 0.09024745269286755, + "grad_norm": 2.0274598852707553, + "kl": 0.02490234375, + "learning_rate": 9.906780903767798e-07, + "loss": -0.0017, + "reward": 0.5340690016746521, + "reward_std": 0.9889509677886963, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.6374675035476685, + "step": 62 + }, + { + "completion_length": 88.484375, + "epoch": 0.09170305676855896, + "grad_norm": 2.37260281994688, + "kl": 0.02880859375, + "learning_rate": 9.902193239806634e-07, + "loss": 0.0023, + "reward": 0.42514973878860474, + "reward_std": 0.8481540679931641, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.6194596290588379, + "step": 63 + }, + { + "completion_length": 86.8125, + "epoch": 0.09315866084425037, + "grad_norm": 2.128381129079585, + "kl": 0.023193359375, + "learning_rate": 9.897496496905583e-07, + "loss": -0.0011, + "reward": 0.3611133098602295, + "reward_std": 1.0007762908935547, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.6315299272537231, + "step": 64 + }, + { + "completion_length": 83.96875, + "epoch": 0.09461426491994178, + "grad_norm": 2.1833555342889226, + "kl": 0.0262451171875, + "learning_rate": 9.892690779572096e-07, + "loss": 0.0028, + "reward": 0.4069466292858124, + "reward_std": 0.8976852297782898, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.6169465780258179, + "step": 65 + }, + { + "completion_length": 100.078125, + "epoch": 0.09606986899563319, + "grad_norm": 1.95497475262489, + "kl": 0.02783203125, + "learning_rate": 9.887776194738431e-07, + "loss": -0.0006, + "reward": 0.8320702910423279, + "reward_std": 1.0947431325912476, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.6451822519302368, + "step": 66 + }, + { + "completion_length": 89.546875, + "epoch": 0.0975254730713246, + "grad_norm": 2.4296708038911046, + "kl": 0.027099609375, + "learning_rate": 9.882752851759247e-07, + "loss": 0.0042, + "reward": 0.3521158695220947, + "reward_std": 0.9503006935119629, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 0.6041340827941895, + "step": 67 + }, + { + "completion_length": 90.5, + "epoch": 0.09898107714701601, + "grad_norm": 2.116653369847408, + "kl": 0.0250244140625, + "learning_rate": 9.877620862409192e-07, + "loss": 0.0022, + "reward": 0.1717708259820938, + "reward_std": 0.9714279174804688, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 0.6394791603088379, + "step": 68 + }, + { + "completion_length": 88.53125, + "epoch": 0.10043668122270742, + "grad_norm": 2.1712245174516704, + "kl": 0.031982421875, + "learning_rate": 9.872380340880416e-07, + "loss": -0.0013, + "reward": 0.5086783766746521, + "reward_std": 1.0054187774658203, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.6380794048309326, + "step": 69 + }, + { + "completion_length": 88.265625, + "epoch": 0.10189228529839883, + "grad_norm": 2.206351390007687, + "kl": 0.03466796875, + "learning_rate": 9.867031403780013e-07, + "loss": -0.006, + "reward": 0.5890104174613953, + "reward_std": 0.8417867422103882, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 0.6225130558013916, + "step": 70 + }, + { + "completion_length": 91.484375, + "epoch": 0.10334788937409024, + "grad_norm": 2.0985651707368183, + "kl": 0.0296630859375, + "learning_rate": 9.861574170127444e-07, + "loss": -0.0003, + "reward": 0.848574161529541, + "reward_std": 0.9138556718826294, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.6420508027076721, + "step": 71 + }, + { + "completion_length": 81.1875, + "epoch": 0.10480349344978165, + "grad_norm": 2.3307465788240993, + "kl": 0.037109375, + "learning_rate": 9.85600876135188e-07, + "loss": 0.0021, + "reward": 0.5502018332481384, + "reward_std": 0.8912982940673828, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.6144856214523315, + "step": 72 + }, + { + "completion_length": 86.5625, + "epoch": 0.10625909752547306, + "grad_norm": 2.1316996208970833, + "kl": 0.02880859375, + "learning_rate": 9.850335301289504e-07, + "loss": 0.0014, + "reward": -0.0640755146741867, + "reward_std": 0.8301602602005005, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.6316145658493042, + "step": 73 + }, + { + "completion_length": 78.46875, + "epoch": 0.10771470160116449, + "grad_norm": 2.147131800814305, + "kl": 0.03857421875, + "learning_rate": 9.844553916180746e-07, + "loss": -0.0031, + "reward": 0.55866539478302, + "reward_std": 1.0137114524841309, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.606178343296051, + "step": 74 + }, + { + "completion_length": 94.375, + "epoch": 0.1091703056768559, + "grad_norm": 2.134513729452084, + "kl": 0.037109375, + "learning_rate": 9.838664734667495e-07, + "loss": 0.0012, + "reward": 0.6748111844062805, + "reward_std": 0.863685131072998, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.6456054449081421, + "step": 75 + }, + { + "completion_length": 77.546875, + "epoch": 0.11062590975254731, + "grad_norm": 2.1654090092198013, + "kl": 0.033447265625, + "learning_rate": 9.832667887790206e-07, + "loss": -0.0003, + "reward": 0.39906901121139526, + "reward_std": 0.7819440364837646, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 0.6166341304779053, + "step": 76 + }, + { + "completion_length": 87.6875, + "epoch": 0.11208151382823872, + "grad_norm": 2.2117240462456476, + "kl": 0.033935546875, + "learning_rate": 9.826563508985016e-07, + "loss": -0.0008, + "reward": 0.10059896111488342, + "reward_std": 0.775175929069519, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 0.591393232345581, + "step": 77 + }, + { + "completion_length": 84.84375, + "epoch": 0.11353711790393013, + "grad_norm": 2.216126758365653, + "kl": 0.034912109375, + "learning_rate": 9.820351734080754e-07, + "loss": -0.0009, + "reward": 0.46383464336395264, + "reward_std": 0.6518522500991821, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.6175326108932495, + "step": 78 + }, + { + "completion_length": 71.5, + "epoch": 0.11499272197962154, + "grad_norm": 2.68226792040073, + "kl": 0.04345703125, + "learning_rate": 9.81403270129592e-07, + "loss": -0.0057, + "reward": 1.0123958587646484, + "reward_std": 0.7545869946479797, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.5960806608200073, + "step": 79 + }, + { + "completion_length": 80.859375, + "epoch": 0.11644832605531295, + "grad_norm": 2.249003158750042, + "kl": 0.03564453125, + "learning_rate": 9.807606551235627e-07, + "loss": -0.0056, + "reward": 0.9187760353088379, + "reward_std": 0.778544545173645, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.620690107345581, + "step": 80 + }, + { + "completion_length": 81.265625, + "epoch": 0.11790393013100436, + "grad_norm": 2.4482747382151695, + "kl": 0.043212890625, + "learning_rate": 9.801073426888446e-07, + "loss": 0.0019, + "reward": 0.7133723497390747, + "reward_std": 0.7031675577163696, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.6206510663032532, + "step": 81 + }, + { + "completion_length": 77.8125, + "epoch": 0.11935953420669577, + "grad_norm": 2.2325416864250265, + "kl": 0.04052734375, + "learning_rate": 9.794433473623248e-07, + "loss": 0.0035, + "reward": 0.6323372721672058, + "reward_std": 0.9128393530845642, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.6164127588272095, + "step": 82 + }, + { + "completion_length": 82.046875, + "epoch": 0.12081513828238719, + "grad_norm": 2.1020592126259126, + "kl": 0.032958984375, + "learning_rate": 9.787686839185954e-07, + "loss": -0.0001, + "reward": 1.0958268642425537, + "reward_std": 0.6903193593025208, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.622962236404419, + "step": 83 + }, + { + "completion_length": 75.625, + "epoch": 0.1222707423580786, + "grad_norm": 2.225966097945577, + "kl": 0.041015625, + "learning_rate": 9.780833673696254e-07, + "loss": 0.0035, + "reward": 0.7522070407867432, + "reward_std": 0.7671902179718018, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.6155664920806885, + "step": 84 + }, + { + "completion_length": 83.359375, + "epoch": 0.12372634643377002, + "grad_norm": 2.2693402594500034, + "kl": 0.0458984375, + "learning_rate": 9.773874129644267e-07, + "loss": -0.0006, + "reward": 0.6304752826690674, + "reward_std": 0.8777204751968384, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.6244465708732605, + "step": 85 + }, + { + "completion_length": 76.90625, + "epoch": 0.12518195050946143, + "grad_norm": 2.138087833605439, + "kl": 0.0478515625, + "learning_rate": 9.766808361887148e-07, + "loss": -0.0009, + "reward": 0.8740299344062805, + "reward_std": 0.823813796043396, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.6129882335662842, + "step": 86 + }, + { + "completion_length": 78.640625, + "epoch": 0.12663755458515283, + "grad_norm": 2.644901616780376, + "kl": 0.0400390625, + "learning_rate": 9.759636527645632e-07, + "loss": 0.0005, + "reward": 0.7980924248695374, + "reward_std": 0.8134920001029968, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.6185481548309326, + "step": 87 + }, + { + "completion_length": 73.578125, + "epoch": 0.12809315866084425, + "grad_norm": 2.308714342568071, + "kl": 0.047607421875, + "learning_rate": 9.752358786500558e-07, + "loss": 0.0015, + "reward": 0.7030664682388306, + "reward_std": 0.874236524105072, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.6089127659797668, + "step": 88 + }, + { + "completion_length": 82.390625, + "epoch": 0.12954876273653565, + "grad_norm": 2.2405963594949982, + "kl": 0.038330078125, + "learning_rate": 9.744975300389293e-07, + "loss": 0.0043, + "reward": 0.5240559577941895, + "reward_std": 0.8964687585830688, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.6264387369155884, + "step": 89 + }, + { + "completion_length": 80.78125, + "epoch": 0.13100436681222707, + "grad_norm": 2.2976726395774385, + "kl": 0.034423828125, + "learning_rate": 9.737486233602147e-07, + "loss": 0.0015, + "reward": 0.5632421970367432, + "reward_std": 0.7829184532165527, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.6213281750679016, + "step": 90 + }, + { + "completion_length": 78.078125, + "epoch": 0.1324599708879185, + "grad_norm": 2.102179867053138, + "kl": 0.05712890625, + "learning_rate": 9.729891752778711e-07, + "loss": -0.0009, + "reward": 0.9788346290588379, + "reward_std": 0.6263606548309326, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.6095768213272095, + "step": 91 + }, + { + "completion_length": 89.0625, + "epoch": 0.1339155749636099, + "grad_norm": 2.4407305687279295, + "kl": 0.034912109375, + "learning_rate": 9.722192026904144e-07, + "loss": -0.0027, + "reward": 0.9294661283493042, + "reward_std": 0.9660316109657288, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.6371874809265137, + "step": 92 + }, + { + "completion_length": 82.0625, + "epoch": 0.13537117903930132, + "grad_norm": 2.0885102072083717, + "kl": 0.0400390625, + "learning_rate": 9.71438722730542e-07, + "loss": 0.0019, + "reward": 1.2173632383346558, + "reward_std": 0.5946205854415894, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.6133268475532532, + "step": 93 + }, + { + "completion_length": 77.515625, + "epoch": 0.13682678311499272, + "grad_norm": 2.356967419590387, + "kl": 0.038330078125, + "learning_rate": 9.706477527647516e-07, + "loss": -0.0035, + "reward": 0.7038216590881348, + "reward_std": 0.6569962501525879, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.6144986748695374, + "step": 94 + }, + { + "completion_length": 81.21875, + "epoch": 0.13828238719068414, + "grad_norm": 2.297586439480163, + "kl": 0.03857421875, + "learning_rate": 9.698463103929541e-07, + "loss": 0.0037, + "reward": 0.02611328661441803, + "reward_std": 0.5660784840583801, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 0.6138085722923279, + "step": 95 + }, + { + "completion_length": 78.6875, + "epoch": 0.13973799126637554, + "grad_norm": 2.183856708817705, + "kl": 0.04736328125, + "learning_rate": 9.69034413448083e-07, + "loss": -0.0024, + "reward": 0.8519986867904663, + "reward_std": 0.8133624792098999, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.6173242330551147, + "step": 96 + }, + { + "completion_length": 71.03125, + "epoch": 0.14119359534206696, + "grad_norm": 2.497616068899775, + "kl": 0.08154296875, + "learning_rate": 9.682120799956961e-07, + "loss": 0.0024, + "reward": 0.7480989694595337, + "reward_std": 0.7188245058059692, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.59375, + "step": 97 + }, + { + "completion_length": 75.328125, + "epoch": 0.14264919941775836, + "grad_norm": 2.521282427156024, + "kl": 0.0556640625, + "learning_rate": 9.673793283335756e-07, + "loss": -0.002, + "reward": 1.0954035520553589, + "reward_std": 0.6585423946380615, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.6114453077316284, + "step": 98 + }, + { + "completion_length": 82.234375, + "epoch": 0.14410480349344978, + "grad_norm": 2.1738986479204794, + "kl": 0.04345703125, + "learning_rate": 9.665361769913186e-07, + "loss": -0.0008, + "reward": 0.48210933804512024, + "reward_std": 0.6862488389015198, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.5999479293823242, + "step": 99 + }, + { + "completion_length": 85.5625, + "epoch": 0.14556040756914118, + "grad_norm": 2.1837000916722658, + "kl": 0.044189453125, + "learning_rate": 9.656826447299271e-07, + "loss": 0.0001, + "reward": 0.8835351467132568, + "reward_std": 0.5615145564079285, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.6321679949760437, + "step": 100 + }, + { + "completion_length": 83.546875, + "epoch": 0.1470160116448326, + "grad_norm": 2.087738988339914, + "kl": 0.04052734375, + "learning_rate": 9.648187505413884e-07, + "loss": 0.0008, + "reward": 0.7421875, + "reward_std": 0.6311359405517578, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.6250260472297668, + "step": 101 + }, + { + "completion_length": 88.03125, + "epoch": 0.14847161572052403, + "grad_norm": 2.1250846849492526, + "kl": 0.04638671875, + "learning_rate": 9.639445136482546e-07, + "loss": -0.0005, + "reward": 0.9741601943969727, + "reward_std": 0.681174635887146, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.6371548771858215, + "step": 102 + }, + { + "completion_length": 74.359375, + "epoch": 0.14992721979621543, + "grad_norm": 2.409258859083457, + "kl": 0.04150390625, + "learning_rate": 9.63059953503213e-07, + "loss": 0.0013, + "reward": 1.1404622793197632, + "reward_std": 0.38158488273620605, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.6089909672737122, + "step": 103 + }, + { + "completion_length": 87.359375, + "epoch": 0.15138282387190685, + "grad_norm": 2.088104027671848, + "kl": 0.0380859375, + "learning_rate": 9.621650897886541e-07, + "loss": -0.0018, + "reward": 0.7542252540588379, + "reward_std": 0.6290829181671143, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.6356835961341858, + "step": 104 + }, + { + "completion_length": 80.703125, + "epoch": 0.15283842794759825, + "grad_norm": 1.965318350453434, + "kl": 0.035400390625, + "learning_rate": 9.612599424162343e-07, + "loss": -0.0008, + "reward": 1.0488801002502441, + "reward_std": 0.48284074664115906, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.6244661808013916, + "step": 105 + }, + { + "completion_length": 93.875, + "epoch": 0.15429403202328967, + "grad_norm": 2.1219245641446975, + "kl": 0.038818359375, + "learning_rate": 9.603445315264316e-07, + "loss": 0.0014, + "reward": 1.0476692914962769, + "reward_std": 0.5120445489883423, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.6368489265441895, + "step": 106 + }, + { + "completion_length": 86.890625, + "epoch": 0.15574963609898107, + "grad_norm": 2.036182966894686, + "kl": 0.044189453125, + "learning_rate": 9.59418877488098e-07, + "loss": -0.004, + "reward": 0.955091118812561, + "reward_std": 0.804840624332428, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.6345182657241821, + "step": 107 + }, + { + "completion_length": 90.484375, + "epoch": 0.1572052401746725, + "grad_norm": 2.042159784075266, + "kl": 0.03173828125, + "learning_rate": 9.584830008980067e-07, + "loss": 0.0002, + "reward": 0.3213476538658142, + "reward_std": 0.6555300354957581, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 0.63895183801651, + "step": 108 + }, + { + "completion_length": 94.625, + "epoch": 0.1586608442503639, + "grad_norm": 1.9758420259115719, + "kl": 0.040771484375, + "learning_rate": 9.57536922580393e-07, + "loss": -0.0006, + "reward": 1.0781381130218506, + "reward_std": 0.7523989677429199, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.6387500762939453, + "step": 109 + }, + { + "completion_length": 91.6875, + "epoch": 0.16011644832605532, + "grad_norm": 2.331594748973951, + "kl": 0.0419921875, + "learning_rate": 9.565806635864917e-07, + "loss": -0.0045, + "reward": 0.5411393046379089, + "reward_std": 0.7696890830993652, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.6179623007774353, + "step": 110 + }, + { + "completion_length": 90.84375, + "epoch": 0.1615720524017467, + "grad_norm": 2.2436142517878648, + "kl": 0.038330078125, + "learning_rate": 9.556142451940679e-07, + "loss": 0.0008, + "reward": 0.2418619841337204, + "reward_std": 0.7631776332855225, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 0.6416015625, + "step": 111 + }, + { + "completion_length": 90.125, + "epoch": 0.16302765647743814, + "grad_norm": 1.8856149828565558, + "kl": 0.0478515625, + "learning_rate": 9.546376889069443e-07, + "loss": -0.0035, + "reward": 0.37279295921325684, + "reward_std": 0.6837524175643921, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.6407356858253479, + "step": 112 + }, + { + "completion_length": 98.203125, + "epoch": 0.16448326055312956, + "grad_norm": 2.173251868707325, + "kl": 0.03466796875, + "learning_rate": 9.536510164545222e-07, + "loss": 0.0007, + "reward": 1.06194007396698, + "reward_std": 0.6035024523735046, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.6475651264190674, + "step": 113 + }, + { + "completion_length": 103.3125, + "epoch": 0.16593886462882096, + "grad_norm": 2.0219223010119887, + "kl": 0.0439453125, + "learning_rate": 9.526542497912983e-07, + "loss": 0.0012, + "reward": 0.4283137917518616, + "reward_std": 0.6393612027168274, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.6627669334411621, + "step": 114 + }, + { + "completion_length": 100.796875, + "epoch": 0.16739446870451238, + "grad_norm": 2.0798140558182134, + "kl": 0.031005859375, + "learning_rate": 9.516474110963761e-07, + "loss": -0.0026, + "reward": 1.1091991662979126, + "reward_std": 0.4762105345726013, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.66357421875, + "step": 115 + }, + { + "completion_length": 100.421875, + "epoch": 0.16885007278020378, + "grad_norm": 2.2236939272257445, + "kl": 0.0478515625, + "learning_rate": 9.506305227729723e-07, + "loss": 0.0058, + "reward": 0.6438932418823242, + "reward_std": 0.509583592414856, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.6582422256469727, + "step": 116 + }, + { + "completion_length": 101.171875, + "epoch": 0.1703056768558952, + "grad_norm": 2.1980519442731468, + "kl": 0.0281982421875, + "learning_rate": 9.496036074479184e-07, + "loss": -0.001, + "reward": 0.827063798904419, + "reward_std": 0.9014047384262085, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.65771484375, + "step": 117 + }, + { + "completion_length": 96.125, + "epoch": 0.1717612809315866, + "grad_norm": 2.037191204255324, + "kl": 0.04150390625, + "learning_rate": 9.48566687971157e-07, + "loss": -0.0004, + "reward": 1.3227994441986084, + "reward_std": 0.5190377235412598, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 0.6540234684944153, + "step": 118 + }, + { + "completion_length": 109.09375, + "epoch": 0.17321688500727803, + "grad_norm": 2.0272853115442544, + "kl": 0.03466796875, + "learning_rate": 9.475197874152339e-07, + "loss": -0.0, + "reward": -0.18306639790534973, + "reward_std": 0.5607576966285706, + "rewards/accuracy_reward": 0.21875, + "rewards/format_reward": 0.664681077003479, + "step": 119 + }, + { + "completion_length": 96.515625, + "epoch": 0.17467248908296942, + "grad_norm": 2.0492268918355374, + "kl": 0.041748046875, + "learning_rate": 9.464629290747842e-07, + "loss": -0.0008, + "reward": 0.7684569954872131, + "reward_std": 0.7265533208847046, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.6427409052848816, + "step": 120 + }, + { + "completion_length": 102.375, + "epoch": 0.17612809315866085, + "grad_norm": 1.9898855505081523, + "kl": 0.041259765625, + "learning_rate": 9.453961364660142e-07, + "loss": -0.001, + "reward": 0.7294921875, + "reward_std": 0.3104066252708435, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.6654947996139526, + "step": 121 + }, + { + "completion_length": 100.15625, + "epoch": 0.17758369723435224, + "grad_norm": 2.3735086249364046, + "kl": 0.037109375, + "learning_rate": 9.443194333261779e-07, + "loss": -0.0009, + "reward": 1.1522916555404663, + "reward_std": 0.48791223764419556, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.660559892654419, + "step": 122 + }, + { + "completion_length": 98.1875, + "epoch": 0.17903930131004367, + "grad_norm": 2.2233975652336175, + "kl": 0.02880859375, + "learning_rate": 9.432328436130493e-07, + "loss": -0.0013, + "reward": 1.3009765148162842, + "reward_std": 0.29463040828704834, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.656653642654419, + "step": 123 + }, + { + "completion_length": 114.890625, + "epoch": 0.1804949053857351, + "grad_norm": 2.0022084670374865, + "kl": 0.037841796875, + "learning_rate": 9.421363915043889e-07, + "loss": 0.0007, + "reward": 0.9197134971618652, + "reward_std": 0.3231443166732788, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.6893749833106995, + "step": 124 + }, + { + "completion_length": 99.171875, + "epoch": 0.1819505094614265, + "grad_norm": 1.8630637664051692, + "kl": 0.029296875, + "learning_rate": 9.410301013974056e-07, + "loss": -0.0012, + "reward": 0.63825523853302, + "reward_std": 0.3934669494628906, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.6578906178474426, + "step": 125 + }, + { + "completion_length": 100.859375, + "epoch": 0.18340611353711792, + "grad_norm": 1.9824110470356529, + "kl": 0.039794921875, + "learning_rate": 9.399139979082147e-07, + "loss": -0.0003, + "reward": 0.7100846171379089, + "reward_std": 0.5975295901298523, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.6606184244155884, + "step": 126 + }, + { + "completion_length": 116.953125, + "epoch": 0.1848617176128093, + "grad_norm": 1.8889996265777742, + "kl": 0.027587890625, + "learning_rate": 9.387881058712888e-07, + "loss": 0.005, + "reward": 0.8783528804779053, + "reward_std": 0.8908267021179199, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.6875325441360474, + "step": 127 + }, + { + "completion_length": 107.375, + "epoch": 0.18631732168850074, + "grad_norm": 2.077089346407135, + "kl": 0.03173828125, + "learning_rate": 9.376524503389065e-07, + "loss": -0.002, + "reward": 0.4938216507434845, + "reward_std": 0.5928758978843689, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.669837236404419, + "step": 128 + }, + { + "completion_length": 117.109375, + "epoch": 0.18777292576419213, + "grad_norm": 1.8893938122618363, + "kl": 0.0294189453125, + "learning_rate": 9.36507056580594e-07, + "loss": -0.0018, + "reward": 0.7212304472923279, + "reward_std": 0.593620777130127, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.6912304759025574, + "step": 129 + }, + { + "completion_length": 123.234375, + "epoch": 0.18922852983988356, + "grad_norm": 1.7898651835340285, + "kl": 0.03173828125, + "learning_rate": 9.353519500825637e-07, + "loss": 0.0031, + "reward": 0.4010286331176758, + "reward_std": 0.5465176105499268, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 0.7014323472976685, + "step": 130 + }, + { + "completion_length": 104.0, + "epoch": 0.19068413391557495, + "grad_norm": 2.2172071907289346, + "kl": 0.0277099609375, + "learning_rate": 9.341871565471463e-07, + "loss": 0.0001, + "reward": 0.7860090732574463, + "reward_std": 0.4027497172355652, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.6570768356323242, + "step": 131 + }, + { + "completion_length": 114.546875, + "epoch": 0.19213973799126638, + "grad_norm": 1.886899003286242, + "kl": 0.0296630859375, + "learning_rate": 9.330127018922193e-07, + "loss": -0.0045, + "reward": 0.7923893332481384, + "reward_std": 0.5422953367233276, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.6841601133346558, + "step": 132 + }, + { + "completion_length": 127.171875, + "epoch": 0.19359534206695778, + "grad_norm": 1.940672486847996, + "kl": 0.0286865234375, + "learning_rate": 9.318286122506302e-07, + "loss": 0.0018, + "reward": 0.7649609446525574, + "reward_std": 0.9051897525787354, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.6996744871139526, + "step": 133 + }, + { + "completion_length": 122.59375, + "epoch": 0.1950509461426492, + "grad_norm": 2.012027311337077, + "kl": 0.03271484375, + "learning_rate": 9.306349139696154e-07, + "loss": 0.0021, + "reward": 0.9356836080551147, + "reward_std": 0.3557165265083313, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.6972851157188416, + "step": 134 + }, + { + "completion_length": 125.34375, + "epoch": 0.1965065502183406, + "grad_norm": 1.9803313783521628, + "kl": 0.03466796875, + "learning_rate": 9.29431633610213e-07, + "loss": -0.0039, + "reward": 1.0579817295074463, + "reward_std": 0.7984186410903931, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.6721354126930237, + "step": 135 + }, + { + "completion_length": 151.75, + "epoch": 0.19796215429403202, + "grad_norm": 1.6537670662618684, + "kl": 0.0308837890625, + "learning_rate": 9.282187979466729e-07, + "loss": 0.0042, + "reward": 0.8291862607002258, + "reward_std": 0.5554116368293762, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.7382357120513916, + "step": 136 + }, + { + "completion_length": 147.265625, + "epoch": 0.19941775836972345, + "grad_norm": 1.7201446218582972, + "kl": 0.031982421875, + "learning_rate": 9.269964339658604e-07, + "loss": 0.006, + "reward": 1.283261775970459, + "reward_std": 0.5926542282104492, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.7361132502555847, + "step": 137 + }, + { + "completion_length": 134.375, + "epoch": 0.20087336244541484, + "grad_norm": 1.874315712431397, + "kl": 0.03173828125, + "learning_rate": 9.257645688666555e-07, + "loss": -0.0038, + "reward": 1.069654941558838, + "reward_std": 0.37698203325271606, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.7255665063858032, + "step": 138 + }, + { + "completion_length": 136.75, + "epoch": 0.20232896652110627, + "grad_norm": 1.814100036642101, + "kl": 0.0301513671875, + "learning_rate": 9.245232300593488e-07, + "loss": 0.0007, + "reward": 0.5383853912353516, + "reward_std": 0.32863178849220276, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.7274609804153442, + "step": 139 + }, + { + "completion_length": 156.84375, + "epoch": 0.20378457059679767, + "grad_norm": 1.7660157876424847, + "kl": 0.042236328125, + "learning_rate": 9.232724451650302e-07, + "loss": -0.0023, + "reward": 1.042018175125122, + "reward_std": 0.4296218752861023, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.7598958611488342, + "step": 140 + }, + { + "completion_length": 152.078125, + "epoch": 0.2052401746724891, + "grad_norm": 1.9326818117418496, + "kl": 0.03662109375, + "learning_rate": 9.220122420149752e-07, + "loss": -0.0015, + "reward": 0.7897005081176758, + "reward_std": 0.8649002313613892, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.7523828148841858, + "step": 141 + }, + { + "completion_length": 146.171875, + "epoch": 0.2066957787481805, + "grad_norm": 2.3170747889179975, + "kl": 0.04150390625, + "learning_rate": 9.207426486500251e-07, + "loss": 0.0029, + "reward": 0.9602214097976685, + "reward_std": 0.48202353715896606, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.7439453601837158, + "step": 142 + }, + { + "completion_length": 153.78125, + "epoch": 0.2081513828238719, + "grad_norm": 1.8295711779834491, + "kl": 0.0380859375, + "learning_rate": 9.194636933199637e-07, + "loss": -0.0005, + "reward": 0.6159765720367432, + "reward_std": 0.6443432569503784, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 0.7611978650093079, + "step": 143 + }, + { + "completion_length": 167.6875, + "epoch": 0.2096069868995633, + "grad_norm": 1.8055857818112881, + "kl": 0.03466796875, + "learning_rate": 9.18175404482888e-07, + "loss": -0.0004, + "reward": 0.7853385210037231, + "reward_std": 0.6822813153266907, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.7819271087646484, + "step": 144 + }, + { + "completion_length": 149.59375, + "epoch": 0.21106259097525473, + "grad_norm": 1.7611225555314238, + "kl": 0.0361328125, + "learning_rate": 9.168778108045758e-07, + "loss": 0.0014, + "reward": 1.3780207633972168, + "reward_std": 0.5230543613433838, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 0.7534895539283752, + "step": 145 + }, + { + "completion_length": 154.515625, + "epoch": 0.21251819505094613, + "grad_norm": 1.8484664379659073, + "kl": 0.03515625, + "learning_rate": 9.155709411578467e-07, + "loss": -0.0028, + "reward": 0.663769543170929, + "reward_std": 0.8413479328155518, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.7554622292518616, + "step": 146 + }, + { + "completion_length": 169.25, + "epoch": 0.21397379912663755, + "grad_norm": 1.5659465541368485, + "kl": 0.03125, + "learning_rate": 9.14254824621921e-07, + "loss": 0.0025, + "reward": 0.8898242115974426, + "reward_std": 0.61174476146698, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.7810482382774353, + "step": 147 + }, + { + "completion_length": 183.078125, + "epoch": 0.21542940320232898, + "grad_norm": 1.9240333532034173, + "kl": 0.0299072265625, + "learning_rate": 9.129294904817715e-07, + "loss": 0.0055, + "reward": -0.32472002506256104, + "reward_std": 0.5251954793930054, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.804186224937439, + "step": 148 + }, + { + "completion_length": 188.078125, + "epoch": 0.21688500727802038, + "grad_norm": 1.584600042627141, + "kl": 0.037353515625, + "learning_rate": 9.115949682274727e-07, + "loss": -0.0006, + "reward": 1.0250911712646484, + "reward_std": 0.7966851592063904, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.7925260066986084, + "step": 149 + }, + { + "completion_length": 192.75, + "epoch": 0.2183406113537118, + "grad_norm": 1.7968297696360862, + "kl": 0.034912109375, + "learning_rate": 9.102512875535438e-07, + "loss": 0.0024, + "reward": 0.9074022769927979, + "reward_std": 0.5267736315727234, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.8198763132095337, + "step": 150 + }, + { + "completion_length": 181.109375, + "epoch": 0.2197962154294032, + "grad_norm": 1.487459256780106, + "kl": 0.034912109375, + "learning_rate": 9.088984783582889e-07, + "loss": 0.0008, + "reward": 1.1278971433639526, + "reward_std": 0.6906546950340271, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.771959662437439, + "step": 151 + }, + { + "completion_length": 168.859375, + "epoch": 0.22125181950509462, + "grad_norm": 1.6688905020817022, + "kl": 0.033203125, + "learning_rate": 9.075365707431311e-07, + "loss": 0.0037, + "reward": 0.38999348878860474, + "reward_std": 0.7020710110664368, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.7868424654006958, + "step": 152 + }, + { + "completion_length": 189.75, + "epoch": 0.22270742358078602, + "grad_norm": 1.5057560252284368, + "kl": 0.03369140625, + "learning_rate": 9.061655950119429e-07, + "loss": -0.0008, + "reward": 0.7166080474853516, + "reward_std": 0.7406556010246277, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.7738346457481384, + "step": 153 + }, + { + "completion_length": 195.203125, + "epoch": 0.22416302765647744, + "grad_norm": 1.6050100488071573, + "kl": 0.029296875, + "learning_rate": 9.04785581670372e-07, + "loss": -0.0032, + "reward": 0.00641275942325592, + "reward_std": 0.9503659605979919, + "rewards/accuracy_reward": 0.3125, + "rewards/format_reward": 0.8109830617904663, + "step": 154 + }, + { + "completion_length": 209.5625, + "epoch": 0.22561863173216884, + "grad_norm": 1.671842517453717, + "kl": 0.030517578125, + "learning_rate": 9.033965614251622e-07, + "loss": 0.002, + "reward": 1.051744818687439, + "reward_std": 0.6885015964508057, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.8231119513511658, + "step": 155 + }, + { + "completion_length": 182.359375, + "epoch": 0.22707423580786026, + "grad_norm": 1.7685493760830322, + "kl": 0.03466796875, + "learning_rate": 9.019985651834703e-07, + "loss": 0.0004, + "reward": 1.1568944454193115, + "reward_std": 0.6391496658325195, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.7899413704872131, + "step": 156 + }, + { + "completion_length": 203.671875, + "epoch": 0.22852983988355166, + "grad_norm": 1.6158229805841648, + "kl": 0.04541015625, + "learning_rate": 9.005916240521787e-07, + "loss": -0.0008, + "reward": 1.3469856977462769, + "reward_std": 0.5584310293197632, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.8243294358253479, + "step": 157 + }, + { + "completion_length": 198.0, + "epoch": 0.22998544395924309, + "grad_norm": 1.5287888769470326, + "kl": 0.03173828125, + "learning_rate": 8.99175769337203e-07, + "loss": -0.0002, + "reward": 0.5042838454246521, + "reward_std": 0.3807409107685089, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8300260901451111, + "step": 158 + }, + { + "completion_length": 198.5625, + "epoch": 0.2314410480349345, + "grad_norm": 1.6980632238166218, + "kl": 0.045654296875, + "learning_rate": 8.97751032542795e-07, + "loss": 0.0048, + "reward": 0.7278580665588379, + "reward_std": 0.8472484350204468, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.809654951095581, + "step": 159 + }, + { + "completion_length": 209.734375, + "epoch": 0.2328966521106259, + "grad_norm": 1.4763866341269176, + "kl": 0.03759765625, + "learning_rate": 8.963174453708424e-07, + "loss": -0.0002, + "reward": 1.0725455284118652, + "reward_std": 0.9471028447151184, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.8029752969741821, + "step": 160 + }, + { + "completion_length": 204.84375, + "epoch": 0.23435225618631733, + "grad_norm": 1.509135110734139, + "kl": 0.039794921875, + "learning_rate": 8.94875039720163e-07, + "loss": 0.0021, + "reward": 0.7245508432388306, + "reward_std": 0.28923317790031433, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.8137694597244263, + "step": 161 + }, + { + "completion_length": 205.6875, + "epoch": 0.23580786026200873, + "grad_norm": 1.670333061374936, + "kl": 0.03466796875, + "learning_rate": 8.934238476857949e-07, + "loss": -0.0015, + "reward": 1.22831392288208, + "reward_std": 0.6582134366035461, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.7960742115974426, + "step": 162 + }, + { + "completion_length": 219.171875, + "epoch": 0.23726346433770015, + "grad_norm": 1.4108741219708183, + "kl": 0.04345703125, + "learning_rate": 8.919639015582828e-07, + "loss": 0.0007, + "reward": 0.8317968845367432, + "reward_std": 0.8654596209526062, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.8558332920074463, + "step": 163 + }, + { + "completion_length": 221.921875, + "epoch": 0.23871906841339155, + "grad_norm": 1.5210214995683051, + "kl": 0.038330078125, + "learning_rate": 8.904952338229587e-07, + "loss": 0.0028, + "reward": 1.3629167079925537, + "reward_std": 0.4224995970726013, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.8020312786102295, + "step": 164 + }, + { + "completion_length": 221.109375, + "epoch": 0.24017467248908297, + "grad_norm": 1.570373533347191, + "kl": 0.047119140625, + "learning_rate": 8.890178771592197e-07, + "loss": -0.003, + "reward": 1.1900064945220947, + "reward_std": 0.45047634840011597, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.8544987440109253, + "step": 165 + }, + { + "completion_length": 226.859375, + "epoch": 0.24163027656477437, + "grad_norm": 1.4369155698170966, + "kl": 0.041748046875, + "learning_rate": 8.875318644398007e-07, + "loss": 0.0001, + "reward": 1.1164518594741821, + "reward_std": 0.6411557793617249, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.8440819978713989, + "step": 166 + }, + { + "completion_length": 217.828125, + "epoch": 0.2430858806404658, + "grad_norm": 1.5294020027875026, + "kl": 0.04150390625, + "learning_rate": 8.860372287300431e-07, + "loss": -0.0025, + "reward": 0.8133528828620911, + "reward_std": 0.3728664517402649, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.8488085269927979, + "step": 167 + }, + { + "completion_length": 232.46875, + "epoch": 0.2445414847161572, + "grad_norm": 1.4493765933491527, + "kl": 0.043701171875, + "learning_rate": 8.845340032871583e-07, + "loss": 0.0056, + "reward": 0.8572330474853516, + "reward_std": 0.6895395517349243, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.82414710521698, + "step": 168 + }, + { + "completion_length": 226.5, + "epoch": 0.24599708879184862, + "grad_norm": 1.4764300782122841, + "kl": 0.0556640625, + "learning_rate": 8.83022221559489e-07, + "loss": 0.0009, + "reward": 0.47426432371139526, + "reward_std": 0.8444293737411499, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8511523604393005, + "step": 169 + }, + { + "completion_length": 237.8125, + "epoch": 0.24745269286754004, + "grad_norm": 1.6769320026302001, + "kl": 0.0498046875, + "learning_rate": 8.815019171857637e-07, + "loss": -0.0027, + "reward": 1.0890560150146484, + "reward_std": 0.5113028287887573, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.8479622602462769, + "step": 170 + }, + { + "completion_length": 261.34375, + "epoch": 0.24890829694323144, + "grad_norm": 1.2657644862990098, + "kl": 0.039794921875, + "learning_rate": 8.799731239943487e-07, + "loss": 0.0036, + "reward": 1.0662890672683716, + "reward_std": 0.6503676176071167, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.8455598950386047, + "step": 171 + }, + { + "completion_length": 242.8125, + "epoch": 0.25036390101892286, + "grad_norm": 1.5140610941306871, + "kl": 0.0439453125, + "learning_rate": 8.784358760024959e-07, + "loss": 0.0, + "reward": 1.0661003589630127, + "reward_std": 0.7506436705589294, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.7991602420806885, + "step": 172 + }, + { + "completion_length": 272.265625, + "epoch": 0.25181950509461426, + "grad_norm": 1.3002869973291449, + "kl": 0.04345703125, + "learning_rate": 8.768902074155847e-07, + "loss": 0.0012, + "reward": 0.5323176383972168, + "reward_std": 0.7601783275604248, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.8805208206176758, + "step": 173 + }, + { + "completion_length": 260.171875, + "epoch": 0.25327510917030566, + "grad_norm": 1.2395481937954542, + "kl": 0.04345703125, + "learning_rate": 8.753361526263621e-07, + "loss": 0.0013, + "reward": 1.0828580856323242, + "reward_std": 0.8798565864562988, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.8691210746765137, + "step": 174 + }, + { + "completion_length": 268.65625, + "epoch": 0.2547307132459971, + "grad_norm": 1.324781469590268, + "kl": 0.046875, + "learning_rate": 8.737737462141768e-07, + "loss": -0.0019, + "reward": 1.4227733612060547, + "reward_std": 0.8888717293739319, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 0.8309636116027832, + "step": 175 + }, + { + "completion_length": 266.9375, + "epoch": 0.2561863173216885, + "grad_norm": 1.2486364117958004, + "kl": 0.049072265625, + "learning_rate": 8.722030229442095e-07, + "loss": 0.0013, + "reward": 1.1527018547058105, + "reward_std": 0.9395639300346375, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.8161133527755737, + "step": 176 + }, + { + "completion_length": 287.296875, + "epoch": 0.2576419213973799, + "grad_norm": 1.3449980174695708, + "kl": 0.0458984375, + "learning_rate": 8.706240177667001e-07, + "loss": -0.0028, + "reward": 0.6147265434265137, + "reward_std": 0.7540205717086792, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.8909245133399963, + "step": 177 + }, + { + "completion_length": 270.5, + "epoch": 0.2590975254730713, + "grad_norm": 1.6585671029978815, + "kl": 0.052490234375, + "learning_rate": 8.690367658161694e-07, + "loss": 0.0004, + "reward": 0.9076562523841858, + "reward_std": 0.698926568031311, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.88106769323349, + "step": 178 + }, + { + "completion_length": 299.03125, + "epoch": 0.26055312954876275, + "grad_norm": 1.27502354364528, + "kl": 0.04443359375, + "learning_rate": 8.674413024106379e-07, + "loss": 0.0011, + "reward": 0.06214843690395355, + "reward_std": 0.6285444498062134, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.8771094083786011, + "step": 179 + }, + { + "completion_length": 311.5625, + "epoch": 0.26200873362445415, + "grad_norm": 1.1519559026695043, + "kl": 0.044677734375, + "learning_rate": 8.658376630508391e-07, + "loss": 0.0012, + "reward": 0.49518883228302, + "reward_std": 1.1731948852539062, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.8484830856323242, + "step": 180 + }, + { + "completion_length": 338.453125, + "epoch": 0.26346433770014555, + "grad_norm": 1.1630608606801263, + "kl": 0.03662109375, + "learning_rate": 8.642258834194305e-07, + "loss": 0.0, + "reward": 0.5747005343437195, + "reward_std": 1.185742974281311, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.855039119720459, + "step": 181 + }, + { + "completion_length": 298.203125, + "epoch": 0.264919941775837, + "grad_norm": 1.092799001313928, + "kl": 0.05078125, + "learning_rate": 8.626059993801986e-07, + "loss": 0.0013, + "reward": 1.1474609375, + "reward_std": 1.3674826622009277, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.7989974021911621, + "step": 182 + }, + { + "completion_length": 316.59375, + "epoch": 0.2663755458515284, + "grad_norm": 1.1905408752898774, + "kl": 0.03662109375, + "learning_rate": 8.609780469772621e-07, + "loss": -0.0003, + "reward": 0.4997330904006958, + "reward_std": 1.4191782474517822, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.7479101419448853, + "step": 183 + }, + { + "completion_length": 343.40625, + "epoch": 0.2678311499272198, + "grad_norm": 1.1407997883526075, + "kl": 0.038818359375, + "learning_rate": 8.593420624342691e-07, + "loss": -0.0005, + "reward": -0.2410416603088379, + "reward_std": 1.0026159286499023, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 0.806236982345581, + "step": 184 + }, + { + "completion_length": 292.59375, + "epoch": 0.2692867540029112, + "grad_norm": 1.3091775192956971, + "kl": 0.057373046875, + "learning_rate": 8.57698082153591e-07, + "loss": 0.0022, + "reward": 0.5668359398841858, + "reward_std": 1.0860553979873657, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.8349218368530273, + "step": 185 + }, + { + "completion_length": 297.1875, + "epoch": 0.27074235807860264, + "grad_norm": 1.1878094968164294, + "kl": 0.03857421875, + "learning_rate": 8.560461427155128e-07, + "loss": 0.0024, + "reward": 0.8997591137886047, + "reward_std": 1.2781544923782349, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.8541991710662842, + "step": 186 + }, + { + "completion_length": 283.4375, + "epoch": 0.27219796215429404, + "grad_norm": 1.3178336609529575, + "kl": 0.048828125, + "learning_rate": 8.543862808774191e-07, + "loss": -0.0003, + "reward": 1.5494986772537231, + "reward_std": 0.4223147928714752, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9061654210090637, + "step": 187 + }, + { + "completion_length": 296.890625, + "epoch": 0.27365356622998543, + "grad_norm": 1.2645455173847566, + "kl": 0.04541015625, + "learning_rate": 8.527185335729765e-07, + "loss": 0.0035, + "reward": 0.7015169262886047, + "reward_std": 1.2943627834320068, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.8435481786727905, + "step": 188 + }, + { + "completion_length": 329.90625, + "epoch": 0.27510917030567683, + "grad_norm": 1.1581867103902632, + "kl": 0.0498046875, + "learning_rate": 8.510429379113113e-07, + "loss": -0.004, + "reward": 0.47220054268836975, + "reward_std": 0.8791263103485107, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.8547526001930237, + "step": 189 + }, + { + "completion_length": 330.734375, + "epoch": 0.2765647743813683, + "grad_norm": 1.242052928211484, + "kl": 0.0419921875, + "learning_rate": 8.493595311761836e-07, + "loss": 0.0019, + "reward": 0.0584830716252327, + "reward_std": 0.9290468096733093, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 0.7730534076690674, + "step": 190 + }, + { + "completion_length": 298.8125, + "epoch": 0.2780203784570597, + "grad_norm": 1.0587431693427292, + "kl": 0.04931640625, + "learning_rate": 8.47668350825159e-07, + "loss": 0.002, + "reward": 0.8388606309890747, + "reward_std": 0.9359092712402344, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.8803450465202332, + "step": 191 + }, + { + "completion_length": 295.765625, + "epoch": 0.2794759825327511, + "grad_norm": 1.2325069141615899, + "kl": 0.044189453125, + "learning_rate": 8.459694344887731e-07, + "loss": -0.0002, + "reward": 1.281217336654663, + "reward_std": 0.7809577584266663, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.8709959983825684, + "step": 192 + }, + { + "completion_length": 344.203125, + "epoch": 0.28093158660844253, + "grad_norm": 1.0295971882868977, + "kl": 0.03857421875, + "learning_rate": 8.44262819969696e-07, + "loss": 0.0018, + "reward": -0.01013021171092987, + "reward_std": 1.1098777055740356, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.8218880295753479, + "step": 193 + }, + { + "completion_length": 341.453125, + "epoch": 0.2823871906841339, + "grad_norm": 1.1002232540991268, + "kl": 0.043212890625, + "learning_rate": 8.425485452418905e-07, + "loss": -0.0031, + "reward": 1.3040039539337158, + "reward_std": 1.2101850509643555, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.8196288347244263, + "step": 194 + }, + { + "completion_length": 313.125, + "epoch": 0.2838427947598253, + "grad_norm": 1.2142916830344008, + "kl": 0.04443359375, + "learning_rate": 8.408266484497664e-07, + "loss": 0.0029, + "reward": 0.5552148222923279, + "reward_std": 0.7991162538528442, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.8420246839523315, + "step": 195 + }, + { + "completion_length": 295.796875, + "epoch": 0.2852983988355167, + "grad_norm": 1.0476158305891778, + "kl": 0.0439453125, + "learning_rate": 8.39097167907333e-07, + "loss": -0.0038, + "reward": 0.6949739456176758, + "reward_std": 1.1475563049316406, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.8432031273841858, + "step": 196 + }, + { + "completion_length": 286.109375, + "epoch": 0.2867540029112082, + "grad_norm": 1.326917469312428, + "kl": 0.04931640625, + "learning_rate": 8.373601420973463e-07, + "loss": -0.0043, + "reward": 0.5129296779632568, + "reward_std": 0.5196168422698975, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.9051302075386047, + "step": 197 + }, + { + "completion_length": 267.21875, + "epoch": 0.28820960698689957, + "grad_norm": 1.4099736885345076, + "kl": 0.054443359375, + "learning_rate": 8.356156096704514e-07, + "loss": 0.0035, + "reward": 0.9967187643051147, + "reward_std": 0.6953111886978149, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.8652864694595337, + "step": 198 + }, + { + "completion_length": 291.3125, + "epoch": 0.28966521106259097, + "grad_norm": 1.3657308201508438, + "kl": 0.04296875, + "learning_rate": 8.338636094443241e-07, + "loss": 0.0016, + "reward": 0.6036978960037231, + "reward_std": 0.5057134628295898, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.8577994704246521, + "step": 199 + }, + { + "completion_length": 301.484375, + "epoch": 0.29112081513828236, + "grad_norm": 1.1206020594596815, + "kl": 0.04345703125, + "learning_rate": 8.32104180402807e-07, + "loss": 0.001, + "reward": 0.6353580951690674, + "reward_std": 1.240022897720337, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.8110611438751221, + "step": 200 + }, + { + "completion_length": 276.0, + "epoch": 0.2925764192139738, + "grad_norm": 1.4814987178126335, + "kl": 0.041015625, + "learning_rate": 8.303373616950406e-07, + "loss": 0.0003, + "reward": 0.9168750047683716, + "reward_std": 0.5409839749336243, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.8954036235809326, + "step": 201 + }, + { + "completion_length": 284.796875, + "epoch": 0.2940320232896652, + "grad_norm": 1.3014402895334682, + "kl": 0.04296875, + "learning_rate": 8.285631926345943e-07, + "loss": 0.0009, + "reward": 1.4197134971618652, + "reward_std": 0.7836724519729614, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9230338335037231, + "step": 202 + }, + { + "completion_length": 303.765625, + "epoch": 0.2954876273653566, + "grad_norm": 1.4100146478905302, + "kl": 0.044921875, + "learning_rate": 8.267817126985897e-07, + "loss": 0.0032, + "reward": 1.148378849029541, + "reward_std": 0.8374971151351929, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.8262174725532532, + "step": 203 + }, + { + "completion_length": 259.140625, + "epoch": 0.29694323144104806, + "grad_norm": 1.3834197464383897, + "kl": 0.046630859375, + "learning_rate": 8.249929615268233e-07, + "loss": 0.0032, + "reward": 0.3902604579925537, + "reward_std": 0.24537065625190735, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.9234635829925537, + "step": 204 + }, + { + "completion_length": 309.109375, + "epoch": 0.29839883551673946, + "grad_norm": 1.2269049796652634, + "kl": 0.03857421875, + "learning_rate": 8.231969789208845e-07, + "loss": -0.0007, + "reward": 0.6886327862739563, + "reward_std": 1.174818992614746, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.8217057585716248, + "step": 205 + }, + { + "completion_length": 285.484375, + "epoch": 0.29985443959243085, + "grad_norm": 1.2547594996135933, + "kl": 0.0458984375, + "learning_rate": 8.213938048432696e-07, + "loss": 0.0074, + "reward": 0.8353320360183716, + "reward_std": 0.9169821739196777, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.8625064492225647, + "step": 206 + }, + { + "completion_length": 269.609375, + "epoch": 0.30131004366812225, + "grad_norm": 1.3754430566964044, + "kl": 0.046142578125, + "learning_rate": 8.195834794164924e-07, + "loss": 0.0009, + "reward": 1.1299283504486084, + "reward_std": 0.8097370862960815, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.91706383228302, + "step": 207 + }, + { + "completion_length": 278.21875, + "epoch": 0.3027656477438137, + "grad_norm": 1.5758484032719748, + "kl": 0.045166015625, + "learning_rate": 8.17766042922192e-07, + "loss": -0.006, + "reward": 1.1924219131469727, + "reward_std": 0.35831767320632935, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.8785676956176758, + "step": 208 + }, + { + "completion_length": 255.671875, + "epoch": 0.3042212518195051, + "grad_norm": 1.3050143773992107, + "kl": 0.0458984375, + "learning_rate": 8.15941535800236e-07, + "loss": -0.003, + "reward": 1.0622721910476685, + "reward_std": 0.30105501413345337, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.87548828125, + "step": 209 + }, + { + "completion_length": 275.90625, + "epoch": 0.3056768558951965, + "grad_norm": 1.1249884714957055, + "kl": 0.041015625, + "learning_rate": 8.141099986478212e-07, + "loss": -0.0028, + "reward": 1.0719857215881348, + "reward_std": 1.051162838935852, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.8484700918197632, + "step": 210 + }, + { + "completion_length": 285.9375, + "epoch": 0.3071324599708879, + "grad_norm": 1.3343580700065274, + "kl": 0.049560546875, + "learning_rate": 8.122714722185695e-07, + "loss": 0.0006, + "reward": 1.1121940612792969, + "reward_std": 0.45470452308654785, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.8535741567611694, + "step": 211 + }, + { + "completion_length": 286.96875, + "epoch": 0.30858806404657935, + "grad_norm": 1.2721134321125864, + "kl": 0.04052734375, + "learning_rate": 8.104259974216218e-07, + "loss": 0.0011, + "reward": 1.3401693105697632, + "reward_std": 0.7562965154647827, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.9284895658493042, + "step": 212 + }, + { + "completion_length": 263.28125, + "epoch": 0.31004366812227074, + "grad_norm": 1.2793045432243102, + "kl": 0.0419921875, + "learning_rate": 8.085736153207276e-07, + "loss": -0.0016, + "reward": 1.68442702293396, + "reward_std": 0.5725850462913513, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.8700129985809326, + "step": 213 + }, + { + "completion_length": 292.578125, + "epoch": 0.31149927219796214, + "grad_norm": 1.1759325546130317, + "kl": 0.039794921875, + "learning_rate": 8.067143671333309e-07, + "loss": -0.0033, + "reward": 0.6993098855018616, + "reward_std": 1.116674542427063, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.8843098878860474, + "step": 214 + }, + { + "completion_length": 293.84375, + "epoch": 0.3129548762736536, + "grad_norm": 1.3260579782974318, + "kl": 0.036865234375, + "learning_rate": 8.048482942296535e-07, + "loss": -0.0036, + "reward": 0.4943684935569763, + "reward_std": 0.884007453918457, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.8536393642425537, + "step": 215 + }, + { + "completion_length": 313.453125, + "epoch": 0.314410480349345, + "grad_norm": 1.244589688629832, + "kl": 0.041259765625, + "learning_rate": 8.02975438131774e-07, + "loss": 0.0005, + "reward": 0.7252734899520874, + "reward_std": 1.0751287937164307, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.8259244561195374, + "step": 216 + }, + { + "completion_length": 320.90625, + "epoch": 0.3158660844250364, + "grad_norm": 1.2906749232909462, + "kl": 0.0341796875, + "learning_rate": 8.010958405127047e-07, + "loss": 0.0034, + "reward": 1.0169856548309326, + "reward_std": 1.000779628753662, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.8830013275146484, + "step": 217 + }, + { + "completion_length": 255.71875, + "epoch": 0.3173216885007278, + "grad_norm": 1.309866431103487, + "kl": 0.042236328125, + "learning_rate": 7.992095431954634e-07, + "loss": 0.0027, + "reward": 0.8923372626304626, + "reward_std": 0.30078914761543274, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.8763346672058105, + "step": 218 + }, + { + "completion_length": 258.109375, + "epoch": 0.31877729257641924, + "grad_norm": 1.3975143387297508, + "kl": 0.040771484375, + "learning_rate": 7.973165881521433e-07, + "loss": 0.0012, + "reward": 1.0036718845367432, + "reward_std": 0.9058128595352173, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.8883463740348816, + "step": 219 + }, + { + "completion_length": 276.6875, + "epoch": 0.32023289665211063, + "grad_norm": 1.2837989231665352, + "kl": 0.03759765625, + "learning_rate": 7.954170175029791e-07, + "loss": -0.0022, + "reward": 0.8400716185569763, + "reward_std": 0.7807621955871582, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9180272817611694, + "step": 220 + }, + { + "completion_length": 280.125, + "epoch": 0.32168850072780203, + "grad_norm": 1.4163987682065777, + "kl": 0.04736328125, + "learning_rate": 7.935108735154092e-07, + "loss": 0.0044, + "reward": 0.7034765481948853, + "reward_std": 0.44748401641845703, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.8766406178474426, + "step": 221 + }, + { + "completion_length": 277.4375, + "epoch": 0.3231441048034934, + "grad_norm": 1.2806875439187504, + "kl": 0.04150390625, + "learning_rate": 7.915981986031366e-07, + "loss": 0.0006, + "reward": 1.4459569454193115, + "reward_std": 0.6417677402496338, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 0.8500716090202332, + "step": 222 + }, + { + "completion_length": 304.625, + "epoch": 0.3245997088791849, + "grad_norm": 1.1924907907457698, + "kl": 0.037353515625, + "learning_rate": 7.896790353251835e-07, + "loss": 0.0001, + "reward": 0.46823570132255554, + "reward_std": 0.7769339084625244, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.8649283647537231, + "step": 223 + }, + { + "completion_length": 303.1875, + "epoch": 0.3260553129548763, + "grad_norm": 1.1503304963010326, + "kl": 0.03466796875, + "learning_rate": 7.877534263849451e-07, + "loss": 0.0005, + "reward": 1.0099999904632568, + "reward_std": 0.964565634727478, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.8617057204246521, + "step": 224 + }, + { + "completion_length": 282.5, + "epoch": 0.32751091703056767, + "grad_norm": 1.1761505740028426, + "kl": 0.039306640625, + "learning_rate": 7.858214146292393e-07, + "loss": -0.0016, + "reward": 0.07833331823348999, + "reward_std": 1.1625947952270508, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 0.8088932037353516, + "step": 225 + }, + { + "completion_length": 312.515625, + "epoch": 0.3289665211062591, + "grad_norm": 1.1648396839632396, + "kl": 0.0361328125, + "learning_rate": 7.838830430473538e-07, + "loss": 0.001, + "reward": -0.08507812023162842, + "reward_std": 0.9128743410110474, + "rewards/accuracy_reward": 0.34375, + "rewards/format_reward": 0.8148698210716248, + "step": 226 + }, + { + "completion_length": 306.5, + "epoch": 0.3304221251819505, + "grad_norm": 0.9816488205657019, + "kl": 0.036376953125, + "learning_rate": 7.819383547700889e-07, + "loss": 0.0021, + "reward": 0.7689843773841858, + "reward_std": 1.1138982772827148, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.8328125476837158, + "step": 227 + }, + { + "completion_length": 266.03125, + "epoch": 0.3318777292576419, + "grad_norm": 1.3503817696996208, + "kl": 0.048828125, + "learning_rate": 7.799873930687977e-07, + "loss": 0.002, + "reward": 0.3949218690395355, + "reward_std": 0.8604871034622192, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.8740885257720947, + "step": 228 + }, + { + "completion_length": 294.15625, + "epoch": 0.3333333333333333, + "grad_norm": 1.0990493246122626, + "kl": 0.03369140625, + "learning_rate": 7.780302013544238e-07, + "loss": 0.0002, + "reward": 1.1642253398895264, + "reward_std": 0.8671401143074036, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.8458007574081421, + "step": 229 + }, + { + "completion_length": 321.265625, + "epoch": 0.33478893740902477, + "grad_norm": 0.9938258028137048, + "kl": 0.037841796875, + "learning_rate": 7.760668231765351e-07, + "loss": -0.0014, + "reward": 0.6853646039962769, + "reward_std": 1.0251922607421875, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.8308333158493042, + "step": 230 + }, + { + "completion_length": 309.53125, + "epoch": 0.33624454148471616, + "grad_norm": 1.123171517093054, + "kl": 0.037353515625, + "learning_rate": 7.740973022223549e-07, + "loss": -0.0024, + "reward": 1.0861718654632568, + "reward_std": 1.1104838848114014, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.8241406083106995, + "step": 231 + }, + { + "completion_length": 309.796875, + "epoch": 0.33770014556040756, + "grad_norm": 1.0934016007582914, + "kl": 0.033935546875, + "learning_rate": 7.721216823157894e-07, + "loss": -0.0063, + "reward": 1.000657558441162, + "reward_std": 0.6190701127052307, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.875084638595581, + "step": 232 + }, + { + "completion_length": 277.453125, + "epoch": 0.33915574963609896, + "grad_norm": 1.4389877479066715, + "kl": 0.035400390625, + "learning_rate": 7.701400074164535e-07, + "loss": -0.0021, + "reward": 1.146998643875122, + "reward_std": 0.6130830645561218, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.8996028900146484, + "step": 233 + }, + { + "completion_length": 292.296875, + "epoch": 0.3406113537117904, + "grad_norm": 1.254688662869592, + "kl": 0.031982421875, + "learning_rate": 7.681523216186911e-07, + "loss": -0.0019, + "reward": 1.3471484184265137, + "reward_std": 0.8775838613510132, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.92557293176651, + "step": 234 + }, + { + "completion_length": 277.265625, + "epoch": 0.3420669577874818, + "grad_norm": 1.1736210621507872, + "kl": 0.037841796875, + "learning_rate": 7.661586691505961e-07, + "loss": 0.0023, + "reward": 0.8513671159744263, + "reward_std": 0.8359072208404541, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9101953506469727, + "step": 235 + }, + { + "completion_length": 262.328125, + "epoch": 0.3435225618631732, + "grad_norm": 1.2639347156124685, + "kl": 0.03857421875, + "learning_rate": 7.641590943730258e-07, + "loss": 0.0009, + "reward": 1.421054720878601, + "reward_std": 0.36540117859840393, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9068489670753479, + "step": 236 + }, + { + "completion_length": 289.109375, + "epoch": 0.34497816593886466, + "grad_norm": 1.2779446490859168, + "kl": 0.0419921875, + "learning_rate": 7.621536417786158e-07, + "loss": 0.0021, + "reward": 0.836510419845581, + "reward_std": 0.8601652383804321, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.8924999833106995, + "step": 237 + }, + { + "completion_length": 287.0625, + "epoch": 0.34643377001455605, + "grad_norm": 1.255028005763755, + "kl": 0.033935546875, + "learning_rate": 7.601423559907894e-07, + "loss": 0.0035, + "reward": 0.6539322733879089, + "reward_std": 0.5522056818008423, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9175911545753479, + "step": 238 + }, + { + "completion_length": 289.578125, + "epoch": 0.34788937409024745, + "grad_norm": 1.1981964024570817, + "kl": 0.035400390625, + "learning_rate": 7.581252817627644e-07, + "loss": -0.0022, + "reward": 1.0136327743530273, + "reward_std": 0.5532514452934265, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.8591275215148926, + "step": 239 + }, + { + "completion_length": 294.765625, + "epoch": 0.34934497816593885, + "grad_norm": 1.3473651417615715, + "kl": 0.039306640625, + "learning_rate": 7.561024639765571e-07, + "loss": -0.0036, + "reward": 0.2712695300579071, + "reward_std": 0.544538676738739, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.8741991519927979, + "step": 240 + }, + { + "completion_length": 294.953125, + "epoch": 0.3508005822416303, + "grad_norm": 1.2518247684034733, + "kl": 0.039306640625, + "learning_rate": 7.540739476419846e-07, + "loss": -0.0002, + "reward": 0.4361327886581421, + "reward_std": 0.4436946213245392, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8888281583786011, + "step": 241 + }, + { + "completion_length": 285.640625, + "epoch": 0.3522561863173217, + "grad_norm": 1.3471385559147369, + "kl": 0.037109375, + "learning_rate": 7.520397778956622e-07, + "loss": 0.0002, + "reward": 0.8719531297683716, + "reward_std": 0.7764471769332886, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9443880319595337, + "step": 242 + }, + { + "completion_length": 278.0625, + "epoch": 0.3537117903930131, + "grad_norm": 1.245756955751304, + "kl": 0.0458984375, + "learning_rate": 7.5e-07, + "loss": 0.0024, + "reward": 0.9515169262886047, + "reward_std": 0.46474969387054443, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.8945116400718689, + "step": 243 + }, + { + "completion_length": 273.65625, + "epoch": 0.3551673944687045, + "grad_norm": 1.3186357128195956, + "kl": 0.041015625, + "learning_rate": 7.479546593421947e-07, + "loss": 0.0008, + "reward": 0.5497395992279053, + "reward_std": 0.9145887494087219, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.9244270324707031, + "step": 244 + }, + { + "completion_length": 302.703125, + "epoch": 0.35662299854439594, + "grad_norm": 1.1625630519045944, + "kl": 0.035400390625, + "learning_rate": 7.459038014332209e-07, + "loss": 0.0016, + "reward": 0.05195310711860657, + "reward_std": 0.509819507598877, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 0.9129297733306885, + "step": 245 + }, + { + "completion_length": 311.40625, + "epoch": 0.35807860262008734, + "grad_norm": 1.3125269152541976, + "kl": 0.0380859375, + "learning_rate": 7.438474719068173e-07, + "loss": -0.0042, + "reward": 1.84199857711792, + "reward_std": 0.3553314805030823, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.9321680068969727, + "step": 246 + }, + { + "completion_length": 295.4375, + "epoch": 0.35953420669577874, + "grad_norm": 1.2573734594613415, + "kl": 0.036865234375, + "learning_rate": 7.417857165184723e-07, + "loss": 0.0018, + "reward": 0.6018099188804626, + "reward_std": 0.4691210389137268, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 0.9123698472976685, + "step": 247 + }, + { + "completion_length": 306.125, + "epoch": 0.3609898107714702, + "grad_norm": 1.2433464807920045, + "kl": 0.03564453125, + "learning_rate": 7.397185811444049e-07, + "loss": 0.0005, + "reward": 0.889244794845581, + "reward_std": 0.806644856929779, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.8913020491600037, + "step": 248 + }, + { + "completion_length": 315.3125, + "epoch": 0.3624454148471616, + "grad_norm": 1.1237789290801656, + "kl": 0.042236328125, + "learning_rate": 7.376461117805449e-07, + "loss": -0.001, + "reward": 1.3869401216506958, + "reward_std": 0.9981783628463745, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.8924478888511658, + "step": 249 + }, + { + "completion_length": 299.75, + "epoch": 0.363901018922853, + "grad_norm": 1.1632799586225018, + "kl": 0.03759765625, + "learning_rate": 7.355683545415089e-07, + "loss": 0.0006, + "reward": 0.85239577293396, + "reward_std": 0.6559892892837524, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.8976563215255737, + "step": 250 + }, + { + "completion_length": 279.09375, + "epoch": 0.3653566229985444, + "grad_norm": 1.1514063608017875, + "kl": 0.04248046875, + "learning_rate": 7.33485355659574e-07, + "loss": -0.0002, + "reward": 0.6367447972297668, + "reward_std": 1.1905419826507568, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 0.913867175579071, + "step": 251 + }, + { + "completion_length": 319.765625, + "epoch": 0.36681222707423583, + "grad_norm": 0.9469865528331761, + "kl": 0.0361328125, + "learning_rate": 7.313971614836495e-07, + "loss": -0.0009, + "reward": 0.50822913646698, + "reward_std": 0.788324236869812, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.8841667175292969, + "step": 252 + }, + { + "completion_length": 321.515625, + "epoch": 0.3682678311499272, + "grad_norm": 1.1463942286208362, + "kl": 0.038818359375, + "learning_rate": 7.293038184782454e-07, + "loss": -0.0042, + "reward": 1.2917838096618652, + "reward_std": 0.6375994086265564, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.89473956823349, + "step": 253 + }, + { + "completion_length": 341.265625, + "epoch": 0.3697234352256186, + "grad_norm": 0.9758868458165839, + "kl": 0.032958984375, + "learning_rate": 7.272053732224387e-07, + "loss": 0.004, + "reward": -0.20222003757953644, + "reward_std": 0.5149335265159607, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 0.8958659172058105, + "step": 254 + }, + { + "completion_length": 332.5625, + "epoch": 0.37117903930131, + "grad_norm": 1.1030567827011573, + "kl": 0.039794921875, + "learning_rate": 7.251018724088366e-07, + "loss": 0.0035, + "reward": 0.6221875548362732, + "reward_std": 1.2777836322784424, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.87088543176651, + "step": 255 + }, + { + "completion_length": 296.59375, + "epoch": 0.3726346433770015, + "grad_norm": 1.1648513737927273, + "kl": 0.042724609375, + "learning_rate": 7.22993362842538e-07, + "loss": 0.0038, + "reward": 0.18544921278953552, + "reward_std": 0.922585666179657, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 0.8398241996765137, + "step": 256 + }, + { + "completion_length": 325.078125, + "epoch": 0.37409024745269287, + "grad_norm": 1.063758965452858, + "kl": 0.03662109375, + "learning_rate": 7.208798914400915e-07, + "loss": 0.0011, + "reward": 0.6968294382095337, + "reward_std": 1.144836187362671, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.8460221290588379, + "step": 257 + }, + { + "completion_length": 315.84375, + "epoch": 0.37554585152838427, + "grad_norm": 1.1772919287263088, + "kl": 0.036376953125, + "learning_rate": 7.187615052284521e-07, + "loss": -0.0012, + "reward": 1.352858066558838, + "reward_std": 0.5259904861450195, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.9533268213272095, + "step": 258 + }, + { + "completion_length": 317.984375, + "epoch": 0.37700145560407566, + "grad_norm": 1.0657146598710652, + "kl": 0.03759765625, + "learning_rate": 7.166382513439343e-07, + "loss": 0.0, + "reward": 0.8918294310569763, + "reward_std": 0.6458997130393982, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.8966862559318542, + "step": 259 + }, + { + "completion_length": 319.5, + "epoch": 0.3784570596797671, + "grad_norm": 1.074343844728951, + "kl": 0.037841796875, + "learning_rate": 7.145101770311633e-07, + "loss": -0.0006, + "reward": 0.8149153590202332, + "reward_std": 1.0369267463684082, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.86767578125, + "step": 260 + }, + { + "completion_length": 308.359375, + "epoch": 0.3799126637554585, + "grad_norm": 1.291348430451882, + "kl": 0.03662109375, + "learning_rate": 7.12377329642024e-07, + "loss": 0.0032, + "reward": 1.153606653213501, + "reward_std": 0.9500166773796082, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.8819010853767395, + "step": 261 + }, + { + "completion_length": 325.78125, + "epoch": 0.3813682678311499, + "grad_norm": 1.0948918878420022, + "kl": 0.0380859375, + "learning_rate": 7.102397566346072e-07, + "loss": -0.0025, + "reward": 0.6883333325386047, + "reward_std": 0.6285832524299622, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.9229167103767395, + "step": 262 + }, + { + "completion_length": 303.671875, + "epoch": 0.38282387190684136, + "grad_norm": 1.2246966458821558, + "kl": 0.039306640625, + "learning_rate": 7.080975055721537e-07, + "loss": 0.0028, + "reward": 1.2784569263458252, + "reward_std": 0.6068631410598755, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.9090169072151184, + "step": 263 + }, + { + "completion_length": 301.515625, + "epoch": 0.38427947598253276, + "grad_norm": 1.274689095724219, + "kl": 0.034423828125, + "learning_rate": 7.059506241219964e-07, + "loss": 0.0038, + "reward": 0.5575065016746521, + "reward_std": 0.2938391864299774, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.9139648079872131, + "step": 264 + }, + { + "completion_length": 317.65625, + "epoch": 0.38573508005822416, + "grad_norm": 1.1114171001211495, + "kl": 0.037353515625, + "learning_rate": 7.037991600544982e-07, + "loss": 0.0001, + "reward": 0.32826173305511475, + "reward_std": 1.0576179027557373, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.8831444978713989, + "step": 265 + }, + { + "completion_length": 307.125, + "epoch": 0.38719068413391555, + "grad_norm": 1.0365067938257357, + "kl": 0.03271484375, + "learning_rate": 7.016431612419906e-07, + "loss": 0.0014, + "reward": 0.7509114742279053, + "reward_std": 1.0201146602630615, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.8540234565734863, + "step": 266 + }, + { + "completion_length": 315.0625, + "epoch": 0.388646288209607, + "grad_norm": 1.1542972359276211, + "kl": 0.039794921875, + "learning_rate": 6.994826756577081e-07, + "loss": 0.0028, + "reward": 0.3422461152076721, + "reward_std": 0.7236604690551758, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.8930273652076721, + "step": 267 + }, + { + "completion_length": 292.453125, + "epoch": 0.3901018922852984, + "grad_norm": 1.247691385135112, + "kl": 0.03759765625, + "learning_rate": 6.973177513747204e-07, + "loss": 0.004, + "reward": 1.1121549606323242, + "reward_std": 0.2723959684371948, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9376627206802368, + "step": 268 + }, + { + "completion_length": 327.25, + "epoch": 0.3915574963609898, + "grad_norm": 1.1547141576656188, + "kl": 0.03662109375, + "learning_rate": 6.951484365648627e-07, + "loss": 0.0021, + "reward": 0.8944270610809326, + "reward_std": 0.8052605390548706, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.8651041984558105, + "step": 269 + }, + { + "completion_length": 303.03125, + "epoch": 0.3930131004366812, + "grad_norm": 1.3851574923540422, + "kl": 0.03955078125, + "learning_rate": 6.929747794976643e-07, + "loss": -0.0045, + "reward": 1.0955729484558105, + "reward_std": 0.38242411613464355, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9138151407241821, + "step": 270 + }, + { + "completion_length": 271.625, + "epoch": 0.39446870451237265, + "grad_norm": 1.3401818240693362, + "kl": 0.037353515625, + "learning_rate": 6.907968285392743e-07, + "loss": -0.0006, + "reward": 1.7740495204925537, + "reward_std": 0.47597813606262207, + "rewards/accuracy_reward": 0.953125, + "rewards/format_reward": 0.9092708826065063, + "step": 271 + }, + { + "completion_length": 294.453125, + "epoch": 0.39592430858806404, + "grad_norm": 1.2186833540315785, + "kl": 0.036865234375, + "learning_rate": 6.886146321513849e-07, + "loss": -0.0039, + "reward": 0.8727213740348816, + "reward_std": 0.7979456186294556, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.86697918176651, + "step": 272 + }, + { + "completion_length": 299.578125, + "epoch": 0.39737991266375544, + "grad_norm": 1.1830905690032625, + "kl": 0.0390625, + "learning_rate": 6.864282388901543e-07, + "loss": -0.0064, + "reward": 0.28068357706069946, + "reward_std": 0.5684016942977905, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9229100942611694, + "step": 273 + }, + { + "completion_length": 298.40625, + "epoch": 0.3988355167394469, + "grad_norm": 1.1221914279750511, + "kl": 0.035888671875, + "learning_rate": 6.84237697405125e-07, + "loss": -0.003, + "reward": 1.0086263418197632, + "reward_std": 0.8679967522621155, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.8907877206802368, + "step": 274 + }, + { + "completion_length": 326.9375, + "epoch": 0.4002911208151383, + "grad_norm": 1.0580769064192943, + "kl": 0.0400390625, + "learning_rate": 6.820430564381419e-07, + "loss": -0.0007, + "reward": 0.6745051741600037, + "reward_std": 0.84361732006073, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.8712891340255737, + "step": 275 + }, + { + "completion_length": 293.09375, + "epoch": 0.4017467248908297, + "grad_norm": 1.2858645148857393, + "kl": 0.04541015625, + "learning_rate": 6.79844364822268e-07, + "loss": -0.0051, + "reward": 0.7683008313179016, + "reward_std": 0.6962195634841919, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.893470048904419, + "step": 276 + }, + { + "completion_length": 328.234375, + "epoch": 0.4032023289665211, + "grad_norm": 1.0468195864436123, + "kl": 0.038818359375, + "learning_rate": 6.776416714806969e-07, + "loss": -0.0035, + "reward": 1.0561068058013916, + "reward_std": 1.1647756099700928, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.8450260162353516, + "step": 277 + }, + { + "completion_length": 309.640625, + "epoch": 0.40465793304221254, + "grad_norm": 1.3703569385022722, + "kl": 0.042724609375, + "learning_rate": 6.754350254256652e-07, + "loss": 0.003, + "reward": 1.3729296922683716, + "reward_std": 0.5724613070487976, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9223437309265137, + "step": 278 + }, + { + "completion_length": 299.453125, + "epoch": 0.40611353711790393, + "grad_norm": 1.1513843198550995, + "kl": 0.0458984375, + "learning_rate": 6.732244757573618e-07, + "loss": 0.0021, + "reward": 0.58970046043396, + "reward_std": 0.6756365299224854, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.8674869537353516, + "step": 279 + }, + { + "completion_length": 285.890625, + "epoch": 0.40756914119359533, + "grad_norm": 1.293846147841921, + "kl": 0.0390625, + "learning_rate": 6.710100716628344e-07, + "loss": 0.0049, + "reward": 0.7233072519302368, + "reward_std": 0.29892754554748535, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.8979557752609253, + "step": 280 + }, + { + "completion_length": 266.390625, + "epoch": 0.4090247452692867, + "grad_norm": 1.2289155325973107, + "kl": 0.041259765625, + "learning_rate": 6.687918624148963e-07, + "loss": -0.0007, + "reward": 0.04064452648162842, + "reward_std": 0.2825842499732971, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 0.8914909362792969, + "step": 281 + }, + { + "completion_length": 297.8125, + "epoch": 0.4104803493449782, + "grad_norm": 1.1879387558526018, + "kl": 0.037109375, + "learning_rate": 6.665698973710288e-07, + "loss": -0.0028, + "reward": 0.9278711080551147, + "reward_std": 0.7110856771469116, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.8445507884025574, + "step": 282 + }, + { + "completion_length": 300.5625, + "epoch": 0.4119359534206696, + "grad_norm": 1.1205832368306863, + "kl": 0.03759765625, + "learning_rate": 6.643442259722845e-07, + "loss": -0.0021, + "reward": 0.5016796588897705, + "reward_std": 0.8245861530303955, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.8688150644302368, + "step": 283 + }, + { + "completion_length": 291.46875, + "epoch": 0.413391557496361, + "grad_norm": 1.1232126206960045, + "kl": 0.040771484375, + "learning_rate": 6.621148977421855e-07, + "loss": 0.0054, + "reward": 1.0367382764816284, + "reward_std": 0.8832352757453918, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9021419286727905, + "step": 284 + }, + { + "completion_length": 288.328125, + "epoch": 0.4148471615720524, + "grad_norm": 1.186415542575965, + "kl": 0.035888671875, + "learning_rate": 6.598819622856226e-07, + "loss": -0.0035, + "reward": 1.6761784553527832, + "reward_std": 0.69477379322052, + "rewards/accuracy_reward": 0.921875, + "rewards/format_reward": 0.9105534553527832, + "step": 285 + }, + { + "completion_length": 292.609375, + "epoch": 0.4163027656477438, + "grad_norm": 1.2789098266158088, + "kl": 0.043212890625, + "learning_rate": 6.576454692877512e-07, + "loss": 0.0009, + "reward": 1.4440560340881348, + "reward_std": 0.40633606910705566, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9008529186248779, + "step": 286 + }, + { + "completion_length": 300.9375, + "epoch": 0.4177583697234352, + "grad_norm": 1.1125232437916257, + "kl": 0.039306640625, + "learning_rate": 6.554054685128856e-07, + "loss": 0.003, + "reward": 0.5055012702941895, + "reward_std": 0.5191164612770081, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.9176367521286011, + "step": 287 + }, + { + "completion_length": 305.1875, + "epoch": 0.4192139737991266, + "grad_norm": 1.3508336793233238, + "kl": 0.042236328125, + "learning_rate": 6.531620098033918e-07, + "loss": -0.0003, + "reward": 0.727037787437439, + "reward_std": 0.7846024036407471, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.9105533957481384, + "step": 288 + }, + { + "completion_length": 286.953125, + "epoch": 0.42066957787481807, + "grad_norm": 1.355472255753869, + "kl": 0.0419921875, + "learning_rate": 6.509151430785785e-07, + "loss": 0.0057, + "reward": 1.5655077695846558, + "reward_std": 0.363979309797287, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9299349188804626, + "step": 289 + }, + { + "completion_length": 324.296875, + "epoch": 0.42212518195050946, + "grad_norm": 1.0617875027109431, + "kl": 0.042236328125, + "learning_rate": 6.486649183335862e-07, + "loss": 0.0028, + "reward": 0.3495572805404663, + "reward_std": 0.61403489112854, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.8596354126930237, + "step": 290 + }, + { + "completion_length": 303.375, + "epoch": 0.42358078602620086, + "grad_norm": 1.2306825555758596, + "kl": 0.040283203125, + "learning_rate": 6.464113856382751e-07, + "loss": 0.0013, + "reward": 0.9183528423309326, + "reward_std": 0.4856081008911133, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.88413405418396, + "step": 291 + }, + { + "completion_length": 339.09375, + "epoch": 0.42503639010189226, + "grad_norm": 0.9937441597737549, + "kl": 0.035400390625, + "learning_rate": 6.441545951361109e-07, + "loss": -0.0029, + "reward": 0.5682356357574463, + "reward_std": 1.1054012775421143, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.8453580737113953, + "step": 292 + }, + { + "completion_length": 325.109375, + "epoch": 0.4264919941775837, + "grad_norm": 1.1320952087408245, + "kl": 0.03857421875, + "learning_rate": 6.418945970430485e-07, + "loss": -0.0018, + "reward": 1.6307356357574463, + "reward_std": 0.8666630983352661, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.9119856357574463, + "step": 293 + }, + { + "completion_length": 312.953125, + "epoch": 0.4279475982532751, + "grad_norm": 1.0902739141585274, + "kl": 0.042236328125, + "learning_rate": 6.39631441646415e-07, + "loss": -0.003, + "reward": 0.8973242044448853, + "reward_std": 0.8014050722122192, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.8655273914337158, + "step": 294 + }, + { + "completion_length": 317.765625, + "epoch": 0.4294032023289665, + "grad_norm": 1.1492762649925403, + "kl": 0.04052734375, + "learning_rate": 6.373651793037916e-07, + "loss": -0.001, + "reward": 0.6366666555404663, + "reward_std": 0.6303349733352661, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9156770706176758, + "step": 295 + }, + { + "completion_length": 303.640625, + "epoch": 0.43085880640465796, + "grad_norm": 1.1237468628260645, + "kl": 0.044921875, + "learning_rate": 6.35095860441891e-07, + "loss": 0.0027, + "reward": 0.8331836462020874, + "reward_std": 0.38058170676231384, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9308528900146484, + "step": 296 + }, + { + "completion_length": 309.765625, + "epoch": 0.43231441048034935, + "grad_norm": 1.079089420677735, + "kl": 0.041015625, + "learning_rate": 6.328235355554381e-07, + "loss": 0.0039, + "reward": 1.14655601978302, + "reward_std": 0.9032782912254333, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9077669382095337, + "step": 297 + }, + { + "completion_length": 327.1875, + "epoch": 0.43377001455604075, + "grad_norm": 1.1012850272675234, + "kl": 0.041015625, + "learning_rate": 6.305482552060441e-07, + "loss": -0.0018, + "reward": 0.41550779342651367, + "reward_std": 0.9270626306533813, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.8848437070846558, + "step": 298 + }, + { + "completion_length": 321.90625, + "epoch": 0.43522561863173215, + "grad_norm": 1.1334862363298608, + "kl": 0.0439453125, + "learning_rate": 6.282700700210826e-07, + "loss": 0.0016, + "reward": 0.8490754961967468, + "reward_std": 0.8852044939994812, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.950390636920929, + "step": 299 + }, + { + "completion_length": 319.46875, + "epoch": 0.4366812227074236, + "grad_norm": 1.3237664738865698, + "kl": 0.04052734375, + "learning_rate": 6.259890306925626e-07, + "loss": 0.0024, + "reward": 1.1324349641799927, + "reward_std": 0.6891584992408752, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9130728840827942, + "step": 300 + }, + { + "completion_length": 340.21875, + "epoch": 0.438136826783115, + "grad_norm": 0.925628291932368, + "kl": 0.039794921875, + "learning_rate": 6.237051879760013e-07, + "loss": -0.0004, + "reward": -0.13870440423488617, + "reward_std": 0.4732888340950012, + "rewards/accuracy_reward": 0.328125, + "rewards/format_reward": 0.8486002683639526, + "step": 301 + }, + { + "completion_length": 293.5, + "epoch": 0.4395924308588064, + "grad_norm": 1.3848838366572507, + "kl": 0.048583984375, + "learning_rate": 6.214185926892935e-07, + "loss": 0.0011, + "reward": 0.7430468797683716, + "reward_std": 0.5002673864364624, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.9450520873069763, + "step": 302 + }, + { + "completion_length": 301.5, + "epoch": 0.4410480349344978, + "grad_norm": 1.1287435583620058, + "kl": 0.057373046875, + "learning_rate": 6.191292957115824e-07, + "loss": 0.0004, + "reward": 1.1467642784118652, + "reward_std": 0.2716268002986908, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9220898151397705, + "step": 303 + }, + { + "completion_length": 313.65625, + "epoch": 0.44250363901018924, + "grad_norm": 1.1031415899812522, + "kl": 0.0419921875, + "learning_rate": 6.168373479821263e-07, + "loss": 0.0023, + "reward": 0.6278645992279053, + "reward_std": 0.4927278757095337, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9092448353767395, + "step": 304 + }, + { + "completion_length": 324.65625, + "epoch": 0.44395924308588064, + "grad_norm": 1.1753628489624894, + "kl": 0.04296875, + "learning_rate": 6.145428004991649e-07, + "loss": 0.0031, + "reward": 1.2421419620513916, + "reward_std": 0.5803054571151733, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.93757164478302, + "step": 305 + }, + { + "completion_length": 329.421875, + "epoch": 0.44541484716157204, + "grad_norm": 1.0124718909261303, + "kl": 0.042724609375, + "learning_rate": 6.122457043187862e-07, + "loss": 0.0026, + "reward": 1.2116667032241821, + "reward_std": 0.6427367925643921, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.90095055103302, + "step": 306 + }, + { + "completion_length": 308.265625, + "epoch": 0.4468704512372635, + "grad_norm": 1.0168849937197908, + "kl": 0.047607421875, + "learning_rate": 6.099461105537888e-07, + "loss": -0.0027, + "reward": 0.7654426693916321, + "reward_std": 0.5294202566146851, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.9123828411102295, + "step": 307 + }, + { + "completion_length": 312.984375, + "epoch": 0.4483260553129549, + "grad_norm": 1.304050444945634, + "kl": 0.04443359375, + "learning_rate": 6.076440703725452e-07, + "loss": -0.0045, + "reward": 0.4798697829246521, + "reward_std": 0.11288902163505554, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9344010353088379, + "step": 308 + }, + { + "completion_length": 316.625, + "epoch": 0.4497816593886463, + "grad_norm": 1.0318927611491964, + "kl": 0.047607421875, + "learning_rate": 6.053396349978631e-07, + "loss": -0.0003, + "reward": 0.36076819896698, + "reward_std": 0.37749600410461426, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.8674870133399963, + "step": 309 + }, + { + "completion_length": 321.265625, + "epoch": 0.4512372634643377, + "grad_norm": 1.054780956617541, + "kl": 0.0517578125, + "learning_rate": 6.030328557058463e-07, + "loss": 0.0008, + "reward": 0.5132877826690674, + "reward_std": 0.8486453294754028, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 0.8326627612113953, + "step": 310 + }, + { + "completion_length": 321.21875, + "epoch": 0.45269286754002913, + "grad_norm": 1.1712340167228774, + "kl": 0.045654296875, + "learning_rate": 6.007237838247525e-07, + "loss": 0.0004, + "reward": 1.3080989122390747, + "reward_std": 0.7990570068359375, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.91447913646698, + "step": 311 + }, + { + "completion_length": 306.953125, + "epoch": 0.45414847161572053, + "grad_norm": 1.2366218066275552, + "kl": 0.050048828125, + "learning_rate": 5.984124707338527e-07, + "loss": -0.0022, + "reward": 1.22621750831604, + "reward_std": 0.586514413356781, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.9193814396858215, + "step": 312 + }, + { + "completion_length": 290.796875, + "epoch": 0.4556040756914119, + "grad_norm": 1.4431350657604642, + "kl": 0.051513671875, + "learning_rate": 5.960989678622864e-07, + "loss": 0.0034, + "reward": 1.7498502731323242, + "reward_std": 0.417039155960083, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9267643094062805, + "step": 313 + }, + { + "completion_length": 324.75, + "epoch": 0.4570596797671033, + "grad_norm": 1.2410969845454607, + "kl": 0.0419921875, + "learning_rate": 5.937833266879186e-07, + "loss": -0.0065, + "reward": 1.30293607711792, + "reward_std": 1.056250810623169, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.9061523675918579, + "step": 314 + }, + { + "completion_length": 299.71875, + "epoch": 0.4585152838427948, + "grad_norm": 1.206619606919444, + "kl": 0.049072265625, + "learning_rate": 5.914655987361933e-07, + "loss": 0.0001, + "reward": 1.1884570121765137, + "reward_std": 0.744032621383667, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.8389909267425537, + "step": 315 + }, + { + "completion_length": 320.125, + "epoch": 0.45997088791848617, + "grad_norm": 1.3350598933022013, + "kl": 0.048583984375, + "learning_rate": 5.891458355789879e-07, + "loss": 0.0027, + "reward": 0.7094922065734863, + "reward_std": 0.5736180543899536, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.9550390839576721, + "step": 316 + }, + { + "completion_length": 317.203125, + "epoch": 0.46142649199417757, + "grad_norm": 1.1181223383097147, + "kl": 0.05078125, + "learning_rate": 5.868240888334652e-07, + "loss": 0.001, + "reward": 0.6829947829246521, + "reward_std": 0.4590635299682617, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.9185676574707031, + "step": 317 + }, + { + "completion_length": 340.171875, + "epoch": 0.462882096069869, + "grad_norm": 1.0567451773483107, + "kl": 0.052734375, + "learning_rate": 5.845004101609246e-07, + "loss": 0.0011, + "reward": 0.6952344179153442, + "reward_std": 0.4979342818260193, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.9068880677223206, + "step": 318 + }, + { + "completion_length": 320.375, + "epoch": 0.4643377001455604, + "grad_norm": 1.216157657027118, + "kl": 0.0498046875, + "learning_rate": 5.82174851265653e-07, + "loss": 0.0047, + "reward": 0.7820637822151184, + "reward_std": 0.6226429343223572, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.8854231834411621, + "step": 319 + }, + { + "completion_length": 324.625, + "epoch": 0.4657933042212518, + "grad_norm": 1.091320761141732, + "kl": 0.04541015625, + "learning_rate": 5.798474638937747e-07, + "loss": 0.0003, + "reward": 1.2951171398162842, + "reward_std": 0.6852482557296753, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.9411588907241821, + "step": 320 + }, + { + "completion_length": 310.015625, + "epoch": 0.4672489082969432, + "grad_norm": 1.305156958844844, + "kl": 0.054443359375, + "learning_rate": 5.775182998320989e-07, + "loss": -0.0021, + "reward": 1.2777929306030273, + "reward_std": 1.1254336833953857, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.8748111724853516, + "step": 321 + }, + { + "completion_length": 330.21875, + "epoch": 0.46870451237263466, + "grad_norm": 1.2302790562732457, + "kl": 0.045166015625, + "learning_rate": 5.751874109069684e-07, + "loss": -0.0025, + "reward": 1.56145179271698, + "reward_std": 0.5644485950469971, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9349414110183716, + "step": 322 + }, + { + "completion_length": 341.703125, + "epoch": 0.47016011644832606, + "grad_norm": 1.0929022937248165, + "kl": 0.04736328125, + "learning_rate": 5.728548489831057e-07, + "loss": 0.0025, + "reward": 0.21611331403255463, + "reward_std": 0.6904685497283936, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.8760481476783752, + "step": 323 + }, + { + "completion_length": 316.984375, + "epoch": 0.47161572052401746, + "grad_norm": 1.048375002277041, + "kl": 0.04736328125, + "learning_rate": 5.705206659624596e-07, + "loss": -0.001, + "reward": 1.0557878017425537, + "reward_std": 0.761435866355896, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9400846362113953, + "step": 324 + }, + { + "completion_length": 323.625, + "epoch": 0.47307132459970885, + "grad_norm": 1.1891964320438795, + "kl": 0.0498046875, + "learning_rate": 5.6818491378305e-07, + "loss": -0.0003, + "reward": 1.0568814277648926, + "reward_std": 0.422406941652298, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9388867020606995, + "step": 325 + }, + { + "completion_length": 358.796875, + "epoch": 0.4745269286754003, + "grad_norm": 1.1285434321384047, + "kl": 0.046630859375, + "learning_rate": 5.658476444178118e-07, + "loss": -0.0037, + "reward": 0.16022136807441711, + "reward_std": 0.4567154049873352, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 0.9228385090827942, + "step": 326 + }, + { + "completion_length": 311.484375, + "epoch": 0.4759825327510917, + "grad_norm": 1.2560433618274836, + "kl": 0.05224609375, + "learning_rate": 5.635089098734393e-07, + "loss": 0.0005, + "reward": 1.5854361057281494, + "reward_std": 0.5700551271438599, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9499154090881348, + "step": 327 + }, + { + "completion_length": 350.953125, + "epoch": 0.4774381368267831, + "grad_norm": 1.0742309488389477, + "kl": 0.046142578125, + "learning_rate": 5.611687621892286e-07, + "loss": 0.0009, + "reward": 1.1679883003234863, + "reward_std": 0.7857703566551208, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9162304401397705, + "step": 328 + }, + { + "completion_length": 341.96875, + "epoch": 0.47889374090247455, + "grad_norm": 1.1156444737560618, + "kl": 0.046142578125, + "learning_rate": 5.588272534359192e-07, + "loss": 0.005, + "reward": 1.3799349069595337, + "reward_std": 0.4269542694091797, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9424349069595337, + "step": 329 + }, + { + "completion_length": 329.125, + "epoch": 0.48034934497816595, + "grad_norm": 1.0577263144575106, + "kl": 0.04833984375, + "learning_rate": 5.564844357145364e-07, + "loss": 0.0029, + "reward": 1.4966275691986084, + "reward_std": 0.5279096961021423, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 0.9164062738418579, + "step": 330 + }, + { + "completion_length": 336.40625, + "epoch": 0.48180494905385735, + "grad_norm": 1.218724139541029, + "kl": 0.051513671875, + "learning_rate": 5.541403611552309e-07, + "loss": 0.0004, + "reward": 0.5468424558639526, + "reward_std": 0.6334984302520752, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.9350846409797668, + "step": 331 + }, + { + "completion_length": 372.921875, + "epoch": 0.48326055312954874, + "grad_norm": 0.9652643655545916, + "kl": 0.043212890625, + "learning_rate": 5.517950819161196e-07, + "loss": -0.0024, + "reward": 0.44192707538604736, + "reward_std": 1.4124200344085693, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.8328776359558105, + "step": 332 + }, + { + "completion_length": 353.0, + "epoch": 0.4847161572052402, + "grad_norm": 1.0409879516651472, + "kl": 0.046142578125, + "learning_rate": 5.49448650182125e-07, + "loss": 0.0024, + "reward": 0.6646744608879089, + "reward_std": 0.7230473756790161, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.8712239265441895, + "step": 333 + }, + { + "completion_length": 333.578125, + "epoch": 0.4861717612809316, + "grad_norm": 1.0907676643802855, + "kl": 0.052978515625, + "learning_rate": 5.47101118163813e-07, + "loss": 0.0032, + "reward": 1.0463411808013916, + "reward_std": 0.6876223087310791, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9171745181083679, + "step": 334 + }, + { + "completion_length": 345.171875, + "epoch": 0.487627365356623, + "grad_norm": 1.0768322878334375, + "kl": 0.051025390625, + "learning_rate": 5.447525380962334e-07, + "loss": 0.0012, + "reward": 0.48595699667930603, + "reward_std": 0.8556101322174072, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.8658137917518616, + "step": 335 + }, + { + "completion_length": 357.90625, + "epoch": 0.4890829694323144, + "grad_norm": 1.0818933763915959, + "kl": 0.0439453125, + "learning_rate": 5.424029622377546e-07, + "loss": -0.0016, + "reward": 0.9220898151397705, + "reward_std": 1.0697258710861206, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.8551627397537231, + "step": 336 + }, + { + "completion_length": 367.09375, + "epoch": 0.49053857350800584, + "grad_norm": 0.9408245385193819, + "kl": 0.046142578125, + "learning_rate": 5.400524428689035e-07, + "loss": -0.0019, + "reward": 0.23695963621139526, + "reward_std": 0.7709278464317322, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 0.8613867163658142, + "step": 337 + }, + { + "completion_length": 341.09375, + "epoch": 0.49199417758369723, + "grad_norm": 0.9905746624278071, + "kl": 0.050048828125, + "learning_rate": 5.377010322912008e-07, + "loss": -0.0, + "reward": 0.875012993812561, + "reward_std": 0.9516023397445679, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.8464062213897705, + "step": 338 + }, + { + "completion_length": 340.9375, + "epoch": 0.49344978165938863, + "grad_norm": 1.1600321561212144, + "kl": 0.052001953125, + "learning_rate": 5.353487828259972e-07, + "loss": 0.0018, + "reward": 0.9952148795127869, + "reward_std": 0.7256425619125366, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9173762798309326, + "step": 339 + }, + { + "completion_length": 362.953125, + "epoch": 0.4949053857350801, + "grad_norm": 0.9677169222469415, + "kl": 0.043212890625, + "learning_rate": 5.329957468133103e-07, + "loss": -0.0019, + "reward": 0.9681054353713989, + "reward_std": 0.8885953426361084, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.8975194692611694, + "step": 340 + }, + { + "completion_length": 348.46875, + "epoch": 0.4963609898107715, + "grad_norm": 0.9992111334554165, + "kl": 0.04541015625, + "learning_rate": 5.306419766106581e-07, + "loss": -0.0002, + "reward": 0.6875976324081421, + "reward_std": 0.5412222146987915, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.9448763132095337, + "step": 341 + }, + { + "completion_length": 339.234375, + "epoch": 0.4978165938864629, + "grad_norm": 1.0798384611489298, + "kl": 0.047119140625, + "learning_rate": 5.282875245918962e-07, + "loss": -0.0015, + "reward": 1.2949869632720947, + "reward_std": 0.6558061838150024, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.8986979722976685, + "step": 342 + }, + { + "completion_length": 328.71875, + "epoch": 0.4992721979621543, + "grad_norm": 1.1460335335257221, + "kl": 0.056884765625, + "learning_rate": 5.259324431460506e-07, + "loss": 0.0016, + "reward": 1.0809309482574463, + "reward_std": 0.42390185594558716, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9173502326011658, + "step": 343 + }, + { + "completion_length": 327.828125, + "epoch": 0.5007278020378457, + "grad_norm": 1.1732063642209178, + "kl": 0.046142578125, + "learning_rate": 5.235767846761529e-07, + "loss": -0.003, + "reward": 1.85539710521698, + "reward_std": 0.26673340797424316, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.9491471648216248, + "step": 344 + }, + { + "completion_length": 351.890625, + "epoch": 0.5021834061135371, + "grad_norm": 1.1152451844872624, + "kl": 0.0478515625, + "learning_rate": 5.212206015980741e-07, + "loss": 0.0033, + "reward": 1.0626237392425537, + "reward_std": 0.6199690103530884, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9055924415588379, + "step": 345 + }, + { + "completion_length": 339.921875, + "epoch": 0.5036390101892285, + "grad_norm": 1.1045572732346232, + "kl": 0.046142578125, + "learning_rate": 5.188639463393586e-07, + "loss": 0.0048, + "reward": 1.1775846481323242, + "reward_std": 0.4957619905471802, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9695507287979126, + "step": 346 + }, + { + "completion_length": 356.515625, + "epoch": 0.50509461426492, + "grad_norm": 1.0563165828060364, + "kl": 0.04638671875, + "learning_rate": 5.165068713380567e-07, + "loss": 0.0018, + "reward": 1.2808787822723389, + "reward_std": 0.6754826903343201, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.8902539610862732, + "step": 347 + }, + { + "completion_length": 333.265625, + "epoch": 0.5065502183406113, + "grad_norm": 1.1133538964235559, + "kl": 0.0458984375, + "learning_rate": 5.141494290415591e-07, + "loss": 0.001, + "reward": 1.0276042222976685, + "reward_std": 0.6215536594390869, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.951171875, + "step": 348 + }, + { + "completion_length": 355.34375, + "epoch": 0.5080058224163028, + "grad_norm": 1.0253948324473836, + "kl": 0.04150390625, + "learning_rate": 5.117916719054285e-07, + "loss": 0.0003, + "reward": 0.8305078744888306, + "reward_std": 0.8258182406425476, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.90471351146698, + "step": 349 + }, + { + "completion_length": 329.4375, + "epoch": 0.5094614264919942, + "grad_norm": 1.1732665473569692, + "kl": 0.049072265625, + "learning_rate": 5.094336523922335e-07, + "loss": 0.0032, + "reward": 0.745130181312561, + "reward_std": 0.2655390501022339, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.948046863079071, + "step": 350 + }, + { + "completion_length": 358.890625, + "epoch": 0.5109170305676856, + "grad_norm": 1.1759706647822885, + "kl": 0.046142578125, + "learning_rate": 5.07075422970381e-07, + "loss": 0.0022, + "reward": 1.322669267654419, + "reward_std": 0.8540600538253784, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.9266666173934937, + "step": 351 + }, + { + "completion_length": 340.71875, + "epoch": 0.512372634643377, + "grad_norm": 1.280765883903055, + "kl": 0.04931640625, + "learning_rate": 5.047170361129483e-07, + "loss": -0.0051, + "reward": 0.7140104174613953, + "reward_std": 0.7314082384109497, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.8824348449707031, + "step": 352 + }, + { + "completion_length": 365.671875, + "epoch": 0.5138282387190685, + "grad_norm": 1.0208673688095518, + "kl": 0.041259765625, + "learning_rate": 5.023585442965162e-07, + "loss": 0.002, + "reward": 0.8170312643051147, + "reward_std": 0.6544123888015747, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.8933594226837158, + "step": 353 + }, + { + "completion_length": 343.65625, + "epoch": 0.5152838427947598, + "grad_norm": 1.0616017021974933, + "kl": 0.044921875, + "learning_rate": 5e-07, + "loss": -0.0017, + "reward": 1.360579490661621, + "reward_std": 0.6262814998626709, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9170117378234863, + "step": 354 + }, + { + "completion_length": 346.1875, + "epoch": 0.5167394468704513, + "grad_norm": 0.9668421147018488, + "kl": 0.04296875, + "learning_rate": 4.976414557034839e-07, + "loss": 0.0022, + "reward": -0.3669726252555847, + "reward_std": 0.33296334743499756, + "rewards/accuracy_reward": 0.234375, + "rewards/format_reward": 0.9034309387207031, + "step": 355 + }, + { + "completion_length": 345.4375, + "epoch": 0.5181950509461426, + "grad_norm": 1.2913148050931513, + "kl": 0.041015625, + "learning_rate": 4.952829638870515e-07, + "loss": -0.0009, + "reward": 0.7532292008399963, + "reward_std": 0.7976954579353333, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.9168879985809326, + "step": 356 + }, + { + "completion_length": 368.5, + "epoch": 0.519650655021834, + "grad_norm": 0.8900198844773329, + "kl": 0.040283203125, + "learning_rate": 4.92924577029619e-07, + "loss": -0.0012, + "reward": 1.5500717163085938, + "reward_std": 1.1232110261917114, + "rewards/accuracy_reward": 0.890625, + "rewards/format_reward": 0.8781965970993042, + "step": 357 + }, + { + "completion_length": 350.390625, + "epoch": 0.5211062590975255, + "grad_norm": 1.0655510860501003, + "kl": 0.050537109375, + "learning_rate": 4.905663476077665e-07, + "loss": -0.0054, + "reward": 1.3917381763458252, + "reward_std": 0.9621044397354126, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9068945050239563, + "step": 358 + }, + { + "completion_length": 329.6875, + "epoch": 0.5225618631732168, + "grad_norm": 1.1100376051956256, + "kl": 0.048828125, + "learning_rate": 4.882083280945716e-07, + "loss": 0.0028, + "reward": 0.8387891054153442, + "reward_std": 0.6230899691581726, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9546484351158142, + "step": 359 + }, + { + "completion_length": 346.484375, + "epoch": 0.5240174672489083, + "grad_norm": 1.2301004521682009, + "kl": 0.047607421875, + "learning_rate": 4.85850570958441e-07, + "loss": 0.0047, + "reward": 0.77734375, + "reward_std": 0.2949136197566986, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.93436199426651, + "step": 360 + }, + { + "completion_length": 324.4375, + "epoch": 0.5254730713245997, + "grad_norm": 1.150417363080493, + "kl": 0.04541015625, + "learning_rate": 4.834931286619432e-07, + "loss": -0.0014, + "reward": 1.0515625476837158, + "reward_std": 0.45344188809394836, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9319791793823242, + "step": 361 + }, + { + "completion_length": 338.5625, + "epoch": 0.5269286754002911, + "grad_norm": 1.0871105100886687, + "kl": 0.05224609375, + "learning_rate": 4.811360536606415e-07, + "loss": 0.0006, + "reward": 1.0285286903381348, + "reward_std": 0.5896121263504028, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9137629866600037, + "step": 362 + }, + { + "completion_length": 346.4375, + "epoch": 0.5283842794759825, + "grad_norm": 1.16086348005986, + "kl": 0.047119140625, + "learning_rate": 4.787793984019259e-07, + "loss": -0.0023, + "reward": 1.1483073234558105, + "reward_std": 0.1926122009754181, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9851562976837158, + "step": 363 + }, + { + "completion_length": 330.25, + "epoch": 0.529839883551674, + "grad_norm": 1.122777391792208, + "kl": 0.04345703125, + "learning_rate": 4.764232153238472e-07, + "loss": -0.0037, + "reward": 1.3178515434265137, + "reward_std": 0.3849244713783264, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.9673958420753479, + "step": 364 + }, + { + "completion_length": 331.015625, + "epoch": 0.5312954876273653, + "grad_norm": 1.2307126175016525, + "kl": 0.047607421875, + "learning_rate": 4.7406755685394943e-07, + "loss": -0.0013, + "reward": 0.8999348878860474, + "reward_std": 0.7801786065101624, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.910559892654419, + "step": 365 + }, + { + "completion_length": 335.90625, + "epoch": 0.5327510917030568, + "grad_norm": 1.222675950836803, + "kl": 0.04931640625, + "learning_rate": 4.7171247540810377e-07, + "loss": 0.0039, + "reward": 0.7975065112113953, + "reward_std": 0.47923994064331055, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.9561002254486084, + "step": 366 + }, + { + "completion_length": 335.640625, + "epoch": 0.5342066957787481, + "grad_norm": 1.0798428455730729, + "kl": 0.04443359375, + "learning_rate": 4.693580233893419e-07, + "loss": -0.0001, + "reward": 0.7751432657241821, + "reward_std": 0.8045872449874878, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.9402864575386047, + "step": 367 + }, + { + "completion_length": 327.671875, + "epoch": 0.5356622998544396, + "grad_norm": 1.2982851617600433, + "kl": 0.052001953125, + "learning_rate": 4.6700425318668983e-07, + "loss": -0.0008, + "reward": 1.1960091590881348, + "reward_std": 0.4194261431694031, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9822461009025574, + "step": 368 + }, + { + "completion_length": 311.640625, + "epoch": 0.537117903930131, + "grad_norm": 1.1430814971743037, + "kl": 0.046630859375, + "learning_rate": 4.646512171740027e-07, + "loss": 0.002, + "reward": 0.6666406393051147, + "reward_std": 0.41208669543266296, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9482681751251221, + "step": 369 + }, + { + "completion_length": 332.796875, + "epoch": 0.5385735080058224, + "grad_norm": 1.1537373469060968, + "kl": 0.043701171875, + "learning_rate": 4.6229896770879925e-07, + "loss": -0.0008, + "reward": 0.8343229293823242, + "reward_std": 0.35879752039909363, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9422265291213989, + "step": 370 + }, + { + "completion_length": 329.578125, + "epoch": 0.5400291120815138, + "grad_norm": 1.117973192100589, + "kl": 0.04541015625, + "learning_rate": 4.599475571310964e-07, + "loss": 0.0006, + "reward": 0.95947265625, + "reward_std": 0.6068175435066223, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9388346672058105, + "step": 371 + }, + { + "completion_length": 344.375, + "epoch": 0.5414847161572053, + "grad_norm": 1.196872378128055, + "kl": 0.04931640625, + "learning_rate": 4.5759703776224555e-07, + "loss": 0.0017, + "reward": 1.1496614217758179, + "reward_std": 0.7437654733657837, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9437239170074463, + "step": 372 + }, + { + "completion_length": 323.984375, + "epoch": 0.5429403202328966, + "grad_norm": 1.2948089305623915, + "kl": 0.050537109375, + "learning_rate": 4.552474619037668e-07, + "loss": -0.0012, + "reward": 1.1745052337646484, + "reward_std": 0.1955292969942093, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9643619656562805, + "step": 373 + }, + { + "completion_length": 334.40625, + "epoch": 0.5443959243085881, + "grad_norm": 1.1752903191077182, + "kl": 0.045166015625, + "learning_rate": 4.528988818361869e-07, + "loss": -0.0005, + "reward": 0.9639453291893005, + "reward_std": 0.7098885774612427, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9328385591506958, + "step": 374 + }, + { + "completion_length": 336.34375, + "epoch": 0.5458515283842795, + "grad_norm": 1.0217530778000898, + "kl": 0.0458984375, + "learning_rate": 4.505513498178751e-07, + "loss": 0.0044, + "reward": 1.0537173748016357, + "reward_std": 0.42270171642303467, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.936022162437439, + "step": 375 + }, + { + "completion_length": 336.140625, + "epoch": 0.5473071324599709, + "grad_norm": 1.1035824214262215, + "kl": 0.041748046875, + "learning_rate": 4.4820491808388035e-07, + "loss": 0.0024, + "reward": 1.3975260257720947, + "reward_std": 0.5420940518379211, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9560156464576721, + "step": 376 + }, + { + "completion_length": 319.171875, + "epoch": 0.5487627365356623, + "grad_norm": 1.1858924715525574, + "kl": 0.048583984375, + "learning_rate": 4.45859638844769e-07, + "loss": 0.0027, + "reward": 1.609043002128601, + "reward_std": 0.04546473175287247, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9743945598602295, + "step": 377 + }, + { + "completion_length": 332.078125, + "epoch": 0.5502183406113537, + "grad_norm": 1.137595761389025, + "kl": 0.051513671875, + "learning_rate": 4.4351556428546365e-07, + "loss": 0.0042, + "reward": 1.075364589691162, + "reward_std": 0.9671132564544678, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.8985416293144226, + "step": 378 + }, + { + "completion_length": 365.34375, + "epoch": 0.5516739446870451, + "grad_norm": 1.064698625138544, + "kl": 0.0439453125, + "learning_rate": 4.411727465640808e-07, + "loss": 0.0007, + "reward": 0.705507755279541, + "reward_std": 0.2698853611946106, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.958437442779541, + "step": 379 + }, + { + "completion_length": 332.1875, + "epoch": 0.5531295487627366, + "grad_norm": 0.954227809746772, + "kl": 0.047119140625, + "learning_rate": 4.388312378107714e-07, + "loss": 0.0019, + "reward": 0.017766959965229034, + "reward_std": 0.4671540856361389, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 0.9211132526397705, + "step": 380 + }, + { + "completion_length": 357.34375, + "epoch": 0.5545851528384279, + "grad_norm": 1.0746234998917534, + "kl": 0.04638671875, + "learning_rate": 4.364910901265606e-07, + "loss": 0.0009, + "reward": 0.9512304663658142, + "reward_std": 0.9008158445358276, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9242773056030273, + "step": 381 + }, + { + "completion_length": 353.171875, + "epoch": 0.5560407569141194, + "grad_norm": 1.1000427889048656, + "kl": 0.04638671875, + "learning_rate": 4.341523555821881e-07, + "loss": 0.001, + "reward": 0.6827343702316284, + "reward_std": 0.32222965359687805, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.8887369632720947, + "step": 382 + }, + { + "completion_length": 330.015625, + "epoch": 0.5574963609898108, + "grad_norm": 1.0062009920301431, + "kl": 0.052734375, + "learning_rate": 4.3181508621695015e-07, + "loss": 0.0044, + "reward": 1.2704167366027832, + "reward_std": 0.3719358444213867, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.9685026407241821, + "step": 383 + }, + { + "completion_length": 358.90625, + "epoch": 0.5589519650655022, + "grad_norm": 1.0500378444634122, + "kl": 0.055419921875, + "learning_rate": 4.294793340375404e-07, + "loss": 0.0022, + "reward": 0.5693033933639526, + "reward_std": 0.7073459029197693, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 0.9236393570899963, + "step": 384 + }, + { + "completion_length": 317.875, + "epoch": 0.5604075691411936, + "grad_norm": 1.2585391427250612, + "kl": 0.052978515625, + "learning_rate": 4.271451510168943e-07, + "loss": 0.0019, + "reward": 1.0041340589523315, + "reward_std": 0.6024578809738159, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9240299463272095, + "step": 385 + }, + { + "completion_length": 340.671875, + "epoch": 0.5618631732168851, + "grad_norm": 1.171525425235524, + "kl": 0.055419921875, + "learning_rate": 4.248125890930316e-07, + "loss": 0.001, + "reward": 0.5767643451690674, + "reward_std": 0.9831317067146301, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.868079423904419, + "step": 386 + }, + { + "completion_length": 348.484375, + "epoch": 0.5633187772925764, + "grad_norm": 1.1137932938055821, + "kl": 0.0498046875, + "learning_rate": 4.22481700167901e-07, + "loss": 0.0038, + "reward": 1.6028320789337158, + "reward_std": 0.785223126411438, + "rewards/accuracy_reward": 0.890625, + "rewards/format_reward": 0.9309570789337158, + "step": 387 + }, + { + "completion_length": 357.015625, + "epoch": 0.5647743813682679, + "grad_norm": 1.1683554196934958, + "kl": 0.04541015625, + "learning_rate": 4.201525361062254e-07, + "loss": 0.002, + "reward": 0.885696530342102, + "reward_std": 0.9379022121429443, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.9048242568969727, + "step": 388 + }, + { + "completion_length": 349.765625, + "epoch": 0.5662299854439592, + "grad_norm": 1.0285239615931752, + "kl": 0.045654296875, + "learning_rate": 4.17825148734347e-07, + "loss": -0.0016, + "reward": 1.4252278804779053, + "reward_std": 0.555104672908783, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9401627779006958, + "step": 389 + }, + { + "completion_length": 344.578125, + "epoch": 0.5676855895196506, + "grad_norm": 0.9824416716745002, + "kl": 0.046875, + "learning_rate": 4.154995898390755e-07, + "loss": 0.0001, + "reward": 0.30329427123069763, + "reward_std": 0.8160994648933411, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.8936979174613953, + "step": 390 + }, + { + "completion_length": 362.265625, + "epoch": 0.5691411935953421, + "grad_norm": 1.0347605373720594, + "kl": 0.04345703125, + "learning_rate": 4.131759111665348e-07, + "loss": 0.0006, + "reward": 0.3581119775772095, + "reward_std": 0.6576919555664062, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.8985155820846558, + "step": 391 + }, + { + "completion_length": 354.8125, + "epoch": 0.5705967976710334, + "grad_norm": 1.0801343806418962, + "kl": 0.04736328125, + "learning_rate": 4.1085416442101203e-07, + "loss": 0.0022, + "reward": 1.1929036378860474, + "reward_std": 1.0718597173690796, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.8935286402702332, + "step": 392 + }, + { + "completion_length": 337.828125, + "epoch": 0.5720524017467249, + "grad_norm": 0.9437613999658896, + "kl": 0.042236328125, + "learning_rate": 4.0853440126380666e-07, + "loss": -0.0027, + "reward": 1.2306054830551147, + "reward_std": 0.06505398452281952, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9732356667518616, + "step": 393 + }, + { + "completion_length": 353.15625, + "epoch": 0.5735080058224163, + "grad_norm": 1.0240210848060178, + "kl": 0.042724609375, + "learning_rate": 4.0621667331208156e-07, + "loss": 0.0022, + "reward": 1.047447919845581, + "reward_std": 0.5841343402862549, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9282682538032532, + "step": 394 + }, + { + "completion_length": 371.28125, + "epoch": 0.5749636098981077, + "grad_norm": 0.9665992608409852, + "kl": 0.04931640625, + "learning_rate": 4.0390103213771363e-07, + "loss": -0.0038, + "reward": 0.1188020408153534, + "reward_std": 0.9848098158836365, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 0.8411197662353516, + "step": 395 + }, + { + "completion_length": 341.75, + "epoch": 0.5764192139737991, + "grad_norm": 1.2322164783845382, + "kl": 0.04833984375, + "learning_rate": 4.015875292661473e-07, + "loss": 0.0022, + "reward": 0.32570311427116394, + "reward_std": 0.9908155798912048, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 0.9544401168823242, + "step": 396 + }, + { + "completion_length": 333.5625, + "epoch": 0.5778748180494906, + "grad_norm": 1.082096942344887, + "kl": 0.0498046875, + "learning_rate": 3.9927621617524736e-07, + "loss": 0.0069, + "reward": 1.061464786529541, + "reward_std": 0.4430278539657593, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.8951106667518616, + "step": 397 + }, + { + "completion_length": 349.640625, + "epoch": 0.5793304221251819, + "grad_norm": 1.1730680460847622, + "kl": 0.046142578125, + "learning_rate": 3.969671442941538e-07, + "loss": 0.0014, + "reward": 0.3286914527416229, + "reward_std": 0.6564339995384216, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.9076367616653442, + "step": 398 + }, + { + "completion_length": 347.859375, + "epoch": 0.5807860262008734, + "grad_norm": 0.936643056977854, + "kl": 0.048095703125, + "learning_rate": 3.94660365002137e-07, + "loss": 0.001, + "reward": -0.10537109524011612, + "reward_std": 0.5952030420303345, + "rewards/accuracy_reward": 0.328125, + "rewards/format_reward": 0.8903840780258179, + "step": 399 + }, + { + "completion_length": 361.015625, + "epoch": 0.5822416302765647, + "grad_norm": 0.8588675688237927, + "kl": 0.047119140625, + "learning_rate": 3.923559296274549e-07, + "loss": -0.0018, + "reward": 0.81843101978302, + "reward_std": 0.8495593667030334, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.895423173904419, + "step": 400 + }, + { + "completion_length": 327.78125, + "epoch": 0.5836972343522562, + "grad_norm": 1.1927270347808032, + "kl": 0.046142578125, + "learning_rate": 3.900538894462112e-07, + "loss": -0.0011, + "reward": 0.4946874976158142, + "reward_std": 0.5388225317001343, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.9400129914283752, + "step": 401 + }, + { + "completion_length": 335.125, + "epoch": 0.5851528384279476, + "grad_norm": 1.1416346509953534, + "kl": 0.050048828125, + "learning_rate": 3.877542956812136e-07, + "loss": -0.0009, + "reward": 1.2370572090148926, + "reward_std": 0.7617213129997253, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.9355729222297668, + "step": 402 + }, + { + "completion_length": 339.640625, + "epoch": 0.586608442503639, + "grad_norm": 1.2059860449472852, + "kl": 0.048828125, + "learning_rate": 3.8545719950083503e-07, + "loss": 0.0032, + "reward": 1.136875033378601, + "reward_std": 0.3422207832336426, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9610416293144226, + "step": 403 + }, + { + "completion_length": 311.203125, + "epoch": 0.5880640465793304, + "grad_norm": 1.2427528264658227, + "kl": 0.057373046875, + "learning_rate": 3.831626520178738e-07, + "loss": 0.0021, + "reward": 1.1694010496139526, + "reward_std": 0.21720562875270844, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9581640958786011, + "step": 404 + }, + { + "completion_length": 337.6875, + "epoch": 0.5895196506550219, + "grad_norm": 1.1003634317266355, + "kl": 0.05224609375, + "learning_rate": 3.8087070428841753e-07, + "loss": 0.0002, + "reward": 0.48251304030418396, + "reward_std": 0.9312198758125305, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.9204167127609253, + "step": 405 + }, + { + "completion_length": 339.484375, + "epoch": 0.5909752547307132, + "grad_norm": 0.9678972898174403, + "kl": 0.044677734375, + "learning_rate": 3.785814073107064e-07, + "loss": 0.0013, + "reward": 1.5340235233306885, + "reward_std": 0.5008847117424011, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 0.94998699426651, + "step": 406 + }, + { + "completion_length": 308.375, + "epoch": 0.5924308588064047, + "grad_norm": 1.047424956436592, + "kl": 0.047607421875, + "learning_rate": 3.762948120239988e-07, + "loss": -0.0021, + "reward": 0.9827408790588379, + "reward_std": 0.627294659614563, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9047591090202332, + "step": 407 + }, + { + "completion_length": 317.78125, + "epoch": 0.5938864628820961, + "grad_norm": 1.31987404266799, + "kl": 0.049560546875, + "learning_rate": 3.7401096930743746e-07, + "loss": -0.0035, + "reward": 1.220130205154419, + "reward_std": 0.5085282325744629, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9641405940055847, + "step": 408 + }, + { + "completion_length": 317.328125, + "epoch": 0.5953420669577875, + "grad_norm": 1.258115336162508, + "kl": 0.0458984375, + "learning_rate": 3.717299299789175e-07, + "loss": 0.0015, + "reward": 0.8206315040588379, + "reward_std": 0.16567295789718628, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.980527400970459, + "step": 409 + }, + { + "completion_length": 311.0625, + "epoch": 0.5967976710334789, + "grad_norm": 1.0797344698183151, + "kl": 0.0400390625, + "learning_rate": 3.6945174479395584e-07, + "loss": 0.0006, + "reward": 0.4124348759651184, + "reward_std": 0.5062814950942993, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.9456771612167358, + "step": 410 + }, + { + "completion_length": 326.015625, + "epoch": 0.5982532751091703, + "grad_norm": 0.9392678790614373, + "kl": 0.051513671875, + "learning_rate": 3.6717646444456193e-07, + "loss": 0.0014, + "reward": 0.35626304149627686, + "reward_std": 0.4074528217315674, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.9396615028381348, + "step": 411 + }, + { + "completion_length": 320.078125, + "epoch": 0.5997088791848617, + "grad_norm": 1.3610941268678678, + "kl": 0.044921875, + "learning_rate": 3.649041395581089e-07, + "loss": 0.0001, + "reward": 0.6594465970993042, + "reward_std": 0.6716386079788208, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.9064517617225647, + "step": 412 + }, + { + "completion_length": 327.734375, + "epoch": 0.6011644832605532, + "grad_norm": 1.2129906269391977, + "kl": 0.048095703125, + "learning_rate": 3.6263482069620865e-07, + "loss": -0.0036, + "reward": 1.199205756187439, + "reward_std": 0.3400747776031494, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9805078506469727, + "step": 413 + }, + { + "completion_length": 322.78125, + "epoch": 0.6026200873362445, + "grad_norm": 1.0044565070885603, + "kl": 0.047607421875, + "learning_rate": 3.6036855835358496e-07, + "loss": 0.0017, + "reward": 0.64725261926651, + "reward_std": 0.4769784212112427, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9339061975479126, + "step": 414 + }, + { + "completion_length": 356.59375, + "epoch": 0.604075691411936, + "grad_norm": 1.144784540293772, + "kl": 0.0478515625, + "learning_rate": 3.581054029569516e-07, + "loss": -0.005, + "reward": 0.7913802266120911, + "reward_std": 0.649086058139801, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9033724069595337, + "step": 415 + }, + { + "completion_length": 327.5625, + "epoch": 0.6055312954876274, + "grad_norm": 1.1441222735837988, + "kl": 0.050537109375, + "learning_rate": 3.55845404863889e-07, + "loss": 0.0033, + "reward": 1.7637498378753662, + "reward_std": 0.5468021631240845, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9512500166893005, + "step": 416 + }, + { + "completion_length": 304.46875, + "epoch": 0.6069868995633187, + "grad_norm": 1.2559225472205957, + "kl": 0.04638671875, + "learning_rate": 3.535886143617248e-07, + "loss": -0.0045, + "reward": 0.8564192652702332, + "reward_std": 0.08503374457359314, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9544661641120911, + "step": 417 + }, + { + "completion_length": 319.765625, + "epoch": 0.6084425036390102, + "grad_norm": 1.1233374483471503, + "kl": 0.048583984375, + "learning_rate": 3.513350816664138e-07, + "loss": 0.0005, + "reward": 1.250644564628601, + "reward_std": 0.6800768375396729, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.93666011095047, + "step": 418 + }, + { + "completion_length": 319.390625, + "epoch": 0.6098981077147017, + "grad_norm": 1.1149493029080904, + "kl": 0.04736328125, + "learning_rate": 3.4908485692142164e-07, + "loss": 0.0031, + "reward": 1.9074804782867432, + "reward_std": 0.20882548391819, + "rewards/accuracy_reward": 0.984375, + "rewards/format_reward": 0.9543554782867432, + "step": 419 + }, + { + "completion_length": 322.484375, + "epoch": 0.611353711790393, + "grad_norm": 1.1771319406793013, + "kl": 0.05078125, + "learning_rate": 3.4683799019660833e-07, + "loss": -0.0012, + "reward": 0.4527343511581421, + "reward_std": 0.1347397118806839, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9300520420074463, + "step": 420 + }, + { + "completion_length": 320.984375, + "epoch": 0.6128093158660844, + "grad_norm": 1.253919818449011, + "kl": 0.04541015625, + "learning_rate": 3.4459453148711437e-07, + "loss": -0.0025, + "reward": 1.2054883241653442, + "reward_std": 0.34866058826446533, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9499544501304626, + "step": 421 + }, + { + "completion_length": 308.75, + "epoch": 0.6142649199417758, + "grad_norm": 1.2228103194283946, + "kl": 0.0546875, + "learning_rate": 3.423545307122488e-07, + "loss": 0.0045, + "reward": 1.38002610206604, + "reward_std": 0.5425729751586914, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9391015768051147, + "step": 422 + }, + { + "completion_length": 334.921875, + "epoch": 0.6157205240174672, + "grad_norm": 1.0491711033896147, + "kl": 0.0517578125, + "learning_rate": 3.4011803771437735e-07, + "loss": 0.0027, + "reward": 1.441979169845581, + "reward_std": 0.9929073452949524, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9042838215827942, + "step": 423 + }, + { + "completion_length": 341.25, + "epoch": 0.6171761280931587, + "grad_norm": 1.1145287911850592, + "kl": 0.046142578125, + "learning_rate": 3.378851022578146e-07, + "loss": -0.0005, + "reward": 0.8648567199707031, + "reward_std": 0.6062259674072266, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.94259113073349, + "step": 424 + }, + { + "completion_length": 323.328125, + "epoch": 0.61863173216885, + "grad_norm": 1.2328706888305836, + "kl": 0.059326171875, + "learning_rate": 3.356557740277156e-07, + "loss": -0.0007, + "reward": 1.2751758098602295, + "reward_std": 0.894574761390686, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.9299153685569763, + "step": 425 + }, + { + "completion_length": 331.3125, + "epoch": 0.6200873362445415, + "grad_norm": 1.047466025286303, + "kl": 0.048828125, + "learning_rate": 3.334301026289712e-07, + "loss": 0.0037, + "reward": 1.5336458683013916, + "reward_std": 0.252202570438385, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 0.9498567581176758, + "step": 426 + }, + { + "completion_length": 338.671875, + "epoch": 0.6215429403202329, + "grad_norm": 1.0464703790815202, + "kl": 0.04833984375, + "learning_rate": 3.312081375851038e-07, + "loss": 0.0015, + "reward": 1.5034700632095337, + "reward_std": 0.9077455997467041, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 0.9250195026397705, + "step": 427 + }, + { + "completion_length": 334.8125, + "epoch": 0.6229985443959243, + "grad_norm": 1.1030610799748342, + "kl": 0.047119140625, + "learning_rate": 3.2898992833716563e-07, + "loss": 0.002, + "reward": 1.0385351181030273, + "reward_std": 0.5042393803596497, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9632617235183716, + "step": 428 + }, + { + "completion_length": 341.953125, + "epoch": 0.6244541484716157, + "grad_norm": 0.9140662347223634, + "kl": 0.05224609375, + "learning_rate": 3.2677552424263834e-07, + "loss": -0.0018, + "reward": 0.9026367664337158, + "reward_std": 0.9520148038864136, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.8862174153327942, + "step": 429 + }, + { + "completion_length": 355.328125, + "epoch": 0.6259097525473072, + "grad_norm": 1.0653314224895472, + "kl": 0.046630859375, + "learning_rate": 3.2456497457433475e-07, + "loss": 0.0019, + "reward": 0.8542708158493042, + "reward_std": 0.8604847192764282, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.8829036355018616, + "step": 430 + }, + { + "completion_length": 353.859375, + "epoch": 0.6273653566229985, + "grad_norm": 1.0396201810405379, + "kl": 0.0478515625, + "learning_rate": 3.2235832851930315e-07, + "loss": -0.0011, + "reward": 1.552734375, + "reward_std": 0.19750574231147766, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 0.974609375, + "step": 431 + }, + { + "completion_length": 330.3125, + "epoch": 0.62882096069869, + "grad_norm": 1.149398763893242, + "kl": 0.048828125, + "learning_rate": 3.201556351777321e-07, + "loss": 0.0008, + "reward": 1.3697071075439453, + "reward_std": 0.5695977210998535, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9322070479393005, + "step": 432 + }, + { + "completion_length": 339.59375, + "epoch": 0.6302765647743813, + "grad_norm": 1.2047591749520952, + "kl": 0.045654296875, + "learning_rate": 3.1795694356185797e-07, + "loss": -0.0004, + "reward": 1.2822070121765137, + "reward_std": 0.7462552785873413, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.9377539157867432, + "step": 433 + }, + { + "completion_length": 354.828125, + "epoch": 0.6317321688500728, + "grad_norm": 1.0709344253665385, + "kl": 0.056396484375, + "learning_rate": 3.157623025948751e-07, + "loss": 0.0054, + "reward": 0.5403645634651184, + "reward_std": 0.7718614339828491, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.9401432275772095, + "step": 434 + }, + { + "completion_length": 367.015625, + "epoch": 0.6331877729257642, + "grad_norm": 0.9081689546614261, + "kl": 0.042236328125, + "learning_rate": 3.135717611098457e-07, + "loss": -0.0028, + "reward": 0.5762760639190674, + "reward_std": 0.9539381265640259, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 0.9339452981948853, + "step": 435 + }, + { + "completion_length": 361.609375, + "epoch": 0.6346433770014556, + "grad_norm": 0.9935443191736562, + "kl": 0.050048828125, + "learning_rate": 3.11385367848615e-07, + "loss": 0.0029, + "reward": 1.0945442914962769, + "reward_std": 0.364665687084198, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9768098592758179, + "step": 436 + }, + { + "completion_length": 365.984375, + "epoch": 0.636098981077147, + "grad_norm": 1.1539983550423447, + "kl": 0.046142578125, + "learning_rate": 3.0920317146072574e-07, + "loss": -0.003, + "reward": 0.9360742568969727, + "reward_std": 0.656904935836792, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9138607382774353, + "step": 437 + }, + { + "completion_length": 364.125, + "epoch": 0.6375545851528385, + "grad_norm": 1.0332344243928102, + "kl": 0.04638671875, + "learning_rate": 3.070252205023355e-07, + "loss": -0.0011, + "reward": 0.920188844203949, + "reward_std": 0.4112701714038849, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.9488345980644226, + "step": 438 + }, + { + "completion_length": 363.140625, + "epoch": 0.6390101892285298, + "grad_norm": 0.8434589833001687, + "kl": 0.04345703125, + "learning_rate": 3.048515634351373e-07, + "loss": -0.0007, + "reward": 1.021744728088379, + "reward_std": 0.9885683655738831, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9123697876930237, + "step": 439 + }, + { + "completion_length": 349.859375, + "epoch": 0.6404657933042213, + "grad_norm": 1.100810311524533, + "kl": 0.049072265625, + "learning_rate": 3.026822486252796e-07, + "loss": 0.004, + "reward": 1.1635351181030273, + "reward_std": 0.23715665936470032, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9604101777076721, + "step": 440 + }, + { + "completion_length": 361.28125, + "epoch": 0.6419213973799127, + "grad_norm": 0.9996154209580299, + "kl": 0.043701171875, + "learning_rate": 3.005173243422918e-07, + "loss": -0.0018, + "reward": 1.4946484565734863, + "reward_std": 0.36391258239746094, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9633983969688416, + "step": 441 + }, + { + "completion_length": 355.890625, + "epoch": 0.6433770014556041, + "grad_norm": 1.064092737707726, + "kl": 0.04931640625, + "learning_rate": 2.983568387580093e-07, + "loss": 0.0009, + "reward": 0.9472330808639526, + "reward_std": 1.0713083744049072, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9316080808639526, + "step": 442 + }, + { + "completion_length": 352.546875, + "epoch": 0.6448326055312955, + "grad_norm": 1.2213995677397735, + "kl": 0.046142578125, + "learning_rate": 2.9620083994550184e-07, + "loss": -0.0025, + "reward": 1.1790754795074463, + "reward_std": 0.2134915590286255, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9699999690055847, + "step": 443 + }, + { + "completion_length": 343.625, + "epoch": 0.6462882096069869, + "grad_norm": 0.9866828975217354, + "kl": 0.046142578125, + "learning_rate": 2.940493758780037e-07, + "loss": 0.0007, + "reward": 1.290442705154419, + "reward_std": 0.4115654230117798, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.945742130279541, + "step": 444 + }, + { + "completion_length": 369.09375, + "epoch": 0.6477438136826783, + "grad_norm": 0.7976157219662836, + "kl": 0.04833984375, + "learning_rate": 2.919024944278462e-07, + "loss": -0.0017, + "reward": 1.1873372793197632, + "reward_std": 0.38416576385498047, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9347070455551147, + "step": 445 + }, + { + "completion_length": 349.140625, + "epoch": 0.6491994177583698, + "grad_norm": 1.034098435240967, + "kl": 0.047119140625, + "learning_rate": 2.8976024336539297e-07, + "loss": -0.001, + "reward": 0.949485719203949, + "reward_std": 0.7218962907791138, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9301497936248779, + "step": 446 + }, + { + "completion_length": 361.703125, + "epoch": 0.6506550218340611, + "grad_norm": 0.9093087268112631, + "kl": 0.0537109375, + "learning_rate": 2.8762267035797606e-07, + "loss": -0.003, + "reward": 1.5500586032867432, + "reward_std": 0.852545976638794, + "rewards/accuracy_reward": 0.890625, + "rewards/format_reward": 0.8781836032867432, + "step": 447 + }, + { + "completion_length": 362.65625, + "epoch": 0.6521106259097526, + "grad_norm": 0.9962666508897389, + "kl": 0.047607421875, + "learning_rate": 2.8548982296883685e-07, + "loss": -0.0015, + "reward": 0.7888085842132568, + "reward_std": 0.24111366271972656, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.9591341614723206, + "step": 448 + }, + { + "completion_length": 365.578125, + "epoch": 0.653566229985444, + "grad_norm": 1.0644973570656397, + "kl": 0.047607421875, + "learning_rate": 2.8336174865606583e-07, + "loss": -0.0012, + "reward": 1.458815097808838, + "reward_std": 0.6925506591796875, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9265494346618652, + "step": 449 + }, + { + "completion_length": 377.78125, + "epoch": 0.6550218340611353, + "grad_norm": 0.8690653477908721, + "kl": 0.04541015625, + "learning_rate": 2.8123849477154806e-07, + "loss": 0.0022, + "reward": 0.6074088215827942, + "reward_std": 0.6371817588806152, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.91539067029953, + "step": 450 + }, + { + "completion_length": 343.5625, + "epoch": 0.6564774381368268, + "grad_norm": 0.9959008179108618, + "kl": 0.045166015625, + "learning_rate": 2.791201085599084e-07, + "loss": 0.0026, + "reward": 1.1102409362792969, + "reward_std": 0.37362387776374817, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9482877254486084, + "step": 451 + }, + { + "completion_length": 372.015625, + "epoch": 0.6579330422125182, + "grad_norm": 1.1144444930254087, + "kl": 0.0517578125, + "learning_rate": 2.770066371574621e-07, + "loss": 0.0046, + "reward": 0.5106966495513916, + "reward_std": 0.35071802139282227, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.9128841161727905, + "step": 452 + }, + { + "completion_length": 370.703125, + "epoch": 0.6593886462882096, + "grad_norm": 0.9690064498501554, + "kl": 0.047119140625, + "learning_rate": 2.748981275911633e-07, + "loss": -0.0043, + "reward": 0.20597657561302185, + "reward_std": 0.7211107015609741, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.8844531178474426, + "step": 453 + }, + { + "completion_length": 364.328125, + "epoch": 0.660844250363901, + "grad_norm": 0.9366506361406456, + "kl": 0.044189453125, + "learning_rate": 2.7279462677756126e-07, + "loss": -0.0036, + "reward": 0.7447395920753479, + "reward_std": 0.6155897974967957, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.9127213954925537, + "step": 454 + }, + { + "completion_length": 350.484375, + "epoch": 0.6622998544395924, + "grad_norm": 1.0086261231667006, + "kl": 0.047607421875, + "learning_rate": 2.7069618152175464e-07, + "loss": -0.0014, + "reward": 1.0693293809890747, + "reward_std": 0.44240403175354004, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9077669382095337, + "step": 455 + }, + { + "completion_length": 352.890625, + "epoch": 0.6637554585152838, + "grad_norm": 1.017662444972447, + "kl": 0.051513671875, + "learning_rate": 2.6860283851635063e-07, + "loss": -0.0016, + "reward": 1.1334245204925537, + "reward_std": 0.8161476850509644, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9266666769981384, + "step": 456 + }, + { + "completion_length": 363.09375, + "epoch": 0.6652110625909753, + "grad_norm": 0.9088775376959461, + "kl": 0.049072265625, + "learning_rate": 2.6651464434042596e-07, + "loss": 0.0007, + "reward": 1.1859569549560547, + "reward_std": 0.7904758453369141, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9305403232574463, + "step": 457 + }, + { + "completion_length": 358.1875, + "epoch": 0.6666666666666666, + "grad_norm": 0.956917415239815, + "kl": 0.054931640625, + "learning_rate": 2.6443164545849113e-07, + "loss": -0.0023, + "reward": 0.31612628698349, + "reward_std": 0.6449480652809143, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.8944726586341858, + "step": 458 + }, + { + "completion_length": 346.40625, + "epoch": 0.6681222707423581, + "grad_norm": 1.1043376858496878, + "kl": 0.048828125, + "learning_rate": 2.6235388821945495e-07, + "loss": 0.0022, + "reward": 1.320787787437439, + "reward_std": 0.5146819353103638, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.9273632764816284, + "step": 459 + }, + { + "completion_length": 358.453125, + "epoch": 0.6695778748180495, + "grad_norm": 1.1203658951352162, + "kl": 0.054443359375, + "learning_rate": 2.602814188555951e-07, + "loss": -0.004, + "reward": 0.97126305103302, + "reward_std": 0.8944222927093506, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9062368869781494, + "step": 460 + }, + { + "completion_length": 373.3125, + "epoch": 0.6710334788937409, + "grad_norm": 1.0496443261695805, + "kl": 0.04443359375, + "learning_rate": 2.5821428348152786e-07, + "loss": -0.0001, + "reward": 0.44576171040534973, + "reward_std": 1.2483347654342651, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.8427278399467468, + "step": 461 + }, + { + "completion_length": 344.953125, + "epoch": 0.6724890829694323, + "grad_norm": 1.053156674987038, + "kl": 0.050048828125, + "learning_rate": 2.561525280931828e-07, + "loss": 0.0003, + "reward": 0.9904752373695374, + "reward_std": 0.34265437722206116, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9724804759025574, + "step": 462 + }, + { + "completion_length": 370.65625, + "epoch": 0.6739446870451238, + "grad_norm": 0.940859770356533, + "kl": 0.046875, + "learning_rate": 2.5409619856677913e-07, + "loss": -0.0002, + "reward": 0.28072917461395264, + "reward_std": 0.7043707370758057, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 0.9188151359558105, + "step": 463 + }, + { + "completion_length": 350.5, + "epoch": 0.6754002911208151, + "grad_norm": 0.9974930451895179, + "kl": 0.052490234375, + "learning_rate": 2.5204534065780533e-07, + "loss": 0.0036, + "reward": 1.0816080570220947, + "reward_std": 0.7134989500045776, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9230663776397705, + "step": 464 + }, + { + "completion_length": 344.09375, + "epoch": 0.6768558951965066, + "grad_norm": 1.0005662998651903, + "kl": 0.05419921875, + "learning_rate": 2.500000000000001e-07, + "loss": 0.0036, + "reward": 1.0266536474227905, + "reward_std": 0.7755259871482849, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.909726619720459, + "step": 465 + }, + { + "completion_length": 373.8125, + "epoch": 0.6783114992721979, + "grad_norm": 0.8697379409789556, + "kl": 0.050048828125, + "learning_rate": 2.4796022210433764e-07, + "loss": 0.0015, + "reward": 0.6482356786727905, + "reward_std": 1.0505759716033936, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.8648502230644226, + "step": 466 + }, + { + "completion_length": 350.203125, + "epoch": 0.6797671033478894, + "grad_norm": 1.0448464801188178, + "kl": 0.04345703125, + "learning_rate": 2.4592605235801537e-07, + "loss": 0.0024, + "reward": 1.170351505279541, + "reward_std": 0.9689666032791138, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.91190105676651, + "step": 467 + }, + { + "completion_length": 349.625, + "epoch": 0.6812227074235808, + "grad_norm": 1.0606796035208255, + "kl": 0.048828125, + "learning_rate": 2.438975360234429e-07, + "loss": 0.0004, + "reward": 1.5108983516693115, + "reward_std": 0.5824877023696899, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9749218821525574, + "step": 468 + }, + { + "completion_length": 334.25, + "epoch": 0.6826783114992722, + "grad_norm": 1.0491904469297986, + "kl": 0.049072265625, + "learning_rate": 2.4187471823723555e-07, + "loss": -0.0005, + "reward": 1.7903971672058105, + "reward_std": 0.5473105907440186, + "rewards/accuracy_reward": 0.953125, + "rewards/format_reward": 0.9310221672058105, + "step": 469 + }, + { + "completion_length": 368.1875, + "epoch": 0.6841339155749636, + "grad_norm": 0.8795420237835467, + "kl": 0.04296875, + "learning_rate": 2.3985764400921054e-07, + "loss": -0.0016, + "reward": 0.8880664110183716, + "reward_std": 0.7747170329093933, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.9181705713272095, + "step": 470 + }, + { + "completion_length": 353.734375, + "epoch": 0.6855895196506551, + "grad_norm": 1.0274200723626312, + "kl": 0.051513671875, + "learning_rate": 2.378463582213842e-07, + "loss": 0.0011, + "reward": 1.3674869537353516, + "reward_std": 0.8639088869094849, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9249348640441895, + "step": 471 + }, + { + "completion_length": 371.578125, + "epoch": 0.6870451237263464, + "grad_norm": 0.9918082918405166, + "kl": 0.04443359375, + "learning_rate": 2.3584090562697424e-07, + "loss": -0.0038, + "reward": 0.47871094942092896, + "reward_std": 0.9248123168945312, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.9306640625, + "step": 472 + }, + { + "completion_length": 367.421875, + "epoch": 0.6885007278020379, + "grad_norm": 0.9925350173805925, + "kl": 0.053466796875, + "learning_rate": 2.33841330849404e-07, + "loss": -0.0016, + "reward": 0.6097005009651184, + "reward_std": 0.6457391381263733, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9147526025772095, + "step": 473 + }, + { + "completion_length": 354.078125, + "epoch": 0.6899563318777293, + "grad_norm": 0.9429812617452892, + "kl": 0.044189453125, + "learning_rate": 2.3184767838130882e-07, + "loss": -0.002, + "reward": 0.7430534362792969, + "reward_std": 1.0212041139602661, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.8673632740974426, + "step": 474 + }, + { + "completion_length": 350.171875, + "epoch": 0.6914119359534207, + "grad_norm": 1.2082129742972214, + "kl": 0.04638671875, + "learning_rate": 2.298599925835466e-07, + "loss": 0.0004, + "reward": 0.455533891916275, + "reward_std": 0.4722123444080353, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9366145730018616, + "step": 475 + }, + { + "completion_length": 366.515625, + "epoch": 0.6928675400291121, + "grad_norm": 1.0274981124427542, + "kl": 0.048828125, + "learning_rate": 2.2787831768421046e-07, + "loss": 0.0033, + "reward": 0.10612629354000092, + "reward_std": 0.7683642506599426, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 0.9080273509025574, + "step": 476 + }, + { + "completion_length": 369.484375, + "epoch": 0.6943231441048034, + "grad_norm": 1.0175711783602979, + "kl": 0.04052734375, + "learning_rate": 2.2590269777764514e-07, + "loss": -0.0005, + "reward": 1.336686134338379, + "reward_std": 0.7981055974960327, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.9460612535476685, + "step": 477 + }, + { + "completion_length": 337.78125, + "epoch": 0.6957787481804949, + "grad_norm": 1.1846182441564443, + "kl": 0.048828125, + "learning_rate": 2.2393317682346479e-07, + "loss": 0.0045, + "reward": 0.29799482226371765, + "reward_std": 0.4017283320426941, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9656379818916321, + "step": 478 + }, + { + "completion_length": 342.171875, + "epoch": 0.6972343522561864, + "grad_norm": 1.0582752970242983, + "kl": 0.04443359375, + "learning_rate": 2.219697986455762e-07, + "loss": -0.0034, + "reward": 0.9232031106948853, + "reward_std": 0.38019859790802, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.988684892654419, + "step": 479 + }, + { + "completion_length": 331.15625, + "epoch": 0.6986899563318777, + "grad_norm": 0.956549348435419, + "kl": 0.0537109375, + "learning_rate": 2.2001260693120232e-07, + "loss": -0.0002, + "reward": 0.63113933801651, + "reward_std": 0.21865397691726685, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 0.9857747554779053, + "step": 480 + }, + { + "completion_length": 320.21875, + "epoch": 0.7001455604075691, + "grad_norm": 1.149313470515847, + "kl": 0.056396484375, + "learning_rate": 2.1806164522991115e-07, + "loss": 0.0058, + "reward": 1.1596614122390747, + "reward_std": 0.2571667730808258, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9488281011581421, + "step": 481 + }, + { + "completion_length": 347.265625, + "epoch": 0.7016011644832606, + "grad_norm": 1.0533814952805514, + "kl": 0.048095703125, + "learning_rate": 2.1611695695264605e-07, + "loss": -0.0009, + "reward": 0.5740299224853516, + "reward_std": 0.5495303869247437, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.9683268666267395, + "step": 482 + }, + { + "completion_length": 331.109375, + "epoch": 0.7030567685589519, + "grad_norm": 0.934218888424527, + "kl": 0.049560546875, + "learning_rate": 2.1417858537076067e-07, + "loss": 0.0015, + "reward": 1.4878125190734863, + "reward_std": 0.37252911925315857, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9549609422683716, + "step": 483 + }, + { + "completion_length": 359.734375, + "epoch": 0.7045123726346434, + "grad_norm": 1.0518061573140363, + "kl": 0.046142578125, + "learning_rate": 2.122465736150549e-07, + "loss": 0.0049, + "reward": 1.4919662475585938, + "reward_std": 0.26884597539901733, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9607161283493042, + "step": 484 + }, + { + "completion_length": 340.125, + "epoch": 0.7059679767103348, + "grad_norm": 0.9147536840648907, + "kl": 0.046630859375, + "learning_rate": 2.1032096467481664e-07, + "loss": 0.0008, + "reward": 0.4311913847923279, + "reward_std": 0.18409988284111023, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.9738216400146484, + "step": 485 + }, + { + "completion_length": 348.34375, + "epoch": 0.7074235807860262, + "grad_norm": 1.1058446642922939, + "kl": 0.044677734375, + "learning_rate": 2.0840180139686332e-07, + "loss": 0.0021, + "reward": -0.23982422053813934, + "reward_std": 0.7118780612945557, + "rewards/accuracy_reward": 0.265625, + "rewards/format_reward": 0.941347599029541, + "step": 486 + }, + { + "completion_length": 348.796875, + "epoch": 0.7088791848617176, + "grad_norm": 0.936587157845369, + "kl": 0.044921875, + "learning_rate": 2.0648912648459072e-07, + "loss": 0.0001, + "reward": 0.8201823234558105, + "reward_std": 0.6160249710083008, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9445312023162842, + "step": 487 + }, + { + "completion_length": 350.484375, + "epoch": 0.710334788937409, + "grad_norm": 1.0710239125130703, + "kl": 0.0439453125, + "learning_rate": 2.0458298249702095e-07, + "loss": 0.0004, + "reward": 1.0420703887939453, + "reward_std": 0.5851905345916748, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9298177361488342, + "step": 488 + }, + { + "completion_length": 356.015625, + "epoch": 0.7117903930131004, + "grad_norm": 0.8863497690353069, + "kl": 0.04296875, + "learning_rate": 2.026834118478567e-07, + "loss": -0.0044, + "reward": 1.2863867282867432, + "reward_std": 0.7339239716529846, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.9426367282867432, + "step": 489 + }, + { + "completion_length": 345.890625, + "epoch": 0.7132459970887919, + "grad_norm": 1.0595443329712118, + "kl": 0.0498046875, + "learning_rate": 2.007904568045366e-07, + "loss": -0.0012, + "reward": 0.6528515815734863, + "reward_std": 0.4416601359844208, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9612500071525574, + "step": 490 + }, + { + "completion_length": 340.859375, + "epoch": 0.7147016011644832, + "grad_norm": 1.102733406546571, + "kl": 0.052001953125, + "learning_rate": 1.9890415948729534e-07, + "loss": -0.0017, + "reward": 1.5185351371765137, + "reward_std": 0.5196734666824341, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 0.93973308801651, + "step": 491 + }, + { + "completion_length": 344.796875, + "epoch": 0.7161572052401747, + "grad_norm": 1.013811804087309, + "kl": 0.04296875, + "learning_rate": 1.9702456186822592e-07, + "loss": 0.0057, + "reward": 1.5015950202941895, + "reward_std": 0.2588142156600952, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9537825584411621, + "step": 492 + }, + { + "completion_length": 323.671875, + "epoch": 0.7176128093158661, + "grad_norm": 1.1965785844666434, + "kl": 0.06201171875, + "learning_rate": 1.9515170577034657e-07, + "loss": -0.0009, + "reward": 1.3499219417572021, + "reward_std": 0.564841091632843, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.9540885090827942, + "step": 493 + }, + { + "completion_length": 337.328125, + "epoch": 0.7190684133915575, + "grad_norm": 1.1320885181605083, + "kl": 0.053955078125, + "learning_rate": 1.93285632866669e-07, + "loss": -0.0003, + "reward": 1.0488346815109253, + "reward_std": 0.430465966463089, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9381445646286011, + "step": 494 + }, + { + "completion_length": 336.59375, + "epoch": 0.7205240174672489, + "grad_norm": 0.7656987950423686, + "kl": 0.05029296875, + "learning_rate": 1.914263846792725e-07, + "loss": -0.0014, + "reward": 0.2095833271741867, + "reward_std": 0.22649352252483368, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 0.9746744632720947, + "step": 495 + }, + { + "completion_length": 345.4375, + "epoch": 0.7219796215429404, + "grad_norm": 1.0169013561590603, + "kl": 0.043701171875, + "learning_rate": 1.895740025783782e-07, + "loss": -0.0001, + "reward": 1.433619737625122, + "reward_std": 0.5415338277816772, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9466406106948853, + "step": 496 + }, + { + "completion_length": 358.96875, + "epoch": 0.7234352256186317, + "grad_norm": 1.036810610063429, + "kl": 0.0458984375, + "learning_rate": 1.8772852778143062e-07, + "loss": -0.0019, + "reward": 1.3713606595993042, + "reward_std": 0.6118265390396118, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9330794811248779, + "step": 497 + }, + { + "completion_length": 347.625, + "epoch": 0.7248908296943232, + "grad_norm": 0.9756569197859266, + "kl": 0.0478515625, + "learning_rate": 1.858900013521788e-07, + "loss": -0.0005, + "reward": 1.2231640815734863, + "reward_std": 0.06950952857732773, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9696353673934937, + "step": 498 + }, + { + "completion_length": 342.609375, + "epoch": 0.7263464337700145, + "grad_norm": 1.104737132675653, + "kl": 0.047119140625, + "learning_rate": 1.8405846419976394e-07, + "loss": 0.0046, + "reward": 1.1284375190734863, + "reward_std": 0.30641376972198486, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9205468893051147, + "step": 499 + }, + { + "completion_length": 341.171875, + "epoch": 0.727802037845706, + "grad_norm": 1.0685649265760062, + "kl": 0.047119140625, + "learning_rate": 1.8223395707780786e-07, + "loss": -0.0022, + "reward": 0.746009111404419, + "reward_std": 0.37746497988700867, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.95305335521698, + "step": 500 + }, + { + "completion_length": 339.109375, + "epoch": 0.7292576419213974, + "grad_norm": 1.1494787457478763, + "kl": 0.06201171875, + "learning_rate": 1.8041652058350766e-07, + "loss": 0.0024, + "reward": 0.6165429353713989, + "reward_std": 1.0153552293777466, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9255794286727905, + "step": 501 + }, + { + "completion_length": 347.234375, + "epoch": 0.7307132459970888, + "grad_norm": 1.0597503387535725, + "kl": 0.044677734375, + "learning_rate": 1.7860619515673032e-07, + "loss": 0.0031, + "reward": 1.761705756187439, + "reward_std": 0.34419363737106323, + "rewards/accuracy_reward": 0.921875, + "rewards/format_reward": 0.9930989742279053, + "step": 502 + }, + { + "completion_length": 358.609375, + "epoch": 0.7321688500727802, + "grad_norm": 1.1777505730146933, + "kl": 0.0498046875, + "learning_rate": 1.7680302107911544e-07, + "loss": 0.0014, + "reward": 0.7594987154006958, + "reward_std": 0.6815189123153687, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.9232096672058105, + "step": 503 + }, + { + "completion_length": 352.9375, + "epoch": 0.7336244541484717, + "grad_norm": 1.0262394911227826, + "kl": 0.051025390625, + "learning_rate": 1.7500703847317662e-07, + "loss": 0.0005, + "reward": 0.17281901836395264, + "reward_std": 0.6659858226776123, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 0.8982356786727905, + "step": 504 + }, + { + "completion_length": 366.046875, + "epoch": 0.735080058224163, + "grad_norm": 0.8757486128031936, + "kl": 0.046142578125, + "learning_rate": 1.7321828730141037e-07, + "loss": -0.003, + "reward": 0.9162174463272095, + "reward_std": 0.6034565567970276, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.9449284076690674, + "step": 505 + }, + { + "completion_length": 350.703125, + "epoch": 0.7365356622998545, + "grad_norm": 1.027601535912833, + "kl": 0.048828125, + "learning_rate": 1.7143680736540572e-07, + "loss": -0.0017, + "reward": 0.7806705236434937, + "reward_std": 0.4177248477935791, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.9474283456802368, + "step": 506 + }, + { + "completion_length": 345.546875, + "epoch": 0.7379912663755459, + "grad_norm": 0.9635347526020929, + "kl": 0.051025390625, + "learning_rate": 1.6966263830495935e-07, + "loss": 0.0, + "reward": 1.5247917175292969, + "reward_std": 0.18780048191547394, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9935416579246521, + "step": 507 + }, + { + "completion_length": 347.53125, + "epoch": 0.7394468704512372, + "grad_norm": 1.0624576316371095, + "kl": 0.048095703125, + "learning_rate": 1.6789581959719294e-07, + "loss": 0.0018, + "reward": 0.1759960949420929, + "reward_std": 0.6078127026557922, + "rewards/accuracy_reward": 0.40625, + "rewards/format_reward": 0.9451627731323242, + "step": 508 + }, + { + "completion_length": 357.21875, + "epoch": 0.7409024745269287, + "grad_norm": 0.9931955691969966, + "kl": 0.045654296875, + "learning_rate": 1.661363905556758e-07, + "loss": 0.0016, + "reward": 0.9983984231948853, + "reward_std": 0.25032860040664673, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9772396087646484, + "step": 509 + }, + { + "completion_length": 335.78125, + "epoch": 0.74235807860262, + "grad_norm": 1.1766955680022009, + "kl": 0.056640625, + "learning_rate": 1.6438439032954853e-07, + "loss": -0.0019, + "reward": 0.9316536784172058, + "reward_std": 0.9252973794937134, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.9565234184265137, + "step": 510 + }, + { + "completion_length": 351.8125, + "epoch": 0.7438136826783115, + "grad_norm": 1.0104315217544244, + "kl": 0.047119140625, + "learning_rate": 1.6263985790265383e-07, + "loss": -0.0008, + "reward": 0.7268945574760437, + "reward_std": 0.5565764904022217, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.9422982335090637, + "step": 511 + }, + { + "completion_length": 339.59375, + "epoch": 0.745269286754003, + "grad_norm": 1.0913421651153437, + "kl": 0.053955078125, + "learning_rate": 1.609028320926668e-07, + "loss": 0.0001, + "reward": 0.7289843559265137, + "reward_std": 0.42006969451904297, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.9395182132720947, + "step": 512 + }, + { + "completion_length": 328.71875, + "epoch": 0.7467248908296943, + "grad_norm": 1.3247436820269498, + "kl": 0.056396484375, + "learning_rate": 1.5917335155023366e-07, + "loss": -0.0024, + "reward": 1.460852861404419, + "reward_std": 0.3853009045124054, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9632877707481384, + "step": 513 + }, + { + "completion_length": 343.625, + "epoch": 0.7481804949053857, + "grad_norm": 0.8746716302390848, + "kl": 0.04541015625, + "learning_rate": 1.574514547581095e-07, + "loss": 0.0023, + "reward": 0.48206380009651184, + "reward_std": 0.2509358525276184, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.9288216233253479, + "step": 514 + }, + { + "completion_length": 341.65625, + "epoch": 0.7496360989810772, + "grad_norm": 1.1764333370638496, + "kl": 0.046875, + "learning_rate": 1.557371800303039e-07, + "loss": -0.0049, + "reward": 1.1197460889816284, + "reward_std": 0.38245856761932373, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9527279138565063, + "step": 515 + }, + { + "completion_length": 347.953125, + "epoch": 0.7510917030567685, + "grad_norm": 0.9861112517300733, + "kl": 0.055419921875, + "learning_rate": 1.5403056551122694e-07, + "loss": -0.0008, + "reward": 1.1204687356948853, + "reward_std": 0.37004244327545166, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.96037757396698, + "step": 516 + }, + { + "completion_length": 344.5, + "epoch": 0.75254730713246, + "grad_norm": 1.1969290490481124, + "kl": 0.04833984375, + "learning_rate": 1.5233164917484114e-07, + "loss": -0.0003, + "reward": 1.2382487058639526, + "reward_std": 0.022989844903349876, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9881705641746521, + "step": 517 + }, + { + "completion_length": 351.859375, + "epoch": 0.7540029112081513, + "grad_norm": 0.8687338106556315, + "kl": 0.044677734375, + "learning_rate": 1.5064046882381626e-07, + "loss": 0.0009, + "reward": 0.549817681312561, + "reward_std": 0.1450074017047882, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.9933593273162842, + "step": 518 + }, + { + "completion_length": 342.375, + "epoch": 0.7554585152838428, + "grad_norm": 1.1222413107637277, + "kl": 0.051513671875, + "learning_rate": 1.4895706208868876e-07, + "loss": 0.0008, + "reward": 0.9473632574081421, + "reward_std": 0.6324166655540466, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9235742092132568, + "step": 519 + }, + { + "completion_length": 361.09375, + "epoch": 0.7569141193595342, + "grad_norm": 0.8767855411501952, + "kl": 0.04248046875, + "learning_rate": 1.4728146642702338e-07, + "loss": 0.0043, + "reward": 0.6358333826065063, + "reward_std": 0.42567020654678345, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9458463788032532, + "step": 520 + }, + { + "completion_length": 339.40625, + "epoch": 0.7583697234352256, + "grad_norm": 1.1957414378411515, + "kl": 0.046875, + "learning_rate": 1.4561371912258098e-07, + "loss": 0.0018, + "reward": 1.0426563024520874, + "reward_std": 0.49093982577323914, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.972434937953949, + "step": 521 + }, + { + "completion_length": 356.4375, + "epoch": 0.759825327510917, + "grad_norm": 1.1172524921233873, + "kl": 0.05419921875, + "learning_rate": 1.4395385728448727e-07, + "loss": -0.0025, + "reward": 1.7248958349227905, + "reward_std": 0.41973060369491577, + "rewards/accuracy_reward": 0.921875, + "rewards/format_reward": 0.9582551717758179, + "step": 522 + }, + { + "completion_length": 338.921875, + "epoch": 0.7612809315866085, + "grad_norm": 1.0582473955837757, + "kl": 0.044921875, + "learning_rate": 1.423019178464091e-07, + "loss": 0.0002, + "reward": 1.3815103769302368, + "reward_std": 0.5760092735290527, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9437500238418579, + "step": 523 + }, + { + "completion_length": 343.59375, + "epoch": 0.7627365356622998, + "grad_norm": 0.9984931203320635, + "kl": 0.046875, + "learning_rate": 1.406579375657308e-07, + "loss": 0.0039, + "reward": 0.6505143642425537, + "reward_std": 0.33663293719291687, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.9126237034797668, + "step": 524 + }, + { + "completion_length": 341.40625, + "epoch": 0.7641921397379913, + "grad_norm": 1.1754051815019624, + "kl": 0.051513671875, + "learning_rate": 1.3902195302273778e-07, + "loss": -0.004, + "reward": 1.7554621696472168, + "reward_std": 0.6817976236343384, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9429622888565063, + "step": 525 + }, + { + "completion_length": 333.265625, + "epoch": 0.7656477438136827, + "grad_norm": 1.0633972900525368, + "kl": 0.05078125, + "learning_rate": 1.373940006198014e-07, + "loss": 0.0001, + "reward": 1.441927194595337, + "reward_std": 0.5073419809341431, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9575521349906921, + "step": 526 + }, + { + "completion_length": 338.953125, + "epoch": 0.7671033478893741, + "grad_norm": 0.98335951118238, + "kl": 0.0478515625, + "learning_rate": 1.3577411658056965e-07, + "loss": 0.0003, + "reward": 0.7285221219062805, + "reward_std": 0.41777336597442627, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.9417122602462769, + "step": 527 + }, + { + "completion_length": 342.265625, + "epoch": 0.7685589519650655, + "grad_norm": 1.104214294364568, + "kl": 0.048828125, + "learning_rate": 1.3416233694916086e-07, + "loss": -0.0018, + "reward": 1.0809439420700073, + "reward_std": 0.498748779296875, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9592252969741821, + "step": 528 + }, + { + "completion_length": 365.03125, + "epoch": 0.7700145560407569, + "grad_norm": 0.8864241710102599, + "kl": 0.05322265625, + "learning_rate": 1.325586975893621e-07, + "loss": -0.001, + "reward": 1.0926563739776611, + "reward_std": 0.29806235432624817, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9340364933013916, + "step": 529 + }, + { + "completion_length": 355.90625, + "epoch": 0.7714701601164483, + "grad_norm": 0.8844305655178724, + "kl": 0.045654296875, + "learning_rate": 1.3096323418383043e-07, + "loss": 0.0012, + "reward": 1.4333789348602295, + "reward_std": 0.4198879301548004, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9490039348602295, + "step": 530 + }, + { + "completion_length": 343.890625, + "epoch": 0.7729257641921398, + "grad_norm": 1.253496882295048, + "kl": 0.048828125, + "learning_rate": 1.2937598223330005e-07, + "loss": 0.0014, + "reward": 1.2209309339523315, + "reward_std": 0.08089350908994675, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.96630859375, + "step": 531 + }, + { + "completion_length": 338.375, + "epoch": 0.7743813682678311, + "grad_norm": 1.1782096109944808, + "kl": 0.046630859375, + "learning_rate": 1.2779697705579058e-07, + "loss": 0.0036, + "reward": 0.9942382574081421, + "reward_std": 0.33707913756370544, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.97488933801651, + "step": 532 + }, + { + "completion_length": 356.609375, + "epoch": 0.7758369723435226, + "grad_norm": 1.0305776858800557, + "kl": 0.04345703125, + "learning_rate": 1.262262537858233e-07, + "loss": -0.0023, + "reward": 0.9166210889816284, + "reward_std": 0.7489937543869019, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.9442252516746521, + "step": 533 + }, + { + "completion_length": 366.984375, + "epoch": 0.777292576419214, + "grad_norm": 1.0309599470321495, + "kl": 0.043212890625, + "learning_rate": 1.2466384737363779e-07, + "loss": 0.0028, + "reward": 0.9351236820220947, + "reward_std": 1.2396140098571777, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9184830784797668, + "step": 534 + }, + { + "completion_length": 347.921875, + "epoch": 0.7787481804949054, + "grad_norm": 0.9649435296818741, + "kl": 0.05078125, + "learning_rate": 1.231097925844153e-07, + "loss": -0.003, + "reward": 1.10358726978302, + "reward_std": 0.4084942936897278, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.94621741771698, + "step": 535 + }, + { + "completion_length": 353.625, + "epoch": 0.7802037845705968, + "grad_norm": 1.2290305710541891, + "kl": 0.050537109375, + "learning_rate": 1.215641239975042e-07, + "loss": 0.004, + "reward": 0.35907554626464844, + "reward_std": 0.4438338279724121, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.9410285949707031, + "step": 536 + }, + { + "completion_length": 356.546875, + "epoch": 0.7816593886462883, + "grad_norm": 0.7192201796065968, + "kl": 0.046142578125, + "learning_rate": 1.2002687600565137e-07, + "loss": -0.0018, + "reward": 1.6950325965881348, + "reward_std": 0.20082132518291473, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.97628253698349, + "step": 537 + }, + { + "completion_length": 348.703125, + "epoch": 0.7831149927219796, + "grad_norm": 1.0661868149196754, + "kl": 0.052734375, + "learning_rate": 1.1849808281423635e-07, + "loss": 0.002, + "reward": 1.1826952695846558, + "reward_std": 0.5324006080627441, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9778385162353516, + "step": 538 + }, + { + "completion_length": 354.0625, + "epoch": 0.784570596797671, + "grad_norm": 1.0553040942110303, + "kl": 0.049072265625, + "learning_rate": 1.1697777844051104e-07, + "loss": -0.0017, + "reward": 1.595253825187683, + "reward_std": 0.9109958410263062, + "rewards/accuracy_reward": 0.890625, + "rewards/format_reward": 0.9233788847923279, + "step": 539 + }, + { + "completion_length": 352.875, + "epoch": 0.7860262008733624, + "grad_norm": 0.8135854803568479, + "kl": 0.052734375, + "learning_rate": 1.1546599671284158e-07, + "loss": 0.0048, + "reward": 0.06436197459697723, + "reward_std": 0.49560049176216125, + "rewards/accuracy_reward": 0.375, + "rewards/format_reward": 0.9329817295074463, + "step": 540 + }, + { + "completion_length": 356.609375, + "epoch": 0.7874818049490538, + "grad_norm": 1.098756609309082, + "kl": 0.043701171875, + "learning_rate": 1.1396277126995707e-07, + "loss": -0.0004, + "reward": 0.7316992282867432, + "reward_std": 0.6466116905212402, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.9482096433639526, + "step": 541 + }, + { + "completion_length": 360.78125, + "epoch": 0.7889374090247453, + "grad_norm": 1.0121213548186219, + "kl": 0.046630859375, + "learning_rate": 1.1246813556019924e-07, + "loss": -0.0013, + "reward": 1.4149739742279053, + "reward_std": 0.7345938086509705, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9305989742279053, + "step": 542 + }, + { + "completion_length": 351.8125, + "epoch": 0.7903930131004366, + "grad_norm": 1.1589095869023704, + "kl": 0.04736328125, + "learning_rate": 1.1098212284078035e-07, + "loss": -0.0025, + "reward": 1.481673240661621, + "reward_std": 0.5085718631744385, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9943815469741821, + "step": 543 + }, + { + "completion_length": 345.8125, + "epoch": 0.7918486171761281, + "grad_norm": 0.9205851416631115, + "kl": 0.048583984375, + "learning_rate": 1.0950476617704124e-07, + "loss": 0.0009, + "reward": 1.3531510829925537, + "reward_std": 0.41115716099739075, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.9585286378860474, + "step": 544 + }, + { + "completion_length": 364.125, + "epoch": 0.7933042212518195, + "grad_norm": 0.9830413575226257, + "kl": 0.041748046875, + "learning_rate": 1.0803609844171719e-07, + "loss": -0.0028, + "reward": 0.51806640625, + "reward_std": 0.560853123664856, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.9600846767425537, + "step": 545 + }, + { + "completion_length": 363.234375, + "epoch": 0.7947598253275109, + "grad_norm": 0.9801296629236323, + "kl": 0.04736328125, + "learning_rate": 1.0657615231420491e-07, + "loss": 0.0013, + "reward": 0.36118483543395996, + "reward_std": 0.406131386756897, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.9475390315055847, + "step": 546 + }, + { + "completion_length": 372.859375, + "epoch": 0.7962154294032023, + "grad_norm": 0.8933107929367656, + "kl": 0.0458984375, + "learning_rate": 1.0512496027983714e-07, + "loss": 0.0033, + "reward": 0.27970701456069946, + "reward_std": 0.969652533531189, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.871985673904419, + "step": 547 + }, + { + "completion_length": 376.1875, + "epoch": 0.7976710334788938, + "grad_norm": 0.8824515938688806, + "kl": 0.04541015625, + "learning_rate": 1.0368255462915765e-07, + "loss": 0.0006, + "reward": 0.76199871301651, + "reward_std": 0.5525078773498535, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.9308398365974426, + "step": 548 + }, + { + "completion_length": 372.171875, + "epoch": 0.7991266375545851, + "grad_norm": 0.9171428811714127, + "kl": 0.04541015625, + "learning_rate": 1.0224896745720512e-07, + "loss": 0.002, + "reward": 1.1354882717132568, + "reward_std": 1.028761863708496, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.8847330808639526, + "step": 549 + }, + { + "completion_length": 374.25, + "epoch": 0.8005822416302766, + "grad_norm": 0.9349834701869527, + "kl": 0.04931640625, + "learning_rate": 1.00824230662797e-07, + "loss": 0.0021, + "reward": 0.27811199426651, + "reward_std": 1.2048439979553223, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 0.9164583683013916, + "step": 550 + }, + { + "completion_length": 354.34375, + "epoch": 0.8020378457059679, + "grad_norm": 1.0746749824810193, + "kl": 0.04443359375, + "learning_rate": 9.940837594782125e-08, + "loss": 0.001, + "reward": 0.9130924344062805, + "reward_std": 0.9886895418167114, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.9405273199081421, + "step": 551 + }, + { + "completion_length": 369.28125, + "epoch": 0.8034934497816594, + "grad_norm": 0.9674452242381952, + "kl": 0.04443359375, + "learning_rate": 9.800143481652979e-08, + "loss": -0.0007, + "reward": 0.9438866972923279, + "reward_std": 0.6287566423416138, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9270898103713989, + "step": 552 + }, + { + "completion_length": 375.828125, + "epoch": 0.8049490538573508, + "grad_norm": 0.9793357755444789, + "kl": 0.042236328125, + "learning_rate": 9.660343857483799e-08, + "loss": -0.0034, + "reward": 1.215846300125122, + "reward_std": 1.2285206317901611, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.8707551956176758, + "step": 553 + }, + { + "completion_length": 355.03125, + "epoch": 0.8064046579330422, + "grad_norm": 0.9372538556562692, + "kl": 0.045166015625, + "learning_rate": 9.521441832962801e-08, + "loss": -0.0035, + "reward": 0.5855793952941895, + "reward_std": 0.4810252785682678, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 0.9439908862113953, + "step": 554 + }, + { + "completion_length": 340.40625, + "epoch": 0.8078602620087336, + "grad_norm": 1.131677567848908, + "kl": 0.0478515625, + "learning_rate": 9.383440498805712e-08, + "loss": 0.0006, + "reward": 1.6682096719741821, + "reward_std": 0.6599315404891968, + "rewards/accuracy_reward": 0.921875, + "rewards/format_reward": 0.9025846719741821, + "step": 555 + }, + { + "completion_length": 360.125, + "epoch": 0.8093158660844251, + "grad_norm": 0.9570116291494392, + "kl": 0.048583984375, + "learning_rate": 9.246342925686884e-08, + "loss": -0.0017, + "reward": 0.6679362058639526, + "reward_std": 0.5891071557998657, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.9264127612113953, + "step": 556 + }, + { + "completion_length": 350.296875, + "epoch": 0.8107714701601164, + "grad_norm": 1.0041594196588526, + "kl": 0.047607421875, + "learning_rate": 9.110152164171125e-08, + "loss": 0.0003, + "reward": 1.5817317962646484, + "reward_std": 0.5429558157920837, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9567317962646484, + "step": 557 + }, + { + "completion_length": 346.484375, + "epoch": 0.8122270742358079, + "grad_norm": 1.1612136223210474, + "kl": 0.045654296875, + "learning_rate": 8.974871244645626e-08, + "loss": -0.0012, + "reward": 1.318763017654419, + "reward_std": 0.8850969672203064, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.9250651001930237, + "step": 558 + }, + { + "completion_length": 346.96875, + "epoch": 0.8136826783114993, + "grad_norm": 0.9594471396023049, + "kl": 0.05224609375, + "learning_rate": 8.840503177252745e-08, + "loss": -0.0016, + "reward": 1.168027400970459, + "reward_std": 0.8090516328811646, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9134830832481384, + "step": 559 + }, + { + "completion_length": 355.625, + "epoch": 0.8151382823871907, + "grad_norm": 1.165829749030139, + "kl": 0.0458984375, + "learning_rate": 8.707050951822842e-08, + "loss": -0.0029, + "reward": 0.23499347269535065, + "reward_std": 0.6717128753662109, + "rewards/accuracy_reward": 0.4375, + "rewards/format_reward": 0.9124023914337158, + "step": 560 + }, + { + "completion_length": 348.375, + "epoch": 0.8165938864628821, + "grad_norm": 0.8797964228562034, + "kl": 0.0439453125, + "learning_rate": 8.574517537807896e-08, + "loss": 0.0035, + "reward": 0.8617708086967468, + "reward_std": 0.06106572225689888, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9768619537353516, + "step": 561 + }, + { + "completion_length": 356.109375, + "epoch": 0.8180494905385735, + "grad_norm": 0.9540926767615193, + "kl": 0.04833984375, + "learning_rate": 8.442905884215329e-08, + "loss": 0.0001, + "reward": 0.7149023413658142, + "reward_std": 0.884148895740509, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.9297460913658142, + "step": 562 + }, + { + "completion_length": 337.484375, + "epoch": 0.8195050946142649, + "grad_norm": 1.0541518957324363, + "kl": 0.0498046875, + "learning_rate": 8.31221891954243e-08, + "loss": -0.0055, + "reward": 0.9072656631469727, + "reward_std": 0.35776764154434204, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.9742317199707031, + "step": 563 + }, + { + "completion_length": 349.96875, + "epoch": 0.8209606986899564, + "grad_norm": 1.0803926660638092, + "kl": 0.047119140625, + "learning_rate": 8.182459551711197e-08, + "loss": 0.001, + "reward": 1.44936203956604, + "reward_std": 0.49504512548446655, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9640494585037231, + "step": 564 + }, + { + "completion_length": 350.421875, + "epoch": 0.8224163027656477, + "grad_norm": 1.0513767136216559, + "kl": 0.0576171875, + "learning_rate": 8.053630668003642e-08, + "loss": 0.0045, + "reward": 0.4982747435569763, + "reward_std": 0.013658922165632248, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9944205284118652, + "step": 565 + }, + { + "completion_length": 372.59375, + "epoch": 0.8238719068413392, + "grad_norm": 0.9692434119961149, + "kl": 0.04833984375, + "learning_rate": 7.925735134997491e-08, + "loss": 0.0001, + "reward": 0.5592772960662842, + "reward_std": 0.5283293724060059, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 0.9178320169448853, + "step": 566 + }, + { + "completion_length": 342.453125, + "epoch": 0.8253275109170306, + "grad_norm": 1.0913320663341668, + "kl": 0.048095703125, + "learning_rate": 7.798775798502482e-08, + "loss": 0.0006, + "reward": 1.8661328554153442, + "reward_std": 0.2512561082839966, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.9598828554153442, + "step": 567 + }, + { + "completion_length": 344.15625, + "epoch": 0.826783114992722, + "grad_norm": 1.0413698797681266, + "kl": 0.04638671875, + "learning_rate": 7.672755483496979e-08, + "loss": 0.0034, + "reward": 1.9249870777130127, + "reward_std": 0.19554658234119415, + "rewards/accuracy_reward": 0.984375, + "rewards/format_reward": 0.9718619585037231, + "step": 568 + }, + { + "completion_length": 355.34375, + "epoch": 0.8282387190684134, + "grad_norm": 1.1642983816622974, + "kl": 0.04541015625, + "learning_rate": 7.547676994065116e-08, + "loss": -0.0025, + "reward": 1.4583983421325684, + "reward_std": 0.6337254643440247, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9729557037353516, + "step": 569 + }, + { + "completion_length": 358.421875, + "epoch": 0.8296943231441049, + "grad_norm": 0.8768810528600219, + "kl": 0.045654296875, + "learning_rate": 7.423543113334435e-08, + "loss": -0.0034, + "reward": 0.9223567843437195, + "reward_std": 0.24935288727283478, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.9516797065734863, + "step": 570 + }, + { + "completion_length": 377.25, + "epoch": 0.8311499272197962, + "grad_norm": 0.9846837227637683, + "kl": 0.04443359375, + "learning_rate": 7.300356603413965e-08, + "loss": 0.0001, + "reward": 1.3689582347869873, + "reward_std": 1.1683030128479004, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.8845832347869873, + "step": 571 + }, + { + "completion_length": 360.484375, + "epoch": 0.8326055312954876, + "grad_norm": 1.008771276578655, + "kl": 0.051025390625, + "learning_rate": 7.178120205332716e-08, + "loss": -0.0002, + "reward": 1.635097622871399, + "reward_std": 0.7983198165893555, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.9163476228713989, + "step": 572 + }, + { + "completion_length": 352.296875, + "epoch": 0.834061135371179, + "grad_norm": 1.1642594254533514, + "kl": 0.0498046875, + "learning_rate": 7.056836638978696e-08, + "loss": 0.0036, + "reward": 0.44958335161209106, + "reward_std": 0.6141500473022461, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9442839026451111, + "step": 573 + }, + { + "completion_length": 336.0625, + "epoch": 0.8355167394468704, + "grad_norm": 1.0723516782661382, + "kl": 0.05029296875, + "learning_rate": 6.936508603038465e-08, + "loss": 0.0009, + "reward": 1.4923112392425537, + "reward_std": 0.3686758875846863, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9570898413658142, + "step": 574 + }, + { + "completion_length": 362.203125, + "epoch": 0.8369723435225619, + "grad_norm": 0.8489752039238883, + "kl": 0.04443359375, + "learning_rate": 6.817138774936975e-08, + "loss": -0.0025, + "reward": 1.5311849117279053, + "reward_std": 0.48614656925201416, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 0.952552080154419, + "step": 575 + }, + { + "completion_length": 321.90625, + "epoch": 0.8384279475982532, + "grad_norm": 1.2381358943286065, + "kl": 0.057861328125, + "learning_rate": 6.698729810778064e-08, + "loss": 0.0007, + "reward": 1.2007226943969727, + "reward_std": 0.3871755599975586, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9507226943969727, + "step": 576 + }, + { + "completion_length": 345.765625, + "epoch": 0.8398835516739447, + "grad_norm": 0.9963944986671528, + "kl": 0.04638671875, + "learning_rate": 6.58128434528537e-08, + "loss": -0.0006, + "reward": 0.7609505653381348, + "reward_std": 0.811994194984436, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.9278906583786011, + "step": 577 + }, + { + "completion_length": 345.453125, + "epoch": 0.8413391557496361, + "grad_norm": 0.9963445120426506, + "kl": 0.0498046875, + "learning_rate": 6.464804991743628e-08, + "loss": 0.0036, + "reward": 1.5395898818969727, + "reward_std": 0.23713621497154236, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 0.959876298904419, + "step": 578 + }, + { + "completion_length": 357.3125, + "epoch": 0.8427947598253275, + "grad_norm": 0.9805282214375126, + "kl": 0.048095703125, + "learning_rate": 6.349294341940592e-08, + "loss": 0.0006, + "reward": 1.4974348545074463, + "reward_std": 0.36625921726226807, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9642578363418579, + "step": 579 + }, + { + "completion_length": 368.546875, + "epoch": 0.8442503639010189, + "grad_norm": 0.9270265739588998, + "kl": 0.043701171875, + "learning_rate": 6.234754966109351e-08, + "loss": -0.001, + "reward": 1.1266862154006958, + "reward_std": 0.35941964387893677, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9662825465202332, + "step": 580 + }, + { + "completion_length": 338.53125, + "epoch": 0.8457059679767104, + "grad_norm": 1.0906414405939546, + "kl": 0.049072265625, + "learning_rate": 6.12118941287112e-08, + "loss": 0.002, + "reward": 1.5338281393051147, + "reward_std": 0.24283231794834137, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 0.9543749690055847, + "step": 581 + }, + { + "completion_length": 343.84375, + "epoch": 0.8471615720524017, + "grad_norm": 0.9818201939354958, + "kl": 0.04638671875, + "learning_rate": 6.008600209178538e-08, + "loss": -0.0039, + "reward": 1.5875390768051147, + "reward_std": 0.33346858620643616, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9625390768051147, + "step": 582 + }, + { + "completion_length": 375.671875, + "epoch": 0.8486171761280932, + "grad_norm": 1.0416004517786164, + "kl": 0.04736328125, + "learning_rate": 5.8969898602594325e-08, + "loss": 0.0057, + "reward": 0.5021549463272095, + "reward_std": 0.6197409629821777, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.9480664134025574, + "step": 583 + }, + { + "completion_length": 368.109375, + "epoch": 0.8500727802037845, + "grad_norm": 0.8470887591348754, + "kl": 0.048828125, + "learning_rate": 5.786360849561117e-08, + "loss": 0.0019, + "reward": 1.3838281631469727, + "reward_std": 0.6718348264694214, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9463281631469727, + "step": 584 + }, + { + "completion_length": 379.875, + "epoch": 0.851528384279476, + "grad_norm": 0.9181167404523447, + "kl": 0.044921875, + "learning_rate": 5.676715638695062e-08, + "loss": 0.0009, + "reward": 0.4963216185569763, + "reward_std": 0.826927900314331, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.9025716185569763, + "step": 585 + }, + { + "completion_length": 366.78125, + "epoch": 0.8529839883551674, + "grad_norm": 1.2129962706817803, + "kl": 0.05419921875, + "learning_rate": 5.5680566673822096e-08, + "loss": -0.0009, + "reward": 0.641100287437439, + "reward_std": 1.051483154296875, + "rewards/accuracy_reward": 0.5625, + "rewards/format_reward": 0.9454361796379089, + "step": 586 + }, + { + "completion_length": 352.34375, + "epoch": 0.8544395924308588, + "grad_norm": 0.9410634324985974, + "kl": 0.04931640625, + "learning_rate": 5.4603863533985825e-08, + "loss": -0.0023, + "reward": 1.0430793762207031, + "reward_std": 0.46832895278930664, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9298372268676758, + "step": 587 + }, + { + "completion_length": 362.78125, + "epoch": 0.8558951965065502, + "grad_norm": 1.0843006668744697, + "kl": 0.044921875, + "learning_rate": 5.353707092521581e-08, + "loss": 0.0068, + "reward": 1.073815107345581, + "reward_std": 0.4660176932811737, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9153515100479126, + "step": 588 + }, + { + "completion_length": 348.21875, + "epoch": 0.8573508005822417, + "grad_norm": 1.0292956885097058, + "kl": 0.048095703125, + "learning_rate": 5.2480212584766035e-08, + "loss": -0.0002, + "reward": 1.5367252826690674, + "reward_std": 0.8215746879577637, + "rewards/accuracy_reward": 0.875, + "rewards/format_reward": 0.9117252826690674, + "step": 589 + }, + { + "completion_length": 356.265625, + "epoch": 0.858806404657933, + "grad_norm": 0.9099297684040104, + "kl": 0.04638671875, + "learning_rate": 5.143331202884299e-08, + "loss": -0.0024, + "reward": 1.44970703125, + "reward_std": 0.39304301142692566, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9629882574081421, + "step": 590 + }, + { + "completion_length": 364.65625, + "epoch": 0.8602620087336245, + "grad_norm": 0.9309941077444687, + "kl": 0.048583984375, + "learning_rate": 5.039639255208156e-08, + "loss": -0.0002, + "reward": 1.375429630279541, + "reward_std": 0.5939683318138123, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9332812428474426, + "step": 591 + }, + { + "completion_length": 367.625, + "epoch": 0.8617176128093159, + "grad_norm": 0.8096322535425639, + "kl": 0.044921875, + "learning_rate": 4.9369477227027614e-08, + "loss": -0.0, + "reward": 1.422376275062561, + "reward_std": 0.4559106230735779, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9346680045127869, + "step": 592 + }, + { + "completion_length": 348.34375, + "epoch": 0.8631732168850073, + "grad_norm": 0.9888972852645406, + "kl": 0.047119140625, + "learning_rate": 4.835258890362387e-08, + "loss": 0.0014, + "reward": 0.8151302337646484, + "reward_std": 0.5511770844459534, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9372526407241821, + "step": 593 + }, + { + "completion_length": 359.03125, + "epoch": 0.8646288209606987, + "grad_norm": 1.0504733411057308, + "kl": 0.050537109375, + "learning_rate": 4.7345750208701684e-08, + "loss": -0.0021, + "reward": 1.6032031774520874, + "reward_std": 0.6642707586288452, + "rewards/accuracy_reward": 0.890625, + "rewards/format_reward": 0.9313281178474426, + "step": 594 + }, + { + "completion_length": 360.5625, + "epoch": 0.86608442503639, + "grad_norm": 0.9767718704060626, + "kl": 0.047119140625, + "learning_rate": 4.634898354547778e-08, + "loss": 0.0029, + "reward": 0.6824283599853516, + "reward_std": 0.3918249011039734, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.8971809148788452, + "step": 595 + }, + { + "completion_length": 365.375, + "epoch": 0.8675400291120815, + "grad_norm": 0.7220183880376431, + "kl": 0.0419921875, + "learning_rate": 4.536231109305577e-08, + "loss": -0.0012, + "reward": 1.7962956428527832, + "reward_std": 0.3715449273586273, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9828841686248779, + "step": 596 + }, + { + "completion_length": 369.734375, + "epoch": 0.868995633187773, + "grad_norm": 1.013373247031297, + "kl": 0.048828125, + "learning_rate": 4.4385754805932095e-08, + "loss": 0.0003, + "reward": 1.1727409362792969, + "reward_std": 0.9622257351875305, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.9174544215202332, + "step": 597 + }, + { + "completion_length": 373.765625, + "epoch": 0.8704512372634643, + "grad_norm": 0.9611521678835144, + "kl": 0.0478515625, + "learning_rate": 4.341933641350842e-08, + "loss": 0.0004, + "reward": 0.015071600675582886, + "reward_std": 0.68045574426651, + "rewards/accuracy_reward": 0.359375, + "rewards/format_reward": 0.9341601729393005, + "step": 598 + }, + { + "completion_length": 354.03125, + "epoch": 0.8719068413391557, + "grad_norm": 1.1525733336623423, + "kl": 0.046630859375, + "learning_rate": 4.2463077419606976e-08, + "loss": 0.0017, + "reward": 1.3347785472869873, + "reward_std": 0.567120373249054, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.9418619871139526, + "step": 599 + }, + { + "completion_length": 363.328125, + "epoch": 0.8733624454148472, + "grad_norm": 1.0773160423814951, + "kl": 0.048095703125, + "learning_rate": 4.151699910199336e-08, + "loss": 0.0036, + "reward": 0.6707291603088379, + "reward_std": 0.8057493567466736, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.8792187571525574, + "step": 600 + }, + { + "completion_length": 359.28125, + "epoch": 0.8748180494905385, + "grad_norm": 0.9746476457350777, + "kl": 0.0537109375, + "learning_rate": 4.058112251190193e-08, + "loss": -0.0024, + "reward": 0.4454817771911621, + "reward_std": 0.7263391613960266, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.8914192318916321, + "step": 601 + }, + { + "completion_length": 380.09375, + "epoch": 0.87627365356623, + "grad_norm": 0.9946978532938755, + "kl": 0.046630859375, + "learning_rate": 3.9655468473568435e-08, + "loss": -0.0026, + "reward": 0.7158983945846558, + "reward_std": 0.7142089009284973, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.9306380152702332, + "step": 602 + }, + { + "completion_length": 370.9375, + "epoch": 0.8777292576419214, + "grad_norm": 1.0502943403626153, + "kl": 0.0498046875, + "learning_rate": 3.8740057583765694e-08, + "loss": -0.0026, + "reward": 1.6357030868530273, + "reward_std": 0.7968278527259827, + "rewards/accuracy_reward": 0.90625, + "rewards/format_reward": 0.9169531464576721, + "step": 603 + }, + { + "completion_length": 367.421875, + "epoch": 0.8791848617176128, + "grad_norm": 0.8640177808219943, + "kl": 0.048828125, + "learning_rate": 3.783491021134588e-08, + "loss": 0.0039, + "reward": 1.4072265625, + "reward_std": 0.551764965057373, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9208593368530273, + "step": 604 + }, + { + "completion_length": 366.359375, + "epoch": 0.8806404657933042, + "grad_norm": 0.8946058550020072, + "kl": 0.053466796875, + "learning_rate": 3.694004649678706e-08, + "loss": -0.0015, + "reward": 1.261816382408142, + "reward_std": 0.89951491355896, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.9160351753234863, + "step": 605 + }, + { + "completion_length": 346.609375, + "epoch": 0.8820960698689956, + "grad_norm": 0.9508950509942107, + "kl": 0.046142578125, + "learning_rate": 3.6055486351745324e-08, + "loss": 0.0035, + "reward": 1.8074610233306885, + "reward_std": 0.5373015999794006, + "rewards/accuracy_reward": 0.953125, + "rewards/format_reward": 0.9480859041213989, + "step": 606 + }, + { + "completion_length": 359.171875, + "epoch": 0.883551673944687, + "grad_norm": 1.0635120558945663, + "kl": 0.050537109375, + "learning_rate": 3.51812494586114e-08, + "loss": 0.0011, + "reward": -0.21584634482860565, + "reward_std": 0.69797682762146, + "rewards/accuracy_reward": 0.296875, + "rewards/format_reward": 0.8804166316986084, + "step": 607 + }, + { + "completion_length": 359.65625, + "epoch": 0.8850072780203785, + "grad_norm": 0.9616362685575608, + "kl": 0.0478515625, + "learning_rate": 3.4317355270072954e-08, + "loss": 0.0031, + "reward": 0.5914518237113953, + "reward_std": 0.7840473651885986, + "rewards/accuracy_reward": 0.546875, + "rewards/format_reward": 0.9474804997444153, + "step": 608 + }, + { + "completion_length": 374.515625, + "epoch": 0.8864628820960698, + "grad_norm": 0.9665570781215977, + "kl": 0.0537109375, + "learning_rate": 3.3463823008681334e-08, + "loss": 0.0009, + "reward": 0.6901432275772095, + "reward_std": 0.44793906807899475, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.9485155940055847, + "step": 609 + }, + { + "completion_length": 348.96875, + "epoch": 0.8879184861717613, + "grad_norm": 1.1514296655136664, + "kl": 0.056396484375, + "learning_rate": 3.2620671666424515e-08, + "loss": -0.0005, + "reward": 1.0574414730072021, + "reward_std": 0.2184952199459076, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9922200441360474, + "step": 610 + }, + { + "completion_length": 386.796875, + "epoch": 0.8893740902474527, + "grad_norm": 0.8572317909478371, + "kl": 0.046630859375, + "learning_rate": 3.17879200043038e-08, + "loss": -0.0004, + "reward": 0.48298177123069763, + "reward_std": 0.4573482573032379, + "rewards/accuracy_reward": 0.515625, + "rewards/format_reward": 0.93610680103302, + "step": 611 + }, + { + "completion_length": 364.4375, + "epoch": 0.8908296943231441, + "grad_norm": 0.80677467402995, + "kl": 0.046142578125, + "learning_rate": 3.0965586551917054e-08, + "loss": 0.0023, + "reward": 0.9373893141746521, + "reward_std": 0.7911602258682251, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9207747578620911, + "step": 612 + }, + { + "completion_length": 377.734375, + "epoch": 0.8922852983988355, + "grad_norm": 0.7229029396190814, + "kl": 0.046875, + "learning_rate": 3.015368960704584e-08, + "loss": 0.0026, + "reward": 1.031211018562317, + "reward_std": 0.7297381162643433, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.91817706823349, + "step": 613 + }, + { + "completion_length": 348.046875, + "epoch": 0.893740902474527, + "grad_norm": 1.2890732275734214, + "kl": 0.056640625, + "learning_rate": 2.935224723524843e-08, + "loss": -0.0016, + "reward": -0.2784309983253479, + "reward_std": 0.33096808195114136, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.9581446051597595, + "step": 614 + }, + { + "completion_length": 341.65625, + "epoch": 0.8951965065502183, + "grad_norm": 1.1372194523462842, + "kl": 0.04638671875, + "learning_rate": 2.8561277269457895e-08, + "loss": -0.0019, + "reward": 1.5267903804779053, + "reward_std": 0.19310006499290466, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.99211585521698, + "step": 615 + }, + { + "completion_length": 340.84375, + "epoch": 0.8966521106259098, + "grad_norm": 1.1633831512202182, + "kl": 0.054443359375, + "learning_rate": 2.7780797309585603e-08, + "loss": 0.0015, + "reward": 1.0949218273162842, + "reward_std": 0.4195671081542969, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9380208253860474, + "step": 616 + }, + { + "completion_length": 340.375, + "epoch": 0.8981077147016011, + "grad_norm": 1.0572682704568244, + "kl": 0.0546875, + "learning_rate": 2.701082472212879e-08, + "loss": 0.0005, + "reward": 1.088769555091858, + "reward_std": 1.067856788635254, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9237044453620911, + "step": 617 + }, + { + "completion_length": 365.3125, + "epoch": 0.8995633187772926, + "grad_norm": 0.8282223286971002, + "kl": 0.054931640625, + "learning_rate": 2.625137663978516e-08, + "loss": 0.0001, + "reward": 1.419173240661621, + "reward_std": 0.7184128761291504, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9299284219741821, + "step": 618 + }, + { + "completion_length": 354.203125, + "epoch": 0.901018922852984, + "grad_norm": 1.0373995243480785, + "kl": 0.046142578125, + "learning_rate": 2.5502469961070637e-08, + "loss": -0.0006, + "reward": 0.5479947924613953, + "reward_std": 0.4919097423553467, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.9511458873748779, + "step": 619 + }, + { + "completion_length": 370.484375, + "epoch": 0.9024745269286754, + "grad_norm": 0.9212810693479083, + "kl": 0.046142578125, + "learning_rate": 2.4764121349944265e-08, + "loss": 0.0015, + "reward": 0.3472330868244171, + "reward_std": 0.448018878698349, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.9308269023895264, + "step": 620 + }, + { + "completion_length": 358.921875, + "epoch": 0.9039301310043668, + "grad_norm": 0.9860633655951645, + "kl": 0.0478515625, + "learning_rate": 2.4036347235436738e-08, + "loss": 0.0069, + "reward": 0.37333983182907104, + "reward_std": 0.5936962962150574, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.9631054997444153, + "step": 621 + }, + { + "completion_length": 351.390625, + "epoch": 0.9053857350800583, + "grad_norm": 0.9338847956064392, + "kl": 0.04736328125, + "learning_rate": 2.331916381128535e-08, + "loss": 0.0042, + "reward": 1.354824185371399, + "reward_std": 0.553050696849823, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.9633399248123169, + "step": 622 + }, + { + "completion_length": 363.1875, + "epoch": 0.9068413391557496, + "grad_norm": 0.877438715066185, + "kl": 0.04443359375, + "learning_rate": 2.2612587035573226e-08, + "loss": 0.0024, + "reward": 0.77873694896698, + "reward_std": 0.24482090771198273, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.9492838382720947, + "step": 623 + }, + { + "completion_length": 377.453125, + "epoch": 0.9082969432314411, + "grad_norm": 0.9221142915068276, + "kl": 0.04150390625, + "learning_rate": 2.1916632630374577e-08, + "loss": 0.0007, + "reward": 1.2329556941986084, + "reward_std": 0.8266670107841492, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.8883593082427979, + "step": 624 + }, + { + "completion_length": 357.9375, + "epoch": 0.9097525473071325, + "grad_norm": 1.0177312459104024, + "kl": 0.048583984375, + "learning_rate": 2.123131608140455e-08, + "loss": -0.001, + "reward": 1.74072265625, + "reward_std": 0.59820157289505, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.92822265625, + "step": 625 + }, + { + "completion_length": 352.171875, + "epoch": 0.9112081513828238, + "grad_norm": 0.8169726421475969, + "kl": 0.051025390625, + "learning_rate": 2.0556652637675144e-08, + "loss": -0.0005, + "reward": 1.4498176574707031, + "reward_std": 0.48735594749450684, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9654426574707031, + "step": 626 + }, + { + "completion_length": 347.40625, + "epoch": 0.9126637554585153, + "grad_norm": 1.043701865873264, + "kl": 0.049072265625, + "learning_rate": 1.989265731115525e-08, + "loss": 0.003, + "reward": 1.333925724029541, + "reward_std": 0.8411956429481506, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.9422720670700073, + "step": 627 + }, + { + "completion_length": 363.640625, + "epoch": 0.9141193595342066, + "grad_norm": 0.9420184278378598, + "kl": 0.047607421875, + "learning_rate": 1.9239344876437248e-08, + "loss": 0.0015, + "reward": 1.0000911951065063, + "reward_std": 0.5910583138465881, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9331510663032532, + "step": 628 + }, + { + "completion_length": 355.1875, + "epoch": 0.9155749636098981, + "grad_norm": 0.9554452056503712, + "kl": 0.052978515625, + "learning_rate": 1.8596729870407835e-08, + "loss": 0.0032, + "reward": 1.8222005367279053, + "reward_std": 0.49411827325820923, + "rewards/accuracy_reward": 0.953125, + "rewards/format_reward": 0.9628255367279053, + "step": 629 + }, + { + "completion_length": 347.140625, + "epoch": 0.9170305676855895, + "grad_norm": 0.9306151510914769, + "kl": 0.046142578125, + "learning_rate": 1.796482659192472e-08, + "loss": 0.0041, + "reward": 1.479524850845337, + "reward_std": 0.41134506464004517, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9424023032188416, + "step": 630 + }, + { + "completion_length": 374.625, + "epoch": 0.9184861717612809, + "grad_norm": 0.8480676856677177, + "kl": 0.047119140625, + "learning_rate": 1.7343649101498327e-08, + "loss": 0.0028, + "reward": 1.3734700679779053, + "reward_std": 0.3788111209869385, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9359700679779053, + "step": 631 + }, + { + "completion_length": 353.765625, + "epoch": 0.9199417758369723, + "grad_norm": 0.8894056614696153, + "kl": 0.0595703125, + "learning_rate": 1.6733211220979315e-08, + "loss": -0.0011, + "reward": 0.7032226324081421, + "reward_std": 0.6665799617767334, + "rewards/accuracy_reward": 0.59375, + "rewards/format_reward": 0.9133919477462769, + "step": 632 + }, + { + "completion_length": 354.5, + "epoch": 0.9213973799126638, + "grad_norm": 1.2302918226628299, + "kl": 0.0478515625, + "learning_rate": 1.6133526533250563e-08, + "loss": -0.0005, + "reward": 1.0562500953674316, + "reward_std": 0.6852624416351318, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9466797113418579, + "step": 633 + }, + { + "completion_length": 364.296875, + "epoch": 0.9228529839883551, + "grad_norm": 0.794210762993565, + "kl": 0.042236328125, + "learning_rate": 1.5544608381925285e-08, + "loss": 0.0023, + "reward": 1.3560612201690674, + "reward_std": 0.5529812574386597, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.9654362201690674, + "step": 634 + }, + { + "completion_length": 375.125, + "epoch": 0.9243085880640466, + "grad_norm": 1.1478859293196242, + "kl": 0.046630859375, + "learning_rate": 1.4966469871049604e-08, + "loss": -0.0025, + "reward": 0.9822721481323242, + "reward_std": 0.7605947256088257, + "rewards/accuracy_reward": 0.6875, + "rewards/format_reward": 0.9166861772537231, + "step": 635 + }, + { + "completion_length": 367.328125, + "epoch": 0.925764192139738, + "grad_norm": 0.9763592041679303, + "kl": 0.0458984375, + "learning_rate": 1.4399123864811902e-08, + "loss": 0.0015, + "reward": 1.4207422733306885, + "reward_std": 0.8127451539039612, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9341145753860474, + "step": 636 + }, + { + "completion_length": 350.515625, + "epoch": 0.9272197962154294, + "grad_norm": 1.133505236634761, + "kl": 0.053466796875, + "learning_rate": 1.384258298725549e-08, + "loss": -0.002, + "reward": 1.8829882144927979, + "reward_std": 0.3248525857925415, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.9767382740974426, + "step": 637 + }, + { + "completion_length": 362.71875, + "epoch": 0.9286754002911208, + "grad_norm": 0.9584500222009057, + "kl": 0.0439453125, + "learning_rate": 1.3296859621998668e-08, + "loss": 0.0008, + "reward": 1.1840624809265137, + "reward_std": 0.1869007647037506, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9802994728088379, + "step": 638 + }, + { + "completion_length": 351.375, + "epoch": 0.9301310043668122, + "grad_norm": 1.10049930360626, + "kl": 0.051513671875, + "learning_rate": 1.2761965911958384e-08, + "loss": 0.0002, + "reward": 1.2603776454925537, + "reward_std": 0.6707699298858643, + "rewards/accuracy_reward": 0.78125, + "rewards/format_reward": 0.9109375476837158, + "step": 639 + }, + { + "completion_length": 366.0625, + "epoch": 0.9315866084425036, + "grad_norm": 1.1123121973401144, + "kl": 0.047607421875, + "learning_rate": 1.2237913759080676e-08, + "loss": 0.001, + "reward": 0.48548179864883423, + "reward_std": 0.06013864278793335, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.9767447710037231, + "step": 640 + }, + { + "completion_length": 366.96875, + "epoch": 0.9330422125181951, + "grad_norm": 1.0164203486015158, + "kl": 0.047607421875, + "learning_rate": 1.1724714824075332e-08, + "loss": -0.0036, + "reward": 0.3951367139816284, + "reward_std": 0.7576174736022949, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.931347668170929, + "step": 641 + }, + { + "completion_length": 360.703125, + "epoch": 0.9344978165938864, + "grad_norm": 0.899137742081826, + "kl": 0.045654296875, + "learning_rate": 1.1222380526156927e-08, + "loss": -0.0016, + "reward": 0.7644987106323242, + "reward_std": 0.2871595323085785, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.9332357048988342, + "step": 642 + }, + { + "completion_length": 343.53125, + "epoch": 0.9359534206695779, + "grad_norm": 1.1461565307561832, + "kl": 0.05078125, + "learning_rate": 1.073092204279019e-08, + "loss": -0.0061, + "reward": 1.073411464691162, + "reward_std": 0.4045974314212799, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9620703458786011, + "step": 643 + }, + { + "completion_length": 359.09375, + "epoch": 0.9374090247452693, + "grad_norm": 0.9684956712684953, + "kl": 0.05224609375, + "learning_rate": 1.0250350309441825e-08, + "loss": 0.0023, + "reward": 1.4904427528381348, + "reward_std": 0.36797118186950684, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9588932394981384, + "step": 644 + }, + { + "completion_length": 355.78125, + "epoch": 0.9388646288209607, + "grad_norm": 1.0295786196933356, + "kl": 0.0498046875, + "learning_rate": 9.780676019336632e-09, + "loss": -0.0004, + "reward": 1.1840624809265137, + "reward_std": 0.1880166232585907, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9797786474227905, + "step": 645 + }, + { + "completion_length": 341.78125, + "epoch": 0.9403202328966521, + "grad_norm": 0.8966605070859263, + "kl": 0.048095703125, + "learning_rate": 9.32190962322027e-09, + "loss": 0.0022, + "reward": 1.1638997793197632, + "reward_std": 0.4949903190135956, + "rewards/accuracy_reward": 0.734375, + "rewards/format_reward": 0.9582747220993042, + "step": 646 + }, + { + "completion_length": 350.515625, + "epoch": 0.9417758369723436, + "grad_norm": 1.1365206921282076, + "kl": 0.050537109375, + "learning_rate": 8.874061329125936e-09, + "loss": 0.0033, + "reward": 0.8706445097923279, + "reward_std": 0.02185887284576893, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9920898675918579, + "step": 647 + }, + { + "completion_length": 368.203125, + "epoch": 0.9432314410480349, + "grad_norm": 0.9073495313031953, + "kl": 0.048095703125, + "learning_rate": 8.437141102147882e-09, + "loss": -0.0029, + "reward": 0.3735416531562805, + "reward_std": 0.5898939967155457, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.9196093678474426, + "step": 648 + }, + { + "completion_length": 355.296875, + "epoch": 0.9446870451237264, + "grad_norm": 0.9644518486493937, + "kl": 0.04638671875, + "learning_rate": 8.011158664219253e-09, + "loss": 0.0028, + "reward": 1.4891471862792969, + "reward_std": 0.37131497263908386, + "rewards/accuracy_reward": 0.84375, + "rewards/format_reward": 0.9561783671379089, + "step": 649 + }, + { + "completion_length": 357.859375, + "epoch": 0.9461426491994177, + "grad_norm": 0.915855189506075, + "kl": 0.052001953125, + "learning_rate": 7.59612349389599e-09, + "loss": -0.0002, + "reward": 1.8685481548309326, + "reward_std": 0.24537202715873718, + "rewards/accuracy_reward": 0.96875, + "rewards/format_reward": 0.9622981548309326, + "step": 650 + }, + { + "completion_length": 344.15625, + "epoch": 0.9475982532751092, + "grad_norm": 0.9405891712802695, + "kl": 0.050537109375, + "learning_rate": 7.1920448261457715e-09, + "loss": -0.0006, + "reward": 0.40004557371139526, + "reward_std": 0.2768644690513611, + "rewards/accuracy_reward": 0.484375, + "rewards/format_reward": 0.9441732168197632, + "step": 651 + }, + { + "completion_length": 374.21875, + "epoch": 0.9490538573508006, + "grad_norm": 0.9649816496242495, + "kl": 0.05078125, + "learning_rate": 6.798931652142737e-09, + "loss": -0.0019, + "reward": 0.29408854246139526, + "reward_std": 0.7213122248649597, + "rewards/accuracy_reward": 0.453125, + "rewards/format_reward": 0.9326822757720947, + "step": 652 + }, + { + "completion_length": 367.28125, + "epoch": 0.950509461426492, + "grad_norm": 0.9385056166107855, + "kl": 0.05224609375, + "learning_rate": 6.416792719067143e-09, + "loss": 0.0026, + "reward": 0.6902929544448853, + "reward_std": 0.43664127588272095, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.9491862058639526, + "step": 653 + }, + { + "completion_length": 349.265625, + "epoch": 0.9519650655021834, + "grad_norm": 1.1227123204211185, + "kl": 0.05419921875, + "learning_rate": 6.045636529911025e-09, + "loss": 0.0015, + "reward": 0.9635221362113953, + "reward_std": 0.18498189747333527, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.9879622459411621, + "step": 654 + }, + { + "completion_length": 356.90625, + "epoch": 0.9534206695778749, + "grad_norm": 0.9808379639308646, + "kl": 0.05859375, + "learning_rate": 5.685471343288672e-09, + "loss": 0.0018, + "reward": 1.1685742139816284, + "reward_std": 0.8473724722862244, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.8689258098602295, + "step": 655 + }, + { + "completion_length": 367.015625, + "epoch": 0.9548762736535662, + "grad_norm": 0.9461780907460997, + "kl": 0.05078125, + "learning_rate": 5.33630517325323e-09, + "loss": 0.0026, + "reward": 0.7643359303474426, + "reward_std": 0.6334630250930786, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.9341406226158142, + "step": 656 + }, + { + "completion_length": 354.828125, + "epoch": 0.9563318777292577, + "grad_norm": 1.0187044029454269, + "kl": 0.056396484375, + "learning_rate": 4.998145789118114e-09, + "loss": 0.0004, + "reward": 0.34561195969581604, + "reward_std": 0.3423462212085724, + "rewards/accuracy_reward": 0.46875, + "rewards/format_reward": 0.9248437285423279, + "step": 657 + }, + { + "completion_length": 356.8125, + "epoch": 0.9577874818049491, + "grad_norm": 0.9264971002008862, + "kl": 0.0478515625, + "learning_rate": 4.671000715284146e-09, + "loss": 0.0036, + "reward": 1.7602018117904663, + "reward_std": 0.6684818267822266, + "rewards/accuracy_reward": 0.9375, + "rewards/format_reward": 0.9477018117904663, + "step": 658 + }, + { + "completion_length": 360.484375, + "epoch": 0.9592430858806404, + "grad_norm": 0.899630251698082, + "kl": 0.048583984375, + "learning_rate": 4.354877231072307e-09, + "loss": 0.0017, + "reward": 0.73576819896698, + "reward_std": 0.8185403347015381, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.8940234184265137, + "step": 659 + }, + { + "completion_length": 368.375, + "epoch": 0.9606986899563319, + "grad_norm": 0.7875268897862577, + "kl": 0.04345703125, + "learning_rate": 4.049782370561583e-09, + "loss": 0.0017, + "reward": 0.9524999856948853, + "reward_std": 0.8121271133422852, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9330078363418579, + "step": 660 + }, + { + "completion_length": 359.25, + "epoch": 0.9621542940320232, + "grad_norm": 1.0118312486353827, + "kl": 0.046630859375, + "learning_rate": 3.755722922432481e-09, + "loss": -0.0042, + "reward": 1.806471347808838, + "reward_std": 0.41752493381500244, + "rewards/accuracy_reward": 0.953125, + "rewards/format_reward": 0.9470964074134827, + "step": 661 + }, + { + "completion_length": 365.0, + "epoch": 0.9636098981077147, + "grad_norm": 1.029349953751044, + "kl": 0.048828125, + "learning_rate": 3.4727054298161473e-09, + "loss": 0.0019, + "reward": 0.17289060354232788, + "reward_std": 0.7982980012893677, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 0.8995833396911621, + "step": 662 + }, + { + "completion_length": 381.40625, + "epoch": 0.9650655021834061, + "grad_norm": 0.8922375425023275, + "kl": 0.0478515625, + "learning_rate": 3.200736190148545e-09, + "loss": 0.0037, + "reward": 1.12339186668396, + "reward_std": 0.8738340139389038, + "rewards/accuracy_reward": 0.75, + "rewards/format_reward": 0.8733919262886047, + "step": 663 + }, + { + "completion_length": 347.40625, + "epoch": 0.9665211062590975, + "grad_norm": 1.046079535967673, + "kl": 0.050537109375, + "learning_rate": 2.9398212550303945e-09, + "loss": 0.0013, + "reward": 1.3840559720993042, + "reward_std": 0.5832023024559021, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9441732168197632, + "step": 664 + }, + { + "completion_length": 339.375, + "epoch": 0.9679767103347889, + "grad_norm": 0.9349120246936703, + "kl": 0.048095703125, + "learning_rate": 2.6899664300925607e-09, + "loss": -0.0006, + "reward": 0.6614192724227905, + "reward_std": 0.7153116464614868, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.9226171970367432, + "step": 665 + }, + { + "completion_length": 369.828125, + "epoch": 0.9694323144104804, + "grad_norm": 0.8189860245778907, + "kl": 0.046875, + "learning_rate": 2.451177274866989e-09, + "loss": -0.0006, + "reward": 0.96561199426651, + "reward_std": 0.43986329436302185, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9498568177223206, + "step": 666 + }, + { + "completion_length": 359.171875, + "epoch": 0.9708879184861717, + "grad_norm": 0.9437761108308588, + "kl": 0.0478515625, + "learning_rate": 2.2234591026626946e-09, + "loss": -0.0038, + "reward": 0.9382357001304626, + "reward_std": 0.76799476146698, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9165430068969727, + "step": 667 + }, + { + "completion_length": 378.453125, + "epoch": 0.9723435225618632, + "grad_norm": 0.9460750716630619, + "kl": 0.044677734375, + "learning_rate": 2.0068169804478564e-09, + "loss": 0.0022, + "reward": 0.8572070002555847, + "reward_std": 0.639157772064209, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.9332357048988342, + "step": 668 + }, + { + "completion_length": 324.75, + "epoch": 0.9737991266375546, + "grad_norm": 1.0371341680952215, + "kl": 0.04833984375, + "learning_rate": 1.8012557287367391e-09, + "loss": -0.0001, + "reward": 0.5965365171432495, + "reward_std": 0.1833423525094986, + "rewards/accuracy_reward": 0.53125, + "rewards/format_reward": 0.9920833706855774, + "step": 669 + }, + { + "completion_length": 347.953125, + "epoch": 0.975254730713246, + "grad_norm": 1.00932152995975, + "kl": 0.056640625, + "learning_rate": 1.6067799214828926e-09, + "loss": 0.0008, + "reward": 1.5535807609558105, + "reward_std": 0.20138989388942719, + "rewards/accuracy_reward": 0.859375, + "rewards/format_reward": 0.9731640815734863, + "step": 670 + }, + { + "completion_length": 342.890625, + "epoch": 0.9767103347889374, + "grad_norm": 0.9803179005691086, + "kl": 0.05224609375, + "learning_rate": 1.4233938859767868e-09, + "loss": 0.0033, + "reward": 1.337246060371399, + "reward_std": 0.5977544784545898, + "rewards/accuracy_reward": 0.796875, + "rewards/format_reward": 0.9449414610862732, + "step": 671 + }, + { + "completion_length": 361.375, + "epoch": 0.9781659388646288, + "grad_norm": 0.9610606033508311, + "kl": 0.0498046875, + "learning_rate": 1.251101702750168e-09, + "loss": 0.004, + "reward": 0.9087890386581421, + "reward_std": 0.7360955476760864, + "rewards/accuracy_reward": 0.65625, + "rewards/format_reward": 0.9337239265441895, + "step": 672 + }, + { + "completion_length": 357.65625, + "epoch": 0.9796215429403202, + "grad_norm": 0.9783219664641567, + "kl": 0.048095703125, + "learning_rate": 1.0899072054846303e-09, + "loss": 0.0021, + "reward": 1.361875057220459, + "reward_std": 0.5983988046646118, + "rewards/accuracy_reward": 0.8125, + "rewards/format_reward": 0.9243749380111694, + "step": 673 + }, + { + "completion_length": 353.78125, + "epoch": 0.9810771470160117, + "grad_norm": 0.8663994288006219, + "kl": 0.05712890625, + "learning_rate": 9.398139809268514e-10, + "loss": 0.0056, + "reward": 1.0991926193237305, + "reward_std": 0.4083336889743805, + "rewards/accuracy_reward": 0.71875, + "rewards/format_reward": 0.9410547018051147, + "step": 674 + }, + { + "completion_length": 369.703125, + "epoch": 0.982532751091703, + "grad_norm": 1.0224494770074284, + "kl": 0.047119140625, + "learning_rate": 8.008253688084887e-10, + "loss": -0.0016, + "reward": 0.15850260853767395, + "reward_std": 0.9653068780899048, + "rewards/accuracy_reward": 0.421875, + "rewards/format_reward": 0.8858072757720947, + "step": 675 + }, + { + "completion_length": 349.8125, + "epoch": 0.9839883551673945, + "grad_norm": 1.0418234894905996, + "kl": 0.047607421875, + "learning_rate": 6.729444617717961e-10, + "loss": -0.0003, + "reward": 1.2996224164962769, + "reward_std": 0.4023718535900116, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.9947004914283752, + "step": 676 + }, + { + "completion_length": 363.703125, + "epoch": 0.9854439592430859, + "grad_norm": 1.0444763973789726, + "kl": 0.046875, + "learning_rate": 5.56174105301066e-10, + "loss": 0.0024, + "reward": 0.9435481429100037, + "reward_std": 0.7777630090713501, + "rewards/accuracy_reward": 0.671875, + "rewards/format_reward": 0.9277669191360474, + "step": 677 + }, + { + "completion_length": 343.484375, + "epoch": 0.9868995633187773, + "grad_norm": 0.97449213723852, + "kl": 0.052734375, + "learning_rate": 4.5051689765929213e-10, + "loss": -0.0009, + "reward": 0.13971352577209473, + "reward_std": 0.23464931547641754, + "rewards/accuracy_reward": 0.390625, + "rewards/format_reward": 0.9606120586395264, + "step": 678 + }, + { + "completion_length": 360.53125, + "epoch": 0.9883551673944687, + "grad_norm": 0.9192762084564559, + "kl": 0.050048828125, + "learning_rate": 3.559751898299934e-10, + "loss": 0.0019, + "reward": 1.4475326538085938, + "reward_std": 0.4938472509384155, + "rewards/accuracy_reward": 0.828125, + "rewards/format_reward": 0.9631575345993042, + "step": 679 + }, + { + "completion_length": 362.625, + "epoch": 0.9898107714701602, + "grad_norm": 0.7829504697419019, + "kl": 0.046875, + "learning_rate": 2.725510854653668e-10, + "loss": 0.0006, + "reward": 0.9168750047683716, + "reward_std": 0.1419457346200943, + "rewards/accuracy_reward": 0.640625, + "rewards/format_reward": 0.9950000047683716, + "step": 680 + }, + { + "completion_length": 345.0625, + "epoch": 0.9912663755458515, + "grad_norm": 1.0399738160256335, + "kl": 0.060302734375, + "learning_rate": 2.002464408392135e-10, + "loss": 0.0024, + "reward": 1.6794726848602295, + "reward_std": 0.7667683959007263, + "rewards/accuracy_reward": 0.921875, + "rewards/format_reward": 0.9138476848602295, + "step": 681 + }, + { + "completion_length": 361.859375, + "epoch": 0.992721979621543, + "grad_norm": 0.9175441007019236, + "kl": 0.048828125, + "learning_rate": 1.390628648056391e-10, + "loss": 0.001, + "reward": 0.7809114456176758, + "reward_std": 0.5863924622535706, + "rewards/accuracy_reward": 0.609375, + "rewards/format_reward": 0.9467839002609253, + "step": 682 + }, + { + "completion_length": 361.796875, + "epoch": 0.9941775836972343, + "grad_norm": 0.9367681869995809, + "kl": 0.044677734375, + "learning_rate": 8.900171876341511e-11, + "loss": 0.0024, + "reward": 0.8453580737113953, + "reward_std": 0.8822904229164124, + "rewards/accuracy_reward": 0.625, + "rewards/format_reward": 0.9622981548309326, + "step": 683 + }, + { + "completion_length": 360.484375, + "epoch": 0.9956331877729258, + "grad_norm": 0.9988321881169316, + "kl": 0.046630859375, + "learning_rate": 5.006411662555887e-11, + "loss": -0.0066, + "reward": 0.6559830904006958, + "reward_std": 0.5935498476028442, + "rewards/accuracy_reward": 0.578125, + "rewards/format_reward": 0.9169206023216248, + "step": 684 + }, + { + "completion_length": 344.28125, + "epoch": 0.9970887918486172, + "grad_norm": 1.0544057765799288, + "kl": 0.049072265625, + "learning_rate": 2.2250924794520175e-11, + "loss": 0.0023, + "reward": 1.2797396183013916, + "reward_std": 0.35647517442703247, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.9755468368530273, + "step": 685 + }, + { + "completion_length": 370.171875, + "epoch": 0.9985443959243085, + "grad_norm": 0.872227207794643, + "kl": 0.04541015625, + "learning_rate": 5.562762142974353e-12, + "loss": 0.0014, + "reward": 1.0113476514816284, + "reward_std": 0.7452950477600098, + "rewards/accuracy_reward": 0.703125, + "rewards/format_reward": 0.9019725918769836, + "step": 686 + }, + { + "completion_length": 382.890625, + "epoch": 1.0, + "grad_norm": 0.9830707932910157, + "kl": 0.042724609375, + "learning_rate": 0.0, + "loss": 0.0014, + "reward": 1.15053391456604, + "reward_std": 1.0555355548858643, + "rewards/accuracy_reward": 0.765625, + "rewards/format_reward": 0.8536588549613953, + "step": 687 + }, + { + "epoch": 1.0, + "step": 687, + "total_flos": 0.0, + "train_loss": 0.00033314188768961123, + "train_runtime": 116282.9812, + "train_samples_per_second": 0.047, + "train_steps_per_second": 0.006 + } + ], + "logging_steps": 1.0, + "max_steps": 687, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}