diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6607 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "episode": 20032, + "epoch": 3.7803359124363087, + "eval_steps": 20, + "global_step": 313, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "episode": 64, + "epoch": 0.012077750518965842, + "eps": 0, + "loss/policy_avg": -0.0021184529177844524, + "loss/value_avg": 0.9311372637748718, + "lr": 0.0, + "objective/entropy": -600.715087890625, + "objective/kl": 0.46257561445236206, + "objective/non_score_reward": -0.013877267949283123, + "objective/rlhf_reward": 0.384560227394104, + "objective/scores": 0.3984375, + "policy/approxkl_avg": 0.00044652423821389675, + "policy/clipfrac_avg": 0.005666394717991352, + "policy/entropy_avg": 0.21374297142028809, + "step": 1, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 1.000056505203247, + "val/ratio_var": 4.3476825339894276e-07 + }, + { + "episode": 128, + "epoch": 0.024155501037931685, + "eps": 0, + "loss/policy_avg": -0.0030812141485512257, + "loss/value_avg": 0.8693833351135254, + "lr": 3.125e-08, + "objective/entropy": -595.1883544921875, + "objective/kl": 0.6688432097434998, + "objective/non_score_reward": -0.020065294578671455, + "objective/rlhf_reward": 0.3813018798828125, + "objective/scores": 0.40234375, + "policy/approxkl_avg": 0.0004370739625301212, + "policy/clipfrac_avg": 0.006835754029452801, + "policy/entropy_avg": 0.21932220458984375, + "step": 2, + "val/clipfrac_avg": 0.00023904947738628834, + "val/num_eos_tokens": 49, + "val/ratio": 0.9999904632568359, + "val/ratio_var": 5.597846097771253e-07 + }, + { + "episode": 192, + "epoch": 0.03623325155689753, + "eps": 0, + "loss/policy_avg": -0.0009602411882951856, + "loss/value_avg": 0.9131457209587097, + "lr": 6.25e-08, + "objective/entropy": -561.6600341796875, + "objective/kl": 0.7238848805427551, + "objective/non_score_reward": -0.021716546267271042, + "objective/rlhf_reward": 0.39674046635627747, + "objective/scores": 0.41796875, + "policy/approxkl_avg": 0.00045788957504555583, + "policy/clipfrac_avg": 0.007044724188745022, + "policy/entropy_avg": 0.2180023193359375, + "step": 3, + "val/clipfrac_avg": 0.00015014366363175213, + "val/num_eos_tokens": 50, + "val/ratio": 1.0000163316726685, + "val/ratio_var": 5.647702892019879e-07 + }, + { + "episode": 256, + "epoch": 0.04831100207586337, + "eps": 0, + "loss/policy_avg": -0.0013294187374413013, + "loss/value_avg": 0.9107441902160645, + "lr": 9.375e-08, + "objective/entropy": -489.9579162597656, + "objective/kl": 0.710690438747406, + "objective/non_score_reward": -0.021320713683962822, + "objective/rlhf_reward": 0.2943531274795532, + "objective/scores": 0.31640625, + "policy/approxkl_avg": 0.0008129056077450514, + "policy/clipfrac_avg": 0.0068802861496806145, + "policy/entropy_avg": 0.20705923438072205, + "step": 4, + "val/clipfrac_avg": 0.00046966708032414317, + "val/num_eos_tokens": 36, + "val/ratio": 0.999975860118866, + "val/ratio_var": 7.655679041818075e-07 + }, + { + "episode": 320, + "epoch": 0.06038875259482921, + "eps": 0, + "loss/policy_avg": -0.0032467995770275593, + "loss/value_avg": 0.8995952606201172, + "lr": 1.25e-07, + "objective/entropy": -685.2054443359375, + "objective/kl": 0.3006611764431, + "objective/non_score_reward": -0.00901983492076397, + "objective/rlhf_reward": 0.4690075218677521, + "objective/scores": 0.478515625, + "policy/approxkl_avg": 0.0003806678578257561, + "policy/clipfrac_avg": 0.006184540688991547, + "policy/entropy_avg": 0.2066497802734375, + "step": 5, + "val/clipfrac_avg": 0.0001254370145034045, + "val/num_eos_tokens": 42, + "val/ratio": 1.0000379085540771, + "val/ratio_var": 4.958400268151308e-07 + }, + { + "episode": 384, + "epoch": 0.07246650311379506, + "eps": 0, + "loss/policy_avg": -0.00029869808349758387, + "loss/value_avg": 0.9305676221847534, + "lr": 1.5624999999999999e-07, + "objective/entropy": -588.39697265625, + "objective/kl": 0.5641751885414124, + "objective/non_score_reward": -0.016925256699323654, + "objective/rlhf_reward": 0.39762550592422485, + "objective/scores": 0.4140625, + "policy/approxkl_avg": 0.00041988492012023926, + "policy/clipfrac_avg": 0.006766438018530607, + "policy/entropy_avg": 0.20317253470420837, + "step": 6, + "val/clipfrac_avg": 0.00019526462710928172, + "val/num_eos_tokens": 45, + "val/ratio": 0.9999103546142578, + "val/ratio_var": 6.99273925874877e-07 + }, + { + "episode": 448, + "epoch": 0.0845442536327609, + "eps": 0, + "loss/policy_avg": -0.0019068828551098704, + "loss/value_avg": 0.8919577598571777, + "lr": 1.875e-07, + "objective/entropy": -614.7843017578125, + "objective/kl": 0.33637887239456177, + "objective/non_score_reward": -0.010091365315020084, + "objective/rlhf_reward": 0.3663734793663025, + "objective/scores": 0.376953125, + "policy/approxkl_avg": 0.0004250165948178619, + "policy/clipfrac_avg": 0.0070974379777908325, + "policy/entropy_avg": 0.2131398618221283, + "step": 7, + "val/clipfrac_avg": 0.00019152543973177671, + "val/num_eos_tokens": 43, + "val/ratio": 1.000044822692871, + "val/ratio_var": 5.280454047351668e-07 + }, + { + "episode": 512, + "epoch": 0.09662200415172674, + "eps": 0, + "loss/policy_avg": -0.003216695738956332, + "loss/value_avg": 0.8838874101638794, + "lr": 2.1875e-07, + "objective/entropy": -576.663330078125, + "objective/kl": 0.7862333059310913, + "objective/non_score_reward": -0.023586997762322426, + "objective/rlhf_reward": 0.3934051990509033, + "objective/scores": 0.41796875, + "policy/approxkl_avg": 0.000438332324847579, + "policy/clipfrac_avg": 0.006604321300983429, + "policy/entropy_avg": 0.20785841345787048, + "step": 8, + "val/clipfrac_avg": 0.0003107336815446615, + "val/num_eos_tokens": 36, + "val/ratio": 0.9998782873153687, + "val/ratio_var": 6.208914555827505e-07 + }, + { + "episode": 576, + "epoch": 0.10869975467069258, + "eps": 0, + "loss/policy_avg": -0.002265141811221838, + "loss/value_avg": 0.869255542755127, + "lr": 2.5e-07, + "objective/entropy": -627.076171875, + "objective/kl": 0.32534003257751465, + "objective/non_score_reward": -0.009760200046002865, + "objective/rlhf_reward": 0.39697808027267456, + "objective/scores": 0.40625, + "policy/approxkl_avg": 0.00039745302638038993, + "policy/clipfrac_avg": 0.006331109441816807, + "policy/entropy_avg": 0.19233450293540955, + "step": 9, + "val/clipfrac_avg": 0.00015899499703664333, + "val/num_eos_tokens": 41, + "val/ratio": 1.0001146793365479, + "val/ratio_var": 7.010530111983826e-07 + }, + { + "episode": 640, + "epoch": 0.12077750518965842, + "eps": 0, + "loss/policy_avg": -0.0026860858779400587, + "loss/value_avg": 0.8342069387435913, + "lr": 2.8125e-07, + "objective/entropy": -670.0674438476562, + "objective/kl": 0.7622801661491394, + "objective/non_score_reward": -0.022868404164910316, + "objective/rlhf_reward": 0.38533473014831543, + "objective/scores": 0.408203125, + "policy/approxkl_avg": 0.0003712985198944807, + "policy/clipfrac_avg": 0.006433566566556692, + "policy/entropy_avg": 0.20289739966392517, + "step": 10, + "val/clipfrac_avg": 0.027581503614783287, + "val/num_eos_tokens": 45, + "val/ratio": 0.999953031539917, + "val/ratio_var": 6.136452270766313e-07 + }, + { + "episode": 704, + "epoch": 0.13285525570862428, + "eps": 0, + "loss/policy_avg": -0.008414413779973984, + "loss/value_avg": 0.7493016123771667, + "lr": 3.1249999999999997e-07, + "objective/entropy": -643.5463256835938, + "objective/kl": 0.8814950585365295, + "objective/non_score_reward": -0.026444854214787483, + "objective/rlhf_reward": 0.42033249139785767, + "objective/scores": 0.447265625, + "policy/approxkl_avg": 0.0004149469896219671, + "policy/clipfrac_avg": 0.0073681240901350975, + "policy/entropy_avg": 0.21221670508384705, + "step": 11, + "val/clipfrac_avg": 0.0004212568746879697, + "val/num_eos_tokens": 36, + "val/ratio": 0.9999436736106873, + "val/ratio_var": 6.905478926455544e-07 + }, + { + "episode": 768, + "epoch": 0.14493300622759012, + "eps": 0, + "loss/policy_avg": -0.008181717246770859, + "loss/value_avg": 0.7314225435256958, + "lr": 3.4375e-07, + "objective/entropy": -599.110595703125, + "objective/kl": 0.7627489566802979, + "objective/non_score_reward": -0.02288246899843216, + "objective/rlhf_reward": 0.44879722595214844, + "objective/scores": 0.47265625, + "policy/approxkl_avg": 0.00046156253665685654, + "policy/clipfrac_avg": 0.0068002426996827126, + "policy/entropy_avg": 0.2235361784696579, + "step": 12, + "val/clipfrac_avg": 0.00017142172146122903, + "val/num_eos_tokens": 41, + "val/ratio": 0.9999423027038574, + "val/ratio_var": 8.461731226816482e-07 + }, + { + "episode": 832, + "epoch": 0.15701075674655596, + "eps": 0, + "loss/policy_avg": -0.006054941564798355, + "loss/value_avg": 0.7129493951797485, + "lr": 3.75e-07, + "objective/entropy": -625.335693359375, + "objective/kl": 0.9311845898628235, + "objective/non_score_reward": -0.027935536578297615, + "objective/rlhf_reward": 0.3563418388366699, + "objective/scores": 0.384765625, + "policy/approxkl_avg": 0.0003990530385635793, + "policy/clipfrac_avg": 0.006634948309510946, + "policy/entropy_avg": 0.20673498511314392, + "step": 13, + "val/clipfrac_avg": 0.00031250983010977507, + "val/num_eos_tokens": 46, + "val/ratio": 0.9999412298202515, + "val/ratio_var": 6.676891075585445e-07 + }, + { + "episode": 896, + "epoch": 0.1690885072655218, + "eps": 0, + "loss/policy_avg": -0.007185523398220539, + "loss/value_avg": 0.663692831993103, + "lr": 4.0625e-07, + "objective/entropy": -611.6224365234375, + "objective/kl": 0.598267674446106, + "objective/non_score_reward": -0.017948029562830925, + "objective/rlhf_reward": 0.3819543123245239, + "objective/scores": 0.400390625, + "policy/approxkl_avg": 0.0004264025192242116, + "policy/clipfrac_avg": 0.0068409196101129055, + "policy/entropy_avg": 0.20851516723632812, + "step": 14, + "val/clipfrac_avg": 0.17968440055847168, + "val/num_eos_tokens": 45, + "val/ratio": 0.9998716115951538, + "val/ratio_var": 8.126443162836949e-07 + }, + { + "episode": 960, + "epoch": 0.18116625778448764, + "eps": 0, + "loss/policy_avg": -0.01342801284044981, + "loss/value_avg": 0.5221339464187622, + "lr": 4.375e-07, + "objective/entropy": -577.73486328125, + "objective/kl": 1.043047547340393, + "objective/non_score_reward": -0.031291425228118896, + "objective/rlhf_reward": 0.3266187310218811, + "objective/scores": 0.357421875, + "policy/approxkl_avg": 0.00048208641237579286, + "policy/clipfrac_avg": 0.00789344497025013, + "policy/entropy_avg": 0.21881103515625, + "step": 15, + "val/clipfrac_avg": 0.015999414026737213, + "val/num_eos_tokens": 44, + "val/ratio": 1.0000479221343994, + "val/ratio_var": 7.270523383340333e-07 + }, + { + "episode": 1024, + "epoch": 0.19324400830345348, + "eps": 0, + "loss/policy_avg": -0.014559760689735413, + "loss/value_avg": 0.4795520305633545, + "lr": 4.6874999999999996e-07, + "objective/entropy": -704.927978515625, + "objective/kl": 1.3000061511993408, + "objective/non_score_reward": -0.03900018334388733, + "objective/rlhf_reward": 0.39117559790611267, + "objective/scores": 0.4296875, + "policy/approxkl_avg": 0.00035720731830224395, + "policy/clipfrac_avg": 0.006962340325117111, + "policy/entropy_avg": 0.19104096293449402, + "step": 16, + "val/clipfrac_avg": 0.0004542362876236439, + "val/num_eos_tokens": 46, + "val/ratio": 0.9999203681945801, + "val/ratio_var": 6.327572918962687e-07 + }, + { + "episode": 1088, + "epoch": 0.20532175882241932, + "eps": 0, + "loss/policy_avg": -0.017483970150351524, + "loss/value_avg": 0.4525485634803772, + "lr": 5e-07, + "objective/entropy": -632.856689453125, + "objective/kl": 1.6142749786376953, + "objective/non_score_reward": -0.0484282523393631, + "objective/rlhf_reward": 0.3153412640094757, + "objective/scores": 0.36328125, + "policy/approxkl_avg": 0.0004206376615911722, + "policy/clipfrac_avg": 0.007758093532174826, + "policy/entropy_avg": 0.20397186279296875, + "step": 17, + "val/clipfrac_avg": 0.001352960942313075, + "val/num_eos_tokens": 55, + "val/ratio": 1.0000765323638916, + "val/ratio_var": 6.663805720563687e-07 + }, + { + "episode": 1152, + "epoch": 0.21739950934138516, + "eps": 0, + "loss/policy_avg": -0.01164332777261734, + "loss/value_avg": 0.414880633354187, + "lr": 4.983164983164983e-07, + "objective/entropy": -660.91943359375, + "objective/kl": 2.686311721801758, + "objective/non_score_reward": -0.08058934658765793, + "objective/rlhf_reward": 0.3207778334617615, + "objective/scores": 0.40234375, + "policy/approxkl_avg": 0.0004323392640799284, + "policy/clipfrac_avg": 0.008179331198334694, + "policy/entropy_avg": 0.19593684375286102, + "step": 18, + "val/clipfrac_avg": 0.0011506883893162012, + "val/num_eos_tokens": 51, + "val/ratio": 0.9998383522033691, + "val/ratio_var": 7.198556772891607e-07 + }, + { + "episode": 1216, + "epoch": 0.229477259860351, + "eps": 0, + "loss/policy_avg": -0.011872556060552597, + "loss/value_avg": 0.38459596037864685, + "lr": 4.966329966329966e-07, + "objective/entropy": -632.638916015625, + "objective/kl": 3.21876859664917, + "objective/non_score_reward": -0.09656305611133575, + "objective/rlhf_reward": 0.23790958523750305, + "objective/scores": 0.333984375, + "policy/approxkl_avg": 0.0004764531913679093, + "policy/clipfrac_avg": 0.008287805132567883, + "policy/entropy_avg": 0.2162272185087204, + "step": 19, + "val/clipfrac_avg": 0.00905265286564827, + "val/num_eos_tokens": 37, + "val/ratio": 1.0000991821289062, + "val/ratio_var": 7.565479904769745e-07 + }, + { + "episode": 1280, + "epoch": 0.24155501037931684, + "eps": 0, + "loss/policy_avg": -0.014671847224235535, + "loss/value_avg": 0.3181418478488922, + "lr": 4.949494949494949e-07, + "objective/entropy": -706.05078125, + "objective/kl": 3.669142007827759, + "objective/non_score_reward": -0.11007425934076309, + "objective/rlhf_reward": 0.3664882481098175, + "objective/scores": 0.4765625, + "policy/approxkl_avg": 0.0003931926330551505, + "policy/clipfrac_avg": 0.008725658059120178, + "policy/entropy_avg": 0.17680613696575165, + "step": 20, + "val/clipfrac_avg": 0.03898124024271965, + "val/num_eos_tokens": 54, + "val/ratio": 1.0001307725906372, + "val/ratio_var": 6.557406777574215e-07 + }, + { + "episode": 1344, + "epoch": 0.2536327608982827, + "eps": 0, + "loss/policy_avg": -0.014831740409135818, + "loss/value_avg": 0.2612203061580658, + "lr": 4.932659932659932e-07, + "objective/entropy": -704.768310546875, + "objective/kl": 3.722026824951172, + "objective/non_score_reward": -0.11166080832481384, + "objective/rlhf_reward": 0.33413997292518616, + "objective/scores": 0.4453125, + "policy/approxkl_avg": 0.00046230730367824435, + "policy/clipfrac_avg": 0.008072879165410995, + "policy/entropy_avg": 0.18310165405273438, + "step": 21, + "val/clipfrac_avg": 0.006685478147119284, + "val/num_eos_tokens": 47, + "val/ratio": 0.9999584555625916, + "val/ratio_var": 6.261999487833236e-07 + }, + { + "episode": 1408, + "epoch": 0.26571051141724855, + "eps": 0, + "loss/policy_avg": -0.015546409413218498, + "loss/value_avg": 0.22234514355659485, + "lr": 4.915824915824915e-07, + "objective/entropy": -686.92041015625, + "objective/kl": 5.413008689880371, + "objective/non_score_reward": -0.16239026188850403, + "objective/rlhf_reward": 0.30050036311149597, + "objective/scores": 0.462890625, + "policy/approxkl_avg": 0.00041679860441945493, + "policy/clipfrac_avg": 0.00833301804959774, + "policy/entropy_avg": 0.1929423063993454, + "step": 22, + "val/clipfrac_avg": 0.009531511925160885, + "val/num_eos_tokens": 48, + "val/ratio": 0.999933660030365, + "val/ratio_var": 7.41119151825842e-07 + }, + { + "episode": 1472, + "epoch": 0.27778826193621436, + "eps": 0, + "loss/policy_avg": -0.013549113646149635, + "loss/value_avg": 0.20537236332893372, + "lr": 4.898989898989898e-07, + "objective/entropy": -710.58544921875, + "objective/kl": 5.630204200744629, + "objective/non_score_reward": -0.16890612244606018, + "objective/rlhf_reward": 0.2529688775539398, + "objective/scores": 0.421875, + "policy/approxkl_avg": 0.00040828564669936895, + "policy/clipfrac_avg": 0.009321928024291992, + "policy/entropy_avg": 0.18656031787395477, + "step": 23, + "val/clipfrac_avg": 0.0010966494446620345, + "val/num_eos_tokens": 45, + "val/ratio": 1.0000791549682617, + "val/ratio_var": 8.043146522140887e-07 + }, + { + "episode": 1536, + "epoch": 0.28986601245518023, + "eps": 0, + "loss/policy_avg": -0.014127358794212341, + "loss/value_avg": 0.184538334608078, + "lr": 4.882154882154882e-07, + "objective/entropy": -721.2072143554688, + "objective/kl": 7.244493007659912, + "objective/non_score_reward": -0.2173347771167755, + "objective/rlhf_reward": 0.2309074103832245, + "objective/scores": 0.44921875, + "policy/approxkl_avg": 0.00042275150190107524, + "policy/clipfrac_avg": 0.009146442636847496, + "policy/entropy_avg": 0.19444656372070312, + "step": 24, + "val/clipfrac_avg": 0.00022602800163440406, + "val/num_eos_tokens": 48, + "val/ratio": 0.9999423027038574, + "val/ratio_var": 6.538029424518754e-07 + }, + { + "episode": 1600, + "epoch": 0.30194376297414605, + "eps": 0, + "loss/policy_avg": -0.013546439819037914, + "loss/value_avg": 0.15871131420135498, + "lr": 4.865319865319866e-07, + "objective/entropy": -625.06640625, + "objective/kl": 6.906096935272217, + "objective/non_score_reward": -0.20718291401863098, + "objective/rlhf_reward": 0.14267060160636902, + "objective/scores": 0.349609375, + "policy/approxkl_avg": 0.0004575018538162112, + "policy/clipfrac_avg": 0.009727372787892818, + "policy/entropy_avg": 0.2004598081111908, + "step": 25, + "val/clipfrac_avg": 9.441343718208373e-05, + "val/num_eos_tokens": 33, + "val/ratio": 1.0002121925354004, + "val/ratio_var": 6.285458766797092e-07 + }, + { + "episode": 1664, + "epoch": 0.3140215134931119, + "eps": 0, + "loss/policy_avg": -0.011472932994365692, + "loss/value_avg": 0.15153326094150543, + "lr": 4.848484848484849e-07, + "objective/entropy": -657.1741333007812, + "objective/kl": 8.73617172241211, + "objective/non_score_reward": -0.26208510994911194, + "objective/rlhf_reward": 0.11779768764972687, + "objective/scores": 0.37890625, + "policy/approxkl_avg": 0.00045209572999738157, + "policy/clipfrac_avg": 0.0087648406624794, + "policy/entropy_avg": 0.20401255786418915, + "step": 26, + "val/clipfrac_avg": 0.00012975589197594672, + "val/num_eos_tokens": 49, + "val/ratio": 1.0000041723251343, + "val/ratio_var": 9.090064736483328e-07 + }, + { + "episode": 1728, + "epoch": 0.3260992640120777, + "eps": 0, + "loss/policy_avg": -0.018957365304231644, + "loss/value_avg": 0.1270497739315033, + "lr": 4.831649831649832e-07, + "objective/entropy": -725.923828125, + "objective/kl": 9.15277099609375, + "objective/non_score_reward": -0.2745831608772278, + "objective/rlhf_reward": 0.1878191977739334, + "objective/scores": 0.462890625, + "policy/approxkl_avg": 0.00039935283712111413, + "policy/clipfrac_avg": 0.009701108559966087, + "policy/entropy_avg": 0.18258032202720642, + "step": 27, + "val/clipfrac_avg": 0.0003692947211675346, + "val/num_eos_tokens": 50, + "val/ratio": 0.9999105334281921, + "val/ratio_var": 6.027379413353628e-07 + }, + { + "episode": 1792, + "epoch": 0.3381770145310436, + "eps": 0, + "loss/policy_avg": -0.015594224445521832, + "loss/value_avg": 0.11441653966903687, + "lr": 4.814814814814814e-07, + "objective/entropy": -735.0335693359375, + "objective/kl": 9.175653457641602, + "objective/non_score_reward": -0.27526962757110596, + "objective/rlhf_reward": 0.13391008973121643, + "objective/scores": 0.41015625, + "policy/approxkl_avg": 0.0004158214433118701, + "policy/clipfrac_avg": 0.008673434145748615, + "policy/entropy_avg": 0.17068862915039062, + "step": 28, + "val/clipfrac_avg": 0.0001518530771136284, + "val/num_eos_tokens": 52, + "val/ratio": 0.9999884366989136, + "val/ratio_var": 8.806293294583156e-07 + }, + { + "episode": 1856, + "epoch": 0.3502547650500094, + "eps": 0, + "loss/policy_avg": -0.011532934382557869, + "loss/value_avg": 0.09116180986166, + "lr": 4.797979797979798e-07, + "objective/entropy": -773.42919921875, + "objective/kl": 10.120838165283203, + "objective/non_score_reward": -0.3036251366138458, + "objective/rlhf_reward": 0.17586705088615417, + "objective/scores": 0.48046875, + "policy/approxkl_avg": 0.00036347960121929646, + "policy/clipfrac_avg": 0.009156275540590286, + "policy/entropy_avg": 0.16425704956054688, + "step": 29, + "val/clipfrac_avg": 3.780891711357981e-05, + "val/num_eos_tokens": 50, + "val/ratio": 1.0000102519989014, + "val/ratio_var": 5.752245328949357e-07 + }, + { + "episode": 1920, + "epoch": 0.3623325155689753, + "eps": 0, + "loss/policy_avg": -0.01278415322303772, + "loss/value_avg": 0.08662945032119751, + "lr": 4.781144781144781e-07, + "objective/entropy": -718.801025390625, + "objective/kl": 10.949674606323242, + "objective/non_score_reward": -0.3284902274608612, + "objective/rlhf_reward": 0.0650644600391388, + "objective/scores": 0.39453125, + "policy/approxkl_avg": 0.0004170535539742559, + "policy/clipfrac_avg": 0.009168609045445919, + "policy/entropy_avg": 0.17079035937786102, + "step": 30, + "val/clipfrac_avg": 0.00021001597633585334, + "val/num_eos_tokens": 53, + "val/ratio": 0.9999901652336121, + "val/ratio_var": 7.901667800069845e-07 + }, + { + "episode": 1984, + "epoch": 0.37441026608794115, + "eps": 0, + "loss/policy_avg": -0.013262813910841942, + "loss/value_avg": 0.07429289817810059, + "lr": 4.7643097643097643e-07, + "objective/entropy": -708.4739379882812, + "objective/kl": 11.724746704101562, + "objective/non_score_reward": -0.35174238681793213, + "objective/rlhf_reward": -0.014340057969093323, + "objective/scores": 0.337890625, + "policy/approxkl_avg": 0.0004177941009402275, + "policy/clipfrac_avg": 0.009081902913749218, + "policy/entropy_avg": 0.18209967017173767, + "step": 31, + "val/clipfrac_avg": 0.0003269795561209321, + "val/num_eos_tokens": 49, + "val/ratio": 0.999956488609314, + "val/ratio_var": 6.545072324115608e-07 + }, + { + "episode": 2048, + "epoch": 0.38648801660690696, + "eps": 0, + "loss/policy_avg": -0.018486851826310158, + "loss/value_avg": 0.06496821343898773, + "lr": 4.7474747474747474e-07, + "objective/entropy": -730.5189819335938, + "objective/kl": 13.40658187866211, + "objective/non_score_reward": -0.4021974205970764, + "objective/rlhf_reward": -0.03842787444591522, + "objective/scores": 0.36328125, + "policy/approxkl_avg": 0.00040673528565093875, + "policy/clipfrac_avg": 0.008559602312743664, + "policy/entropy_avg": 0.16196060180664062, + "step": 32, + "val/clipfrac_avg": 8.302954665850848e-05, + "val/num_eos_tokens": 51, + "val/ratio": 0.9999591708183289, + "val/ratio_var": 6.225269544302137e-07 + }, + { + "episode": 2112, + "epoch": 0.3985657671258728, + "eps": 0, + "loss/policy_avg": -0.022785823792219162, + "loss/value_avg": 0.054975174367427826, + "lr": 4.7306397306397305e-07, + "objective/entropy": -750.11865234375, + "objective/kl": 12.143804550170898, + "objective/non_score_reward": -0.36431413888931274, + "objective/rlhf_reward": 0.09711165726184845, + "objective/scores": 0.4609375, + "policy/approxkl_avg": 0.00039204792119562626, + "policy/clipfrac_avg": 0.009267458692193031, + "policy/entropy_avg": 0.15810012817382812, + "step": 33, + "val/clipfrac_avg": 6.448548811022192e-05, + "val/num_eos_tokens": 51, + "val/ratio": 1.0001405477523804, + "val/ratio_var": 6.74541126954864e-07 + }, + { + "episode": 2176, + "epoch": 0.41064351764483864, + "eps": 0, + "loss/policy_avg": -0.021774116903543472, + "loss/value_avg": 0.048116378486156464, + "lr": 4.7138047138047136e-07, + "objective/entropy": -726.4152221679688, + "objective/kl": 12.474294662475586, + "objective/non_score_reward": -0.3742288053035736, + "objective/rlhf_reward": 0.02713838219642639, + "objective/scores": 0.40234375, + "policy/approxkl_avg": 0.0006013754173181951, + "policy/clipfrac_avg": 0.009044626727700233, + "policy/entropy_avg": 0.17004776000976562, + "step": 34, + "val/clipfrac_avg": 1.9868224626407027e-05, + "val/num_eos_tokens": 57, + "val/ratio": 1.0000003576278687, + "val/ratio_var": 5.519239607565396e-07 + }, + { + "episode": 2240, + "epoch": 0.4227212681638045, + "eps": 0, + "loss/policy_avg": -0.011024661362171173, + "loss/value_avg": 0.04245440661907196, + "lr": 4.696969696969697e-07, + "objective/entropy": -765.597900390625, + "objective/kl": 12.814224243164062, + "objective/non_score_reward": -0.38442671298980713, + "objective/rlhf_reward": -0.020657174289226532, + "objective/scores": 0.36328125, + "policy/approxkl_avg": 0.00036188805825076997, + "policy/clipfrac_avg": 0.009150207042694092, + "policy/entropy_avg": 0.14880117774009705, + "step": 35, + "val/clipfrac_avg": 6.972333721932955e-06, + "val/num_eos_tokens": 54, + "val/ratio": 0.9999343752861023, + "val/ratio_var": 4.806460651707312e-07 + }, + { + "episode": 2304, + "epoch": 0.4347990186827703, + "eps": 0, + "loss/policy_avg": -0.018790725618600845, + "loss/value_avg": 0.038986437022686005, + "lr": 4.68013468013468e-07, + "objective/entropy": -748.5819091796875, + "objective/kl": 13.179786682128906, + "objective/non_score_reward": -0.39539361000061035, + "objective/rlhf_reward": 0.02013372629880905, + "objective/scores": 0.416015625, + "policy/approxkl_avg": 0.00038032219163142145, + "policy/clipfrac_avg": 0.009533729404211044, + "policy/entropy_avg": 0.15250270068645477, + "step": 36, + "val/clipfrac_avg": 2.8229449526406825e-05, + "val/num_eos_tokens": 55, + "val/ratio": 1.0000063180923462, + "val/ratio_var": 7.776847610330151e-07 + }, + { + "episode": 2368, + "epoch": 0.4468767692017362, + "eps": 0, + "loss/policy_avg": -0.018214058130979538, + "loss/value_avg": 0.034576669335365295, + "lr": 4.663299663299663e-07, + "objective/entropy": -702.28369140625, + "objective/kl": 15.299257278442383, + "objective/non_score_reward": -0.45897769927978516, + "objective/rlhf_reward": -0.15282535552978516, + "objective/scores": 0.306640625, + "policy/approxkl_avg": 0.00047967006685212255, + "policy/clipfrac_avg": 0.009605104103684425, + "policy/entropy_avg": 0.16119003295898438, + "step": 37, + "val/clipfrac_avg": 1.4692055628984235e-05, + "val/num_eos_tokens": 44, + "val/ratio": 1.0000842809677124, + "val/ratio_var": 6.511489800686832e-07 + }, + { + "episode": 2432, + "epoch": 0.458954519720702, + "eps": 0, + "loss/policy_avg": -0.022558456286787987, + "loss/value_avg": 0.032565370202064514, + "lr": 4.646464646464646e-07, + "objective/entropy": -743.125732421875, + "objective/kl": 15.134292602539062, + "objective/non_score_reward": -0.45402878522872925, + "objective/rlhf_reward": -0.12102095782756805, + "objective/scores": 0.33203125, + "policy/approxkl_avg": 0.0005570814246311784, + "policy/clipfrac_avg": 0.009378625079989433, + "policy/entropy_avg": 0.15362422168254852, + "step": 38, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 52, + "val/ratio": 1.0000710487365723, + "val/ratio_var": 7.270390369740198e-07 + }, + { + "episode": 2496, + "epoch": 0.47103227023966787, + "eps": 0, + "loss/policy_avg": -0.029226083308458328, + "loss/value_avg": 0.029515882954001427, + "lr": 4.6296296296296297e-07, + "objective/entropy": -698.0175170898438, + "objective/kl": 15.121957778930664, + "objective/non_score_reward": -0.4536587595939636, + "objective/rlhf_reward": -0.06596343964338303, + "objective/scores": 0.38671875, + "policy/approxkl_avg": 0.00044198459363542497, + "policy/clipfrac_avg": 0.008979331701993942, + "policy/entropy_avg": 0.172088623046875, + "step": 39, + "val/clipfrac_avg": 1.966176751011517e-05, + "val/num_eos_tokens": 48, + "val/ratio": 1.0001137256622314, + "val/ratio_var": 9.834893717197701e-07 + }, + { + "episode": 2560, + "epoch": 0.4831100207586337, + "eps": 0, + "loss/policy_avg": -0.025167806074023247, + "loss/value_avg": 0.027836887165904045, + "lr": 4.612794612794613e-07, + "objective/entropy": -725.21875, + "objective/kl": 14.953241348266602, + "objective/non_score_reward": -0.4485971927642822, + "objective/rlhf_reward": -0.04869486391544342, + "objective/scores": 0.400390625, + "policy/approxkl_avg": 0.0005358229391276836, + "policy/clipfrac_avg": 0.008523606695234776, + "policy/entropy_avg": 0.15728633105754852, + "step": 40, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.00004243850708, + "val/ratio_var": 5.402997089731798e-07 + }, + { + "episode": 2624, + "epoch": 0.49518777127759955, + "eps": 0, + "loss/policy_avg": -0.030176600441336632, + "loss/value_avg": 0.026436101645231247, + "lr": 4.595959595959596e-07, + "objective/entropy": -781.7913818359375, + "objective/kl": 14.04931926727295, + "objective/non_score_reward": -0.42147958278656006, + "objective/rlhf_reward": 0.09707509726285934, + "objective/scores": 0.51953125, + "policy/approxkl_avg": 0.0003690444864332676, + "policy/clipfrac_avg": 0.008865940384566784, + "policy/entropy_avg": 0.14566168189048767, + "step": 41, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 62, + "val/ratio": 1.0000038146972656, + "val/ratio_var": 8.146014920384914e-07 + }, + { + "episode": 2688, + "epoch": 0.5072655217965654, + "eps": 0, + "loss/policy_avg": -0.02603175863623619, + "loss/value_avg": 0.026311784982681274, + "lr": 4.579124579124579e-07, + "objective/entropy": -753.4428100585938, + "objective/kl": 15.049712181091309, + "objective/non_score_reward": -0.4514913558959961, + "objective/rlhf_reward": -0.0354757234454155, + "objective/scores": 0.416015625, + "policy/approxkl_avg": 0.0003813736548181623, + "policy/clipfrac_avg": 0.009573683142662048, + "policy/entropy_avg": 0.17486445605754852, + "step": 42, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 57, + "val/ratio": 1.0002083778381348, + "val/ratio_var": 6.971042694203788e-07 + }, + { + "episode": 2752, + "epoch": 0.5193432723155312, + "eps": 0, + "loss/policy_avg": -0.027423618361353874, + "loss/value_avg": 0.024181833490729332, + "lr": 4.562289562289562e-07, + "objective/entropy": -720.7274780273438, + "objective/kl": 17.52047348022461, + "objective/non_score_reward": -0.5256141424179077, + "objective/rlhf_reward": -0.1222938597202301, + "objective/scores": 0.40234375, + "policy/approxkl_avg": 0.00042545428732410073, + "policy/clipfrac_avg": 0.00958210788667202, + "policy/entropy_avg": 0.1733601987361908, + "step": 43, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 54, + "val/ratio": 0.9999567866325378, + "val/ratio_var": 7.062473628138832e-07 + }, + { + "episode": 2816, + "epoch": 0.5314210228344971, + "eps": 0, + "loss/policy_avg": -0.033347710967063904, + "loss/value_avg": 0.023924967274069786, + "lr": 4.545454545454545e-07, + "objective/entropy": -753.5906982421875, + "objective/kl": 16.592561721801758, + "objective/non_score_reward": -0.4977768063545227, + "objective/rlhf_reward": -0.04953461140394211, + "objective/scores": 0.44921875, + "policy/approxkl_avg": 0.000394497939851135, + "policy/clipfrac_avg": 0.009626075625419617, + "policy/entropy_avg": 0.14611563086509705, + "step": 44, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 50, + "val/ratio": 1.0000271797180176, + "val/ratio_var": 8.373976356779167e-07 + }, + { + "episode": 2880, + "epoch": 0.543498773353463, + "eps": 0, + "loss/policy_avg": -0.03707805275917053, + "loss/value_avg": 0.02223985455930233, + "lr": 4.5286195286195283e-07, + "objective/entropy": -725.1084594726562, + "objective/kl": 15.50640869140625, + "objective/non_score_reward": -0.4651922583580017, + "objective/rlhf_reward": -0.021344579756259918, + "objective/scores": 0.443359375, + "policy/approxkl_avg": 0.0004309536307118833, + "policy/clipfrac_avg": 0.009066203609108925, + "policy/entropy_avg": 0.16333135962486267, + "step": 45, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 48, + "val/ratio": 1.00018310546875, + "val/ratio_var": 1.699194172033458e-06 + }, + { + "episode": 2944, + "epoch": 0.5555765238724287, + "eps": 0, + "loss/policy_avg": -0.03546188026666641, + "loss/value_avg": 0.021326132118701935, + "lr": 4.5117845117845114e-07, + "objective/entropy": -742.35107421875, + "objective/kl": 15.387621879577637, + "objective/non_score_reward": -0.46162867546081543, + "objective/rlhf_reward": -0.03047630935907364, + "objective/scores": 0.431640625, + "policy/approxkl_avg": 0.0004147663130424917, + "policy/clipfrac_avg": 0.009872214868664742, + "policy/entropy_avg": 0.15110652148723602, + "step": 46, + "val/clipfrac_avg": 9.790101103135385e-06, + "val/num_eos_tokens": 43, + "val/ratio": 1.000044822692871, + "val/ratio_var": 8.873777233020519e-07 + }, + { + "episode": 3008, + "epoch": 0.5676542743913946, + "eps": 0, + "loss/policy_avg": -0.022155379876494408, + "loss/value_avg": 0.021301649510860443, + "lr": 4.494949494949495e-07, + "objective/entropy": -754.4105834960938, + "objective/kl": 14.502567291259766, + "objective/non_score_reward": -0.4350770115852356, + "objective/rlhf_reward": -0.0415223091840744, + "objective/scores": 0.39453125, + "policy/approxkl_avg": 0.0004166339640505612, + "policy/clipfrac_avg": 0.008984292857348919, + "policy/entropy_avg": 0.14285914599895477, + "step": 47, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 51, + "val/ratio": 1.0002052783966064, + "val/ratio_var": 1.7462091363995569e-06 + }, + { + "episode": 3072, + "epoch": 0.5797320249103605, + "eps": 0, + "loss/policy_avg": -0.027517154812812805, + "loss/value_avg": 0.020472221076488495, + "lr": 4.478114478114478e-07, + "objective/entropy": -718.2733154296875, + "objective/kl": 16.20379638671875, + "objective/non_score_reward": -0.48611387610435486, + "objective/rlhf_reward": -0.15676817297935486, + "objective/scores": 0.330078125, + "policy/approxkl_avg": 0.00043310271576046944, + "policy/clipfrac_avg": 0.009795701131224632, + "policy/entropy_avg": 0.16522979736328125, + "step": 48, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 53, + "val/ratio": 1.000115156173706, + "val/ratio_var": 8.753415272622078e-07 + }, + { + "episode": 3136, + "epoch": 0.5918097754293263, + "eps": 0, + "loss/policy_avg": -0.037540458142757416, + "loss/value_avg": 0.01891172304749489, + "lr": 4.461279461279461e-07, + "objective/entropy": -738.3416748046875, + "objective/kl": 15.657567977905273, + "objective/non_score_reward": -0.4697270393371582, + "objective/rlhf_reward": 0.01708938181400299, + "objective/scores": 0.486328125, + "policy/approxkl_avg": 0.00037665231502614915, + "policy/clipfrac_avg": 0.008994007483124733, + "policy/entropy_avg": 0.14120499789714813, + "step": 49, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 56, + "val/ratio": 1.000009536743164, + "val/ratio_var": 6.900321523062303e-07 + }, + { + "episode": 3200, + "epoch": 0.6038875259482921, + "eps": 0, + "loss/policy_avg": -0.03546880930662155, + "loss/value_avg": 0.01827932894229889, + "lr": 4.444444444444444e-07, + "objective/entropy": -700.5809936523438, + "objective/kl": 16.617774963378906, + "objective/non_score_reward": -0.4985332190990448, + "objective/rlhf_reward": -0.1123027503490448, + "objective/scores": 0.38671875, + "policy/approxkl_avg": 0.00040262818220071495, + "policy/clipfrac_avg": 0.009067821316421032, + "policy/entropy_avg": 0.13315296173095703, + "step": 50, + "val/clipfrac_avg": 6.10590086580487e-06, + "val/num_eos_tokens": 53, + "val/ratio": 1.0000566244125366, + "val/ratio_var": 8.310821044688055e-07 + }, + { + "episode": 3264, + "epoch": 0.615965276467258, + "eps": 0, + "loss/policy_avg": -0.022362984716892242, + "loss/value_avg": 0.01790526881814003, + "lr": 4.4276094276094275e-07, + "objective/entropy": -797.2921752929688, + "objective/kl": 14.45788860321045, + "objective/non_score_reward": -0.4337366223335266, + "objective/rlhf_reward": 0.025736041367053986, + "objective/scores": 0.458984375, + "policy/approxkl_avg": 0.00034859025618061423, + "policy/clipfrac_avg": 0.008412575349211693, + "policy/entropy_avg": 0.12615332007408142, + "step": 51, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 58, + "val/ratio": 1.000011682510376, + "val/ratio_var": 5.029750127505395e-07 + }, + { + "episode": 3328, + "epoch": 0.6280430269862238, + "eps": 0, + "loss/policy_avg": -0.03401505947113037, + "loss/value_avg": 0.018212419003248215, + "lr": 4.4107744107744106e-07, + "objective/entropy": -690.7019653320312, + "objective/kl": 15.584673881530762, + "objective/non_score_reward": -0.4675402045249939, + "objective/rlhf_reward": -0.03858514130115509, + "objective/scores": 0.4296875, + "policy/approxkl_avg": 0.0003729221352841705, + "policy/clipfrac_avg": 0.00865244958549738, + "policy/entropy_avg": 0.13912074267864227, + "step": 52, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 51, + "val/ratio": 1.000160574913025, + "val/ratio_var": 6.19857644323929e-07 + }, + { + "episode": 3392, + "epoch": 0.6401207775051897, + "eps": 0, + "loss/policy_avg": -0.012439190410077572, + "loss/value_avg": 0.016416631639003754, + "lr": 4.3939393939393937e-07, + "objective/entropy": -722.3007202148438, + "objective/kl": 15.750506401062012, + "objective/non_score_reward": -0.47251516580581665, + "objective/rlhf_reward": -0.16929252445697784, + "objective/scores": 0.302734375, + "policy/approxkl_avg": 0.00040123704820871353, + "policy/clipfrac_avg": 0.009440924972295761, + "policy/entropy_avg": 0.14617919921875, + "step": 53, + "val/clipfrac_avg": 6.367155947373249e-06, + "val/num_eos_tokens": 51, + "val/ratio": 0.9999165534973145, + "val/ratio_var": 5.078387061985268e-07 + }, + { + "episode": 3456, + "epoch": 0.6521985280241555, + "eps": 0, + "loss/policy_avg": -0.026336457580327988, + "loss/value_avg": 0.01662503555417061, + "lr": 4.377104377104377e-07, + "objective/entropy": -792.0408935546875, + "objective/kl": 15.416799545288086, + "objective/non_score_reward": -0.4625040292739868, + "objective/rlhf_reward": -0.012308701872825623, + "objective/scores": 0.44921875, + "policy/approxkl_avg": 0.0010212509660050273, + "policy/clipfrac_avg": 0.008898193016648293, + "policy/entropy_avg": 0.13633601367473602, + "step": 54, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 54, + "val/ratio": 0.9999532699584961, + "val/ratio_var": 4.915744966638158e-07 + }, + { + "episode": 3520, + "epoch": 0.6642762785431213, + "eps": 0, + "loss/policy_avg": -0.03656713292002678, + "loss/value_avg": 0.016763746738433838, + "lr": 4.3602693602693604e-07, + "objective/entropy": -728.6891479492188, + "objective/kl": 15.019660949707031, + "objective/non_score_reward": -0.4505898356437683, + "objective/rlhf_reward": -0.0018593519926071167, + "objective/scores": 0.44921875, + "policy/approxkl_avg": 0.0011785384267568588, + "policy/clipfrac_avg": 0.009183433838188648, + "policy/entropy_avg": 0.15262095630168915, + "step": 55, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 51, + "val/ratio": 1.0000860691070557, + "val/ratio_var": 8.0383756539959e-07 + }, + { + "episode": 3584, + "epoch": 0.6763540290620872, + "eps": 0, + "loss/policy_avg": -0.022373374551534653, + "loss/value_avg": 0.016118954867124557, + "lr": 4.3434343434343435e-07, + "objective/entropy": -780.7221069335938, + "objective/kl": 14.14107608795166, + "objective/non_score_reward": -0.4242323040962219, + "objective/rlhf_reward": -0.03507213294506073, + "objective/scores": 0.388671875, + "policy/approxkl_avg": 0.0003644491662271321, + "policy/clipfrac_avg": 0.009697480127215385, + "policy/entropy_avg": 0.1477101743221283, + "step": 56, + "val/clipfrac_avg": 4.006410563306417e-06, + "val/num_eos_tokens": 58, + "val/ratio": 1.0000145435333252, + "val/ratio_var": 4.715680574918224e-07 + }, + { + "episode": 3648, + "epoch": 0.6884317795810531, + "eps": 0, + "loss/policy_avg": -0.02042277157306671, + "loss/value_avg": 0.015200886875391006, + "lr": 4.326599326599326e-07, + "objective/entropy": -730.958740234375, + "objective/kl": 15.971136093139648, + "objective/non_score_reward": -0.47913408279418945, + "objective/rlhf_reward": -0.11194658279418945, + "objective/scores": 0.3671875, + "policy/approxkl_avg": 0.0004102127568330616, + "policy/clipfrac_avg": 0.009771636687219143, + "policy/entropy_avg": 0.14384841918945312, + "step": 57, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 54, + "val/ratio": 1.0000948905944824, + "val/ratio_var": 5.781259915238479e-07 + }, + { + "episode": 3712, + "epoch": 0.7005095301000188, + "eps": 0, + "loss/policy_avg": -0.04674074053764343, + "loss/value_avg": 0.015770789235830307, + "lr": 4.309764309764309e-07, + "objective/entropy": -689.0645141601562, + "objective/kl": 15.481245994567871, + "objective/non_score_reward": -0.4644373655319214, + "objective/rlhf_reward": -0.0034998655319213867, + "objective/scores": 0.4609375, + "policy/approxkl_avg": 0.00042656456935219467, + "policy/clipfrac_avg": 0.009159904904663563, + "policy/entropy_avg": 0.15465545654296875, + "step": 58, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 48, + "val/ratio": 1.000055193901062, + "val/ratio_var": 6.497099320768029e-07 + }, + { + "episode": 3776, + "epoch": 0.7125872806189847, + "eps": 0, + "loss/policy_avg": -0.018530046567320824, + "loss/value_avg": 0.014360702596604824, + "lr": 4.292929292929293e-07, + "objective/entropy": -639.9326782226562, + "objective/kl": 17.16830825805664, + "objective/non_score_reward": -0.5150492787361145, + "objective/rlhf_reward": -0.2088969349861145, + "objective/scores": 0.306640625, + "policy/approxkl_avg": 0.00047735171392560005, + "policy/clipfrac_avg": 0.010190478526055813, + "policy/entropy_avg": 0.17077922821044922, + "step": 59, + "val/clipfrac_avg": 8.251181498053484e-06, + "val/num_eos_tokens": 47, + "val/ratio": 1.0000905990600586, + "val/ratio_var": 1.1262843599979533e-06 + }, + { + "episode": 3840, + "epoch": 0.7246650311379506, + "eps": 0, + "loss/policy_avg": -0.03477172553539276, + "loss/value_avg": 0.014889835380017757, + "lr": 4.276094276094276e-07, + "objective/entropy": -693.819091796875, + "objective/kl": 14.728355407714844, + "objective/non_score_reward": -0.4418506622314453, + "objective/rlhf_reward": -0.03706549108028412, + "objective/scores": 0.404296875, + "policy/approxkl_avg": 0.00047620845725759864, + "policy/clipfrac_avg": 0.00950541626662016, + "policy/entropy_avg": 0.15099716186523438, + "step": 60, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 43, + "val/ratio": 1.0001107454299927, + "val/ratio_var": 7.359848268606584e-07 + }, + { + "episode": 3904, + "epoch": 0.7367427816569164, + "eps": 0, + "loss/policy_avg": -0.02886144444346428, + "loss/value_avg": 0.014064384624361992, + "lr": 4.259259259259259e-07, + "objective/entropy": -732.0101318359375, + "objective/kl": 16.070905685424805, + "objective/non_score_reward": -0.48212718963623047, + "objective/rlhf_reward": -0.09003733098506927, + "objective/scores": 0.392578125, + "policy/approxkl_avg": 0.0004534229519777, + "policy/clipfrac_avg": 0.008900020271539688, + "policy/entropy_avg": 0.15126292407512665, + "step": 61, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 0.9999483823776245, + "val/ratio_var": 5.890042302780785e-07 + }, + { + "episode": 3968, + "epoch": 0.7488205321758823, + "eps": 0, + "loss/policy_avg": -0.03744254261255264, + "loss/value_avg": 0.015733784064650536, + "lr": 4.242424242424242e-07, + "objective/entropy": -729.4462890625, + "objective/kl": 15.581929206848145, + "objective/non_score_reward": -0.4674578905105591, + "objective/rlhf_reward": 0.05256165564060211, + "objective/scores": 0.51953125, + "policy/approxkl_avg": 0.0003785984590649605, + "policy/clipfrac_avg": 0.008160373196005821, + "policy/entropy_avg": 0.14229774475097656, + "step": 62, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 58, + "val/ratio": 1.0000548362731934, + "val/ratio_var": 4.930154204885184e-07 + }, + { + "episode": 4032, + "epoch": 0.760898282694848, + "eps": 0, + "loss/policy_avg": -0.030014147982001305, + "loss/value_avg": 0.014109417796134949, + "lr": 4.225589225589226e-07, + "objective/entropy": -747.200439453125, + "objective/kl": 15.121429443359375, + "objective/non_score_reward": -0.453642874956131, + "objective/rlhf_reward": -0.04592801630496979, + "objective/scores": 0.408203125, + "policy/approxkl_avg": 0.0004679747798945755, + "policy/clipfrac_avg": 0.008998386561870575, + "policy/entropy_avg": 0.15067800879478455, + "step": 63, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 55, + "val/ratio": 1.0000910758972168, + "val/ratio_var": 5.872569204257161e-07 + }, + { + "episode": 4096, + "epoch": 0.7729760332138139, + "eps": 0, + "loss/policy_avg": -0.02917386218905449, + "loss/value_avg": 0.013095545582473278, + "lr": 4.208754208754209e-07, + "objective/entropy": -728.0292358398438, + "objective/kl": 15.903536796569824, + "objective/non_score_reward": -0.4771060347557068, + "objective/rlhf_reward": -0.11333651840686798, + "objective/scores": 0.36328125, + "policy/approxkl_avg": 0.000516406842507422, + "policy/clipfrac_avg": 0.008886368945240974, + "policy/entropy_avg": 0.1517333984375, + "step": 64, + "val/clipfrac_avg": 5.552594302571379e-06, + "val/num_eos_tokens": 51, + "val/ratio": 0.9999017119407654, + "val/ratio_var": 7.202033316389134e-07 + }, + { + "episode": 4160, + "epoch": 0.7850537837327798, + "eps": 0, + "loss/policy_avg": -0.026137467473745346, + "loss/value_avg": 0.012687700800597668, + "lr": 4.1919191919191915e-07, + "objective/entropy": -767.3764038085938, + "objective/kl": 15.278279304504395, + "objective/non_score_reward": -0.4583483934402466, + "objective/rlhf_reward": -0.008641347289085388, + "objective/scores": 0.44921875, + "policy/approxkl_avg": 0.0003777016536332667, + "policy/clipfrac_avg": 0.009344375692307949, + "policy/entropy_avg": 0.14270401000976562, + "step": 65, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 0.9999732971191406, + "val/ratio_var": 6.31260888894758e-07 + }, + { + "episode": 4224, + "epoch": 0.7971315342517457, + "eps": 0, + "loss/policy_avg": -0.02155480347573757, + "loss/value_avg": 0.012883363291621208, + "lr": 4.1750841750841746e-07, + "objective/entropy": -701.712890625, + "objective/kl": 15.514598846435547, + "objective/non_score_reward": -0.4654379189014435, + "objective/rlhf_reward": -0.08604338765144348, + "objective/scores": 0.37890625, + "policy/approxkl_avg": 0.00044276120024733245, + "policy/clipfrac_avg": 0.00973192136734724, + "policy/entropy_avg": 0.15912756323814392, + "step": 66, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 51, + "val/ratio": 1.0000464916229248, + "val/ratio_var": 7.58379371745832e-07 + }, + { + "episode": 4288, + "epoch": 0.8092092847707114, + "eps": 0, + "loss/policy_avg": -0.03506336733698845, + "loss/value_avg": 0.014060527086257935, + "lr": 4.158249158249158e-07, + "objective/entropy": -723.3513793945312, + "objective/kl": 16.13052749633789, + "objective/non_score_reward": -0.4839158058166504, + "objective/rlhf_reward": -0.03518534451723099, + "objective/scores": 0.44921875, + "policy/approxkl_avg": 0.0006458936259150505, + "policy/clipfrac_avg": 0.009050115011632442, + "policy/entropy_avg": 0.14800135791301727, + "step": 67, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 50, + "val/ratio": 0.9999250173568726, + "val/ratio_var": 6.238656169443857e-07 + }, + { + "episode": 4352, + "epoch": 0.8212870352896773, + "eps": 0, + "loss/policy_avg": -0.03492492437362671, + "loss/value_avg": 0.013395547866821289, + "lr": 4.1414141414141413e-07, + "objective/entropy": -756.134765625, + "objective/kl": 14.228071212768555, + "objective/non_score_reward": -0.4268421530723572, + "objective/rlhf_reward": 0.005775056779384613, + "objective/scores": 0.43359375, + "policy/approxkl_avg": 0.00040148766129277647, + "policy/clipfrac_avg": 0.008605660870671272, + "policy/entropy_avg": 0.14455795288085938, + "step": 68, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 51, + "val/ratio": 0.9998807907104492, + "val/ratio_var": 5.104772071717889e-07 + }, + { + "episode": 4416, + "epoch": 0.8333647858086431, + "eps": 0, + "loss/policy_avg": -0.03622628003358841, + "loss/value_avg": 0.012129010632634163, + "lr": 4.1245791245791244e-07, + "objective/entropy": -692.700439453125, + "objective/kl": 15.215728759765625, + "objective/non_score_reward": -0.45647183060646057, + "objective/rlhf_reward": -0.03947964310646057, + "objective/scores": 0.41796875, + "policy/approxkl_avg": 0.00043082493357360363, + "policy/clipfrac_avg": 0.009177702479064465, + "policy/entropy_avg": 0.15514373779296875, + "step": 69, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 1.0000324249267578, + "val/ratio_var": 8.417625281254004e-07 + }, + { + "episode": 4480, + "epoch": 0.845442536327609, + "eps": 0, + "loss/policy_avg": -0.0306796133518219, + "loss/value_avg": 0.011692370288074017, + "lr": 4.1077441077441075e-07, + "objective/entropy": -700.46533203125, + "objective/kl": 14.929758071899414, + "objective/non_score_reward": -0.4478927254676819, + "objective/rlhf_reward": -0.05726771056652069, + "objective/scores": 0.390625, + "policy/approxkl_avg": 0.000412381486967206, + "policy/clipfrac_avg": 0.008844866417348385, + "policy/entropy_avg": 0.15450796484947205, + "step": 70, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 52, + "val/ratio": 1.0001542568206787, + "val/ratio_var": 7.975154403538909e-07 + }, + { + "episode": 4544, + "epoch": 0.8575202868465748, + "eps": 0, + "loss/policy_avg": -0.03419748693704605, + "loss/value_avg": 0.012416936457157135, + "lr": 4.090909090909091e-07, + "objective/entropy": -772.2770385742188, + "objective/kl": 12.517084121704102, + "objective/non_score_reward": -0.3755125403404236, + "objective/rlhf_reward": 0.14401870965957642, + "objective/scores": 0.51953125, + "policy/approxkl_avg": 0.00036138106952421367, + "policy/clipfrac_avg": 0.008941545151174068, + "policy/entropy_avg": 0.13918177783489227, + "step": 71, + "val/clipfrac_avg": 9.753433914738707e-06, + "val/num_eos_tokens": 55, + "val/ratio": 0.9999549388885498, + "val/ratio_var": 4.947730758431135e-07 + }, + { + "episode": 4608, + "epoch": 0.8695980373655406, + "eps": 0, + "loss/policy_avg": -0.0443786196410656, + "loss/value_avg": 0.011243673972785473, + "lr": 4.0740740740740737e-07, + "objective/entropy": -702.0181884765625, + "objective/kl": 14.902286529541016, + "objective/non_score_reward": -0.44706863164901733, + "objective/rlhf_reward": -0.01152174174785614, + "objective/scores": 0.435546875, + "policy/approxkl_avg": 0.00046824943274259567, + "policy/clipfrac_avg": 0.009215106256306171, + "policy/entropy_avg": 0.15385818481445312, + "step": 72, + "val/clipfrac_avg": 4.2761357690324076e-06, + "val/num_eos_tokens": 51, + "val/ratio": 0.9999047517776489, + "val/ratio_var": 8.828073418953863e-07 + }, + { + "episode": 4672, + "epoch": 0.8816757878845065, + "eps": 0, + "loss/policy_avg": -0.044579483568668365, + "loss/value_avg": 0.013477655127644539, + "lr": 4.057239057239057e-07, + "objective/entropy": -750.3492431640625, + "objective/kl": 12.742916107177734, + "objective/non_score_reward": -0.38228750228881836, + "objective/rlhf_reward": 0.16458749771118164, + "objective/scores": 0.546875, + "policy/approxkl_avg": 0.00037200923543423414, + "policy/clipfrac_avg": 0.008558372035622597, + "policy/entropy_avg": 0.14228439331054688, + "step": 73, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 48, + "val/ratio": 0.999910831451416, + "val/ratio_var": 6.942154300304537e-07 + }, + { + "episode": 4736, + "epoch": 0.8937535384034724, + "eps": 0, + "loss/policy_avg": -0.020492155104875565, + "loss/value_avg": 0.01326964795589447, + "lr": 4.04040404040404e-07, + "objective/entropy": -776.6446533203125, + "objective/kl": 14.296285629272461, + "objective/non_score_reward": -0.42888855934143066, + "objective/rlhf_reward": 0.007146604359149933, + "objective/scores": 0.435546875, + "policy/approxkl_avg": 0.00041618672548793256, + "policy/clipfrac_avg": 0.009593900293111801, + "policy/entropy_avg": 0.15248744189739227, + "step": 74, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 0.9999758005142212, + "val/ratio_var": 6.57985367524816e-07 + }, + { + "episode": 4800, + "epoch": 0.9058312889224382, + "eps": 0, + "loss/policy_avg": -0.03961968421936035, + "loss/value_avg": 0.013151820749044418, + "lr": 4.0235690235690236e-07, + "objective/entropy": -710.4595336914062, + "objective/kl": 14.758489608764648, + "objective/non_score_reward": -0.44275468587875366, + "objective/rlhf_reward": -0.023809373378753662, + "objective/scores": 0.41796875, + "policy/approxkl_avg": 0.0004320571315474808, + "policy/clipfrac_avg": 0.009666088968515396, + "policy/entropy_avg": 0.16128668189048767, + "step": 75, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 50, + "val/ratio": 1.0000278949737549, + "val/ratio_var": 7.166216846599127e-07 + }, + { + "episode": 4864, + "epoch": 0.917909039441404, + "eps": 0, + "loss/policy_avg": -0.04150720685720444, + "loss/value_avg": 0.013188062235713005, + "lr": 4.0067340067340067e-07, + "objective/entropy": -744.156005859375, + "objective/kl": 15.219215393066406, + "objective/non_score_reward": -0.4565764367580414, + "objective/rlhf_reward": 0.07613840699195862, + "objective/scores": 0.53125, + "policy/approxkl_avg": 0.0003768115711864084, + "policy/clipfrac_avg": 0.00844576582312584, + "policy/entropy_avg": 0.15825144946575165, + "step": 76, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 0.9999443292617798, + "val/ratio_var": 8.649810183669615e-07 + }, + { + "episode": 4928, + "epoch": 0.9299867899603699, + "eps": 0, + "loss/policy_avg": -0.025156065821647644, + "loss/value_avg": 0.011967229656875134, + "lr": 3.98989898989899e-07, + "objective/entropy": -674.7507934570312, + "objective/kl": 14.667293548583984, + "objective/non_score_reward": -0.44001880288124084, + "objective/rlhf_reward": -0.10603442043066025, + "objective/scores": 0.333984375, + "policy/approxkl_avg": 0.00046505866339430213, + "policy/clipfrac_avg": 0.009665473364293575, + "policy/entropy_avg": 0.15473303198814392, + "step": 77, + "val/clipfrac_avg": 7.867573003750294e-06, + "val/num_eos_tokens": 47, + "val/ratio": 0.9999678134918213, + "val/ratio_var": 6.291583645179344e-07 + }, + { + "episode": 4992, + "epoch": 0.9420645404793357, + "eps": 0, + "loss/policy_avg": -0.04044274613261223, + "loss/value_avg": 0.012631962075829506, + "lr": 3.973063973063973e-07, + "objective/entropy": -680.3353271484375, + "objective/kl": 14.307917594909668, + "objective/non_score_reward": -0.4292375147342682, + "objective/rlhf_reward": -0.0073625147342681885, + "objective/scores": 0.421875, + "policy/approxkl_avg": 0.0005386772681958973, + "policy/clipfrac_avg": 0.008763562887907028, + "policy/entropy_avg": 0.16646194458007812, + "step": 78, + "val/clipfrac_avg": 1.0013714927481487e-05, + "val/num_eos_tokens": 53, + "val/ratio": 0.9999792575836182, + "val/ratio_var": 7.390693212983024e-07 + }, + { + "episode": 5056, + "epoch": 0.9541422909983016, + "eps": 0, + "loss/policy_avg": -0.048411011695861816, + "loss/value_avg": 0.011281365528702736, + "lr": 3.956228956228956e-07, + "objective/entropy": -662.266845703125, + "objective/kl": 15.492654800415039, + "objective/non_score_reward": -0.46477964520454407, + "objective/rlhf_reward": -0.049252308905124664, + "objective/scores": 0.416015625, + "policy/approxkl_avg": 0.0005287184612825513, + "policy/clipfrac_avg": 0.009335631504654884, + "policy/entropy_avg": 0.18247604370117188, + "step": 79, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 40, + "val/ratio": 1.0000967979431152, + "val/ratio_var": 5.371284714783542e-07 + }, + { + "episode": 5120, + "epoch": 0.9662200415172674, + "eps": 0, + "loss/policy_avg": -0.05055360123515129, + "loss/value_avg": 0.01074596494436264, + "lr": 3.939393939393939e-07, + "objective/entropy": -696.76025390625, + "objective/kl": 13.393022537231445, + "objective/non_score_reward": -0.40179064869880676, + "objective/rlhf_reward": 0.028385117650032043, + "objective/scores": 0.4296875, + "policy/approxkl_avg": 0.00045566324843093753, + "policy/clipfrac_avg": 0.01011097151786089, + "policy/entropy_avg": 0.16834895312786102, + "step": 80, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 58, + "val/ratio": 1.0001335144042969, + "val/ratio_var": 8.958918442658614e-07 + }, + { + "episode": 5184, + "epoch": 0.9782977920362332, + "eps": 0, + "loss/policy_avg": -0.03783099725842476, + "loss/value_avg": 0.010885774157941341, + "lr": 3.922558922558922e-07, + "objective/entropy": -701.8968505859375, + "objective/kl": 14.081245422363281, + "objective/non_score_reward": -0.4224373698234558, + "objective/rlhf_reward": -0.01716393232345581, + "objective/scores": 0.40625, + "policy/approxkl_avg": 0.00042904424481093884, + "policy/clipfrac_avg": 0.008891528472304344, + "policy/entropy_avg": 0.16935603320598602, + "step": 81, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 55, + "val/ratio": 0.9999715089797974, + "val/ratio_var": 8.756766760598111e-07 + }, + { + "episode": 5248, + "epoch": 0.9903755425551991, + "eps": 0, + "loss/policy_avg": -0.03562987968325615, + "loss/value_avg": 0.010367941111326218, + "lr": 3.9057239057239053e-07, + "objective/entropy": -733.0692749023438, + "objective/kl": 12.260353088378906, + "objective/non_score_reward": -0.36781054735183716, + "objective/rlhf_reward": 0.10777536779642105, + "objective/scores": 0.4765625, + "policy/approxkl_avg": 0.00047890731366351247, + "policy/clipfrac_avg": 0.008688896894454956, + "policy/entropy_avg": 0.1473541259765625, + "step": 82, + "val/clipfrac_avg": 4.130320121475961e-06, + "val/num_eos_tokens": 54, + "val/ratio": 1.000084400177002, + "val/ratio_var": 5.904771569475997e-07 + }, + { + "episode": 5312, + "epoch": 1.002453293074165, + "eps": 0, + "loss/policy_avg": -0.03139907121658325, + "loss/value_avg": 0.010464398190379143, + "lr": 3.888888888888889e-07, + "objective/entropy": -696.6053466796875, + "objective/kl": 12.150976181030273, + "objective/non_score_reward": -0.36452925205230713, + "objective/rlhf_reward": -0.008083932101726532, + "objective/scores": 0.35546875, + "policy/approxkl_avg": 0.00038306796341203153, + "policy/clipfrac_avg": 0.009303595870733261, + "policy/entropy_avg": 0.17142996191978455, + "step": 83, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 52, + "val/ratio": 1.0002813339233398, + "val/ratio_var": 6.230750955182884e-07 + }, + { + "episode": 5376, + "epoch": 1.0145310435931307, + "eps": 0, + "loss/policy_avg": -0.03326902911067009, + "loss/value_avg": 0.009499987587332726, + "lr": 3.872053872053872e-07, + "objective/entropy": -681.0773315429688, + "objective/kl": 13.791988372802734, + "objective/non_score_reward": -0.41375964879989624, + "objective/rlhf_reward": 0.0037208348512649536, + "objective/scores": 0.41796875, + "policy/approxkl_avg": 0.00044912411249242723, + "policy/clipfrac_avg": 0.009229006245732307, + "policy/entropy_avg": 0.16622290015220642, + "step": 84, + "val/clipfrac_avg": 1.3086264516459778e-05, + "val/num_eos_tokens": 50, + "val/ratio": 1.0001062154769897, + "val/ratio_var": 6.348414558488003e-07 + }, + { + "episode": 5440, + "epoch": 1.0266087941120967, + "eps": 0, + "loss/policy_avg": -0.024213604629039764, + "loss/value_avg": 0.009874923154711723, + "lr": 3.855218855218855e-07, + "objective/entropy": -694.9255981445312, + "objective/kl": 12.345937728881836, + "objective/non_score_reward": -0.37037813663482666, + "objective/rlhf_reward": 0.06370390206575394, + "objective/scores": 0.43359375, + "policy/approxkl_avg": 0.0010469364933669567, + "policy/clipfrac_avg": 0.007577784359455109, + "policy/entropy_avg": 0.17229843139648438, + "step": 85, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 35, + "val/ratio": 1.000024676322937, + "val/ratio_var": 6.738481488355319e-07 + }, + { + "episode": 5504, + "epoch": 1.0386865446310625, + "eps": 0, + "loss/policy_avg": -0.024558693170547485, + "loss/value_avg": 0.00916454941034317, + "lr": 3.8383838383838377e-07, + "objective/entropy": -758.008544921875, + "objective/kl": 11.96615219116211, + "objective/non_score_reward": -0.3589845895767212, + "objective/rlhf_reward": 0.05996072292327881, + "objective/scores": 0.41796875, + "policy/approxkl_avg": 0.0004611395706888288, + "policy/clipfrac_avg": 0.009094095788896084, + "policy/entropy_avg": 0.1533660888671875, + "step": 86, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 1.0000550746917725, + "val/ratio_var": 5.709296146960696e-07 + }, + { + "episode": 5568, + "epoch": 1.0507642951500282, + "eps": 0, + "loss/policy_avg": -0.03639592230319977, + "loss/value_avg": 0.010131916962563992, + "lr": 3.8215488215488214e-07, + "objective/entropy": -700.35693359375, + "objective/kl": 13.447929382324219, + "objective/non_score_reward": -0.40343791246414185, + "objective/rlhf_reward": -0.0006058886647224426, + "objective/scores": 0.40234375, + "policy/approxkl_avg": 0.00040239907684735954, + "policy/clipfrac_avg": 0.00921421404927969, + "policy/entropy_avg": 0.17262396216392517, + "step": 87, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 50, + "val/ratio": 1.0000275373458862, + "val/ratio_var": 6.002900931889599e-07 + }, + { + "episode": 5632, + "epoch": 1.0628420456689942, + "eps": 0, + "loss/policy_avg": -0.01775004342198372, + "loss/value_avg": 0.010766441933810711, + "lr": 3.8047138047138045e-07, + "objective/entropy": -757.93701171875, + "objective/kl": 12.896736145019531, + "objective/non_score_reward": -0.3869020938873291, + "objective/rlhf_reward": 0.0408322811126709, + "objective/scores": 0.427734375, + "policy/approxkl_avg": 0.0003993526042904705, + "policy/clipfrac_avg": 0.00875895470380783, + "policy/entropy_avg": 0.158355712890625, + "step": 88, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 61, + "val/ratio": 0.9999960660934448, + "val/ratio_var": 5.071574946668989e-07 + }, + { + "episode": 5696, + "epoch": 1.07491979618796, + "eps": 0, + "loss/policy_avg": -0.019584549590945244, + "loss/value_avg": 0.009444857016205788, + "lr": 3.7878787878787876e-07, + "objective/entropy": -716.8355102539062, + "objective/kl": 12.824501037597656, + "objective/non_score_reward": -0.3847350478172302, + "objective/rlhf_reward": 0.03274543583393097, + "objective/scores": 0.41796875, + "policy/approxkl_avg": 0.00036994865513406694, + "policy/clipfrac_avg": 0.008531251922249794, + "policy/entropy_avg": 0.1638692319393158, + "step": 89, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 43, + "val/ratio": 1.000080943107605, + "val/ratio_var": 4.814820613319171e-07 + }, + { + "episode": 5760, + "epoch": 1.086997546706926, + "eps": 0, + "loss/policy_avg": -0.03423365205526352, + "loss/value_avg": 0.009232178330421448, + "lr": 3.7710437710437707e-07, + "objective/entropy": -693.8837280273438, + "objective/kl": 14.44405746459961, + "objective/non_score_reward": -0.4333217144012451, + "objective/rlhf_reward": 0.0012486129999160767, + "objective/scores": 0.43359375, + "policy/approxkl_avg": 0.0004389469395391643, + "policy/clipfrac_avg": 0.009360795840620995, + "policy/entropy_avg": 0.17364120483398438, + "step": 90, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 55, + "val/ratio": 1.000040054321289, + "val/ratio_var": 6.926705395926547e-07 + }, + { + "episode": 5824, + "epoch": 1.0990752972258917, + "eps": 0, + "loss/policy_avg": -0.032273001968860626, + "loss/value_avg": 0.008706326596438885, + "lr": 3.7542087542087543e-07, + "objective/entropy": -689.3602294921875, + "objective/kl": 12.720474243164062, + "objective/non_score_reward": -0.38161420822143555, + "objective/rlhf_reward": 0.004616260528564453, + "objective/scores": 0.38671875, + "policy/approxkl_avg": 0.0006273721810430288, + "policy/clipfrac_avg": 0.009181271307170391, + "policy/entropy_avg": 0.18628311157226562, + "step": 91, + "val/clipfrac_avg": 1.6534391761524603e-05, + "val/num_eos_tokens": 41, + "val/ratio": 1.0000529289245605, + "val/ratio_var": 8.713162742424174e-07 + }, + { + "episode": 5888, + "epoch": 1.1111530477448575, + "eps": 0, + "loss/policy_avg": -0.02137349173426628, + "loss/value_avg": 0.009642090648412704, + "lr": 3.7373737373737374e-07, + "objective/entropy": -692.1001586914062, + "objective/kl": 14.837923049926758, + "objective/non_score_reward": -0.4451376795768738, + "objective/rlhf_reward": -0.06671970337629318, + "objective/scores": 0.37890625, + "policy/approxkl_avg": 0.00043404646567068994, + "policy/clipfrac_avg": 0.009237932972609997, + "policy/entropy_avg": 0.17236074805259705, + "step": 92, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 56, + "val/ratio": 1.0002210140228271, + "val/ratio_var": 5.661090085595788e-07 + }, + { + "episode": 5952, + "epoch": 1.1232307982638234, + "eps": 0, + "loss/policy_avg": -0.02585110068321228, + "loss/value_avg": 0.008299533277750015, + "lr": 3.7205387205387205e-07, + "objective/entropy": -694.7158813476562, + "objective/kl": 13.083290100097656, + "objective/non_score_reward": -0.3924986720085144, + "objective/rlhf_reward": -0.0001646876335144043, + "objective/scores": 0.392578125, + "policy/approxkl_avg": 0.00045136193512007594, + "policy/clipfrac_avg": 0.009170552715659142, + "policy/entropy_avg": 0.1658935546875, + "step": 93, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 47, + "val/ratio": 1.0000104904174805, + "val/ratio_var": 6.102125666984648e-07 + }, + { + "episode": 6016, + "epoch": 1.1353085487827892, + "eps": 0, + "loss/policy_avg": -0.009908072650432587, + "loss/value_avg": 0.008584199473261833, + "lr": 3.703703703703703e-07, + "objective/entropy": -704.2706298828125, + "objective/kl": 13.47339916229248, + "objective/non_score_reward": -0.4042019844055176, + "objective/rlhf_reward": -0.041409000754356384, + "objective/scores": 0.36328125, + "policy/approxkl_avg": 0.0004217799287289381, + "policy/clipfrac_avg": 0.009384381584823132, + "policy/entropy_avg": 0.15645718574523926, + "step": 94, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 51, + "val/ratio": 0.9999314546585083, + "val/ratio_var": 5.486367058438191e-07 + }, + { + "episode": 6080, + "epoch": 1.147386299301755, + "eps": 0, + "loss/policy_avg": -0.017584126442670822, + "loss/value_avg": 0.008460859768092632, + "lr": 3.686868686868687e-07, + "objective/entropy": -668.5169677734375, + "objective/kl": 13.884815216064453, + "objective/non_score_reward": -0.41654446721076965, + "objective/rlhf_reward": -0.041544459760189056, + "objective/scores": 0.375, + "policy/approxkl_avg": 0.00046371493954211473, + "policy/clipfrac_avg": 0.0087856724858284, + "policy/entropy_avg": 0.17994818091392517, + "step": 95, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.0000855922698975, + "val/ratio_var": 8.110731641863822e-07 + }, + { + "episode": 6144, + "epoch": 1.159464049820721, + "eps": 0, + "loss/policy_avg": -0.044272422790527344, + "loss/value_avg": 0.008409342728555202, + "lr": 3.67003367003367e-07, + "objective/entropy": -681.5213623046875, + "objective/kl": 12.985597610473633, + "objective/non_score_reward": -0.38956788182258606, + "objective/rlhf_reward": 0.06941649317741394, + "objective/scores": 0.458984375, + "policy/approxkl_avg": 0.0004158633528277278, + "policy/clipfrac_avg": 0.008287884294986725, + "policy/entropy_avg": 0.17576727271080017, + "step": 96, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 50, + "val/ratio": 0.9998923540115356, + "val/ratio_var": 7.354411764026736e-07 + }, + { + "episode": 6208, + "epoch": 1.1715418003396867, + "eps": 0, + "loss/policy_avg": -0.021865837275981903, + "loss/value_avg": 0.008920140564441681, + "lr": 3.653198653198653e-07, + "objective/entropy": -668.6990966796875, + "objective/kl": 13.717606544494629, + "objective/non_score_reward": -0.41152817010879517, + "objective/rlhf_reward": -0.03311019390821457, + "objective/scores": 0.37890625, + "policy/approxkl_avg": 0.0005852892645634711, + "policy/clipfrac_avg": 0.009737148880958557, + "policy/entropy_avg": 0.17390570044517517, + "step": 97, + "val/clipfrac_avg": 5.780614628747571e-06, + "val/num_eos_tokens": 49, + "val/ratio": 1.0000433921813965, + "val/ratio_var": 7.945446327539685e-07 + }, + { + "episode": 6272, + "epoch": 1.1836195508586527, + "eps": 0, + "loss/policy_avg": -0.01734349876642227, + "loss/value_avg": 0.008430123329162598, + "lr": 3.636363636363636e-07, + "objective/entropy": -680.8672485351562, + "objective/kl": 13.248254776000977, + "objective/non_score_reward": -0.3974476158618927, + "objective/rlhf_reward": -0.0346546545624733, + "objective/scores": 0.36328125, + "policy/approxkl_avg": 0.0004206518060527742, + "policy/clipfrac_avg": 0.008977975696325302, + "policy/entropy_avg": 0.18427658081054688, + "step": 98, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 0.9999732971191406, + "val/ratio_var": 7.201407470347476e-07 + }, + { + "episode": 6336, + "epoch": 1.1956973013776184, + "eps": 0, + "loss/policy_avg": -0.01861773617565632, + "loss/value_avg": 0.007797658443450928, + "lr": 3.6195286195286197e-07, + "objective/entropy": -750.1210327148438, + "objective/kl": 12.454809188842773, + "objective/non_score_reward": -0.37364426255226135, + "objective/rlhf_reward": 0.08729323744773865, + "objective/scores": 0.4609375, + "policy/approxkl_avg": 0.00039422509144060314, + "policy/clipfrac_avg": 0.008532920852303505, + "policy/entropy_avg": 0.15262095630168915, + "step": 99, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 51, + "val/ratio": 1.0000927448272705, + "val/ratio_var": 6.317893621599069e-07 + }, + { + "episode": 6400, + "epoch": 1.2077750518965842, + "eps": 0, + "loss/policy_avg": -0.018518339842557907, + "loss/value_avg": 0.008531475439667702, + "lr": 3.602693602693603e-07, + "objective/entropy": -659.4075317382812, + "objective/kl": 14.026962280273438, + "objective/non_score_reward": -0.4208088517189026, + "objective/rlhf_reward": -0.014070577919483185, + "objective/scores": 0.40625, + "policy/approxkl_avg": 0.0004532616585493088, + "policy/clipfrac_avg": 0.009551241993904114, + "policy/entropy_avg": 0.1776987761259079, + "step": 100, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.0001015663146973, + "val/ratio_var": 7.798981869200361e-07 + }, + { + "episode": 6464, + "epoch": 1.2198528024155502, + "eps": 0, + "loss/policy_avg": -0.016511594876646996, + "loss/value_avg": 0.007756595965474844, + "lr": 3.5858585858585854e-07, + "objective/entropy": -679.53662109375, + "objective/kl": 11.8580322265625, + "objective/non_score_reward": -0.3557409346103668, + "objective/rlhf_reward": 0.003634057939052582, + "objective/scores": 0.359375, + "policy/approxkl_avg": 0.000443882163381204, + "policy/clipfrac_avg": 0.008907586336135864, + "policy/entropy_avg": 0.16290760040283203, + "step": 101, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.0000029802322388, + "val/ratio_var": 6.948280315555166e-07 + }, + { + "episode": 6528, + "epoch": 1.231930552934516, + "eps": 0, + "loss/policy_avg": -0.0391608364880085, + "loss/value_avg": 0.00805463083088398, + "lr": 3.5690235690235685e-07, + "objective/entropy": -706.1781616210938, + "objective/kl": 12.108580589294434, + "objective/non_score_reward": -0.36325740814208984, + "objective/rlhf_reward": 0.11525820195674896, + "objective/scores": 0.478515625, + "policy/approxkl_avg": 0.00041044512181542814, + "policy/clipfrac_avg": 0.009424544870853424, + "policy/entropy_avg": 0.18370692431926727, + "step": 102, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 0.9999408721923828, + "val/ratio_var": 7.074789323269215e-07 + }, + { + "episode": 6592, + "epoch": 1.2440083034534817, + "eps": 0, + "loss/policy_avg": -0.028079848736524582, + "loss/value_avg": 0.007725001312792301, + "lr": 3.552188552188552e-07, + "objective/entropy": -759.4559326171875, + "objective/kl": 10.86036491394043, + "objective/non_score_reward": -0.3258109390735626, + "objective/rlhf_reward": 0.17907187342643738, + "objective/scores": 0.50390625, + "policy/approxkl_avg": 0.0003799691912718117, + "policy/clipfrac_avg": 0.009264815598726273, + "policy/entropy_avg": 0.16744868457317352, + "step": 103, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 57, + "val/ratio": 1.0000367164611816, + "val/ratio_var": 8.271580327345873e-07 + }, + { + "episode": 6656, + "epoch": 1.2560860539724477, + "eps": 0, + "loss/policy_avg": -0.029258184134960175, + "loss/value_avg": 0.008032194338738918, + "lr": 3.535353535353535e-07, + "objective/entropy": -731.3373413085938, + "objective/kl": 12.31856632232666, + "objective/non_score_reward": -0.3695569932460785, + "objective/rlhf_reward": 0.0728258341550827, + "objective/scores": 0.44140625, + "policy/approxkl_avg": 0.00038161594420671463, + "policy/clipfrac_avg": 0.008145595900714397, + "policy/entropy_avg": 0.15793482959270477, + "step": 104, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 0.9998757839202881, + "val/ratio_var": 5.09608128140826e-07 + }, + { + "episode": 6720, + "epoch": 1.2681638044914134, + "eps": 0, + "loss/policy_avg": -0.02511785924434662, + "loss/value_avg": 0.007496064528822899, + "lr": 3.5185185185185183e-07, + "objective/entropy": -748.223388671875, + "objective/kl": 10.859156608581543, + "objective/non_score_reward": -0.3257746994495392, + "objective/rlhf_reward": 0.12442061305046082, + "objective/scores": 0.44921875, + "policy/approxkl_avg": 0.00035941399983130395, + "policy/clipfrac_avg": 0.007973343133926392, + "policy/entropy_avg": 0.1588083952665329, + "step": 105, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 0.9999687075614929, + "val/ratio_var": 5.749298566115613e-07 + }, + { + "episode": 6784, + "epoch": 1.2802415550103794, + "eps": 0, + "loss/policy_avg": -0.016160570085048676, + "loss/value_avg": 0.0075116828083992004, + "lr": 3.5016835016835014e-07, + "objective/entropy": -734.2767333984375, + "objective/kl": 10.657108306884766, + "objective/non_score_reward": -0.3197132647037506, + "objective/rlhf_reward": 0.08360705524682999, + "objective/scores": 0.40234375, + "policy/approxkl_avg": 0.0003991243429481983, + "policy/clipfrac_avg": 0.008354730904102325, + "policy/entropy_avg": 0.17373785376548767, + "step": 106, + "val/clipfrac_avg": 6.916777692822507e-06, + "val/num_eos_tokens": 48, + "val/ratio": 0.9999769926071167, + "val/ratio_var": 2.0136023977102013e-06 + }, + { + "episode": 6848, + "epoch": 1.2923193055293452, + "eps": 0, + "loss/policy_avg": -0.031159965321421623, + "loss/value_avg": 0.007707377430051565, + "lr": 3.484848484848485e-07, + "objective/entropy": -751.6048583984375, + "objective/kl": 10.399733543395996, + "objective/non_score_reward": -0.31199198961257935, + "objective/rlhf_reward": 0.16603532433509827, + "objective/scores": 0.478515625, + "policy/approxkl_avg": 0.0003584410878829658, + "policy/clipfrac_avg": 0.0083873700350523, + "policy/entropy_avg": 0.17502593994140625, + "step": 107, + "val/clipfrac_avg": 4.006410563306417e-06, + "val/num_eos_tokens": 59, + "val/ratio": 1.0000163316726685, + "val/ratio_var": 4.1057577959691116e-07 + }, + { + "episode": 6912, + "epoch": 1.304397056048311, + "eps": 0, + "loss/policy_avg": -0.028454942628741264, + "loss/value_avg": 0.007068981416523457, + "lr": 3.4680134680134676e-07, + "objective/entropy": -743.0848388671875, + "objective/kl": 11.543351173400879, + "objective/non_score_reward": -0.34630051255226135, + "objective/rlhf_reward": 0.08485182374715805, + "objective/scores": 0.431640625, + "policy/approxkl_avg": 0.00046511981054209173, + "policy/clipfrac_avg": 0.00906567182391882, + "policy/entropy_avg": 0.17058055102825165, + "step": 108, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.0001753568649292, + "val/ratio_var": 5.996980121381057e-07 + }, + { + "episode": 6976, + "epoch": 1.316474806567277, + "eps": 0, + "loss/policy_avg": -0.025691799819469452, + "loss/value_avg": 0.007134787738323212, + "lr": 3.451178451178451e-07, + "objective/entropy": -738.3368530273438, + "objective/kl": 12.694337844848633, + "objective/non_score_reward": -0.38083016872406006, + "objective/rlhf_reward": 0.022490166127681732, + "objective/scores": 0.40234375, + "policy/approxkl_avg": 0.0003700514789670706, + "policy/clipfrac_avg": 0.008202668279409409, + "policy/entropy_avg": 0.16368231177330017, + "step": 109, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 0.9999978542327881, + "val/ratio_var": 7.537715305261372e-07 + }, + { + "episode": 7040, + "epoch": 1.3285525570862426, + "eps": 0, + "loss/policy_avg": -0.02102075144648552, + "loss/value_avg": 0.0069115618243813515, + "lr": 3.434343434343434e-07, + "objective/entropy": -674.031494140625, + "objective/kl": 11.79364013671875, + "objective/non_score_reward": -0.3538092076778412, + "objective/rlhf_reward": 0.0021478235721588135, + "objective/scores": 0.35546875, + "policy/approxkl_avg": 0.0004363355692476034, + "policy/clipfrac_avg": 0.009443921968340874, + "policy/entropy_avg": 0.18636958301067352, + "step": 110, + "val/clipfrac_avg": 1.0153373295906931e-05, + "val/num_eos_tokens": 40, + "val/ratio": 1.0000288486480713, + "val/ratio_var": 8.098888883978361e-07 + }, + { + "episode": 7104, + "epoch": 1.3406303076052086, + "eps": 0, + "loss/policy_avg": -0.032738231122493744, + "loss/value_avg": 0.006995133124291897, + "lr": 3.4175084175084175e-07, + "objective/entropy": -694.0885620117188, + "objective/kl": 11.100361824035645, + "objective/non_score_reward": -0.33301082253456116, + "objective/rlhf_reward": 0.11034853756427765, + "objective/scores": 0.443359375, + "policy/approxkl_avg": 0.0004124289262108505, + "policy/clipfrac_avg": 0.009696437045931816, + "policy/entropy_avg": 0.16990280151367188, + "step": 111, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 41, + "val/ratio": 1.000012993812561, + "val/ratio_var": 6.493988848887966e-07 + }, + { + "episode": 7168, + "epoch": 1.3527080581241744, + "eps": 0, + "loss/policy_avg": -0.014131966978311539, + "loss/value_avg": 0.0068663340061903, + "lr": 3.4006734006734006e-07, + "objective/entropy": -708.8604125976562, + "objective/kl": 12.0945405960083, + "objective/non_score_reward": -0.36283618211746216, + "objective/rlhf_reward": 0.07319895178079605, + "objective/scores": 0.435546875, + "policy/approxkl_avg": 0.0003713433106895536, + "policy/clipfrac_avg": 0.008440559729933739, + "policy/entropy_avg": 0.17221546173095703, + "step": 112, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 0.9999561309814453, + "val/ratio_var": 7.880616976763122e-07 + }, + { + "episode": 7232, + "epoch": 1.3647858086431401, + "eps": 0, + "loss/policy_avg": -0.025116082280874252, + "loss/value_avg": 0.006925811991095543, + "lr": 3.3838383838383837e-07, + "objective/entropy": -764.6071166992188, + "objective/kl": 9.985912322998047, + "objective/non_score_reward": -0.29957738518714905, + "objective/rlhf_reward": 0.20384058356285095, + "objective/scores": 0.50390625, + "policy/approxkl_avg": 0.00035295903217047453, + "policy/clipfrac_avg": 0.008869624696671963, + "policy/entropy_avg": 0.14903895556926727, + "step": 113, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 56, + "val/ratio": 0.9999246597290039, + "val/ratio_var": 6.762850830455136e-07 + }, + { + "episode": 7296, + "epoch": 1.3768635591621061, + "eps": 0, + "loss/policy_avg": -0.03208712860941887, + "loss/value_avg": 0.006576072424650192, + "lr": 3.3670033670033673e-07, + "objective/entropy": -727.884521484375, + "objective/kl": 10.056183815002441, + "objective/non_score_reward": -0.30168551206588745, + "objective/rlhf_reward": 0.20173244178295135, + "objective/scores": 0.50390625, + "policy/approxkl_avg": 0.00036313707823865116, + "policy/clipfrac_avg": 0.008787401020526886, + "policy/entropy_avg": 0.16502508521080017, + "step": 114, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 55, + "val/ratio": 1.000067114830017, + "val/ratio_var": 6.396308549483365e-07 + }, + { + "episode": 7360, + "epoch": 1.3889413096810719, + "eps": 0, + "loss/policy_avg": -0.012382835149765015, + "loss/value_avg": 0.007463869638741016, + "lr": 3.35016835016835e-07, + "objective/entropy": -742.7560424804688, + "objective/kl": 11.383095741271973, + "objective/non_score_reward": -0.3414928615093231, + "objective/rlhf_reward": 0.06329229474067688, + "objective/scores": 0.404296875, + "policy/approxkl_avg": 0.0003729486488737166, + "policy/clipfrac_avg": 0.008608178235590458, + "policy/entropy_avg": 0.15799586474895477, + "step": 115, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.0000958442687988, + "val/ratio_var": 5.714954340874101e-07 + }, + { + "episode": 7424, + "epoch": 1.4010190602000376, + "eps": 0, + "loss/policy_avg": -0.025664834305644035, + "loss/value_avg": 0.007761640008538961, + "lr": 3.333333333333333e-07, + "objective/entropy": -698.7653198242188, + "objective/kl": 11.45352554321289, + "objective/non_score_reward": -0.34360575675964355, + "objective/rlhf_reward": 0.14565207064151764, + "objective/scores": 0.48828125, + "policy/approxkl_avg": 0.0004009153926745057, + "policy/clipfrac_avg": 0.009290624409914017, + "policy/entropy_avg": 0.17402777075767517, + "step": 116, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 1.0000454187393188, + "val/ratio_var": 5.306055754772387e-07 + }, + { + "episode": 7488, + "epoch": 1.4130968107190036, + "eps": 0, + "loss/policy_avg": 0.002953662071377039, + "loss/value_avg": 0.008123742416501045, + "lr": 3.316498316498316e-07, + "objective/entropy": -682.9542236328125, + "objective/kl": 11.626581192016602, + "objective/non_score_reward": -0.348797470331192, + "objective/rlhf_reward": -0.012859970331192017, + "objective/scores": 0.3359375, + "policy/approxkl_avg": 0.00040126274689100683, + "policy/clipfrac_avg": 0.008363310247659683, + "policy/entropy_avg": 0.17609915137290955, + "step": 117, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 48, + "val/ratio": 1.0000150203704834, + "val/ratio_var": 5.960429234619369e-07 + }, + { + "episode": 7552, + "epoch": 1.4251745612379694, + "eps": 0, + "loss/policy_avg": -0.024843420833349228, + "loss/value_avg": 0.008115454576909542, + "lr": 3.2996632996633e-07, + "objective/entropy": -706.5005493164062, + "objective/kl": 10.046842575073242, + "objective/non_score_reward": -0.30140525102615356, + "objective/rlhf_reward": 0.17369240522384644, + "objective/scores": 0.474609375, + "policy/approxkl_avg": 0.0003913644468411803, + "policy/clipfrac_avg": 0.008209867402911186, + "policy/entropy_avg": 0.16899840533733368, + "step": 118, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 50, + "val/ratio": 1.000002145767212, + "val/ratio_var": 4.6929091013225843e-07 + }, + { + "episode": 7616, + "epoch": 1.4372523117569354, + "eps": 0, + "loss/policy_avg": -0.027740802615880966, + "loss/value_avg": 0.008931613527238369, + "lr": 3.282828282828283e-07, + "objective/entropy": -657.9352416992188, + "objective/kl": 13.040338516235352, + "objective/non_score_reward": -0.39121013879776, + "objective/rlhf_reward": -0.047460153698921204, + "objective/scores": 0.34375, + "policy/approxkl_avg": 0.0005606787162832916, + "policy/clipfrac_avg": 0.010201944038271904, + "policy/entropy_avg": 0.19644801318645477, + "step": 119, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 51, + "val/ratio": 0.9999899864196777, + "val/ratio_var": 7.442695277859457e-07 + }, + { + "episode": 7680, + "epoch": 1.449330062275901, + "eps": 0, + "loss/policy_avg": -0.020683161914348602, + "loss/value_avg": 0.009700208902359009, + "lr": 3.265993265993266e-07, + "objective/entropy": -690.8984375, + "objective/kl": 11.422820091247559, + "objective/non_score_reward": -0.3426845967769623, + "objective/rlhf_reward": 0.07919040322303772, + "objective/scores": 0.421875, + "policy/approxkl_avg": 0.0004005637892987579, + "policy/clipfrac_avg": 0.008436471223831177, + "policy/entropy_avg": 0.17758052051067352, + "step": 120, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 47, + "val/ratio": 1.00006103515625, + "val/ratio_var": 6.303826580733585e-07 + }, + { + "episode": 7744, + "epoch": 1.4614078127948669, + "eps": 0, + "loss/policy_avg": -0.013201544992625713, + "loss/value_avg": 0.008843690156936646, + "lr": 3.249158249158249e-07, + "objective/entropy": -661.7539672851562, + "objective/kl": 11.296271324157715, + "objective/non_score_reward": -0.33888810873031616, + "objective/rlhf_reward": 0.023416556417942047, + "objective/scores": 0.36328125, + "policy/approxkl_avg": 0.00045110570499673486, + "policy/clipfrac_avg": 0.008799891918897629, + "policy/entropy_avg": 0.1949361264705658, + "step": 121, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 50, + "val/ratio": 0.9999641180038452, + "val/ratio_var": 8.055627063185966e-07 + }, + { + "episode": 7808, + "epoch": 1.4734855633138328, + "eps": 0, + "loss/policy_avg": -0.017284566536545753, + "loss/value_avg": 0.011503017507493496, + "lr": 3.2323232323232327e-07, + "objective/entropy": -659.644287109375, + "objective/kl": 11.284911155700684, + "objective/non_score_reward": -0.33854734897613525, + "objective/rlhf_reward": 0.04646243155002594, + "objective/scores": 0.384765625, + "policy/approxkl_avg": 0.00042937506805174053, + "policy/clipfrac_avg": 0.008618071675300598, + "policy/entropy_avg": 0.19123205542564392, + "step": 122, + "val/clipfrac_avg": 0.00022032562992535532, + "val/num_eos_tokens": 43, + "val/ratio": 1.0000629425048828, + "val/ratio_var": 6.566247634509637e-07 + }, + { + "episode": 7872, + "epoch": 1.4855633138327986, + "eps": 0, + "loss/policy_avg": -0.021937822923064232, + "loss/value_avg": 0.009018287062644958, + "lr": 3.2154882154882153e-07, + "objective/entropy": -657.0106811523438, + "objective/kl": 10.135122299194336, + "objective/non_score_reward": -0.3040536642074585, + "objective/rlhf_reward": 0.10219632089138031, + "objective/scores": 0.40625, + "policy/approxkl_avg": 0.0003951935505028814, + "policy/clipfrac_avg": 0.008559124544262886, + "policy/entropy_avg": 0.17734527587890625, + "step": 123, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.0001037120819092, + "val/ratio_var": 6.916661732248031e-07 + }, + { + "episode": 7936, + "epoch": 1.4976410643517646, + "eps": 0, + "loss/policy_avg": -0.013917829841375351, + "loss/value_avg": 0.009599328972399235, + "lr": 3.1986531986531984e-07, + "objective/entropy": -703.0108642578125, + "objective/kl": 11.34697151184082, + "objective/non_score_reward": -0.34040915966033936, + "objective/rlhf_reward": 0.048262715339660645, + "objective/scores": 0.388671875, + "policy/approxkl_avg": 0.0003819286357611418, + "policy/clipfrac_avg": 0.008146028965711594, + "policy/entropy_avg": 0.1836903989315033, + "step": 124, + "val/clipfrac_avg": 8.00051202531904e-06, + "val/num_eos_tokens": 60, + "val/ratio": 1.0001184940338135, + "val/ratio_var": 8.107883218144707e-07 + }, + { + "episode": 8000, + "epoch": 1.5097188148707303, + "eps": 0, + "loss/policy_avg": -0.02694123610854149, + "loss/value_avg": 0.00868251547217369, + "lr": 3.1818181818181815e-07, + "objective/entropy": -613.013671875, + "objective/kl": 13.164669036865234, + "objective/non_score_reward": -0.394940048456192, + "objective/rlhf_reward": -0.043865837156772614, + "objective/scores": 0.3515625, + "policy/approxkl_avg": 0.0005462130066007376, + "policy/clipfrac_avg": 0.008205180056393147, + "policy/entropy_avg": 0.20569229125976562, + "step": 125, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.0000652074813843, + "val/ratio_var": 7.626629781043448e-07 + }, + { + "episode": 8064, + "epoch": 1.521796565389696, + "eps": 0, + "loss/policy_avg": -0.021035056561231613, + "loss/value_avg": 0.008920412510633469, + "lr": 3.164983164983165e-07, + "objective/entropy": -658.7114868164062, + "objective/kl": 11.970376968383789, + "objective/non_score_reward": -0.35911130905151367, + "objective/rlhf_reward": 0.07545901089906693, + "objective/scores": 0.43359375, + "policy/approxkl_avg": 0.00043813008232973516, + "policy/clipfrac_avg": 0.00897503923624754, + "policy/entropy_avg": 0.1892774999141693, + "step": 126, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 0.9998860359191895, + "val/ratio_var": 5.986601081531262e-07 + }, + { + "episode": 8128, + "epoch": 1.533874315908662, + "eps": 0, + "loss/policy_avg": -0.009907988831400871, + "loss/value_avg": 0.007388514932245016, + "lr": 3.148148148148148e-07, + "objective/entropy": -683.1361083984375, + "objective/kl": 11.096467971801758, + "objective/non_score_reward": -0.3328940272331238, + "objective/rlhf_reward": 0.006461434066295624, + "objective/scores": 0.33984375, + "policy/approxkl_avg": 0.0004163893754594028, + "policy/clipfrac_avg": 0.00852059293538332, + "policy/entropy_avg": 0.1878509521484375, + "step": 127, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.000105381011963, + "val/ratio_var": 7.102952963577991e-07 + }, + { + "episode": 8192, + "epoch": 1.5459520664276278, + "eps": 0, + "loss/policy_avg": -0.026054969057440758, + "loss/value_avg": 0.0072658974677324295, + "lr": 3.1313131313131313e-07, + "objective/entropy": -708.2806396484375, + "objective/kl": 10.58319091796875, + "objective/non_score_reward": -0.31749576330184937, + "objective/rlhf_reward": 0.10144957154989243, + "objective/scores": 0.41796875, + "policy/approxkl_avg": 0.0004081852675881237, + "policy/clipfrac_avg": 0.008715375326573849, + "policy/entropy_avg": 0.17965063452720642, + "step": 128, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 1.0000120401382446, + "val/ratio_var": 7.822923180356156e-07 + }, + { + "episode": 8256, + "epoch": 1.5580298169465936, + "eps": 0, + "loss/policy_avg": -0.0229483749717474, + "loss/value_avg": 0.007459428161382675, + "lr": 3.1144781144781144e-07, + "objective/entropy": -659.6595458984375, + "objective/kl": 10.650728225708008, + "objective/non_score_reward": -0.31952184438705444, + "objective/rlhf_reward": 0.08672817051410675, + "objective/scores": 0.40625, + "policy/approxkl_avg": 0.00047252658987417817, + "policy/clipfrac_avg": 0.008922006003558636, + "policy/entropy_avg": 0.18110594153404236, + "step": 129, + "val/clipfrac_avg": 4.006410563306417e-06, + "val/num_eos_tokens": 41, + "val/ratio": 1.0000056028366089, + "val/ratio_var": 6.670750281045912e-07 + }, + { + "episode": 8320, + "epoch": 1.5701075674655596, + "eps": 0, + "loss/policy_avg": -0.02617826871573925, + "loss/value_avg": 0.007010661065578461, + "lr": 3.0976430976430975e-07, + "objective/entropy": -678.2298583984375, + "objective/kl": 12.300437927246094, + "objective/non_score_reward": -0.36901313066482544, + "objective/rlhf_reward": 0.03870171308517456, + "objective/scores": 0.408203125, + "policy/approxkl_avg": 0.0006277774227783084, + "policy/clipfrac_avg": 0.009144840762019157, + "policy/entropy_avg": 0.19200897216796875, + "step": 130, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 43, + "val/ratio": 1.0000146627426147, + "val/ratio_var": 6.612851848331047e-07 + }, + { + "episode": 8384, + "epoch": 1.5821853179845253, + "eps": 0, + "loss/policy_avg": -0.010964921675622463, + "loss/value_avg": 0.006414837669581175, + "lr": 3.0808080808080806e-07, + "objective/entropy": -716.4149169921875, + "objective/kl": 10.809772491455078, + "objective/non_score_reward": -0.3242931365966797, + "objective/rlhf_reward": 0.038011543452739716, + "objective/scores": 0.36328125, + "policy/approxkl_avg": 0.0003922901814803481, + "policy/clipfrac_avg": 0.00839821808040142, + "policy/entropy_avg": 0.18431854248046875, + "step": 131, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 52, + "val/ratio": 0.9999863505363464, + "val/ratio_var": 6.125242180132773e-07 + }, + { + "episode": 8448, + "epoch": 1.5942630685034913, + "eps": 0, + "loss/policy_avg": -0.02889040671288967, + "loss/value_avg": 0.00650613009929657, + "lr": 3.063973063973064e-07, + "objective/entropy": -713.104736328125, + "objective/kl": 10.613985061645508, + "objective/non_score_reward": -0.31841954588890076, + "objective/rlhf_reward": 0.11712733656167984, + "objective/scores": 0.435546875, + "policy/approxkl_avg": 0.00041002966463565826, + "policy/clipfrac_avg": 0.008486878126859665, + "policy/entropy_avg": 0.1799418181180954, + "step": 132, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 1.0001921653747559, + "val/ratio_var": 7.679034865759604e-07 + }, + { + "episode": 8512, + "epoch": 1.606340819022457, + "eps": 0, + "loss/policy_avg": -0.017285965383052826, + "loss/value_avg": 0.005914734210819006, + "lr": 3.047138047138047e-07, + "objective/entropy": -597.8193359375, + "objective/kl": 10.843616485595703, + "objective/non_score_reward": -0.3253084719181061, + "objective/rlhf_reward": 0.024300895631313324, + "objective/scores": 0.349609375, + "policy/approxkl_avg": 0.0005316430469974875, + "policy/clipfrac_avg": 0.008745117112994194, + "policy/entropy_avg": 0.18398921191692352, + "step": 133, + "val/clipfrac_avg": 1.3069845408608671e-05, + "val/num_eos_tokens": 43, + "val/ratio": 1.0001111030578613, + "val/ratio_var": 1.0886411700994358e-06 + }, + { + "episode": 8576, + "epoch": 1.6184185695414228, + "eps": 0, + "loss/policy_avg": -0.0031869204249233007, + "loss/value_avg": 0.0062155104242265224, + "lr": 3.0303030303030305e-07, + "objective/entropy": -668.35791015625, + "objective/kl": 11.458172798156738, + "objective/non_score_reward": -0.3437451720237732, + "objective/rlhf_reward": 0.018315374851226807, + "objective/scores": 0.361328125, + "policy/approxkl_avg": 0.00043556466698646545, + "policy/clipfrac_avg": 0.009600733406841755, + "policy/entropy_avg": 0.1877404898405075, + "step": 134, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 1.0000466108322144, + "val/ratio_var": 5.28957230017113e-07 + }, + { + "episode": 8640, + "epoch": 1.6304963200603888, + "eps": 0, + "loss/policy_avg": -0.02297011762857437, + "loss/value_avg": 0.005568277090787888, + "lr": 3.0134680134680136e-07, + "objective/entropy": -664.0026245117188, + "objective/kl": 10.111173629760742, + "objective/non_score_reward": -0.30333518981933594, + "objective/rlhf_reward": 0.15467262268066406, + "objective/scores": 0.45703125, + "policy/approxkl_avg": 0.0005884866695851088, + "policy/clipfrac_avg": 0.008443444035947323, + "policy/entropy_avg": 0.1873784065246582, + "step": 135, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 0.9999123811721802, + "val/ratio_var": 6.365415856635082e-07 + }, + { + "episode": 8704, + "epoch": 1.6425740705793546, + "eps": 0, + "loss/policy_avg": -0.032759517431259155, + "loss/value_avg": 0.005268210079520941, + "lr": 2.9966329966329967e-07, + "objective/entropy": -600.26708984375, + "objective/kl": 11.405153274536133, + "objective/non_score_reward": -0.3421545624732971, + "objective/rlhf_reward": 0.05628291517496109, + "objective/scores": 0.3984375, + "policy/approxkl_avg": 0.0004980469821020961, + "policy/clipfrac_avg": 0.00921311229467392, + "policy/entropy_avg": 0.1832377165555954, + "step": 136, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.0000090599060059, + "val/ratio_var": 8.463471772302e-07 + }, + { + "episode": 8768, + "epoch": 1.6546518210983205, + "eps": 0, + "loss/policy_avg": -0.02364802360534668, + "loss/value_avg": 0.005613654851913452, + "lr": 2.9797979797979793e-07, + "objective/entropy": -710.4251708984375, + "objective/kl": 10.040770530700684, + "objective/non_score_reward": -0.30122309923171997, + "objective/rlhf_reward": 0.18656986951828003, + "objective/scores": 0.48828125, + "policy/approxkl_avg": 0.00038759977906011045, + "policy/clipfrac_avg": 0.008407797664403915, + "policy/entropy_avg": 0.17165374755859375, + "step": 137, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 48, + "val/ratio": 1.0000735521316528, + "val/ratio_var": 5.960384896752657e-07 + }, + { + "episode": 8832, + "epoch": 1.6667295716172863, + "eps": 0, + "loss/policy_avg": -0.02203012816607952, + "loss/value_avg": 0.00595112843438983, + "lr": 2.962962962962963e-07, + "objective/entropy": -672.9989624023438, + "objective/kl": 10.036592483520508, + "objective/non_score_reward": -0.3010977804660797, + "objective/rlhf_reward": 0.07243738323450089, + "objective/scores": 0.373046875, + "policy/approxkl_avg": 0.00040325592271983624, + "policy/clipfrac_avg": 0.008593715727329254, + "policy/entropy_avg": 0.17470209300518036, + "step": 138, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 0.9999663233757019, + "val/ratio_var": 5.66750088637491e-07 + }, + { + "episode": 8896, + "epoch": 1.678807322136252, + "eps": 0, + "loss/policy_avg": -0.026858514174818993, + "loss/value_avg": 0.005622117780148983, + "lr": 2.946127946127946e-07, + "objective/entropy": -661.4876708984375, + "objective/kl": 11.446596145629883, + "objective/non_score_reward": -0.343397855758667, + "objective/rlhf_reward": 0.11412166804075241, + "objective/scores": 0.45703125, + "policy/approxkl_avg": 0.00048249890096485615, + "policy/clipfrac_avg": 0.008328979834914207, + "policy/entropy_avg": 0.18089675903320312, + "step": 139, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 1.0000536441802979, + "val/ratio_var": 7.740272849332541e-07 + }, + { + "episode": 8960, + "epoch": 1.690885072655218, + "eps": 0, + "loss/policy_avg": -0.012071679346263409, + "loss/value_avg": 0.005673854611814022, + "lr": 2.929292929292929e-07, + "objective/entropy": -716.2989501953125, + "objective/kl": 9.62091064453125, + "objective/non_score_reward": -0.2886272668838501, + "objective/rlhf_reward": 0.07660708576440811, + "objective/scores": 0.365234375, + "policy/approxkl_avg": 0.0004490650608204305, + "policy/clipfrac_avg": 0.008078145794570446, + "policy/entropy_avg": 0.17590078711509705, + "step": 140, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 0.9998876452445984, + "val/ratio_var": 7.121066687432176e-07 + }, + { + "episode": 9024, + "epoch": 1.7029628231741838, + "eps": 0, + "loss/policy_avg": -0.01407882571220398, + "loss/value_avg": 0.005721048917621374, + "lr": 2.912457912457912e-07, + "objective/entropy": -635.1033325195312, + "objective/kl": 11.491706848144531, + "objective/non_score_reward": -0.34475117921829224, + "objective/rlhf_reward": 0.012670673429965973, + "objective/scores": 0.357421875, + "policy/approxkl_avg": 0.00046376598766073585, + "policy/clipfrac_avg": 0.008426757529377937, + "policy/entropy_avg": 0.18351492285728455, + "step": 141, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 0.9998941421508789, + "val/ratio_var": 7.447699772455962e-07 + }, + { + "episode": 9088, + "epoch": 1.7150405736931496, + "eps": 0, + "loss/policy_avg": -0.026968976482748985, + "loss/value_avg": 0.005361597985029221, + "lr": 2.895622895622896e-07, + "objective/entropy": -688.4439086914062, + "objective/kl": 10.791741371154785, + "objective/non_score_reward": -0.323752224445343, + "objective/rlhf_reward": 0.10886494815349579, + "objective/scores": 0.43359375, + "policy/approxkl_avg": 0.00042141028097830713, + "policy/clipfrac_avg": 0.008808997459709644, + "policy/entropy_avg": 0.18024954199790955, + "step": 142, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.0000135898590088, + "val/ratio_var": 5.709226229555497e-07 + }, + { + "episode": 9152, + "epoch": 1.7271183242121155, + "eps": 0, + "loss/policy_avg": -0.027867591008543968, + "loss/value_avg": 0.005742911249399185, + "lr": 2.878787878787879e-07, + "objective/entropy": -663.561279296875, + "objective/kl": 9.091588973999023, + "objective/non_score_reward": -0.272747665643692, + "objective/rlhf_reward": 0.17207655310630798, + "objective/scores": 0.4453125, + "policy/approxkl_avg": 0.00043330591870471835, + "policy/clipfrac_avg": 0.008460369892418385, + "policy/entropy_avg": 0.19001516699790955, + "step": 143, + "val/clipfrac_avg": 6.127450888016028e-06, + "val/num_eos_tokens": 39, + "val/ratio": 1.0000602006912231, + "val/ratio_var": 6.067602953407913e-07 + }, + { + "episode": 9216, + "epoch": 1.7391960747310813, + "eps": 0, + "loss/policy_avg": -0.048003654927015305, + "loss/value_avg": 0.004765670746564865, + "lr": 2.8619528619528615e-07, + "objective/entropy": -609.2054443359375, + "objective/kl": 11.087499618530273, + "objective/non_score_reward": -0.33262500166893005, + "objective/rlhf_reward": 0.15663282573223114, + "objective/scores": 0.48828125, + "policy/approxkl_avg": 0.00043542124330997467, + "policy/clipfrac_avg": 0.008409352041780949, + "policy/entropy_avg": 0.18917052447795868, + "step": 144, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 0.9998970031738281, + "val/ratio_var": 7.055933224364708e-07 + }, + { + "episode": 9280, + "epoch": 1.7512738252500473, + "eps": 0, + "loss/policy_avg": -0.022776823490858078, + "loss/value_avg": 0.005445465445518494, + "lr": 2.8451178451178446e-07, + "objective/entropy": -649.3797607421875, + "objective/kl": 11.225455284118652, + "objective/non_score_reward": -0.3367636799812317, + "objective/rlhf_reward": 0.056791022419929504, + "objective/scores": 0.39453125, + "policy/approxkl_avg": 0.00043666589772328734, + "policy/clipfrac_avg": 0.008527948521077633, + "policy/entropy_avg": 0.1882273405790329, + "step": 145, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 1.0000823736190796, + "val/ratio_var": 7.544516051893879e-07 + }, + { + "episode": 9344, + "epoch": 1.763351575769013, + "eps": 0, + "loss/policy_avg": -0.02756238356232643, + "loss/value_avg": 0.005087685771286488, + "lr": 2.8282828282828283e-07, + "objective/entropy": -622.69384765625, + "objective/kl": 10.821589469909668, + "objective/non_score_reward": -0.3246476650238037, + "objective/rlhf_reward": 0.08306717872619629, + "objective/scores": 0.408203125, + "policy/approxkl_avg": 0.00043378135887905955, + "policy/clipfrac_avg": 0.008366484194993973, + "policy/entropy_avg": 0.18392562866210938, + "step": 146, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 0.9999205470085144, + "val/ratio_var": 7.217399229375587e-07 + }, + { + "episode": 9408, + "epoch": 1.7754293262879788, + "eps": 0, + "loss/policy_avg": -0.01545548252761364, + "loss/value_avg": 0.005610906984657049, + "lr": 2.8114478114478114e-07, + "objective/entropy": -710.6151733398438, + "objective/kl": 10.56408977508545, + "objective/non_score_reward": -0.316922664642334, + "objective/rlhf_reward": 0.09079217165708542, + "objective/scores": 0.408203125, + "policy/approxkl_avg": 0.00047942213132046163, + "policy/clipfrac_avg": 0.008930440992116928, + "policy/entropy_avg": 0.1723581999540329, + "step": 147, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 48, + "val/ratio": 0.9999326467514038, + "val/ratio_var": 5.841548045282252e-07 + }, + { + "episode": 9472, + "epoch": 1.7875070768069448, + "eps": 0, + "loss/policy_avg": -0.030797995626926422, + "loss/value_avg": 0.005243232008069754, + "lr": 2.7946127946127945e-07, + "objective/entropy": -729.7215576171875, + "objective/kl": 7.677038192749023, + "objective/non_score_reward": -0.23031114041805267, + "objective/rlhf_reward": 0.26822400093078613, + "objective/scores": 0.498046875, + "policy/approxkl_avg": 0.0003217764897271991, + "policy/clipfrac_avg": 0.007711475715041161, + "policy/entropy_avg": 0.15406641364097595, + "step": 148, + "val/clipfrac_avg": 2.40384615608491e-05, + "val/num_eos_tokens": 51, + "val/ratio": 0.9999207854270935, + "val/ratio_var": 6.207331466612231e-07 + }, + { + "episode": 9536, + "epoch": 1.7995848273259105, + "eps": 0, + "loss/policy_avg": -0.013873748481273651, + "loss/value_avg": 0.005824836902320385, + "lr": 2.7777777777777776e-07, + "objective/entropy": -724.1422729492188, + "objective/kl": 9.410755157470703, + "objective/non_score_reward": -0.28232264518737793, + "objective/rlhf_reward": 0.10390782356262207, + "objective/scores": 0.38671875, + "policy/approxkl_avg": 0.00034999821218661964, + "policy/clipfrac_avg": 0.00804916676133871, + "policy/entropy_avg": 0.1864827573299408, + "step": 149, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.0000414848327637, + "val/ratio_var": 6.957430400689191e-07 + }, + { + "episode": 9600, + "epoch": 1.8116625778448765, + "eps": 0, + "loss/policy_avg": -0.03579283133149147, + "loss/value_avg": 0.0054255155846476555, + "lr": 2.760942760942761e-07, + "objective/entropy": -673.954833984375, + "objective/kl": 11.36821174621582, + "objective/non_score_reward": -0.3410463333129883, + "objective/rlhf_reward": 0.10768412053585052, + "objective/scores": 0.44921875, + "policy/approxkl_avg": 0.00042447494342923164, + "policy/clipfrac_avg": 0.00855330191552639, + "policy/entropy_avg": 0.19662603735923767, + "step": 150, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 53, + "val/ratio": 1.0001200437545776, + "val/ratio_var": 7.517347171415167e-07 + }, + { + "episode": 9664, + "epoch": 1.8237403283638423, + "eps": 0, + "loss/policy_avg": -0.008654760196805, + "loss/value_avg": 0.005138866137713194, + "lr": 2.7441077441077443e-07, + "objective/entropy": -660.7220458984375, + "objective/kl": 11.443527221679688, + "objective/non_score_reward": -0.3433057963848114, + "objective/rlhf_reward": 0.031938336789608, + "objective/scores": 0.375, + "policy/approxkl_avg": 0.00044237799011170864, + "policy/clipfrac_avg": 0.008529680781066418, + "policy/entropy_avg": 0.18802008032798767, + "step": 151, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 47, + "val/ratio": 1.0000139474868774, + "val/ratio_var": 7.119359111129597e-07 + }, + { + "episode": 9728, + "epoch": 1.835818078882808, + "eps": 0, + "loss/policy_avg": -0.02010141685605049, + "loss/value_avg": 0.0053723035380244255, + "lr": 2.727272727272727e-07, + "objective/entropy": -687.9390869140625, + "objective/kl": 9.019115447998047, + "objective/non_score_reward": -0.2705734670162201, + "objective/rlhf_reward": 0.0761062279343605, + "objective/scores": 0.34765625, + "policy/approxkl_avg": 0.00040762912249192595, + "policy/clipfrac_avg": 0.008179357275366783, + "policy/entropy_avg": 0.18950526416301727, + "step": 152, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 47, + "val/ratio": 0.9998996257781982, + "val/ratio_var": 6.454416165979637e-07 + }, + { + "episode": 9792, + "epoch": 1.847895829401774, + "eps": 0, + "loss/policy_avg": -0.003955461550503969, + "loss/value_avg": 0.0057320622727274895, + "lr": 2.71043771043771e-07, + "objective/entropy": -709.93115234375, + "objective/kl": 9.436538696289062, + "objective/non_score_reward": -0.2830961346626282, + "objective/rlhf_reward": 0.09629838913679123, + "objective/scores": 0.37890625, + "policy/approxkl_avg": 0.00038871431024745107, + "policy/clipfrac_avg": 0.0074014379642903805, + "policy/entropy_avg": 0.18319067358970642, + "step": 153, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 48, + "val/ratio": 1.0000646114349365, + "val/ratio_var": 5.15572935455566e-07 + }, + { + "episode": 9856, + "epoch": 1.8599735799207398, + "eps": 0, + "loss/policy_avg": -0.02044486068189144, + "loss/value_avg": 0.00510798767209053, + "lr": 2.6936026936026936e-07, + "objective/entropy": -618.0216064453125, + "objective/kl": 10.58153247833252, + "objective/non_score_reward": -0.3174459636211395, + "objective/rlhf_reward": 0.05462434142827988, + "objective/scores": 0.37109375, + "policy/approxkl_avg": 0.00045742533984594047, + "policy/clipfrac_avg": 0.008799334987998009, + "policy/entropy_avg": 0.2110799252986908, + "step": 154, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 43, + "val/ratio": 0.9999656677246094, + "val/ratio_var": 6.85510656239785e-07 + }, + { + "episode": 9920, + "epoch": 1.8720513304397055, + "eps": 0, + "loss/policy_avg": -0.030187513679265976, + "loss/value_avg": 0.0049795545637607574, + "lr": 2.676767676767677e-07, + "objective/entropy": -632.8074951171875, + "objective/kl": 10.401787757873535, + "objective/non_score_reward": -0.3120536208152771, + "objective/rlhf_reward": 0.12495807558298111, + "objective/scores": 0.4375, + "policy/approxkl_avg": 0.0004211895284242928, + "policy/clipfrac_avg": 0.007907616905868053, + "policy/entropy_avg": 0.20108795166015625, + "step": 155, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 47, + "val/ratio": 0.999963641166687, + "val/ratio_var": 6.760049586773675e-07 + }, + { + "episode": 9984, + "epoch": 1.8841290809586715, + "eps": 0, + "loss/policy_avg": -0.029912468045949936, + "loss/value_avg": 0.005053409840911627, + "lr": 2.65993265993266e-07, + "objective/entropy": -617.66943359375, + "objective/kl": 11.245973587036133, + "objective/non_score_reward": -0.33737921714782715, + "objective/rlhf_reward": 0.07375361770391464, + "objective/scores": 0.41015625, + "policy/approxkl_avg": 0.00048606080235913396, + "policy/clipfrac_avg": 0.00850730575621128, + "policy/entropy_avg": 0.2064310759305954, + "step": 156, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 40, + "val/ratio": 1.000071406364441, + "val/ratio_var": 9.804023193282774e-07 + }, + { + "episode": 10048, + "epoch": 1.8962068314776372, + "eps": 0, + "loss/policy_avg": -0.04771365970373154, + "loss/value_avg": 0.004816756118088961, + "lr": 2.643097643097643e-07, + "objective/entropy": -655.9871826171875, + "objective/kl": 9.88708782196045, + "objective/non_score_reward": -0.2966126501560211, + "objective/rlhf_reward": 0.19313344359397888, + "objective/scores": 0.490234375, + "policy/approxkl_avg": 0.00041826663073152304, + "policy/clipfrac_avg": 0.00776095874607563, + "policy/entropy_avg": 0.18035888671875, + "step": 157, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 43, + "val/ratio": 1.0000017881393433, + "val/ratio_var": 6.541795869452471e-07 + }, + { + "episode": 10112, + "epoch": 1.9082845819966032, + "eps": 0, + "loss/policy_avg": -0.015434409491717815, + "loss/value_avg": 0.004857035353779793, + "lr": 2.6262626262626266e-07, + "objective/entropy": -639.9647216796875, + "objective/kl": 9.765108108520508, + "objective/non_score_reward": -0.2929532527923584, + "objective/rlhf_reward": 0.09498622268438339, + "objective/scores": 0.388671875, + "policy/approxkl_avg": 0.00043778051622211933, + "policy/clipfrac_avg": 0.009346296079456806, + "policy/entropy_avg": 0.1973876953125, + "step": 158, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 1.0000224113464355, + "val/ratio_var": 8.540955036551168e-07 + }, + { + "episode": 10176, + "epoch": 1.920362332515569, + "eps": 0, + "loss/policy_avg": -0.029195090755820274, + "loss/value_avg": 0.004645572509616613, + "lr": 2.609427609427609e-07, + "objective/entropy": -655.7578125, + "objective/kl": 11.296720504760742, + "objective/non_score_reward": -0.3389016389846802, + "objective/rlhf_reward": 0.10152805596590042, + "objective/scores": 0.44140625, + "policy/approxkl_avg": 0.0004221507697366178, + "policy/clipfrac_avg": 0.008524060249328613, + "policy/entropy_avg": 0.1910552978515625, + "step": 159, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 48, + "val/ratio": 0.999944806098938, + "val/ratio_var": 5.19986315339338e-07 + }, + { + "episode": 10240, + "epoch": 1.9324400830345347, + "eps": 0, + "loss/policy_avg": -0.026694564148783684, + "loss/value_avg": 0.0052330996841192245, + "lr": 2.5925925925925923e-07, + "objective/entropy": -705.4793701171875, + "objective/kl": 9.390169143676758, + "objective/non_score_reward": -0.2817050516605377, + "objective/rlhf_reward": 0.17459377646446228, + "objective/scores": 0.45703125, + "policy/approxkl_avg": 0.000462901167338714, + "policy/clipfrac_avg": 0.00837808009237051, + "policy/entropy_avg": 0.18092474341392517, + "step": 160, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 40, + "val/ratio": 1.0000178813934326, + "val/ratio_var": 6.675589929727721e-07 + }, + { + "episode": 10304, + "epoch": 1.9445178335535007, + "eps": 0, + "loss/policy_avg": -0.03287532925605774, + "loss/value_avg": 0.004949102643877268, + "lr": 2.5757575757575754e-07, + "objective/entropy": -731.1031494140625, + "objective/kl": 9.570659637451172, + "objective/non_score_reward": -0.2871198058128357, + "objective/rlhf_reward": 0.2748919129371643, + "objective/scores": 0.5625, + "policy/approxkl_avg": 0.00036840554093942046, + "policy/clipfrac_avg": 0.008850205689668655, + "policy/entropy_avg": 0.17508062720298767, + "step": 161, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 0.9999874830245972, + "val/ratio_var": 6.138056392046565e-07 + }, + { + "episode": 10368, + "epoch": 1.9565955840724665, + "eps": 0, + "loss/policy_avg": -0.028138628229498863, + "loss/value_avg": 0.004971574060618877, + "lr": 2.558922558922559e-07, + "objective/entropy": -692.5169677734375, + "objective/kl": 9.982763290405273, + "objective/non_score_reward": -0.29948288202285767, + "objective/rlhf_reward": 0.18977493047714233, + "objective/scores": 0.48828125, + "policy/approxkl_avg": 0.0013259215047582984, + "policy/clipfrac_avg": 0.007515524979680777, + "policy/entropy_avg": 0.18165206909179688, + "step": 162, + "val/clipfrac_avg": 4.633748631022172e-06, + "val/num_eos_tokens": 40, + "val/ratio": 0.9999215006828308, + "val/ratio_var": 6.52264759537502e-07 + }, + { + "episode": 10432, + "epoch": 1.9686733345914325, + "eps": 0, + "loss/policy_avg": -0.007500559091567993, + "loss/value_avg": 0.005513847805559635, + "lr": 2.542087542087542e-07, + "objective/entropy": -709.478515625, + "objective/kl": 8.840448379516602, + "objective/non_score_reward": -0.2652134299278259, + "objective/rlhf_reward": 0.15617327392101288, + "objective/scores": 0.421875, + "policy/approxkl_avg": 0.00044637074461206794, + "policy/clipfrac_avg": 0.007994147948920727, + "policy/entropy_avg": 0.185394287109375, + "step": 163, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 1.0000178813934326, + "val/ratio_var": 6.560539986821823e-07 + }, + { + "episode": 10496, + "epoch": 1.9807510851103982, + "eps": 0, + "loss/policy_avg": -0.012414928525686264, + "loss/value_avg": 0.004942988511174917, + "lr": 2.525252525252525e-07, + "objective/entropy": -699.0042724609375, + "objective/kl": 9.015774726867676, + "objective/non_score_reward": -0.2704732418060303, + "objective/rlhf_reward": 0.17386269569396973, + "objective/scores": 0.4453125, + "policy/approxkl_avg": 0.0003955226275138557, + "policy/clipfrac_avg": 0.00792492926120758, + "policy/entropy_avg": 0.18563461303710938, + "step": 164, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 48, + "val/ratio": 1.0000154972076416, + "val/ratio_var": 7.629991500834876e-07 + }, + { + "episode": 10560, + "epoch": 1.992828835629364, + "eps": 0, + "loss/policy_avg": -0.009898051619529724, + "loss/value_avg": 0.004960807505995035, + "lr": 2.5084175084175083e-07, + "objective/entropy": -661.8831787109375, + "objective/kl": 9.828740119934082, + "objective/non_score_reward": -0.29486221075057983, + "objective/rlhf_reward": 0.04888780415058136, + "objective/scores": 0.34375, + "policy/approxkl_avg": 0.00043535567237995565, + "policy/clipfrac_avg": 0.008342267014086246, + "policy/entropy_avg": 0.1956939697265625, + "step": 165, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 0.9997992515563965, + "val/ratio_var": 8.213489763875259e-07 + }, + { + "episode": 10624, + "epoch": 2.00490658614833, + "eps": 0, + "loss/policy_avg": 0.0006142702768556774, + "loss/value_avg": 0.0057089244946837425, + "lr": 2.4915824915824914e-07, + "objective/entropy": -616.60546875, + "objective/kl": 11.599992752075195, + "objective/non_score_reward": -0.34799978137016296, + "objective/rlhf_reward": -0.018409937620162964, + "objective/scores": 0.330078125, + "policy/approxkl_avg": 0.000447861006250605, + "policy/clipfrac_avg": 0.008485405705869198, + "policy/entropy_avg": 0.19893011450767517, + "step": 166, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 35, + "val/ratio": 0.9999897480010986, + "val/ratio_var": 8.826224302538321e-07 + }, + { + "episode": 10688, + "epoch": 2.0169843366672957, + "eps": 0, + "loss/policy_avg": -0.029148969799280167, + "loss/value_avg": 0.0051203519105911255, + "lr": 2.4747474747474745e-07, + "objective/entropy": -707.4173583984375, + "objective/kl": 9.282992362976074, + "objective/non_score_reward": -0.27848976850509644, + "objective/rlhf_reward": 0.21418601274490356, + "objective/scores": 0.4921875, + "policy/approxkl_avg": 0.00035875054891221225, + "policy/clipfrac_avg": 0.007683487143367529, + "policy/entropy_avg": 0.15778478980064392, + "step": 167, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 51, + "val/ratio": 0.9999899864196777, + "val/ratio_var": 5.554063591262093e-07 + }, + { + "episode": 10752, + "epoch": 2.0290620871862615, + "eps": 0, + "loss/policy_avg": -0.020282533019781113, + "loss/value_avg": 0.00484459986910224, + "lr": 2.4579124579124576e-07, + "objective/entropy": -612.2288818359375, + "objective/kl": 10.996728897094727, + "objective/non_score_reward": -0.32990187406539917, + "objective/rlhf_reward": 0.05437546968460083, + "objective/scores": 0.384765625, + "policy/approxkl_avg": 0.0004697911790572107, + "policy/clipfrac_avg": 0.00856415368616581, + "policy/entropy_avg": 0.19417700171470642, + "step": 168, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 1.0000133514404297, + "val/ratio_var": 9.346218803329975e-07 + }, + { + "episode": 10816, + "epoch": 2.0411398377052272, + "eps": 0, + "loss/policy_avg": -0.0368703156709671, + "loss/value_avg": 0.005465418100357056, + "lr": 2.441077441077441e-07, + "objective/entropy": -655.9403076171875, + "objective/kl": 10.055724143981934, + "objective/non_score_reward": -0.30167171359062195, + "objective/rlhf_reward": 0.18416813015937805, + "objective/scores": 0.486328125, + "policy/approxkl_avg": 0.00042185792699456215, + "policy/clipfrac_avg": 0.00785021297633648, + "policy/entropy_avg": 0.17223486304283142, + "step": 169, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 41, + "val/ratio": 1.0000656843185425, + "val/ratio_var": 6.886172627673659e-07 + }, + { + "episode": 10880, + "epoch": 2.0532175882241934, + "eps": 0, + "loss/policy_avg": -0.05005396902561188, + "loss/value_avg": 0.00523067032918334, + "lr": 2.4242424242424244e-07, + "objective/entropy": -686.6971435546875, + "objective/kl": 10.104284286499023, + "objective/non_score_reward": -0.30312851071357727, + "objective/rlhf_reward": 0.20663711428642273, + "objective/scores": 0.5078125, + "policy/approxkl_avg": 0.0010287510231137276, + "policy/clipfrac_avg": 0.0077257584780454636, + "policy/entropy_avg": 0.16867446899414062, + "step": 170, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.0000211000442505, + "val/ratio_var": 8.372552429136704e-07 + }, + { + "episode": 10944, + "epoch": 2.065295338743159, + "eps": 0, + "loss/policy_avg": -0.025773359462618828, + "loss/value_avg": 0.004905715584754944, + "lr": 2.407407407407407e-07, + "objective/entropy": -692.7886352539062, + "objective/kl": 8.267000198364258, + "objective/non_score_reward": -0.24800997972488403, + "objective/rlhf_reward": 0.18021267652511597, + "objective/scores": 0.427734375, + "policy/approxkl_avg": 0.0003900658048223704, + "policy/clipfrac_avg": 0.008449830114841461, + "policy/entropy_avg": 0.16828536987304688, + "step": 171, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 50, + "val/ratio": 1.000011682510376, + "val/ratio_var": 7.061549354148156e-07 + }, + { + "episode": 11008, + "epoch": 2.077373089262125, + "eps": 0, + "loss/policy_avg": -0.013189585879445076, + "loss/value_avg": 0.005143987946212292, + "lr": 2.3905723905723906e-07, + "objective/entropy": -726.6845703125, + "objective/kl": 9.84701156616211, + "objective/non_score_reward": -0.2954103648662567, + "objective/rlhf_reward": 0.2207029163837433, + "objective/scores": 0.515625, + "policy/approxkl_avg": 0.000674139242619276, + "policy/clipfrac_avg": 0.008698908612132072, + "policy/entropy_avg": 0.17170843482017517, + "step": 172, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 1.000104308128357, + "val/ratio_var": 6.811029038544802e-07 + }, + { + "episode": 11072, + "epoch": 2.0894508397810907, + "eps": 0, + "loss/policy_avg": -0.015965035185217857, + "loss/value_avg": 0.004617646336555481, + "lr": 2.3737373737373737e-07, + "objective/entropy": -650.63671875, + "objective/kl": 11.198604583740234, + "objective/non_score_reward": -0.3359581232070923, + "objective/rlhf_reward": 0.009256713092327118, + "objective/scores": 0.345703125, + "policy/approxkl_avg": 0.0004264616873115301, + "policy/clipfrac_avg": 0.008644884452223778, + "policy/entropy_avg": 0.1950274407863617, + "step": 173, + "val/clipfrac_avg": 5.36388643013197e-06, + "val/num_eos_tokens": 48, + "val/ratio": 1.000180721282959, + "val/ratio_var": 1.0091738431583508e-06 + }, + { + "episode": 11136, + "epoch": 2.1015285903000565, + "eps": 0, + "loss/policy_avg": -0.034316565841436386, + "loss/value_avg": 0.004706418141722679, + "lr": 2.3569023569023568e-07, + "objective/entropy": -705.0397338867188, + "objective/kl": 8.905685424804688, + "objective/non_score_reward": -0.26717060804367065, + "objective/rlhf_reward": 0.27775126695632935, + "objective/scores": 0.546875, + "policy/approxkl_avg": 0.0004036706523038447, + "policy/clipfrac_avg": 0.007710058242082596, + "policy/entropy_avg": 0.1774342954158783, + "step": 174, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 41, + "val/ratio": 0.9999071955680847, + "val/ratio_var": 6.414345534722088e-07 + }, + { + "episode": 11200, + "epoch": 2.1136063408190227, + "eps": 0, + "loss/policy_avg": -0.02365967631340027, + "loss/value_avg": 0.004380353260785341, + "lr": 2.34006734006734e-07, + "objective/entropy": -633.821044921875, + "objective/kl": 9.5961332321167, + "objective/non_score_reward": -0.287883996963501, + "objective/rlhf_reward": 0.11690116673707962, + "objective/scores": 0.404296875, + "policy/approxkl_avg": 0.00041495892219245434, + "policy/clipfrac_avg": 0.008334731683135033, + "policy/entropy_avg": 0.18314361572265625, + "step": 175, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 40, + "val/ratio": 1.0000516176223755, + "val/ratio_var": 5.880065145902336e-07 + }, + { + "episode": 11264, + "epoch": 2.1256840913379884, + "eps": 0, + "loss/policy_avg": -0.012720011174678802, + "loss/value_avg": 0.004893209785223007, + "lr": 2.323232323232323e-07, + "objective/entropy": -651.2650146484375, + "objective/kl": 9.453258514404297, + "objective/non_score_reward": -0.28359776735305786, + "objective/rlhf_reward": 0.10409756004810333, + "objective/scores": 0.38671875, + "policy/approxkl_avg": 0.0007613954949192703, + "policy/clipfrac_avg": 0.007399224676191807, + "policy/entropy_avg": 0.17717742919921875, + "step": 176, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.0000202655792236, + "val/ratio_var": 6.240634888854402e-07 + }, + { + "episode": 11328, + "epoch": 2.137761841856954, + "eps": 0, + "loss/policy_avg": -0.027373038232326508, + "loss/value_avg": 0.0046966951340436935, + "lr": 2.3063973063973064e-07, + "objective/entropy": -717.2123413085938, + "objective/kl": 8.953085899353027, + "objective/non_score_reward": -0.26859256625175476, + "objective/rlhf_reward": 0.21578243374824524, + "objective/scores": 0.484375, + "policy/approxkl_avg": 0.00035203900188207626, + "policy/clipfrac_avg": 0.007572174072265625, + "policy/entropy_avg": 0.159637451171875, + "step": 177, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 54, + "val/ratio": 1.0000574588775635, + "val/ratio_var": 5.12606447955477e-07 + }, + { + "episode": 11392, + "epoch": 2.14983959237592, + "eps": 0, + "loss/policy_avg": -0.006380847655236721, + "loss/value_avg": 0.0049251774325966835, + "lr": 2.2895622895622895e-07, + "objective/entropy": -712.037841796875, + "objective/kl": 8.245269775390625, + "objective/non_score_reward": -0.24735809862613678, + "objective/rlhf_reward": 0.13057157397270203, + "objective/scores": 0.37890625, + "policy/approxkl_avg": 0.0003549880930222571, + "policy/clipfrac_avg": 0.007925866171717644, + "policy/entropy_avg": 0.1665090024471283, + "step": 178, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.0001332759857178, + "val/ratio_var": 5.663550268764084e-07 + }, + { + "episode": 11456, + "epoch": 2.1619173428948857, + "eps": 0, + "loss/policy_avg": -0.0014782699290663004, + "loss/value_avg": 0.004644377622753382, + "lr": 2.2727272727272726e-07, + "objective/entropy": -662.6466064453125, + "objective/kl": 9.573324203491211, + "objective/non_score_reward": -0.2871997356414795, + "objective/rlhf_reward": 0.0892651155591011, + "objective/scores": 0.376953125, + "policy/approxkl_avg": 0.0003882443706970662, + "policy/clipfrac_avg": 0.007613882422447205, + "policy/entropy_avg": 0.17233356833457947, + "step": 179, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 1.0000505447387695, + "val/ratio_var": 5.133804279466858e-07 + }, + { + "episode": 11520, + "epoch": 2.173995093413852, + "eps": 0, + "loss/policy_avg": -0.01099494006484747, + "loss/value_avg": 0.00442532729357481, + "lr": 2.2558922558922557e-07, + "objective/entropy": -641.4156494140625, + "objective/kl": 9.477804183959961, + "objective/non_score_reward": -0.28433412313461304, + "objective/rlhf_reward": 0.05966003239154816, + "objective/scores": 0.34375, + "policy/approxkl_avg": 0.0004196200461592525, + "policy/clipfrac_avg": 0.008064374327659607, + "policy/entropy_avg": 0.1929067075252533, + "step": 180, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 40, + "val/ratio": 0.9999755620956421, + "val/ratio_var": 6.541202992593753e-07 + }, + { + "episode": 11584, + "epoch": 2.1860728439328176, + "eps": 0, + "loss/policy_avg": -0.0352904237806797, + "loss/value_avg": 0.004265302326530218, + "lr": 2.239057239057239e-07, + "objective/entropy": -642.61669921875, + "objective/kl": 10.230189323425293, + "objective/non_score_reward": -0.30690568685531616, + "objective/rlhf_reward": 0.11008650809526443, + "objective/scores": 0.41796875, + "policy/approxkl_avg": 0.0004299771972000599, + "policy/clipfrac_avg": 0.008415701799094677, + "policy/entropy_avg": 0.18710581958293915, + "step": 181, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.0000258684158325, + "val/ratio_var": 7.682069735892583e-07 + }, + { + "episode": 11648, + "epoch": 2.1981505944517834, + "eps": 0, + "loss/policy_avg": -0.018043210729956627, + "loss/value_avg": 0.004413206595927477, + "lr": 2.222222222222222e-07, + "objective/entropy": -659.050537109375, + "objective/kl": 9.415782928466797, + "objective/non_score_reward": -0.28247350454330444, + "objective/rlhf_reward": 0.09106165170669556, + "objective/scores": 0.373046875, + "policy/approxkl_avg": 0.00045144298928789794, + "policy/clipfrac_avg": 0.008088944479823112, + "policy/entropy_avg": 0.18143844604492188, + "step": 182, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 39, + "val/ratio": 1.000122308731079, + "val/ratio_var": 6.821082365604525e-07 + }, + { + "episode": 11712, + "epoch": 2.210228344970749, + "eps": 0, + "loss/policy_avg": -0.01957491599023342, + "loss/value_avg": 0.004746252205222845, + "lr": 2.2053872053872053e-07, + "objective/entropy": -673.1300659179688, + "objective/kl": 10.322259902954102, + "objective/non_score_reward": -0.30966776609420776, + "objective/rlhf_reward": 0.12490253895521164, + "objective/scores": 0.43359375, + "policy/approxkl_avg": 0.00038717369898222387, + "policy/clipfrac_avg": 0.007530445232987404, + "policy/entropy_avg": 0.18404261767864227, + "step": 183, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 47, + "val/ratio": 1.000051498413086, + "val/ratio_var": 7.273123401319026e-07 + }, + { + "episode": 11776, + "epoch": 2.222306095489715, + "eps": 0, + "loss/policy_avg": 0.01077682338654995, + "loss/value_avg": 0.004722831770777702, + "lr": 2.1885521885521884e-07, + "objective/entropy": -616.521484375, + "objective/kl": 10.416351318359375, + "objective/non_score_reward": -0.3124905228614807, + "objective/rlhf_reward": -0.03417021036148071, + "objective/scores": 0.27734375, + "policy/approxkl_avg": 0.0005021474789828062, + "policy/clipfrac_avg": 0.008746202103793621, + "policy/entropy_avg": 0.2025197446346283, + "step": 184, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 0.9999576807022095, + "val/ratio_var": 6.454907293118595e-07 + }, + { + "episode": 11840, + "epoch": 2.2343838460086807, + "eps": 0, + "loss/policy_avg": -0.0027505457401275635, + "loss/value_avg": 0.0048131197690963745, + "lr": 2.1717171717171718e-07, + "objective/entropy": -694.5650634765625, + "objective/kl": 9.623019218444824, + "objective/non_score_reward": -0.28869056701660156, + "objective/rlhf_reward": 0.10217857360839844, + "objective/scores": 0.390625, + "policy/approxkl_avg": 0.0003909420920535922, + "policy/clipfrac_avg": 0.009193172678351402, + "policy/entropy_avg": 0.17480087280273438, + "step": 185, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.00017249584198, + "val/ratio_var": 6.435092814172094e-07 + }, + { + "episode": 11904, + "epoch": 2.246461596527647, + "eps": 0, + "loss/policy_avg": -0.02041536569595337, + "loss/value_avg": 0.005415412597358227, + "lr": 2.1548821548821546e-07, + "objective/entropy": -636.8676147460938, + "objective/kl": 10.659719467163086, + "objective/non_score_reward": -0.3197915852069855, + "objective/rlhf_reward": 0.11331389844417572, + "objective/scores": 0.43359375, + "policy/approxkl_avg": 0.00040606613038107753, + "policy/clipfrac_avg": 0.008012184873223305, + "policy/entropy_avg": 0.18035762012004852, + "step": 186, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 0.9999368190765381, + "val/ratio_var": 5.389789521359489e-07 + }, + { + "episode": 11968, + "epoch": 2.2585393470466126, + "eps": 0, + "loss/policy_avg": -0.00026329857064411044, + "loss/value_avg": 0.0059758638963103294, + "lr": 2.138047138047138e-07, + "objective/entropy": -671.1956787109375, + "objective/kl": 10.020427703857422, + "objective/non_score_reward": -0.30061283707618713, + "objective/rlhf_reward": 0.10319577157497406, + "objective/scores": 0.404296875, + "policy/approxkl_avg": 0.0004317883576732129, + "policy/clipfrac_avg": 0.007914026267826557, + "policy/entropy_avg": 0.19041061401367188, + "step": 187, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 1.0000234842300415, + "val/ratio_var": 6.567967147930176e-07 + }, + { + "episode": 12032, + "epoch": 2.2706170975655784, + "eps": 0, + "loss/policy_avg": -0.016351381316781044, + "loss/value_avg": 0.004221225157380104, + "lr": 2.121212121212121e-07, + "objective/entropy": -581.1964721679688, + "objective/kl": 11.140705108642578, + "objective/non_score_reward": -0.3342211842536926, + "objective/rlhf_reward": 0.006599150598049164, + "objective/scores": 0.33984375, + "policy/approxkl_avg": 0.0005099625559523702, + "policy/clipfrac_avg": 0.008483211509883404, + "policy/entropy_avg": 0.2137502133846283, + "step": 188, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.00002121925354, + "val/ratio_var": 7.082234105837415e-07 + }, + { + "episode": 12096, + "epoch": 2.282694848084544, + "eps": 0, + "loss/policy_avg": -0.004529799334704876, + "loss/value_avg": 0.00449700653553009, + "lr": 2.1043771043771044e-07, + "objective/entropy": -733.6138305664062, + "objective/kl": 9.21728229522705, + "objective/non_score_reward": -0.27651846408843994, + "objective/rlhf_reward": 0.08773934096097946, + "objective/scores": 0.36328125, + "policy/approxkl_avg": 0.00036096826079301536, + "policy/clipfrac_avg": 0.0077320970594882965, + "policy/entropy_avg": 0.16572698950767517, + "step": 189, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 48, + "val/ratio": 0.9999610781669617, + "val/ratio_var": 5.408356287261995e-07 + }, + { + "episode": 12160, + "epoch": 2.29477259860351, + "eps": 0, + "loss/policy_avg": -0.011266498826444149, + "loss/value_avg": 0.004761071410030127, + "lr": 2.0875420875420873e-07, + "objective/entropy": -695.8765869140625, + "objective/kl": 10.30718994140625, + "objective/non_score_reward": -0.3092157244682312, + "objective/rlhf_reward": 0.1111944392323494, + "objective/scores": 0.419921875, + "policy/approxkl_avg": 0.0003880340082105249, + "policy/clipfrac_avg": 0.008478551171720028, + "policy/entropy_avg": 0.1890207976102829, + "step": 190, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 51, + "val/ratio": 0.9999769926071167, + "val/ratio_var": 6.441308642024524e-07 + }, + { + "episode": 12224, + "epoch": 2.306850349122476, + "eps": 0, + "loss/policy_avg": -0.020808562636375427, + "loss/value_avg": 0.00449121231213212, + "lr": 2.0707070707070707e-07, + "objective/entropy": -632.7673950195312, + "objective/kl": 9.880966186523438, + "objective/non_score_reward": -0.29642897844314575, + "objective/rlhf_reward": 0.15034836530685425, + "objective/scores": 0.447265625, + "policy/approxkl_avg": 0.0005901949480175972, + "policy/clipfrac_avg": 0.007830065675079823, + "policy/entropy_avg": 0.1974080502986908, + "step": 191, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 43, + "val/ratio": 0.9999300241470337, + "val/ratio_var": 5.447765261124005e-07 + }, + { + "episode": 12288, + "epoch": 2.318928099641442, + "eps": 0, + "loss/policy_avg": -0.01821664161980152, + "loss/value_avg": 0.004452117718756199, + "lr": 2.0538720538720538e-07, + "objective/entropy": -589.8839721679688, + "objective/kl": 11.97592544555664, + "objective/non_score_reward": -0.35927775502204895, + "objective/rlhf_reward": -0.04506877064704895, + "objective/scores": 0.314453125, + "policy/approxkl_avg": 0.000494088395498693, + "policy/clipfrac_avg": 0.008692565374076366, + "policy/entropy_avg": 0.20995458960533142, + "step": 192, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 1.000139594078064, + "val/ratio_var": 7.803449193488632e-07 + }, + { + "episode": 12352, + "epoch": 2.3310058501604076, + "eps": 0, + "loss/policy_avg": 0.004255164880305529, + "loss/value_avg": 0.004434296861290932, + "lr": 2.0370370370370369e-07, + "objective/entropy": -708.4127197265625, + "objective/kl": 9.581413269042969, + "objective/non_score_reward": -0.2874424159526825, + "objective/rlhf_reward": 0.0768154114484787, + "objective/scores": 0.36328125, + "policy/approxkl_avg": 0.0004709034110419452, + "policy/clipfrac_avg": 0.007526259869337082, + "policy/entropy_avg": 0.19600550830364227, + "step": 193, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 52, + "val/ratio": 0.9999573230743408, + "val/ratio_var": 6.966297405597288e-07 + }, + { + "episode": 12416, + "epoch": 2.3430836006793734, + "eps": 0, + "loss/policy_avg": -0.030452650040388107, + "loss/value_avg": 0.004531817510724068, + "lr": 2.02020202020202e-07, + "objective/entropy": -637.3931274414062, + "objective/kl": 10.20181941986084, + "objective/non_score_reward": -0.30605456233024597, + "objective/rlhf_reward": 0.14560559391975403, + "objective/scores": 0.451171875, + "policy/approxkl_avg": 0.0004737515118904412, + "policy/clipfrac_avg": 0.00787600688636303, + "policy/entropy_avg": 0.1870168149471283, + "step": 194, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 1.0000483989715576, + "val/ratio_var": 8.018863582037739e-07 + }, + { + "episode": 12480, + "epoch": 2.355161351198339, + "eps": 0, + "loss/policy_avg": -0.014566441997885704, + "loss/value_avg": 0.004195361863821745, + "lr": 2.0033670033670033e-07, + "objective/entropy": -724.8233642578125, + "objective/kl": 8.487796783447266, + "objective/non_score_reward": -0.25463390350341797, + "objective/rlhf_reward": 0.19556137919425964, + "objective/scores": 0.44921875, + "policy/approxkl_avg": 0.00040911376709118485, + "policy/clipfrac_avg": 0.008261054754257202, + "policy/entropy_avg": 0.1738535612821579, + "step": 195, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.0000691413879395, + "val/ratio_var": 5.904955173718918e-07 + }, + { + "episode": 12544, + "epoch": 2.3672391017173053, + "eps": 0, + "loss/policy_avg": -0.022108733654022217, + "loss/value_avg": 0.0041517410427331924, + "lr": 1.9865319865319864e-07, + "objective/entropy": -605.9757080078125, + "objective/kl": 10.502775192260742, + "objective/non_score_reward": -0.31508326530456543, + "objective/rlhf_reward": 0.07261204719543457, + "objective/scores": 0.38671875, + "policy/approxkl_avg": 0.00047133685438893735, + "policy/clipfrac_avg": 0.007839309982955456, + "policy/entropy_avg": 0.19527944922447205, + "step": 196, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 0.9999372959136963, + "val/ratio_var": 8.094798431557138e-07 + }, + { + "episode": 12608, + "epoch": 2.379316852236271, + "eps": 0, + "loss/policy_avg": -0.02246645651757717, + "loss/value_avg": 0.004199231043457985, + "lr": 1.9696969696969696e-07, + "objective/entropy": -639.3452758789062, + "objective/kl": 9.353677749633789, + "objective/non_score_reward": -0.2806103229522705, + "objective/rlhf_reward": 0.0772998258471489, + "objective/scores": 0.357421875, + "policy/approxkl_avg": 0.0004011366399936378, + "policy/clipfrac_avg": 0.007909499108791351, + "policy/entropy_avg": 0.1853407323360443, + "step": 197, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 0.9999942779541016, + "val/ratio_var": 5.837881076331541e-07 + }, + { + "episode": 12672, + "epoch": 2.391394602755237, + "eps": 0, + "loss/policy_avg": -0.035098060965538025, + "loss/value_avg": 0.004067492671310902, + "lr": 1.9528619528619527e-07, + "objective/entropy": -698.6280517578125, + "objective/kl": 8.177114486694336, + "objective/non_score_reward": -0.24531343579292297, + "objective/rlhf_reward": 0.18925687670707703, + "objective/scores": 0.43359375, + "policy/approxkl_avg": 0.00038718507857993245, + "policy/clipfrac_avg": 0.008279062807559967, + "policy/entropy_avg": 0.18160438537597656, + "step": 198, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 47, + "val/ratio": 1.0000050067901611, + "val/ratio_var": 5.738119170928258e-07 + }, + { + "episode": 12736, + "epoch": 2.4034723532742026, + "eps": 0, + "loss/policy_avg": -0.022469520568847656, + "loss/value_avg": 0.00446331687271595, + "lr": 1.936026936026936e-07, + "objective/entropy": -676.8287963867188, + "objective/kl": 7.68222713470459, + "objective/non_score_reward": -0.2304668128490448, + "objective/rlhf_reward": 0.1865253746509552, + "objective/scores": 0.41796875, + "policy/approxkl_avg": 0.00038739325827918947, + "policy/clipfrac_avg": 0.008100518956780434, + "policy/entropy_avg": 0.17807134985923767, + "step": 199, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 0.9999980926513672, + "val/ratio_var": 7.030320716694405e-07 + }, + { + "episode": 12800, + "epoch": 2.4155501037931684, + "eps": 0, + "loss/policy_avg": -0.008693840354681015, + "loss/value_avg": 0.004308072850108147, + "lr": 1.9191919191919189e-07, + "objective/entropy": -678.0347900390625, + "objective/kl": 8.073586463928223, + "objective/non_score_reward": -0.2422075867652893, + "objective/rlhf_reward": 0.1515912413597107, + "objective/scores": 0.39453125, + "policy/approxkl_avg": 0.00046054236008785665, + "policy/clipfrac_avg": 0.008118792437016964, + "policy/entropy_avg": 0.19109344482421875, + "step": 200, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 0.999969482421875, + "val/ratio_var": 5.471772510645678e-07 + }, + { + "episode": 12864, + "epoch": 2.4276278543121346, + "eps": 0, + "loss/policy_avg": -0.02215806394815445, + "loss/value_avg": 0.00444747693836689, + "lr": 1.9023569023569022e-07, + "objective/entropy": -715.016357421875, + "objective/kl": 8.792640686035156, + "objective/non_score_reward": -0.2637792229652405, + "objective/rlhf_reward": 0.22645515203475952, + "objective/scores": 0.490234375, + "policy/approxkl_avg": 0.0003587045648600906, + "policy/clipfrac_avg": 0.0076547968201339245, + "policy/entropy_avg": 0.1849416196346283, + "step": 201, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.000044584274292, + "val/ratio_var": 5.405810838965408e-07 + }, + { + "episode": 12928, + "epoch": 2.4397056048311003, + "eps": 0, + "loss/policy_avg": 0.004962640814483166, + "loss/value_avg": 0.004433006979525089, + "lr": 1.8855218855218853e-07, + "objective/entropy": -646.1973876953125, + "objective/kl": 9.110560417175293, + "objective/non_score_reward": -0.27331680059432983, + "objective/rlhf_reward": 0.10168319940567017, + "objective/scores": 0.375, + "policy/approxkl_avg": 0.0004163091944064945, + "policy/clipfrac_avg": 0.007584965787827969, + "policy/entropy_avg": 0.18780645728111267, + "step": 202, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 51, + "val/ratio": 0.9999338984489441, + "val/ratio_var": 9.266473739444336e-07 + }, + { + "episode": 12992, + "epoch": 2.451783355350066, + "eps": 0, + "loss/policy_avg": -0.011171831749379635, + "loss/value_avg": 0.0047411127015948296, + "lr": 1.8686868686868687e-07, + "objective/entropy": -690.5891723632812, + "objective/kl": 8.7337646484375, + "objective/non_score_reward": -0.26201295852661133, + "objective/rlhf_reward": 0.13154172897338867, + "objective/scores": 0.39453125, + "policy/approxkl_avg": 0.00038152310298755765, + "policy/clipfrac_avg": 0.007856165990233421, + "policy/entropy_avg": 0.1933492124080658, + "step": 203, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 47, + "val/ratio": 0.9999629855155945, + "val/ratio_var": 6.461675070568162e-07 + }, + { + "episode": 13056, + "epoch": 2.463861105869032, + "eps": 0, + "loss/policy_avg": -0.03274771571159363, + "loss/value_avg": 0.004181091673672199, + "lr": 1.8518518518518516e-07, + "objective/entropy": -734.372802734375, + "objective/kl": 6.88037109375, + "objective/non_score_reward": -0.20641113817691803, + "objective/rlhf_reward": 0.25159668922424316, + "objective/scores": 0.45703125, + "policy/approxkl_avg": 0.0003489043447189033, + "policy/clipfrac_avg": 0.007448253687471151, + "policy/entropy_avg": 0.17730967700481415, + "step": 204, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 53, + "val/ratio": 1.0000073909759521, + "val/ratio_var": 6.07091635629331e-07 + }, + { + "episode": 13120, + "epoch": 2.4759388563879976, + "eps": 0, + "loss/policy_avg": -0.016816487535834312, + "loss/value_avg": 0.0042554219253361225, + "lr": 1.835016835016835e-07, + "objective/entropy": -794.4978637695312, + "objective/kl": 5.602289199829102, + "objective/non_score_reward": -0.16806866228580475, + "objective/rlhf_reward": 0.39443135261535645, + "objective/scores": 0.5625, + "policy/approxkl_avg": 0.0002819629153236747, + "policy/clipfrac_avg": 0.006934004835784435, + "policy/entropy_avg": 0.15758514404296875, + "step": 205, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 48, + "val/ratio": 0.9999727010726929, + "val/ratio_var": 4.5193257847131463e-07 + }, + { + "episode": 13184, + "epoch": 2.4880166069069634, + "eps": 0, + "loss/policy_avg": -0.017609162256121635, + "loss/value_avg": 0.00427992781624198, + "lr": 1.818181818181818e-07, + "objective/entropy": -712.7396850585938, + "objective/kl": 8.044666290283203, + "objective/non_score_reward": -0.24133998155593872, + "objective/rlhf_reward": 0.19616001844406128, + "objective/scores": 0.4375, + "policy/approxkl_avg": 0.000342213868862018, + "policy/clipfrac_avg": 0.006520026829093695, + "policy/entropy_avg": 0.17752330005168915, + "step": 206, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 47, + "val/ratio": 0.9999613165855408, + "val/ratio_var": 5.999673931000871e-07 + }, + { + "episode": 13248, + "epoch": 2.5000943574259296, + "eps": 0, + "loss/policy_avg": -0.03822872042655945, + "loss/value_avg": 0.0038146479055285454, + "lr": 1.8013468013468014e-07, + "objective/entropy": -676.6316528320312, + "objective/kl": 8.43276596069336, + "objective/non_score_reward": -0.2529829740524292, + "objective/rlhf_reward": 0.2226029485464096, + "objective/scores": 0.4765625, + "policy/approxkl_avg": 0.00040029053343459964, + "policy/clipfrac_avg": 0.007741398643702269, + "policy/entropy_avg": 0.18157577514648438, + "step": 207, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 38, + "val/ratio": 1.0000382661819458, + "val/ratio_var": 7.777077826176537e-07 + }, + { + "episode": 13312, + "epoch": 2.5121721079448953, + "eps": 0, + "loss/policy_avg": -0.017532743513584137, + "loss/value_avg": 0.004521360620856285, + "lr": 1.7845117845117842e-07, + "objective/entropy": -642.2452392578125, + "objective/kl": 9.599320411682129, + "objective/non_score_reward": -0.2879796028137207, + "objective/rlhf_reward": 0.0982508733868599, + "objective/scores": 0.38671875, + "policy/approxkl_avg": 0.0005058823153376579, + "policy/clipfrac_avg": 0.007621736731380224, + "policy/entropy_avg": 0.19717535376548767, + "step": 208, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 0.9999377727508545, + "val/ratio_var": 6.733653208357282e-07 + }, + { + "episode": 13376, + "epoch": 2.524249858463861, + "eps": 0, + "loss/policy_avg": -0.012456863187253475, + "loss/value_avg": 0.004330017603933811, + "lr": 1.7676767676767676e-07, + "objective/entropy": -684.7198486328125, + "objective/kl": 8.890013694763184, + "objective/non_score_reward": -0.2667003870010376, + "objective/rlhf_reward": 0.1131824254989624, + "objective/scores": 0.37890625, + "policy/approxkl_avg": 0.0003934592823497951, + "policy/clipfrac_avg": 0.008775051683187485, + "policy/entropy_avg": 0.18945693969726562, + "step": 209, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 1.0001251697540283, + "val/ratio_var": 6.748607574991183e-07 + }, + { + "episode": 13440, + "epoch": 2.536327608982827, + "eps": 0, + "loss/policy_avg": -0.01945299468934536, + "loss/value_avg": 0.0042161582969129086, + "lr": 1.7508417508417507e-07, + "objective/entropy": -704.7137451171875, + "objective/kl": 9.013429641723633, + "objective/non_score_reward": -0.27040284872055054, + "objective/rlhf_reward": 0.13438229262828827, + "objective/scores": 0.404296875, + "policy/approxkl_avg": 0.0003603402874432504, + "policy/clipfrac_avg": 0.008057435974478722, + "policy/entropy_avg": 0.18650183081626892, + "step": 210, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 50, + "val/ratio": 1.0000133514404297, + "val/ratio_var": 5.646013505611336e-07 + }, + { + "episode": 13504, + "epoch": 2.5484053595017926, + "eps": 0, + "loss/policy_avg": -0.01828945055603981, + "loss/value_avg": 0.004183897748589516, + "lr": 1.7340067340067338e-07, + "objective/entropy": -715.1771240234375, + "objective/kl": 7.647032737731934, + "objective/non_score_reward": -0.22941097617149353, + "objective/rlhf_reward": 0.21931949257850647, + "objective/scores": 0.44921875, + "policy/approxkl_avg": 0.0005932983476668596, + "policy/clipfrac_avg": 0.006059914827346802, + "policy/entropy_avg": 0.17059580981731415, + "step": 211, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 51, + "val/ratio": 0.9999650716781616, + "val/ratio_var": 4.5039382712275255e-07 + }, + { + "episode": 13568, + "epoch": 2.560483110020759, + "eps": 0, + "loss/policy_avg": -0.01205519214272499, + "loss/value_avg": 0.004262065049260855, + "lr": 1.717171717171717e-07, + "objective/entropy": -680.666015625, + "objective/kl": 9.011266708374023, + "objective/non_score_reward": -0.2703379988670349, + "objective/rlhf_reward": 0.14860734343528748, + "objective/scores": 0.41796875, + "policy/approxkl_avg": 0.0003913857217412442, + "policy/clipfrac_avg": 0.008144761435687542, + "policy/entropy_avg": 0.1854146420955658, + "step": 212, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 47, + "val/ratio": 1.000110387802124, + "val/ratio_var": 4.5980641516507603e-07 + }, + { + "episode": 13632, + "epoch": 2.5725608605397245, + "eps": 0, + "loss/policy_avg": -0.0006327772280201316, + "loss/value_avg": 0.004324252717196941, + "lr": 1.7003367003367003e-07, + "objective/entropy": -680.5126953125, + "objective/kl": 8.441967010498047, + "objective/non_score_reward": -0.25325900316238403, + "objective/rlhf_reward": 0.11051052808761597, + "objective/scores": 0.36328125, + "policy/approxkl_avg": 0.0003816906246356666, + "policy/clipfrac_avg": 0.007939379662275314, + "policy/entropy_avg": 0.19525527954101562, + "step": 213, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.0000630617141724, + "val/ratio_var": 6.067442654966726e-07 + }, + { + "episode": 13696, + "epoch": 2.5846386110586903, + "eps": 0, + "loss/policy_avg": -0.03070930764079094, + "loss/value_avg": 0.0039781369268894196, + "lr": 1.6835016835016837e-07, + "objective/entropy": -648.4881591796875, + "objective/kl": 8.808910369873047, + "objective/non_score_reward": -0.26426729559898376, + "objective/rlhf_reward": 0.16542020440101624, + "objective/scores": 0.4296875, + "policy/approxkl_avg": 0.0004112160240765661, + "policy/clipfrac_avg": 0.008430849760770798, + "policy/entropy_avg": 0.18458303809165955, + "step": 214, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 41, + "val/ratio": 1.0000827312469482, + "val/ratio_var": 5.764974844169046e-07 + }, + { + "episode": 13760, + "epoch": 2.596716361577656, + "eps": 0, + "loss/policy_avg": -0.01275802031159401, + "loss/value_avg": 0.004106822889298201, + "lr": 1.6666666666666665e-07, + "objective/entropy": -706.819580078125, + "objective/kl": 8.268199920654297, + "objective/non_score_reward": -0.24804598093032837, + "objective/rlhf_reward": 0.18847745656967163, + "objective/scores": 0.4375, + "policy/approxkl_avg": 0.0003638950875028968, + "policy/clipfrac_avg": 0.00869040098041296, + "policy/entropy_avg": 0.18547821044921875, + "step": 215, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.0000922679901123, + "val/ratio_var": 5.714567805625848e-07 + }, + { + "episode": 13824, + "epoch": 2.608794112096622, + "eps": 0, + "loss/policy_avg": -0.014732430689036846, + "loss/value_avg": 0.004122150130569935, + "lr": 1.64983164983165e-07, + "objective/entropy": -678.539306640625, + "objective/kl": 8.51124382019043, + "objective/non_score_reward": -0.25533732771873474, + "objective/rlhf_reward": 0.13821735978126526, + "objective/scores": 0.39453125, + "policy/approxkl_avg": 0.0003934210108127445, + "policy/clipfrac_avg": 0.008303534239530563, + "policy/entropy_avg": 0.19069163501262665, + "step": 216, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.0001001358032227, + "val/ratio_var": 6.961653866710549e-07 + }, + { + "episode": 13888, + "epoch": 2.620871862615588, + "eps": 0, + "loss/policy_avg": -0.0114736994728446, + "loss/value_avg": 0.004468954633921385, + "lr": 1.632996632996633e-07, + "objective/entropy": -760.7581787109375, + "objective/kl": 6.764780044555664, + "objective/non_score_reward": -0.20294338464736938, + "objective/rlhf_reward": 0.2609238028526306, + "objective/scores": 0.46484375, + "policy/approxkl_avg": 0.000336907512973994, + "policy/clipfrac_avg": 0.007351381238549948, + "policy/entropy_avg": 0.17353948950767517, + "step": 217, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 50, + "val/ratio": 1.000031590461731, + "val/ratio_var": 5.147477963873826e-07 + }, + { + "episode": 13952, + "epoch": 2.632949613134554, + "eps": 0, + "loss/policy_avg": -0.025034895166754723, + "loss/value_avg": 0.004216045141220093, + "lr": 1.6161616161616163e-07, + "objective/entropy": -698.8699951171875, + "objective/kl": 7.570901870727539, + "objective/non_score_reward": -0.2271270602941513, + "objective/rlhf_reward": 0.2767791748046875, + "objective/scores": 0.50390625, + "policy/approxkl_avg": 0.0004006924282293767, + "policy/clipfrac_avg": 0.007912924513220787, + "policy/entropy_avg": 0.17928314208984375, + "step": 218, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 52, + "val/ratio": 1.0000429153442383, + "val/ratio_var": 5.616009843834036e-07 + }, + { + "episode": 14016, + "epoch": 2.6450273636535195, + "eps": 0, + "loss/policy_avg": -0.010850876569747925, + "loss/value_avg": 0.004759899340569973, + "lr": 1.5993265993265992e-07, + "objective/entropy": -677.7619018554688, + "objective/kl": 9.129847526550293, + "objective/non_score_reward": -0.27389541268348694, + "objective/rlhf_reward": 0.10403427481651306, + "objective/scores": 0.37890625, + "policy/approxkl_avg": 0.00039389575249515474, + "policy/clipfrac_avg": 0.008119095116853714, + "policy/entropy_avg": 0.19026947021484375, + "step": 219, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 43, + "val/ratio": 0.9999043345451355, + "val/ratio_var": 6.120080797700211e-07 + }, + { + "episode": 14080, + "epoch": 2.6571051141724853, + "eps": 0, + "loss/policy_avg": 0.0019542532972991467, + "loss/value_avg": 0.004348535090684891, + "lr": 1.5824915824915826e-07, + "objective/entropy": -694.8416748046875, + "objective/kl": 7.038139343261719, + "objective/non_score_reward": -0.21114417910575867, + "objective/rlhf_reward": 0.17752769589424133, + "objective/scores": 0.388671875, + "policy/approxkl_avg": 0.0003559057950042188, + "policy/clipfrac_avg": 0.007189783733338118, + "policy/entropy_avg": 0.16869863867759705, + "step": 220, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 38, + "val/ratio": 0.9999446868896484, + "val/ratio_var": 6.206274747455609e-07 + }, + { + "episode": 14144, + "epoch": 2.669182864691451, + "eps": 0, + "loss/policy_avg": -0.013687599450349808, + "loss/value_avg": 0.004020463675260544, + "lr": 1.5656565656565657e-07, + "objective/entropy": -699.5345458984375, + "objective/kl": 8.55117416381836, + "objective/non_score_reward": -0.25653523206710815, + "objective/rlhf_reward": 0.11699993908405304, + "objective/scores": 0.373046875, + "policy/approxkl_avg": 0.00037346064345911145, + "policy/clipfrac_avg": 0.008171791210770607, + "policy/entropy_avg": 0.19686762988567352, + "step": 221, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.0000358819961548, + "val/ratio_var": 5.536150524676486e-07 + }, + { + "episode": 14208, + "epoch": 2.6812606152104173, + "eps": 0, + "loss/policy_avg": -0.016610831022262573, + "loss/value_avg": 0.003959144465625286, + "lr": 1.5488215488215488e-07, + "objective/entropy": -733.2947998046875, + "objective/kl": 7.419867515563965, + "objective/non_score_reward": -0.22259601950645447, + "objective/rlhf_reward": 0.25543132424354553, + "objective/scores": 0.478515625, + "policy/approxkl_avg": 0.0003535112482495606, + "policy/clipfrac_avg": 0.007308521773666143, + "policy/entropy_avg": 0.1968231201171875, + "step": 222, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 50, + "val/ratio": 0.9999250173568726, + "val/ratio_var": 4.929243573315034e-07 + }, + { + "episode": 14272, + "epoch": 2.693338365729383, + "eps": 0, + "loss/policy_avg": -0.03384992107748985, + "loss/value_avg": 0.003912989050149918, + "lr": 1.531986531986532e-07, + "objective/entropy": -646.734130859375, + "objective/kl": 8.461997985839844, + "objective/non_score_reward": -0.25385990738868713, + "objective/rlhf_reward": 0.16215571761131287, + "objective/scores": 0.416015625, + "policy/approxkl_avg": 0.0005014871712774038, + "policy/clipfrac_avg": 0.00828520953655243, + "policy/entropy_avg": 0.19067637622356415, + "step": 223, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 1.0000280141830444, + "val/ratio_var": 7.016036533968872e-07 + }, + { + "episode": 14336, + "epoch": 2.7054161162483488, + "eps": 0, + "loss/policy_avg": -0.0035064732655882835, + "loss/value_avg": 0.003849966451525688, + "lr": 1.5151515151515152e-07, + "objective/entropy": -686.4854736328125, + "objective/kl": 8.041788101196289, + "objective/non_score_reward": -0.24125364422798157, + "objective/rlhf_reward": 0.12544558942317963, + "objective/scores": 0.3671875, + "policy/approxkl_avg": 0.00039225397631525993, + "policy/clipfrac_avg": 0.00783010758459568, + "policy/entropy_avg": 0.19433467090129852, + "step": 224, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 43, + "val/ratio": 0.9999794960021973, + "val/ratio_var": 5.356313295123982e-07 + }, + { + "episode": 14400, + "epoch": 2.7174938667673145, + "eps": 0, + "loss/policy_avg": -0.005889839958399534, + "loss/value_avg": 0.004009313881397247, + "lr": 1.4983164983164983e-07, + "objective/entropy": -688.8871459960938, + "objective/kl": 7.107503890991211, + "objective/non_score_reward": -0.2132251262664795, + "objective/rlhf_reward": 0.1681225299835205, + "objective/scores": 0.380859375, + "policy/approxkl_avg": 0.00035966013092547655, + "policy/clipfrac_avg": 0.0072424449026584625, + "policy/entropy_avg": 0.1872762143611908, + "step": 225, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 1.0000863075256348, + "val/ratio_var": 6.394271281351394e-07 + }, + { + "episode": 14464, + "epoch": 2.7295716172862803, + "eps": 0, + "loss/policy_avg": -0.03164386376738548, + "loss/value_avg": 0.003974028863012791, + "lr": 1.4814814814814815e-07, + "objective/entropy": -636.7869873046875, + "objective/kl": 9.086867332458496, + "objective/non_score_reward": -0.2726060152053833, + "objective/rlhf_reward": 0.1607435941696167, + "objective/scores": 0.43359375, + "policy/approxkl_avg": 0.0004679976846091449, + "policy/clipfrac_avg": 0.007549212779849768, + "policy/entropy_avg": 0.1928914487361908, + "step": 226, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 0.9999264478683472, + "val/ratio_var": 5.871561938874947e-07 + }, + { + "episode": 14528, + "epoch": 2.7416493678052465, + "eps": 0, + "loss/policy_avg": -0.029706722125411034, + "loss/value_avg": 0.003999405540525913, + "lr": 1.4646464646464646e-07, + "objective/entropy": -714.131103515625, + "objective/kl": 8.953620910644531, + "objective/non_score_reward": -0.26860859990119934, + "objective/rlhf_reward": 0.22016091644763947, + "objective/scores": 0.48828125, + "policy/approxkl_avg": 0.0004997976357117295, + "policy/clipfrac_avg": 0.0073518408462405205, + "policy/entropy_avg": 0.19288381934165955, + "step": 227, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 50, + "val/ratio": 0.999913215637207, + "val/ratio_var": 6.02664385951357e-07 + }, + { + "episode": 14592, + "epoch": 2.7537271183242122, + "eps": 0, + "loss/policy_avg": -0.024763260036706924, + "loss/value_avg": 0.003725615097209811, + "lr": 1.447811447811448e-07, + "objective/entropy": -598.0142822265625, + "objective/kl": 8.618000030517578, + "objective/non_score_reward": -0.2585400342941284, + "objective/rlhf_reward": 0.17554199695587158, + "objective/scores": 0.43359375, + "policy/approxkl_avg": 0.00044109029113315046, + "policy/clipfrac_avg": 0.008174901828169823, + "policy/entropy_avg": 0.207733154296875, + "step": 228, + "val/clipfrac_avg": 1.2475050425564405e-05, + "val/num_eos_tokens": 35, + "val/ratio": 0.9999794363975525, + "val/ratio_var": 7.014916718617314e-07 + }, + { + "episode": 14656, + "epoch": 2.765804868843178, + "eps": 0, + "loss/policy_avg": -0.025254826992750168, + "loss/value_avg": 0.004053793381899595, + "lr": 1.4309764309764308e-07, + "objective/entropy": -670.363037109375, + "objective/kl": 7.579680442810059, + "objective/non_score_reward": -0.22739042341709137, + "objective/rlhf_reward": 0.23550020158290863, + "objective/scores": 0.462890625, + "policy/approxkl_avg": 0.00038262922316789627, + "policy/clipfrac_avg": 0.007922390475869179, + "policy/entropy_avg": 0.18486277759075165, + "step": 229, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 36, + "val/ratio": 1.0000662803649902, + "val/ratio_var": 6.002863983667339e-07 + }, + { + "episode": 14720, + "epoch": 2.7778826193621438, + "eps": 0, + "loss/policy_avg": -0.009143723174929619, + "loss/value_avg": 0.004357549827545881, + "lr": 1.4141414141414141e-07, + "objective/entropy": -646.9794921875, + "objective/kl": 9.054327964782715, + "objective/non_score_reward": -0.27162984013557434, + "objective/rlhf_reward": 0.09873150289058685, + "objective/scores": 0.37109375, + "policy/approxkl_avg": 0.00039568787906318903, + "policy/clipfrac_avg": 0.007754582446068525, + "policy/entropy_avg": 0.1911672055721283, + "step": 230, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 51, + "val/ratio": 1.000026822090149, + "val/ratio_var": 7.080183195284917e-07 + }, + { + "episode": 14784, + "epoch": 2.7899603698811095, + "eps": 0, + "loss/policy_avg": -0.03357026353478432, + "loss/value_avg": 0.00450306199491024, + "lr": 1.3973063973063972e-07, + "objective/entropy": -694.3631591796875, + "objective/kl": 7.8914995193481445, + "objective/non_score_reward": -0.2367449700832367, + "objective/rlhf_reward": 0.2432354986667633, + "objective/scores": 0.48046875, + "policy/approxkl_avg": 0.00037845989572815597, + "policy/clipfrac_avg": 0.007996518164873123, + "policy/entropy_avg": 0.19372813403606415, + "step": 231, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 0.9998430013656616, + "val/ratio_var": 5.800552003165649e-07 + }, + { + "episode": 14848, + "epoch": 2.8020381204000753, + "eps": 0, + "loss/policy_avg": 0.001023156102746725, + "loss/value_avg": 0.004101088736206293, + "lr": 1.3804713804713806e-07, + "objective/entropy": -682.6402587890625, + "objective/kl": 9.124707221984863, + "objective/non_score_reward": -0.273741215467453, + "objective/rlhf_reward": 0.102479487657547, + "objective/scores": 0.376953125, + "policy/approxkl_avg": 0.0006178760668262839, + "policy/clipfrac_avg": 0.007577064447104931, + "policy/entropy_avg": 0.19171142578125, + "step": 232, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 53, + "val/ratio": 0.9999958276748657, + "val/ratio_var": 4.763281822306453e-07 + }, + { + "episode": 14912, + "epoch": 2.8141158709190415, + "eps": 0, + "loss/policy_avg": -0.011374952271580696, + "loss/value_avg": 0.004087153356522322, + "lr": 1.3636363636363635e-07, + "objective/entropy": -650.131103515625, + "objective/kl": 9.105212211608887, + "objective/non_score_reward": -0.27315637469291687, + "objective/rlhf_reward": 0.15897253155708313, + "objective/scores": 0.431640625, + "policy/approxkl_avg": 0.00038859708001837134, + "policy/clipfrac_avg": 0.007399349473416805, + "policy/entropy_avg": 0.17907333374023438, + "step": 233, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 43, + "val/ratio": 1.0000715255737305, + "val/ratio_var": 5.53652057533327e-07 + }, + { + "episode": 14976, + "epoch": 2.8261936214380072, + "eps": 0, + "loss/policy_avg": -0.01846359483897686, + "loss/value_avg": 0.003997947089374065, + "lr": 1.3468013468013468e-07, + "objective/entropy": -721.1434326171875, + "objective/kl": 7.176656723022461, + "objective/non_score_reward": -0.21529969573020935, + "objective/rlhf_reward": 0.24759092926979065, + "objective/scores": 0.462890625, + "policy/approxkl_avg": 0.0003538678865879774, + "policy/clipfrac_avg": 0.00723436800763011, + "policy/entropy_avg": 0.17639541625976562, + "step": 234, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 51, + "val/ratio": 1.0001486539840698, + "val/ratio_var": 5.433992669168219e-07 + }, + { + "episode": 15040, + "epoch": 2.838271371956973, + "eps": 0, + "loss/policy_avg": -0.003030909225344658, + "loss/value_avg": 0.0041738273575901985, + "lr": 1.32996632996633e-07, + "objective/entropy": -593.6438598632812, + "objective/kl": 10.389270782470703, + "objective/non_score_reward": -0.31167811155319214, + "objective/rlhf_reward": 0.017911747097969055, + "objective/scores": 0.330078125, + "policy/approxkl_avg": 0.00048703886568546295, + "policy/clipfrac_avg": 0.007803687360137701, + "policy/entropy_avg": 0.20614878833293915, + "step": 235, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 41, + "val/ratio": 0.9999415278434753, + "val/ratio_var": 8.815354703983758e-07 + }, + { + "episode": 15104, + "epoch": 2.8503491224759387, + "eps": 0, + "loss/policy_avg": -0.028734426945447922, + "loss/value_avg": 0.00377194257453084, + "lr": 1.3131313131313133e-07, + "objective/entropy": -750.08837890625, + "objective/kl": 7.2935791015625, + "objective/non_score_reward": -0.2188073694705963, + "objective/rlhf_reward": 0.2689855992794037, + "objective/scores": 0.48828125, + "policy/approxkl_avg": 0.0003561212797649205, + "policy/clipfrac_avg": 0.007725189905613661, + "policy/entropy_avg": 0.1868082731962204, + "step": 236, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 54, + "val/ratio": 0.9999493360519409, + "val/ratio_var": 5.775621048087487e-07 + }, + { + "episode": 15168, + "epoch": 2.8624268729949045, + "eps": 0, + "loss/policy_avg": -0.02059931308031082, + "loss/value_avg": 0.004160116892307997, + "lr": 1.2962962962962961e-07, + "objective/entropy": -689.4764404296875, + "objective/kl": 7.785543441772461, + "objective/non_score_reward": -0.2335663139820099, + "objective/rlhf_reward": 0.1804961860179901, + "objective/scores": 0.4140625, + "policy/approxkl_avg": 0.0003638736379798502, + "policy/clipfrac_avg": 0.007504904642701149, + "policy/entropy_avg": 0.19126257300376892, + "step": 237, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 1.000117540359497, + "val/ratio_var": 5.907762670176453e-07 + }, + { + "episode": 15232, + "epoch": 2.8745046235138707, + "eps": 0, + "loss/policy_avg": -0.002399355173110962, + "loss/value_avg": 0.003902244148775935, + "lr": 1.2794612794612795e-07, + "objective/entropy": -693.060791015625, + "objective/kl": 6.487215042114258, + "objective/non_score_reward": -0.19461645185947418, + "objective/rlhf_reward": 0.18038354814052582, + "objective/scores": 0.375, + "policy/approxkl_avg": 0.00036661222111433744, + "policy/clipfrac_avg": 0.007351069711148739, + "policy/entropy_avg": 0.1860555112361908, + "step": 238, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 1.0000566244125366, + "val/ratio_var": 6.291702447924763e-07 + }, + { + "episode": 15296, + "epoch": 2.8865823740328365, + "eps": 0, + "loss/policy_avg": -0.007437670137733221, + "loss/value_avg": 0.004013408906757832, + "lr": 1.2626262626262626e-07, + "objective/entropy": -704.9505615234375, + "objective/kl": 6.939170837402344, + "objective/non_score_reward": -0.20817512273788452, + "objective/rlhf_reward": 0.17024284601211548, + "objective/scores": 0.37890625, + "policy/approxkl_avg": 0.00033546844497323036, + "policy/clipfrac_avg": 0.007042970508337021, + "policy/entropy_avg": 0.1695149838924408, + "step": 239, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.0000756978988647, + "val/ratio_var": 6.734092039550887e-07 + }, + { + "episode": 15360, + "epoch": 2.898660124551802, + "eps": 0, + "loss/policy_avg": -0.028576653450727463, + "loss/value_avg": 0.004010652657598257, + "lr": 1.2457912457912457e-07, + "objective/entropy": -654.0175170898438, + "objective/kl": 8.195779800415039, + "objective/non_score_reward": -0.24587342143058777, + "objective/rlhf_reward": 0.18869690597057343, + "objective/scores": 0.43359375, + "policy/approxkl_avg": 0.0004402039048727602, + "policy/clipfrac_avg": 0.007189431693404913, + "policy/entropy_avg": 0.17950567603111267, + "step": 240, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 43, + "val/ratio": 0.9998716115951538, + "val/ratio_var": 6.682057573925704e-07 + }, + { + "episode": 15424, + "epoch": 2.910737875070768, + "eps": 0, + "loss/policy_avg": -0.005296625196933746, + "loss/value_avg": 0.0039635319262743, + "lr": 1.2289562289562288e-07, + "objective/entropy": -656.5180053710938, + "objective/kl": 8.551060676574707, + "objective/non_score_reward": -0.2565317749977112, + "objective/rlhf_reward": 0.13604632019996643, + "objective/scores": 0.392578125, + "policy/approxkl_avg": 0.00038327404763549566, + "policy/clipfrac_avg": 0.007787951733916998, + "policy/entropy_avg": 0.17977142333984375, + "step": 241, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.0001020431518555, + "val/ratio_var": 5.759193868470902e-07 + }, + { + "episode": 15488, + "epoch": 2.9228156255897337, + "eps": 0, + "loss/policy_avg": -0.009168568067252636, + "loss/value_avg": 0.004365907050669193, + "lr": 1.2121212121212122e-07, + "objective/entropy": -702.2899169921875, + "objective/kl": 7.23637056350708, + "objective/non_score_reward": -0.21709111332893372, + "objective/rlhf_reward": 0.22870966792106628, + "objective/scores": 0.4453125, + "policy/approxkl_avg": 0.0003652493469417095, + "policy/clipfrac_avg": 0.007043258287012577, + "policy/entropy_avg": 0.1999460905790329, + "step": 242, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 50, + "val/ratio": 0.9999537467956543, + "val/ratio_var": 4.7287392135331174e-07 + }, + { + "episode": 15552, + "epoch": 2.9348933761087, + "eps": 0, + "loss/policy_avg": -0.03705034777522087, + "loss/value_avg": 0.0035443564411252737, + "lr": 1.1952861952861953e-07, + "objective/entropy": -629.838623046875, + "objective/kl": 8.425390243530273, + "objective/non_score_reward": -0.25276172161102295, + "objective/rlhf_reward": 0.22135938704013824, + "objective/scores": 0.474609375, + "policy/approxkl_avg": 0.00040427930071018636, + "policy/clipfrac_avg": 0.00702214939519763, + "policy/entropy_avg": 0.19646072387695312, + "step": 243, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 43, + "val/ratio": 1.0001170635223389, + "val/ratio_var": 6.153204026304593e-07 + }, + { + "episode": 15616, + "epoch": 2.9469711266276657, + "eps": 0, + "loss/policy_avg": -0.03318122774362564, + "loss/value_avg": 0.003984754905104637, + "lr": 1.1784511784511784e-07, + "objective/entropy": -658.1822509765625, + "objective/kl": 8.162740707397461, + "objective/non_score_reward": -0.2448822408914566, + "objective/rlhf_reward": 0.2568267583847046, + "objective/scores": 0.5, + "policy/approxkl_avg": 0.00037857884308323264, + "policy/clipfrac_avg": 0.0069054896011948586, + "policy/entropy_avg": 0.18754324316978455, + "step": 244, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 37, + "val/ratio": 0.9999241828918457, + "val/ratio_var": 6.279624926719407e-07 + }, + { + "episode": 15680, + "epoch": 2.9590488771466315, + "eps": 0, + "loss/policy_avg": -0.019931495189666748, + "loss/value_avg": 0.003651971695944667, + "lr": 1.1616161616161615e-07, + "objective/entropy": -606.1823120117188, + "objective/kl": 7.644600868225098, + "objective/non_score_reward": -0.22933802008628845, + "objective/rlhf_reward": 0.14859166741371155, + "objective/scores": 0.37890625, + "policy/approxkl_avg": 0.0005081476992927492, + "policy/clipfrac_avg": 0.007128735538572073, + "policy/entropy_avg": 0.19517645239830017, + "step": 245, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 40, + "val/ratio": 1.0000133514404297, + "val/ratio_var": 6.433290309360018e-07 + }, + { + "episode": 15744, + "epoch": 2.971126627665597, + "eps": 0, + "loss/policy_avg": -0.044989436864852905, + "loss/value_avg": 0.004129257518798113, + "lr": 1.1447811447811447e-07, + "objective/entropy": -718.4443359375, + "objective/kl": 6.58664608001709, + "objective/non_score_reward": -0.197599396109581, + "objective/rlhf_reward": 0.3522053062915802, + "objective/scores": 0.55078125, + "policy/approxkl_avg": 0.00035908090649172664, + "policy/clipfrac_avg": 0.006561779882758856, + "policy/entropy_avg": 0.17469915747642517, + "step": 246, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 43, + "val/ratio": 1.000040888786316, + "val/ratio_var": 6.177034492793609e-07 + }, + { + "episode": 15808, + "epoch": 2.983204378184563, + "eps": 0, + "loss/policy_avg": -0.006306433584541082, + "loss/value_avg": 0.00377975357696414, + "lr": 1.1279461279461279e-07, + "objective/entropy": -669.0179443359375, + "objective/kl": 7.104648590087891, + "objective/non_score_reward": -0.21313944458961487, + "objective/rlhf_reward": 0.19604022800922394, + "objective/scores": 0.41015625, + "policy/approxkl_avg": 0.0003768262395169586, + "policy/clipfrac_avg": 0.007034813519567251, + "policy/entropy_avg": 0.1779836118221283, + "step": 247, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 47, + "val/ratio": 0.999952495098114, + "val/ratio_var": 5.565264018514426e-07 + }, + { + "episode": 15872, + "epoch": 2.995282128703529, + "eps": 0, + "loss/policy_avg": -0.030956070870161057, + "loss/value_avg": 0.0035534966737031937, + "lr": 1.111111111111111e-07, + "objective/entropy": -611.2562255859375, + "objective/kl": 8.399555206298828, + "objective/non_score_reward": -0.25198665261268616, + "objective/rlhf_reward": 0.16500553488731384, + "objective/scores": 0.41796875, + "policy/approxkl_avg": 0.00039667676901444793, + "policy/clipfrac_avg": 0.007198335137218237, + "policy/entropy_avg": 0.19819514453411102, + "step": 248, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 0.999969482421875, + "val/ratio_var": 4.946619469592406e-07 + }, + { + "episode": 15936, + "epoch": 3.007359879222495, + "eps": 0, + "loss/policy_avg": -0.017340319231152534, + "loss/value_avg": 0.004106181673705578, + "lr": 1.0942760942760942e-07, + "objective/entropy": -661.902099609375, + "objective/kl": 7.087057113647461, + "objective/non_score_reward": -0.21261171996593475, + "objective/rlhf_reward": 0.24832576513290405, + "objective/scores": 0.4609375, + "policy/approxkl_avg": 0.00036943715531378984, + "policy/clipfrac_avg": 0.0072855958715081215, + "policy/entropy_avg": 0.19068431854248047, + "step": 249, + "val/clipfrac_avg": 5.44804743185523e-06, + "val/num_eos_tokens": 46, + "val/ratio": 1.0000860691070557, + "val/ratio_var": 6.541467314491456e-07 + }, + { + "episode": 16000, + "epoch": 3.0194376297414607, + "eps": 0, + "loss/policy_avg": -0.006075289100408554, + "loss/value_avg": 0.0037019317969679832, + "lr": 1.0774410774410773e-07, + "objective/entropy": -622.1962890625, + "objective/kl": 8.35627555847168, + "objective/non_score_reward": -0.25068825483322144, + "objective/rlhf_reward": 0.055464085191488266, + "objective/scores": 0.306640625, + "policy/approxkl_avg": 0.0004216305387672037, + "policy/clipfrac_avg": 0.00783085823059082, + "policy/entropy_avg": 0.2043101042509079, + "step": 250, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 33, + "val/ratio": 0.9999951124191284, + "val/ratio_var": 7.26553423646692e-07 + }, + { + "episode": 16064, + "epoch": 3.0315153802604264, + "eps": 0, + "loss/policy_avg": -0.02081982046365738, + "loss/value_avg": 0.0035106416326016188, + "lr": 1.0606060606060605e-07, + "objective/entropy": -641.5849609375, + "objective/kl": 8.316513061523438, + "objective/non_score_reward": -0.24949535727500916, + "objective/rlhf_reward": 0.19825854897499084, + "objective/scores": 0.447265625, + "policy/approxkl_avg": 0.00040991941932588816, + "policy/clipfrac_avg": 0.007065493613481522, + "policy/entropy_avg": 0.19249090552330017, + "step": 251, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 43, + "val/ratio": 0.999900221824646, + "val/ratio_var": 8.348190476681339e-07 + }, + { + "episode": 16128, + "epoch": 3.043593130779392, + "eps": 0, + "loss/policy_avg": -0.03310645744204521, + "loss/value_avg": 0.003968073055148125, + "lr": 1.0437710437710436e-07, + "objective/entropy": -636.48974609375, + "objective/kl": 7.7286529541015625, + "objective/non_score_reward": -0.23185959458351135, + "objective/rlhf_reward": 0.17146071791648865, + "objective/scores": 0.40234375, + "policy/approxkl_avg": 0.00042435387149453163, + "policy/clipfrac_avg": 0.007897584699094296, + "policy/entropy_avg": 0.20297622680664062, + "step": 252, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 47, + "val/ratio": 0.99993896484375, + "val/ratio_var": 8.238701525442593e-07 + }, + { + "episode": 16192, + "epoch": 3.0556708812983584, + "eps": 0, + "loss/policy_avg": -0.01020126324146986, + "loss/value_avg": 0.0036760650109499693, + "lr": 1.0269360269360269e-07, + "objective/entropy": -679.7139892578125, + "objective/kl": 7.155096054077148, + "objective/non_score_reward": -0.2146528959274292, + "objective/rlhf_reward": 0.216499462723732, + "objective/scores": 0.431640625, + "policy/approxkl_avg": 0.0003815985983237624, + "policy/clipfrac_avg": 0.007189772091805935, + "policy/entropy_avg": 0.19544348120689392, + "step": 253, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 0.9999157190322876, + "val/ratio_var": 5.615696636596113e-07 + }, + { + "episode": 16256, + "epoch": 3.067748631817324, + "eps": 0, + "loss/policy_avg": -0.005997738800942898, + "loss/value_avg": 0.004006213508546352, + "lr": 1.01010101010101e-07, + "objective/entropy": -635.3106689453125, + "objective/kl": 8.364971160888672, + "objective/non_score_reward": -0.2509491443634033, + "objective/rlhf_reward": 0.12844537198543549, + "objective/scores": 0.37890625, + "policy/approxkl_avg": 0.000423733436036855, + "policy/clipfrac_avg": 0.006593957543373108, + "policy/entropy_avg": 0.2101338803768158, + "step": 254, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 0.9999582171440125, + "val/ratio_var": 5.447363378152659e-07 + }, + { + "episode": 16320, + "epoch": 3.07982638233629, + "eps": 0, + "loss/policy_avg": -0.016920043155550957, + "loss/value_avg": 0.0037770867347717285, + "lr": 9.932659932659932e-08, + "objective/entropy": -681.9376220703125, + "objective/kl": 7.062074661254883, + "objective/non_score_reward": -0.2118622362613678, + "objective/rlhf_reward": 0.2236846387386322, + "objective/scores": 0.435546875, + "policy/approxkl_avg": 0.00037374263047240674, + "policy/clipfrac_avg": 0.0069709960371255875, + "policy/entropy_avg": 0.18284988403320312, + "step": 255, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 40, + "val/ratio": 1.0000779628753662, + "val/ratio_var": 6.587182497241884e-07 + }, + { + "episode": 16384, + "epoch": 3.0919041328552557, + "eps": 0, + "loss/policy_avg": -0.01712847873568535, + "loss/value_avg": 0.0041097188368439674, + "lr": 9.764309764309763e-08, + "objective/entropy": -683.023193359375, + "objective/kl": 6.437891960144043, + "objective/non_score_reward": -0.19313675165176392, + "objective/rlhf_reward": 0.18479293584823608, + "objective/scores": 0.37890625, + "policy/approxkl_avg": 0.0003696854109875858, + "policy/clipfrac_avg": 0.007503229193389416, + "policy/entropy_avg": 0.18361155688762665, + "step": 256, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 47, + "val/ratio": 1.0000218152999878, + "val/ratio_var": 5.72627527617442e-07 + }, + { + "episode": 16448, + "epoch": 3.1039818833742214, + "eps": 0, + "loss/policy_avg": -0.03213302046060562, + "loss/value_avg": 0.0037582411896437407, + "lr": 9.595959595959594e-08, + "objective/entropy": -648.9036254882812, + "objective/kl": 7.319805145263672, + "objective/non_score_reward": -0.21959413588047028, + "objective/rlhf_reward": 0.22278867661952972, + "objective/scores": 0.44140625, + "policy/approxkl_avg": 0.0003951989929191768, + "policy/clipfrac_avg": 0.006181157194077969, + "policy/entropy_avg": 0.18743896484375, + "step": 257, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 1.0000240802764893, + "val/ratio_var": 6.705196256007184e-07 + }, + { + "episode": 16512, + "epoch": 3.116059633893187, + "eps": 0, + "loss/policy_avg": -0.026423348113894463, + "loss/value_avg": 0.00365483108907938, + "lr": 9.427609427609427e-08, + "objective/entropy": -658.4329223632812, + "objective/kl": 6.591666221618652, + "objective/non_score_reward": -0.19774997234344482, + "objective/rlhf_reward": 0.23535549640655518, + "objective/scores": 0.43359375, + "policy/approxkl_avg": 0.0003852533991448581, + "policy/clipfrac_avg": 0.006929041352123022, + "policy/entropy_avg": 0.18116506934165955, + "step": 258, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.000004768371582, + "val/ratio_var": 6.329533448479197e-07 + }, + { + "episode": 16576, + "epoch": 3.1281373844121534, + "eps": 0, + "loss/policy_avg": -0.01229805313050747, + "loss/value_avg": 0.004078000318259001, + "lr": 9.259259259259258e-08, + "objective/entropy": -706.8790893554688, + "objective/kl": 6.781729698181152, + "objective/non_score_reward": -0.20345187187194824, + "objective/rlhf_reward": 0.25748562812805176, + "objective/scores": 0.4609375, + "policy/approxkl_avg": 0.00035077554639428854, + "policy/clipfrac_avg": 0.00699991500005126, + "policy/entropy_avg": 0.18130874633789062, + "step": 259, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 0.9999758005142212, + "val/ratio_var": 5.301695296111575e-07 + }, + { + "episode": 16640, + "epoch": 3.140215134931119, + "eps": 0, + "loss/policy_avg": -0.03154832124710083, + "loss/value_avg": 0.003632882609963417, + "lr": 9.09090909090909e-08, + "objective/entropy": -613.3504638671875, + "objective/kl": 7.657683372497559, + "objective/non_score_reward": -0.2297305017709732, + "objective/rlhf_reward": 0.148199200630188, + "objective/scores": 0.37890625, + "policy/approxkl_avg": 0.0005249691312201321, + "policy/clipfrac_avg": 0.007619872223585844, + "policy/entropy_avg": 0.18793997168540955, + "step": 260, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.000046730041504, + "val/ratio_var": 7.239053161356424e-07 + }, + { + "episode": 16704, + "epoch": 3.152292885450085, + "eps": 0, + "loss/policy_avg": -0.01213142555207014, + "loss/value_avg": 0.0035843513906002045, + "lr": 8.922558922558921e-08, + "objective/entropy": -649.6549072265625, + "objective/kl": 9.2880277633667, + "objective/non_score_reward": -0.27864083647727966, + "objective/rlhf_reward": 0.12467947602272034, + "objective/scores": 0.40234375, + "policy/approxkl_avg": 0.00038886774564161897, + "policy/clipfrac_avg": 0.006718775257468224, + "policy/entropy_avg": 0.18620681762695312, + "step": 261, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 0.999856173992157, + "val/ratio_var": 4.936819664180803e-07 + }, + { + "episode": 16768, + "epoch": 3.1643706359690507, + "eps": 0, + "loss/policy_avg": -0.007179616950452328, + "loss/value_avg": 0.0035246573388576508, + "lr": 8.754208754208754e-08, + "objective/entropy": -623.578857421875, + "objective/kl": 8.410058975219727, + "objective/non_score_reward": -0.25230175256729126, + "objective/rlhf_reward": 0.08363573253154755, + "objective/scores": 0.3359375, + "policy/approxkl_avg": 0.00044147943845018744, + "policy/clipfrac_avg": 0.007476884871721268, + "policy/entropy_avg": 0.20351791381835938, + "step": 262, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 41, + "val/ratio": 1.0000280141830444, + "val/ratio_var": 6.352038326440379e-07 + }, + { + "episode": 16832, + "epoch": 3.1764483864880164, + "eps": 0, + "loss/policy_avg": -0.003754607168957591, + "loss/value_avg": 0.003930999897420406, + "lr": 8.585858585858585e-08, + "objective/entropy": -696.397705078125, + "objective/kl": 8.027302742004395, + "objective/non_score_reward": -0.2408190667629242, + "objective/rlhf_reward": 0.0904797613620758, + "objective/scores": 0.33203125, + "policy/approxkl_avg": 0.0003533074341248721, + "policy/clipfrac_avg": 0.00765426829457283, + "policy/entropy_avg": 0.18895339965820312, + "step": 263, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 0.99993896484375, + "val/ratio_var": 4.5711240659329633e-07 + }, + { + "episode": 16896, + "epoch": 3.1885261370069826, + "eps": 0, + "loss/policy_avg": -0.017033755779266357, + "loss/value_avg": 0.003304037032648921, + "lr": 8.417508417508418e-08, + "objective/entropy": -598.6932373046875, + "objective/kl": 7.669593811035156, + "objective/non_score_reward": -0.23008780181407928, + "objective/rlhf_reward": 0.13075204193592072, + "objective/scores": 0.361328125, + "policy/approxkl_avg": 0.00047474654274992645, + "policy/clipfrac_avg": 0.007650506682693958, + "policy/entropy_avg": 0.20692571997642517, + "step": 264, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 39, + "val/ratio": 1.0001184940338135, + "val/ratio_var": 9.710288395581301e-07 + }, + { + "episode": 16960, + "epoch": 3.2006038875259484, + "eps": 0, + "loss/policy_avg": -0.02458018623292446, + "loss/value_avg": 0.00406844075769186, + "lr": 8.24915824915825e-08, + "objective/entropy": -673.213134765625, + "objective/kl": 6.648907661437988, + "objective/non_score_reward": -0.1994672268629074, + "objective/rlhf_reward": 0.2600054144859314, + "objective/scores": 0.458984375, + "policy/approxkl_avg": 0.0003697554930113256, + "policy/clipfrac_avg": 0.0070952074602246284, + "policy/entropy_avg": 0.1852405071258545, + "step": 265, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 53, + "val/ratio": 1.0000247955322266, + "val/ratio_var": 5.141792485119367e-07 + }, + { + "episode": 17024, + "epoch": 3.212681638044914, + "eps": 0, + "loss/policy_avg": -0.009238027967512608, + "loss/value_avg": 0.004165910184383392, + "lr": 8.080808080808082e-08, + "objective/entropy": -735.2806396484375, + "objective/kl": 6.235321998596191, + "objective/non_score_reward": -0.1870596557855606, + "objective/rlhf_reward": 0.2616708278656006, + "objective/scores": 0.44921875, + "policy/approxkl_avg": 0.00032986787846311927, + "policy/clipfrac_avg": 0.006603958085179329, + "policy/entropy_avg": 0.17215602099895477, + "step": 266, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 0.9999458193778992, + "val/ratio_var": 5.301350824993278e-07 + }, + { + "episode": 17088, + "epoch": 3.22475938856388, + "eps": 0, + "loss/policy_avg": -0.01698639616370201, + "loss/value_avg": 0.0037531605921685696, + "lr": 7.912457912457913e-08, + "objective/entropy": -652.947265625, + "objective/kl": 7.257780075073242, + "objective/non_score_reward": -0.21773339807987213, + "objective/rlhf_reward": 0.21537207067012787, + "objective/scores": 0.43359375, + "policy/approxkl_avg": 0.000403220416046679, + "policy/clipfrac_avg": 0.006984008476138115, + "policy/entropy_avg": 0.1990203857421875, + "step": 267, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 48, + "val/ratio": 1.0000309944152832, + "val/ratio_var": 6.781556862733851e-07 + }, + { + "episode": 17152, + "epoch": 3.2368371390828456, + "eps": 0, + "loss/policy_avg": -0.028039943426847458, + "loss/value_avg": 0.003373341169208288, + "lr": 7.744107744107744e-08, + "objective/entropy": -569.8634033203125, + "objective/kl": 7.230891227722168, + "objective/non_score_reward": -0.21692675352096558, + "objective/rlhf_reward": 0.14879590272903442, + "objective/scores": 0.365234375, + "policy/approxkl_avg": 0.0004610806645359844, + "policy/clipfrac_avg": 0.007782801054418087, + "policy/entropy_avg": 0.21935272216796875, + "step": 268, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 38, + "val/ratio": 0.9999889135360718, + "val/ratio_var": 6.67467986659176e-07 + }, + { + "episode": 17216, + "epoch": 3.248914889601812, + "eps": 0, + "loss/policy_avg": 0.012302754446864128, + "loss/value_avg": 0.0036267773248255253, + "lr": 7.575757575757576e-08, + "objective/entropy": -655.1212768554688, + "objective/kl": 8.060359954833984, + "objective/non_score_reward": -0.24181079864501953, + "objective/rlhf_reward": 0.03614329546689987, + "objective/scores": 0.27734375, + "policy/approxkl_avg": 0.0003768009482882917, + "policy/clipfrac_avg": 0.0069151511415839195, + "policy/entropy_avg": 0.20404815673828125, + "step": 269, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 0.9999961256980896, + "val/ratio_var": 5.263832463242579e-07 + }, + { + "episode": 17280, + "epoch": 3.2609926401207776, + "eps": 0, + "loss/policy_avg": -0.004253086168318987, + "loss/value_avg": 0.0037821203004568815, + "lr": 7.407407407407407e-08, + "objective/entropy": -601.41015625, + "objective/kl": 7.641479015350342, + "objective/non_score_reward": -0.2292443811893463, + "objective/rlhf_reward": 0.1254919469356537, + "objective/scores": 0.35546875, + "policy/approxkl_avg": 0.0004039438790641725, + "policy/clipfrac_avg": 0.0064240507781505585, + "policy/entropy_avg": 0.19796499609947205, + "step": 270, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 41, + "val/ratio": 0.9998639822006226, + "val/ratio_var": 5.743943347624736e-07 + }, + { + "episode": 17344, + "epoch": 3.2730703906397434, + "eps": 0, + "loss/policy_avg": -0.03380737453699112, + "loss/value_avg": 0.0034328820183873177, + "lr": 7.23905723905724e-08, + "objective/entropy": -647.0220947265625, + "objective/kl": 7.08984899520874, + "objective/non_score_reward": -0.21269546449184418, + "objective/rlhf_reward": 0.27851545810699463, + "objective/scores": 0.4921875, + "policy/approxkl_avg": 0.00040110870031639934, + "policy/clipfrac_avg": 0.007048811763525009, + "policy/entropy_avg": 0.19513702392578125, + "step": 271, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 0.9998549222946167, + "val/ratio_var": 6.118988267189707e-07 + }, + { + "episode": 17408, + "epoch": 3.285148141158709, + "eps": 0, + "loss/policy_avg": -0.014850600622594357, + "loss/value_avg": 0.0036329745780676603, + "lr": 7.070707070707071e-08, + "objective/entropy": -586.4075317382812, + "objective/kl": 8.249979019165039, + "objective/non_score_reward": -0.24749934673309326, + "objective/rlhf_reward": 0.15679752826690674, + "objective/scores": 0.404296875, + "policy/approxkl_avg": 0.00042898274841718376, + "policy/clipfrac_avg": 0.0069151753559708595, + "policy/entropy_avg": 0.19608816504478455, + "step": 272, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 41, + "val/ratio": 1.0001020431518555, + "val/ratio_var": 8.127553314807301e-07 + }, + { + "episode": 17472, + "epoch": 3.297225891677675, + "eps": 0, + "loss/policy_avg": -0.01862741820514202, + "loss/value_avg": 0.0038365069776773453, + "lr": 6.902356902356903e-08, + "objective/entropy": -696.0003051757812, + "objective/kl": 7.6856584548950195, + "objective/non_score_reward": -0.2305697500705719, + "objective/rlhf_reward": 0.2308560311794281, + "objective/scores": 0.4609375, + "policy/approxkl_avg": 0.00037589838029816747, + "policy/clipfrac_avg": 0.007162098772823811, + "policy/entropy_avg": 0.186614990234375, + "step": 273, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 1.0000542402267456, + "val/ratio_var": 5.687811608368065e-07 + }, + { + "episode": 17536, + "epoch": 3.309303642196641, + "eps": 0, + "loss/policy_avg": -0.02315349504351616, + "loss/value_avg": 0.0040380991995334625, + "lr": 6.734006734006734e-08, + "objective/entropy": -722.7449951171875, + "objective/kl": 6.226775646209717, + "objective/non_score_reward": -0.18680325150489807, + "objective/rlhf_reward": 0.24581393599510193, + "objective/scores": 0.43359375, + "policy/approxkl_avg": 0.0003334844659548253, + "policy/clipfrac_avg": 0.0064841099083423615, + "policy/entropy_avg": 0.18703460693359375, + "step": 274, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.0000557899475098, + "val/ratio_var": 5.662852800014662e-07 + }, + { + "episode": 17600, + "epoch": 3.321381392715607, + "eps": 0, + "loss/policy_avg": -0.02682194858789444, + "loss/value_avg": 0.0037449407391250134, + "lr": 6.565656565656566e-08, + "objective/entropy": -635.6344604492188, + "objective/kl": 7.442312240600586, + "objective/non_score_reward": -0.22326935827732086, + "objective/rlhf_reward": 0.22106656432151794, + "objective/scores": 0.4453125, + "policy/approxkl_avg": 0.00040927709778770804, + "policy/clipfrac_avg": 0.006474938243627548, + "policy/entropy_avg": 0.19021479785442352, + "step": 275, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 1.0002195835113525, + "val/ratio_var": 6.032373676134739e-07 + }, + { + "episode": 17664, + "epoch": 3.3334591432345726, + "eps": 0, + "loss/policy_avg": -0.005783136934041977, + "loss/value_avg": 0.003839105134829879, + "lr": 6.397306397306398e-08, + "objective/entropy": -667.8414306640625, + "objective/kl": 6.772984504699707, + "objective/non_score_reward": -0.20318952202796936, + "objective/rlhf_reward": 0.18743547797203064, + "objective/scores": 0.390625, + "policy/approxkl_avg": 0.0003651580773293972, + "policy/clipfrac_avg": 0.007003391161561012, + "policy/entropy_avg": 0.19083023071289062, + "step": 276, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 47, + "val/ratio": 1.0000221729278564, + "val/ratio_var": 5.736771981901256e-07 + }, + { + "episode": 17728, + "epoch": 3.3455368937535384, + "eps": 0, + "loss/policy_avg": -0.006362794898450375, + "loss/value_avg": 0.003617867361754179, + "lr": 6.228956228956229e-08, + "objective/entropy": -669.930419921875, + "objective/kl": 7.383747100830078, + "objective/non_score_reward": -0.22151240706443787, + "objective/rlhf_reward": 0.18522588908672333, + "objective/scores": 0.40625, + "policy/approxkl_avg": 0.00036430690670385957, + "policy/clipfrac_avg": 0.007155537139624357, + "policy/entropy_avg": 0.18641917407512665, + "step": 277, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 0.9999706745147705, + "val/ratio_var": 6.842114999017213e-07 + }, + { + "episode": 17792, + "epoch": 3.357614644272504, + "eps": 0, + "loss/policy_avg": -0.03245866298675537, + "loss/value_avg": 0.003861590987071395, + "lr": 6.060606060606061e-08, + "objective/entropy": -712.3450927734375, + "objective/kl": 6.383450984954834, + "objective/non_score_reward": -0.19150352478027344, + "objective/rlhf_reward": 0.31337928771972656, + "objective/scores": 0.50390625, + "policy/approxkl_avg": 0.00036257284227758646, + "policy/clipfrac_avg": 0.006432620342820883, + "policy/entropy_avg": 0.18203863501548767, + "step": 278, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 0.9999188184738159, + "val/ratio_var": 3.687934793106251e-07 + }, + { + "episode": 17856, + "epoch": 3.36969239479147, + "eps": 0, + "loss/policy_avg": -0.025677090510725975, + "loss/value_avg": 0.003610721556469798, + "lr": 5.892255892255892e-08, + "objective/entropy": -635.7150268554688, + "objective/kl": 6.59326171875, + "objective/non_score_reward": -0.19779784977436066, + "objective/rlhf_reward": 0.20503418147563934, + "objective/scores": 0.40234375, + "policy/approxkl_avg": 0.0003899303264915943, + "policy/clipfrac_avg": 0.006962133105844259, + "policy/entropy_avg": 0.19839096069335938, + "step": 279, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 47, + "val/ratio": 1.0000261068344116, + "val/ratio_var": 5.185344775782141e-07 + }, + { + "episode": 17920, + "epoch": 3.381770145310436, + "eps": 0, + "loss/policy_avg": -0.028471484780311584, + "loss/value_avg": 0.0032405236270278692, + "lr": 5.723905723905724e-08, + "objective/entropy": -642.4448852539062, + "objective/kl": 6.222779750823975, + "objective/non_score_reward": -0.18668338656425476, + "objective/rlhf_reward": 0.26058220863342285, + "objective/scores": 0.447265625, + "policy/approxkl_avg": 0.0004811930702999234, + "policy/clipfrac_avg": 0.007410434540361166, + "policy/entropy_avg": 0.201324462890625, + "step": 280, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 0.9999183416366577, + "val/ratio_var": 6.145901920717733e-07 + }, + { + "episode": 17984, + "epoch": 3.393847895829402, + "eps": 0, + "loss/policy_avg": -0.018460756167769432, + "loss/value_avg": 0.0036118782591074705, + "lr": 5.555555555555555e-08, + "objective/entropy": -647.02392578125, + "objective/kl": 7.956416130065918, + "objective/non_score_reward": -0.2386924773454666, + "objective/rlhf_reward": 0.1660926640033722, + "objective/scores": 0.404296875, + "policy/approxkl_avg": 0.00041467935079708695, + "policy/clipfrac_avg": 0.007317427080124617, + "policy/entropy_avg": 0.20193736255168915, + "step": 281, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 1.0000567436218262, + "val/ratio_var": 5.499433655131725e-07 + }, + { + "episode": 18048, + "epoch": 3.4059256463483676, + "eps": 0, + "loss/policy_avg": -0.012457543984055519, + "loss/value_avg": 0.0038994457572698593, + "lr": 5.3872053872053865e-08, + "objective/entropy": -716.0316162109375, + "objective/kl": 5.913008689880371, + "objective/non_score_reward": -0.17739026248455048, + "objective/rlhf_reward": 0.30991441011428833, + "objective/scores": 0.48828125, + "policy/approxkl_avg": 0.0003301530086901039, + "policy/clipfrac_avg": 0.0064779892563819885, + "policy/entropy_avg": 0.18742243945598602, + "step": 282, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.0000253915786743, + "val/ratio_var": 4.4198901605341234e-07 + }, + { + "episode": 18112, + "epoch": 3.4180033968673333, + "eps": 0, + "loss/policy_avg": -0.013042353093624115, + "loss/value_avg": 0.0037472937256097794, + "lr": 5.218855218855218e-08, + "objective/entropy": -752.298828125, + "objective/kl": 6.022947311401367, + "objective/non_score_reward": -0.18068841099739075, + "objective/rlhf_reward": 0.35056155920028687, + "objective/scores": 0.53125, + "policy/approxkl_avg": 0.0003260195953771472, + "policy/clipfrac_avg": 0.0073262769728899, + "policy/entropy_avg": 0.193359375, + "step": 283, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 58, + "val/ratio": 1.00008225440979, + "val/ratio_var": 5.816585826323717e-07 + }, + { + "episode": 18176, + "epoch": 3.430081147386299, + "eps": 0, + "loss/policy_avg": -0.022922195494174957, + "loss/value_avg": 0.003924447111785412, + "lr": 5.05050505050505e-08, + "objective/entropy": -727.8096923828125, + "objective/kl": 6.566383361816406, + "objective/non_score_reward": -0.19699150323867798, + "objective/rlhf_reward": 0.274688184261322, + "objective/scores": 0.47265625, + "policy/approxkl_avg": 0.0003402878064662218, + "policy/clipfrac_avg": 0.006386288907378912, + "policy/entropy_avg": 0.17614874243736267, + "step": 284, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 56, + "val/ratio": 1.0001291036605835, + "val/ratio_var": 7.245762958518753e-07 + }, + { + "episode": 18240, + "epoch": 3.4421588979052653, + "eps": 0, + "loss/policy_avg": -0.005379597656428814, + "loss/value_avg": 0.003668400924652815, + "lr": 4.8821548821548816e-08, + "objective/entropy": -657.02294921875, + "objective/kl": 7.11133337020874, + "objective/non_score_reward": -0.21333999931812286, + "objective/rlhf_reward": 0.26126939058303833, + "objective/scores": 0.474609375, + "policy/approxkl_avg": 0.0003993379359599203, + "policy/clipfrac_avg": 0.00656835176050663, + "policy/entropy_avg": 0.19129817187786102, + "step": 285, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 39, + "val/ratio": 1.0000909566879272, + "val/ratio_var": 4.280297787317977e-07 + }, + { + "episode": 18304, + "epoch": 3.454236648424231, + "eps": 0, + "loss/policy_avg": -0.0007903016521595418, + "loss/value_avg": 0.003940465394407511, + "lr": 4.7138047138047134e-08, + "objective/entropy": -678.7791748046875, + "objective/kl": 7.29849100112915, + "objective/non_score_reward": -0.21895474195480347, + "objective/rlhf_reward": 0.15433627367019653, + "objective/scores": 0.373046875, + "policy/approxkl_avg": 0.0003655221953522414, + "policy/clipfrac_avg": 0.006983469240367413, + "policy/entropy_avg": 0.19457626342773438, + "step": 286, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 41, + "val/ratio": 1.0000255107879639, + "val/ratio_var": 6.261829526010843e-07 + }, + { + "episode": 18368, + "epoch": 3.466314398943197, + "eps": 0, + "loss/policy_avg": -0.013323694467544556, + "loss/value_avg": 0.003565411549061537, + "lr": 4.545454545454545e-08, + "objective/entropy": -666.1871948242188, + "objective/kl": 6.453955173492432, + "objective/non_score_reward": -0.19361865520477295, + "objective/rlhf_reward": 0.23753370344638824, + "objective/scores": 0.431640625, + "policy/approxkl_avg": 0.0003667096607387066, + "policy/clipfrac_avg": 0.0066523477435112, + "policy/entropy_avg": 0.19250616431236267, + "step": 287, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 41, + "val/ratio": 0.9998884201049805, + "val/ratio_var": 4.2977848124792217e-07 + }, + { + "episode": 18432, + "epoch": 3.4783921494621626, + "eps": 0, + "loss/policy_avg": -0.01416645385324955, + "loss/value_avg": 0.0035817499738186598, + "lr": 4.377104377104377e-08, + "objective/entropy": -700.9827270507812, + "objective/kl": 6.070189476013184, + "objective/non_score_reward": -0.1821056753396988, + "objective/rlhf_reward": 0.2661365270614624, + "objective/scores": 0.44921875, + "policy/approxkl_avg": 0.0003335383953526616, + "policy/clipfrac_avg": 0.006237865425646305, + "policy/entropy_avg": 0.18711933493614197, + "step": 288, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 1.000044584274292, + "val/ratio_var": 4.85237990233145e-07 + }, + { + "episode": 18496, + "epoch": 3.4904698999811283, + "eps": 0, + "loss/policy_avg": -0.022722497582435608, + "loss/value_avg": 0.0033240667544305325, + "lr": 4.208754208754209e-08, + "objective/entropy": -738.82373046875, + "objective/kl": 5.70413064956665, + "objective/non_score_reward": -0.1711239218711853, + "objective/rlhf_reward": 0.3274112343788147, + "objective/scores": 0.498046875, + "policy/approxkl_avg": 0.0003393371298443526, + "policy/clipfrac_avg": 0.007512836717069149, + "policy/entropy_avg": 0.1853078305721283, + "step": 289, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 0.9999982714653015, + "val/ratio_var": 4.772057877744373e-07 + }, + { + "episode": 18560, + "epoch": 3.5025476505000945, + "eps": 0, + "loss/policy_avg": -0.0007028168765828013, + "loss/value_avg": 0.0038722134195268154, + "lr": 4.040404040404041e-08, + "objective/entropy": -688.8043212890625, + "objective/kl": 7.602431297302246, + "objective/non_score_reward": -0.2280729115009308, + "objective/rlhf_reward": 0.1908724009990692, + "objective/scores": 0.41796875, + "policy/approxkl_avg": 0.00037353829247877, + "policy/clipfrac_avg": 0.007045034319162369, + "policy/entropy_avg": 0.19284312427043915, + "step": 290, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 0.9999067187309265, + "val/ratio_var": 6.341031166812172e-07 + }, + { + "episode": 18624, + "epoch": 3.5146254010190603, + "eps": 0, + "loss/policy_avg": -0.013677339069545269, + "loss/value_avg": 0.0034771780483424664, + "lr": 3.872053872053872e-08, + "objective/entropy": -623.31884765625, + "objective/kl": 6.28436803817749, + "objective/non_score_reward": -0.1885310411453247, + "objective/rlhf_reward": 0.1708439588546753, + "objective/scores": 0.359375, + "policy/approxkl_avg": 0.00041524547850713134, + "policy/clipfrac_avg": 0.006799499504268169, + "policy/entropy_avg": 0.182159423828125, + "step": 291, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 35, + "val/ratio": 1.0000431537628174, + "val/ratio_var": 6.771080052203615e-07 + }, + { + "episode": 18688, + "epoch": 3.526703151538026, + "eps": 0, + "loss/policy_avg": -0.02629309892654419, + "loss/value_avg": 0.003640729933977127, + "lr": 3.7037037037037036e-08, + "objective/entropy": -668.0806884765625, + "objective/kl": 7.632940292358398, + "objective/non_score_reward": -0.22898820042610168, + "objective/rlhf_reward": 0.24610945582389832, + "objective/scores": 0.474609375, + "policy/approxkl_avg": 0.00038289197254925966, + "policy/clipfrac_avg": 0.0064846850000321865, + "policy/entropy_avg": 0.19228872656822205, + "step": 292, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 1.0000791549682617, + "val/ratio_var": 6.841331696705311e-07 + }, + { + "episode": 18752, + "epoch": 3.538780902056992, + "eps": 0, + "loss/policy_avg": -0.012860393151640892, + "loss/value_avg": 0.004311088938266039, + "lr": 3.5353535353535353e-08, + "objective/entropy": -699.3877563476562, + "objective/kl": 6.807313919067383, + "objective/non_score_reward": -0.20421940088272095, + "objective/rlhf_reward": 0.27234309911727905, + "objective/scores": 0.4765625, + "policy/approxkl_avg": 0.00037193228490650654, + "policy/clipfrac_avg": 0.006834262516349554, + "policy/entropy_avg": 0.17558543384075165, + "step": 293, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 40, + "val/ratio": 0.9999904632568359, + "val/ratio_var": 5.597553922598308e-07 + }, + { + "episode": 18816, + "epoch": 3.5508586525759576, + "eps": 0, + "loss/policy_avg": -0.03740035742521286, + "loss/value_avg": 0.003467664122581482, + "lr": 3.367003367003367e-08, + "objective/entropy": -682.0465698242188, + "objective/kl": 7.4768147468566895, + "objective/non_score_reward": -0.2243044376373291, + "objective/rlhf_reward": 0.2590939998626709, + "objective/scores": 0.484375, + "policy/approxkl_avg": 0.0004095996846444905, + "policy/clipfrac_avg": 0.006977352779358625, + "policy/entropy_avg": 0.19226329028606415, + "step": 294, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 39, + "val/ratio": 0.9999417066574097, + "val/ratio_var": 6.161255896586226e-07 + }, + { + "episode": 18880, + "epoch": 3.5629364030949238, + "eps": 0, + "loss/policy_avg": -0.02602003514766693, + "loss/value_avg": 0.003619457595050335, + "lr": 3.198653198653199e-08, + "objective/entropy": -725.4286499023438, + "objective/kl": 6.233606338500977, + "objective/non_score_reward": -0.18700820207595825, + "objective/rlhf_reward": 0.21924179792404175, + "objective/scores": 0.40625, + "policy/approxkl_avg": 0.00038076151395216584, + "policy/clipfrac_avg": 0.006789907813072205, + "policy/entropy_avg": 0.1863047331571579, + "step": 295, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 44, + "val/ratio": 1.0000081062316895, + "val/ratio_var": 6.039576874172781e-07 + }, + { + "episode": 18944, + "epoch": 3.5750141536138895, + "eps": 0, + "loss/policy_avg": -0.02581647038459778, + "loss/value_avg": 0.003386137541383505, + "lr": 3.0303030303030305e-08, + "objective/entropy": -664.8191528320312, + "objective/kl": 7.328958988189697, + "objective/non_score_reward": -0.21986877918243408, + "objective/rlhf_reward": 0.17368590831756592, + "objective/scores": 0.39453125, + "policy/approxkl_avg": 0.0003837857802864164, + "policy/clipfrac_avg": 0.006847723387181759, + "policy/entropy_avg": 0.2102610319852829, + "step": 296, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 48, + "val/ratio": 0.9997448921203613, + "val/ratio_var": 6.460118697759754e-07 + }, + { + "episode": 19008, + "epoch": 3.5870919041328553, + "eps": 0, + "loss/policy_avg": -0.01845662109553814, + "loss/value_avg": 0.0036777183413505554, + "lr": 2.861952861952862e-08, + "objective/entropy": -644.3194580078125, + "objective/kl": 6.178158760070801, + "objective/non_score_reward": -0.18534475564956665, + "objective/rlhf_reward": 0.24483102560043335, + "objective/scores": 0.4296875, + "policy/approxkl_avg": 0.0003848365449812263, + "policy/clipfrac_avg": 0.007219640072435141, + "policy/entropy_avg": 0.18966802954673767, + "step": 297, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 34, + "val/ratio": 1.0000991821289062, + "val/ratio_var": 4.2051803461617965e-07 + }, + { + "episode": 19072, + "epoch": 3.599169654651821, + "eps": 0, + "loss/policy_avg": -0.03636598959565163, + "loss/value_avg": 0.0033596050925552845, + "lr": 2.6936026936026933e-08, + "objective/entropy": -675.494384765625, + "objective/kl": 6.287810802459717, + "objective/non_score_reward": -0.18863432109355927, + "objective/rlhf_reward": 0.34456878900527954, + "objective/scores": 0.53125, + "policy/approxkl_avg": 0.000387437641620636, + "policy/clipfrac_avg": 0.006644147448241711, + "policy/entropy_avg": 0.2019907683134079, + "step": 298, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 0.9999553561210632, + "val/ratio_var": 5.573217549681431e-07 + }, + { + "episode": 19136, + "epoch": 3.611247405170787, + "eps": 0, + "loss/policy_avg": -0.00651153177022934, + "loss/value_avg": 0.003799569560214877, + "lr": 2.525252525252525e-08, + "objective/entropy": -662.1578369140625, + "objective/kl": 6.4900946617126465, + "objective/non_score_reward": -0.1947028487920761, + "objective/rlhf_reward": 0.1832268387079239, + "objective/scores": 0.37890625, + "policy/approxkl_avg": 0.0003824663581326604, + "policy/clipfrac_avg": 0.006935178767889738, + "policy/entropy_avg": 0.18524932861328125, + "step": 299, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 0.9999604225158691, + "val/ratio_var": 5.727119400944503e-07 + }, + { + "episode": 19200, + "epoch": 3.623325155689753, + "eps": 0, + "loss/policy_avg": 4.80624566989718e-06, + "loss/value_avg": 0.003957442473620176, + "lr": 2.3569023569023567e-08, + "objective/entropy": -613.0647583007812, + "objective/kl": 7.1408891677856445, + "objective/non_score_reward": -0.21422669291496277, + "objective/rlhf_reward": 0.14368346333503723, + "objective/scores": 0.357421875, + "policy/approxkl_avg": 0.0004018023028038442, + "policy/clipfrac_avg": 0.007539949379861355, + "policy/entropy_avg": 0.18967437744140625, + "step": 300, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 1.0000495910644531, + "val/ratio_var": 7.146343250497011e-07 + }, + { + "episode": 19264, + "epoch": 3.6354029062087188, + "eps": 0, + "loss/policy_avg": -0.019861344248056412, + "loss/value_avg": 0.0036555491387844086, + "lr": 2.1885521885521884e-08, + "objective/entropy": -695.1436767578125, + "objective/kl": 5.617988586425781, + "objective/non_score_reward": -0.16853967308998108, + "objective/rlhf_reward": 0.2938627004623413, + "objective/scores": 0.462890625, + "policy/approxkl_avg": 0.00036370521411299706, + "policy/clipfrac_avg": 0.00675880815833807, + "policy/entropy_avg": 0.19301223754882812, + "step": 301, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 53, + "val/ratio": 0.9999930262565613, + "val/ratio_var": 4.901943952972942e-07 + }, + { + "episode": 19328, + "epoch": 3.6474806567276845, + "eps": 0, + "loss/policy_avg": 6.916312031535199e-06, + "loss/value_avg": 0.003553304821252823, + "lr": 2.0202020202020204e-08, + "objective/entropy": -603.779541015625, + "objective/kl": 7.373934745788574, + "objective/non_score_reward": -0.2212180495262146, + "objective/rlhf_reward": 0.1664772629737854, + "objective/scores": 0.38671875, + "policy/approxkl_avg": 0.0004275983665138483, + "policy/clipfrac_avg": 0.006720641162246466, + "policy/entropy_avg": 0.20763906836509705, + "step": 302, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 1.0000828504562378, + "val/ratio_var": 5.606044624073547e-07 + }, + { + "episode": 19392, + "epoch": 3.6595584072466503, + "eps": 0, + "loss/policy_avg": -0.016103968024253845, + "loss/value_avg": 0.0041246358305215836, + "lr": 1.8518518518518518e-08, + "objective/entropy": -669.2377319335938, + "objective/kl": 5.863403797149658, + "objective/non_score_reward": -0.1759021282196045, + "objective/rlhf_reward": 0.2698986530303955, + "objective/scores": 0.4453125, + "policy/approxkl_avg": 0.00045897456584498286, + "policy/clipfrac_avg": 0.005576698109507561, + "policy/entropy_avg": 0.17554092407226562, + "step": 303, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 1.0000157356262207, + "val/ratio_var": 6.06298158345453e-07 + }, + { + "episode": 19456, + "epoch": 3.671636157765616, + "eps": 0, + "loss/policy_avg": -0.014837692491710186, + "loss/value_avg": 0.0034644536208361387, + "lr": 1.6835016835016835e-08, + "objective/entropy": -638.3107299804688, + "objective/kl": 6.990595817565918, + "objective/non_score_reward": -0.20971786975860596, + "objective/rlhf_reward": 0.19067275524139404, + "objective/scores": 0.400390625, + "policy/approxkl_avg": 0.00039926738827489316, + "policy/clipfrac_avg": 0.0072073861956596375, + "policy/entropy_avg": 0.19615554809570312, + "step": 304, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 55, + "val/ratio": 0.9998650550842285, + "val/ratio_var": 6.70040776640235e-07 + }, + { + "episode": 19520, + "epoch": 3.683713908284582, + "eps": 0, + "loss/policy_avg": -0.03533481806516647, + "loss/value_avg": 0.003643455682322383, + "lr": 1.5151515151515152e-08, + "objective/entropy": -708.4891357421875, + "objective/kl": 6.058835506439209, + "objective/non_score_reward": -0.18176504969596863, + "objective/rlhf_reward": 0.36803963780403137, + "objective/scores": 0.55078125, + "policy/approxkl_avg": 0.00035262128221802413, + "policy/clipfrac_avg": 0.006895636674016714, + "policy/entropy_avg": 0.18742243945598602, + "step": 305, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 49, + "val/ratio": 1.0000662803649902, + "val/ratio_var": 4.963605420016393e-07 + }, + { + "episode": 19584, + "epoch": 3.695791658803548, + "eps": 0, + "loss/policy_avg": -0.02189006470143795, + "loss/value_avg": 0.0033562961034476757, + "lr": 1.3468013468013466e-08, + "objective/entropy": -635.7755737304688, + "objective/kl": 7.733546257019043, + "objective/non_score_reward": -0.23200638592243195, + "objective/rlhf_reward": 0.15861861407756805, + "objective/scores": 0.390625, + "policy/approxkl_avg": 0.00040599549538455904, + "policy/clipfrac_avg": 0.006881616078317165, + "policy/entropy_avg": 0.19137954711914062, + "step": 306, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 50, + "val/ratio": 1.0000026226043701, + "val/ratio_var": 6.138251933407446e-07 + }, + { + "episode": 19648, + "epoch": 3.7078694093225137, + "eps": 0, + "loss/policy_avg": -0.03934434801340103, + "loss/value_avg": 0.004052530974149704, + "lr": 1.1784511784511783e-08, + "objective/entropy": -738.7037353515625, + "objective/kl": 6.419657230377197, + "objective/non_score_reward": -0.19258970022201538, + "objective/rlhf_reward": 0.3157110810279846, + "objective/scores": 0.5078125, + "policy/approxkl_avg": 0.0003348543250467628, + "policy/clipfrac_avg": 0.006614835001528263, + "policy/entropy_avg": 0.1747385710477829, + "step": 307, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 50, + "val/ratio": 1.0000545978546143, + "val/ratio_var": 4.5710487484029727e-07 + }, + { + "episode": 19712, + "epoch": 3.7199471598414795, + "eps": 0, + "loss/policy_avg": -0.01892966404557228, + "loss/value_avg": 0.0036056172102689743, + "lr": 1.0101010101010102e-08, + "objective/entropy": -665.5927124023438, + "objective/kl": 6.821819305419922, + "objective/non_score_reward": -0.20465457439422607, + "objective/rlhf_reward": 0.2562829256057739, + "objective/scores": 0.4609375, + "policy/approxkl_avg": 0.0003650089493021369, + "policy/clipfrac_avg": 0.006426130421459675, + "policy/entropy_avg": 0.19108709692955017, + "step": 308, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 46, + "val/ratio": 0.9999858736991882, + "val/ratio_var": 4.877447850049066e-07 + }, + { + "episode": 19776, + "epoch": 3.7320249103604453, + "eps": 0, + "loss/policy_avg": -0.012006578966975212, + "loss/value_avg": 0.003470144933089614, + "lr": 8.417508417508418e-09, + "objective/entropy": -619.7354736328125, + "objective/kl": 8.15022087097168, + "objective/non_score_reward": -0.2445066124200821, + "objective/rlhf_reward": 0.11438010632991791, + "objective/scores": 0.359375, + "policy/approxkl_avg": 0.00041244737803936005, + "policy/clipfrac_avg": 0.007321036886423826, + "policy/entropy_avg": 0.19420623779296875, + "step": 309, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 42, + "val/ratio": 1.000077486038208, + "val/ratio_var": 6.490017199212161e-07 + }, + { + "episode": 19840, + "epoch": 3.744102660879411, + "eps": 0, + "loss/policy_avg": -0.015695635229349136, + "loss/value_avg": 0.00352904899045825, + "lr": 6.734006734006733e-09, + "objective/entropy": -641.0952758789062, + "objective/kl": 7.442835330963135, + "objective/non_score_reward": -0.22328504920005798, + "objective/rlhf_reward": 0.20689073204994202, + "objective/scores": 0.4296875, + "policy/approxkl_avg": 0.00039796438068151474, + "policy/clipfrac_avg": 0.007473438512533903, + "policy/entropy_avg": 0.19350814819335938, + "step": 310, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 43, + "val/ratio": 0.9999562501907349, + "val/ratio_var": 4.872820227319608e-07 + }, + { + "episode": 19904, + "epoch": 3.756180411398377, + "eps": 0, + "loss/policy_avg": 0.0016891969135031104, + "loss/value_avg": 0.003535608295351267, + "lr": 5.050505050505051e-09, + "objective/entropy": -627.7642822265625, + "objective/kl": 6.384559631347656, + "objective/non_score_reward": -0.1915367841720581, + "objective/rlhf_reward": 0.1114417240023613, + "objective/scores": 0.302734375, + "policy/approxkl_avg": 0.0004028166295029223, + "policy/clipfrac_avg": 0.006718785967677832, + "policy/entropy_avg": 0.2012532651424408, + "step": 311, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 45, + "val/ratio": 1.0000300407409668, + "val/ratio_var": 6.633476914430503e-07 + }, + { + "episode": 19968, + "epoch": 3.768258161917343, + "eps": 0, + "loss/policy_avg": -0.012200575321912766, + "loss/value_avg": 0.003601629287004471, + "lr": 3.3670033670033666e-09, + "objective/entropy": -709.2785034179688, + "objective/kl": 6.045020580291748, + "objective/non_score_reward": -0.18135061860084534, + "objective/rlhf_reward": 0.14872750639915466, + "objective/scores": 0.330078125, + "policy/approxkl_avg": 0.00035006398684345186, + "policy/clipfrac_avg": 0.006922123488038778, + "policy/entropy_avg": 0.1940714567899704, + "step": 312, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 52, + "val/ratio": 0.9998501539230347, + "val/ratio_var": 5.097400048725831e-07 + }, + { + "episode": 20032, + "epoch": 3.7803359124363087, + "eps": 0, + "loss/policy_avg": -0.005085974466055632, + "loss/value_avg": 0.003475761041045189, + "lr": 1.6835016835016833e-09, + "objective/entropy": -692.533203125, + "objective/kl": 6.632614612579346, + "objective/non_score_reward": -0.19897842407226562, + "objective/rlhf_reward": 0.22387313842773438, + "objective/scores": 0.421875, + "policy/approxkl_avg": 0.000355798052623868, + "policy/clipfrac_avg": 0.006758556701242924, + "policy/entropy_avg": 0.17910131812095642, + "step": 313, + "val/clipfrac_avg": 0.0, + "val/num_eos_tokens": 48, + "val/ratio": 1.0001018047332764, + "val/ratio_var": 6.0731622397725e-07 + } + ], + "logging_steps": 500, + "max_steps": 313, + "num_input_tokens_seen": 0, + "num_train_epochs": 3.774297037176826, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": true, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0, + "train_batch_size": null, + "trial_name": null, + "trial_params": null +}