{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8397480755773268, "eval_steps": 500, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "clipped_completions_ratio": 0.0, "epoch": 0.0003498950314905528, "grad_norm": 0.15604860850093621, "kl": 0.000347137451171875, "learning_rate": 1.7482517482517483e-08, "loss": -0.0056, "max_completion_length": 235.0, "max_terminated_completion_length": 235.0, "mean_completion_length": 142.25, "mean_terminated_completion_length": 142.25, "min_completion_length": 101.0, "min_terminated_completion_length": 101.0, "num_tokens": 18358.0, "reward": 1.027963399887085, "reward_std": 0.11163154989480972, "rewards/check_gptzero_func/mean": 0.01785714365541935, "rewards/check_gptzero_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.047691233456134796, "rewards/check_winston_local_func/std": 0.07273447513580322, "rewards/sentence_count_match_reward_logic/mean": 0.9624150395393372, "rewards/sentence_count_match_reward_logic/std": 0.07955337315797806, "step": 1 }, { "clip_ratio": 0.0, "epoch": 0.0006997900629811056, "grad_norm": 0.1564888592895808, "kl": 0.000347137451171875, "learning_rate": 3.4965034965034967e-08, "loss": -0.0056, "step": 2 }, { "clip_ratio": 0.0012734374031424522, "epoch": 0.0010496850944716584, "grad_norm": 0.15318676035505885, "kl": 0.000385284423828125, "learning_rate": 5.244755244755245e-08, "loss": -0.006, "step": 3 }, { "clip_ratio": 0.0009106070501729846, "epoch": 0.0013995801259622112, "grad_norm": 0.1542417818277146, "kl": 0.0004100799560546875, "learning_rate": 6.993006993006993e-08, "loss": -0.0058, "step": 4 }, { "clip_ratio": 0.001787298358976841, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.0017494751574527643, "grad_norm": 0.12665112776384926, "kl": 0.000331878662109375, "learning_rate": 8.741258741258742e-08, "loss": -0.0037, "max_completion_length": 256.0, "max_terminated_completion_length": 199.0, "mean_completion_length": 169.5178680419922, "mean_terminated_completion_length": 155.1041717529297, "min_completion_length": 96.0, "min_terminated_completion_length": 96.0, "num_tokens": 40859.0, "reward": 0.9693944454193115, "reward_std": 0.034251611679792404, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.016006160527467728, "rewards/check_winston_local_func/std": 0.026313001289963722, "rewards/sentence_count_match_reward_logic/mean": 0.9533882737159729, "rewards/sentence_count_match_reward_logic/std": 0.0976247787475586, "step": 5 }, { "clip_ratio": 0.0011673311237245798, "epoch": 0.002099370188943317, "grad_norm": 0.12994867790639283, "kl": 0.0003833770751953125, "learning_rate": 1.048951048951049e-07, "loss": -0.0038, "step": 6 }, { "clip_ratio": 0.001526201842352748, "epoch": 0.00244926522043387, "grad_norm": 0.1271368674349101, "kl": 0.0003509521484375, "learning_rate": 1.223776223776224e-07, "loss": -0.0037, "step": 7 }, { "clip_ratio": 0.001877069240435958, "epoch": 0.0027991602519244225, "grad_norm": 0.12761547493964923, "kl": 0.000362396240234375, "learning_rate": 1.3986013986013987e-07, "loss": -0.0035, "step": 8 }, { "clip_ratio": 0.0005686726653948426, "clipped_completions_ratio": 0.0, "epoch": 0.0031490552834149755, "grad_norm": 0.1421107039127329, "kl": 0.00034332275390625, "learning_rate": 1.5734265734265737e-07, "loss": 0.0011, "max_completion_length": 227.0, "max_terminated_completion_length": 227.0, "mean_completion_length": 133.5178680419922, "mean_terminated_completion_length": 133.5178680419922, "min_completion_length": 76.0, "min_terminated_completion_length": 76.0, "num_tokens": 58272.0, "reward": 1.2088803052902222, "reward_std": 0.1628313809633255, "rewards/check_gptzero_func/mean": 0.25, "rewards/check_gptzero_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.009475446306169033, "rewards/check_winston_local_func/std": 0.0035964413546025753, "rewards/sentence_count_match_reward_logic/mean": 0.949404776096344, "rewards/sentence_count_match_reward_logic/std": 0.09491965174674988, "step": 9 }, { "clip_ratio": 0.0008743013022467494, "epoch": 0.0034989503149055285, "grad_norm": 0.1352714979979412, "kl": 0.0003414154052734375, "learning_rate": 1.7482517482517484e-07, "loss": 0.0009, "step": 10 }, { "clip_ratio": 0.0009331351611763239, "epoch": 0.003848845346396081, "grad_norm": 0.13596328815608683, "kl": 0.0003566741943359375, "learning_rate": 1.9230769230769234e-07, "loss": 0.0009, "step": 11 }, { "clip_ratio": 0.0007563710096292198, "epoch": 0.004198740377886634, "grad_norm": 0.13688318901354837, "kl": 0.000316619873046875, "learning_rate": 2.097902097902098e-07, "loss": 0.0005, "step": 12 }, { "clip_ratio": 0.0020645298063755035, "clipped_completions_ratio": 0.0, "epoch": 0.004548635409377187, "grad_norm": 0.14248551441743038, "kl": 0.0003986358642578125, "learning_rate": 2.2727272727272729e-07, "loss": -0.0092, "max_completion_length": 180.0, "max_terminated_completion_length": 180.0, "mean_completion_length": 142.71429443359375, "mean_terminated_completion_length": 142.71429443359375, "min_completion_length": 104.0, "min_terminated_completion_length": 104.0, "num_tokens": 77176.0, "reward": 1.1301062107086182, "reward_std": 0.1335202008485794, "rewards/check_gptzero_func/mean": 0.1428571492433548, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.03129660710692406, "rewards/check_winston_local_func/std": 0.05566581338644028, "rewards/sentence_count_match_reward_logic/mean": 0.9559523463249207, "rewards/sentence_count_match_reward_logic/std": 0.08117996901273727, "step": 13 }, { "clip_ratio": 0.0015187658136710525, "epoch": 0.00489853044086774, "grad_norm": 0.14719885553805923, "kl": 0.0003871917724609375, "learning_rate": 2.447552447552448e-07, "loss": -0.0093, "step": 14 }, { "clip_ratio": 0.0016739974962547421, "epoch": 0.005248425472358292, "grad_norm": 0.15304324385105791, "kl": 0.0004520416259765625, "learning_rate": 2.622377622377623e-07, "loss": -0.0091, "step": 15 }, { "clip_ratio": 0.0020409999415278435, "epoch": 0.005598320503848845, "grad_norm": 0.13481624075450307, "kl": 0.0004482269287109375, "learning_rate": 2.7972027972027973e-07, "loss": -0.009, "step": 16 }, { "clip_ratio": 0.0007307584746740758, "clipped_completions_ratio": 0.0, "epoch": 0.005948215535339398, "grad_norm": 0.16467771085140873, "kl": 0.0003871917724609375, "learning_rate": 2.9720279720279723e-07, "loss": -0.004, "max_completion_length": 206.0, "max_terminated_completion_length": 206.0, "mean_completion_length": 132.5357208251953, "mean_terminated_completion_length": 132.5357208251953, "min_completion_length": 88.0, "min_terminated_completion_length": 88.0, "num_tokens": 94358.0, "reward": 1.1520633697509766, "reward_std": 0.062435030937194824, "rewards/check_gptzero_func/mean": 0.1607142835855484, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.02587294764816761, "rewards/check_winston_local_func/std": 0.04284830018877983, "rewards/sentence_count_match_reward_logic/mean": 0.9654761552810669, "rewards/sentence_count_match_reward_logic/std": 0.07493142038583755, "step": 17 }, { "clip_ratio": 0.001998351654037833, "epoch": 0.006298110566829951, "grad_norm": 0.1585384085212393, "kl": 0.00037384033203125, "learning_rate": 3.1468531468531473e-07, "loss": -0.0044, "step": 18 }, { "clip_ratio": 0.0021041384898126125, "epoch": 0.006648005598320504, "grad_norm": 0.16400277239151576, "kl": 0.0003814697265625, "learning_rate": 3.321678321678322e-07, "loss": -0.0044, "step": 19 }, { "clip_ratio": 0.0008231443935073912, "epoch": 0.006997900629811057, "grad_norm": 0.1556949652124487, "kl": 0.0003070831298828125, "learning_rate": 3.496503496503497e-07, "loss": -0.0043, "step": 20 }, { "clip_ratio": 0.0017110828775912523, "clipped_completions_ratio": 0.0, "epoch": 0.00734779566130161, "grad_norm": 0.1950688865556283, "kl": 0.000339508056640625, "learning_rate": 3.6713286713286713e-07, "loss": -0.0046, "max_completion_length": 142.0, "max_terminated_completion_length": 142.0, "mean_completion_length": 106.62500762939453, "mean_terminated_completion_length": 106.62500762939453, "min_completion_length": 48.0, "min_terminated_completion_length": 48.0, "num_tokens": 108801.0, "reward": 1.0040022134780884, "reward_std": 0.013341100886464119, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.00846641045063734, "rewards/check_winston_local_func/std": 0.00142973056063056, "rewards/sentence_count_match_reward_logic/mean": 0.9955357313156128, "rewards/sentence_count_match_reward_logic/std": 0.033407654613256454, "step": 21 }, { "clip_ratio": 0.001749910064972937, "epoch": 0.007697690692792162, "grad_norm": 0.19153660893581456, "kl": 0.000408172607421875, "learning_rate": 3.846153846153847e-07, "loss": -0.0038, "step": 22 }, { "clip_ratio": 0.0019295389065518975, "epoch": 0.008047585724282715, "grad_norm": 0.1925759827579246, "kl": 0.0003509521484375, "learning_rate": 4.020979020979021e-07, "loss": -0.0041, "step": 23 }, { "clip_ratio": 0.0012743175029754639, "epoch": 0.008397480755773267, "grad_norm": 0.18271178634522894, "kl": 0.0003490447998046875, "learning_rate": 4.195804195804196e-07, "loss": -0.0038, "step": 24 }, { "clip_ratio": 0.0016077148029580712, "clipped_completions_ratio": 0.0, "epoch": 0.00874737578726382, "grad_norm": 0.17652826092880958, "kl": 0.0003986358642578125, "learning_rate": 4.3706293706293707e-07, "loss": -0.0229, "max_completion_length": 208.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 151.94644165039062, "mean_terminated_completion_length": 151.94644165039062, "min_completion_length": 102.0, "min_terminated_completion_length": 102.0, "num_tokens": 129342.0, "reward": 0.9421312212944031, "reward_std": 0.10177424550056458, "rewards/check_gptzero_func/mean": 0.0357142873108387, "rewards/check_gptzero_func/std": 0.1872563362121582, "rewards/check_winston_local_func/mean": 0.016535982489585876, "rewards/check_winston_local_func/std": 0.026169732213020325, "rewards/sentence_count_match_reward_logic/mean": 0.8898809552192688, "rewards/sentence_count_match_reward_logic/std": 0.11914917826652527, "step": 25 }, { "clip_ratio": 0.0018346880096942186, "epoch": 0.009097270818754374, "grad_norm": 0.17437115026380864, "kl": 0.0004596710205078125, "learning_rate": 4.5454545454545457e-07, "loss": -0.0227, "step": 26 }, { "clip_ratio": 0.001498962170444429, "epoch": 0.009447165850244927, "grad_norm": 0.17186897847550012, "kl": 0.0004329681396484375, "learning_rate": 4.72027972027972e-07, "loss": -0.0222, "step": 27 }, { "clip_ratio": 0.0010252047795802355, "epoch": 0.00979706088173548, "grad_norm": 0.17632164587077087, "kl": 0.0004291534423828125, "learning_rate": 4.895104895104896e-07, "loss": -0.0228, "step": 28 }, { "clip_ratio": 0.0013223332352936268, "clipped_completions_ratio": 0.0, "epoch": 0.010146955913226032, "grad_norm": 0.2012250283627851, "kl": 0.0003108978271484375, "learning_rate": 5.06993006993007e-07, "loss": -0.026, "max_completion_length": 201.0, "max_terminated_completion_length": 201.0, "mean_completion_length": 137.96429443359375, "mean_terminated_completion_length": 137.96429443359375, "min_completion_length": 71.0, "min_terminated_completion_length": 71.0, "num_tokens": 147508.0, "reward": 1.1577715873718262, "reward_std": 0.11889660358428955, "rewards/check_gptzero_func/mean": 0.1964285671710968, "rewards/check_gptzero_func/std": 0.40089187026023865, "rewards/check_winston_local_func/mean": 0.00896194577217102, "rewards/check_winston_local_func/std": 0.0014196429401636124, "rewards/sentence_count_match_reward_logic/mean": 0.9523809552192688, "rewards/sentence_count_match_reward_logic/std": 0.08311660587787628, "step": 29 }, { "clip_ratio": 0.001670246827416122, "epoch": 0.010496850944716585, "grad_norm": 0.1959898367987916, "kl": 0.0003414154052734375, "learning_rate": 5.244755244755246e-07, "loss": -0.0256, "step": 30 }, { "clip_ratio": 0.0006227659177966416, "epoch": 0.010846745976207137, "grad_norm": 0.20198731466578826, "kl": 0.0002956390380859375, "learning_rate": 5.41958041958042e-07, "loss": -0.0252, "step": 31 }, { "clip_ratio": 0.0013944937381893396, "epoch": 0.01119664100769769, "grad_norm": 0.19978010245455707, "kl": 0.00031280517578125, "learning_rate": 5.594405594405595e-07, "loss": -0.0262, "step": 32 }, { "clip_ratio": 0.0008371163858100772, "clipped_completions_ratio": 0.0357142857142857, "epoch": 0.011546536039188244, "grad_norm": 0.13110549063374324, "kl": 0.00030517578125, "learning_rate": 5.76923076923077e-07, "loss": 0.0082, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 189.6428680419922, "mean_terminated_completion_length": 187.1851806640625, "min_completion_length": 109.0, "min_terminated_completion_length": 109.0, "num_tokens": 171216.0, "reward": 1.1065973043441772, "reward_std": 0.13755860924720764, "rewards/check_gptzero_func/mean": 0.125, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.01835324987769127, "rewards/check_winston_local_func/std": 0.03254404664039612, "rewards/sentence_count_match_reward_logic/mean": 0.9632440209388733, "rewards/sentence_count_match_reward_logic/std": 0.07162924855947495, "step": 33 }, { "clip_ratio": 0.001596953021362424, "epoch": 0.011896431070678797, "grad_norm": 0.1295946006228766, "kl": 0.000324249267578125, "learning_rate": 5.944055944055945e-07, "loss": 0.0084, "step": 34 }, { "clip_ratio": 0.0013471191050484776, "epoch": 0.01224632610216935, "grad_norm": 0.12739265251011667, "kl": 0.00037384033203125, "learning_rate": 6.118881118881119e-07, "loss": 0.0085, "step": 35 }, { "clip_ratio": 0.0008085081935860217, "epoch": 0.012596221133659902, "grad_norm": 0.1288028490453211, "kl": 0.0003108978271484375, "learning_rate": 6.293706293706295e-07, "loss": 0.0083, "step": 36 }, { "clip_ratio": 0.0014256552094593644, "clipped_completions_ratio": 0.0, "epoch": 0.012946116165150455, "grad_norm": 0.1475010695202692, "kl": 0.00043487548828125, "learning_rate": 6.468531468531469e-07, "loss": -0.0031, "max_completion_length": 223.0, "max_terminated_completion_length": 223.0, "mean_completion_length": 147.7857208251953, "mean_terminated_completion_length": 147.7857208251953, "min_completion_length": 86.0, "min_terminated_completion_length": 86.0, "num_tokens": 190036.0, "reward": 0.9944818019866943, "reward_std": 0.01688719354569912, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.009787963703274727, "rewards/check_winston_local_func/std": 0.0058782403357326984, "rewards/sentence_count_match_reward_logic/mean": 0.9846938848495483, "rewards/sentence_count_match_reward_logic/std": 0.05224866420030594, "step": 37 }, { "clip_ratio": 0.0010645099682733417, "epoch": 0.013296011196641007, "grad_norm": 0.14480985925537146, "kl": 0.000438690185546875, "learning_rate": 6.643356643356644e-07, "loss": -0.0032, "step": 38 }, { "clip_ratio": 0.0009934036061167717, "epoch": 0.01364590622813156, "grad_norm": 0.13617270165362724, "kl": 0.00046539306640625, "learning_rate": 6.818181818181818e-07, "loss": -0.003, "step": 39 }, { "clip_ratio": 0.0009956662543118, "epoch": 0.013995801259622114, "grad_norm": 0.1457279599661168, "kl": 0.0004138946533203125, "learning_rate": 6.993006993006994e-07, "loss": -0.0031, "step": 40 }, { "clip_ratio": 0.0018181123305112123, "clipped_completions_ratio": 0.0, "epoch": 0.014345696291112667, "grad_norm": 0.1552018783379293, "kl": 0.000507354736328125, "learning_rate": 7.167832167832168e-07, "loss": -0.0036, "max_completion_length": 237.0, "max_terminated_completion_length": 237.0, "mean_completion_length": 153.10714721679688, "mean_terminated_completion_length": 153.10714721679688, "min_completion_length": 101.0, "min_terminated_completion_length": 101.0, "num_tokens": 210274.0, "reward": 1.0516618490219116, "reward_std": 0.14977985620498657, "rewards/check_gptzero_func/mean": 0.0714285746216774, "rewards/check_gptzero_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.022643929347395897, "rewards/check_winston_local_func/std": 0.05506661906838417, "rewards/sentence_count_match_reward_logic/mean": 0.9575892686843872, "rewards/sentence_count_match_reward_logic/std": 0.07389380782842636, "step": 41 }, { "clip_ratio": 0.0023421570658683777, "epoch": 0.01469559132260322, "grad_norm": 0.15942146767084864, "kl": 0.00048065185546875, "learning_rate": 7.342657342657343e-07, "loss": -0.0036, "step": 42 }, { "clip_ratio": 0.0022415423300117254, "epoch": 0.015045486354093772, "grad_norm": 0.15938822551025808, "kl": 0.00045013427734375, "learning_rate": 7.517482517482517e-07, "loss": -0.0035, "step": 43 }, { "clip_ratio": 0.001745353452861309, "epoch": 0.015395381385584325, "grad_norm": 0.15814121020311733, "kl": 0.000499725341796875, "learning_rate": 7.692307692307694e-07, "loss": -0.0043, "step": 44 }, { "clip_ratio": 0.0012391515774652362, "clipped_completions_ratio": 0.0, "epoch": 0.015745276417074877, "grad_norm": 0.13657388993614417, "kl": 0.0004329681396484375, "learning_rate": 7.867132867132868e-07, "loss": 0.0013, "max_completion_length": 203.0, "max_terminated_completion_length": 203.0, "mean_completion_length": 141.1607208251953, "mean_terminated_completion_length": 141.1607208251953, "min_completion_length": 75.0, "min_terminated_completion_length": 75.0, "num_tokens": 228875.0, "reward": 0.9927800893783569, "reward_std": 0.019307691603899002, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.008511303924024105, "rewards/check_winston_local_func/std": 0.000745341763831675, "rewards/sentence_count_match_reward_logic/mean": 0.9842687249183655, "rewards/sentence_count_match_reward_logic/std": 0.04591693356633186, "step": 45 }, { "clip_ratio": 0.0011367915431037545, "epoch": 0.01609517144856543, "grad_norm": 0.13686571611056847, "kl": 0.000354766845703125, "learning_rate": 8.041958041958043e-07, "loss": 0.0009, "step": 46 }, { "clip_ratio": 0.0017213127575814724, "epoch": 0.016445066480055982, "grad_norm": 0.13925123176513934, "kl": 0.0003681182861328125, "learning_rate": 8.216783216783217e-07, "loss": 0.0012, "step": 47 }, { "clip_ratio": 0.00167716050054878, "epoch": 0.016794961511546535, "grad_norm": 0.1366099253631194, "kl": 0.0004138946533203125, "learning_rate": 8.391608391608393e-07, "loss": 0.0005, "step": 48 }, { "clip_ratio": 0.0009064878104254603, "clipped_completions_ratio": 0.0, "epoch": 0.017144856543037088, "grad_norm": 0.14661431178912798, "kl": 0.000408172607421875, "learning_rate": 8.566433566433567e-07, "loss": -0.0063, "max_completion_length": 208.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 156.1607208251953, "mean_terminated_completion_length": 156.1607208251953, "min_completion_length": 93.0, "min_terminated_completion_length": 93.0, "num_tokens": 249228.0, "reward": 1.0332729816436768, "reward_std": 0.16659313440322876, "rewards/check_gptzero_func/mean": 0.0892857164144516, "rewards/check_gptzero_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.01382141001522541, "rewards/check_winston_local_func/std": 0.020801104605197906, "rewards/sentence_count_match_reward_logic/mean": 0.9301658272743225, "rewards/sentence_count_match_reward_logic/std": 0.10285109281539917, "step": 49 }, { "clip_ratio": 0.0008892911137081683, "epoch": 0.01749475157452764, "grad_norm": 0.1473727026478572, "kl": 0.00038909912109375, "learning_rate": 8.741258741258741e-07, "loss": -0.006, "step": 50 }, { "clip_ratio": 0.001018206006847322, "epoch": 0.017844646606018196, "grad_norm": 0.14799310554077133, "kl": 0.000392913818359375, "learning_rate": 8.916083916083917e-07, "loss": -0.0065, "step": 51 }, { "clip_ratio": 0.000713663874194026, "epoch": 0.01819454163750875, "grad_norm": 0.14754416000249407, "kl": 0.000370025634765625, "learning_rate": 9.090909090909091e-07, "loss": -0.0063, "step": 52 }, { "clip_ratio": 0.0007794369594193995, "clipped_completions_ratio": 0.0, "epoch": 0.0185444366689993, "grad_norm": 0.13793324303534046, "kl": 0.0004329681396484375, "learning_rate": 9.265734265734266e-07, "loss": -0.007, "max_completion_length": 246.0, "max_terminated_completion_length": 246.0, "mean_completion_length": 154.2857208251953, "mean_terminated_completion_length": 154.2857208251953, "min_completion_length": 93.0, "min_terminated_completion_length": 93.0, "num_tokens": 269588.0, "reward": 1.1091949939727783, "reward_std": 0.10727981477975845, "rewards/check_gptzero_func/mean": 0.1785714328289032, "rewards/check_gptzero_func/std": 0.3864591121673584, "rewards/check_winston_local_func/mean": 0.015445036813616753, "rewards/check_winston_local_func/std": 0.031776320189237595, "rewards/sentence_count_match_reward_logic/mean": 0.9151785969734192, "rewards/sentence_count_match_reward_logic/std": 0.12376086413860321, "step": 53 }, { "clip_ratio": 0.0010614198399707675, "epoch": 0.018894331700489854, "grad_norm": 0.14026263476278783, "kl": 0.0004749298095703125, "learning_rate": 9.44055944055944e-07, "loss": -0.0072, "step": 54 }, { "clip_ratio": 0.0011637042043730617, "epoch": 0.019244226731980407, "grad_norm": 0.1348709570776347, "kl": 0.000396728515625, "learning_rate": 9.615384615384617e-07, "loss": -0.0072, "step": 55 }, { "clip_ratio": 0.0012680984800681472, "epoch": 0.01959412176347096, "grad_norm": 0.13383551891133807, "kl": 0.000446319580078125, "learning_rate": 9.790209790209791e-07, "loss": -0.0072, "step": 56 }, { "clip_ratio": 0.0013119291979819536, "clipped_completions_ratio": 0.0, "epoch": 0.01994401679496151, "grad_norm": 0.15520819313451964, "kl": 0.00041961669921875, "learning_rate": 9.965034965034966e-07, "loss": -0.0053, "max_completion_length": 191.0, "max_terminated_completion_length": 191.0, "mean_completion_length": 145.32144165039062, "mean_terminated_completion_length": 145.32144165039062, "min_completion_length": 107.0, "min_terminated_completion_length": 107.0, "num_tokens": 288670.0, "reward": 1.295470952987671, "reward_std": 0.08690717071294785, "rewards/check_gptzero_func/mean": 0.2678571343421936, "rewards/check_gptzero_func/std": 0.44685041904449463, "rewards/check_winston_local_func/mean": 0.0556749626994133, "rewards/check_winston_local_func/std": 0.10967353731393814, "rewards/sentence_count_match_reward_logic/mean": 0.9719387888908386, "rewards/sentence_count_match_reward_logic/std": 0.0742039605975151, "step": 57 }, { "clip_ratio": 0.0011204816401004791, "epoch": 0.020293911826452064, "grad_norm": 0.14932778906141583, "kl": 0.0003833770751953125, "learning_rate": 1.013986013986014e-06, "loss": -0.0048, "step": 58 }, { "clip_ratio": 0.0010584326228126884, "epoch": 0.020643806857942617, "grad_norm": 0.15543091761609318, "kl": 0.0003910064697265625, "learning_rate": 1.0314685314685317e-06, "loss": -0.0049, "step": 59 }, { "clip_ratio": 0.0011917722877115011, "epoch": 0.02099370188943317, "grad_norm": 0.1560069974505382, "kl": 0.000446319580078125, "learning_rate": 1.0489510489510491e-06, "loss": -0.0049, "step": 60 }, { "clip_ratio": 0.002090836176648736, "clipped_completions_ratio": 0.0, "epoch": 0.021343596920923722, "grad_norm": 0.14444851974726244, "kl": 0.0003643035888671875, "learning_rate": 1.0664335664335666e-06, "loss": -0.0126, "max_completion_length": 200.0, "max_terminated_completion_length": 200.0, "mean_completion_length": 156.48214721679688, "mean_terminated_completion_length": 156.48214721679688, "min_completion_length": 86.0, "min_terminated_completion_length": 86.0, "num_tokens": 308961.0, "reward": 1.0341435670852661, "reward_std": 0.1190982237458229, "rewards/check_gptzero_func/mean": 0.0357142873108387, "rewards/check_gptzero_func/std": 0.1872563362121582, "rewards/check_winston_local_func/mean": 0.04281703382730484, "rewards/check_winston_local_func/std": 0.07692530751228333, "rewards/sentence_count_match_reward_logic/mean": 0.9556122422218323, "rewards/sentence_count_match_reward_logic/std": 0.08135567605495453, "step": 61 }, { "clip_ratio": 0.0017940533580258489, "epoch": 0.021693491952414275, "grad_norm": 0.14454118388033146, "kl": 0.0004215240478515625, "learning_rate": 1.083916083916084e-06, "loss": -0.0123, "step": 62 }, { "clip_ratio": 0.0018457976402714849, "epoch": 0.022043386983904827, "grad_norm": 0.14096613864156043, "kl": 0.00038909912109375, "learning_rate": 1.1013986013986015e-06, "loss": -0.0125, "step": 63 }, { "clip_ratio": 0.0013796831481158733, "epoch": 0.02239328201539538, "grad_norm": 0.1451248881033893, "kl": 0.00040435791015625, "learning_rate": 1.118881118881119e-06, "loss": -0.0123, "step": 64 }, { "clip_ratio": 0.0016179136000573635, "clipped_completions_ratio": 0.017857142857142905, "epoch": 0.022743177046885936, "grad_norm": 0.15736191055363327, "kl": 0.00041961669921875, "learning_rate": 1.1363636363636364e-06, "loss": -0.0085, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 160.46429443359375, "mean_terminated_completion_length": 158.72726440429688, "min_completion_length": 99.0, "min_terminated_completion_length": 99.0, "num_tokens": 329723.0, "reward": 0.9990012645721436, "reward_std": 0.034558068960905075, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.011402000673115253, "rewards/check_winston_local_func/std": 0.010932626202702522, "rewards/sentence_count_match_reward_logic/mean": 0.98759925365448, "rewards/sentence_count_match_reward_logic/std": 0.04705694690346718, "step": 65 }, { "clip_ratio": 0.0008446626598015428, "epoch": 0.02309307207837649, "grad_norm": 0.16451742765893593, "kl": 0.0004520416259765625, "learning_rate": 1.153846153846154e-06, "loss": -0.0079, "step": 66 }, { "clip_ratio": 0.0019442616030573845, "epoch": 0.02344296710986704, "grad_norm": 0.16315834414565716, "kl": 0.00043487548828125, "learning_rate": 1.1713286713286715e-06, "loss": -0.009, "step": 67 }, { "clip_ratio": 0.0015139945317059755, "epoch": 0.023792862141357594, "grad_norm": 0.1917167864013618, "kl": 0.0004291534423828125, "learning_rate": 1.188811188811189e-06, "loss": -0.0085, "step": 68 }, { "clip_ratio": 0.000865303329192102, "clipped_completions_ratio": 0.0, "epoch": 0.024142757172848146, "grad_norm": 0.16957958600477915, "kl": 0.000507354736328125, "learning_rate": 1.2062937062937064e-06, "loss": -0.0091, "max_completion_length": 196.0, "max_terminated_completion_length": 196.0, "mean_completion_length": 125.10714721679688, "mean_terminated_completion_length": 125.10714721679688, "min_completion_length": 63.0, "min_terminated_completion_length": 63.0, "num_tokens": 346377.0, "reward": 0.9952715635299683, "reward_std": 0.011652219109237194, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.009160464629530907, "rewards/check_winston_local_func/std": 0.002573850564658642, "rewards/sentence_count_match_reward_logic/mean": 0.9861111044883728, "rewards/sentence_count_match_reward_logic/std": 0.04270589351654053, "step": 69 }, { "clip_ratio": 0.001333007705397904, "epoch": 0.0244926522043387, "grad_norm": 0.17196012515870857, "kl": 0.0004711151123046875, "learning_rate": 1.2237762237762238e-06, "loss": -0.0091, "step": 70 }, { "clip_ratio": 0.0014280558098107576, "epoch": 0.02484254723582925, "grad_norm": 0.1768749189686263, "kl": 0.0004558563232421875, "learning_rate": 1.2412587412587413e-06, "loss": -0.0096, "step": 71 }, { "clip_ratio": 0.0021324928384274244, "epoch": 0.025192442267319804, "grad_norm": 0.17546440219483814, "kl": 0.0003910064697265625, "learning_rate": 1.258741258741259e-06, "loss": -0.0095, "step": 72 }, { "clip_ratio": 0.0012409196933731437, "clipped_completions_ratio": 0.0, "epoch": 0.025542337298810357, "grad_norm": 0.13209451297401148, "kl": 0.000408172607421875, "learning_rate": 1.2762237762237764e-06, "loss": -0.0052, "max_completion_length": 240.0, "max_terminated_completion_length": 240.0, "mean_completion_length": 161.44644165039062, "mean_terminated_completion_length": 161.44644165039062, "min_completion_length": 78.0, "min_terminated_completion_length": 78.0, "num_tokens": 366906.0, "reward": 1.0440776348114014, "reward_std": 0.09200991690158844, "rewards/check_gptzero_func/mean": 0.0535714291036129, "rewards/check_gptzero_func/std": 0.22720779478549957, "rewards/check_winston_local_func/mean": 0.008363393135368824, "rewards/check_winston_local_func/std": 0.0012247682316228747, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.05016207695007324, "step": 73 }, { "clip_ratio": 0.0012254769681021571, "epoch": 0.02589223233030091, "grad_norm": 0.1289591769519732, "kl": 0.0004291534423828125, "learning_rate": 1.2937062937062938e-06, "loss": -0.006, "step": 74 }, { "clip_ratio": 0.001650493242777884, "epoch": 0.026242127361791462, "grad_norm": 0.12764543957278343, "kl": 0.000396728515625, "learning_rate": 1.3111888111888113e-06, "loss": -0.0058, "step": 75 }, { "clip_ratio": 0.0019091581925749779, "epoch": 0.026592022393282014, "grad_norm": 0.13066249265719662, "kl": 0.0004215240478515625, "learning_rate": 1.3286713286713287e-06, "loss": -0.0055, "step": 76 }, { "clip_ratio": 0.0029880902729928493, "clipped_completions_ratio": 0.0357142857142857, "epoch": 0.026941917424772567, "grad_norm": 0.2006007942923338, "kl": 0.000484466552734375, "learning_rate": 1.3461538461538462e-06, "loss": -0.0095, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 143.07144165039062, "mean_terminated_completion_length": 138.88888549804688, "min_completion_length": 81.0, "min_terminated_completion_length": 81.0, "num_tokens": 385678.0, "reward": 0.9297885894775391, "reward_std": 0.03944239020347595, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.01322819571942091, "rewards/check_winston_local_func/std": 0.012358245439827442, "rewards/sentence_count_match_reward_logic/mean": 0.9165603518486023, "rewards/sentence_count_match_reward_logic/std": 0.12505148351192474, "step": 77 }, { "clip_ratio": 0.0023102746345102787, "epoch": 0.02729181245626312, "grad_norm": 0.1994682350283797, "kl": 0.0004749298095703125, "learning_rate": 1.3636363636363636e-06, "loss": -0.0098, "step": 78 }, { "clip_ratio": 0.00200393027625978, "epoch": 0.027641707487753672, "grad_norm": 0.1939899335856474, "kl": 0.000537872314453125, "learning_rate": 1.381118881118881e-06, "loss": -0.0096, "step": 79 }, { "clip_ratio": 0.0014901651302352548, "epoch": 0.02799160251924423, "grad_norm": 0.19770811070186314, "kl": 0.000499725341796875, "learning_rate": 1.3986013986013987e-06, "loss": -0.0096, "step": 80 }, { "clip_ratio": 0.001384148607030511, "clipped_completions_ratio": 0.0, "epoch": 0.02834149755073478, "grad_norm": 0.12289707597395486, "kl": 0.0004730224609375, "learning_rate": 1.4160839160839162e-06, "loss": 0.0005, "max_completion_length": 252.0, "max_terminated_completion_length": 252.0, "mean_completion_length": 159.75, "mean_terminated_completion_length": 159.75, "min_completion_length": 118.0, "min_terminated_completion_length": 118.0, "num_tokens": 406248.0, "reward": 1.012542486190796, "reward_std": 0.004423472099006176, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.012542463839054108, "rewards/check_winston_local_func/std": 0.010056114755570889, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 81 }, { "clip_ratio": 0.0009347718441858888, "epoch": 0.028691392582225334, "grad_norm": 0.12031267684494637, "kl": 0.00048065185546875, "learning_rate": 1.4335664335664336e-06, "loss": 0.0009, "step": 82 }, { "clip_ratio": 0.0006393774528987706, "epoch": 0.029041287613715886, "grad_norm": 0.12178332592891467, "kl": 0.0004749298095703125, "learning_rate": 1.451048951048951e-06, "loss": 0.0009, "step": 83 }, { "clip_ratio": 0.00040197392809204757, "epoch": 0.02939118264520644, "grad_norm": 0.11957628862461026, "kl": 0.000530242919921875, "learning_rate": 1.4685314685314685e-06, "loss": 0.0008, "step": 84 }, { "clip_ratio": 0.0008054598001763225, "clipped_completions_ratio": 0.0, "epoch": 0.02974107767669699, "grad_norm": 0.19449911488722912, "kl": 0.000701904296875, "learning_rate": 1.486013986013986e-06, "loss": -0.0061, "max_completion_length": 208.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 137.0, "mean_terminated_completion_length": 137.0, "min_completion_length": 74.0, "min_terminated_completion_length": 74.0, "num_tokens": 424200.0, "reward": 0.9855987429618835, "reward_std": 0.029486961662769318, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.02046266198158264, "rewards/check_winston_local_func/std": 0.033273376524448395, "rewards/sentence_count_match_reward_logic/mean": 0.9651360511779785, "rewards/sentence_count_match_reward_logic/std": 0.07755036652088165, "step": 85 }, { "clip_ratio": 0.0008178059360943735, "epoch": 0.030090972708187544, "grad_norm": 0.19165401950873384, "kl": 0.000701904296875, "learning_rate": 1.5034965034965034e-06, "loss": -0.006, "step": 86 }, { "clip_ratio": 0.0013006406370550394, "epoch": 0.030440867739678096, "grad_norm": 0.19734902592199582, "kl": 0.000843048095703125, "learning_rate": 1.5209790209790213e-06, "loss": -0.0067, "step": 87 }, { "clip_ratio": 0.001238521421328187, "epoch": 0.03079076277116865, "grad_norm": 0.1865231075546581, "kl": 0.00096893310546875, "learning_rate": 1.5384615384615387e-06, "loss": -0.0062, "step": 88 }, { "clip_ratio": 0.0011896340874955058, "clipped_completions_ratio": 0.0, "epoch": 0.0311406578026592, "grad_norm": 0.21193437468198667, "kl": 0.0009765625, "learning_rate": 1.5559440559440562e-06, "loss": -0.0031, "max_completion_length": 184.0, "max_terminated_completion_length": 184.0, "mean_completion_length": 110.14286041259766, "mean_terminated_completion_length": 110.14286041259766, "min_completion_length": 62.0, "min_terminated_completion_length": 62.0, "num_tokens": 438928.0, "reward": 1.201305627822876, "reward_std": 0.07843895256519318, "rewards/check_gptzero_func/mean": 0.125, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.10011505335569382, "rewards/check_winston_local_func/std": 0.16671770811080933, "rewards/sentence_count_match_reward_logic/mean": 0.976190447807312, "rewards/sentence_count_match_reward_logic/std": 0.058848995715379715, "step": 89 }, { "clip_ratio": 0.002002167981117964, "epoch": 0.031490552834149754, "grad_norm": 0.20959233753839696, "kl": 0.001129150390625, "learning_rate": 1.5734265734265736e-06, "loss": -0.0026, "step": 90 }, { "clip_ratio": 0.0012342904228717089, "epoch": 0.03184044786564031, "grad_norm": 0.20398049330728743, "kl": 0.00116729736328125, "learning_rate": 1.590909090909091e-06, "loss": -0.0032, "step": 91 }, { "clip_ratio": 0.0010104280663654208, "epoch": 0.03219034289713086, "grad_norm": 0.21338029464883168, "kl": 0.00121307373046875, "learning_rate": 1.6083916083916085e-06, "loss": -0.0036, "step": 92 }, { "clip_ratio": 0.001493118004873395, "clipped_completions_ratio": 0.0, "epoch": 0.032540237928621416, "grad_norm": 0.13091019142389937, "kl": 0.0008392333984375, "learning_rate": 1.625874125874126e-06, "loss": 0.0017, "max_completion_length": 209.0, "max_terminated_completion_length": 209.0, "mean_completion_length": 153.23214721679688, "mean_terminated_completion_length": 153.23214721679688, "min_completion_length": 60.0, "min_terminated_completion_length": 60.0, "num_tokens": 458485.0, "reward": 1.0304702520370483, "reward_std": 0.023107368499040604, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.03493448346853256, "rewards/check_winston_local_func/std": 0.07684969902038574, "rewards/sentence_count_match_reward_logic/mean": 0.9955357313156128, "rewards/sentence_count_match_reward_logic/std": 0.023407040163874626, "step": 93 }, { "clip_ratio": 0.0012789476895704865, "epoch": 0.032890132960111965, "grad_norm": 0.13545373839581665, "kl": 0.000858306884765625, "learning_rate": 1.6433566433566434e-06, "loss": 0.0019, "step": 94 }, { "clip_ratio": 0.0012665691319853067, "epoch": 0.03324002799160252, "grad_norm": 0.1327720548177077, "kl": 0.000904083251953125, "learning_rate": 1.660839160839161e-06, "loss": 0.0021, "step": 95 }, { "clip_ratio": 0.001798853394575417, "epoch": 0.03358992302309307, "grad_norm": 0.12935260564042378, "kl": 0.000911712646484375, "learning_rate": 1.6783216783216785e-06, "loss": 0.0019, "step": 96 }, { "clip_ratio": 0.0014206410851329565, "clipped_completions_ratio": 0.0, "epoch": 0.033939818054583626, "grad_norm": 0.16566251368369006, "kl": 0.00139617919921875, "learning_rate": 1.695804195804196e-06, "loss": -0.0048, "max_completion_length": 242.0, "max_terminated_completion_length": 242.0, "mean_completion_length": 152.0, "mean_terminated_completion_length": 152.0, "min_completion_length": 101.0, "min_terminated_completion_length": 101.0, "num_tokens": 478413.0, "reward": 1.0353927612304688, "reward_std": 0.10026410222053528, "rewards/check_gptzero_func/mean": 0.0357142873108387, "rewards/check_gptzero_func/std": 0.1872563362121582, "rewards/check_winston_local_func/mean": 0.021702269092202187, "rewards/check_winston_local_func/std": 0.042708683758974075, "rewards/sentence_count_match_reward_logic/mean": 0.9779762029647827, "rewards/sentence_count_match_reward_logic/std": 0.05904177203774452, "step": 97 }, { "clip_ratio": 0.0013817674480378628, "epoch": 0.034289713086074175, "grad_norm": 0.1619595835852496, "kl": 0.00141143798828125, "learning_rate": 1.7132867132867134e-06, "loss": -0.0053, "step": 98 }, { "clip_ratio": 0.000995420035906136, "epoch": 0.03463960811756473, "grad_norm": 0.1670414980013785, "kl": 0.00140380859375, "learning_rate": 1.7307692307692308e-06, "loss": -0.0046, "step": 99 }, { "clip_ratio": 0.0016007774975150824, "epoch": 0.03498950314905528, "grad_norm": 0.1662570772246414, "kl": 0.00150299072265625, "learning_rate": 1.7482517482517483e-06, "loss": -0.0048, "step": 100 }, { "clip_ratio": 0.002137560397386551, "clipped_completions_ratio": 0.0, "epoch": 0.035339398180545836, "grad_norm": 0.1951410648145512, "kl": 0.0012359619140625, "learning_rate": 1.7657342657342657e-06, "loss": -0.0087, "max_completion_length": 237.0, "max_terminated_completion_length": 237.0, "mean_completion_length": 143.83929443359375, "mean_terminated_completion_length": 143.83929443359375, "min_completion_length": 67.0, "min_terminated_completion_length": 67.0, "num_tokens": 497268.0, "reward": 1.2954431772232056, "reward_std": 0.2519323229789734, "rewards/check_gptzero_func/mean": 0.2321428507566452, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.0963360071182251, "rewards/check_winston_local_func/std": 0.12231005728244781, "rewards/sentence_count_match_reward_logic/mean": 0.9669643044471741, "rewards/sentence_count_match_reward_logic/std": 0.07263030856847763, "step": 101 }, { "clip_ratio": 0.0012427462497726083, "epoch": 0.03568929321203639, "grad_norm": 0.19774542961436462, "kl": 0.0013275146484375, "learning_rate": 1.7832167832167834e-06, "loss": -0.0078, "step": 102 }, { "clip_ratio": 0.0014089412288740277, "epoch": 0.03603918824352694, "grad_norm": 0.20202606763451122, "kl": 0.00133514404296875, "learning_rate": 1.8006993006993008e-06, "loss": -0.0085, "step": 103 }, { "clip_ratio": 0.0013234001817181706, "epoch": 0.0363890832750175, "grad_norm": 0.20085935422351076, "kl": 0.0014801025390625, "learning_rate": 1.8181818181818183e-06, "loss": -0.0085, "step": 104 }, { "clip_ratio": 0.0005831809248775244, "clipped_completions_ratio": 0.125, "epoch": 0.03673897830650805, "grad_norm": 0.14200238443667912, "kl": 0.0012664794921875, "learning_rate": 1.8356643356643357e-06, "loss": -0.0003, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 148.08929443359375, "mean_terminated_completion_length": 132.6734619140625, "min_completion_length": 84.0, "min_terminated_completion_length": 84.0, "num_tokens": 516617.0, "reward": 1.042804479598999, "reward_std": 0.09937547147274017, "rewards/check_gptzero_func/mean": 0.0892857164144516, "rewards/check_gptzero_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.042655568569898605, "rewards/check_winston_local_func/std": 0.10691433399915695, "rewards/sentence_count_match_reward_logic/mean": 0.9108630418777466, "rewards/sentence_count_match_reward_logic/std": 0.123735211789608, "step": 105 }, { "clip_ratio": 0.0018107175128534436, "epoch": 0.0370888733379986, "grad_norm": 0.15449171948351473, "kl": 0.00150299072265625, "learning_rate": 1.8531468531468532e-06, "loss": -0.0003, "step": 106 }, { "clip_ratio": 0.0017213155515491962, "epoch": 0.03743876836948915, "grad_norm": 0.1921157191915207, "kl": 0.001617431640625, "learning_rate": 1.8706293706293706e-06, "loss": -0.0004, "step": 107 }, { "clip_ratio": 0.00182064448017627, "epoch": 0.03778866340097971, "grad_norm": 0.14987936909390065, "kl": 0.001708984375, "learning_rate": 1.888111888111888e-06, "loss": -0.0008, "step": 108 }, { "clip_ratio": 0.0017104912549257278, "clipped_completions_ratio": 0.0, "epoch": 0.03813855843247026, "grad_norm": 0.1701656188591399, "kl": 0.0022125244140625, "learning_rate": 1.9055944055944055e-06, "loss": -0.0081, "max_completion_length": 251.0, "max_terminated_completion_length": 251.0, "mean_completion_length": 151.08929443359375, "mean_terminated_completion_length": 151.08929443359375, "min_completion_length": 84.0, "min_terminated_completion_length": 84.0, "num_tokens": 536422.0, "reward": 1.053730845451355, "reward_std": 0.05409376323223114, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.07357203960418701, "rewards/check_winston_local_func/std": 0.13068802654743195, "rewards/sentence_count_match_reward_logic/mean": 0.9801587462425232, "rewards/sentence_count_match_reward_logic/std": 0.05236126855015755, "step": 109 }, { "clip_ratio": 0.0015924223698675632, "epoch": 0.03848845346396081, "grad_norm": 0.17406678038081427, "kl": 0.00244140625, "learning_rate": 1.9230769230769234e-06, "loss": -0.0084, "step": 110 }, { "clip_ratio": 0.0024672977160662413, "epoch": 0.03883834849545136, "grad_norm": 0.16090393967041516, "kl": 0.002532958984375, "learning_rate": 1.9405594405594406e-06, "loss": -0.0088, "step": 111 }, { "clip_ratio": 0.0021542049944400787, "epoch": 0.03918824352694192, "grad_norm": 0.15963110184774976, "kl": 0.0027923583984375, "learning_rate": 1.9580419580419583e-06, "loss": -0.008, "step": 112 }, { "clip_ratio": 0.0018701717490330338, "clipped_completions_ratio": 0.0, "epoch": 0.03953813855843247, "grad_norm": 0.1872156469756879, "kl": 0.0033721923828125, "learning_rate": 1.9755244755244755e-06, "loss": -0.002, "max_completion_length": 223.0, "max_terminated_completion_length": 223.0, "mean_completion_length": 138.7678680419922, "mean_terminated_completion_length": 138.7678680419922, "min_completion_length": 81.0, "min_terminated_completion_length": 81.0, "num_tokens": 554353.0, "reward": 1.0404943227767944, "reward_std": 0.06427661329507828, "rewards/check_gptzero_func/mean": 0.01785714365541935, "rewards/check_gptzero_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.022637125104665756, "rewards/check_winston_local_func/std": 0.041727058589458466, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 113 }, { "clip_ratio": 0.0017876612255349755, "epoch": 0.03988803358992302, "grad_norm": 0.18641190381725894, "kl": 0.0036773681640625, "learning_rate": 1.993006993006993e-06, "loss": -0.0009, "step": 114 }, { "clip_ratio": 0.0017201625742018223, "epoch": 0.04023792862141357, "grad_norm": 0.1872797301241185, "kl": 0.003997802734375, "learning_rate": 2.0104895104895104e-06, "loss": -0.0015, "step": 115 }, { "clip_ratio": 0.001393677550368011, "epoch": 0.04058782365290413, "grad_norm": 0.19233325906724053, "kl": 0.004119873046875, "learning_rate": 2.027972027972028e-06, "loss": -0.0017, "step": 116 }, { "clip_ratio": 0.0006513464031741023, "clipped_completions_ratio": 0.017857142857142905, "epoch": 0.040937718684394685, "grad_norm": 0.14873329880455033, "kl": 0.0030670166015625, "learning_rate": 2.0454545454545457e-06, "loss": -0.0057, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 172.23214721679688, "mean_terminated_completion_length": 170.70909118652344, "min_completion_length": 113.0, "min_terminated_completion_length": 113.0, "num_tokens": 576326.0, "reward": 1.1380540132522583, "reward_std": 0.09132251143455505, "rewards/check_gptzero_func/mean": 0.125, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.03448256850242615, "rewards/check_winston_local_func/std": 0.06236815080046654, "rewards/sentence_count_match_reward_logic/mean": 0.9785714149475098, "rewards/sentence_count_match_reward_logic/std": 0.05115313082933426, "step": 117 }, { "clip_ratio": 0.0012866177130490541, "epoch": 0.041287613715885234, "grad_norm": 0.1479706654622498, "kl": 0.003173828125, "learning_rate": 2.0629370629370634e-06, "loss": -0.0058, "step": 118 }, { "clip_ratio": 0.0011989118065685034, "epoch": 0.04163750874737579, "grad_norm": 0.13490060016673566, "kl": 0.00335693359375, "learning_rate": 2.0804195804195806e-06, "loss": -0.0059, "step": 119 }, { "clip_ratio": 0.0008727996610105038, "epoch": 0.04198740377886634, "grad_norm": 0.1466154176151304, "kl": 0.00335693359375, "learning_rate": 2.0979020979020983e-06, "loss": -0.0058, "step": 120 }, { "clip_ratio": 0.001505840104073286, "clipped_completions_ratio": 0.0, "epoch": 0.042337298810356895, "grad_norm": 0.23131658800058774, "kl": 0.0047607421875, "learning_rate": 2.1153846153846155e-06, "loss": 0.0377, "max_completion_length": 240.0, "max_terminated_completion_length": 240.0, "mean_completion_length": 150.7857208251953, "mean_terminated_completion_length": 150.7857208251953, "min_completion_length": 86.0, "min_terminated_completion_length": 86.0, "num_tokens": 596298.0, "reward": 0.9774840474128723, "reward_std": 0.04700876772403717, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.010647322051227093, "rewards/check_winston_local_func/std": 0.010278111323714256, "rewards/sentence_count_match_reward_logic/mean": 0.9668367505073547, "rewards/sentence_count_match_reward_logic/std": 0.07976963371038437, "step": 121 }, { "clip_ratio": 0.0017856280319392681, "epoch": 0.042687193841847444, "grad_norm": 0.25333113083982833, "kl": 0.005340576171875, "learning_rate": 2.132867132867133e-06, "loss": 0.0376, "step": 122 }, { "clip_ratio": 0.002033222233876586, "epoch": 0.043037088873338, "grad_norm": 0.23113915531274454, "kl": 0.005523681640625, "learning_rate": 2.1503496503496504e-06, "loss": 0.0374, "step": 123 }, { "clip_ratio": 0.002554909558966756, "epoch": 0.04338698390482855, "grad_norm": 0.22073568264265492, "kl": 0.00592041015625, "learning_rate": 2.167832167832168e-06, "loss": 0.0374, "step": 124 }, { "clip_ratio": 0.0011007542489096522, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.043736878936319105, "grad_norm": 0.18438298425921806, "kl": 0.004150390625, "learning_rate": 2.1853146853146857e-06, "loss": 0.0009, "max_completion_length": 256.0, "max_terminated_completion_length": 164.0, "mean_completion_length": 151.7857208251953, "mean_terminated_completion_length": 134.4166717529297, "min_completion_length": 110.0, "min_terminated_completion_length": 110.0, "num_tokens": 615894.0, "reward": 1.0290331840515137, "reward_std": 0.03192763403058052, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.02903308905661106, "rewards/check_winston_local_func/std": 0.07828743755817413, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 125 }, { "clip_ratio": 0.000340025348123163, "epoch": 0.044086773967809655, "grad_norm": 0.1792841710753599, "kl": 0.00421142578125, "learning_rate": 2.202797202797203e-06, "loss": 0.0004, "step": 126 }, { "clip_ratio": 0.001085555530153215, "epoch": 0.04443666899930021, "grad_norm": 0.1696007648259844, "kl": 0.00445556640625, "learning_rate": 2.2202797202797206e-06, "loss": 0.0007, "step": 127 }, { "clip_ratio": 0.001833630260080099, "epoch": 0.04478656403079076, "grad_norm": 0.16624572511133423, "kl": 0.00494384765625, "learning_rate": 2.237762237762238e-06, "loss": 0.0004, "step": 128 }, { "clip_ratio": 0.0019068751716986299, "clipped_completions_ratio": 0.0, "epoch": 0.045136459062281316, "grad_norm": 0.1825622602087049, "kl": 0.00927734375, "learning_rate": 2.2552447552447555e-06, "loss": 0.0001, "max_completion_length": 194.0, "max_terminated_completion_length": 194.0, "mean_completion_length": 128.33929443359375, "mean_terminated_completion_length": 128.33929443359375, "min_completion_length": 83.0, "min_terminated_completion_length": 83.0, "num_tokens": 632961.0, "reward": 0.9933937788009644, "reward_std": 0.010045934468507767, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.008699839934706688, "rewards/check_winston_local_func/std": 0.0015708355931565166, "rewards/sentence_count_match_reward_logic/mean": 0.9846938848495483, "rewards/sentence_count_match_reward_logic/std": 0.044584840536117554, "step": 129 }, { "clip_ratio": 0.002688675420358777, "epoch": 0.04548635409377187, "grad_norm": 0.17025900711809705, "kl": 0.010009765625, "learning_rate": 2.2727272727272728e-06, "loss": 0.0001, "step": 130 }, { "clip_ratio": 0.0023143934085965157, "epoch": 0.04583624912526242, "grad_norm": 0.1608599078051215, "kl": 0.01092529296875, "learning_rate": 2.2902097902097904e-06, "loss": 0.0001, "step": 131 }, { "clip_ratio": 0.0035102537367492914, "epoch": 0.04618614415675298, "grad_norm": 0.14850185172851899, "kl": 0.0123291015625, "learning_rate": 2.307692307692308e-06, "loss": -0.0004, "step": 132 }, { "clip_ratio": 0.001118341227993369, "clipped_completions_ratio": 0.0, "epoch": 0.046536039188243526, "grad_norm": 0.1666526905173065, "kl": 0.007415771484375, "learning_rate": 2.3251748251748253e-06, "loss": -0.0054, "max_completion_length": 210.0, "max_terminated_completion_length": 210.0, "mean_completion_length": 135.44644165039062, "mean_terminated_completion_length": 135.44644165039062, "min_completion_length": 73.0, "min_terminated_completion_length": 73.0, "num_tokens": 650666.0, "reward": 0.9724209904670715, "reward_std": 0.03361964225769043, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.023016231134533882, "rewards/check_winston_local_func/std": 0.035749610513448715, "rewards/sentence_count_match_reward_logic/mean": 0.9494047164916992, "rewards/sentence_count_match_reward_logic/std": 0.10009916126728058, "step": 133 }, { "clip_ratio": 0.0017483285628259182, "epoch": 0.04688593421973408, "grad_norm": 0.1660300805481332, "kl": 0.00775146484375, "learning_rate": 2.342657342657343e-06, "loss": -0.0054, "step": 134 }, { "clip_ratio": 0.0016922526992857456, "epoch": 0.04723582925122463, "grad_norm": 0.17911611444361927, "kl": 0.0084228515625, "learning_rate": 2.36013986013986e-06, "loss": -0.0054, "step": 135 }, { "clip_ratio": 0.0029993553180247545, "epoch": 0.04758572428271519, "grad_norm": 0.16468811986681736, "kl": 0.0093994140625, "learning_rate": 2.377622377622378e-06, "loss": -0.0055, "step": 136 }, { "clip_ratio": 0.0017818690976127982, "clipped_completions_ratio": 0.0, "epoch": 0.04793561931420574, "grad_norm": 0.36465557482822447, "kl": 0.0162353515625, "learning_rate": 2.395104895104895e-06, "loss": -0.0041, "max_completion_length": 238.0, "max_terminated_completion_length": 238.0, "mean_completion_length": 136.35714721679688, "mean_terminated_completion_length": 136.35714721679688, "min_completion_length": 60.0, "min_terminated_completion_length": 60.0, "num_tokens": 668430.0, "reward": 0.9923931956291199, "reward_std": 0.036174505949020386, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.042068514972925186, "rewards/check_winston_local_func/std": 0.06980591267347336, "rewards/sentence_count_match_reward_logic/mean": 0.9503246545791626, "rewards/sentence_count_match_reward_logic/std": 0.08510326594114304, "step": 137 }, { "clip_ratio": 0.001808297005482018, "epoch": 0.04828551434569629, "grad_norm": 0.31164069667303157, "kl": 0.0184326171875, "learning_rate": 2.4125874125874128e-06, "loss": -0.0044, "step": 138 }, { "clip_ratio": 0.0023588661570101976, "epoch": 0.04863540937718684, "grad_norm": 0.31666269413381937, "kl": 0.0194091796875, "learning_rate": 2.43006993006993e-06, "loss": -0.0046, "step": 139 }, { "clip_ratio": 0.003702103393152356, "epoch": 0.0489853044086774, "grad_norm": 0.36008664823296355, "kl": 0.02685546875, "learning_rate": 2.4475524475524477e-06, "loss": -0.0047, "step": 140 }, { "clip_ratio": 0.002438049763441086, "clipped_completions_ratio": 0.0, "epoch": 0.04933519944016795, "grad_norm": 0.20513824747406093, "kl": 0.01324462890625, "learning_rate": 2.4650349650349653e-06, "loss": -0.0011, "max_completion_length": 213.0, "max_terminated_completion_length": 213.0, "mean_completion_length": 140.42857360839844, "mean_terminated_completion_length": 140.42857360839844, "min_completion_length": 90.0, "min_terminated_completion_length": 90.0, "num_tokens": 686550.0, "reward": 1.1707265377044678, "reward_std": 0.05482158809900284, "rewards/check_gptzero_func/mean": 0.1607142835855484, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.017154963687062263, "rewards/check_winston_local_func/std": 0.02950689010322094, "rewards/sentence_count_match_reward_logic/mean": 0.9928570985794067, "rewards/sentence_count_match_reward_logic/std": 0.03745126724243164, "step": 141 }, { "clip_ratio": 0.0018358438974246383, "epoch": 0.0496850944716585, "grad_norm": 0.2074206717590051, "kl": 0.0135498046875, "learning_rate": 2.4825174825174825e-06, "loss": -0.0011, "step": 142 }, { "clip_ratio": 0.001661424059420824, "epoch": 0.05003498950314905, "grad_norm": 0.1926814080770855, "kl": 0.01470947265625, "learning_rate": 2.5e-06, "loss": -0.0014, "step": 143 }, { "clip_ratio": 0.0019549315329641104, "epoch": 0.05038488453463961, "grad_norm": 0.1941311496386271, "kl": 0.01458740234375, "learning_rate": 2.517482517482518e-06, "loss": -0.0015, "step": 144 }, { "clip_ratio": 0.0013328128261491656, "clipped_completions_ratio": 0.0, "epoch": 0.050734779566130164, "grad_norm": 0.16317887082265697, "kl": 0.0164794921875, "learning_rate": 2.534965034965035e-06, "loss": -0.0034, "max_completion_length": 221.0, "max_terminated_completion_length": 221.0, "mean_completion_length": 152.44644165039062, "mean_terminated_completion_length": 152.44644165039062, "min_completion_length": 91.0, "min_terminated_completion_length": 91.0, "num_tokens": 706391.0, "reward": 0.9963178634643555, "reward_std": 0.03887765109539032, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.017746374011039734, "rewards/check_winston_local_func/std": 0.038854051381349564, "rewards/sentence_count_match_reward_logic/mean": 0.9785714149475098, "rewards/sentence_count_match_reward_logic/std": 0.06509234011173248, "step": 145 }, { "clip_ratio": 0.001611836371012032, "epoch": 0.05108467459762071, "grad_norm": 0.15696373496095797, "kl": 0.0169677734375, "learning_rate": 2.5524475524475528e-06, "loss": -0.0039, "step": 146 }, { "clip_ratio": 0.0015462765004485846, "epoch": 0.05143456962911127, "grad_norm": 0.15974469047450496, "kl": 0.0185546875, "learning_rate": 2.56993006993007e-06, "loss": -0.004, "step": 147 }, { "clip_ratio": 0.0027786686550825834, "epoch": 0.05178446466060182, "grad_norm": 0.16138599463086192, "kl": 0.0198974609375, "learning_rate": 2.5874125874125877e-06, "loss": -0.0039, "step": 148 }, { "clip_ratio": 0.0015535862185060978, "clipped_completions_ratio": 0.0, "epoch": 0.052134359692092375, "grad_norm": 0.17160314987060338, "kl": 0.0167236328125, "learning_rate": 2.604895104895105e-06, "loss": -0.0038, "max_completion_length": 223.0, "max_terminated_completion_length": 223.0, "mean_completion_length": 167.2857208251953, "mean_terminated_completion_length": 167.2857208251953, "min_completion_length": 104.0, "min_terminated_completion_length": 104.0, "num_tokens": 727575.0, "reward": 1.089809775352478, "reward_std": 0.20597539842128754, "rewards/check_gptzero_func/mean": 0.125, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.01173432171344757, "rewards/check_winston_local_func/std": 0.010153586976230145, "rewards/sentence_count_match_reward_logic/mean": 0.9530754089355469, "rewards/sentence_count_match_reward_logic/std": 0.08403855562210083, "step": 149 }, { "clip_ratio": 0.0011237012222409248, "epoch": 0.052484254723582924, "grad_norm": 0.16793245696374426, "kl": 0.017822265625, "learning_rate": 2.6223776223776225e-06, "loss": -0.0034, "step": 150 }, { "clip_ratio": 0.0011000182712450624, "epoch": 0.05283414975507348, "grad_norm": 0.14966855036106963, "kl": 0.019287109375, "learning_rate": 2.63986013986014e-06, "loss": -0.0039, "step": 151 }, { "clip_ratio": 0.0022163072135299444, "epoch": 0.05318404478656403, "grad_norm": 0.1507441495401124, "kl": 0.0205078125, "learning_rate": 2.6573426573426574e-06, "loss": -0.0038, "step": 152 }, { "clip_ratio": 0.0022296381648629904, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.053533939818054585, "grad_norm": 0.23808985645425618, "kl": 0.0400390625, "learning_rate": 2.674825174825175e-06, "loss": -0.0157, "max_completion_length": 256.0, "max_terminated_completion_length": 192.0, "mean_completion_length": 151.73214721679688, "mean_terminated_completion_length": 134.3541717529297, "min_completion_length": 67.0, "min_terminated_completion_length": 67.0, "num_tokens": 747144.0, "reward": 1.102138638496399, "reward_std": 0.0885152667760849, "rewards/check_gptzero_func/mean": 0.1071428582072258, "rewards/check_gptzero_func/std": 0.3120938837528229, "rewards/check_winston_local_func/mean": 0.008261035196483135, "rewards/check_winston_local_func/std": 0.0015803300775587559, "rewards/sentence_count_match_reward_logic/mean": 0.9867346882820129, "rewards/sentence_count_match_reward_logic/std": 0.04872070997953415, "step": 153 }, { "clip_ratio": 0.0023121244739741087, "epoch": 0.053883834849545134, "grad_norm": 0.22332155474708407, "kl": 0.044921875, "learning_rate": 2.6923076923076923e-06, "loss": -0.0158, "step": 154 }, { "clip_ratio": 0.004165151622146368, "epoch": 0.05423372988103569, "grad_norm": 0.18245475997448057, "kl": 0.05517578125, "learning_rate": 2.70979020979021e-06, "loss": -0.0162, "step": 155 }, { "clip_ratio": 0.005509084090590477, "epoch": 0.05458362491252624, "grad_norm": 0.164822924908704, "kl": 0.0693359375, "learning_rate": 2.7272727272727272e-06, "loss": -0.0164, "step": 156 }, { "clip_ratio": 0.0017200286965817213, "clipped_completions_ratio": 0.0, "epoch": 0.054933519944016795, "grad_norm": 0.1759221206076537, "kl": 0.0224609375, "learning_rate": 2.744755244755245e-06, "loss": -0.0029, "max_completion_length": 190.0, "max_terminated_completion_length": 190.0, "mean_completion_length": 123.67857360839844, "mean_terminated_completion_length": 123.67857360839844, "min_completion_length": 67.0, "min_terminated_completion_length": 67.0, "num_tokens": 763150.0, "reward": 1.0241400003433228, "reward_std": 0.01988224685192108, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.024139944463968277, "rewards/check_winston_local_func/std": 0.030643047764897346, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 157 }, { "clip_ratio": 0.0009271298185922205, "epoch": 0.055283414975507345, "grad_norm": 0.17642752454485244, "kl": 0.0250244140625, "learning_rate": 2.762237762237762e-06, "loss": -0.0029, "step": 158 }, { "clip_ratio": 0.0018082860624417663, "epoch": 0.0556333100069979, "grad_norm": 0.1760337426631105, "kl": 0.0272216796875, "learning_rate": 2.7797202797202798e-06, "loss": -0.0029, "step": 159 }, { "clip_ratio": 0.0033512546215206385, "epoch": 0.05598320503848846, "grad_norm": 0.1612919912358891, "kl": 0.0296630859375, "learning_rate": 2.7972027972027974e-06, "loss": -0.0032, "step": 160 }, { "clip_ratio": 0.0025987790431827307, "clipped_completions_ratio": 0.0, "epoch": 0.056333100069979006, "grad_norm": 0.2577731921299081, "kl": 0.061767578125, "learning_rate": 2.8146853146853147e-06, "loss": -0.003, "max_completion_length": 232.0, "max_terminated_completion_length": 232.0, "mean_completion_length": 133.75, "mean_terminated_completion_length": 133.75, "min_completion_length": 56.0, "min_terminated_completion_length": 56.0, "num_tokens": 780432.0, "reward": 1.3873926401138306, "reward_std": 0.17156186699867249, "rewards/check_gptzero_func/mean": 0.3392857015132904, "rewards/check_gptzero_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.055440984666347504, "rewards/check_winston_local_func/std": 0.09057348221540451, "rewards/sentence_count_match_reward_logic/mean": 0.9926658272743225, "rewards/sentence_count_match_reward_logic/std": 0.0311678946018219, "step": 161 }, { "clip_ratio": 0.0033673590514808893, "epoch": 0.05668299510146956, "grad_norm": 0.19927766365843824, "kl": 0.08154296875, "learning_rate": 2.8321678321678323e-06, "loss": -0.0033, "step": 162 }, { "clip_ratio": 0.0029986007139086723, "epoch": 0.05703289013296011, "grad_norm": 0.2078473087236864, "kl": 0.0888671875, "learning_rate": 2.8496503496503496e-06, "loss": -0.0029, "step": 163 }, { "clip_ratio": 0.004378261044621468, "epoch": 0.05738278516445067, "grad_norm": 0.20351301389669604, "kl": 0.08740234375, "learning_rate": 2.8671328671328672e-06, "loss": -0.0033, "step": 164 }, { "clip_ratio": 0.0007962480303831398, "clipped_completions_ratio": 0.0, "epoch": 0.057732680195941216, "grad_norm": 0.2309117016656268, "kl": 0.034912109375, "learning_rate": 2.8846153846153845e-06, "loss": 0.0049, "max_completion_length": 179.0, "max_terminated_completion_length": 179.0, "mean_completion_length": 127.73214721679688, "mean_terminated_completion_length": 127.73214721679688, "min_completion_length": 75.0, "min_terminated_completion_length": 75.0, "num_tokens": 797201.0, "reward": 1.0340445041656494, "reward_std": 0.05831634998321533, "rewards/check_gptzero_func/mean": 0.01785714365541935, "rewards/check_gptzero_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.016187215223908424, "rewards/check_winston_local_func/std": 0.017381642013788223, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 165 }, { "clip_ratio": 0.001211747876368463, "epoch": 0.05808257522743177, "grad_norm": 0.2320605127164065, "kl": 0.03466796875, "learning_rate": 2.902097902097902e-06, "loss": 0.0045, "step": 166 }, { "clip_ratio": 0.0022008027881383896, "epoch": 0.05843247025892232, "grad_norm": 0.22671763887302246, "kl": 0.037841796875, "learning_rate": 2.91958041958042e-06, "loss": 0.0046, "step": 167 }, { "clip_ratio": 0.0019144663820043206, "epoch": 0.05878236529041288, "grad_norm": 0.215377313224799, "kl": 0.03955078125, "learning_rate": 2.937062937062937e-06, "loss": 0.0042, "step": 168 }, { "clip_ratio": 0.0033355415798723698, "clipped_completions_ratio": 0.0, "epoch": 0.059132260321903427, "grad_norm": 0.2063353446722891, "kl": 0.10498046875, "learning_rate": 2.954545454545455e-06, "loss": -0.0054, "max_completion_length": 223.0, "max_terminated_completion_length": 223.0, "mean_completion_length": 153.6607208251953, "mean_terminated_completion_length": 153.6607208251953, "min_completion_length": 93.0, "min_terminated_completion_length": 93.0, "num_tokens": 816886.0, "reward": 0.9945820569992065, "reward_std": 0.022570796310901642, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.009462964721024036, "rewards/check_winston_local_func/std": 0.004580690059810877, "rewards/sentence_count_match_reward_logic/mean": 0.9851189851760864, "rewards/sentence_count_match_reward_logic/std": 0.047956064343452454, "step": 169 }, { "clip_ratio": 0.0027663831133395433, "epoch": 0.05948215535339398, "grad_norm": 0.1939179840575821, "kl": 0.1220703125, "learning_rate": 2.972027972027972e-06, "loss": -0.0046, "step": 170 }, { "clip_ratio": 0.0047027370892465115, "epoch": 0.05983205038488453, "grad_norm": 0.21448753472323762, "kl": 0.140625, "learning_rate": 2.98951048951049e-06, "loss": -0.005, "step": 171 }, { "clip_ratio": 0.005799442995339632, "epoch": 0.06018194541637509, "grad_norm": 0.25158503390940373, "kl": 0.142578125, "learning_rate": 3.006993006993007e-06, "loss": -0.0055, "step": 172 }, { "clip_ratio": 0.0021850569173693657, "clipped_completions_ratio": 0.0, "epoch": 0.06053184044786564, "grad_norm": 0.1911107552908757, "kl": 0.076171875, "learning_rate": 3.024475524475525e-06, "loss": -0.001, "max_completion_length": 237.0, "max_terminated_completion_length": 237.0, "mean_completion_length": 159.80357360839844, "mean_terminated_completion_length": 159.80357360839844, "min_completion_length": 116.0, "min_terminated_completion_length": 116.0, "num_tokens": 837723.0, "reward": 1.0402510166168213, "reward_std": 0.09231642633676529, "rewards/check_gptzero_func/mean": 0.0535714291036129, "rewards/check_gptzero_func/std": 0.22720779478549957, "rewards/check_winston_local_func/mean": 0.02137337438762188, "rewards/check_winston_local_func/std": 0.05079011991620064, "rewards/sentence_count_match_reward_logic/mean": 0.9653061628341675, "rewards/sentence_count_match_reward_logic/std": 0.05654605105519295, "step": 173 }, { "clip_ratio": 0.0018093424150720239, "epoch": 0.06088173547935619, "grad_norm": 0.1796448640266747, "kl": 0.0703125, "learning_rate": 3.0419580419580425e-06, "loss": -0.0006, "step": 174 }, { "clip_ratio": 0.0033462063875049353, "epoch": 0.06123163051084675, "grad_norm": 0.16849614877453634, "kl": 0.06884765625, "learning_rate": 3.0594405594405598e-06, "loss": -0.0011, "step": 175 }, { "clip_ratio": 0.003045273246243596, "epoch": 0.0615815255423373, "grad_norm": 0.1777637492832051, "kl": 0.0634765625, "learning_rate": 3.0769230769230774e-06, "loss": -0.0014, "step": 176 }, { "clip_ratio": 0.0024862950667738914, "clipped_completions_ratio": 0.0, "epoch": 0.061931420573827854, "grad_norm": 0.14583871487529404, "kl": 0.029052734375, "learning_rate": 3.0944055944055947e-06, "loss": -0.0053, "max_completion_length": 242.0, "max_terminated_completion_length": 242.0, "mean_completion_length": 159.85714721679688, "mean_terminated_completion_length": 159.85714721679688, "min_completion_length": 98.0, "min_terminated_completion_length": 98.0, "num_tokens": 857955.0, "reward": 1.146930456161499, "reward_std": 0.08706771582365036, "rewards/check_gptzero_func/mean": 0.0892857164144516, "rewards/check_gptzero_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.09080792963504791, "rewards/check_winston_local_func/std": 0.2122219204902649, "rewards/sentence_count_match_reward_logic/mean": 0.9668367505073547, "rewards/sentence_count_match_reward_logic/std": 0.06886878609657288, "step": 177 }, { "clip_ratio": 0.0018071567174047232, "epoch": 0.0622813156053184, "grad_norm": 0.15029444664454325, "kl": 0.029052734375, "learning_rate": 3.1118881118881123e-06, "loss": -0.0058, "step": 178 }, { "clip_ratio": 0.0018241427605971694, "epoch": 0.06263121063680896, "grad_norm": 0.15391540680729762, "kl": 0.0277099609375, "learning_rate": 3.1293706293706296e-06, "loss": -0.0056, "step": 179 }, { "clip_ratio": 0.002388282911852002, "epoch": 0.06298110566829951, "grad_norm": 0.14713639374867676, "kl": 0.02783203125, "learning_rate": 3.1468531468531472e-06, "loss": -0.0056, "step": 180 }, { "clip_ratio": 0.0018958192085847259, "clipped_completions_ratio": 0.0714285714285714, "epoch": 0.06333100069979006, "grad_norm": 0.14683623165268664, "kl": 0.046142578125, "learning_rate": 3.164335664335665e-06, "loss": 0.0002, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 177.32144165039062, "mean_terminated_completion_length": 171.2692413330078, "min_completion_length": 91.0, "min_terminated_completion_length": 91.0, "num_tokens": 880493.0, "reward": 0.9574806094169617, "reward_std": 0.013555331155657768, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.015771429985761642, "rewards/check_winston_local_func/std": 0.014727215282619, "rewards/sentence_count_match_reward_logic/mean": 0.9417091608047485, "rewards/sentence_count_match_reward_logic/std": 0.1096082329750061, "step": 181 }, { "clip_ratio": 0.0018131204415112734, "epoch": 0.06368089573128062, "grad_norm": 0.1561048316123917, "kl": 0.048095703125, "learning_rate": 3.181818181818182e-06, "loss": 0.0009, "step": 182 }, { "clip_ratio": 0.0022541945800185204, "epoch": 0.06403079076277117, "grad_norm": 0.14658781091512593, "kl": 0.050537109375, "learning_rate": 3.1993006993006998e-06, "loss": 0.0, "step": 183 }, { "clip_ratio": 0.0017217799322679639, "epoch": 0.06438068579426172, "grad_norm": 0.136353787685372, "kl": 0.0517578125, "learning_rate": 3.216783216783217e-06, "loss": -0.0, "step": 184 }, { "clip_ratio": 0.002266347175464034, "clipped_completions_ratio": 0.0, "epoch": 0.06473058082575227, "grad_norm": 8.278015250177981, "kl": 2.21875, "learning_rate": 3.2342657342657347e-06, "loss": 0.0115, "max_completion_length": 252.0, "max_terminated_completion_length": 252.0, "mean_completion_length": 174.0178680419922, "mean_terminated_completion_length": 174.0178680419922, "min_completion_length": 110.0, "min_terminated_completion_length": 110.0, "num_tokens": 902422.0, "reward": 0.9974592924118042, "reward_std": 0.03521871194243431, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.018377605825662613, "rewards/check_winston_local_func/std": 0.03328465670347214, "rewards/sentence_count_match_reward_logic/mean": 0.9790816307067871, "rewards/sentence_count_match_reward_logic/std": 0.05674957111477852, "step": 185 }, { "clip_ratio": 0.002660295693203807, "epoch": 0.06508047585724283, "grad_norm": 2.58897730874021, "kl": 0.80078125, "learning_rate": 3.251748251748252e-06, "loss": -0.0016, "step": 186 }, { "clip_ratio": 0.0041712382808327675, "epoch": 0.06543037088873338, "grad_norm": 1.7997234467564227, "kl": 0.09814453125, "learning_rate": 3.2692307692307696e-06, "loss": -0.0028, "step": 187 }, { "clip_ratio": 0.005109732039272785, "epoch": 0.06578026592022393, "grad_norm": 3.0662665557385917, "kl": 0.07421875, "learning_rate": 3.286713286713287e-06, "loss": 0.0019, "step": 188 }, { "clip_ratio": 0.002270778175443411, "clipped_completions_ratio": 0.25, "epoch": 0.06613016095171449, "grad_norm": 0.20749240480086167, "kl": 0.055908203125, "learning_rate": 3.3041958041958045e-06, "loss": -0.0025, "max_completion_length": 256.0, "max_terminated_completion_length": 243.0, "mean_completion_length": 183.44644165039062, "mean_terminated_completion_length": 159.26190185546875, "min_completion_length": 90.0, "min_terminated_completion_length": 90.0, "num_tokens": 925911.0, "reward": 1.0381243228912354, "reward_std": 0.05537028610706329, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.05449328571557999, "rewards/check_winston_local_func/std": 0.1494232714176178, "rewards/sentence_count_match_reward_logic/mean": 0.9836309552192688, "rewards/sentence_count_match_reward_logic/std": 0.04052364081144333, "step": 189 }, { "clip_ratio": 0.0020771226845681667, "epoch": 0.06648005598320504, "grad_norm": 0.21982791755807615, "kl": 0.05712890625, "learning_rate": 3.321678321678322e-06, "loss": -0.003, "step": 190 }, { "clip_ratio": 0.002658702665939927, "epoch": 0.06682995101469559, "grad_norm": 0.19181128200298606, "kl": 0.0615234375, "learning_rate": 3.3391608391608394e-06, "loss": -0.0028, "step": 191 }, { "clip_ratio": 0.003014713292941451, "epoch": 0.06717984604618614, "grad_norm": 0.21827362181957524, "kl": 0.0673828125, "learning_rate": 3.356643356643357e-06, "loss": -0.0037, "step": 192 }, { "clip_ratio": 0.003020267002284527, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.0675297410776767, "grad_norm": 0.24288429286533597, "kl": 0.0869140625, "learning_rate": 3.3741258741258742e-06, "loss": -0.0066, "max_completion_length": 256.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 142.8928680419922, "mean_terminated_completion_length": 124.04167175292969, "min_completion_length": 54.0, "min_terminated_completion_length": 54.0, "num_tokens": 944273.0, "reward": 1.040244698524475, "reward_std": 0.10484717786312103, "rewards/check_gptzero_func/mean": 0.0535714291036129, "rewards/check_gptzero_func/std": 0.22720779478549957, "rewards/check_winston_local_func/mean": 0.024371659383177757, "rewards/check_winston_local_func/std": 0.052092794328927994, "rewards/sentence_count_match_reward_logic/mean": 0.9623016119003296, "rewards/sentence_count_match_reward_logic/std": 0.07899988442659378, "step": 193 }, { "clip_ratio": 0.003016131930053234, "epoch": 0.06787963610916725, "grad_norm": 0.23687081500102808, "kl": 0.08251953125, "learning_rate": 3.391608391608392e-06, "loss": -0.0072, "step": 194 }, { "clip_ratio": 0.002400663448497653, "epoch": 0.0682295311406578, "grad_norm": 0.23514427746959, "kl": 0.08935546875, "learning_rate": 3.409090909090909e-06, "loss": -0.007, "step": 195 }, { "clip_ratio": 0.006215465255081654, "epoch": 0.06857942617214835, "grad_norm": 0.2113733482453629, "kl": 0.0947265625, "learning_rate": 3.426573426573427e-06, "loss": -0.0085, "step": 196 }, { "clip_ratio": 0.0034960005432367325, "clipped_completions_ratio": 0.0, "epoch": 0.06892932120363891, "grad_norm": 1.2321150420189328, "kl": 0.6328125, "learning_rate": 3.4440559440559445e-06, "loss": 0.0076, "max_completion_length": 200.0, "max_terminated_completion_length": 200.0, "mean_completion_length": 119.73214721679688, "mean_terminated_completion_length": 119.73214721679688, "min_completion_length": 64.0, "min_terminated_completion_length": 64.0, "num_tokens": 959810.0, "reward": 1.0255584716796875, "reward_std": 0.018219558522105217, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.025558501482009888, "rewards/check_winston_local_func/std": 0.05346439778804779, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 197 }, { "clip_ratio": 0.00411754846572876, "epoch": 0.06927921623512946, "grad_norm": 0.4086564647559431, "kl": 0.318359375, "learning_rate": 3.4615384615384617e-06, "loss": 0.0048, "step": 198 }, { "clip_ratio": 0.0073602572083473206, "epoch": 0.06962911126662001, "grad_norm": 0.3962651314048408, "kl": 0.1943359375, "learning_rate": 3.4790209790209793e-06, "loss": 0.0045, "step": 199 }, { "clip_ratio": 0.007816408760845661, "epoch": 0.06997900629811056, "grad_norm": 0.5338089679499434, "kl": 0.1640625, "learning_rate": 3.4965034965034966e-06, "loss": 0.0048, "step": 200 }, { "clip_ratio": 0.002551842015236616, "clipped_completions_ratio": 0.0, "epoch": 0.07032890132960112, "grad_norm": 0.2413722043298586, "kl": 0.04931640625, "learning_rate": 3.5139860139860142e-06, "loss": 0.0016, "max_completion_length": 227.0, "max_terminated_completion_length": 227.0, "mean_completion_length": 146.19644165039062, "mean_terminated_completion_length": 146.19644165039062, "min_completion_length": 89.0, "min_terminated_completion_length": 89.0, "num_tokens": 978493.0, "reward": 1.109501838684082, "reward_std": 0.08710088580846786, "rewards/check_gptzero_func/mean": 0.1071428582072258, "rewards/check_gptzero_func/std": 0.3120938837528229, "rewards/check_winston_local_func/mean": 0.017239894717931747, "rewards/check_winston_local_func/std": 0.027768542990088463, "rewards/sentence_count_match_reward_logic/mean": 0.9851189851760864, "rewards/sentence_count_match_reward_logic/std": 0.047956064343452454, "step": 201 }, { "clip_ratio": 0.0026854814495891333, "epoch": 0.07067879636109167, "grad_norm": 0.2299438236651764, "kl": 0.0498046875, "learning_rate": 3.5314685314685315e-06, "loss": 0.0013, "step": 202 }, { "clip_ratio": 0.002619766630232334, "epoch": 0.07102869139258222, "grad_norm": 0.23478616781497233, "kl": 0.05029296875, "learning_rate": 3.548951048951049e-06, "loss": 0.0012, "step": 203 }, { "clip_ratio": 0.0020387708209455013, "epoch": 0.07137858642407278, "grad_norm": 0.22901323441864063, "kl": 0.049560546875, "learning_rate": 3.566433566433567e-06, "loss": 0.0002, "step": 204 }, { "clip_ratio": 0.0028240703977644444, "clipped_completions_ratio": 0.0, "epoch": 0.07172848145556333, "grad_norm": 0.2487023017951264, "kl": 0.0849609375, "learning_rate": 3.583916083916084e-06, "loss": 0.0013, "max_completion_length": 215.0, "max_terminated_completion_length": 215.0, "mean_completion_length": 153.7678680419922, "mean_terminated_completion_length": 153.7678680419922, "min_completion_length": 85.0, "min_terminated_completion_length": 85.0, "num_tokens": 998024.0, "reward": 1.007434368133545, "reward_std": 0.01847977004945278, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.014577196910977364, "rewards/check_winston_local_func/std": 0.012516133487224579, "rewards/sentence_count_match_reward_logic/mean": 0.9928570985794067, "rewards/sentence_count_match_reward_logic/std": 0.03745126724243164, "step": 205 }, { "clip_ratio": 0.0031287248712033033, "epoch": 0.07207837648705388, "grad_norm": 0.266706256327368, "kl": 0.07470703125, "learning_rate": 3.6013986013986017e-06, "loss": 0.0006, "step": 206 }, { "clip_ratio": 0.0025359259452670813, "epoch": 0.07242827151854443, "grad_norm": 0.24638144193269867, "kl": 0.0830078125, "learning_rate": 3.618881118881119e-06, "loss": 0.0003, "step": 207 }, { "clip_ratio": 0.0049720942042768, "epoch": 0.072778166550035, "grad_norm": 0.1965833878673567, "kl": 0.0830078125, "learning_rate": 3.6363636363636366e-06, "loss": -0.0004, "step": 208 }, { "clip_ratio": 0.0021548988297581673, "clipped_completions_ratio": 0.0, "epoch": 0.07312806158152554, "grad_norm": 0.19438810568739492, "kl": 0.08154296875, "learning_rate": 3.653846153846154e-06, "loss": 0.0053, "max_completion_length": 232.0, "max_terminated_completion_length": 232.0, "mean_completion_length": 170.94644165039062, "mean_terminated_completion_length": 170.94644165039062, "min_completion_length": 100.0, "min_terminated_completion_length": 100.0, "num_tokens": 1019717.0, "reward": 1.2282158136367798, "reward_std": 0.12499537318944931, "rewards/check_gptzero_func/mean": 0.2321428507566452, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.013930107466876507, "rewards/check_winston_local_func/std": 0.026050902903079987, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.05016207695007324, "step": 209 }, { "clip_ratio": 0.002562692854553461, "epoch": 0.0734779566130161, "grad_norm": 0.20774459650619462, "kl": 0.08056640625, "learning_rate": 3.6713286713286715e-06, "loss": 0.0056, "step": 210 }, { "clip_ratio": 0.0027915502432733774, "epoch": 0.07382785164450664, "grad_norm": 0.19531433351408503, "kl": 0.08154296875, "learning_rate": 3.6888111888111896e-06, "loss": 0.0048, "step": 211 }, { "clip_ratio": 0.0028938325121998787, "epoch": 0.0741777466759972, "grad_norm": 0.20877227314634014, "kl": 0.08203125, "learning_rate": 3.7062937062937064e-06, "loss": 0.0046, "step": 212 }, { "clip_ratio": 0.002180489245802164, "clipped_completions_ratio": 0.0, "epoch": 0.07452764170748775, "grad_norm": 0.21720395405720322, "kl": 0.05029296875, "learning_rate": 3.7237762237762245e-06, "loss": -0.0, "max_completion_length": 235.0, "max_terminated_completion_length": 235.0, "mean_completion_length": 147.1607208251953, "mean_terminated_completion_length": 147.1607208251953, "min_completion_length": 81.0, "min_terminated_completion_length": 81.0, "num_tokens": 1038814.0, "reward": 1.2389166355133057, "reward_std": 0.13826137781143188, "rewards/check_gptzero_func/mean": 0.2321428507566452, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.00875773187726736, "rewards/check_winston_local_func/std": 0.0017621287843212485, "rewards/sentence_count_match_reward_logic/mean": 0.9980158805847168, "rewards/sentence_count_match_reward_logic/std": 0.014847845770418644, "step": 213 }, { "clip_ratio": 0.002725671511143446, "epoch": 0.0748775367389783, "grad_norm": 0.22333380910199105, "kl": 0.048095703125, "learning_rate": 3.7412587412587413e-06, "loss": -0.0006, "step": 214 }, { "clip_ratio": 0.003092368831858039, "epoch": 0.07522743177046885, "grad_norm": 0.2132903425809044, "kl": 0.0478515625, "learning_rate": 3.7587412587412593e-06, "loss": -0.0007, "step": 215 }, { "clip_ratio": 0.0045762318186461926, "epoch": 0.07557732680195942, "grad_norm": 0.20958768868150485, "kl": 0.04345703125, "learning_rate": 3.776223776223776e-06, "loss": -0.0014, "step": 216 }, { "clip_ratio": 0.0012405810412019491, "clipped_completions_ratio": 0.0, "epoch": 0.07592722183344996, "grad_norm": 1.1987423057786224, "kl": 0.451171875, "learning_rate": 3.7937062937062942e-06, "loss": 0.0015, "max_completion_length": 248.0, "max_terminated_completion_length": 248.0, "mean_completion_length": 147.5357208251953, "mean_terminated_completion_length": 147.5357208251953, "min_completion_length": 86.0, "min_terminated_completion_length": 86.0, "num_tokens": 1057676.0, "reward": 1.1088405847549438, "reward_std": 0.06816881895065308, "rewards/check_gptzero_func/mean": 0.125, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.010130232200026512, "rewards/check_winston_local_func/std": 0.00623362697660923, "rewards/sentence_count_match_reward_logic/mean": 0.9737103581428528, "rewards/sentence_count_match_reward_logic/std": 0.062094852328300476, "step": 217 }, { "clip_ratio": 0.0015944185433909297, "epoch": 0.07627711686494051, "grad_norm": 0.24121509114121514, "kl": 0.166015625, "learning_rate": 3.811188811188811e-06, "loss": -0.0007, "step": 218 }, { "clip_ratio": 0.004143388010561466, "epoch": 0.07662701189643108, "grad_norm": 0.8218317257877151, "kl": 0.0908203125, "learning_rate": 3.828671328671329e-06, "loss": 0.0005, "step": 219 }, { "clip_ratio": 0.005629605147987604, "epoch": 0.07697690692792163, "grad_norm": 0.7360667165702768, "kl": 0.09228515625, "learning_rate": 3.846153846153847e-06, "loss": -0.0005, "step": 220 }, { "clip_ratio": 0.0016516688046976924, "clipped_completions_ratio": 0.25, "epoch": 0.07732680195941218, "grad_norm": 0.19251578591757354, "kl": 0.04248046875, "learning_rate": 3.863636363636364e-06, "loss": -0.0054, "max_completion_length": 256.0, "max_terminated_completion_length": 245.0, "mean_completion_length": 160.7857208251953, "mean_terminated_completion_length": 129.04762268066406, "min_completion_length": 83.0, "min_terminated_completion_length": 83.0, "num_tokens": 1078576.0, "reward": 1.003616213798523, "reward_std": 0.07215794920921326, "rewards/check_gptzero_func/mean": 0.01785714365541935, "rewards/check_gptzero_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.022068535909056664, "rewards/check_winston_local_func/std": 0.04285987839102745, "rewards/sentence_count_match_reward_logic/mean": 0.963690459728241, "rewards/sentence_count_match_reward_logic/std": 0.06479237973690033, "step": 221 }, { "clip_ratio": 0.0018835071241483092, "epoch": 0.07767669699090272, "grad_norm": 0.1946975730011551, "kl": 0.046142578125, "learning_rate": 3.881118881118881e-06, "loss": -0.0058, "step": 222 }, { "clip_ratio": 0.0023626936599612236, "epoch": 0.07802659202239329, "grad_norm": 0.18921352810626108, "kl": 0.046630859375, "learning_rate": 3.898601398601399e-06, "loss": -0.0061, "step": 223 }, { "clip_ratio": 0.004541447386145592, "epoch": 0.07837648705388384, "grad_norm": 0.18682471701515735, "kl": 0.048828125, "learning_rate": 3.916083916083917e-06, "loss": -0.0062, "step": 224 }, { "clip_ratio": 0.001720010070130229, "clipped_completions_ratio": 0.125, "epoch": 0.07872638208537439, "grad_norm": 0.16891407450095902, "kl": 0.038330078125, "learning_rate": 3.933566433566433e-06, "loss": 0.0001, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 174.00001525878906, "mean_terminated_completion_length": 162.28570556640625, "min_completion_length": 100.0, "min_terminated_completion_length": 100.0, "num_tokens": 1101088.0, "reward": 1.1370298862457275, "reward_std": 0.20175141096115112, "rewards/check_gptzero_func/mean": 0.125, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.012029767036437988, "rewards/check_winston_local_func/std": 0.009536348283290863, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 225 }, { "clip_ratio": 0.002697163028642535, "epoch": 0.07907627711686493, "grad_norm": 0.16817090241025123, "kl": 0.0400390625, "learning_rate": 3.951048951048951e-06, "loss": 0.0001, "step": 226 }, { "clip_ratio": 0.0026927010621875525, "epoch": 0.0794261721483555, "grad_norm": 0.16242918067307713, "kl": 0.04052734375, "learning_rate": 3.968531468531469e-06, "loss": -0.0009, "step": 227 }, { "clip_ratio": 0.003168097697198391, "epoch": 0.07977606717984605, "grad_norm": 0.167767831037079, "kl": 0.044677734375, "learning_rate": 3.986013986013986e-06, "loss": -0.0009, "step": 228 }, { "clip_ratio": 0.002144278259947896, "clipped_completions_ratio": 0.0, "epoch": 0.0801259622113366, "grad_norm": 0.2240035625400852, "kl": 0.051513671875, "learning_rate": 4.003496503496504e-06, "loss": 0.0015, "max_completion_length": 220.0, "max_terminated_completion_length": 220.0, "mean_completion_length": 136.94644165039062, "mean_terminated_completion_length": 136.94644165039062, "min_completion_length": 96.0, "min_terminated_completion_length": 96.0, "num_tokens": 1118861.0, "reward": 1.261858582496643, "reward_std": 0.11494257301092148, "rewards/check_gptzero_func/mean": 0.2142857164144516, "rewards/check_gptzero_func/std": 0.4140393137931824, "rewards/check_winston_local_func/mean": 0.04757285490632057, "rewards/check_winston_local_func/std": 0.10724673420190811, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 229 }, { "clip_ratio": 0.0027313840109854937, "epoch": 0.08047585724282715, "grad_norm": 0.22081725233419894, "kl": 0.053955078125, "learning_rate": 4.020979020979021e-06, "loss": 0.0015, "step": 230 }, { "clip_ratio": 0.0029543256387114525, "epoch": 0.08082575227431771, "grad_norm": 0.22297347132928128, "kl": 0.0556640625, "learning_rate": 4.0384615384615385e-06, "loss": 0.001, "step": 231 }, { "clip_ratio": 0.0031250359024852514, "epoch": 0.08117564730580826, "grad_norm": 0.22308651783335837, "kl": 0.05810546875, "learning_rate": 4.055944055944056e-06, "loss": 0.0003, "step": 232 }, { "clip_ratio": 0.002297039143741131, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.0815255423372988, "grad_norm": 0.2416304514212736, "kl": 0.09716796875, "learning_rate": 4.073426573426574e-06, "loss": -0.0008, "max_completion_length": 256.0, "max_terminated_completion_length": 225.0, "mean_completion_length": 146.57144165039062, "mean_terminated_completion_length": 128.33334350585938, "min_completion_length": 84.0, "min_terminated_completion_length": 84.0, "num_tokens": 1138061.0, "reward": 1.0312678813934326, "reward_std": 0.12543019652366638, "rewards/check_gptzero_func/mean": 0.0714285746216774, "rewards/check_gptzero_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.011426589451730251, "rewards/check_winston_local_func/std": 0.01166428904980421, "rewards/sentence_count_match_reward_logic/mean": 0.9484127163887024, "rewards/sentence_count_match_reward_logic/std": 0.06688276678323746, "step": 233 }, { "clip_ratio": 0.0022575026378035545, "epoch": 0.08187543736878937, "grad_norm": 0.25388849148912723, "kl": 0.09765625, "learning_rate": 4.0909090909090915e-06, "loss": -0.001, "step": 234 }, { "clip_ratio": 0.002623456297442317, "epoch": 0.08222533240027992, "grad_norm": 0.271057375432641, "kl": 0.10400390625, "learning_rate": 4.108391608391608e-06, "loss": -0.001, "step": 235 }, { "clip_ratio": 0.004261789843440056, "epoch": 0.08257522743177047, "grad_norm": 0.22309001394278102, "kl": 0.1123046875, "learning_rate": 4.125874125874127e-06, "loss": -0.0018, "step": 236 }, { "clip_ratio": 0.0032179837580770254, "clipped_completions_ratio": 0.0, "epoch": 0.08292512246326102, "grad_norm": 0.7046127988877897, "kl": 0.2236328125, "learning_rate": 4.143356643356644e-06, "loss": -0.0067, "max_completion_length": 248.0, "max_terminated_completion_length": 248.0, "mean_completion_length": 175.6428680419922, "mean_terminated_completion_length": 175.6428680419922, "min_completion_length": 121.0, "min_terminated_completion_length": 121.0, "num_tokens": 1160553.0, "reward": 1.1686890125274658, "reward_std": 0.2122069150209427, "rewards/check_gptzero_func/mean": 0.1785714328289032, "rewards/check_gptzero_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.014671018347144127, "rewards/check_winston_local_func/std": 0.016325756907463074, "rewards/sentence_count_match_reward_logic/mean": 0.9754464030265808, "rewards/sentence_count_match_reward_logic/std": 0.060394737869501114, "step": 237 }, { "clip_ratio": 0.004452978260815144, "epoch": 0.08327501749475158, "grad_norm": 0.24184095369315686, "kl": 0.11767578125, "learning_rate": 4.160839160839161e-06, "loss": -0.0073, "step": 238 }, { "clip_ratio": 0.004571528639644384, "epoch": 0.08362491252624213, "grad_norm": 0.27298110271972986, "kl": 0.0908203125, "learning_rate": 4.178321678321678e-06, "loss": -0.0077, "step": 239 }, { "clip_ratio": 0.00406808964908123, "epoch": 0.08397480755773268, "grad_norm": 0.25551692755359273, "kl": 0.08447265625, "learning_rate": 4.195804195804197e-06, "loss": -0.0083, "step": 240 }, { "clip_ratio": 0.0029129136819392443, "clipped_completions_ratio": 0.0, "epoch": 0.08432470258922323, "grad_norm": 1.3969031059314712, "kl": 0.458984375, "learning_rate": 4.213286713286714e-06, "loss": -0.002, "max_completion_length": 181.0, "max_terminated_completion_length": 181.0, "mean_completion_length": 134.125, "mean_terminated_completion_length": 134.125, "min_completion_length": 67.0, "min_terminated_completion_length": 67.0, "num_tokens": 1178312.0, "reward": 1.0368242263793945, "reward_std": 0.14592131972312927, "rewards/check_gptzero_func/mean": 0.0357142873108387, "rewards/check_gptzero_func/std": 0.1872563362121582, "rewards/check_winston_local_func/mean": 0.04575273394584656, "rewards/check_winston_local_func/std": 0.09592721611261368, "rewards/sentence_count_match_reward_logic/mean": 0.9553571343421936, "rewards/sentence_count_match_reward_logic/std": 0.09937574714422226, "step": 241 }, { "clip_ratio": 0.004165219608694315, "epoch": 0.08467459762071379, "grad_norm": 0.5991420384367726, "kl": 0.21875, "learning_rate": 4.230769230769231e-06, "loss": -0.0044, "step": 242 }, { "clip_ratio": 0.0033621671609580517, "epoch": 0.08502449265220434, "grad_norm": 0.8700727545354138, "kl": 0.0810546875, "learning_rate": 4.248251748251749e-06, "loss": -0.0049, "step": 243 }, { "clip_ratio": 0.0059625329449772835, "epoch": 0.08537438768369489, "grad_norm": 1.2761991269640236, "kl": 0.07666015625, "learning_rate": 4.265734265734266e-06, "loss": -0.0042, "step": 244 }, { "clip_ratio": 0.0020485648419708014, "clipped_completions_ratio": 0.0, "epoch": 0.08572428271518544, "grad_norm": 1.2727040800469525, "kl": 0.291015625, "learning_rate": 4.283216783216784e-06, "loss": 0.008, "max_completion_length": 218.0, "max_terminated_completion_length": 218.0, "mean_completion_length": 159.96429443359375, "mean_terminated_completion_length": 159.96429443359375, "min_completion_length": 127.0, "min_terminated_completion_length": 127.0, "num_tokens": 1198774.0, "reward": 1.2123796939849854, "reward_std": 0.12776634097099304, "rewards/check_gptzero_func/mean": 0.2321428507566452, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.00999865960329771, "rewards/check_winston_local_func/std": 0.003623450407758355, "rewards/sentence_count_match_reward_logic/mean": 0.9702381491661072, "rewards/sentence_count_match_reward_logic/std": 0.06895425915718079, "step": 245 }, { "clip_ratio": 0.003134961938485503, "epoch": 0.086074177746676, "grad_norm": 0.34089335434798756, "kl": 0.1298828125, "learning_rate": 4.300699300699301e-06, "loss": 0.0064, "step": 246 }, { "clip_ratio": 0.0036525020841509104, "epoch": 0.08642407277816655, "grad_norm": 0.1808225193632548, "kl": 0.078125, "learning_rate": 4.3181818181818185e-06, "loss": 0.0051, "step": 247 }, { "clip_ratio": 0.0039376914501190186, "epoch": 0.0867739678096571, "grad_norm": 0.19216968099359494, "kl": 0.06884765625, "learning_rate": 4.335664335664336e-06, "loss": 0.0054, "step": 248 }, { "clip_ratio": 0.0028878392186015844, "clipped_completions_ratio": 0.0, "epoch": 0.08712386284114766, "grad_norm": 0.22987263815757797, "kl": 0.054931640625, "learning_rate": 4.353146853146854e-06, "loss": -0.0037, "max_completion_length": 240.0, "max_terminated_completion_length": 240.0, "mean_completion_length": 174.5357208251953, "mean_terminated_completion_length": 174.5357208251953, "min_completion_length": 129.0, "min_terminated_completion_length": 129.0, "num_tokens": 1220892.0, "reward": 1.157668948173523, "reward_std": 0.24218226969242096, "rewards/check_gptzero_func/mean": 0.1785714328289032, "rewards/check_gptzero_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.02338605746626854, "rewards/check_winston_local_func/std": 0.04294895380735397, "rewards/sentence_count_match_reward_logic/mean": 0.9557114243507385, "rewards/sentence_count_match_reward_logic/std": 0.08367286622524261, "step": 249 }, { "clip_ratio": 0.002856008242815733, "epoch": 0.08747375787263821, "grad_norm": 0.22900251649649198, "kl": 0.054443359375, "learning_rate": 4.3706293706293715e-06, "loss": -0.0037, "step": 250 }, { "clip_ratio": 0.002855695318430662, "epoch": 0.08782365290412876, "grad_norm": 0.22450967161889016, "kl": 0.058837890625, "learning_rate": 4.388111888111888e-06, "loss": -0.0041, "step": 251 }, { "clip_ratio": 0.003894730005413294, "epoch": 0.08817354793561931, "grad_norm": 0.22126263518147674, "kl": 0.05908203125, "learning_rate": 4.405594405594406e-06, "loss": -0.0048, "step": 252 }, { "clip_ratio": 0.002601606072857976, "clipped_completions_ratio": 0.0, "epoch": 0.08852344296710987, "grad_norm": 0.4276195448440729, "kl": 0.1044921875, "learning_rate": 4.423076923076924e-06, "loss": -0.0018, "max_completion_length": 224.0, "max_terminated_completion_length": 224.0, "mean_completion_length": 134.35714721679688, "mean_terminated_completion_length": 134.35714721679688, "min_completion_length": 63.0, "min_terminated_completion_length": 63.0, "num_tokens": 1238472.0, "reward": 0.9979439377784729, "reward_std": 0.0338624082505703, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.019202375784516335, "rewards/check_winston_local_func/std": 0.03700218349695206, "rewards/sentence_count_match_reward_logic/mean": 0.9787415266036987, "rewards/sentence_count_match_reward_logic/std": 0.05269065499305725, "step": 253 }, { "clip_ratio": 0.004721163772046566, "epoch": 0.08887333799860042, "grad_norm": 0.26257379612773785, "kl": 0.12451171875, "learning_rate": 4.440559440559441e-06, "loss": -0.0035, "step": 254 }, { "clip_ratio": 0.006972015369683504, "epoch": 0.08922323303009097, "grad_norm": 0.3338783055890077, "kl": 0.140625, "learning_rate": 4.458041958041958e-06, "loss": -0.0033, "step": 255 }, { "clip_ratio": 0.007899336516857147, "epoch": 0.08957312806158152, "grad_norm": 0.35151678864195063, "kl": 0.15625, "learning_rate": 4.475524475524476e-06, "loss": -0.0037, "step": 256 }, { "clip_ratio": 0.003908628597855568, "clipped_completions_ratio": 0.0, "epoch": 0.08992302309307208, "grad_norm": 0.21556284957508806, "kl": 0.11767578125, "learning_rate": 4.493006993006993e-06, "loss": 0.0017, "max_completion_length": 186.0, "max_terminated_completion_length": 186.0, "mean_completion_length": 135.55357360839844, "mean_terminated_completion_length": 135.55357360839844, "min_completion_length": 64.0, "min_terminated_completion_length": 64.0, "num_tokens": 1256111.0, "reward": 1.0075403451919556, "reward_std": 0.0007208471070043743, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.007540321908891201, "rewards/check_winston_local_func/std": 0.001197755103930831, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 257 }, { "clip_ratio": 0.0043180095963180065, "epoch": 0.09027291812456263, "grad_norm": 0.2139007983047576, "kl": 0.11865234375, "learning_rate": 4.510489510489511e-06, "loss": 0.0012, "step": 258 }, { "clip_ratio": 0.003870891872793436, "epoch": 0.09062281315605318, "grad_norm": 0.23116967164497554, "kl": 0.1162109375, "learning_rate": 4.527972027972029e-06, "loss": 0.001, "step": 259 }, { "clip_ratio": 0.0056404415518045425, "epoch": 0.09097270818754374, "grad_norm": 0.19805908737664274, "kl": 0.12255859375, "learning_rate": 4.5454545454545455e-06, "loss": 0.0004, "step": 260 }, { "clip_ratio": 0.0026440941728651524, "clipped_completions_ratio": 0.0, "epoch": 0.09132260321903429, "grad_norm": 0.2444119714609592, "kl": 0.07763671875, "learning_rate": 4.562937062937063e-06, "loss": 0.0045, "max_completion_length": 201.0, "max_terminated_completion_length": 201.0, "mean_completion_length": 123.41072082519531, "mean_terminated_completion_length": 123.41072082519531, "min_completion_length": 80.0, "min_terminated_completion_length": 80.0, "num_tokens": 1272318.0, "reward": 1.021695613861084, "reward_std": 0.008388105779886246, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.021695498377084732, "rewards/check_winston_local_func/std": 0.03372184932231903, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 261 }, { "clip_ratio": 0.003208306385204196, "epoch": 0.09167249825052484, "grad_norm": 0.23889818201092108, "kl": 0.07958984375, "learning_rate": 4.580419580419581e-06, "loss": 0.0042, "step": 262 }, { "clip_ratio": 0.004348608665168285, "epoch": 0.09202239328201539, "grad_norm": 0.24039495036753222, "kl": 0.0791015625, "learning_rate": 4.5979020979020985e-06, "loss": 0.0036, "step": 263 }, { "clip_ratio": 0.007564173545688391, "epoch": 0.09237228831350595, "grad_norm": 0.22733166871676097, "kl": 0.08251953125, "learning_rate": 4.615384615384616e-06, "loss": 0.0032, "step": 264 }, { "clip_ratio": 0.003216324606910348, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.0927221833449965, "grad_norm": 0.24826270165624542, "kl": 0.08642578125, "learning_rate": 4.632867132867133e-06, "loss": 0.052, "max_completion_length": 256.0, "max_terminated_completion_length": 220.0, "mean_completion_length": 151.4107208251953, "mean_terminated_completion_length": 133.9791717529297, "min_completion_length": 62.0, "min_terminated_completion_length": 62.0, "num_tokens": 1291589.0, "reward": 0.968677818775177, "reward_std": 0.051748327910900116, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.022419286891818047, "rewards/check_winston_local_func/std": 0.04166262596845627, "rewards/sentence_count_match_reward_logic/mean": 0.9462584853172302, "rewards/sentence_count_match_reward_logic/std": 0.10380401462316513, "step": 265 }, { "clip_ratio": 0.002958890749141574, "epoch": 0.09307207837648705, "grad_norm": 0.25334909833340036, "kl": 0.087890625, "learning_rate": 4.650349650349651e-06, "loss": 0.0517, "step": 266 }, { "clip_ratio": 0.0024499143473803997, "epoch": 0.0934219734079776, "grad_norm": 0.23436316483395678, "kl": 0.0966796875, "learning_rate": 4.667832167832168e-06, "loss": 0.0506, "step": 267 }, { "clip_ratio": 0.004241432994604111, "epoch": 0.09377186843946816, "grad_norm": 0.22739590721592087, "kl": 0.10400390625, "learning_rate": 4.685314685314686e-06, "loss": 0.0503, "step": 268 }, { "clip_ratio": 0.0027607628144323826, "clipped_completions_ratio": 0.0, "epoch": 0.09412176347095871, "grad_norm": 0.23874240453642054, "kl": 0.091796875, "learning_rate": 4.702797202797203e-06, "loss": 0.0048, "max_completion_length": 210.0, "max_terminated_completion_length": 210.0, "mean_completion_length": 153.2857208251953, "mean_terminated_completion_length": 153.2857208251953, "min_completion_length": 106.0, "min_terminated_completion_length": 106.0, "num_tokens": 1311397.0, "reward": 1.2064087390899658, "reward_std": 0.08568008244037628, "rewards/check_gptzero_func/mean": 0.125, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.1026671975851059, "rewards/check_winston_local_func/std": 0.1773795336484909, "rewards/sentence_count_match_reward_logic/mean": 0.9787415266036987, "rewards/sentence_count_match_reward_logic/std": 0.05269065499305725, "step": 269 }, { "clip_ratio": 0.0024697312619537115, "epoch": 0.09447165850244926, "grad_norm": 0.23590198516817984, "kl": 0.0947265625, "learning_rate": 4.72027972027972e-06, "loss": 0.005, "step": 270 }, { "clip_ratio": 0.003596110502257943, "epoch": 0.09482155353393981, "grad_norm": 0.21913709436131862, "kl": 0.09521484375, "learning_rate": 4.737762237762238e-06, "loss": 0.0039, "step": 271 }, { "clip_ratio": 0.005609494633972645, "epoch": 0.09517144856543037, "grad_norm": 0.2031725861224792, "kl": 0.09619140625, "learning_rate": 4.755244755244756e-06, "loss": 0.0034, "step": 272 }, { "clip_ratio": 0.0028410451486706734, "clipped_completions_ratio": 0.0, "epoch": 0.09552134359692092, "grad_norm": 0.2734021900635813, "kl": 0.1533203125, "learning_rate": 4.772727272727273e-06, "loss": 0.0081, "max_completion_length": 228.0, "max_terminated_completion_length": 228.0, "mean_completion_length": 136.07144165039062, "mean_terminated_completion_length": 136.07144165039062, "min_completion_length": 77.0, "min_terminated_completion_length": 77.0, "num_tokens": 1329017.0, "reward": 1.0501024723052979, "reward_std": 0.0419287271797657, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.05407077074050903, "rewards/check_winston_local_func/std": 0.10090764611959457, "rewards/sentence_count_match_reward_logic/mean": 0.9960317611694336, "rewards/sentence_count_match_reward_logic/std": 0.029695691540837288, "step": 273 }, { "clip_ratio": 0.005440612323582172, "epoch": 0.09587123862841147, "grad_norm": 0.25803022018255567, "kl": 0.162109375, "learning_rate": 4.79020979020979e-06, "loss": 0.0076, "step": 274 }, { "clip_ratio": 0.005998299922794104, "epoch": 0.09622113365990204, "grad_norm": 0.23229228369383742, "kl": 0.16015625, "learning_rate": 4.807692307692308e-06, "loss": 0.0074, "step": 275 }, { "clip_ratio": 0.008744364604353905, "epoch": 0.09657102869139259, "grad_norm": 0.2454274680012143, "kl": 0.1669921875, "learning_rate": 4.8251748251748255e-06, "loss": 0.0067, "step": 276 }, { "clip_ratio": 0.0024705769028514624, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.09692092372288313, "grad_norm": 0.2021637193589076, "kl": 0.1298828125, "learning_rate": 4.842657342657343e-06, "loss": 0.0037, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 196.32144165039062, "mean_terminated_completion_length": 186.375, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 1353619.0, "reward": 1.1672163009643555, "reward_std": 0.19659830629825592, "rewards/check_gptzero_func/mean": 0.2142857164144516, "rewards/check_gptzero_func/std": 0.4140393137931824, "rewards/check_winston_local_func/mean": 0.010363981127738953, "rewards/check_winston_local_func/std": 0.0036798331420868635, "rewards/sentence_count_match_reward_logic/mean": 0.9425665736198425, "rewards/sentence_count_match_reward_logic/std": 0.0778760239481926, "step": 277 }, { "clip_ratio": 0.002803183626383543, "epoch": 0.09727081875437368, "grad_norm": 0.1929020290878761, "kl": 0.126953125, "learning_rate": 4.86013986013986e-06, "loss": 0.0033, "step": 278 }, { "clip_ratio": 0.003510218346491456, "epoch": 0.09762071378586425, "grad_norm": 0.1801340639666837, "kl": 0.130859375, "learning_rate": 4.877622377622378e-06, "loss": 0.003, "step": 279 }, { "clip_ratio": 0.004746931605041027, "epoch": 0.0979706088173548, "grad_norm": 0.19684870489301856, "kl": 0.1298828125, "learning_rate": 4.895104895104895e-06, "loss": 0.0022, "step": 280 }, { "clip_ratio": 0.0024563646875321865, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.09832050384884534, "grad_norm": 0.2379359558588857, "kl": 0.072265625, "learning_rate": 4.912587412587413e-06, "loss": 0.0293, "max_completion_length": 256.0, "max_terminated_completion_length": 225.0, "mean_completion_length": 171.92857360839844, "mean_terminated_completion_length": 157.9166717529297, "min_completion_length": 92.0, "min_terminated_completion_length": 92.0, "num_tokens": 1375015.0, "reward": 1.3391869068145752, "reward_std": 0.18360011279582977, "rewards/check_gptzero_func/mean": 0.3035714328289032, "rewards/check_gptzero_func/std": 0.4639608860015869, "rewards/check_winston_local_func/mean": 0.04709496349096298, "rewards/check_winston_local_func/std": 0.0988120436668396, "rewards/sentence_count_match_reward_logic/mean": 0.9885204434394836, "rewards/sentence_count_match_reward_logic/std": 0.06915442645549774, "step": 281 }, { "clip_ratio": 0.002372650895267725, "epoch": 0.0986703988803359, "grad_norm": 0.22845319968356628, "kl": 0.0732421875, "learning_rate": 4.930069930069931e-06, "loss": 0.0285, "step": 282 }, { "clip_ratio": 0.003768901340663433, "epoch": 0.09902029391182646, "grad_norm": 0.20936540773733742, "kl": 0.0771484375, "learning_rate": 4.9475524475524474e-06, "loss": 0.0276, "step": 283 }, { "clip_ratio": 0.004120904952287674, "epoch": 0.099370188943317, "grad_norm": 0.21310651282047793, "kl": 0.07666015625, "learning_rate": 4.965034965034965e-06, "loss": 0.0275, "step": 284 }, { "clip_ratio": 0.0031348941847682, "clipped_completions_ratio": 0.017857142857142905, "epoch": 0.09972008397480756, "grad_norm": 0.2442581907970321, "kl": 0.09326171875, "learning_rate": 4.982517482517483e-06, "loss": -0.0116, "max_completion_length": 256.0, "max_terminated_completion_length": 252.0, "mean_completion_length": 165.10714721679688, "mean_terminated_completion_length": 163.4545440673828, "min_completion_length": 103.0, "min_terminated_completion_length": 103.0, "num_tokens": 1396141.0, "reward": 1.29506254196167, "reward_std": 0.1241571456193924, "rewards/check_gptzero_func/mean": 0.25, "rewards/check_gptzero_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.05602021887898445, "rewards/check_winston_local_func/std": 0.10330010205507278, "rewards/sentence_count_match_reward_logic/mean": 0.9890422224998474, "rewards/sentence_count_match_reward_logic/std": 0.032357025891542435, "step": 285 }, { "clip_ratio": 0.0032294844277203083, "epoch": 0.1000699790062981, "grad_norm": 0.23912795003793333, "kl": 0.0966796875, "learning_rate": 5e-06, "loss": -0.0116, "step": 286 }, { "clip_ratio": 0.0029970393516123295, "epoch": 0.10041987403778867, "grad_norm": 0.24402370681608904, "kl": 0.095703125, "learning_rate": 5.017482517482518e-06, "loss": -0.0119, "step": 287 }, { "clip_ratio": 0.004838546272367239, "epoch": 0.10076976906927922, "grad_norm": 0.22853516431796347, "kl": 0.09619140625, "learning_rate": 5.034965034965036e-06, "loss": -0.0126, "step": 288 }, { "clip_ratio": 0.002297190949320793, "clipped_completions_ratio": 0.0, "epoch": 0.10111966410076977, "grad_norm": 0.26655537295585524, "kl": 0.10107421875, "learning_rate": 5.0524475524475525e-06, "loss": -0.0006, "max_completion_length": 241.0, "max_terminated_completion_length": 241.0, "mean_completion_length": 153.6428680419922, "mean_terminated_completion_length": 153.6428680419922, "min_completion_length": 67.0, "min_terminated_completion_length": 67.0, "num_tokens": 1415609.0, "reward": 1.1444271802902222, "reward_std": 0.08991769701242447, "rewards/check_gptzero_func/mean": 0.125, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.03388283774256706, "rewards/check_winston_local_func/std": 0.05453626811504364, "rewards/sentence_count_match_reward_logic/mean": 0.9855442047119141, "rewards/sentence_count_match_reward_logic/std": 0.0534030944108963, "step": 289 }, { "clip_ratio": 0.0025302444119006395, "epoch": 0.10146955913226033, "grad_norm": 0.2575409228791836, "kl": 0.1015625, "learning_rate": 5.06993006993007e-06, "loss": -0.0009, "step": 290 }, { "clip_ratio": 0.005339559633284807, "epoch": 0.10181945416375088, "grad_norm": 0.23652441884138897, "kl": 0.1201171875, "learning_rate": 5.087412587412588e-06, "loss": -0.0017, "step": 291 }, { "clip_ratio": 0.010898358188569546, "epoch": 0.10216934919524143, "grad_norm": 0.2551582355951664, "kl": 0.1484375, "learning_rate": 5.1048951048951055e-06, "loss": -0.0021, "step": 292 }, { "clip_ratio": 0.003763397689908743, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.10251924422673198, "grad_norm": 0.4669080019111669, "kl": 0.22265625, "learning_rate": 5.122377622377622e-06, "loss": 0.0019, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 169.05357360839844, "mean_terminated_completion_length": 154.5625, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 1437044.0, "reward": 1.2010902166366577, "reward_std": 0.14009907841682434, "rewards/check_gptzero_func/mean": 0.125, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.08258365839719772, "rewards/check_winston_local_func/std": 0.14769704639911652, "rewards/sentence_count_match_reward_logic/mean": 0.9935064911842346, "rewards/sentence_count_match_reward_logic/std": 0.023624546825885773, "step": 293 }, { "clip_ratio": 0.003521582344546914, "epoch": 0.10286913925822254, "grad_norm": 0.27274546512181463, "kl": 0.15625, "learning_rate": 5.13986013986014e-06, "loss": 0.001, "step": 294 }, { "clip_ratio": 0.0041003599762916565, "epoch": 0.10321903428971309, "grad_norm": 0.24739815048154776, "kl": 0.146484375, "learning_rate": 5.157342657342658e-06, "loss": 0.0006, "step": 295 }, { "clip_ratio": 0.006170731969177723, "epoch": 0.10356892932120364, "grad_norm": 0.26049826556263433, "kl": 0.12060546875, "learning_rate": 5.174825174825175e-06, "loss": 0.0001, "step": 296 }, { "clip_ratio": 0.002389104338362813, "clipped_completions_ratio": 0.1964285714285714, "epoch": 0.10391882435269419, "grad_norm": 0.19946110804864722, "kl": 0.1171875, "learning_rate": 5.192307692307693e-06, "loss": -0.0056, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 168.82144165039062, "mean_terminated_completion_length": 147.5111083984375, "min_completion_length": 96.0, "min_terminated_completion_length": 96.0, "num_tokens": 1458250.0, "reward": 1.0490965843200684, "reward_std": 0.10546497255563736, "rewards/check_gptzero_func/mean": 0.0357142873108387, "rewards/check_gptzero_func/std": 0.1872563362121582, "rewards/check_winston_local_func/mean": 0.02052510716021061, "rewards/check_winston_local_func/std": 0.03214414045214653, "rewards/sentence_count_match_reward_logic/mean": 0.9928570985794067, "rewards/sentence_count_match_reward_logic/std": 0.025987014174461365, "step": 297 }, { "clip_ratio": 0.0029505945276468992, "epoch": 0.10426871938418475, "grad_norm": 0.19399113847529828, "kl": 0.1142578125, "learning_rate": 5.20979020979021e-06, "loss": -0.0057, "step": 298 }, { "clip_ratio": 0.0051363082602620125, "epoch": 0.1046186144156753, "grad_norm": 0.19832741697009115, "kl": 0.123046875, "learning_rate": 5.2272727272727274e-06, "loss": -0.006, "step": 299 }, { "clip_ratio": 0.007703573442995548, "epoch": 0.10496850944716585, "grad_norm": 0.2117402089600467, "kl": 0.12890625, "learning_rate": 5.244755244755245e-06, "loss": -0.007, "step": 300 }, { "clip_ratio": 0.00295211817137897, "clipped_completions_ratio": 0.0, "epoch": 0.1053184044786564, "grad_norm": 0.8820242522130938, "kl": 0.400390625, "learning_rate": 5.262237762237763e-06, "loss": -0.0004, "max_completion_length": 222.0, "max_terminated_completion_length": 222.0, "mean_completion_length": 133.58929443359375, "mean_terminated_completion_length": 133.58929443359375, "min_completion_length": 69.0, "min_terminated_completion_length": 69.0, "num_tokens": 1475875.0, "reward": 0.9992614984512329, "reward_std": 0.1253109723329544, "rewards/check_gptzero_func/mean": 0.0357142873108387, "rewards/check_gptzero_func/std": 0.1872563362121582, "rewards/check_winston_local_func/mean": 0.017118625342845917, "rewards/check_winston_local_func/std": 0.038708027452230453, "rewards/sentence_count_match_reward_logic/mean": 0.9464285969734192, "rewards/sentence_count_match_reward_logic/std": 0.09751076251268387, "step": 301 }, { "clip_ratio": 0.004402214661240578, "epoch": 0.10566829951014696, "grad_norm": 1.2376310613492376, "kl": 0.46875, "learning_rate": 5.27972027972028e-06, "loss": 0.0004, "step": 302 }, { "clip_ratio": 0.006663001142442226, "epoch": 0.10601819454163751, "grad_norm": 59.31624918520976, "kl": 6.15625, "learning_rate": 5.297202797202797e-06, "loss": 0.0532, "step": 303 }, { "clip_ratio": 0.010003164410591125, "epoch": 0.10636808957312806, "grad_norm": 54.677423819492724, "kl": 7.21875, "learning_rate": 5.314685314685315e-06, "loss": 0.063, "step": 304 }, { "clip_ratio": 0.0046646613627672195, "clipped_completions_ratio": 0.0, "epoch": 0.10671798460461862, "grad_norm": 0.22331022081184587, "kl": 0.1220703125, "learning_rate": 5.3321678321678325e-06, "loss": -0.0083, "max_completion_length": 193.0, "max_terminated_completion_length": 193.0, "mean_completion_length": 155.2678680419922, "mean_terminated_completion_length": 155.2678680419922, "min_completion_length": 104.0, "min_terminated_completion_length": 104.0, "num_tokens": 1495794.0, "reward": 1.0091552734375, "reward_std": 0.011009646579623222, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.011706249788403511, "rewards/check_winston_local_func/std": 0.008122336119413376, "rewards/sentence_count_match_reward_logic/mean": 0.9974489808082581, "rewards/sentence_count_match_reward_logic/std": 0.019090088084340096, "step": 305 }, { "clip_ratio": 0.004655820317566395, "epoch": 0.10706787963610917, "grad_norm": 0.26222367008496866, "kl": 0.1142578125, "learning_rate": 5.34965034965035e-06, "loss": -0.0077, "step": 306 }, { "clip_ratio": 0.005945663433521986, "epoch": 0.10741777466759972, "grad_norm": 0.29686690122708737, "kl": 0.10986328125, "learning_rate": 5.367132867132867e-06, "loss": -0.008, "step": 307 }, { "clip_ratio": 0.007122192531824112, "epoch": 0.10776766969909027, "grad_norm": 0.27726003694066437, "kl": 0.11376953125, "learning_rate": 5.384615384615385e-06, "loss": -0.0083, "step": 308 }, { "clip_ratio": 0.0023485664278268814, "clipped_completions_ratio": 0.0535714285714286, "epoch": 0.10811756473058083, "grad_norm": 0.22531851188250673, "kl": 0.07568359375, "learning_rate": 5.402097902097902e-06, "loss": -0.0082, "max_completion_length": 256.0, "max_terminated_completion_length": 251.0, "mean_completion_length": 195.37501525878906, "mean_terminated_completion_length": 191.9434051513672, "min_completion_length": 129.0, "min_terminated_completion_length": 129.0, "num_tokens": 1520423.0, "reward": 1.381346583366394, "reward_std": 0.15548723936080933, "rewards/check_gptzero_func/mean": 0.3035714328289032, "rewards/check_gptzero_func/std": 0.4639608860015869, "rewards/check_winston_local_func/mean": 0.07975923269987106, "rewards/check_winston_local_func/std": 0.13784511387348175, "rewards/sentence_count_match_reward_logic/mean": 0.9980158805847168, "rewards/sentence_count_match_reward_logic/std": 0.014847845770418644, "step": 309 }, { "clip_ratio": 0.003978311084210873, "epoch": 0.10846745976207138, "grad_norm": 0.20955026953798456, "kl": 0.08056640625, "learning_rate": 5.41958041958042e-06, "loss": -0.0084, "step": 310 }, { "clip_ratio": 0.00493956683203578, "epoch": 0.10881735479356193, "grad_norm": 0.189190224889168, "kl": 0.08740234375, "learning_rate": 5.437062937062938e-06, "loss": -0.0088, "step": 311 }, { "clip_ratio": 0.008830455131828785, "epoch": 0.10916724982505248, "grad_norm": 0.18341077943272485, "kl": 0.09033203125, "learning_rate": 5.4545454545454545e-06, "loss": -0.0097, "step": 312 }, { "clip_ratio": 0.0038997679948806763, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.10951714485654304, "grad_norm": 0.277343272030032, "kl": 0.142578125, "learning_rate": 5.472027972027972e-06, "loss": -0.0101, "max_completion_length": 256.0, "max_terminated_completion_length": 242.0, "mean_completion_length": 172.83929443359375, "mean_terminated_completion_length": 158.9791717529297, "min_completion_length": 100.0, "min_terminated_completion_length": 100.0, "num_tokens": 1543414.0, "reward": 0.9923837184906006, "reward_std": 0.019203493371605873, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.009319642558693886, "rewards/check_winston_local_func/std": 0.0026681311428546906, "rewards/sentence_count_match_reward_logic/mean": 0.9830641150474548, "rewards/sentence_count_match_reward_logic/std": 0.0420658104121685, "step": 313 }, { "clip_ratio": 0.0059569161385297775, "epoch": 0.10986703988803359, "grad_norm": 0.2557838768142569, "kl": 0.150390625, "learning_rate": 5.48951048951049e-06, "loss": -0.0108, "step": 314 }, { "clip_ratio": 0.008162682875990868, "epoch": 0.11021693491952414, "grad_norm": 0.2875058157402217, "kl": 0.1552734375, "learning_rate": 5.5069930069930074e-06, "loss": -0.0118, "step": 315 }, { "clip_ratio": 0.008753785863518715, "epoch": 0.11056682995101469, "grad_norm": 0.2260183326116466, "kl": 0.150390625, "learning_rate": 5.524475524475524e-06, "loss": -0.0125, "step": 316 }, { "clip_ratio": 0.004118381999433041, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.11091672498250525, "grad_norm": 0.328819146176066, "kl": 0.2333984375, "learning_rate": 5.541958041958042e-06, "loss": -0.0022, "max_completion_length": 256.0, "max_terminated_completion_length": 174.0, "mean_completion_length": 141.7857208251953, "mean_terminated_completion_length": 122.75, "min_completion_length": 60.0, "min_terminated_completion_length": 60.0, "num_tokens": 1562234.0, "reward": 1.0581145286560059, "reward_std": 0.10213746130466461, "rewards/check_gptzero_func/mean": 0.0714285746216774, "rewards/check_gptzero_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.014747214503586292, "rewards/check_winston_local_func/std": 0.01736227609217167, "rewards/sentence_count_match_reward_logic/mean": 0.9719387888908386, "rewards/sentence_count_match_reward_logic/std": 0.06341922283172607, "step": 317 }, { "clip_ratio": 0.004692512564361095, "epoch": 0.1112666200139958, "grad_norm": 0.31109781386054847, "kl": 0.251953125, "learning_rate": 5.5594405594405596e-06, "loss": -0.0026, "step": 318 }, { "clip_ratio": 0.0072699496522545815, "epoch": 0.11161651504548635, "grad_norm": 0.31356998249657214, "kl": 0.2373046875, "learning_rate": 5.576923076923077e-06, "loss": -0.0038, "step": 319 }, { "clip_ratio": 0.01124772522598505, "epoch": 0.11196641007697691, "grad_norm": 0.253193750365412, "kl": 0.251953125, "learning_rate": 5.594405594405595e-06, "loss": -0.005, "step": 320 }, { "clip_ratio": 0.0060139307752251625, "clipped_completions_ratio": 0.017857142857142905, "epoch": 0.11231630510846746, "grad_norm": 0.27782441064219016, "kl": 0.126953125, "learning_rate": 5.611888111888112e-06, "loss": 0.0213, "max_completion_length": 256.0, "max_terminated_completion_length": 209.0, "mean_completion_length": 144.9107208251953, "mean_terminated_completion_length": 142.89089965820312, "min_completion_length": 103.0, "min_terminated_completion_length": 103.0, "num_tokens": 1581005.0, "reward": 1.0848604440689087, "reward_std": 0.13412615656852722, "rewards/check_gptzero_func/mean": 0.0714285746216774, "rewards/check_gptzero_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.02363584004342556, "rewards/check_winston_local_func/std": 0.05926951393485069, "rewards/sentence_count_match_reward_logic/mean": 0.989795982837677, "rewards/sentence_count_match_reward_logic/std": 0.07636035978794098, "step": 321 }, { "clip_ratio": 0.004080915357917547, "epoch": 0.11266620013995801, "grad_norm": 0.2596496693239198, "kl": 0.1318359375, "learning_rate": 5.629370629370629e-06, "loss": 0.0209, "step": 322 }, { "clip_ratio": 0.009372314438223839, "epoch": 0.11301609517144856, "grad_norm": 0.26873317852727185, "kl": 0.134765625, "learning_rate": 5.646853146853147e-06, "loss": 0.0203, "step": 323 }, { "clip_ratio": 0.012432738207280636, "epoch": 0.11336599020293912, "grad_norm": 0.3017914670105751, "kl": 0.1318359375, "learning_rate": 5.664335664335665e-06, "loss": 0.0194, "step": 324 }, { "clip_ratio": 0.0024974108673632145, "clipped_completions_ratio": 0.0, "epoch": 0.11371588523442967, "grad_norm": 0.2641638145558472, "kl": 0.12109375, "learning_rate": 5.681818181818183e-06, "loss": -0.0038, "max_completion_length": 239.0, "max_terminated_completion_length": 239.0, "mean_completion_length": 155.73214721679688, "mean_terminated_completion_length": 155.73214721679688, "min_completion_length": 89.0, "min_terminated_completion_length": 89.0, "num_tokens": 1601078.0, "reward": 1.1428585052490234, "reward_std": 0.019869638606905937, "rewards/check_gptzero_func/mean": 0.1428571492433548, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.014287143014371395, "rewards/check_winston_local_func/std": 0.011751273646950722, "rewards/sentence_count_match_reward_logic/mean": 0.985714316368103, "rewards/sentence_count_match_reward_logic/std": 0.051974017173051834, "step": 325 }, { "clip_ratio": 0.0038719738367944956, "epoch": 0.11406578026592022, "grad_norm": 0.2475228409169693, "kl": 0.11962890625, "learning_rate": 5.699300699300699e-06, "loss": -0.0043, "step": 326 }, { "clip_ratio": 0.004854138009250164, "epoch": 0.11441567529741077, "grad_norm": 0.23015860355794118, "kl": 0.1279296875, "learning_rate": 5.716783216783217e-06, "loss": -0.0052, "step": 327 }, { "clip_ratio": 0.007583252154290676, "epoch": 0.11476557032890133, "grad_norm": 0.21737274634725764, "kl": 0.1318359375, "learning_rate": 5.7342657342657345e-06, "loss": -0.0063, "step": 328 }, { "clip_ratio": 0.004732684697955847, "clipped_completions_ratio": 0.0, "epoch": 0.11511546536039188, "grad_norm": 0.46509500546948596, "kl": 0.1533203125, "learning_rate": 5.751748251748253e-06, "loss": -0.0062, "max_completion_length": 189.0, "max_terminated_completion_length": 189.0, "mean_completion_length": 115.66072082519531, "mean_terminated_completion_length": 115.66072082519531, "min_completion_length": 50.0, "min_terminated_completion_length": 50.0, "num_tokens": 1616347.0, "reward": 1.0227348804473877, "reward_std": 0.09520234167575836, "rewards/check_gptzero_func/mean": 0.0357142873108387, "rewards/check_gptzero_func/std": 0.1872563362121582, "rewards/check_winston_local_func/mean": 0.012615785002708435, "rewards/check_winston_local_func/std": 0.006289024371653795, "rewards/sentence_count_match_reward_logic/mean": 0.9744047522544861, "rewards/sentence_count_match_reward_logic/std": 0.0635613352060318, "step": 329 }, { "clip_ratio": 0.007670692168176174, "epoch": 0.11546536039188243, "grad_norm": 0.3444629847185503, "kl": 0.162109375, "learning_rate": 5.769230769230769e-06, "loss": -0.0065, "step": 330 }, { "clip_ratio": 0.015010225586593151, "epoch": 0.11581525542337298, "grad_norm": 0.35797996086939843, "kl": 0.1640625, "learning_rate": 5.786713286713287e-06, "loss": -0.0076, "step": 331 }, { "clip_ratio": 0.017973478883504868, "epoch": 0.11616515045486354, "grad_norm": 0.40840428399295603, "kl": 0.166015625, "learning_rate": 5.804195804195804e-06, "loss": -0.0084, "step": 332 }, { "clip_ratio": 0.0033016053494066, "clipped_completions_ratio": 0.0, "epoch": 0.1165150454863541, "grad_norm": 0.25927612685254964, "kl": 0.1826171875, "learning_rate": 5.821678321678323e-06, "loss": 0.0019, "max_completion_length": 208.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 145.875, "mean_terminated_completion_length": 145.875, "min_completion_length": 92.0, "min_terminated_completion_length": 92.0, "num_tokens": 1635132.0, "reward": 1.1585677862167358, "reward_std": 0.08567550033330917, "rewards/check_gptzero_func/mean": 0.0892857164144516, "rewards/check_gptzero_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.07183294743299484, "rewards/check_winston_local_func/std": 0.14698566496372223, "rewards/sentence_count_match_reward_logic/mean": 0.9974489808082581, "rewards/sentence_count_match_reward_logic/std": 0.019090088084340096, "step": 333 }, { "clip_ratio": 0.004250450991094112, "epoch": 0.11686494051784464, "grad_norm": 0.24801725933283278, "kl": 0.1611328125, "learning_rate": 5.83916083916084e-06, "loss": 0.0009, "step": 334 }, { "clip_ratio": 0.004495074041187763, "epoch": 0.1172148355493352, "grad_norm": 0.22906915186341498, "kl": 0.1611328125, "learning_rate": 5.856643356643356e-06, "loss": 0.0007, "step": 335 }, { "clip_ratio": 0.008183440193533897, "epoch": 0.11756473058082575, "grad_norm": 0.2241194468013796, "kl": 0.162109375, "learning_rate": 5.874125874125874e-06, "loss": -0.0, "step": 336 }, { "clip_ratio": 0.0029595638625323772, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.1179146256123163, "grad_norm": 0.23903574309270553, "kl": 0.10693359375, "learning_rate": 5.8916083916083925e-06, "loss": 0.0007, "max_completion_length": 256.0, "max_terminated_completion_length": 214.0, "mean_completion_length": 178.5357208251953, "mean_terminated_completion_length": 165.625, "min_completion_length": 98.0, "min_terminated_completion_length": 98.0, "num_tokens": 1658226.0, "reward": 1.2789238691329956, "reward_std": 0.17707088589668274, "rewards/check_gptzero_func/mean": 0.3035714328289032, "rewards/check_gptzero_func/std": 0.4639608860015869, "rewards/check_winston_local_func/mean": 0.018755214288830757, "rewards/check_winston_local_func/std": 0.02580123022198677, "rewards/sentence_count_match_reward_logic/mean": 0.9565972089767456, "rewards/sentence_count_match_reward_logic/std": 0.07685520499944687, "step": 337 }, { "clip_ratio": 0.0031262647826224566, "epoch": 0.11826452064380685, "grad_norm": 0.20898596015736498, "kl": 0.109375, "learning_rate": 5.90909090909091e-06, "loss": 0.0003, "step": 338 }, { "clip_ratio": 0.003567679785192013, "epoch": 0.11861441567529742, "grad_norm": 0.20598603283907863, "kl": 0.10986328125, "learning_rate": 5.926573426573428e-06, "loss": -0.0001, "step": 339 }, { "clip_ratio": 0.005549075547605753, "epoch": 0.11896431070678797, "grad_norm": 0.17909635487820905, "kl": 0.1201171875, "learning_rate": 5.944055944055944e-06, "loss": -0.0006, "step": 340 }, { "clip_ratio": 0.002884205197915435, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.11931420573827851, "grad_norm": 0.24160627495226425, "kl": 0.1640625, "learning_rate": 5.961538461538462e-06, "loss": -0.0014, "max_completion_length": 256.0, "max_terminated_completion_length": 213.0, "mean_completion_length": 167.42857360839844, "mean_terminated_completion_length": 152.6666717529297, "min_completion_length": 109.0, "min_terminated_completion_length": 109.0, "num_tokens": 1679482.0, "reward": 1.1415669918060303, "reward_std": 0.10392527282238007, "rewards/check_gptzero_func/mean": 0.0714285746216774, "rewards/check_gptzero_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.08873958885669708, "rewards/check_winston_local_func/std": 0.1724359542131424, "rewards/sentence_count_match_reward_logic/mean": 0.9813987612724304, "rewards/sentence_count_match_reward_logic/std": 0.046275172382593155, "step": 341 }, { "clip_ratio": 0.0031552130822092295, "epoch": 0.11966410076976906, "grad_norm": 0.22528129216066542, "kl": 0.1591796875, "learning_rate": 5.97902097902098e-06, "loss": -0.0017, "step": 342 }, { "clip_ratio": 0.004542023874819279, "epoch": 0.12001399580125963, "grad_norm": 0.22081335406487326, "kl": 0.162109375, "learning_rate": 5.996503496503498e-06, "loss": -0.0022, "step": 343 }, { "clip_ratio": 0.00803966075181961, "epoch": 0.12036389083275018, "grad_norm": 0.22959922943868216, "kl": 0.169921875, "learning_rate": 6.013986013986014e-06, "loss": -0.003, "step": 344 }, { "clip_ratio": 0.0028484624344855547, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.12071378586424072, "grad_norm": 0.194735792449192, "kl": 0.1083984375, "learning_rate": 6.031468531468532e-06, "loss": 0.0051, "max_completion_length": 256.0, "max_terminated_completion_length": 218.0, "mean_completion_length": 194.5357208251953, "mean_terminated_completion_length": 169.9499969482422, "min_completion_length": 147.0, "min_terminated_completion_length": 147.0, "num_tokens": 1704360.0, "reward": 1.2045162916183472, "reward_std": 0.1732185333967209, "rewards/check_gptzero_func/mean": 0.1607142835855484, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.0875462144613266, "rewards/check_winston_local_func/std": 0.19957998394966125, "rewards/sentence_count_match_reward_logic/mean": 0.9562557935714722, "rewards/sentence_count_match_reward_logic/std": 0.09781955182552338, "step": 345 }, { "clip_ratio": 0.0027786078862845898, "epoch": 0.12106368089573127, "grad_norm": 0.19655582225040164, "kl": 0.10791015625, "learning_rate": 6.04895104895105e-06, "loss": 0.0046, "step": 346 }, { "clip_ratio": 0.0031542566139250994, "epoch": 0.12141357592722184, "grad_norm": 0.1940368367082629, "kl": 0.107421875, "learning_rate": 6.0664335664335674e-06, "loss": 0.0042, "step": 347 }, { "clip_ratio": 0.003605904523283243, "epoch": 0.12176347095871239, "grad_norm": 0.1812021387087795, "kl": 0.09814453125, "learning_rate": 6.083916083916085e-06, "loss": 0.0034, "step": 348 }, { "clip_ratio": 0.004136413335800171, "clipped_completions_ratio": 0.0, "epoch": 0.12211336599020294, "grad_norm": 0.2808019919571069, "kl": 0.12890625, "learning_rate": 6.101398601398602e-06, "loss": 0.0051, "max_completion_length": 218.0, "max_terminated_completion_length": 218.0, "mean_completion_length": 130.73214721679688, "mean_terminated_completion_length": 130.73214721679688, "min_completion_length": 77.0, "min_terminated_completion_length": 77.0, "num_tokens": 1721273.0, "reward": 1.1649794578552246, "reward_std": 0.07313619554042816, "rewards/check_gptzero_func/mean": 0.125, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.04890790954232216, "rewards/check_winston_local_func/std": 0.11681642383337021, "rewards/sentence_count_match_reward_logic/mean": 0.9910714030265808, "rewards/sentence_count_match_reward_logic/std": 0.037867967039346695, "step": 349 }, { "clip_ratio": 0.005468006245791912, "epoch": 0.1224632610216935, "grad_norm": 0.2781792351391594, "kl": 0.12890625, "learning_rate": 6.1188811188811196e-06, "loss": 0.0044, "step": 350 }, { "clip_ratio": 0.0066650426015257835, "epoch": 0.12281315605318405, "grad_norm": 0.25307519013103696, "kl": 0.1328125, "learning_rate": 6.136363636363637e-06, "loss": 0.0035, "step": 351 }, { "clip_ratio": 0.011933539994060993, "epoch": 0.1231630510846746, "grad_norm": 0.23225086662179908, "kl": 0.140625, "learning_rate": 6.153846153846155e-06, "loss": 0.0021, "step": 352 }, { "clip_ratio": 0.00393474567681551, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.12351294611616515, "grad_norm": 0.33708725831917646, "kl": 0.19921875, "learning_rate": 6.171328671328672e-06, "loss": -0.0012, "max_completion_length": 256.0, "max_terminated_completion_length": 175.0, "mean_completion_length": 147.8928680419922, "mean_terminated_completion_length": 129.875, "min_completion_length": 76.0, "min_terminated_completion_length": 76.0, "num_tokens": 1740507.0, "reward": 1.2807575464248657, "reward_std": 0.17910964787006378, "rewards/check_gptzero_func/mean": 0.2142857164144516, "rewards/check_gptzero_func/std": 0.4140393137931824, "rewards/check_winston_local_func/mean": 0.07837658375501633, "rewards/check_winston_local_func/std": 0.13983504474163055, "rewards/sentence_count_match_reward_logic/mean": 0.988095223903656, "rewards/sentence_count_match_reward_logic/std": 0.04331168904900551, "step": 353 }, { "clip_ratio": 0.005851727910339832, "epoch": 0.12386284114765571, "grad_norm": 0.3457983120273169, "kl": 0.201171875, "learning_rate": 6.188811188811189e-06, "loss": -0.0016, "step": 354 }, { "clip_ratio": 0.009467205964028835, "epoch": 0.12421273617914626, "grad_norm": 0.3658544509841302, "kl": 0.1884765625, "learning_rate": 6.206293706293707e-06, "loss": -0.002, "step": 355 }, { "clip_ratio": 0.013148047961294651, "epoch": 0.1245626312106368, "grad_norm": 0.2203982938012038, "kl": 0.1943359375, "learning_rate": 6.223776223776225e-06, "loss": -0.0034, "step": 356 }, { "clip_ratio": 0.0020805743988603354, "clipped_completions_ratio": 0.0, "epoch": 0.12491252624212736, "grad_norm": 0.23494327768793585, "kl": 0.13671875, "learning_rate": 6.241258741258742e-06, "loss": 0.0002, "max_completion_length": 251.0, "max_terminated_completion_length": 251.0, "mean_completion_length": 174.58929443359375, "mean_terminated_completion_length": 174.58929443359375, "min_completion_length": 112.0, "min_terminated_completion_length": 112.0, "num_tokens": 1762460.0, "reward": 1.06517493724823, "reward_std": 0.03623374551534653, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.07027687132358551, "rewards/check_winston_local_func/std": 0.12391314655542374, "rewards/sentence_count_match_reward_logic/mean": 0.9948979616165161, "rewards/sentence_count_match_reward_logic/std": 0.026750903576612473, "step": 357 }, { "clip_ratio": 0.002787985373288393, "epoch": 0.12526242127361792, "grad_norm": 0.21895288214893574, "kl": 0.1376953125, "learning_rate": 6.258741258741259e-06, "loss": -0.0001, "step": 358 }, { "clip_ratio": 0.005539965815842152, "epoch": 0.12561231630510847, "grad_norm": 0.21766990106685427, "kl": 0.1435546875, "learning_rate": 6.276223776223777e-06, "loss": -0.0012, "step": 359 }, { "clip_ratio": 0.008504088968038559, "epoch": 0.12596221133659902, "grad_norm": 0.23691570173574783, "kl": 0.1474609375, "learning_rate": 6.2937062937062944e-06, "loss": -0.002, "step": 360 }, { "clip_ratio": 0.004019828513264656, "clipped_completions_ratio": 0.0, "epoch": 0.12631210636808957, "grad_norm": 0.29406555756716346, "kl": 0.1962890625, "learning_rate": 6.311188811188812e-06, "loss": 0.0027, "max_completion_length": 219.0, "max_terminated_completion_length": 219.0, "mean_completion_length": 163.05357360839844, "mean_terminated_completion_length": 163.05357360839844, "min_completion_length": 117.0, "min_terminated_completion_length": 117.0, "num_tokens": 1783335.0, "reward": 0.9826167821884155, "reward_std": 0.03610057756304741, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.013143946416676044, "rewards/check_winston_local_func/std": 0.01701406203210354, "rewards/sentence_count_match_reward_logic/mean": 0.9694727659225464, "rewards/sentence_count_match_reward_logic/std": 0.06642366200685501, "step": 361 }, { "clip_ratio": 0.003836485091596842, "epoch": 0.12666200139958012, "grad_norm": 0.3233202860104344, "kl": 0.1689453125, "learning_rate": 6.32867132867133e-06, "loss": 0.0018, "step": 362 }, { "clip_ratio": 0.005621118005365133, "epoch": 0.1270118964310707, "grad_norm": 0.23712111917134054, "kl": 0.1728515625, "learning_rate": 6.3461538461538466e-06, "loss": 0.0009, "step": 363 }, { "clip_ratio": 0.007987242192029953, "epoch": 0.12736179146256124, "grad_norm": 0.23614915194722474, "kl": 0.1806640625, "learning_rate": 6.363636363636364e-06, "loss": -0.0003, "step": 364 }, { "clip_ratio": 0.002567132469266653, "clipped_completions_ratio": 0.0, "epoch": 0.1277116864940518, "grad_norm": 0.3628732944236113, "kl": 0.1611328125, "learning_rate": 6.381118881118882e-06, "loss": -0.0027, "max_completion_length": 183.0, "max_terminated_completion_length": 183.0, "mean_completion_length": 124.25000762939453, "mean_terminated_completion_length": 124.25000762939453, "min_completion_length": 75.0, "min_terminated_completion_length": 75.0, "num_tokens": 1799437.0, "reward": 1.0280895233154297, "reward_std": 0.05210889130830765, "rewards/check_gptzero_func/mean": 0.01785714365541935, "rewards/check_gptzero_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.010232357308268547, "rewards/check_winston_local_func/std": 0.003483328502625227, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 365 }, { "clip_ratio": 0.007052411325275898, "epoch": 0.12806158152554234, "grad_norm": 0.2916309399670458, "kl": 0.1728515625, "learning_rate": 6.3986013986013996e-06, "loss": -0.0036, "step": 366 }, { "clip_ratio": 0.011214635334908962, "epoch": 0.1284114765570329, "grad_norm": 0.3170460616509015, "kl": 0.1796875, "learning_rate": 6.416083916083916e-06, "loss": -0.0047, "step": 367 }, { "clip_ratio": 0.01858406886458397, "epoch": 0.12876137158852344, "grad_norm": 0.3098186086438865, "kl": 0.1923828125, "learning_rate": 6.433566433566434e-06, "loss": -0.0058, "step": 368 }, { "clip_ratio": 0.002858559601008892, "clipped_completions_ratio": 0.4285714285714286, "epoch": 0.129111266620014, "grad_norm": 0.30237163998673805, "kl": 0.193359375, "learning_rate": 6.451048951048952e-06, "loss": 0.0075, "max_completion_length": 256.0, "max_terminated_completion_length": 183.0, "mean_completion_length": 203.82144165039062, "mean_terminated_completion_length": 164.6875, "min_completion_length": 129.0, "min_terminated_completion_length": 129.0, "num_tokens": 1827587.0, "reward": 1.425007700920105, "reward_std": 0.16389310359954834, "rewards/check_gptzero_func/mean": 0.4285714328289032, "rewards/check_gptzero_func/std": 0.4993502199649811, "rewards/check_winston_local_func/mean": 0.0911782756447792, "rewards/check_winston_local_func/std": 0.15100520849227905, "rewards/sentence_count_match_reward_logic/mean": 0.9052578806877136, "rewards/sentence_count_match_reward_logic/std": 0.1231311708688736, "step": 369 }, { "clip_ratio": 0.004952060524374247, "epoch": 0.12946116165150454, "grad_norm": 0.3040489839006338, "kl": 0.193359375, "learning_rate": 6.468531468531469e-06, "loss": 0.0066, "step": 370 }, { "clip_ratio": 0.006575216073542833, "epoch": 0.1298110566829951, "grad_norm": 0.2414477081454303, "kl": 0.166015625, "learning_rate": 6.486013986013987e-06, "loss": 0.0057, "step": 371 }, { "clip_ratio": 0.007013890892267227, "epoch": 0.13016095171448566, "grad_norm": 0.2745495556783455, "kl": 0.15625, "learning_rate": 6.503496503496504e-06, "loss": 0.0051, "step": 372 }, { "clip_ratio": 0.003695516847074032, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.1305108467459762, "grad_norm": 0.3039379699043927, "kl": 0.1826171875, "learning_rate": 6.5209790209790215e-06, "loss": 0.0029, "max_completion_length": 256.0, "max_terminated_completion_length": 202.0, "mean_completion_length": 186.57144165039062, "mean_terminated_completion_length": 158.8000030517578, "min_completion_length": 100.0, "min_terminated_completion_length": 100.0, "num_tokens": 1852531.0, "reward": 0.967571496963501, "reward_std": 0.027977975085377693, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.04222417622804642, "rewards/check_winston_local_func/std": 0.07533858716487885, "rewards/sentence_count_match_reward_logic/mean": 0.9253472089767456, "rewards/sentence_count_match_reward_logic/std": 0.1564539670944214, "step": 373 }, { "clip_ratio": 0.004367392510175705, "epoch": 0.13086074177746676, "grad_norm": 0.2415173782561947, "kl": 0.19140625, "learning_rate": 6.538461538461539e-06, "loss": 0.0032, "step": 374 }, { "clip_ratio": 0.006423503626137972, "epoch": 0.1312106368089573, "grad_norm": 0.24285453007232824, "kl": 0.2041015625, "learning_rate": 6.555944055944057e-06, "loss": 0.0018, "step": 375 }, { "clip_ratio": 0.0104123679921031, "epoch": 0.13156053184044786, "grad_norm": 0.22153810176441516, "kl": 0.205078125, "learning_rate": 6.573426573426574e-06, "loss": 0.0009, "step": 376 }, { "clip_ratio": 0.002908313414081931, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.1319104268719384, "grad_norm": 0.2507799704414443, "kl": 0.1533203125, "learning_rate": 6.590909090909091e-06, "loss": 0.0021, "max_completion_length": 256.0, "max_terminated_completion_length": 186.0, "mean_completion_length": 167.67857360839844, "mean_terminated_completion_length": 152.95834350585938, "min_completion_length": 103.0, "min_terminated_completion_length": 103.0, "num_tokens": 1874145.0, "reward": 1.2233426570892334, "reward_std": 0.08344338834285736, "rewards/check_gptzero_func/mean": 0.1607142835855484, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.08558756858110428, "rewards/check_winston_local_func/std": 0.19206155836582184, "rewards/sentence_count_match_reward_logic/mean": 0.9770408272743225, "rewards/sentence_count_match_reward_logic/std": 0.05294156074523926, "step": 377 }, { "clip_ratio": 0.0036824687849730253, "epoch": 0.13226032190342898, "grad_norm": 0.24576316874565252, "kl": 0.15625, "learning_rate": 6.608391608391609e-06, "loss": 0.0011, "step": 378 }, { "clip_ratio": 0.004960486199706793, "epoch": 0.13261021693491953, "grad_norm": 0.2188749475108505, "kl": 0.158203125, "learning_rate": 6.6258741258741266e-06, "loss": 0.0006, "step": 379 }, { "clip_ratio": 0.009436473250389099, "epoch": 0.13296011196641008, "grad_norm": 0.21620512092593397, "kl": 0.15625, "learning_rate": 6.643356643356644e-06, "loss": -0.0005, "step": 380 }, { "clip_ratio": 0.0035449594724923372, "clipped_completions_ratio": 0.0, "epoch": 0.13331000699790063, "grad_norm": 0.44324968350063226, "kl": 0.248046875, "learning_rate": 6.660839160839161e-06, "loss": -0.0013, "max_completion_length": 206.0, "max_terminated_completion_length": 206.0, "mean_completion_length": 149.9107208251953, "mean_terminated_completion_length": 149.9107208251953, "min_completion_length": 86.0, "min_terminated_completion_length": 86.0, "num_tokens": 1893508.0, "reward": 1.1991337537765503, "reward_std": 0.04497756436467171, "rewards/check_gptzero_func/mean": 0.1428571492433548, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.10984808951616287, "rewards/check_winston_local_func/std": 0.1860978901386261, "rewards/sentence_count_match_reward_logic/mean": 0.9464285969734192, "rewards/sentence_count_match_reward_logic/std": 0.09591212123632431, "step": 381 }, { "clip_ratio": 0.004789404571056366, "epoch": 0.13365990202939118, "grad_norm": 0.361069549343389, "kl": 0.2255859375, "learning_rate": 6.678321678321679e-06, "loss": -0.0028, "step": 382 }, { "clip_ratio": 0.011264834553003311, "epoch": 0.13400979706088173, "grad_norm": 0.3078977234005532, "kl": 0.2294921875, "learning_rate": 6.695804195804196e-06, "loss": -0.0048, "step": 383 }, { "clip_ratio": 0.019065752625465393, "epoch": 0.13435969209237228, "grad_norm": 0.2897460697750455, "kl": 0.232421875, "learning_rate": 6.713286713286714e-06, "loss": -0.0068, "step": 384 }, { "clip_ratio": 0.003903700038790703, "clipped_completions_ratio": 0.0, "epoch": 0.13470958712386283, "grad_norm": 0.6666051211404664, "kl": 0.40625, "learning_rate": 6.730769230769232e-06, "loss": 0.0095, "max_completion_length": 181.0, "max_terminated_completion_length": 181.0, "mean_completion_length": 129.48214721679688, "mean_terminated_completion_length": 129.48214721679688, "min_completion_length": 91.0, "min_terminated_completion_length": 91.0, "num_tokens": 1910383.0, "reward": 1.4329015016555786, "reward_std": 0.27741652727127075, "rewards/check_gptzero_func/mean": 0.25, "rewards/check_gptzero_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.19183000922203064, "rewards/check_winston_local_func/std": 0.2562735974788666, "rewards/sentence_count_match_reward_logic/mean": 0.9910714030265808, "rewards/sentence_count_match_reward_logic/std": 0.03786797076463699, "step": 385 }, { "clip_ratio": 0.010489290580153465, "epoch": 0.1350594821553534, "grad_norm": 0.47986458221626194, "kl": 0.28515625, "learning_rate": 6.7482517482517485e-06, "loss": 0.0079, "step": 386 }, { "clip_ratio": 0.01635574735701084, "epoch": 0.13540937718684395, "grad_norm": 0.6501487079642534, "kl": 0.255859375, "learning_rate": 6.765734265734266e-06, "loss": 0.0059, "step": 387 }, { "clip_ratio": 0.021620728075504303, "epoch": 0.1357592722183345, "grad_norm": 0.3910520242365424, "kl": 0.28515625, "learning_rate": 6.783216783216784e-06, "loss": 0.0036, "step": 388 }, { "clip_ratio": 0.002941790036857128, "clipped_completions_ratio": 0.0, "epoch": 0.13610916724982505, "grad_norm": 0.30996572984390636, "kl": 0.265625, "learning_rate": 6.8006993006993015e-06, "loss": 0.0021, "max_completion_length": 242.0, "max_terminated_completion_length": 242.0, "mean_completion_length": 149.71429443359375, "mean_terminated_completion_length": 149.71429443359375, "min_completion_length": 104.0, "min_terminated_completion_length": 104.0, "num_tokens": 1929359.0, "reward": 1.0046762228012085, "reward_std": 0.06402607262134552, "rewards/check_gptzero_func/mean": 0.01785714365541935, "rewards/check_gptzero_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.021116072311997414, "rewards/check_winston_local_func/std": 0.026203790679574013, "rewards/sentence_count_match_reward_logic/mean": 0.9657029509544373, "rewards/sentence_count_match_reward_logic/std": 0.05781049281358719, "step": 389 }, { "clip_ratio": 0.007369850296527147, "epoch": 0.1364590622813156, "grad_norm": 0.4833266106807161, "kl": 0.365234375, "learning_rate": 6.818181818181818e-06, "loss": 0.0025, "step": 390 }, { "clip_ratio": 0.01263163611292839, "epoch": 0.13680895731280615, "grad_norm": 0.5217130320815374, "kl": 0.369140625, "learning_rate": 6.835664335664336e-06, "loss": 0.0013, "step": 391 }, { "clip_ratio": 0.01598994992673397, "epoch": 0.1371588523442967, "grad_norm": 0.28796214781721263, "kl": 0.296875, "learning_rate": 6.853146853146854e-06, "loss": -0.0012, "step": 392 }, { "clip_ratio": 0.0033249312546104193, "clipped_completions_ratio": 0.1071428571428571, "epoch": 0.13750874737578728, "grad_norm": 0.3070801050816017, "kl": 0.1494140625, "learning_rate": 6.870629370629371e-06, "loss": 0.0011, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 163.94644165039062, "mean_terminated_completion_length": 152.89999389648438, "min_completion_length": 99.0, "min_terminated_completion_length": 99.0, "num_tokens": 1950060.0, "reward": 1.2415622472763062, "reward_std": 0.211212620139122, "rewards/check_gptzero_func/mean": 0.1071428582072258, "rewards/check_gptzero_func/std": 0.3120938837528229, "rewards/check_winston_local_func/mean": 0.16343717277050018, "rewards/check_winston_local_func/std": 0.2218257188796997, "rewards/sentence_count_match_reward_logic/mean": 0.9709821343421936, "rewards/sentence_count_match_reward_logic/std": 0.06451041996479034, "step": 393 }, { "clip_ratio": 0.005004371982067823, "epoch": 0.13785864240727783, "grad_norm": 0.41870097045946464, "kl": 0.138671875, "learning_rate": 6.888111888111889e-06, "loss": 0.0007, "step": 394 }, { "clip_ratio": 0.0058074635453522205, "epoch": 0.13820853743876838, "grad_norm": 0.4227570202121935, "kl": 0.1396484375, "learning_rate": 6.905594405594406e-06, "loss": -0.0003, "step": 395 }, { "clip_ratio": 0.013781893067061901, "epoch": 0.13855843247025892, "grad_norm": 0.27267468875279155, "kl": 0.1572265625, "learning_rate": 6.923076923076923e-06, "loss": -0.0017, "step": 396 }, { "clip_ratio": 0.00439003249630332, "clipped_completions_ratio": 0.0, "epoch": 0.13890832750174947, "grad_norm": 0.39809033616857836, "kl": 0.220703125, "learning_rate": 6.940559440559441e-06, "loss": -0.009, "max_completion_length": 208.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 130.625, "mean_terminated_completion_length": 130.625, "min_completion_length": 76.0, "min_terminated_completion_length": 76.0, "num_tokens": 1967151.0, "reward": 1.3787137269973755, "reward_std": 0.07869037240743637, "rewards/check_gptzero_func/mean": 0.2857142984867096, "rewards/check_gptzero_func/std": 0.4558423161506653, "rewards/check_winston_local_func/mean": 0.15694482624530792, "rewards/check_winston_local_func/std": 0.2126137763261795, "rewards/sentence_count_match_reward_logic/mean": 0.9360544085502625, "rewards/sentence_count_match_reward_logic/std": 0.1131039410829544, "step": 397 }, { "clip_ratio": 0.010137631557881832, "epoch": 0.13925822253324002, "grad_norm": 0.3267279978558415, "kl": 0.2431640625, "learning_rate": 6.958041958041959e-06, "loss": -0.0097, "step": 398 }, { "clip_ratio": 0.01778491400182247, "epoch": 0.13960811756473057, "grad_norm": 0.4236170396521425, "kl": 0.279296875, "learning_rate": 6.975524475524476e-06, "loss": -0.0114, "step": 399 }, { "clip_ratio": 0.020727455615997314, "epoch": 0.13995801259622112, "grad_norm": 0.3094908082677138, "kl": 0.271484375, "learning_rate": 6.993006993006993e-06, "loss": -0.0128, "step": 400 }, { "clip_ratio": 0.0034087803214788437, "clipped_completions_ratio": 0.0, "epoch": 0.1403079076277117, "grad_norm": 1.6449970148186048, "kl": 0.6953125, "learning_rate": 7.010489510489511e-06, "loss": 0.0067, "max_completion_length": 200.0, "max_terminated_completion_length": 200.0, "mean_completion_length": 157.10714721679688, "mean_terminated_completion_length": 157.10714721679688, "min_completion_length": 69.0, "min_terminated_completion_length": 69.0, "num_tokens": 1987229.0, "reward": 1.2786340713500977, "reward_std": 0.08475779742002487, "rewards/check_gptzero_func/mean": 0.2321428507566452, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.04649121314287186, "rewards/check_winston_local_func/std": 0.09124040603637695, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 401 }, { "clip_ratio": 0.007134762592613697, "epoch": 0.14065780265920225, "grad_norm": 1.4139273969244768, "kl": 0.2470703125, "learning_rate": 7.0279720279720285e-06, "loss": 0.0056, "step": 402 }, { "clip_ratio": 0.009336418472230434, "epoch": 0.1410076976906928, "grad_norm": 1.0694087756130979, "kl": 0.244140625, "learning_rate": 7.045454545454546e-06, "loss": 0.0043, "step": 403 }, { "clip_ratio": 0.01271090004593134, "epoch": 0.14135759272218335, "grad_norm": 1.3757064436037698, "kl": 0.5, "learning_rate": 7.062937062937063e-06, "loss": 0.0026, "step": 404 }, { "clip_ratio": 0.0034136853646486998, "clipped_completions_ratio": 0.017857142857142905, "epoch": 0.1417074877536739, "grad_norm": 0.28154528950615026, "kl": 0.1630859375, "learning_rate": 7.080419580419581e-06, "loss": -0.0014, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 183.85714721679688, "mean_terminated_completion_length": 182.5454559326172, "min_completion_length": 120.0, "min_terminated_completion_length": 120.0, "num_tokens": 2010005.0, "reward": 1.1104462146759033, "reward_std": 0.15188869833946228, "rewards/check_gptzero_func/mean": 0.0892857164144516, "rewards/check_gptzero_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.021160533651709557, "rewards/check_winston_local_func/std": 0.053583063185214996, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 405 }, { "clip_ratio": 0.004575788974761963, "epoch": 0.14205738278516444, "grad_norm": 0.2369928911229188, "kl": 0.1591796875, "learning_rate": 7.097902097902098e-06, "loss": -0.0016, "step": 406 }, { "clip_ratio": 0.010158679448068142, "epoch": 0.142407277816655, "grad_norm": 0.28037699305420705, "kl": 0.1591796875, "learning_rate": 7.115384615384616e-06, "loss": -0.0024, "step": 407 }, { "clip_ratio": 0.015094381757080555, "epoch": 0.14275717284814557, "grad_norm": 0.3093623238566984, "kl": 0.162109375, "learning_rate": 7.132867132867134e-06, "loss": -0.003, "step": 408 }, { "clip_ratio": 0.0029624789021909237, "clipped_completions_ratio": 0.0, "epoch": 0.14310706787963612, "grad_norm": 0.3749993832058073, "kl": 0.2021484375, "learning_rate": 7.15034965034965e-06, "loss": -0.0033, "max_completion_length": 242.0, "max_terminated_completion_length": 242.0, "mean_completion_length": 148.3928680419922, "mean_terminated_completion_length": 148.3928680419922, "min_completion_length": 65.0, "min_terminated_completion_length": 65.0, "num_tokens": 2028595.0, "reward": 1.354546070098877, "reward_std": 0.20188185572624207, "rewards/check_gptzero_func/mean": 0.1071428582072258, "rewards/check_gptzero_func/std": 0.3120938837528229, "rewards/check_winston_local_func/mean": 0.2578198313713074, "rewards/check_winston_local_func/std": 0.3032528758049011, "rewards/sentence_count_match_reward_logic/mean": 0.9895833134651184, "rewards/sentence_count_match_reward_logic/std": 0.04512188956141472, "step": 409 }, { "clip_ratio": 0.0029767476953566074, "epoch": 0.14345696291112667, "grad_norm": 0.35126021510552485, "kl": 0.1962890625, "learning_rate": 7.167832167832168e-06, "loss": -0.004, "step": 410 }, { "clip_ratio": 0.009162528440356255, "epoch": 0.14380685794261722, "grad_norm": 0.2820024657710274, "kl": 0.2060546875, "learning_rate": 7.185314685314686e-06, "loss": -0.0055, "step": 411 }, { "clip_ratio": 0.01563202403485775, "epoch": 0.14415675297410777, "grad_norm": 0.2787608508239506, "kl": 0.232421875, "learning_rate": 7.202797202797203e-06, "loss": -0.0071, "step": 412 }, { "clip_ratio": 0.0017465607961639762, "clipped_completions_ratio": 0.0, "epoch": 0.14450664800559831, "grad_norm": 0.300838913463713, "kl": 0.21484375, "learning_rate": 7.22027972027972e-06, "loss": -0.0011, "max_completion_length": 216.0, "max_terminated_completion_length": 216.0, "mean_completion_length": 164.25, "mean_terminated_completion_length": 164.25, "min_completion_length": 78.0, "min_terminated_completion_length": 78.0, "num_tokens": 2049265.0, "reward": 1.3911216259002686, "reward_std": 0.026732662692666054, "rewards/check_gptzero_func/mean": 0.2857142984867096, "rewards/check_gptzero_func/std": 0.4558423161506653, "rewards/check_winston_local_func/mean": 0.10540729016065598, "rewards/check_winston_local_func/std": 0.15132766962051392, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 413 }, { "clip_ratio": 0.003323977580294013, "epoch": 0.14485654303708886, "grad_norm": 0.28791615335676096, "kl": 0.201171875, "learning_rate": 7.237762237762238e-06, "loss": -0.0015, "step": 414 }, { "clip_ratio": 0.0061817714013159275, "epoch": 0.1452064380685794, "grad_norm": 0.24613729388432493, "kl": 0.203125, "learning_rate": 7.2552447552447555e-06, "loss": -0.0031, "step": 415 }, { "clip_ratio": 0.012667994014918804, "epoch": 0.14555633310007, "grad_norm": 0.2233322654723996, "kl": 0.20703125, "learning_rate": 7.272727272727273e-06, "loss": -0.004, "step": 416 }, { "clip_ratio": 0.002191821811720729, "clipped_completions_ratio": 0.1964285714285714, "epoch": 0.14590622813156054, "grad_norm": 0.24205785775567884, "kl": 0.1953125, "learning_rate": 7.290209790209791e-06, "loss": 0.0061, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 182.19644165039062, "mean_terminated_completion_length": 164.1555633544922, "min_completion_length": 121.0, "min_terminated_completion_length": 121.0, "num_tokens": 2071820.0, "reward": 1.19413161277771, "reward_std": 0.07091901451349258, "rewards/check_gptzero_func/mean": 0.1607142835855484, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.033417265862226486, "rewards/check_winston_local_func/std": 0.07166556268930435, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 417 }, { "clip_ratio": 0.0033148364163935184, "epoch": 0.1462561231630511, "grad_norm": 0.22669195425201621, "kl": 0.1875, "learning_rate": 7.307692307692308e-06, "loss": 0.0051, "step": 418 }, { "clip_ratio": 0.0039189839735627174, "epoch": 0.14660601819454164, "grad_norm": 0.21417309682923097, "kl": 0.1826171875, "learning_rate": 7.325174825174825e-06, "loss": 0.0046, "step": 419 }, { "clip_ratio": 0.007146198768168688, "epoch": 0.1469559132260322, "grad_norm": 0.19848378122617022, "kl": 0.1787109375, "learning_rate": 7.342657342657343e-06, "loss": 0.0036, "step": 420 }, { "clip_ratio": 0.004733151290565729, "clipped_completions_ratio": 0.0, "epoch": 0.14730580825752274, "grad_norm": 0.3633428057833971, "kl": 0.255859375, "learning_rate": 7.360139860139861e-06, "loss": 0.0008, "max_completion_length": 232.0, "max_terminated_completion_length": 232.0, "mean_completion_length": 152.21429443359375, "mean_terminated_completion_length": 152.21429443359375, "min_completion_length": 92.0, "min_terminated_completion_length": 92.0, "num_tokens": 2091160.0, "reward": 1.1390231847763062, "reward_std": 0.1827010214328766, "rewards/check_gptzero_func/mean": 0.0714285746216774, "rewards/check_gptzero_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.07354684174060822, "rewards/check_winston_local_func/std": 0.09356772154569626, "rewards/sentence_count_match_reward_logic/mean": 0.9940475821495056, "rewards/sentence_count_match_reward_logic/std": 0.02524530701339245, "step": 421 }, { "clip_ratio": 0.005851077847182751, "epoch": 0.14765570328901328, "grad_norm": 0.344222696571183, "kl": 0.26171875, "learning_rate": 7.377622377622379e-06, "loss": -0.0003, "step": 422 }, { "clip_ratio": 0.011133468709886074, "epoch": 0.14800559832050386, "grad_norm": 0.3269080549201767, "kl": 0.283203125, "learning_rate": 7.395104895104895e-06, "loss": -0.0023, "step": 423 }, { "clip_ratio": 0.020120084285736084, "epoch": 0.1483554933519944, "grad_norm": 0.29158771998949473, "kl": 0.298828125, "learning_rate": 7.412587412587413e-06, "loss": -0.0032, "step": 424 }, { "clip_ratio": 0.0030941003933548927, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.14870538838348496, "grad_norm": 0.2975992797229105, "kl": 0.224609375, "learning_rate": 7.43006993006993e-06, "loss": 0.0004, "max_completion_length": 256.0, "max_terminated_completion_length": 244.0, "mean_completion_length": 186.2857208251953, "mean_terminated_completion_length": 174.6666717529297, "min_completion_length": 112.0, "min_terminated_completion_length": 112.0, "num_tokens": 2114584.0, "reward": 1.3018759489059448, "reward_std": 0.09126918017864227, "rewards/check_gptzero_func/mean": 0.3035714328289032, "rewards/check_gptzero_func/std": 0.4639608860015869, "rewards/check_winston_local_func/mean": 0.04826194420456886, "rewards/check_winston_local_func/std": 0.07159659266471863, "rewards/sentence_count_match_reward_logic/mean": 0.9500425457954407, "rewards/sentence_count_match_reward_logic/std": 0.08587155491113663, "step": 425 }, { "clip_ratio": 0.004208793863654137, "epoch": 0.1490552834149755, "grad_norm": 0.2917487332450019, "kl": 0.22265625, "learning_rate": 7.447552447552449e-06, "loss": -0.0006, "step": 426 }, { "clip_ratio": 0.005884097423404455, "epoch": 0.14940517844646606, "grad_norm": 0.27759155901204785, "kl": 0.228515625, "learning_rate": 7.465034965034965e-06, "loss": -0.0018, "step": 427 }, { "clip_ratio": 0.01089183147996664, "epoch": 0.1497550734779566, "grad_norm": 0.2507302936650399, "kl": 0.2373046875, "learning_rate": 7.4825174825174825e-06, "loss": -0.0033, "step": 428 }, { "clip_ratio": 0.002356515033170581, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.15010496850944716, "grad_norm": 0.3193702992001575, "kl": 0.2236328125, "learning_rate": 7.500000000000001e-06, "loss": 0.0036, "max_completion_length": 256.0, "max_terminated_completion_length": 234.0, "mean_completion_length": 172.30357360839844, "mean_terminated_completion_length": 158.3541717529297, "min_completion_length": 114.0, "min_terminated_completion_length": 114.0, "num_tokens": 2136425.0, "reward": 1.2490816116333008, "reward_std": 0.23253564536571503, "rewards/check_gptzero_func/mean": 0.1785714328289032, "rewards/check_gptzero_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.07051021605730057, "rewards/check_winston_local_func/std": 0.16707082092761993, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 429 }, { "clip_ratio": 0.003948782570660114, "epoch": 0.1504548635409377, "grad_norm": 0.30711958444999987, "kl": 0.2197265625, "learning_rate": 7.517482517482519e-06, "loss": 0.0023, "step": 430 }, { "clip_ratio": 0.007561282720416784, "epoch": 0.15080475857242828, "grad_norm": 0.2847227802578539, "kl": 0.220703125, "learning_rate": 7.534965034965036e-06, "loss": 0.0011, "step": 431 }, { "clip_ratio": 0.012431194074451923, "epoch": 0.15115465360391883, "grad_norm": 0.2586408233570859, "kl": 0.21875, "learning_rate": 7.552447552447552e-06, "loss": -0.0003, "step": 432 }, { "clip_ratio": 0.0024352248292416334, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.15150454863540938, "grad_norm": 0.32222012658133037, "kl": 0.2294921875, "learning_rate": 7.569930069930071e-06, "loss": 0.003, "max_completion_length": 256.0, "max_terminated_completion_length": 164.0, "mean_completion_length": 153.10714721679688, "mean_terminated_completion_length": 135.95834350585938, "min_completion_length": 106.0, "min_terminated_completion_length": 106.0, "num_tokens": 2156399.0, "reward": 1.0644701719284058, "reward_std": 0.07134105265140533, "rewards/check_gptzero_func/mean": 0.01785714365541935, "rewards/check_gptzero_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.059599947184324265, "rewards/check_winston_local_func/std": 0.12514035403728485, "rewards/sentence_count_match_reward_logic/mean": 0.9870129823684692, "rewards/sentence_count_match_reward_logic/std": 0.03209943696856499, "step": 433 }, { "clip_ratio": 0.002629189984872937, "epoch": 0.15185444366689993, "grad_norm": 0.30974318004347257, "kl": 0.236328125, "learning_rate": 7.5874125874125885e-06, "loss": 0.0023, "step": 434 }, { "clip_ratio": 0.006670649629086256, "epoch": 0.15220433869839048, "grad_norm": 0.26778090550806666, "kl": 0.2470703125, "learning_rate": 7.604895104895106e-06, "loss": 0.001, "step": 435 }, { "clip_ratio": 0.01393958367407322, "epoch": 0.15255423372988103, "grad_norm": 0.22678195374385363, "kl": 0.2470703125, "learning_rate": 7.622377622377622e-06, "loss": -0.0005, "step": 436 }, { "clip_ratio": 0.0023361120838671923, "clipped_completions_ratio": 0.125, "epoch": 0.15290412876137158, "grad_norm": 0.34646504831882013, "kl": 0.2158203125, "learning_rate": 7.63986013986014e-06, "loss": 0.0006, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 163.5178680419922, "mean_terminated_completion_length": 150.30612182617188, "min_completion_length": 70.0, "min_terminated_completion_length": 70.0, "num_tokens": 2177124.0, "reward": 1.257991075515747, "reward_std": 0.2230372130870819, "rewards/check_gptzero_func/mean": 0.125, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.13299088180065155, "rewards/check_winston_local_func/std": 0.19415271282196045, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 437 }, { "clip_ratio": 0.002750524552538991, "epoch": 0.15325402379286215, "grad_norm": 0.33410307598390615, "kl": 0.2109375, "learning_rate": 7.657342657342658e-06, "loss": -0.0006, "step": 438 }, { "clip_ratio": 0.007434629835188389, "epoch": 0.1536039188243527, "grad_norm": 0.32919642852417436, "kl": 0.2099609375, "learning_rate": 7.674825174825176e-06, "loss": -0.0025, "step": 439 }, { "clip_ratio": 0.01222111377865076, "epoch": 0.15395381385584325, "grad_norm": 0.24734484801199325, "kl": 0.2158203125, "learning_rate": 7.692307692307694e-06, "loss": -0.004, "step": 440 }, { "clip_ratio": 0.002999285701662302, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.1543037088873338, "grad_norm": 0.49158071761842975, "kl": 0.33203125, "learning_rate": 7.70979020979021e-06, "loss": 0.0016, "max_completion_length": 256.0, "max_terminated_completion_length": 195.0, "mean_completion_length": 139.32144165039062, "mean_terminated_completion_length": 119.875, "min_completion_length": 69.0, "min_terminated_completion_length": 69.0, "num_tokens": 2195094.0, "reward": 1.2170443534851074, "reward_std": 0.04073408246040344, "rewards/check_gptzero_func/mean": 0.1428571492433548, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.08717413991689682, "rewards/check_winston_local_func/std": 0.07963847368955612, "rewards/sentence_count_match_reward_logic/mean": 0.9870129823684692, "rewards/sentence_count_match_reward_logic/std": 0.03209943696856499, "step": 441 }, { "clip_ratio": 0.005014142952859402, "epoch": 0.15465360391882435, "grad_norm": 0.3785956236838834, "kl": 0.30078125, "learning_rate": 7.727272727272727e-06, "loss": -0.0006, "step": 442 }, { "clip_ratio": 0.012624637223780155, "epoch": 0.1550034989503149, "grad_norm": 0.32347214042226397, "kl": 0.291015625, "learning_rate": 7.744755244755245e-06, "loss": -0.0016, "step": 443 }, { "clip_ratio": 0.020246217027306557, "epoch": 0.15535339398180545, "grad_norm": 0.2487738331865957, "kl": 0.294921875, "learning_rate": 7.762237762237763e-06, "loss": -0.0028, "step": 444 }, { "clip_ratio": 0.0033458094112575054, "clipped_completions_ratio": 0.0, "epoch": 0.155703289013296, "grad_norm": 0.28606451902402796, "kl": 0.25390625, "learning_rate": 7.77972027972028e-06, "loss": -0.0001, "max_completion_length": 207.0, "max_terminated_completion_length": 207.0, "mean_completion_length": 155.80357360839844, "mean_terminated_completion_length": 155.80357360839844, "min_completion_length": 118.0, "min_terminated_completion_length": 118.0, "num_tokens": 2214827.0, "reward": 1.3670592308044434, "reward_std": 0.03967110067605972, "rewards/check_gptzero_func/mean": 0.2857142984867096, "rewards/check_gptzero_func/std": 0.4558423161506653, "rewards/check_winston_local_func/mean": 0.0813448429107666, "rewards/check_winston_local_func/std": 0.14711511135101318, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 445 }, { "clip_ratio": 0.003140495391562581, "epoch": 0.15605318404478657, "grad_norm": 0.2883874864406178, "kl": 0.275390625, "learning_rate": 7.797202797202798e-06, "loss": -0.0007, "step": 446 }, { "clip_ratio": 0.006892256438732147, "epoch": 0.15640307907627712, "grad_norm": 0.2778372939030984, "kl": 0.275390625, "learning_rate": 7.814685314685316e-06, "loss": -0.0022, "step": 447 }, { "clip_ratio": 0.012521302327513695, "epoch": 0.15675297410776767, "grad_norm": 0.24648534480171116, "kl": 0.26953125, "learning_rate": 7.832167832167833e-06, "loss": -0.0037, "step": 448 }, { "clip_ratio": 0.0027369544841349125, "clipped_completions_ratio": 0.0, "epoch": 0.15710286913925822, "grad_norm": 0.40360035748540973, "kl": 0.2431640625, "learning_rate": 7.84965034965035e-06, "loss": 0.0054, "max_completion_length": 241.0, "max_terminated_completion_length": 241.0, "mean_completion_length": 163.21429443359375, "mean_terminated_completion_length": 163.21429443359375, "min_completion_length": 82.0, "min_terminated_completion_length": 82.0, "num_tokens": 2235183.0, "reward": 1.2893269062042236, "reward_std": 0.17071498930454254, "rewards/check_gptzero_func/mean": 0.2142857164144516, "rewards/check_gptzero_func/std": 0.4140393137931824, "rewards/check_winston_local_func/mean": 0.08575532585382462, "rewards/check_winston_local_func/std": 0.14402316510677338, "rewards/sentence_count_match_reward_logic/mean": 0.9892857670783997, "rewards/sentence_count_match_reward_logic/std": 0.045441556721925735, "step": 449 }, { "clip_ratio": 0.006195554509758949, "epoch": 0.15745276417074877, "grad_norm": 0.34595329203246755, "kl": 0.2470703125, "learning_rate": 7.867132867132867e-06, "loss": 0.0042, "step": 450 }, { "clip_ratio": 0.014319048263132572, "epoch": 0.15780265920223932, "grad_norm": 0.3470155744282075, "kl": 0.2451171875, "learning_rate": 7.884615384615384e-06, "loss": 0.0027, "step": 451 }, { "clip_ratio": 0.022648591548204422, "epoch": 0.15815255423372987, "grad_norm": 0.36077278699446846, "kl": 0.234375, "learning_rate": 7.902097902097902e-06, "loss": 0.0008, "step": 452 }, { "clip_ratio": 0.004959539510309696, "clipped_completions_ratio": 0.0, "epoch": 0.15850244926522045, "grad_norm": 0.42209207380646224, "kl": 0.255859375, "learning_rate": 7.91958041958042e-06, "loss": -0.0012, "max_completion_length": 255.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 134.85714721679688, "mean_terminated_completion_length": 134.85714721679688, "min_completion_length": 69.0, "min_terminated_completion_length": 69.0, "num_tokens": 2252599.0, "reward": 1.2714593410491943, "reward_std": 0.24827757477760315, "rewards/check_gptzero_func/mean": 0.1607142835855484, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.11074503511190414, "rewards/check_winston_local_func/std": 0.2112496793270111, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 453 }, { "clip_ratio": 0.00645465450361371, "epoch": 0.158852344296711, "grad_norm": 0.4142915021863042, "kl": 0.271484375, "learning_rate": 7.937062937062937e-06, "loss": -0.0015, "step": 454 }, { "clip_ratio": 0.013211087323725224, "epoch": 0.15920223932820154, "grad_norm": 0.3019763350419255, "kl": 0.26953125, "learning_rate": 7.954545454545455e-06, "loss": -0.0035, "step": 455 }, { "clip_ratio": 0.021898964419960976, "epoch": 0.1595521343596921, "grad_norm": 0.3100828044296115, "kl": 0.29296875, "learning_rate": 7.972027972027973e-06, "loss": -0.0047, "step": 456 }, { "clip_ratio": 0.002177079673856497, "clipped_completions_ratio": 0.0892857142857143, "epoch": 0.15990202939118264, "grad_norm": 0.23265989767002643, "kl": 0.2294921875, "learning_rate": 7.98951048951049e-06, "loss": -0.0009, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 201.1428680419922, "mean_terminated_completion_length": 195.76470947265625, "min_completion_length": 154.0, "min_terminated_completion_length": 154.0, "num_tokens": 2277375.0, "reward": 1.0888543128967285, "reward_std": 0.08349988609552383, "rewards/check_gptzero_func/mean": 0.0357142873108387, "rewards/check_gptzero_func/std": 0.1872563362121582, "rewards/check_winston_local_func/mean": 0.053139906376600266, "rewards/check_winston_local_func/std": 0.11596661061048508, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 457 }, { "clip_ratio": 0.00263209524564445, "epoch": 0.1602519244226732, "grad_norm": 0.23106640376552798, "kl": 0.2373046875, "learning_rate": 8.006993006993008e-06, "loss": -0.0017, "step": 458 }, { "clip_ratio": 0.004629111848771572, "epoch": 0.16060181945416374, "grad_norm": 0.21253050357111644, "kl": 0.23046875, "learning_rate": 8.024475524475524e-06, "loss": -0.0025, "step": 459 }, { "clip_ratio": 0.008189460262656212, "epoch": 0.1609517144856543, "grad_norm": 0.1881925290925868, "kl": 0.21484375, "learning_rate": 8.041958041958042e-06, "loss": -0.0034, "step": 460 }, { "clip_ratio": 0.002259187400341034, "clipped_completions_ratio": 0.0, "epoch": 0.16130160951714487, "grad_norm": 0.3392365052898801, "kl": 0.291015625, "learning_rate": 8.05944055944056e-06, "loss": -0.0014, "max_completion_length": 223.0, "max_terminated_completion_length": 223.0, "mean_completion_length": 166.2678680419922, "mean_terminated_completion_length": 166.2678680419922, "min_completion_length": 125.0, "min_terminated_completion_length": 125.0, "num_tokens": 2298254.0, "reward": 1.140175461769104, "reward_std": 0.15021425485610962, "rewards/check_gptzero_func/mean": 0.0357142873108387, "rewards/check_gptzero_func/std": 0.1872563362121582, "rewards/check_winston_local_func/mean": 0.1394314169883728, "rewards/check_winston_local_func/std": 0.18443666398525238, "rewards/sentence_count_match_reward_logic/mean": 0.9650297164916992, "rewards/sentence_count_match_reward_logic/std": 0.08732008188962936, "step": 461 }, { "clip_ratio": 0.004020337946712971, "epoch": 0.16165150454863542, "grad_norm": 0.3193987139381406, "kl": 0.296875, "learning_rate": 8.076923076923077e-06, "loss": -0.0023, "step": 462 }, { "clip_ratio": 0.006451794411987066, "epoch": 0.16200139958012597, "grad_norm": 0.3108882246227795, "kl": 0.29296875, "learning_rate": 8.094405594405595e-06, "loss": -0.0034, "step": 463 }, { "clip_ratio": 0.011719276197254658, "epoch": 0.16235129461161651, "grad_norm": 0.3757426638301859, "kl": 0.271484375, "learning_rate": 8.111888111888112e-06, "loss": -0.0044, "step": 464 }, { "clip_ratio": 0.0025372756645083427, "clipped_completions_ratio": 0.0, "epoch": 0.16270118964310706, "grad_norm": 0.36021475263896124, "kl": 0.275390625, "learning_rate": 8.12937062937063e-06, "loss": -0.0025, "max_completion_length": 228.0, "max_terminated_completion_length": 228.0, "mean_completion_length": 148.0357208251953, "mean_terminated_completion_length": 148.0357208251953, "min_completion_length": 104.0, "min_terminated_completion_length": 104.0, "num_tokens": 2317208.0, "reward": 1.034956455230713, "reward_std": 0.02797347865998745, "rewards/check_gptzero_func/mean": 0.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.03495639190077782, "rewards/check_winston_local_func/std": 0.04446538910269737, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 465 }, { "clip_ratio": 0.004700044170022011, "epoch": 0.1630510846745976, "grad_norm": 0.33247219581084597, "kl": 0.27734375, "learning_rate": 8.146853146853148e-06, "loss": -0.0041, "step": 466 }, { "clip_ratio": 0.01000574417412281, "epoch": 0.16340097970608816, "grad_norm": 0.31900830385325896, "kl": 0.287109375, "learning_rate": 8.164335664335665e-06, "loss": -0.0053, "step": 467 }, { "clip_ratio": 0.023370934650301933, "epoch": 0.16375087473757874, "grad_norm": 0.2690090340025171, "kl": 0.310546875, "learning_rate": 8.181818181818183e-06, "loss": -0.0073, "step": 468 }, { "clip_ratio": 0.003969726152718067, "clipped_completions_ratio": 0.0, "epoch": 0.1641007697690693, "grad_norm": 0.3550342335616597, "kl": 0.328125, "learning_rate": 8.199300699300699e-06, "loss": 0.0016, "max_completion_length": 213.0, "max_terminated_completion_length": 213.0, "mean_completion_length": 158.82144165039062, "mean_terminated_completion_length": 158.82144165039062, "min_completion_length": 92.0, "min_terminated_completion_length": 92.0, "num_tokens": 2337142.0, "reward": 1.3998816013336182, "reward_std": 0.2442658394575119, "rewards/check_gptzero_func/mean": 0.2857142984867096, "rewards/check_gptzero_func/std": 0.4558423161506653, "rewards/check_winston_local_func/mean": 0.16391213238239288, "rewards/check_winston_local_func/std": 0.2834424376487732, "rewards/sentence_count_match_reward_logic/mean": 0.9502550959587097, "rewards/sentence_count_match_reward_logic/std": 0.10782865434885025, "step": 469 }, { "clip_ratio": 0.004554621875286102, "epoch": 0.16445066480055984, "grad_norm": 0.34720906961520326, "kl": 0.31640625, "learning_rate": 8.216783216783217e-06, "loss": 0.0, "step": 470 }, { "clip_ratio": 0.008967523463070393, "epoch": 0.1648005598320504, "grad_norm": 0.2948200144582828, "kl": 0.302734375, "learning_rate": 8.234265734265734e-06, "loss": -0.0014, "step": 471 }, { "clip_ratio": 0.016885969787836075, "epoch": 0.16515045486354094, "grad_norm": 0.24750417424237392, "kl": 0.2890625, "learning_rate": 8.251748251748254e-06, "loss": -0.0035, "step": 472 }, { "clip_ratio": 0.00251864199526608, "clipped_completions_ratio": 0.0, "epoch": 0.16550034989503148, "grad_norm": 0.3077135537889204, "kl": 0.1787109375, "learning_rate": 8.26923076923077e-06, "loss": 0.0006, "max_completion_length": 212.0, "max_terminated_completion_length": 212.0, "mean_completion_length": 178.62501525878906, "mean_terminated_completion_length": 178.62501525878906, "min_completion_length": 113.0, "min_terminated_completion_length": 113.0, "num_tokens": 2359489.0, "reward": 1.3425127267837524, "reward_std": 0.09478064626455307, "rewards/check_gptzero_func/mean": 0.1785714328289032, "rewards/check_gptzero_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.17442873120307922, "rewards/check_winston_local_func/std": 0.34610339999198914, "rewards/sentence_count_match_reward_logic/mean": 0.9895124435424805, "rewards/sentence_count_match_reward_logic/std": 0.0340137779712677, "step": 473 }, { "clip_ratio": 0.0029904141556471586, "epoch": 0.16585024492652203, "grad_norm": 0.3003293542402224, "kl": 0.1787109375, "learning_rate": 8.286713286713287e-06, "loss": -0.0007, "step": 474 }, { "clip_ratio": 0.004408208653330803, "epoch": 0.16620013995801258, "grad_norm": 0.2719613333749883, "kl": 0.1845703125, "learning_rate": 8.304195804195805e-06, "loss": -0.0018, "step": 475 }, { "clip_ratio": 0.010373151861131191, "epoch": 0.16655003498950316, "grad_norm": 0.2646289705790753, "kl": 0.193359375, "learning_rate": 8.321678321678323e-06, "loss": -0.0035, "step": 476 }, { "clip_ratio": 0.0038805799558758736, "clipped_completions_ratio": 0.0, "epoch": 0.1668999300209937, "grad_norm": 0.2948416454338538, "kl": 0.271484375, "learning_rate": 8.33916083916084e-06, "loss": -0.005, "max_completion_length": 247.0, "max_terminated_completion_length": 247.0, "mean_completion_length": 184.57144165039062, "mean_terminated_completion_length": 184.57144165039062, "min_completion_length": 106.0, "min_terminated_completion_length": 106.0, "num_tokens": 2382353.0, "reward": 1.1343214511871338, "reward_std": 0.05412138253450394, "rewards/check_gptzero_func/mean": 0.125, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.01527373306453228, "rewards/check_winston_local_func/std": 0.014383594505488873, "rewards/sentence_count_match_reward_logic/mean": 0.9940475821495056, "rewards/sentence_count_match_reward_logic/std": 0.031209392473101616, "step": 477 }, { "clip_ratio": 0.00593605125322938, "epoch": 0.16724982505248426, "grad_norm": 0.29171782029839133, "kl": 0.279296875, "learning_rate": 8.356643356643356e-06, "loss": -0.0059, "step": 478 }, { "clip_ratio": 0.008455800823867321, "epoch": 0.1675997200839748, "grad_norm": 0.25419091255177223, "kl": 0.2734375, "learning_rate": 8.374125874125874e-06, "loss": -0.0075, "step": 479 }, { "clip_ratio": 0.013121899217367172, "epoch": 0.16794961511546536, "grad_norm": 0.23742948658106616, "kl": 0.279296875, "learning_rate": 8.391608391608393e-06, "loss": -0.0087, "step": 480 }, { "clip_ratio": 0.00287855276837945, "clipped_completions_ratio": 0.0357142857142857, "epoch": 0.1682995101469559, "grad_norm": 0.40347888846392727, "kl": 0.248046875, "learning_rate": 8.40909090909091e-06, "loss": 0.0009, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 174.44644165039062, "mean_terminated_completion_length": 171.42593383789062, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 2403898.0, "reward": 1.5372226238250732, "reward_std": 0.05712408944964409, "rewards/check_gptzero_func/mean": 0.4285714328289032, "rewards/check_gptzero_func/std": 0.4993502199649811, "rewards/check_winston_local_func/mean": 0.1265082210302353, "rewards/check_winston_local_func/std": 0.18959854543209076, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.04767312481999397, "step": 481 }, { "clip_ratio": 0.005462454166263342, "epoch": 0.16864940517844645, "grad_norm": 0.3320101402928167, "kl": 0.2578125, "learning_rate": 8.426573426573428e-06, "loss": -0.0007, "step": 482 }, { "clip_ratio": 0.01003098301589489, "epoch": 0.16899930020993703, "grad_norm": 0.2921577499982277, "kl": 0.2578125, "learning_rate": 8.444055944055944e-06, "loss": -0.0018, "step": 483 }, { "clip_ratio": 0.01585288904607296, "epoch": 0.16934919524142758, "grad_norm": 0.22937327173937327, "kl": 0.25390625, "learning_rate": 8.461538461538462e-06, "loss": -0.0029, "step": 484 }, { "clip_ratio": 0.003903373144567013, "clipped_completions_ratio": 0.0, "epoch": 0.16969909027291813, "grad_norm": 0.41464754882483956, "kl": 0.271484375, "learning_rate": 8.47902097902098e-06, "loss": 0.0037, "max_completion_length": 199.0, "max_terminated_completion_length": 199.0, "mean_completion_length": 146.5357208251953, "mean_terminated_completion_length": 146.5357208251953, "min_completion_length": 76.0, "min_terminated_completion_length": 76.0, "num_tokens": 2422520.0, "reward": 1.1722151041030884, "reward_std": 0.030032414942979813, "rewards/check_gptzero_func/mean": 0.1428571492433548, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.03190891072154045, "rewards/check_winston_local_func/std": 0.05681261420249939, "rewards/sentence_count_match_reward_logic/mean": 0.9974489808082581, "rewards/sentence_count_match_reward_logic/std": 0.019090088084340096, "step": 485 }, { "clip_ratio": 0.005240556783974171, "epoch": 0.17004898530440868, "grad_norm": 0.365867919694595, "kl": 0.2578125, "learning_rate": 8.496503496503497e-06, "loss": 0.0023, "step": 486 }, { "clip_ratio": 0.012498365715146065, "epoch": 0.17039888033589923, "grad_norm": 0.3716297616267854, "kl": 0.26171875, "learning_rate": 8.513986013986013e-06, "loss": 0.0011, "step": 487 }, { "clip_ratio": 0.02442135661840439, "epoch": 0.17074877536738978, "grad_norm": 0.25389656408310374, "kl": 0.306640625, "learning_rate": 8.531468531468533e-06, "loss": -0.0006, "step": 488 }, { "clip_ratio": 0.0035830645356327295, "clipped_completions_ratio": 0.0, "epoch": 0.17109867039888033, "grad_norm": 0.35296740785694264, "kl": 0.267578125, "learning_rate": 8.54895104895105e-06, "loss": -0.012, "max_completion_length": 254.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 185.07144165039062, "mean_terminated_completion_length": 185.07144165039062, "min_completion_length": 119.0, "min_terminated_completion_length": 119.0, "num_tokens": 2445220.0, "reward": 1.3449348211288452, "reward_std": 0.06428959220647812, "rewards/check_gptzero_func/mean": 0.2857142984867096, "rewards/check_gptzero_func/std": 0.4558423161506653, "rewards/check_winston_local_func/mean": 0.06591688841581345, "rewards/check_winston_local_func/std": 0.11531028896570206, "rewards/sentence_count_match_reward_logic/mean": 0.9933035969734192, "rewards/sentence_count_match_reward_logic/std": 0.05011148378252983, "step": 489 }, { "clip_ratio": 0.005666172597557306, "epoch": 0.17144856543037088, "grad_norm": 0.3060317102878857, "kl": 0.27734375, "learning_rate": 8.566433566433568e-06, "loss": -0.0133, "step": 490 }, { "clip_ratio": 0.010287150740623474, "epoch": 0.17179846046186145, "grad_norm": 0.28812346666542854, "kl": 0.283203125, "learning_rate": 8.583916083916086e-06, "loss": -0.0145, "step": 491 }, { "clip_ratio": 0.01527146715670824, "epoch": 0.172148355493352, "grad_norm": 0.24661128708593252, "kl": 0.279296875, "learning_rate": 8.601398601398602e-06, "loss": -0.0158, "step": 492 }, { "clip_ratio": 0.003737964201718569, "clipped_completions_ratio": 0.0357142857142857, "epoch": 0.17249825052484255, "grad_norm": 0.32203134050190146, "kl": 0.30078125, "learning_rate": 8.61888111888112e-06, "loss": -0.0038, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 182.33929443359375, "mean_terminated_completion_length": 179.61111450195312, "min_completion_length": 114.0, "min_terminated_completion_length": 114.0, "num_tokens": 2467895.0, "reward": 1.1521400213241577, "reward_std": 0.002568450989201665, "rewards/check_gptzero_func/mean": 0.1428571492433548, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.009282874874770641, "rewards/check_winston_local_func/std": 0.004464148078113794, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 493 }, { "clip_ratio": 0.004685242660343647, "epoch": 0.1728481455563331, "grad_norm": 0.307516617411171, "kl": 0.30078125, "learning_rate": 8.636363636363637e-06, "loss": -0.0048, "step": 494 }, { "clip_ratio": 0.006979783996939659, "epoch": 0.17319804058782365, "grad_norm": 0.27912895288006806, "kl": 0.302734375, "learning_rate": 8.653846153846155e-06, "loss": -0.0058, "step": 495 }, { "clip_ratio": 0.01673518493771553, "epoch": 0.1735479356193142, "grad_norm": 0.2582385470772511, "kl": 0.322265625, "learning_rate": 8.671328671328672e-06, "loss": -0.0076, "step": 496 }, { "clip_ratio": 0.003491037292405963, "clipped_completions_ratio": 0.0, "epoch": 0.17389783065080475, "grad_norm": 0.3539729217882104, "kl": 0.294921875, "learning_rate": 8.68881118881119e-06, "loss": 0.0026, "max_completion_length": 239.0, "max_terminated_completion_length": 239.0, "mean_completion_length": 143.35714721679688, "mean_terminated_completion_length": 143.35714721679688, "min_completion_length": 82.0, "min_terminated_completion_length": 82.0, "num_tokens": 2485947.0, "reward": 1.2827284336090088, "reward_std": 0.08512762188911438, "rewards/check_gptzero_func/mean": 0.125, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.1577284336090088, "rewards/check_winston_local_func/std": 0.2943573296070099, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 497 }, { "clip_ratio": 0.004431830253452063, "epoch": 0.17424772568229532, "grad_norm": 0.32171647051923175, "kl": 0.30859375, "learning_rate": 8.706293706293708e-06, "loss": 0.0014, "step": 498 }, { "clip_ratio": 0.011924756690859795, "epoch": 0.17459762071378587, "grad_norm": 0.2897111020992882, "kl": 0.31640625, "learning_rate": 8.723776223776225e-06, "loss": -0.0012, "step": 499 }, { "clip_ratio": 0.01903114654123783, "epoch": 0.17494751574527642, "grad_norm": 0.3056078591694228, "kl": 0.326171875, "learning_rate": 8.741258741258743e-06, "loss": -0.0024, "step": 500 }, { "clip_ratio": 0.003899830160662532, "clipped_completions_ratio": 0.0, "epoch": 0.17529741077676697, "grad_norm": 0.41598779495151667, "kl": 0.259765625, "learning_rate": 8.758741258741259e-06, "loss": -0.001, "max_completion_length": 196.0, "max_terminated_completion_length": 196.0, "mean_completion_length": 136.80357360839844, "mean_terminated_completion_length": 136.80357360839844, "min_completion_length": 108.0, "min_terminated_completion_length": 108.0, "num_tokens": 2503552.0, "reward": 1.371405839920044, "reward_std": 0.11332882940769196, "rewards/check_gptzero_func/mean": 0.25, "rewards/check_gptzero_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.2303343266248703, "rewards/check_winston_local_func/std": 0.3295005261898041, "rewards/sentence_count_match_reward_logic/mean": 0.8910714387893677, "rewards/sentence_count_match_reward_logic/std": 0.2185804545879364, "step": 501 }, { "clip_ratio": 0.00551963085308671, "epoch": 0.17564730580825752, "grad_norm": 0.37939840135158215, "kl": 0.25390625, "learning_rate": 8.776223776223777e-06, "loss": -0.0026, "step": 502 }, { "clip_ratio": 0.011630690656602383, "epoch": 0.17599720083974807, "grad_norm": 0.3030748490786099, "kl": 0.255859375, "learning_rate": 8.793706293706294e-06, "loss": -0.0043, "step": 503 }, { "clip_ratio": 0.023505104705691338, "epoch": 0.17634709587123862, "grad_norm": 0.2621221638183288, "kl": 0.2578125, "learning_rate": 8.811188811188812e-06, "loss": -0.0065, "step": 504 }, { "clip_ratio": 0.003457161597907543, "clipped_completions_ratio": 0.0, "epoch": 0.17669699090272917, "grad_norm": 0.33287712838047584, "kl": 0.2890625, "learning_rate": 8.82867132867133e-06, "loss": 0.0, "max_completion_length": 205.0, "max_terminated_completion_length": 205.0, "mean_completion_length": 150.375, "mean_terminated_completion_length": 150.375, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 2522693.0, "reward": 1.3910528421401978, "reward_std": 0.15971800684928894, "rewards/check_gptzero_func/mean": 0.2678571343421936, "rewards/check_gptzero_func/std": 0.44685041904449463, "rewards/check_winston_local_func/mean": 0.14402909576892853, "rewards/check_winston_local_func/std": 0.22105053067207336, "rewards/sentence_count_match_reward_logic/mean": 0.9791666269302368, "rewards/sentence_count_match_reward_logic/std": 0.05561865493655205, "step": 505 }, { "clip_ratio": 0.005941895302385092, "epoch": 0.17704688593421974, "grad_norm": 0.3011566786519287, "kl": 0.30078125, "learning_rate": 8.846153846153847e-06, "loss": -0.0017, "step": 506 }, { "clip_ratio": 0.0090678995475173, "epoch": 0.1773967809657103, "grad_norm": 0.3294995999234595, "kl": 0.314453125, "learning_rate": 8.863636363636365e-06, "loss": -0.003, "step": 507 }, { "clip_ratio": 0.01896015740931034, "epoch": 0.17774667599720084, "grad_norm": 0.24852498607432666, "kl": 0.298828125, "learning_rate": 8.881118881118883e-06, "loss": -0.0044, "step": 508 }, { "clip_ratio": 0.0021082363091409206, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.1780965710286914, "grad_norm": 0.4233289356936372, "kl": 0.32421875, "learning_rate": 8.8986013986014e-06, "loss": 0.0047, "max_completion_length": 256.0, "max_terminated_completion_length": 231.0, "mean_completion_length": 157.3928680419922, "mean_terminated_completion_length": 140.95834350585938, "min_completion_length": 65.0, "min_terminated_completion_length": 65.0, "num_tokens": 2542883.0, "reward": 1.3084179162979126, "reward_std": 0.07507846504449844, "rewards/check_gptzero_func/mean": 0.25, "rewards/check_gptzero_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.05841785669326782, "rewards/check_winston_local_func/std": 0.09250454604625702, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 509 }, { "clip_ratio": 0.007186931557953358, "epoch": 0.17844646606018194, "grad_norm": 0.3064649794483547, "kl": 0.32421875, "learning_rate": 8.916083916083916e-06, "loss": 0.0027, "step": 510 }, { "clip_ratio": 0.013539891690015793, "epoch": 0.1787963610916725, "grad_norm": 0.26180115563005196, "kl": 0.3203125, "learning_rate": 8.933566433566434e-06, "loss": 0.0011, "step": 511 }, { "clip_ratio": 0.024036280810832977, "epoch": 0.17914625612316304, "grad_norm": 0.24023934034840808, "kl": 0.33984375, "learning_rate": 8.951048951048951e-06, "loss": 0.0, "step": 512 }, { "clip_ratio": 0.0028726565651595592, "clipped_completions_ratio": 0.0, "epoch": 0.17949615115465362, "grad_norm": 0.430765644364522, "kl": 0.2578125, "learning_rate": 8.968531468531469e-06, "loss": 0.0027, "max_completion_length": 212.0, "max_terminated_completion_length": 212.0, "mean_completion_length": 158.7678680419922, "mean_terminated_completion_length": 158.7678680419922, "min_completion_length": 92.0, "min_terminated_completion_length": 92.0, "num_tokens": 2562510.0, "reward": 1.62360417842865, "reward_std": 0.3516157269477844, "rewards/check_gptzero_func/mean": 0.5, "rewards/check_gptzero_func/std": 0.5045249462127686, "rewards/check_winston_local_func/mean": 0.12360402196645737, "rewards/check_winston_local_func/std": 0.17227087914943695, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 513 }, { "clip_ratio": 0.0033508737105876207, "epoch": 0.17984604618614417, "grad_norm": 0.40115312374806766, "kl": 0.26171875, "learning_rate": 8.986013986013987e-06, "loss": 0.0015, "step": 514 }, { "clip_ratio": 0.013999514281749725, "epoch": 0.18019594121763471, "grad_norm": 0.3179972565954623, "kl": 0.271484375, "learning_rate": 9.003496503496504e-06, "loss": -0.0018, "step": 515 }, { "clip_ratio": 0.028148677200078964, "epoch": 0.18054583624912526, "grad_norm": 0.27094596618042743, "kl": 0.263671875, "learning_rate": 9.020979020979022e-06, "loss": -0.0036, "step": 516 }, { "clip_ratio": 0.002350021153688431, "clipped_completions_ratio": 0.0, "epoch": 0.1808957312806158, "grad_norm": 0.32342911697665927, "kl": 0.2890625, "learning_rate": 9.03846153846154e-06, "loss": 0.0025, "max_completion_length": 231.0, "max_terminated_completion_length": 231.0, "mean_completion_length": 169.0, "mean_terminated_completion_length": 169.0, "min_completion_length": 71.0, "min_terminated_completion_length": 71.0, "num_tokens": 2583670.0, "reward": 1.4445035457611084, "reward_std": 0.018006717786192894, "rewards/check_gptzero_func/mean": 0.4285714328289032, "rewards/check_gptzero_func/std": 0.4993502199649811, "rewards/check_winston_local_func/mean": 0.051646310836076736, "rewards/check_winston_local_func/std": 0.10653294622898102, "rewards/sentence_count_match_reward_logic/mean": 0.9642857313156128, "rewards/sentence_count_match_reward_logic/std": 0.08827348798513412, "step": 517 }, { "clip_ratio": 0.0036240227054804564, "epoch": 0.18124562631210636, "grad_norm": 0.3172956841397585, "kl": 0.28515625, "learning_rate": 9.055944055944057e-06, "loss": 0.0018, "step": 518 }, { "clip_ratio": 0.01001545786857605, "epoch": 0.1815955213435969, "grad_norm": 0.24850739609241754, "kl": 0.279296875, "learning_rate": 9.073426573426573e-06, "loss": -0.0, "step": 519 }, { "clip_ratio": 0.01792219839990139, "epoch": 0.1819454163750875, "grad_norm": 0.24268369180984792, "kl": 0.271484375, "learning_rate": 9.090909090909091e-06, "loss": -0.001, "step": 520 }, { "clip_ratio": 0.0018603203352540731, "clipped_completions_ratio": 0.0, "epoch": 0.18229531140657804, "grad_norm": 0.35748853888093074, "kl": 0.2734375, "learning_rate": 9.108391608391609e-06, "loss": -0.0039, "max_completion_length": 239.0, "max_terminated_completion_length": 239.0, "mean_completion_length": 161.83929443359375, "mean_terminated_completion_length": 161.83929443359375, "min_completion_length": 113.0, "min_terminated_completion_length": 113.0, "num_tokens": 2604045.0, "reward": 1.3263376951217651, "reward_std": 0.18801908195018768, "rewards/check_gptzero_func/mean": 0.2321428507566452, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.09419480711221695, "rewards/check_winston_local_func/std": 0.18392357230186462, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 521 }, { "clip_ratio": 0.004261092282831669, "epoch": 0.18264520643806859, "grad_norm": 0.31856460669379605, "kl": 0.28125, "learning_rate": 9.125874125874126e-06, "loss": -0.0051, "step": 522 }, { "clip_ratio": 0.010556627064943314, "epoch": 0.18299510146955913, "grad_norm": 0.2784115652659218, "kl": 0.28515625, "learning_rate": 9.143356643356644e-06, "loss": -0.0066, "step": 523 }, { "clip_ratio": 0.019772497937083244, "epoch": 0.18334499650104968, "grad_norm": 0.2543186536412269, "kl": 0.28125, "learning_rate": 9.160839160839162e-06, "loss": -0.0084, "step": 524 }, { "clip_ratio": 0.003695501247420907, "clipped_completions_ratio": 0.0535714285714286, "epoch": 0.18369489153254023, "grad_norm": 0.43663730377558757, "kl": 0.287109375, "learning_rate": 9.17832167832168e-06, "loss": -0.0001, "max_completion_length": 256.0, "max_terminated_completion_length": 252.0, "mean_completion_length": 168.75, "mean_terminated_completion_length": 163.8113250732422, "min_completion_length": 73.0, "min_terminated_completion_length": 73.0, "num_tokens": 2625047.0, "reward": 1.5926240682601929, "reward_std": 0.2401302009820938, "rewards/check_gptzero_func/mean": 0.2321428507566452, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.38458824157714844, "rewards/check_winston_local_func/std": 0.3262787163257599, "rewards/sentence_count_match_reward_logic/mean": 0.9758929014205933, "rewards/sentence_count_match_reward_logic/std": 0.057398974895477295, "step": 525 }, { "clip_ratio": 0.006055768113583326, "epoch": 0.18404478656403078, "grad_norm": 0.42911912355480886, "kl": 0.294921875, "learning_rate": 9.195804195804197e-06, "loss": -0.0012, "step": 526 }, { "clip_ratio": 0.014100544154644012, "epoch": 0.18439468159552133, "grad_norm": 0.3613035335574703, "kl": 0.275390625, "learning_rate": 9.213286713286715e-06, "loss": -0.0034, "step": 527 }, { "clip_ratio": 0.026682356372475624, "epoch": 0.1847445766270119, "grad_norm": 0.27875544968618643, "kl": 0.28515625, "learning_rate": 9.230769230769232e-06, "loss": -0.0054, "step": 528 }, { "clip_ratio": 0.003247226122766733, "clipped_completions_ratio": 0.0, "epoch": 0.18509447165850246, "grad_norm": 0.44985281315241354, "kl": 0.3359375, "learning_rate": 9.248251748251748e-06, "loss": 0.0045, "max_completion_length": 213.0, "max_terminated_completion_length": 213.0, "mean_completion_length": 142.08929443359375, "mean_terminated_completion_length": 142.08929443359375, "min_completion_length": 78.0, "min_terminated_completion_length": 78.0, "num_tokens": 2643060.0, "reward": 1.212063193321228, "reward_std": 0.238146111369133, "rewards/check_gptzero_func/mean": 0.1607142835855484, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.10021493583917618, "rewards/check_winston_local_func/std": 0.15861475467681885, "rewards/sentence_count_match_reward_logic/mean": 0.9511337876319885, "rewards/sentence_count_match_reward_logic/std": 0.11764141172170639, "step": 529 }, { "clip_ratio": 0.008719215169548988, "epoch": 0.185444366689993, "grad_norm": 0.37881745708630965, "kl": 0.359375, "learning_rate": 9.265734265734266e-06, "loss": 0.0029, "step": 530 }, { "clip_ratio": 0.01599576137959957, "epoch": 0.18579426172148356, "grad_norm": 0.45996873971295515, "kl": 0.3828125, "learning_rate": 9.283216783216784e-06, "loss": 0.0003, "step": 531 }, { "clip_ratio": 0.024946484714746475, "epoch": 0.1861441567529741, "grad_norm": 0.2652482623571403, "kl": 0.365234375, "learning_rate": 9.300699300699301e-06, "loss": -0.0017, "step": 532 }, { "clip_ratio": 0.0029474894981831312, "clipped_completions_ratio": 0.0, "epoch": 0.18649405178446465, "grad_norm": 0.419370505953391, "kl": 0.353515625, "learning_rate": 9.318181818181819e-06, "loss": -0.0051, "max_completion_length": 212.0, "max_terminated_completion_length": 212.0, "mean_completion_length": 148.57144165039062, "mean_terminated_completion_length": 148.57144165039062, "min_completion_length": 114.0, "min_terminated_completion_length": 114.0, "num_tokens": 2661772.0, "reward": 1.4069041013717651, "reward_std": 0.1159488707780838, "rewards/check_gptzero_func/mean": 0.2678571343421936, "rewards/check_gptzero_func/std": 0.44685041904449463, "rewards/check_winston_local_func/mean": 0.14916595816612244, "rewards/check_winston_local_func/std": 0.14623968303203583, "rewards/sentence_count_match_reward_logic/mean": 0.9898809790611267, "rewards/sentence_count_match_reward_logic/std": 0.04307364672422409, "step": 533 }, { "clip_ratio": 0.006734353024512529, "epoch": 0.1868439468159552, "grad_norm": 0.3756354874999973, "kl": 0.35546875, "learning_rate": 9.335664335664337e-06, "loss": -0.0062, "step": 534 }, { "clip_ratio": 0.013941682875156403, "epoch": 0.18719384184744578, "grad_norm": 0.32842204334741, "kl": 0.359375, "learning_rate": 9.353146853146854e-06, "loss": -0.0088, "step": 535 }, { "clip_ratio": 0.02483806386590004, "epoch": 0.18754373687893633, "grad_norm": 0.31577756039371585, "kl": 0.396484375, "learning_rate": 9.370629370629372e-06, "loss": -0.0107, "step": 536 }, { "clip_ratio": 0.001866485457867384, "clipped_completions_ratio": 0.0, "epoch": 0.18789363191042688, "grad_norm": 0.47224838563489985, "kl": 0.361328125, "learning_rate": 9.38811188811189e-06, "loss": 0.0002, "max_completion_length": 219.0, "max_terminated_completion_length": 219.0, "mean_completion_length": 147.9107208251953, "mean_terminated_completion_length": 147.9107208251953, "min_completion_length": 81.0, "min_terminated_completion_length": 81.0, "num_tokens": 2680367.0, "reward": 1.420225977897644, "reward_std": 0.1977434903383255, "rewards/check_gptzero_func/mean": 0.2678571343421936, "rewards/check_gptzero_func/std": 0.44685041904449463, "rewards/check_winston_local_func/mean": 0.1574709415435791, "rewards/check_winston_local_func/std": 0.15711145102977753, "rewards/sentence_count_match_reward_logic/mean": 0.9948979616165161, "rewards/sentence_count_match_reward_logic/std": 0.026750901713967323, "step": 537 }, { "clip_ratio": 0.0065506258979439735, "epoch": 0.18824352694191743, "grad_norm": 0.3830177108500331, "kl": 0.34765625, "learning_rate": 9.405594405594406e-06, "loss": -0.0014, "step": 538 }, { "clip_ratio": 0.01545481663197279, "epoch": 0.18859342197340798, "grad_norm": 0.3028414593763439, "kl": 0.357421875, "learning_rate": 9.423076923076923e-06, "loss": -0.0029, "step": 539 }, { "clip_ratio": 0.02784229815006256, "epoch": 0.18894331700489853, "grad_norm": 0.28406499845202426, "kl": 0.390625, "learning_rate": 9.44055944055944e-06, "loss": -0.0044, "step": 540 }, { "clip_ratio": 0.00211763265542686, "clipped_completions_ratio": 0.25, "epoch": 0.18929321203638907, "grad_norm": 0.39766625462474514, "kl": 0.259765625, "learning_rate": 9.458041958041958e-06, "loss": -0.0036, "max_completion_length": 256.0, "max_terminated_completion_length": 252.0, "mean_completion_length": 170.9107208251953, "mean_terminated_completion_length": 142.54762268066406, "min_completion_length": 104.0, "min_terminated_completion_length": 104.0, "num_tokens": 2701546.0, "reward": 1.4147971868515015, "reward_std": 0.20396539568901062, "rewards/check_gptzero_func/mean": 0.2142857164144516, "rewards/check_gptzero_func/std": 0.4140393137931824, "rewards/check_winston_local_func/mean": 0.20547161996364594, "rewards/check_winston_local_func/std": 0.29537487030029297, "rewards/sentence_count_match_reward_logic/mean": 0.9950396418571472, "rewards/sentence_count_match_reward_logic/std": 0.02654176577925682, "step": 541 }, { "clip_ratio": 0.004080981016159058, "epoch": 0.18964310706787962, "grad_norm": 0.34929446091253585, "kl": 0.271484375, "learning_rate": 9.475524475524476e-06, "loss": -0.0049, "step": 542 }, { "clip_ratio": 0.010366237722337246, "epoch": 0.1899930020993702, "grad_norm": 0.297322672878404, "kl": 0.28125, "learning_rate": 9.493006993006994e-06, "loss": -0.0076, "step": 543 }, { "clip_ratio": 0.022945581004023552, "epoch": 0.19034289713086075, "grad_norm": 0.26340943933910377, "kl": 0.29296875, "learning_rate": 9.510489510489511e-06, "loss": -0.0089, "step": 544 }, { "clip_ratio": 0.004682681988924742, "clipped_completions_ratio": 0.0, "epoch": 0.1906927921623513, "grad_norm": 0.5251021234213183, "kl": 0.380859375, "learning_rate": 9.527972027972029e-06, "loss": 0.0042, "max_completion_length": 234.0, "max_terminated_completion_length": 234.0, "mean_completion_length": 157.57144165039062, "mean_terminated_completion_length": 157.57144165039062, "min_completion_length": 75.0, "min_terminated_completion_length": 75.0, "num_tokens": 2721274.0, "reward": 1.6971756219863892, "reward_std": 0.24070307612419128, "rewards/check_gptzero_func/mean": 0.3571428656578064, "rewards/check_gptzero_func/std": 0.48349377512931824, "rewards/check_winston_local_func/mean": 0.357889860868454, "rewards/check_winston_local_func/std": 0.3277137577533722, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.04767312481999397, "step": 545 }, { "clip_ratio": 0.007579232100397348, "epoch": 0.19104268719384185, "grad_norm": 0.4521425667303527, "kl": 0.400390625, "learning_rate": 9.545454545454547e-06, "loss": 0.0014, "step": 546 }, { "clip_ratio": 0.023295441642403603, "epoch": 0.1913925822253324, "grad_norm": 0.39328706826256216, "kl": 0.427734375, "learning_rate": 9.562937062937063e-06, "loss": -0.0005, "step": 547 }, { "clip_ratio": 0.03594477102160454, "epoch": 0.19174247725682295, "grad_norm": 0.3322105604411964, "kl": 0.4296875, "learning_rate": 9.58041958041958e-06, "loss": -0.0029, "step": 548 }, { "clip_ratio": 0.003079479094594717, "clipped_completions_ratio": 0.0, "epoch": 0.1920923722883135, "grad_norm": 0.40114627120771656, "kl": 0.318359375, "learning_rate": 9.597902097902098e-06, "loss": 0.0041, "max_completion_length": 221.0, "max_terminated_completion_length": 221.0, "mean_completion_length": 165.05357360839844, "mean_terminated_completion_length": 165.05357360839844, "min_completion_length": 115.0, "min_terminated_completion_length": 115.0, "num_tokens": 2741581.0, "reward": 1.354027509689331, "reward_std": 0.12854821979999542, "rewards/check_gptzero_func/mean": 0.1785714328289032, "rewards/check_gptzero_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.17992033064365387, "rewards/check_winston_local_func/std": 0.2506072521209717, "rewards/sentence_count_match_reward_logic/mean": 0.9955357313156128, "rewards/sentence_count_match_reward_logic/std": 0.033407654613256454, "step": 549 }, { "clip_ratio": 0.005300926510244608, "epoch": 0.19244226731980407, "grad_norm": 0.3671618287114479, "kl": 0.30859375, "learning_rate": 9.615384615384616e-06, "loss": 0.0027, "step": 550 }, { "clip_ratio": 0.01192291546612978, "epoch": 0.19279216235129462, "grad_norm": 0.3956044690770405, "kl": 0.3046875, "learning_rate": 9.632867132867133e-06, "loss": 0.0009, "step": 551 }, { "clip_ratio": 0.023997141048312187, "epoch": 0.19314205738278517, "grad_norm": 0.23471642935297218, "kl": 0.318359375, "learning_rate": 9.650349650349651e-06, "loss": -0.0017, "step": 552 }, { "clip_ratio": 0.001936074928380549, "clipped_completions_ratio": 0.2678571428571429, "epoch": 0.19349195241427572, "grad_norm": 0.3567763395563354, "kl": 0.283203125, "learning_rate": 9.667832167832169e-06, "loss": -0.0009, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 197.9107208251953, "mean_terminated_completion_length": 176.6585235595703, "min_completion_length": 132.0, "min_terminated_completion_length": 132.0, "num_tokens": 2766336.0, "reward": 1.3411049842834473, "reward_std": 0.21134565770626068, "rewards/check_gptzero_func/mean": 0.2321428507566452, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.1268191635608673, "rewards/check_winston_local_func/std": 0.12632320821285248, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.05201565474271774, "step": 553 }, { "clip_ratio": 0.005047404672950506, "epoch": 0.19384184744576627, "grad_norm": 0.33312172086284386, "kl": 0.29296875, "learning_rate": 9.685314685314686e-06, "loss": -0.0022, "step": 554 }, { "clip_ratio": 0.012919927015900612, "epoch": 0.19419174247725682, "grad_norm": 0.3329858049452869, "kl": 0.30859375, "learning_rate": 9.702797202797204e-06, "loss": -0.0043, "step": 555 }, { "clip_ratio": 0.023546569049358368, "epoch": 0.19454163750874737, "grad_norm": 0.2791816220836572, "kl": 0.3203125, "learning_rate": 9.72027972027972e-06, "loss": -0.0062, "step": 556 }, { "clip_ratio": 0.0027796952053904533, "clipped_completions_ratio": 0.017857142857142905, "epoch": 0.19489153254023792, "grad_norm": 0.432335290298135, "kl": 0.359375, "learning_rate": 9.737762237762238e-06, "loss": 0.0011, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 173.67857360839844, "mean_terminated_completion_length": 172.1818084716797, "min_completion_length": 90.0, "min_terminated_completion_length": 90.0, "num_tokens": 2787966.0, "reward": 1.2432159185409546, "reward_std": 0.12903258204460144, "rewards/check_gptzero_func/mean": 0.0357142873108387, "rewards/check_gptzero_func/std": 0.1872563362121582, "rewards/check_winston_local_func/mean": 0.2279098778963089, "rewards/check_winston_local_func/std": 0.26556435227394104, "rewards/sentence_count_match_reward_logic/mean": 0.9795918464660645, "rewards/sentence_count_match_reward_logic/std": 0.050441987812519073, "step": 557 }, { "clip_ratio": 0.005890588276088238, "epoch": 0.1952414275717285, "grad_norm": 0.39385599466365895, "kl": 0.361328125, "learning_rate": 9.755244755244755e-06, "loss": 0.0005, "step": 558 }, { "clip_ratio": 0.015469936653971672, "epoch": 0.19559132260321904, "grad_norm": 0.35289780911844654, "kl": 0.359375, "learning_rate": 9.772727272727273e-06, "loss": -0.0026, "step": 559 }, { "clip_ratio": 0.028049185872077942, "epoch": 0.1959412176347096, "grad_norm": 0.2868260384753504, "kl": 0.380859375, "learning_rate": 9.79020979020979e-06, "loss": -0.004, "step": 560 }, { "clip_ratio": 0.0022649010643363, "clipped_completions_ratio": 0.2678571428571429, "epoch": 0.19629111266620014, "grad_norm": 0.3607050653732719, "kl": 0.27734375, "learning_rate": 9.807692307692308e-06, "loss": 0.0026, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 207.1428680419922, "mean_terminated_completion_length": 189.26828002929688, "min_completion_length": 110.0, "min_terminated_completion_length": 110.0, "num_tokens": 2813622.0, "reward": 1.3934189081192017, "reward_std": 0.12287566065788269, "rewards/check_gptzero_func/mean": 0.2678571343421936, "rewards/check_gptzero_func/std": 0.44685041904449463, "rewards/check_winston_local_func/mean": 0.13553392887115479, "rewards/check_winston_local_func/std": 0.1608913242816925, "rewards/sentence_count_match_reward_logic/mean": 0.9900278449058533, "rewards/sentence_count_match_reward_logic/std": 0.03304014727473259, "step": 561 }, { "clip_ratio": 0.0029829093255102634, "epoch": 0.1966410076976907, "grad_norm": 0.344900050037974, "kl": 0.2734375, "learning_rate": 9.825174825174826e-06, "loss": 0.0013, "step": 562 }, { "clip_ratio": 0.01024604495614767, "epoch": 0.19699090272918124, "grad_norm": 0.2775678227242367, "kl": 0.279296875, "learning_rate": 9.842657342657344e-06, "loss": -0.0008, "step": 563 }, { "clip_ratio": 0.022250337526202202, "epoch": 0.1973407977606718, "grad_norm": 0.22767471752414994, "kl": 0.28515625, "learning_rate": 9.860139860139861e-06, "loss": -0.0027, "step": 564 }, { "clip_ratio": 0.002892527962103486, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.19769069279216236, "grad_norm": 0.41902426401368575, "kl": 0.361328125, "learning_rate": 9.877622377622379e-06, "loss": 0.0052, "max_completion_length": 256.0, "max_terminated_completion_length": 217.0, "mean_completion_length": 174.48214721679688, "mean_terminated_completion_length": 160.89584350585938, "min_completion_length": 108.0, "min_terminated_completion_length": 108.0, "num_tokens": 2835657.0, "reward": 1.3309686183929443, "reward_std": 0.1842060536146164, "rewards/check_gptzero_func/mean": 0.25, "rewards/check_gptzero_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.08096853643655777, "rewards/check_winston_local_func/std": 0.10865692794322968, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 565 }, { "clip_ratio": 0.006333273835480213, "epoch": 0.1980405878236529, "grad_norm": 0.40418262642006275, "kl": 0.373046875, "learning_rate": 9.895104895104895e-06, "loss": 0.0032, "step": 566 }, { "clip_ratio": 0.013836399652063847, "epoch": 0.19839048285514346, "grad_norm": 0.3739877354350468, "kl": 0.3671875, "learning_rate": 9.912587412587413e-06, "loss": 0.0003, "step": 567 }, { "clip_ratio": 0.02258377894759178, "epoch": 0.198740377886634, "grad_norm": 0.2821446883320711, "kl": 0.36328125, "learning_rate": 9.93006993006993e-06, "loss": -0.0017, "step": 568 }, { "clip_ratio": 0.002944260137155652, "clipped_completions_ratio": 0.0, "epoch": 0.19909027291812456, "grad_norm": 0.40915101147837046, "kl": 0.421875, "learning_rate": 9.94755244755245e-06, "loss": 0.0101, "max_completion_length": 197.0, "max_terminated_completion_length": 197.0, "mean_completion_length": 154.7678680419922, "mean_terminated_completion_length": 154.7678680419922, "min_completion_length": 109.0, "min_terminated_completion_length": 109.0, "num_tokens": 2855012.0, "reward": 1.212138056755066, "reward_std": 0.21345411241054535, "rewards/check_gptzero_func/mean": 0.1071428582072258, "rewards/check_gptzero_func/std": 0.3120938837528229, "rewards/check_winston_local_func/mean": 0.1049952358007431, "rewards/check_winston_local_func/std": 0.10944804549217224, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 569 }, { "clip_ratio": 0.004595747217535973, "epoch": 0.1994401679496151, "grad_norm": 0.4223862608263226, "kl": 0.423828125, "learning_rate": 9.965034965034966e-06, "loss": 0.0086, "step": 570 }, { "clip_ratio": 0.017990199849009514, "epoch": 0.19979006298110566, "grad_norm": 0.3395174959340422, "kl": 0.42578125, "learning_rate": 9.982517482517483e-06, "loss": 0.006, "step": 571 }, { "clip_ratio": 0.0318964421749115, "epoch": 0.2001399580125962, "grad_norm": 0.3119326481832398, "kl": 0.443359375, "learning_rate": 1e-05, "loss": 0.0042, "step": 572 }, { "clip_ratio": 0.0028012783732265234, "clipped_completions_ratio": 0.0, "epoch": 0.20048985304408679, "grad_norm": 0.5586897664012678, "kl": 0.458984375, "learning_rate": 9.999999067523662e-06, "loss": 0.0069, "max_completion_length": 210.0, "max_terminated_completion_length": 210.0, "mean_completion_length": 147.30357360839844, "mean_terminated_completion_length": 147.30357360839844, "min_completion_length": 83.0, "min_terminated_completion_length": 83.0, "num_tokens": 2873437.0, "reward": 1.4574552774429321, "reward_std": 0.2673817574977875, "rewards/check_gptzero_func/mean": 0.2857142984867096, "rewards/check_gptzero_func/std": 0.4558423161506653, "rewards/check_winston_local_func/mean": 0.173972949385643, "rewards/check_winston_local_func/std": 0.18746381998062134, "rewards/sentence_count_match_reward_logic/mean": 0.9977678656578064, "rewards/sentence_count_match_reward_logic/std": 0.016703827306628227, "step": 573 }, { "clip_ratio": 0.015061897225677967, "epoch": 0.20083974807557733, "grad_norm": 0.6093195481638569, "kl": 0.5234375, "learning_rate": 9.999996270094992e-06, "loss": 0.0062, "step": 574 }, { "clip_ratio": 0.024995647370815277, "epoch": 0.20118964310706788, "grad_norm": 0.3884726536165026, "kl": 0.48046875, "learning_rate": 9.999991607715036e-06, "loss": 0.0033, "step": 575 }, { "clip_ratio": 0.031818557530641556, "epoch": 0.20153953813855843, "grad_norm": 0.6874431833567782, "kl": 0.46875, "learning_rate": 9.99998508038553e-06, "loss": 0.0027, "step": 576 }, { "clip_ratio": 0.0027596461586654186, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.20188943317004898, "grad_norm": 0.370769086862407, "kl": 0.357421875, "learning_rate": 9.999976688108912e-06, "loss": 0.0021, "max_completion_length": 256.0, "max_terminated_completion_length": 241.0, "mean_completion_length": 184.48214721679688, "mean_terminated_completion_length": 172.5625, "min_completion_length": 102.0, "min_terminated_completion_length": 102.0, "num_tokens": 2896864.0, "reward": 1.196372628211975, "reward_std": 0.060726214200258255, "rewards/check_gptzero_func/mean": 0.1607142835855484, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.06163232401013374, "rewards/check_winston_local_func/std": 0.11651450395584106, "rewards/sentence_count_match_reward_logic/mean": 0.9740260243415833, "rewards/sentence_count_match_reward_logic/std": 0.06419889628887177, "step": 577 }, { "clip_ratio": 0.005652668885886669, "epoch": 0.20223932820153953, "grad_norm": 0.41497185169753814, "kl": 0.384765625, "learning_rate": 9.999966430888313e-06, "loss": 0.0012, "step": 578 }, { "clip_ratio": 0.013339566998183727, "epoch": 0.20258922323303008, "grad_norm": 0.2956376317282209, "kl": 0.373046875, "learning_rate": 9.999954308727553e-06, "loss": -0.0014, "step": 579 }, { "clip_ratio": 0.022920873016119003, "epoch": 0.20293911826452066, "grad_norm": 0.3140657963729821, "kl": 0.36328125, "learning_rate": 9.999940321631158e-06, "loss": -0.002, "step": 580 }, { "clip_ratio": 0.002512059872969985, "clipped_completions_ratio": 0.1071428571428571, "epoch": 0.2032890132960112, "grad_norm": 0.5169192321515178, "kl": 0.380859375, "learning_rate": 9.999924469604346e-06, "loss": 0.0028, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 156.1428680419922, "mean_terminated_completion_length": 144.16000366210938, "min_completion_length": 84.0, "min_terminated_completion_length": 84.0, "num_tokens": 2916600.0, "reward": 1.4923707246780396, "reward_std": 0.13986456394195557, "rewards/check_gptzero_func/mean": 0.25, "rewards/check_gptzero_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.2423705756664276, "rewards/check_winston_local_func/std": 0.3294840455055237, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 581 }, { "clip_ratio": 0.00740852253511548, "epoch": 0.20363890832750176, "grad_norm": 0.4461866821452423, "kl": 0.39453125, "learning_rate": 9.999906752653027e-06, "loss": -0.0006, "step": 582 }, { "clip_ratio": 0.023990055546164513, "epoch": 0.2039888033589923, "grad_norm": 0.4456473344759745, "kl": 0.412109375, "learning_rate": 9.99988717078381e-06, "loss": -0.0026, "step": 583 }, { "clip_ratio": 0.03651728481054306, "epoch": 0.20433869839048285, "grad_norm": 0.2997192365870588, "kl": 0.412109375, "learning_rate": 9.999865724003998e-06, "loss": -0.0047, "step": 584 }, { "clip_ratio": 0.0034739605616778135, "clipped_completions_ratio": 0.0, "epoch": 0.2046885934219734, "grad_norm": 0.5279143853516982, "kl": 0.37109375, "learning_rate": 9.999842412321593e-06, "loss": 0.0031, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 169.83929443359375, "mean_terminated_completion_length": 169.83929443359375, "min_completion_length": 81.0, "min_terminated_completion_length": 81.0, "num_tokens": 2937799.0, "reward": 1.4820798635482788, "reward_std": 0.3392122983932495, "rewards/check_gptzero_func/mean": 0.2678571343421936, "rewards/check_gptzero_func/std": 0.44685041904449463, "rewards/check_winston_local_func/mean": 0.24526003003120422, "rewards/check_winston_local_func/std": 0.24455320835113525, "rewards/sentence_count_match_reward_logic/mean": 0.968962550163269, "rewards/sentence_count_match_reward_logic/std": 0.06084445118904114, "step": 585 }, { "clip_ratio": 0.014018189162015915, "epoch": 0.20503848845346395, "grad_norm": 1.0877014941586542, "kl": 0.345703125, "learning_rate": 9.999817235745289e-06, "loss": 0.0019, "step": 586 }, { "clip_ratio": 0.01764339581131935, "epoch": 0.2053883834849545, "grad_norm": 1.876627700981518, "kl": 0.69140625, "learning_rate": 9.999790194284474e-06, "loss": 0.0022, "step": 587 }, { "clip_ratio": 0.02808491885662079, "epoch": 0.20573827851644508, "grad_norm": 0.36989596730970337, "kl": 0.421875, "learning_rate": 9.999761287949237e-06, "loss": -0.0021, "step": 588 }, { "clip_ratio": 0.003446590853855014, "clipped_completions_ratio": 0.0, "epoch": 0.20608817354793563, "grad_norm": 0.3616501524396585, "kl": 0.2451171875, "learning_rate": 9.99973051675036e-06, "loss": 0.0044, "max_completion_length": 243.0, "max_terminated_completion_length": 243.0, "mean_completion_length": 179.57144165039062, "mean_terminated_completion_length": 179.57144165039062, "min_completion_length": 146.0, "min_terminated_completion_length": 146.0, "num_tokens": 2959751.0, "reward": 2.111604690551758, "reward_std": 0.23957082629203796, "rewards/check_gptzero_func/mean": 0.8035714030265808, "rewards/check_gptzero_func/std": 0.40089187026023865, "rewards/check_winston_local_func/mean": 0.30803316831588745, "rewards/check_winston_local_func/std": 0.31487539410591125, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 589 }, { "clip_ratio": 0.004535973072052002, "epoch": 0.20643806857942618, "grad_norm": 0.4148859205652714, "kl": 0.2392578125, "learning_rate": 9.999697880699319e-06, "loss": 0.0041, "step": 590 }, { "clip_ratio": 0.009397734887897968, "epoch": 0.20678796361091673, "grad_norm": 0.26863630618216994, "kl": 0.24609375, "learning_rate": 9.999663379808287e-06, "loss": 0.0021, "step": 591 }, { "clip_ratio": 0.018791228532791138, "epoch": 0.20713785864240727, "grad_norm": 0.21367545106039423, "kl": 0.2578125, "learning_rate": 9.999627014090133e-06, "loss": 0.0008, "step": 592 }, { "clip_ratio": 0.004366072826087475, "clipped_completions_ratio": 0.0, "epoch": 0.20748775367389782, "grad_norm": 0.5596247346547165, "kl": 0.390625, "learning_rate": 9.99958878355842e-06, "loss": 0.0091, "max_completion_length": 218.0, "max_terminated_completion_length": 218.0, "mean_completion_length": 167.1428680419922, "mean_terminated_completion_length": 167.1428680419922, "min_completion_length": 100.0, "min_terminated_completion_length": 100.0, "num_tokens": 2980719.0, "reward": 1.1506283283233643, "reward_std": 0.14046302437782288, "rewards/check_gptzero_func/mean": 0.0714285746216774, "rewards/check_gptzero_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.0791996493935585, "rewards/check_winston_local_func/std": 0.12217718362808228, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 593 }, { "clip_ratio": 0.01136794500052929, "epoch": 0.20783764870538837, "grad_norm": 0.4684545333426991, "kl": 0.41015625, "learning_rate": 9.99954868822741e-06, "loss": 0.0074, "step": 594 }, { "clip_ratio": 0.020601443946361542, "epoch": 0.20818754373687895, "grad_norm": 0.4512392158849456, "kl": 0.41796875, "learning_rate": 9.999506728112057e-06, "loss": 0.0044, "step": 595 }, { "clip_ratio": 0.03252872824668884, "epoch": 0.2085374387683695, "grad_norm": 0.32734492334643844, "kl": 0.408203125, "learning_rate": 9.99946290322801e-06, "loss": 0.0025, "step": 596 }, { "clip_ratio": 0.004302050918340683, "clipped_completions_ratio": 0.0, "epoch": 0.20888733379986005, "grad_norm": 0.6664955500939456, "kl": 0.46484375, "learning_rate": 9.999417213591619e-06, "loss": -0.0005, "max_completion_length": 227.0, "max_terminated_completion_length": 227.0, "mean_completion_length": 130.55357360839844, "mean_terminated_completion_length": 130.55357360839844, "min_completion_length": 80.0, "min_terminated_completion_length": 80.0, "num_tokens": 2997326.0, "reward": 1.5903209447860718, "reward_std": 0.31898051500320435, "rewards/check_gptzero_func/mean": 0.2857142984867096, "rewards/check_gptzero_func/std": 0.4558423161506653, "rewards/check_winston_local_func/mean": 0.3224637806415558, "rewards/check_winston_local_func/std": 0.2929226756095886, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.04767312481999397, "step": 597 }, { "clip_ratio": 0.012549824081361294, "epoch": 0.2092372288313506, "grad_norm": 0.562787812380598, "kl": 0.48046875, "learning_rate": 9.999369659219922e-06, "loss": -0.0039, "step": 598 }, { "clip_ratio": 0.028447994962334633, "epoch": 0.20958712386284115, "grad_norm": 0.3324611659879844, "kl": 0.50390625, "learning_rate": 9.999320240130658e-06, "loss": -0.0066, "step": 599 }, { "clip_ratio": 0.04377491772174835, "epoch": 0.2099370188943317, "grad_norm": 0.34649148521431883, "kl": 0.53125, "learning_rate": 9.999268956342261e-06, "loss": -0.0074, "step": 600 }, { "clip_ratio": 0.0036202336195856333, "clipped_completions_ratio": 0.0, "epoch": 0.21028691392582224, "grad_norm": 0.6123704958791593, "kl": 0.466796875, "learning_rate": 9.999215807873857e-06, "loss": -0.0074, "max_completion_length": 237.0, "max_terminated_completion_length": 237.0, "mean_completion_length": 176.6607208251953, "mean_terminated_completion_length": 176.6607208251953, "min_completion_length": 102.0, "min_terminated_completion_length": 102.0, "num_tokens": 3019163.0, "reward": 1.6056851148605347, "reward_std": 0.12028766423463821, "rewards/check_gptzero_func/mean": 0.3928571343421936, "rewards/check_gptzero_func/std": 0.4928053915500641, "rewards/check_winston_local_func/mean": 0.2286442369222641, "rewards/check_winston_local_func/std": 0.31597548723220825, "rewards/sentence_count_match_reward_logic/mean": 0.984183669090271, "rewards/sentence_count_match_reward_logic/std": 0.051664575934410095, "step": 601 }, { "clip_ratio": 0.009268887341022491, "epoch": 0.2106368089573128, "grad_norm": 1.2455140561335982, "kl": 0.39453125, "learning_rate": 9.999160794745271e-06, "loss": -0.0083, "step": 602 }, { "clip_ratio": 0.01950032263994217, "epoch": 0.21098670398880337, "grad_norm": 4.662108515875804, "kl": 1.25, "learning_rate": 9.999103916977022e-06, "loss": -0.0031, "step": 603 }, { "clip_ratio": 0.027278350666165352, "epoch": 0.21133659902029392, "grad_norm": 0.8131701121526624, "kl": 0.55078125, "learning_rate": 9.999045174590324e-06, "loss": -0.0109, "step": 604 }, { "clip_ratio": 0.0025334160309284925, "clipped_completions_ratio": 0.0, "epoch": 0.21168649405178447, "grad_norm": 0.4703254073498994, "kl": 0.4140625, "learning_rate": 9.998984567607091e-06, "loss": -0.0005, "max_completion_length": 214.0, "max_terminated_completion_length": 214.0, "mean_completion_length": 171.0357208251953, "mean_terminated_completion_length": 171.0357208251953, "min_completion_length": 92.0, "min_terminated_completion_length": 92.0, "num_tokens": 3040125.0, "reward": 1.5679641962051392, "reward_std": 0.2156568318605423, "rewards/check_gptzero_func/mean": 0.375, "rewards/check_gptzero_func/std": 0.48850420117378235, "rewards/check_winston_local_func/mean": 0.19891656935214996, "rewards/check_winston_local_func/std": 0.237327441573143, "rewards/sentence_count_match_reward_logic/mean": 0.9940475821495056, "rewards/sentence_count_match_reward_logic/std": 0.02524530701339245, "step": 605 }, { "clip_ratio": 0.0076380097307264805, "epoch": 0.21203638908327502, "grad_norm": 0.48210404673141444, "kl": 0.408203125, "learning_rate": 9.998922096049925e-06, "loss": -0.0013, "step": 606 }, { "clip_ratio": 0.015565494075417519, "epoch": 0.21238628411476557, "grad_norm": 0.3496873205963685, "kl": 0.435546875, "learning_rate": 9.998857759942132e-06, "loss": -0.0033, "step": 607 }, { "clip_ratio": 0.0275461096316576, "epoch": 0.21273617914625612, "grad_norm": 0.3085178748479723, "kl": 0.470703125, "learning_rate": 9.998791559307702e-06, "loss": -0.0051, "step": 608 }, { "clip_ratio": 0.0031665468122810125, "clipped_completions_ratio": 0.0, "epoch": 0.21308607417774666, "grad_norm": 0.5305610779020943, "kl": 0.431640625, "learning_rate": 9.998723494171331e-06, "loss": 0.0057, "max_completion_length": 255.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 169.875, "mean_terminated_completion_length": 169.875, "min_completion_length": 86.0, "min_terminated_completion_length": 86.0, "num_tokens": 3061318.0, "reward": 1.1954001188278198, "reward_std": 0.1492670178413391, "rewards/check_gptzero_func/mean": 0.0535714291036129, "rewards/check_gptzero_func/std": 0.22720779478549957, "rewards/check_winston_local_func/mean": 0.15075726807117462, "rewards/check_winston_local_func/std": 0.13951821625232697, "rewards/sentence_count_match_reward_logic/mean": 0.9910714030265808, "rewards/sentence_count_match_reward_logic/std": 0.037867967039346695, "step": 609 }, { "clip_ratio": 0.01137758232653141, "epoch": 0.21343596920923724, "grad_norm": 0.4506280504793068, "kl": 0.44921875, "learning_rate": 9.998653564558408e-06, "loss": 0.0036, "step": 610 }, { "clip_ratio": 0.021190622821450233, "epoch": 0.2137858642407278, "grad_norm": 0.4527767690521238, "kl": 0.453125, "learning_rate": 9.998581770495012e-06, "loss": 0.0022, "step": 611 }, { "clip_ratio": 0.03220348805189133, "epoch": 0.21413575927221834, "grad_norm": 0.28586607283221466, "kl": 0.44921875, "learning_rate": 9.998508112007925e-06, "loss": 0.0003, "step": 612 }, { "clip_ratio": 0.003492561401799321, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.2144856543037089, "grad_norm": 0.4979554880514248, "kl": 0.380859375, "learning_rate": 9.99843258912462e-06, "loss": 0.0031, "max_completion_length": 256.0, "max_terminated_completion_length": 183.0, "mean_completion_length": 166.0178680419922, "mean_terminated_completion_length": 151.02084350585938, "min_completion_length": 91.0, "min_terminated_completion_length": 91.0, "num_tokens": 3082015.0, "reward": 1.5817210674285889, "reward_std": 0.21650952100753784, "rewards/check_gptzero_func/mean": 0.4107142984867096, "rewards/check_gptzero_func/std": 0.49641573429107666, "rewards/check_winston_local_func/mean": 0.1710067242383957, "rewards/check_winston_local_func/std": 0.1639496237039566, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 613 }, { "clip_ratio": 0.007606383878737688, "epoch": 0.21483554933519944, "grad_norm": 0.4733748026693374, "kl": 0.38671875, "learning_rate": 9.998355201873267e-06, "loss": 0.0008, "step": 614 }, { "clip_ratio": 0.020030338317155838, "epoch": 0.21518544436669, "grad_norm": 0.3431656350456533, "kl": 0.40234375, "learning_rate": 9.99827595028273e-06, "loss": -0.0016, "step": 615 }, { "clip_ratio": 0.03437113016843796, "epoch": 0.21553533939818054, "grad_norm": 0.3036933303712916, "kl": 0.4140625, "learning_rate": 9.998194834382567e-06, "loss": -0.0037, "step": 616 }, { "clip_ratio": 0.006537509150803089, "clipped_completions_ratio": 0.0, "epoch": 0.21588523442967109, "grad_norm": 0.6728394977115061, "kl": 0.5390625, "learning_rate": 9.998111854203037e-06, "loss": -0.0014, "max_completion_length": 171.0, "max_terminated_completion_length": 171.0, "mean_completion_length": 129.48214721679688, "mean_terminated_completion_length": 129.48214721679688, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 3098554.0, "reward": 1.6426244974136353, "reward_std": 0.21988359093666077, "rewards/check_gptzero_func/mean": 0.25, "rewards/check_gptzero_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.40155309438705444, "rewards/check_winston_local_func/std": 0.2662234902381897, "rewards/sentence_count_match_reward_logic/mean": 0.9910714030265808, "rewards/sentence_count_match_reward_logic/std": 0.04681408405303955, "step": 617 }, { "clip_ratio": 0.017091505229473114, "epoch": 0.21623512946116166, "grad_norm": 0.5305138336513603, "kl": 0.55859375, "learning_rate": 9.99802700977509e-06, "loss": -0.0049, "step": 618 }, { "clip_ratio": 0.03186539188027382, "epoch": 0.2165850244926522, "grad_norm": 0.41390706837614694, "kl": 0.57421875, "learning_rate": 9.99794030113037e-06, "loss": -0.007, "step": 619 }, { "clip_ratio": 0.04318554326891899, "epoch": 0.21693491952414276, "grad_norm": 0.29835163398965064, "kl": 0.58203125, "learning_rate": 9.997851728301219e-06, "loss": -0.0088, "step": 620 }, { "clip_ratio": 0.0027903893496841192, "clipped_completions_ratio": 0.1607142857142857, "epoch": 0.2172848145556333, "grad_norm": 0.44132557849790416, "kl": 0.416015625, "learning_rate": 9.997761291320676e-06, "loss": 0.0109, "max_completion_length": 256.0, "max_terminated_completion_length": 252.0, "mean_completion_length": 186.71429443359375, "mean_terminated_completion_length": 173.44680786132812, "min_completion_length": 122.0, "min_terminated_completion_length": 122.0, "num_tokens": 3121242.0, "reward": 1.5840256214141846, "reward_std": 0.11926669627428055, "rewards/check_gptzero_func/mean": 0.3214285671710968, "rewards/check_gptzero_func/std": 0.4712514281272888, "rewards/check_winston_local_func/mean": 0.27063268423080444, "rewards/check_winston_local_func/std": 0.30709412693977356, "rewards/sentence_count_match_reward_logic/mean": 0.9919642806053162, "rewards/sentence_count_match_reward_logic/std": 0.04240152984857559, "step": 621 }, { "clip_ratio": 0.004653595387935638, "epoch": 0.21763470958712386, "grad_norm": 0.403006227982956, "kl": 0.416015625, "learning_rate": 9.997668990222473e-06, "loss": 0.0093, "step": 622 }, { "clip_ratio": 0.013819066807627678, "epoch": 0.2179846046186144, "grad_norm": 0.31934732807988425, "kl": 0.421875, "learning_rate": 9.997574825041034e-06, "loss": 0.007, "step": 623 }, { "clip_ratio": 0.02564837411046028, "epoch": 0.21833449965010496, "grad_norm": 0.27371822748278357, "kl": 0.43359375, "learning_rate": 9.997478795811486e-06, "loss": 0.0057, "step": 624 }, { "clip_ratio": 0.0032414295710623264, "clipped_completions_ratio": 0.2678571428571429, "epoch": 0.21868439468159553, "grad_norm": 0.506992248945535, "kl": 0.45703125, "learning_rate": 9.997380902569644e-06, "loss": 0.0083, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 187.3928680419922, "mean_terminated_completion_length": 162.2926788330078, "min_completion_length": 91.0, "min_terminated_completion_length": 91.0, "num_tokens": 3144416.0, "reward": 1.6411033868789673, "reward_std": 0.20023909211158752, "rewards/check_gptzero_func/mean": 0.3035714328289032, "rewards/check_gptzero_func/std": 0.4639608860015869, "rewards/check_winston_local_func/mean": 0.34564879536628723, "rewards/check_winston_local_func/std": 0.3006749153137207, "rewards/sentence_count_match_reward_logic/mean": 0.9918831586837769, "rewards/sentence_count_match_reward_logic/std": 0.026157844811677933, "step": 625 }, { "clip_ratio": 0.007907523773610592, "epoch": 0.21903428971308608, "grad_norm": 0.42024292545121755, "kl": 0.458984375, "learning_rate": 9.997281145352022e-06, "loss": 0.0063, "step": 626 }, { "clip_ratio": 0.0184819046407938, "epoch": 0.21938418474457663, "grad_norm": 0.3506363719838166, "kl": 0.466796875, "learning_rate": 9.99717952419583e-06, "loss": 0.004, "step": 627 }, { "clip_ratio": 0.03395656496286392, "epoch": 0.21973407977606718, "grad_norm": 0.32541603433181254, "kl": 0.474609375, "learning_rate": 9.99707603913897e-06, "loss": 0.0024, "step": 628 }, { "clip_ratio": 0.0038437899202108383, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.22008397480755773, "grad_norm": 0.5690466846543851, "kl": 0.41796875, "learning_rate": 9.996970690220043e-06, "loss": 0.0025, "max_completion_length": 256.0, "max_terminated_completion_length": 183.0, "mean_completion_length": 179.67857360839844, "mean_terminated_completion_length": 149.15000915527344, "min_completion_length": 104.0, "min_terminated_completion_length": 104.0, "num_tokens": 3167534.0, "reward": 1.646116852760315, "reward_std": 0.2731439173221588, "rewards/check_gptzero_func/mean": 0.4821428656578064, "rewards/check_gptzero_func/std": 0.5042031407356262, "rewards/check_winston_local_func/mean": 0.24730727076530457, "rewards/check_winston_local_func/std": 0.2674359679222107, "rewards/sentence_count_match_reward_logic/mean": 0.9166666865348816, "rewards/sentence_count_match_reward_logic/std": 0.12247448414564133, "step": 629 }, { "clip_ratio": 0.008625361137092113, "epoch": 0.22043386983904828, "grad_norm": 0.45087594207555576, "kl": 0.416015625, "learning_rate": 9.99686347747834e-06, "loss": -0.0002, "step": 630 }, { "clip_ratio": 0.02206123061478138, "epoch": 0.22078376487053883, "grad_norm": 0.4039976063980334, "kl": 0.423828125, "learning_rate": 9.996754400953852e-06, "loss": -0.0024, "step": 631 }, { "clip_ratio": 0.03577674925327301, "epoch": 0.22113365990202938, "grad_norm": 0.3571837545791394, "kl": 0.458984375, "learning_rate": 9.996643460687264e-06, "loss": -0.004, "step": 632 }, { "clip_ratio": 0.0048550572246313095, "clipped_completions_ratio": 0.1607142857142857, "epoch": 0.22148355493351995, "grad_norm": 0.6895437526934312, "kl": 0.52734375, "learning_rate": 9.996530656719954e-06, "loss": 0.0007, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 155.75, "mean_terminated_completion_length": 136.55319213867188, "min_completion_length": 100.0, "min_terminated_completion_length": 100.0, "num_tokens": 3186960.0, "reward": 2.0056986808776855, "reward_std": 0.269986629486084, "rewards/check_gptzero_func/mean": 0.5714285969734192, "rewards/check_gptzero_func/std": 0.4993502199649811, "rewards/check_winston_local_func/mean": 0.49108821153640747, "rewards/check_winston_local_func/std": 0.2944048047065735, "rewards/sentence_count_match_reward_logic/mean": 0.9431818723678589, "rewards/sentence_count_match_reward_logic/std": 0.11074843257665634, "step": 633 }, { "clip_ratio": 0.01723216287791729, "epoch": 0.2218334499650105, "grad_norm": 0.7563808075887257, "kl": 0.5625, "learning_rate": 9.996415989093999e-06, "loss": -0.0019, "step": 634 }, { "clip_ratio": 0.028176680207252502, "epoch": 0.22218334499650105, "grad_norm": 0.37699443916436837, "kl": 0.546875, "learning_rate": 9.996299457852167e-06, "loss": -0.0055, "step": 635 }, { "clip_ratio": 0.0424787662923336, "epoch": 0.2225332400279916, "grad_norm": 0.5678846346706413, "kl": 0.53515625, "learning_rate": 9.996181063037924e-06, "loss": -0.0065, "step": 636 }, { "clip_ratio": 0.004251939710229635, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.22288313505948215, "grad_norm": 0.7601334764901881, "kl": 0.58984375, "learning_rate": 9.996060804695431e-06, "loss": 0.0049, "max_completion_length": 256.0, "max_terminated_completion_length": 241.0, "mean_completion_length": 162.33929443359375, "mean_terminated_completion_length": 146.7291717529297, "min_completion_length": 54.0, "min_terminated_completion_length": 54.0, "num_tokens": 3207195.0, "reward": 1.5396363735198975, "reward_std": 0.21214069426059723, "rewards/check_gptzero_func/mean": 0.2142857164144516, "rewards/check_gptzero_func/std": 0.41403928399086, "rewards/check_winston_local_func/mean": 0.35213643312454224, "rewards/check_winston_local_func/std": 0.21003934741020203, "rewards/sentence_count_match_reward_logic/mean": 0.9732142686843872, "rewards/sentence_count_match_reward_logic/std": 0.0517549142241478, "step": 637 }, { "clip_ratio": 0.022015197202563286, "epoch": 0.2232330300909727, "grad_norm": 0.7712160873424059, "kl": 0.65625, "learning_rate": 9.995938682869542e-06, "loss": 0.0027, "step": 638 }, { "clip_ratio": 0.03010730817914009, "epoch": 0.22358292512246325, "grad_norm": 0.4896609768707595, "kl": 0.64453125, "learning_rate": 9.995814697605808e-06, "loss": -0.0009, "step": 639 }, { "clip_ratio": 0.04329677298665047, "epoch": 0.22393282015395383, "grad_norm": 0.5459183265521755, "kl": 0.59765625, "learning_rate": 9.995688848950473e-06, "loss": -0.003, "step": 640 }, { "clip_ratio": 0.0028928155079483986, "clipped_completions_ratio": 0.0, "epoch": 0.22428271518544438, "grad_norm": 0.5607967179635124, "kl": 0.50390625, "learning_rate": 9.995561136950479e-06, "loss": 0.0034, "max_completion_length": 223.0, "max_terminated_completion_length": 223.0, "mean_completion_length": 163.58929443359375, "mean_terminated_completion_length": 163.58929443359375, "min_completion_length": 90.0, "min_terminated_completion_length": 90.0, "num_tokens": 3227308.0, "reward": 1.7989789247512817, "reward_std": 0.16394290328025818, "rewards/check_gptzero_func/mean": 0.5714285969734192, "rewards/check_gptzero_func/std": 0.4993502199649811, "rewards/check_winston_local_func/mean": 0.23052644729614258, "rewards/check_winston_local_func/std": 0.25162243843078613, "rewards/sentence_count_match_reward_logic/mean": 0.9970237612724304, "rewards/sentence_count_match_reward_logic/std": 0.02227177284657955, "step": 641 }, { "clip_ratio": 0.01464028935879469, "epoch": 0.22463261021693492, "grad_norm": 0.5674023935806448, "kl": 0.55859375, "learning_rate": 9.99543156165346e-06, "loss": 0.0018, "step": 642 }, { "clip_ratio": 0.021763572469353676, "epoch": 0.22498250524842547, "grad_norm": 0.48353051347280146, "kl": 0.5546875, "learning_rate": 9.995300123107747e-06, "loss": -0.0007, "step": 643 }, { "clip_ratio": 0.02983946166932583, "epoch": 0.22533240027991602, "grad_norm": 0.5149496263388784, "kl": 0.5, "learning_rate": 9.995166821362368e-06, "loss": -0.0016, "step": 644 }, { "clip_ratio": 0.004599442705512047, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.22568229531140657, "grad_norm": 0.6053084578615346, "kl": 0.498046875, "learning_rate": 9.995031656467038e-06, "loss": 0.0087, "max_completion_length": 256.0, "max_terminated_completion_length": 202.0, "mean_completion_length": 144.73214721679688, "mean_terminated_completion_length": 126.1875, "min_completion_length": 89.0, "min_terminated_completion_length": 89.0, "num_tokens": 3245549.0, "reward": 1.5583958625793457, "reward_std": 0.23426920175552368, "rewards/check_gptzero_func/mean": 0.0892857164144516, "rewards/check_gptzero_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.525657594203949, "rewards/check_winston_local_func/std": 0.4150081276893616, "rewards/sentence_count_match_reward_logic/mean": 0.9434523582458496, "rewards/sentence_count_match_reward_logic/std": 0.14428548514842987, "step": 645 }, { "clip_ratio": 0.009141015820205212, "epoch": 0.22603219034289712, "grad_norm": 0.49599976405114626, "kl": 0.5, "learning_rate": 9.994894628472175e-06, "loss": 0.0057, "step": 646 }, { "clip_ratio": 0.03620540723204613, "epoch": 0.22638208537438767, "grad_norm": 0.46010511564534773, "kl": 0.5078125, "learning_rate": 9.99475573742889e-06, "loss": 0.0033, "step": 647 }, { "clip_ratio": 0.05206413194537163, "epoch": 0.22673198040587825, "grad_norm": 0.3203537420311447, "kl": 0.5078125, "learning_rate": 9.994614983388986e-06, "loss": 0.0019, "step": 648 }, { "clip_ratio": 0.00348992133513093, "clipped_completions_ratio": 0.0, "epoch": 0.2270818754373688, "grad_norm": 0.714265971879709, "kl": 0.54296875, "learning_rate": 9.994472366404964e-06, "loss": 0.003, "max_completion_length": 226.0, "max_terminated_completion_length": 226.0, "mean_completion_length": 179.6607208251953, "mean_terminated_completion_length": 179.6607208251953, "min_completion_length": 111.0, "min_terminated_completion_length": 111.0, "num_tokens": 3267898.0, "reward": 1.352026343345642, "reward_std": 0.1562114804983139, "rewards/check_gptzero_func/mean": 0.1071428582072258, "rewards/check_gptzero_func/std": 0.3120938837528229, "rewards/check_winston_local_func/mean": 0.2541307508945465, "rewards/check_winston_local_func/std": 0.2919701337814331, "rewards/sentence_count_match_reward_logic/mean": 0.9907525777816772, "rewards/sentence_count_match_reward_logic/std": 0.03370845690369606, "step": 649 }, { "clip_ratio": 0.0094044990837574, "epoch": 0.22743177046885935, "grad_norm": 0.44935369206348036, "kl": 0.5078125, "learning_rate": 9.99432788653002e-06, "loss": -0.0001, "step": 650 }, { "clip_ratio": 0.021329207345843315, "epoch": 0.2277816655003499, "grad_norm": 0.3907542751379412, "kl": 0.5234375, "learning_rate": 9.99418154381804e-06, "loss": -0.0027, "step": 651 }, { "clip_ratio": 0.03326799347996712, "epoch": 0.22813156053184044, "grad_norm": 0.33395861236360924, "kl": 0.53125, "learning_rate": 9.994033338323612e-06, "loss": -0.0044, "step": 652 }, { "clip_ratio": 0.004340808838605881, "clipped_completions_ratio": 0.0, "epoch": 0.228481455563331, "grad_norm": 0.5509890400634149, "kl": 0.50390625, "learning_rate": 9.993883270102016e-06, "loss": -0.0018, "max_completion_length": 252.0, "max_terminated_completion_length": 252.0, "mean_completion_length": 141.80357360839844, "mean_terminated_completion_length": 141.80357360839844, "min_completion_length": 95.0, "min_terminated_completion_length": 95.0, "num_tokens": 3285727.0, "reward": 1.6245180368423462, "reward_std": 0.14113256335258484, "rewards/check_gptzero_func/mean": 0.4464285671710968, "rewards/check_gptzero_func/std": 0.5016207695007324, "rewards/check_winston_local_func/mean": 0.19594666361808777, "rewards/check_winston_local_func/std": 0.20257548987865448, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.05754726752638817, "step": 653 }, { "clip_ratio": 0.009644358418881893, "epoch": 0.22883135059482154, "grad_norm": 0.4886689398417403, "kl": 0.51171875, "learning_rate": 9.993731339209223e-06, "loss": -0.0043, "step": 654 }, { "clip_ratio": 0.026075633242726326, "epoch": 0.22918124562631212, "grad_norm": 0.33664750321585213, "kl": 0.52734375, "learning_rate": 9.993577545701902e-06, "loss": -0.0061, "step": 655 }, { "clip_ratio": 0.040259212255477905, "epoch": 0.22953114065780267, "grad_norm": 0.3371622572020565, "kl": 0.53125, "learning_rate": 9.993421889637418e-06, "loss": -0.0072, "step": 656 }, { "clip_ratio": 0.0036061317659914494, "clipped_completions_ratio": 0.0, "epoch": 0.22988103568929322, "grad_norm": 0.4864069327129521, "kl": 0.51171875, "learning_rate": 9.993264371073828e-06, "loss": 0.0044, "max_completion_length": 231.0, "max_terminated_completion_length": 231.0, "mean_completion_length": 177.73214721679688, "mean_terminated_completion_length": 177.73214721679688, "min_completion_length": 104.0, "min_terminated_completion_length": 104.0, "num_tokens": 3307216.0, "reward": 1.8277418613433838, "reward_std": 0.18758852779865265, "rewards/check_gptzero_func/mean": 0.5892857313156128, "rewards/check_gptzero_func/std": 0.49641573429107666, "rewards/check_winston_local_func/mean": 0.2384561449289322, "rewards/check_winston_local_func/std": 0.3166908025741577, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 657 }, { "clip_ratio": 0.010620033368468285, "epoch": 0.23023093072078377, "grad_norm": 0.49736808726627213, "kl": 0.52734375, "learning_rate": 9.993104990069887e-06, "loss": 0.0023, "step": 658 }, { "clip_ratio": 0.01972523331642151, "epoch": 0.23058082575227432, "grad_norm": 0.3496349091548678, "kl": 0.52734375, "learning_rate": 9.99294374668504e-06, "loss": 0.0001, "step": 659 }, { "clip_ratio": 0.031903766095638275, "epoch": 0.23093072078376486, "grad_norm": 0.294501867598896, "kl": 0.52734375, "learning_rate": 9.99278064097943e-06, "loss": -0.0013, "step": 660 }, { "clip_ratio": 0.005948017816990614, "clipped_completions_ratio": 0.0, "epoch": 0.2312806158152554, "grad_norm": 0.5591312868999889, "kl": 0.47265625, "learning_rate": 9.992615673013895e-06, "loss": -0.019, "max_completion_length": 214.0, "max_terminated_completion_length": 214.0, "mean_completion_length": 164.3928680419922, "mean_terminated_completion_length": 164.3928680419922, "min_completion_length": 111.0, "min_terminated_completion_length": 111.0, "num_tokens": 3327558.0, "reward": 1.8214843273162842, "reward_std": 0.34616056084632874, "rewards/check_gptzero_func/mean": 0.5714285969734192, "rewards/check_gptzero_func/std": 0.4993502199649811, "rewards/check_winston_local_func/mean": 0.2798176109790802, "rewards/check_winston_local_func/std": 0.3002585172653198, "rewards/sentence_count_match_reward_logic/mean": 0.9702381491661072, "rewards/sentence_count_match_reward_logic/std": 0.09591211378574371, "step": 661 }, { "clip_ratio": 0.009724353440105915, "epoch": 0.23163051084674596, "grad_norm": 0.4808393420346072, "kl": 0.470703125, "learning_rate": 9.992448842849967e-06, "loss": -0.0221, "step": 662 }, { "clip_ratio": 0.022878628224134445, "epoch": 0.23198040587823654, "grad_norm": 0.38197896481999816, "kl": 0.47265625, "learning_rate": 9.992280150549869e-06, "loss": -0.0248, "step": 663 }, { "clip_ratio": 0.04208405688405037, "epoch": 0.2323303009097271, "grad_norm": 0.33523333506527053, "kl": 0.46484375, "learning_rate": 9.992109596176525e-06, "loss": -0.0267, "step": 664 }, { "clip_ratio": 0.0042951093055307865, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.23268019594121764, "grad_norm": 0.5938446925452399, "kl": 0.5390625, "learning_rate": 9.991937179793547e-06, "loss": 0.0034, "max_completion_length": 256.0, "max_terminated_completion_length": 188.0, "mean_completion_length": 178.94644165039062, "mean_terminated_completion_length": 148.125, "min_completion_length": 107.0, "min_terminated_completion_length": 107.0, "num_tokens": 3350139.0, "reward": 1.6189996004104614, "reward_std": 0.1477842777967453, "rewards/check_gptzero_func/mean": 0.2678571343421936, "rewards/check_gptzero_func/std": 0.44685041904449463, "rewards/check_winston_local_func/mean": 0.35114240646362305, "rewards/check_winston_local_func/std": 0.34978392720222473, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 665 }, { "clip_ratio": 0.011609682813286781, "epoch": 0.2330300909727082, "grad_norm": 0.4321608386688348, "kl": 0.5390625, "learning_rate": 9.991762901465249e-06, "loss": 0.0, "step": 666 }, { "clip_ratio": 0.025481965392827988, "epoch": 0.23337998600419874, "grad_norm": 0.35831349380644884, "kl": 0.546875, "learning_rate": 9.99158676125663e-06, "loss": -0.002, "step": 667 }, { "clip_ratio": 0.03992075473070145, "epoch": 0.23372988103568929, "grad_norm": 0.3310222310664728, "kl": 0.56640625, "learning_rate": 9.991408759233394e-06, "loss": -0.0032, "step": 668 }, { "clip_ratio": 0.005570771638303995, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.23407977606717983, "grad_norm": 0.7667256519021827, "kl": 0.6796875, "learning_rate": 9.991228895461929e-06, "loss": 0.0035, "max_completion_length": 256.0, "max_terminated_completion_length": 180.0, "mean_completion_length": 133.82144165039062, "mean_terminated_completion_length": 113.45833587646484, "min_completion_length": 64.0, "min_terminated_completion_length": 64.0, "num_tokens": 3367401.0, "reward": 1.5712125301361084, "reward_std": 0.24168509244918823, "rewards/check_gptzero_func/mean": 0.2142857164144516, "rewards/check_gptzero_func/std": 0.4140393137931824, "rewards/check_winston_local_func/mean": 0.39009013772010803, "rewards/check_winston_local_func/std": 0.3336288034915924, "rewards/sentence_count_match_reward_logic/mean": 0.9668367505073547, "rewards/sentence_count_match_reward_logic/std": 0.08610761910676956, "step": 669 }, { "clip_ratio": 0.0211477167904377, "epoch": 0.2344296710986704, "grad_norm": 0.8178742112288013, "kl": 0.75, "learning_rate": 9.991047170009325e-06, "loss": 0.0013, "step": 670 }, { "clip_ratio": 0.036649543792009354, "epoch": 0.23477956613016096, "grad_norm": 0.5012607906055331, "kl": 0.703125, "learning_rate": 9.990863582943363e-06, "loss": -0.0016, "step": 671 }, { "clip_ratio": 0.04922090843319893, "epoch": 0.2351294611616515, "grad_norm": 0.6532080146389622, "kl": 0.67578125, "learning_rate": 9.990678134332521e-06, "loss": -0.0016, "step": 672 }, { "clip_ratio": 0.003687182441353798, "clipped_completions_ratio": 0.0, "epoch": 0.23547935619314206, "grad_norm": 0.666427003215135, "kl": 0.50390625, "learning_rate": 9.990490824245968e-06, "loss": 0.003, "max_completion_length": 216.0, "max_terminated_completion_length": 216.0, "mean_completion_length": 166.4107208251953, "mean_terminated_completion_length": 166.4107208251953, "min_completion_length": 94.0, "min_terminated_completion_length": 94.0, "num_tokens": 3387872.0, "reward": 1.6359522342681885, "reward_std": 0.19997376203536987, "rewards/check_gptzero_func/mean": 0.1785714328289032, "rewards/check_gptzero_func/std": 0.3864591121673584, "rewards/check_winston_local_func/mean": 0.4675846993923187, "rewards/check_winston_local_func/std": 0.3600202798843384, "rewards/sentence_count_match_reward_logic/mean": 0.989795982837677, "rewards/sentence_count_match_reward_logic/std": 0.03712429851293564, "step": 673 }, { "clip_ratio": 0.020294463261961937, "epoch": 0.2358292512246326, "grad_norm": 0.7204659029286057, "kl": 0.5390625, "learning_rate": 9.990301652753568e-06, "loss": 0.0004, "step": 674 }, { "clip_ratio": 0.028011145070195198, "epoch": 0.23617914625612316, "grad_norm": 0.5396897626745127, "kl": 0.5390625, "learning_rate": 9.990110619925884e-06, "loss": -0.0021, "step": 675 }, { "clip_ratio": 0.031828343868255615, "epoch": 0.2365290412876137, "grad_norm": 0.4579594661365567, "kl": 0.515625, "learning_rate": 9.989917725834166e-06, "loss": -0.003, "step": 676 }, { "clip_ratio": 0.0034862279426306486, "clipped_completions_ratio": 0.0, "epoch": 0.23687893631910426, "grad_norm": 0.4769609261709759, "kl": 0.50390625, "learning_rate": 9.989722970550363e-06, "loss": 0.0081, "max_completion_length": 232.0, "max_terminated_completion_length": 232.0, "mean_completion_length": 179.32144165039062, "mean_terminated_completion_length": 179.32144165039062, "min_completion_length": 115.0, "min_terminated_completion_length": 115.0, "num_tokens": 3409626.0, "reward": 1.7359546422958374, "reward_std": 0.13369247317314148, "rewards/check_gptzero_func/mean": 0.4464285671710968, "rewards/check_gptzero_func/std": 0.5016207695007324, "rewards/check_winston_local_func/mean": 0.3088711202144623, "rewards/check_winston_local_func/std": 0.2902243733406067, "rewards/sentence_count_match_reward_logic/mean": 0.9806547164916992, "rewards/sentence_count_match_reward_logic/std": 0.052080631256103516, "step": 677 }, { "clip_ratio": 0.0064384969882667065, "epoch": 0.23722883135059483, "grad_norm": 0.4368944895359137, "kl": 0.50390625, "learning_rate": 9.989526354147116e-06, "loss": 0.0064, "step": 678 }, { "clip_ratio": 0.01692330278456211, "epoch": 0.23757872638208538, "grad_norm": 0.40593945433762924, "kl": 0.5, "learning_rate": 9.98932787669776e-06, "loss": 0.0037, "step": 679 }, { "clip_ratio": 0.02962266467511654, "epoch": 0.23792862141357593, "grad_norm": 0.2755908924539012, "kl": 0.515625, "learning_rate": 9.989127538276329e-06, "loss": 0.002, "step": 680 }, { "clip_ratio": 0.003552611917257309, "clipped_completions_ratio": 0.0357142857142857, "epoch": 0.23827851644506648, "grad_norm": 0.5116218864119417, "kl": 0.6015625, "learning_rate": 9.988925338957544e-06, "loss": -0.0022, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 190.75001525878906, "mean_terminated_completion_length": 188.3333282470703, "min_completion_length": 95.0, "min_terminated_completion_length": 95.0, "num_tokens": 3433196.0, "reward": 1.4450743198394775, "reward_std": 0.25322896242141724, "rewards/check_gptzero_func/mean": 0.2678571343421936, "rewards/check_gptzero_func/std": 0.44685041904449463, "rewards/check_winston_local_func/mean": 0.2084670066833496, "rewards/check_winston_local_func/std": 0.1910780668258667, "rewards/sentence_count_match_reward_logic/mean": 0.96875, "rewards/sentence_count_match_reward_logic/std": 0.07302630692720413, "step": 681 }, { "clip_ratio": 0.0058493115939199924, "epoch": 0.23862841147655703, "grad_norm": 0.430353563586344, "kl": 0.58203125, "learning_rate": 9.988721278816826e-06, "loss": -0.004, "step": 682 }, { "clip_ratio": 0.017351290211081505, "epoch": 0.23897830650804758, "grad_norm": 0.3225287746579541, "kl": 0.5859375, "learning_rate": 9.988515357930284e-06, "loss": -0.0066, "step": 683 }, { "clip_ratio": 0.028298979625105858, "epoch": 0.23932820153953813, "grad_norm": 0.27994940401953566, "kl": 0.58984375, "learning_rate": 9.988307576374727e-06, "loss": -0.0084, "step": 684 }, { "clip_ratio": 0.0023310526739805937, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.2396780965710287, "grad_norm": 0.5526154145074637, "kl": 0.48046875, "learning_rate": 9.988097934227656e-06, "loss": 0.0057, "max_completion_length": 256.0, "max_terminated_completion_length": 243.0, "mean_completion_length": 175.96429443359375, "mean_terminated_completion_length": 162.625, "min_completion_length": 96.0, "min_terminated_completion_length": 96.0, "num_tokens": 3455682.0, "reward": 1.628157138824463, "reward_std": 0.2940354347229004, "rewards/check_gptzero_func/mean": 0.3392857015132904, "rewards/check_gptzero_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.314845472574234, "rewards/check_winston_local_func/std": 0.2985548675060272, "rewards/sentence_count_match_reward_logic/mean": 0.9740260243415833, "rewards/sentence_count_match_reward_logic/std": 0.06419889628887177, "step": 685 }, { "clip_ratio": 0.008438530378043652, "epoch": 0.24002799160251925, "grad_norm": 0.4237873132747593, "kl": 0.490234375, "learning_rate": 9.987886431567264e-06, "loss": 0.0032, "step": 686 }, { "clip_ratio": 0.022972997277975082, "epoch": 0.2403778866340098, "grad_norm": 0.33613758885893613, "kl": 0.5, "learning_rate": 9.98767306847244e-06, "loss": 0.0009, "step": 687 }, { "clip_ratio": 0.035667236894369125, "epoch": 0.24072778166550035, "grad_norm": 0.31421326252430426, "kl": 0.51171875, "learning_rate": 9.987457845022767e-06, "loss": -0.0006, "step": 688 }, { "clip_ratio": 0.0056762308813631535, "clipped_completions_ratio": 0.0, "epoch": 0.2410776766969909, "grad_norm": 0.8134274185498696, "kl": 0.69921875, "learning_rate": 9.98724076129852e-06, "loss": 0.0035, "max_completion_length": 177.0, "max_terminated_completion_length": 177.0, "mean_completion_length": 120.35714721679688, "mean_terminated_completion_length": 120.35714721679688, "min_completion_length": 53.0, "min_terminated_completion_length": 53.0, "num_tokens": 3471134.0, "reward": 1.9186803102493286, "reward_std": 0.3732222318649292, "rewards/check_gptzero_func/mean": 0.3571428656578064, "rewards/check_gptzero_func/std": 0.48349377512931824, "rewards/check_winston_local_func/mean": 0.5615373253822327, "rewards/check_winston_local_func/std": 0.28382834792137146, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 689 }, { "clip_ratio": 0.013347110711038113, "epoch": 0.24142757172848145, "grad_norm": 0.559670161965566, "kl": 0.734375, "learning_rate": 9.98702181738067e-06, "loss": -0.0001, "step": 690 }, { "clip_ratio": 0.03450771048665047, "epoch": 0.241777466759972, "grad_norm": 0.5032868576208462, "kl": 0.7734375, "learning_rate": 9.986801013350883e-06, "loss": -0.0032, "step": 691 }, { "clip_ratio": 0.04838121309876442, "epoch": 0.24212736179146255, "grad_norm": 0.450596135487766, "kl": 0.796875, "learning_rate": 9.986578349291514e-06, "loss": -0.0039, "step": 692 }, { "clip_ratio": 0.0030636379960924387, "clipped_completions_ratio": 0.0, "epoch": 0.24247725682295312, "grad_norm": 0.7222874953597533, "kl": 0.55859375, "learning_rate": 9.986353825285615e-06, "loss": -0.0006, "max_completion_length": 241.0, "max_terminated_completion_length": 241.0, "mean_completion_length": 135.35714721679688, "mean_terminated_completion_length": 135.35714721679688, "min_completion_length": 84.0, "min_terminated_completion_length": 84.0, "num_tokens": 3488290.0, "reward": 2.082014560699463, "reward_std": 0.16027364134788513, "rewards/check_gptzero_func/mean": 0.4464285671710968, "rewards/check_gptzero_func/std": 0.5016207695007324, "rewards/check_winston_local_func/mean": 0.6432390213012695, "rewards/check_winston_local_func/std": 0.3340514898300171, "rewards/sentence_count_match_reward_logic/mean": 0.9923468828201294, "rewards/sentence_count_match_reward_logic/std": 0.03245825320482254, "step": 693 }, { "clip_ratio": 0.010168096050620079, "epoch": 0.24282715185444367, "grad_norm": 0.46372169641827143, "kl": 0.56640625, "learning_rate": 9.98612744141693e-06, "loss": -0.0037, "step": 694 }, { "clip_ratio": 0.026424085721373558, "epoch": 0.24317704688593422, "grad_norm": 0.376084330485384, "kl": 0.58203125, "learning_rate": 9.985899197769902e-06, "loss": -0.0051, "step": 695 }, { "clip_ratio": 0.041096705943346024, "epoch": 0.24352694191742477, "grad_norm": 0.3117221383007737, "kl": 0.6015625, "learning_rate": 9.985669094429662e-06, "loss": -0.0063, "step": 696 }, { "clip_ratio": 0.0046484279446303844, "clipped_completions_ratio": 0.0, "epoch": 0.24387683694891532, "grad_norm": 0.7015019975685638, "kl": 0.57421875, "learning_rate": 9.985437131482035e-06, "loss": -0.0019, "max_completion_length": 231.0, "max_terminated_completion_length": 231.0, "mean_completion_length": 153.0357208251953, "mean_terminated_completion_length": 153.0357208251953, "min_completion_length": 103.0, "min_terminated_completion_length": 103.0, "num_tokens": 3507484.0, "reward": 1.9477488994598389, "reward_std": 0.2736755311489105, "rewards/check_gptzero_func/mean": 0.5535714030265808, "rewards/check_gptzero_func/std": 0.5016207695007324, "rewards/check_winston_local_func/mean": 0.4380762577056885, "rewards/check_winston_local_func/std": 0.3233146369457245, "rewards/sentence_count_match_reward_logic/mean": 0.9561012387275696, "rewards/sentence_count_match_reward_logic/std": 0.11163941770792007, "step": 697 }, { "clip_ratio": 0.010279265232384205, "epoch": 0.24422673198040587, "grad_norm": 0.5366903679462284, "kl": 0.609375, "learning_rate": 9.98520330901354e-06, "loss": -0.0049, "step": 698 }, { "clip_ratio": 0.026397403329610825, "epoch": 0.24457662701189642, "grad_norm": 0.43820760911965495, "kl": 0.64453125, "learning_rate": 9.984967627111395e-06, "loss": -0.0075, "step": 699 }, { "clip_ratio": 0.03642621263861656, "epoch": 0.244926522043387, "grad_norm": 0.45651483104979, "kl": 0.63671875, "learning_rate": 9.984730085863504e-06, "loss": -0.0083, "step": 700 }, { "clip_ratio": 0.0023700157180428505, "clipped_completions_ratio": 0.0, "epoch": 0.24527641707487755, "grad_norm": 0.4687496378594148, "kl": 0.462890625, "learning_rate": 9.984490685358468e-06, "loss": 0.0017, "max_completion_length": 242.0, "max_terminated_completion_length": 242.0, "mean_completion_length": 183.48214721679688, "mean_terminated_completion_length": 183.48214721679688, "min_completion_length": 153.0, "min_terminated_completion_length": 153.0, "num_tokens": 3529647.0, "reward": 1.630222201347351, "reward_std": 0.2235778123140335, "rewards/check_gptzero_func/mean": 0.4107142984867096, "rewards/check_gptzero_func/std": 0.49641573429107666, "rewards/check_winston_local_func/mean": 0.2343888133764267, "rewards/check_winston_local_func/std": 0.21926672756671906, "rewards/sentence_count_match_reward_logic/mean": 0.9851189851760864, "rewards/sentence_count_match_reward_logic/std": 0.047956064343452454, "step": 701 }, { "clip_ratio": 0.005596226546913385, "epoch": 0.2456263121063681, "grad_norm": 0.4328958144244223, "kl": 0.462890625, "learning_rate": 9.984249425685582e-06, "loss": -0.0007, "step": 702 }, { "clip_ratio": 0.014569886028766632, "epoch": 0.24597620713785864, "grad_norm": 0.3882688650432866, "kl": 0.4765625, "learning_rate": 9.984006306934832e-06, "loss": -0.003, "step": 703 }, { "clip_ratio": 0.029327819123864174, "epoch": 0.2463261021693492, "grad_norm": 0.2840468297399616, "kl": 0.486328125, "learning_rate": 9.9837613291969e-06, "loss": -0.0048, "step": 704 }, { "clip_ratio": 0.0034402632154524326, "clipped_completions_ratio": 0.0, "epoch": 0.24667599720083974, "grad_norm": 0.6561226927508743, "kl": 0.484375, "learning_rate": 9.98351449256316e-06, "loss": 0.002, "max_completion_length": 208.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 146.5357208251953, "mean_terminated_completion_length": 146.5357208251953, "min_completion_length": 81.0, "min_terminated_completion_length": 81.0, "num_tokens": 3548237.0, "reward": 1.5931800603866577, "reward_std": 0.15718750655651093, "rewards/check_gptzero_func/mean": 0.25, "rewards/check_gptzero_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.34317997097969055, "rewards/check_winston_local_func/std": 0.28782057762145996, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 705 }, { "clip_ratio": 0.0099645322188735, "epoch": 0.2470258922323303, "grad_norm": 0.47552179419712054, "kl": 0.4921875, "learning_rate": 9.98326579712568e-06, "loss": 0.0001, "step": 706 }, { "clip_ratio": 0.02554316446185112, "epoch": 0.24737578726382084, "grad_norm": 0.3396457232770832, "kl": 0.5, "learning_rate": 9.983015242977224e-06, "loss": -0.0024, "step": 707 }, { "clip_ratio": 0.03887275978922844, "epoch": 0.24772568229531142, "grad_norm": 0.25074642132395086, "kl": 0.50390625, "learning_rate": 9.982762830211239e-06, "loss": -0.0035, "step": 708 }, { "clip_ratio": 0.0022405986674129963, "clipped_completions_ratio": 0.0, "epoch": 0.24807557732680197, "grad_norm": 0.6346077533771352, "kl": 0.490234375, "learning_rate": 9.98250855892188e-06, "loss": 0.0018, "max_completion_length": 241.0, "max_terminated_completion_length": 241.0, "mean_completion_length": 173.62501525878906, "mean_terminated_completion_length": 173.62501525878906, "min_completion_length": 96.0, "min_terminated_completion_length": 96.0, "num_tokens": 3569616.0, "reward": 1.8184388875961304, "reward_std": 0.2776291072368622, "rewards/check_gptzero_func/mean": 0.4107142984867096, "rewards/check_gptzero_func/std": 0.49641573429107666, "rewards/check_winston_local_func/mean": 0.42558154463768005, "rewards/check_winston_local_func/std": 0.26197850704193115, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.044136740267276764, "step": 709 }, { "clip_ratio": 0.00993884913623333, "epoch": 0.24842547235829252, "grad_norm": 0.4183264525961107, "kl": 0.50390625, "learning_rate": 9.982252429203983e-06, "loss": -0.001, "step": 710 }, { "clip_ratio": 0.026523208245635033, "epoch": 0.24877536738978306, "grad_norm": 0.47866566049201165, "kl": 0.53515625, "learning_rate": 9.981994441153084e-06, "loss": -0.0024, "step": 711 }, { "clip_ratio": 0.03415742889046669, "epoch": 0.2491252624212736, "grad_norm": 0.36848183408527113, "kl": 0.54296875, "learning_rate": 9.98173459486541e-06, "loss": -0.0038, "step": 712 }, { "clip_ratio": 0.003344540251418948, "clipped_completions_ratio": 0.0, "epoch": 0.24947515745276416, "grad_norm": 0.7149173399404911, "kl": 0.7890625, "learning_rate": 9.981472890437881e-06, "loss": 0.0064, "max_completion_length": 175.0, "max_terminated_completion_length": 175.0, "mean_completion_length": 134.44644165039062, "mean_terminated_completion_length": 134.44644165039062, "min_completion_length": 95.0, "min_terminated_completion_length": 95.0, "num_tokens": 3586353.0, "reward": 1.9642882347106934, "reward_std": 0.28652259707450867, "rewards/check_gptzero_func/mean": 0.375, "rewards/check_gptzero_func/std": 0.48850420117378235, "rewards/check_winston_local_func/mean": 0.6096963286399841, "rewards/check_winston_local_func/std": 0.3526184558868408, "rewards/sentence_count_match_reward_logic/mean": 0.9795918464660645, "rewards/sentence_count_match_reward_logic/std": 0.050441987812519073, "step": 713 }, { "clip_ratio": 0.013704218901693821, "epoch": 0.2498250524842547, "grad_norm": 0.5510976797988405, "kl": 0.80078125, "learning_rate": 9.981209327968111e-06, "loss": 0.0024, "step": 714 }, { "clip_ratio": 0.028818443417549133, "epoch": 0.25017494751574526, "grad_norm": 0.38714217993966726, "kl": 0.81640625, "learning_rate": 9.980943907554404e-06, "loss": -0.0003, "step": 715 }, { "clip_ratio": 0.047858014702796936, "epoch": 0.25052484254723584, "grad_norm": 0.3561065799420372, "kl": 0.8359375, "learning_rate": 9.980676629295763e-06, "loss": -0.0016, "step": 716 }, { "clip_ratio": 0.005032869055867195, "clipped_completions_ratio": 0.0, "epoch": 0.25087473757872636, "grad_norm": 0.6491064867511085, "kl": 0.6328125, "learning_rate": 9.980407493291876e-06, "loss": 0.0107, "max_completion_length": 218.0, "max_terminated_completion_length": 218.0, "mean_completion_length": 157.83929443359375, "mean_terminated_completion_length": 157.83929443359375, "min_completion_length": 95.0, "min_terminated_completion_length": 95.0, "num_tokens": 3605856.0, "reward": 1.7628343105316162, "reward_std": 0.36246606707572937, "rewards/check_gptzero_func/mean": 0.375, "rewards/check_gptzero_func/std": 0.48850420117378235, "rewards/check_winston_local_func/mean": 0.42354854941368103, "rewards/check_winston_local_func/std": 0.3124120831489563, "rewards/sentence_count_match_reward_logic/mean": 0.9642857313156128, "rewards/sentence_count_match_reward_logic/std": 0.0861891582608223, "step": 717 }, { "clip_ratio": 0.010267793200910091, "epoch": 0.25122463261021694, "grad_norm": 0.5111498258630054, "kl": 0.640625, "learning_rate": 9.98013649964313e-06, "loss": 0.0076, "step": 718 }, { "clip_ratio": 0.027108777314424515, "epoch": 0.2515745276417075, "grad_norm": 0.4177050207247011, "kl": 0.6484375, "learning_rate": 9.979863648450606e-06, "loss": 0.0052, "step": 719 }, { "clip_ratio": 0.04006367176771164, "epoch": 0.25192442267319803, "grad_norm": 0.3451780323265858, "kl": 0.65234375, "learning_rate": 9.979588939816071e-06, "loss": 0.0043, "step": 720 }, { "clip_ratio": 0.006151231005787849, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.2522743177046886, "grad_norm": 0.5890671404520577, "kl": 0.5546875, "learning_rate": 9.97931237384199e-06, "loss": 0.0046, "max_completion_length": 256.0, "max_terminated_completion_length": 197.0, "mean_completion_length": 154.69644165039062, "mean_terminated_completion_length": 137.8125, "min_completion_length": 57.0, "min_terminated_completion_length": 57.0, "num_tokens": 3625271.0, "reward": 1.6199870109558105, "reward_std": 0.19228492677211761, "rewards/check_gptzero_func/mean": 0.1964285671710968, "rewards/check_gptzero_func/std": 0.40089187026023865, "rewards/check_winston_local_func/mean": 0.4235582947731018, "rewards/check_winston_local_func/std": 0.33619606494903564, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 721 }, { "clip_ratio": 0.010926024056971073, "epoch": 0.25262421273617913, "grad_norm": 0.5048538643700998, "kl": 0.5546875, "learning_rate": 9.97903395063152e-06, "loss": 0.0031, "step": 722 }, { "clip_ratio": 0.024701643735170364, "epoch": 0.2529741077676697, "grad_norm": 0.48708231041553457, "kl": 0.546875, "learning_rate": 9.978753670288508e-06, "loss": 0.0005, "step": 723 }, { "clip_ratio": 0.03290211409330368, "epoch": 0.25332400279916023, "grad_norm": 0.5256892987997226, "kl": 0.53515625, "learning_rate": 9.9784715329175e-06, "loss": 0.0007, "step": 724 }, { "clip_ratio": 0.0038280938751995564, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.2536738978306508, "grad_norm": 0.5946411711677871, "kl": 0.63671875, "learning_rate": 9.978187538623726e-06, "loss": 0.0122, "max_completion_length": 256.0, "max_terminated_completion_length": 165.0, "mean_completion_length": 165.57144165039062, "mean_terminated_completion_length": 129.40000915527344, "min_completion_length": 57.0, "min_terminated_completion_length": 57.0, "num_tokens": 3647311.0, "reward": 1.71094810962677, "reward_std": 0.20848514139652252, "rewards/check_gptzero_func/mean": 0.2321428507566452, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.5430908799171448, "rewards/check_winston_local_func/std": 0.3547170162200928, "rewards/sentence_count_match_reward_logic/mean": 0.9357143044471741, "rewards/sentence_count_match_reward_logic/std": 0.1034470796585083, "step": 725 }, { "clip_ratio": 0.009470386430621147, "epoch": 0.2540237928621414, "grad_norm": 0.5035715011299268, "kl": 0.6484375, "learning_rate": 9.977901687513117e-06, "loss": 0.0095, "step": 726 }, { "clip_ratio": 0.02188851125538349, "epoch": 0.2543736878936319, "grad_norm": 0.37519134087464434, "kl": 0.65625, "learning_rate": 9.977613979692291e-06, "loss": 0.0074, "step": 727 }, { "clip_ratio": 0.03216119855642319, "epoch": 0.2547235829251225, "grad_norm": 0.3887664710983205, "kl": 0.671875, "learning_rate": 9.97732441526856e-06, "loss": 0.006, "step": 728 }, { "clip_ratio": 0.004230254329741001, "clipped_completions_ratio": 0.0, "epoch": 0.255073477956613, "grad_norm": 0.7307050356710822, "kl": 0.765625, "learning_rate": 9.97703299434993e-06, "loss": 0.0032, "max_completion_length": 204.0, "max_terminated_completion_length": 204.0, "mean_completion_length": 155.375, "mean_terminated_completion_length": 155.375, "min_completion_length": 118.0, "min_terminated_completion_length": 118.0, "num_tokens": 3666564.0, "reward": 1.616034984588623, "reward_std": 0.20971707999706268, "rewards/check_gptzero_func/mean": 0.1607142835855484, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.4629737436771393, "rewards/check_winston_local_func/std": 0.3939400613307953, "rewards/sentence_count_match_reward_logic/mean": 0.9923468828201294, "rewards/sentence_count_match_reward_logic/std": 0.03245824947953224, "step": 729 }, { "clip_ratio": 0.011069649830460548, "epoch": 0.2554233729881036, "grad_norm": 0.5994411861078842, "kl": 0.77734375, "learning_rate": 9.976739717045097e-06, "loss": -0.0002, "step": 730 }, { "clip_ratio": 0.031448159366846085, "epoch": 0.2557732680195941, "grad_norm": 0.4733107004945427, "kl": 0.796875, "learning_rate": 9.976444583463449e-06, "loss": -0.0032, "step": 731 }, { "clip_ratio": 0.04907193407416344, "epoch": 0.2561231630510847, "grad_norm": 0.4417564432451054, "kl": 0.80078125, "learning_rate": 9.976147593715074e-06, "loss": -0.0052, "step": 732 }, { "clip_ratio": 0.0044006528332829475, "clipped_completions_ratio": 0.0, "epoch": 0.2564730580825752, "grad_norm": 0.5586895694516897, "kl": 0.65625, "learning_rate": 9.975848747910741e-06, "loss": 0.007, "max_completion_length": 216.0, "max_terminated_completion_length": 216.0, "mean_completion_length": 160.8928680419922, "mean_terminated_completion_length": 160.8928680419922, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 3686270.0, "reward": 1.7666749954223633, "reward_std": 0.24519045650959015, "rewards/check_gptzero_func/mean": 0.375, "rewards/check_gptzero_func/std": 0.48850420117378235, "rewards/check_winston_local_func/mean": 0.3942258059978485, "rewards/check_winston_local_func/std": 0.2892248332500458, "rewards/sentence_count_match_reward_logic/mean": 0.9974489808082581, "rewards/sentence_count_match_reward_logic/std": 0.019090088084340096, "step": 733 }, { "clip_ratio": 0.00855567492544651, "epoch": 0.2568229531140658, "grad_norm": 0.46086755783797234, "kl": 0.6640625, "learning_rate": 9.975548046161918e-06, "loss": 0.0048, "step": 734 }, { "clip_ratio": 0.01979297213256359, "epoch": 0.25717284814555635, "grad_norm": 0.3678749087501046, "kl": 0.6640625, "learning_rate": 9.975245488580763e-06, "loss": 0.0017, "step": 735 }, { "clip_ratio": 0.03583905100822449, "epoch": 0.2575227431770469, "grad_norm": 0.2941734771234061, "kl": 0.6640625, "learning_rate": 9.974941075280128e-06, "loss": 0.0006, "step": 736 }, { "clip_ratio": 0.0036760251969099045, "clipped_completions_ratio": 0.0, "epoch": 0.25787263820853745, "grad_norm": 0.6956719994799249, "kl": 0.67578125, "learning_rate": 9.97463480637356e-06, "loss": 0.002, "max_completion_length": 204.0, "max_terminated_completion_length": 204.0, "mean_completion_length": 142.44644165039062, "mean_terminated_completion_length": 142.44644165039062, "min_completion_length": 90.0, "min_terminated_completion_length": 90.0, "num_tokens": 3703951.0, "reward": 1.7927576303482056, "reward_std": 0.2208889126777649, "rewards/check_gptzero_func/mean": 0.375, "rewards/check_gptzero_func/std": 0.48850420117378235, "rewards/check_winston_local_func/mean": 0.4778764843940735, "rewards/check_winston_local_func/std": 0.2968413531780243, "rewards/sentence_count_match_reward_logic/mean": 0.939880907535553, "rewards/sentence_count_match_reward_logic/std": 0.09835406392812729, "step": 737 }, { "clip_ratio": 0.014119166880846024, "epoch": 0.258222533240028, "grad_norm": 0.5563747791096711, "kl": 0.67578125, "learning_rate": 9.974326681975287e-06, "loss": -0.0021, "step": 738 }, { "clip_ratio": 0.031521376222372055, "epoch": 0.25857242827151855, "grad_norm": 0.4919377087938077, "kl": 0.69140625, "learning_rate": 9.974016702200244e-06, "loss": -0.0048, "step": 739 }, { "clip_ratio": 0.04542554169893265, "epoch": 0.25892232330300907, "grad_norm": 0.4316561787676758, "kl": 0.7265625, "learning_rate": 9.973704867164044e-06, "loss": -0.0065, "step": 740 }, { "clip_ratio": 0.0037894139531999826, "clipped_completions_ratio": 0.0, "epoch": 0.25927221833449965, "grad_norm": 0.7192002011148625, "kl": 0.66796875, "learning_rate": 9.973391176983003e-06, "loss": 0.0057, "max_completion_length": 248.0, "max_terminated_completion_length": 248.0, "mean_completion_length": 163.6428680419922, "mean_terminated_completion_length": 163.6428680419922, "min_completion_length": 80.0, "min_terminated_completion_length": 80.0, "num_tokens": 3724099.0, "reward": 1.885597586631775, "reward_std": 0.34664618968963623, "rewards/check_gptzero_func/mean": 0.4285714328289032, "rewards/check_gptzero_func/std": 0.4993502199649811, "rewards/check_winston_local_func/mean": 0.48636287450790405, "rewards/check_winston_local_func/std": 0.31724926829338074, "rewards/sentence_count_match_reward_logic/mean": 0.9706632494926453, "rewards/sentence_count_match_reward_logic/std": 0.06606735289096832, "step": 741 }, { "clip_ratio": 0.013355385512113571, "epoch": 0.2596221133659902, "grad_norm": 0.5161703467663138, "kl": 0.68359375, "learning_rate": 9.973075631774123e-06, "loss": 0.0023, "step": 742 }, { "clip_ratio": 0.030675023794174194, "epoch": 0.25997200839748075, "grad_norm": 0.4188088831521016, "kl": 0.70703125, "learning_rate": 9.972758231655097e-06, "loss": -0.0006, "step": 743 }, { "clip_ratio": 0.04342165216803551, "epoch": 0.2603219034289713, "grad_norm": 0.3529237438488764, "kl": 0.7109375, "learning_rate": 9.972438976744317e-06, "loss": -0.0026, "step": 744 }, { "clip_ratio": 0.0049988869577646255, "clipped_completions_ratio": 0.0, "epoch": 0.26067179846046185, "grad_norm": 0.5836069183343249, "kl": 0.6484375, "learning_rate": 9.972117867160858e-06, "loss": 0.0022, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 171.9107208251953, "mean_terminated_completion_length": 171.9107208251953, "min_completion_length": 108.0, "min_terminated_completion_length": 108.0, "num_tokens": 3745102.0, "reward": 1.988162636756897, "reward_std": 0.14375820755958557, "rewards/check_gptzero_func/mean": 0.5357142686843872, "rewards/check_gptzero_func/std": 0.5032362937927246, "rewards/check_winston_local_func/mean": 0.4524482190608978, "rewards/check_winston_local_func/std": 0.37690243124961853, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 745 }, { "clip_ratio": 0.011194375343620777, "epoch": 0.2610216934919524, "grad_norm": 0.5227634387808158, "kl": 0.66796875, "learning_rate": 9.971794903024493e-06, "loss": 0.0009, "step": 746 }, { "clip_ratio": 0.02090207114815712, "epoch": 0.26137158852344294, "grad_norm": 0.5124092359310917, "kl": 0.67578125, "learning_rate": 9.971470084455684e-06, "loss": -0.0009, "step": 747 }, { "clip_ratio": 0.030223658308386803, "epoch": 0.2617214835549335, "grad_norm": 0.29883179991988934, "kl": 0.6640625, "learning_rate": 9.971143411575585e-06, "loss": -0.003, "step": 748 }, { "clip_ratio": 0.0031192798633128405, "clipped_completions_ratio": 0.0, "epoch": 0.2620713785864241, "grad_norm": 0.7053574941878255, "kl": 0.7421875, "learning_rate": 9.970814884506043e-06, "loss": 0.0095, "max_completion_length": 195.0, "max_terminated_completion_length": 195.0, "mean_completion_length": 136.30357360839844, "mean_terminated_completion_length": 136.30357360839844, "min_completion_length": 92.0, "min_terminated_completion_length": 92.0, "num_tokens": 3762231.0, "reward": 2.0237948894500732, "reward_std": 0.23158983886241913, "rewards/check_gptzero_func/mean": 0.375, "rewards/check_gptzero_func/std": 0.48850420117378235, "rewards/check_winston_local_func/mean": 0.6513457894325256, "rewards/check_winston_local_func/std": 0.34497666358947754, "rewards/sentence_count_match_reward_logic/mean": 0.9974489808082581, "rewards/sentence_count_match_reward_logic/std": 0.019090088084340096, "step": 749 }, { "clip_ratio": 0.016029318794608116, "epoch": 0.2624212736179146, "grad_norm": 0.6795471786423509, "kl": 0.73828125, "learning_rate": 9.970484503369595e-06, "loss": 0.006, "step": 750 }, { "clip_ratio": 0.029792863875627518, "epoch": 0.2627711686494052, "grad_norm": 0.4377456632940405, "kl": 0.74609375, "learning_rate": 9.970152268289468e-06, "loss": 0.003, "step": 751 }, { "clip_ratio": 0.045441143214702606, "epoch": 0.2631210636808957, "grad_norm": 0.48302087564436175, "kl": 0.7734375, "learning_rate": 9.969818179389586e-06, "loss": 0.0019, "step": 752 }, { "clip_ratio": 0.002526430180296302, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.2634709587123863, "grad_norm": 0.5209123818856511, "kl": 0.56640625, "learning_rate": 9.969482236794558e-06, "loss": 0.0064, "max_completion_length": 256.0, "max_terminated_completion_length": 210.0, "mean_completion_length": 182.23214721679688, "mean_terminated_completion_length": 169.9375, "min_completion_length": 132.0, "min_terminated_completion_length": 132.0, "num_tokens": 3784476.0, "reward": 1.6435906887054443, "reward_std": 0.3082408905029297, "rewards/check_gptzero_func/mean": 0.2142857164144516, "rewards/check_gptzero_func/std": 0.41403934359550476, "rewards/check_winston_local_func/mean": 0.48109060525894165, "rewards/check_winston_local_func/std": 0.36053529381752014, "rewards/sentence_count_match_reward_logic/mean": 0.9482142329216003, "rewards/sentence_count_match_reward_logic/std": 0.0873677060008049, "step": 753 }, { "clip_ratio": 0.007172503974288702, "epoch": 0.2638208537438768, "grad_norm": 0.44030474248508084, "kl": 0.578125, "learning_rate": 9.969144440629688e-06, "loss": 0.0043, "step": 754 }, { "clip_ratio": 0.01967255398631096, "epoch": 0.2641707487753674, "grad_norm": 0.43123491469324066, "kl": 0.5859375, "learning_rate": 9.968804791020972e-06, "loss": 0.0023, "step": 755 }, { "clip_ratio": 0.03222627565264702, "epoch": 0.26452064380685797, "grad_norm": 0.3106694358895903, "kl": 0.5859375, "learning_rate": 9.968463288095096e-06, "loss": 0.0006, "step": 756 }, { "clip_ratio": 0.0028126274701207876, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.2648705388383485, "grad_norm": 0.6227698443781838, "kl": 0.609375, "learning_rate": 9.968119931979436e-06, "loss": 0.0034, "max_completion_length": 256.0, "max_terminated_completion_length": 249.0, "mean_completion_length": 170.9107208251953, "mean_terminated_completion_length": 156.7291717529297, "min_completion_length": 94.0, "min_terminated_completion_length": 94.0, "num_tokens": 3805519.0, "reward": 1.8755477666854858, "reward_std": 0.15945452451705933, "rewards/check_gptzero_func/mean": 0.4821428656578064, "rewards/check_gptzero_func/std": 0.5042031407356262, "rewards/check_winston_local_func/mean": 0.4246548116207123, "rewards/check_winston_local_func/std": 0.3650044798851013, "rewards/sentence_count_match_reward_logic/mean": 0.96875, "rewards/sentence_count_match_reward_logic/std": 0.08342797309160233, "step": 757 }, { "clip_ratio": 0.010932456701993942, "epoch": 0.26522043386983907, "grad_norm": 0.45510706392061306, "kl": 0.609375, "learning_rate": 9.967774722802062e-06, "loss": 0.0, "step": 758 }, { "clip_ratio": 0.02447049878537655, "epoch": 0.2655703289013296, "grad_norm": 0.37422661915888417, "kl": 0.6171875, "learning_rate": 9.967427660691732e-06, "loss": -0.0013, "step": 759 }, { "clip_ratio": 0.03912734240293503, "epoch": 0.26592022393282017, "grad_norm": 0.29159963450896703, "kl": 0.6171875, "learning_rate": 9.9670787457779e-06, "loss": -0.0029, "step": 760 }, { "clip_ratio": 0.003759579034522176, "clipped_completions_ratio": 0.0357142857142857, "epoch": 0.2662701189643107, "grad_norm": 0.6678280433130753, "kl": 0.6875, "learning_rate": 9.966727978190705e-06, "loss": 0.0273, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 167.2678680419922, "mean_terminated_completion_length": 163.98147583007812, "min_completion_length": 106.0, "min_terminated_completion_length": 106.0, "num_tokens": 3825838.0, "reward": 2.0755019187927246, "reward_std": 0.22924154996871948, "rewards/check_gptzero_func/mean": 0.5178571343421936, "rewards/check_gptzero_func/std": 0.5042031407356262, "rewards/check_winston_local_func/mean": 0.6283966898918152, "rewards/check_winston_local_func/std": 0.21461158990859985, "rewards/sentence_count_match_reward_logic/mean": 0.9292478561401367, "rewards/sentence_count_match_reward_logic/std": 0.14897510409355164, "step": 761 }, { "clip_ratio": 0.009478701278567314, "epoch": 0.26662001399580126, "grad_norm": 0.499946614764796, "kl": 0.67578125, "learning_rate": 9.966375358060981e-06, "loss": 0.0251, "step": 762 }, { "clip_ratio": 0.020641332492232323, "epoch": 0.2669699090272918, "grad_norm": 0.4322925450883409, "kl": 0.69140625, "learning_rate": 9.966020885520251e-06, "loss": 0.0215, "step": 763 }, { "clip_ratio": 0.034415196627378464, "epoch": 0.26731980405878236, "grad_norm": 0.33046163432356473, "kl": 0.69140625, "learning_rate": 9.965664560700734e-06, "loss": 0.02, "step": 764 }, { "clip_ratio": 0.004159918054938316, "clipped_completions_ratio": 0.0892857142857143, "epoch": 0.26766969909027294, "grad_norm": 0.6325060726673916, "kl": 0.6953125, "learning_rate": 9.96530638373533e-06, "loss": 0.0035, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 169.75, "mean_terminated_completion_length": 161.29412841796875, "min_completion_length": 88.0, "min_terminated_completion_length": 88.0, "num_tokens": 3846552.0, "reward": 1.877763271331787, "reward_std": 0.24451196193695068, "rewards/check_gptzero_func/mean": 0.4285714328289032, "rewards/check_gptzero_func/std": 0.4993502199649811, "rewards/check_winston_local_func/mean": 0.4521680474281311, "rewards/check_winston_local_func/std": 0.30475080013275146, "rewards/sentence_count_match_reward_logic/mean": 0.9970237612724304, "rewards/sentence_count_match_reward_logic/std": 0.02227177284657955, "step": 765 }, { "clip_ratio": 0.009175165556371212, "epoch": 0.26801959412176346, "grad_norm": 0.5443797737122822, "kl": 0.6953125, "learning_rate": 9.964946354757638e-06, "loss": -0.0001, "step": 766 }, { "clip_ratio": 0.02374529466032982, "epoch": 0.26836948915325404, "grad_norm": 0.42006609326200994, "kl": 0.70703125, "learning_rate": 9.964584473901946e-06, "loss": -0.0024, "step": 767 }, { "clip_ratio": 0.04496155306696892, "epoch": 0.26871938418474456, "grad_norm": 0.4187157371458957, "kl": 0.7265625, "learning_rate": 9.964220741303232e-06, "loss": -0.0041, "step": 768 }, { "clip_ratio": 0.003790077054873109, "clipped_completions_ratio": 0.0, "epoch": 0.26906927921623514, "grad_norm": 0.4544414617998426, "kl": 0.58984375, "learning_rate": 9.963855157097166e-06, "loss": 0.0041, "max_completion_length": 234.0, "max_terminated_completion_length": 234.0, "mean_completion_length": 193.71429443359375, "mean_terminated_completion_length": 193.71429443359375, "min_completion_length": 132.0, "min_terminated_completion_length": 132.0, "num_tokens": 3869960.0, "reward": 2.173069477081299, "reward_std": 0.21917450428009033, "rewards/check_gptzero_func/mean": 0.7142857313156128, "rewards/check_gptzero_func/std": 0.4558422863483429, "rewards/check_winston_local_func/mean": 0.4754502475261688, "rewards/check_winston_local_func/std": 0.3076130151748657, "rewards/sentence_count_match_reward_logic/mean": 0.9833332896232605, "rewards/sentence_count_match_reward_logic/std": 0.05393598973751068, "step": 769 }, { "clip_ratio": 0.006037245970219374, "epoch": 0.26941917424772566, "grad_norm": 0.4235261517626319, "kl": 0.59375, "learning_rate": 9.963487721420104e-06, "loss": 0.0022, "step": 770 }, { "clip_ratio": 0.014882383868098259, "epoch": 0.26976906927921623, "grad_norm": 0.39057987393720645, "kl": 0.60546875, "learning_rate": 9.9631184344091e-06, "loss": 0.0004, "step": 771 }, { "clip_ratio": 0.02746381051838398, "epoch": 0.2701189643107068, "grad_norm": 0.30663993432146686, "kl": 0.60546875, "learning_rate": 9.962747296201891e-06, "loss": -0.0015, "step": 772 }, { "clip_ratio": 0.003782429965212941, "clipped_completions_ratio": 0.0, "epoch": 0.27046885934219733, "grad_norm": 0.6371580168183675, "kl": 0.72265625, "learning_rate": 9.96237430693691e-06, "loss": 0.0081, "max_completion_length": 181.0, "max_terminated_completion_length": 181.0, "mean_completion_length": 149.42857360839844, "mean_terminated_completion_length": 149.42857360839844, "min_completion_length": 126.0, "min_terminated_completion_length": 126.0, "num_tokens": 3888488.0, "reward": 1.9194650650024414, "reward_std": 0.2492791712284088, "rewards/check_gptzero_func/mean": 0.3392857015132904, "rewards/check_gptzero_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.603988766670227, "rewards/check_winston_local_func/std": 0.3066624402999878, "rewards/sentence_count_match_reward_logic/mean": 0.976190447807312, "rewards/sentence_count_match_reward_logic/std": 0.058848995715379715, "step": 773 }, { "clip_ratio": 0.0075677563436329365, "epoch": 0.2708187543736879, "grad_norm": 0.5027571155821349, "kl": 0.73046875, "learning_rate": 9.961999466753278e-06, "loss": 0.0048, "step": 774 }, { "clip_ratio": 0.022189000621438026, "epoch": 0.27116864940517843, "grad_norm": 0.4218410111741781, "kl": 0.73046875, "learning_rate": 9.961622775790808e-06, "loss": 0.0022, "step": 775 }, { "clip_ratio": 0.04100419953465462, "epoch": 0.271518544436669, "grad_norm": 0.3508962609654691, "kl": 0.73828125, "learning_rate": 9.961244234190001e-06, "loss": 0.0005, "step": 776 }, { "clip_ratio": 0.003733638906851411, "clipped_completions_ratio": 0.0, "epoch": 0.27186843946815953, "grad_norm": 0.753272116296656, "kl": 0.8203125, "learning_rate": 9.960863842092048e-06, "loss": 0.0097, "max_completion_length": 214.0, "max_terminated_completion_length": 214.0, "mean_completion_length": 148.08929443359375, "mean_terminated_completion_length": 148.08929443359375, "min_completion_length": 86.0, "min_terminated_completion_length": 86.0, "num_tokens": 3906861.0, "reward": 2.5699267387390137, "reward_std": 0.28781434893608093, "rewards/check_gptzero_func/mean": 0.75, "rewards/check_gptzero_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.830130934715271, "rewards/check_winston_local_func/std": 0.15495292842388153, "rewards/sentence_count_match_reward_logic/mean": 0.989795982837677, "rewards/sentence_count_match_reward_logic/std": 0.03712429851293564, "step": 777 }, { "clip_ratio": 0.01729295775294304, "epoch": 0.2722183344996501, "grad_norm": 0.6492870721347307, "kl": 0.8203125, "learning_rate": 9.960481599638835e-06, "loss": 0.0067, "step": 778 }, { "clip_ratio": 0.029893506318330765, "epoch": 0.2725682295311407, "grad_norm": 0.5308705807293033, "kl": 0.83984375, "learning_rate": 9.960097506972934e-06, "loss": 0.0043, "step": 779 }, { "clip_ratio": 0.040780045092105865, "epoch": 0.2729181245626312, "grad_norm": 0.5070871338102353, "kl": 0.87890625, "learning_rate": 9.959711564237603e-06, "loss": 0.0034, "step": 780 }, { "clip_ratio": 0.0037390824873000383, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.2732680195941218, "grad_norm": 0.6063356284704186, "kl": 0.63671875, "learning_rate": 9.959323771576804e-06, "loss": 0.0085, "max_completion_length": 256.0, "max_terminated_completion_length": 197.0, "mean_completion_length": 178.00001525878906, "mean_terminated_completion_length": 165.0, "min_completion_length": 113.0, "min_terminated_completion_length": 113.0, "num_tokens": 3928477.0, "reward": 2.3348922729492188, "reward_std": 0.2467653900384903, "rewards/check_gptzero_func/mean": 0.6428571343421936, "rewards/check_gptzero_func/std": 0.48349377512931824, "rewards/check_winston_local_func/mean": 0.7059240341186523, "rewards/check_winston_local_func/std": 0.259724885225296, "rewards/sentence_count_match_reward_logic/mean": 0.9861111044883728, "rewards/sentence_count_match_reward_logic/std": 0.0370790958404541, "step": 781 }, { "clip_ratio": 0.010625431314110756, "epoch": 0.2736179146256123, "grad_norm": 0.4323838912160281, "kl": 0.64453125, "learning_rate": 9.958934129135172e-06, "loss": 0.0063, "step": 782 }, { "clip_ratio": 0.024366723373532295, "epoch": 0.2739678096571029, "grad_norm": 0.4200606630172256, "kl": 0.65625, "learning_rate": 9.958542637058045e-06, "loss": 0.0048, "step": 783 }, { "clip_ratio": 0.0352371409535408, "epoch": 0.2743177046885934, "grad_norm": 0.34836052262088407, "kl": 0.6640625, "learning_rate": 9.958149295491441e-06, "loss": 0.0035, "step": 784 }, { "clip_ratio": 0.0035655279643833637, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.274667599720084, "grad_norm": 0.5581770782040789, "kl": 0.62109375, "learning_rate": 9.957754104582076e-06, "loss": 0.0051, "max_completion_length": 256.0, "max_terminated_completion_length": 240.0, "mean_completion_length": 190.33929443359375, "mean_terminated_completion_length": 164.0749969482422, "min_completion_length": 87.0, "min_terminated_completion_length": 87.0, "num_tokens": 3952088.0, "reward": 2.1649529933929443, "reward_std": 0.3125678598880768, "rewards/check_gptzero_func/mean": 0.5892857313156128, "rewards/check_gptzero_func/std": 0.49641573429107666, "rewards/check_winston_local_func/mean": 0.6074132919311523, "rewards/check_winston_local_func/std": 0.31583669781684875, "rewards/sentence_count_match_reward_logic/mean": 0.9682539701461792, "rewards/sentence_count_match_reward_logic/std": 0.050649140030145645, "step": 785 }, { "clip_ratio": 0.006846277974545956, "epoch": 0.27501749475157455, "grad_norm": 0.4956757984426915, "kl": 0.6328125, "learning_rate": 9.957357064477354e-06, "loss": 0.003, "step": 786 }, { "clip_ratio": 0.01982065662741661, "epoch": 0.2753673897830651, "grad_norm": 0.38752868419700626, "kl": 0.640625, "learning_rate": 9.956958175325362e-06, "loss": 0.0004, "step": 787 }, { "clip_ratio": 0.031972989439964294, "epoch": 0.27571728481455565, "grad_norm": 0.3368459727655874, "kl": 0.65234375, "learning_rate": 9.956557437274887e-06, "loss": -0.001, "step": 788 }, { "clip_ratio": 0.006371248047798872, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.2760671798460462, "grad_norm": 0.8614327480064679, "kl": 0.828125, "learning_rate": 9.9561548504754e-06, "loss": 0.0008, "max_completion_length": 256.0, "max_terminated_completion_length": 248.0, "mean_completion_length": 157.8928680419922, "mean_terminated_completion_length": 141.5416717529297, "min_completion_length": 85.0, "min_terminated_completion_length": 85.0, "num_tokens": 3971930.0, "reward": 1.9548025131225586, "reward_std": 0.24648970365524292, "rewards/check_gptzero_func/mean": 0.5357142686843872, "rewards/check_gptzero_func/std": 0.5032362937927246, "rewards/check_winston_local_func/mean": 0.45267653465270996, "rewards/check_winston_local_func/std": 0.2605832815170288, "rewards/sentence_count_match_reward_logic/mean": 0.9664115905761719, "rewards/sentence_count_match_reward_logic/std": 0.05902854725718498, "step": 789 }, { "clip_ratio": 0.020450718700885773, "epoch": 0.27641707487753675, "grad_norm": 0.6561259310017353, "kl": 0.83984375, "learning_rate": 9.955750415077056e-06, "loss": -0.0034, "step": 790 }, { "clip_ratio": 0.035426534712314606, "epoch": 0.27676696990902727, "grad_norm": 0.44259145391954186, "kl": 0.84375, "learning_rate": 9.955344131230714e-06, "loss": -0.0058, "step": 791 }, { "clip_ratio": 0.045556165277957916, "epoch": 0.27711686494051785, "grad_norm": 0.3997713811646037, "kl": 0.84375, "learning_rate": 9.954935999087908e-06, "loss": -0.007, "step": 792 }, { "clip_ratio": 0.003827052190899849, "clipped_completions_ratio": 0.0, "epoch": 0.27746675997200837, "grad_norm": 0.7626771589398212, "kl": 0.765625, "learning_rate": 9.954526018800872e-06, "loss": 0.0036, "max_completion_length": 232.0, "max_terminated_completion_length": 232.0, "mean_completion_length": 155.7857208251953, "mean_terminated_completion_length": 155.7857208251953, "min_completion_length": 74.0, "min_terminated_completion_length": 74.0, "num_tokens": 3990990.0, "reward": 2.409801483154297, "reward_std": 0.22188176214694977, "rewards/check_gptzero_func/mean": 0.7321428656578064, "rewards/check_gptzero_func/std": 0.44685041904449463, "rewards/check_winston_local_func/mean": 0.6904135942459106, "rewards/check_winston_local_func/std": 0.2867717444896698, "rewards/sentence_count_match_reward_logic/mean": 0.9872449040412903, "rewards/sentence_count_match_reward_logic/std": 0.04110519215464592, "step": 793 }, { "clip_ratio": 0.014311135746538639, "epoch": 0.27781665500349895, "grad_norm": 0.5446906398623602, "kl": 0.796875, "learning_rate": 9.954114190522521e-06, "loss": 0.0005, "step": 794 }, { "clip_ratio": 0.031397223472595215, "epoch": 0.2781665500349895, "grad_norm": 0.4825454096979377, "kl": 0.81640625, "learning_rate": 9.953700514406464e-06, "loss": -0.001, "step": 795 }, { "clip_ratio": 0.0430641695857048, "epoch": 0.27851644506648005, "grad_norm": 0.47904691449607373, "kl": 0.80859375, "learning_rate": 9.953284990607e-06, "loss": -0.0024, "step": 796 }, { "clip_ratio": 0.004313042387366295, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.2788663400979706, "grad_norm": 0.8677064232205906, "kl": 0.7578125, "learning_rate": 9.952867619279112e-06, "loss": 0.0068, "max_completion_length": 256.0, "max_terminated_completion_length": 198.0, "mean_completion_length": 185.32144165039062, "mean_terminated_completion_length": 157.0500030517578, "min_completion_length": 67.0, "min_terminated_completion_length": 67.0, "num_tokens": 4014864.0, "reward": 2.1847312450408936, "reward_std": 0.3229962885379791, "rewards/check_gptzero_func/mean": 0.6785714030265808, "rewards/check_gptzero_func/std": 0.4712514281272888, "rewards/check_winston_local_func/mean": 0.5906835198402405, "rewards/check_winston_local_func/std": 0.3111124336719513, "rewards/sentence_count_match_reward_logic/mean": 0.9154762029647827, "rewards/sentence_count_match_reward_logic/std": 0.12616556882858276, "step": 797 }, { "clip_ratio": 0.01413137186318636, "epoch": 0.27921623512946114, "grad_norm": 0.5565763403266384, "kl": 0.7265625, "learning_rate": 9.95244840057848e-06, "loss": 0.004, "step": 798 }, { "clip_ratio": 0.027324069291353226, "epoch": 0.2795661301609517, "grad_norm": 0.5473702734658372, "kl": 0.78125, "learning_rate": 9.952027334661465e-06, "loss": 0.0029, "step": 799 }, { "clip_ratio": 0.03480447083711624, "epoch": 0.27991602519244224, "grad_norm": 0.5239046969962095, "kl": 0.8359375, "learning_rate": 9.951604421685121e-06, "loss": 0.0018, "step": 800 }, { "clip_ratio": 0.004089975263923407, "clipped_completions_ratio": 0.0, "epoch": 0.2802659202239328, "grad_norm": 0.7687018734094455, "kl": 0.77734375, "learning_rate": 9.951179661807194e-06, "loss": -0.0001, "max_completion_length": 208.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 153.2857208251953, "mean_terminated_completion_length": 153.2857208251953, "min_completion_length": 103.0, "min_terminated_completion_length": 103.0, "num_tokens": 4034016.0, "reward": 2.1565544605255127, "reward_std": 0.33260148763656616, "rewards/check_gptzero_func/mean": 0.4107142984867096, "rewards/check_gptzero_func/std": 0.49641573429107666, "rewards/check_winston_local_func/mean": 0.7458399534225464, "rewards/check_winston_local_func/std": 0.18165047466754913, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 801 }, { "clip_ratio": 0.01739375852048397, "epoch": 0.2806158152554234, "grad_norm": 0.5494315390072829, "kl": 0.80078125, "learning_rate": 9.95075305518611e-06, "loss": -0.0036, "step": 802 }, { "clip_ratio": 0.034697066992521286, "epoch": 0.2809657102869139, "grad_norm": 0.46453980027276637, "kl": 0.80859375, "learning_rate": 9.950324601980994e-06, "loss": -0.0061, "step": 803 }, { "clip_ratio": 0.04721956327557564, "epoch": 0.2813156053184045, "grad_norm": 0.4100613878781145, "kl": 0.8046875, "learning_rate": 9.949894302351653e-06, "loss": -0.0074, "step": 804 }, { "clip_ratio": 0.005168006755411625, "clipped_completions_ratio": 0.0357142857142857, "epoch": 0.281665500349895, "grad_norm": 0.6344370018425421, "kl": 0.71875, "learning_rate": 9.949462156458584e-06, "loss": 0.0024, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 174.12501525878906, "mean_terminated_completion_length": 171.09259033203125, "min_completion_length": 96.0, "min_terminated_completion_length": 96.0, "num_tokens": 4055719.0, "reward": 2.0979020595550537, "reward_std": 0.22226402163505554, "rewards/check_gptzero_func/mean": 0.5714285969734192, "rewards/check_gptzero_func/std": 0.4993502199649811, "rewards/check_winston_local_func/mean": 0.5290243029594421, "rewards/check_winston_local_func/std": 0.36033424735069275, "rewards/sentence_count_match_reward_logic/mean": 0.9974489808082581, "rewards/sentence_count_match_reward_logic/std": 0.019090088084340096, "step": 805 }, { "clip_ratio": 0.011909973807632923, "epoch": 0.2820153953813856, "grad_norm": 0.5206752181979414, "kl": 0.71875, "learning_rate": 9.949028164462976e-06, "loss": 0.0001, "step": 806 }, { "clip_ratio": 0.02611706778407097, "epoch": 0.2823652904128761, "grad_norm": 0.40777899502058473, "kl": 0.734375, "learning_rate": 9.9485923265267e-06, "loss": -0.0014, "step": 807 }, { "clip_ratio": 0.039011936634778976, "epoch": 0.2827151854443667, "grad_norm": 0.3618036543610281, "kl": 0.7421875, "learning_rate": 9.948154642812321e-06, "loss": -0.003, "step": 808 }, { "clip_ratio": 0.004959275480359793, "clipped_completions_ratio": 0.0, "epoch": 0.28306508047585727, "grad_norm": 0.5957130900488519, "kl": 0.71484375, "learning_rate": 9.947715113483091e-06, "loss": 0.0073, "max_completion_length": 211.0, "max_terminated_completion_length": 211.0, "mean_completion_length": 174.17857360839844, "mean_terminated_completion_length": 174.17857360839844, "min_completion_length": 151.0, "min_terminated_completion_length": 151.0, "num_tokens": 4076665.0, "reward": 2.3025104999542236, "reward_std": 0.33880025148391724, "rewards/check_gptzero_func/mean": 0.5, "rewards/check_gptzero_func/std": 0.5045249462127686, "rewards/check_winston_local_func/mean": 0.8060819506645203, "rewards/check_winston_local_func/std": 0.23315244913101196, "rewards/sentence_count_match_reward_logic/mean": 0.9964285492897034, "rewards/sentence_count_match_reward_logic/std": 0.026726121082901955, "step": 809 }, { "clip_ratio": 0.009895282797515392, "epoch": 0.2834149755073478, "grad_norm": 0.5135258783289761, "kl": 0.7265625, "learning_rate": 9.947273738702952e-06, "loss": 0.0054, "step": 810 }, { "clip_ratio": 0.02191961742937565, "epoch": 0.28376487053883837, "grad_norm": 0.4085812487988832, "kl": 0.70703125, "learning_rate": 9.946830518636532e-06, "loss": 0.0031, "step": 811 }, { "clip_ratio": 0.03289683163166046, "epoch": 0.2841147655703289, "grad_norm": 0.2681992387277162, "kl": 0.6953125, "learning_rate": 9.946385453449145e-06, "loss": 0.0011, "step": 812 }, { "clip_ratio": 0.005069525446742773, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.28446466060181946, "grad_norm": 0.6976228707989288, "kl": 0.6328125, "learning_rate": 9.9459385433068e-06, "loss": 0.0021, "max_completion_length": 256.0, "max_terminated_completion_length": 220.0, "mean_completion_length": 180.4107208251953, "mean_terminated_completion_length": 150.1750030517578, "min_completion_length": 96.0, "min_terminated_completion_length": 96.0, "num_tokens": 4099144.0, "reward": 2.3417229652404785, "reward_std": 0.251899391412735, "rewards/check_gptzero_func/mean": 0.6607142686843872, "rewards/check_gptzero_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.717041552066803, "rewards/check_winston_local_func/std": 0.29821330308914185, "rewards/sentence_count_match_reward_logic/mean": 0.9639668464660645, "rewards/sentence_count_match_reward_logic/std": 0.06029179319739342, "step": 813 }, { "clip_ratio": 0.010234645567834377, "epoch": 0.28481455563331, "grad_norm": 0.49133323535741674, "kl": 0.64453125, "learning_rate": 9.945489788376188e-06, "loss": 0.0002, "step": 814 }, { "clip_ratio": 0.022507961839437485, "epoch": 0.28516445066480056, "grad_norm": 1.5972038660967038, "kl": 0.6640625, "learning_rate": 9.94503918882469e-06, "loss": -0.0017, "step": 815 }, { "clip_ratio": 0.031737472862005234, "epoch": 0.28551434569629114, "grad_norm": 0.35497972458423727, "kl": 0.6640625, "learning_rate": 9.944586744820377e-06, "loss": -0.0028, "step": 816 }, { "clip_ratio": 0.005245559383183718, "clipped_completions_ratio": 0.0, "epoch": 0.28586424072778166, "grad_norm": 0.7148958755105297, "kl": 0.703125, "learning_rate": 9.944132456532005e-06, "loss": 0.0063, "max_completion_length": 218.0, "max_terminated_completion_length": 218.0, "mean_completion_length": 176.6428680419922, "mean_terminated_completion_length": 176.6428680419922, "min_completion_length": 117.0, "min_terminated_completion_length": 117.0, "num_tokens": 4120780.0, "reward": 2.4240472316741943, "reward_std": 0.14236971735954285, "rewards/check_gptzero_func/mean": 0.6428571343421936, "rewards/check_gptzero_func/std": 0.48349377512931824, "rewards/check_winston_local_func/mean": 0.8012795448303223, "rewards/check_winston_local_func/std": 0.2718448042869568, "rewards/sentence_count_match_reward_logic/mean": 0.9799107313156128, "rewards/sentence_count_match_reward_logic/std": 0.04632386937737465, "step": 817 }, { "clip_ratio": 0.011591474525630474, "epoch": 0.28621413575927224, "grad_norm": 0.5164166090349792, "kl": 0.7265625, "learning_rate": 9.94367632412902e-06, "loss": 0.0033, "step": 818 }, { "clip_ratio": 0.02441563829779625, "epoch": 0.28656403079076276, "grad_norm": 0.5049428991278224, "kl": 0.75390625, "learning_rate": 9.943218347781553e-06, "loss": 0.001, "step": 819 }, { "clip_ratio": 0.038708657026290894, "epoch": 0.28691392582225334, "grad_norm": 0.4088823881583168, "kl": 0.75390625, "learning_rate": 9.942758527660429e-06, "loss": -0.001, "step": 820 }, { "clip_ratio": 0.0031872859690338373, "clipped_completions_ratio": 0.1964285714285714, "epoch": 0.28726382085374386, "grad_norm": 0.5535877050311296, "kl": 0.6640625, "learning_rate": 9.942296863937154e-06, "loss": 0.0052, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 211.58929443359375, "mean_terminated_completion_length": 200.73333740234375, "min_completion_length": 135.0, "min_terminated_completion_length": 135.0, "num_tokens": 4146741.0, "reward": 1.9160106182098389, "reward_std": 0.18211735785007477, "rewards/check_gptzero_func/mean": 0.3392857015132904, "rewards/check_gptzero_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.6293609738349915, "rewards/check_winston_local_func/std": 0.3346320390701294, "rewards/sentence_count_match_reward_logic/mean": 0.9473639726638794, "rewards/sentence_count_match_reward_logic/std": 0.07774612307548523, "step": 821 }, { "clip_ratio": 0.00846775807440281, "epoch": 0.28761371588523443, "grad_norm": 0.5759330014001438, "kl": 0.65625, "learning_rate": 9.941833356783924e-06, "loss": 0.0032, "step": 822 }, { "clip_ratio": 0.019964536651968956, "epoch": 0.287963610916725, "grad_norm": 0.4222823764070233, "kl": 0.66796875, "learning_rate": 9.941368006373622e-06, "loss": 0.0006, "step": 823 }, { "clip_ratio": 0.031131437048316002, "epoch": 0.28831350594821553, "grad_norm": 0.37392729750794895, "kl": 0.6875, "learning_rate": 9.940900812879822e-06, "loss": -0.0016, "step": 824 }, { "clip_ratio": 0.003522888757288456, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.2886634009797061, "grad_norm": 0.7744607058401077, "kl": 0.6875, "learning_rate": 9.94043177647678e-06, "loss": 0.0048, "max_completion_length": 256.0, "max_terminated_completion_length": 215.0, "mean_completion_length": 157.2678680419922, "mean_terminated_completion_length": 140.8125, "min_completion_length": 80.0, "min_terminated_completion_length": 80.0, "num_tokens": 4166932.0, "reward": 2.363546133041382, "reward_std": 0.19052833318710327, "rewards/check_gptzero_func/mean": 0.7678571343421936, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.6224746704101562, "rewards/check_winston_local_func/std": 0.38269519805908203, "rewards/sentence_count_match_reward_logic/mean": 0.9732142686843872, "rewards/sentence_count_match_reward_logic/std": 0.0617651641368866, "step": 825 }, { "clip_ratio": 0.017279915511608124, "epoch": 0.28901329601119663, "grad_norm": 0.6708577891383783, "kl": 0.71875, "learning_rate": 9.939960897339444e-06, "loss": 0.0021, "step": 826 }, { "clip_ratio": 0.026001613587141037, "epoch": 0.2893631910426872, "grad_norm": 0.5770507548897443, "kl": 0.7421875, "learning_rate": 9.939488175643446e-06, "loss": 0.0002, "step": 827 }, { "clip_ratio": 0.03492940589785576, "epoch": 0.28971308607417773, "grad_norm": 0.4255923001662921, "kl": 0.70703125, "learning_rate": 9.93901361156511e-06, "loss": -0.0017, "step": 828 }, { "clip_ratio": 0.0027266175020486116, "clipped_completions_ratio": 0.2678571428571429, "epoch": 0.2900629811056683, "grad_norm": 0.7025976151156372, "kl": 0.7890625, "learning_rate": 9.938537205281438e-06, "loss": 0.0043, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 171.6607208251953, "mean_terminated_completion_length": 140.80487060546875, "min_completion_length": 63.0, "min_terminated_completion_length": 63.0, "num_tokens": 4188161.0, "reward": 2.0818803310394287, "reward_std": 0.32001936435699463, "rewards/check_gptzero_func/mean": 0.4107142984867096, "rewards/check_gptzero_func/std": 0.49641573429107666, "rewards/check_winston_local_func/mean": 0.7258195281028748, "rewards/check_winston_local_func/std": 0.30943360924720764, "rewards/sentence_count_match_reward_logic/mean": 0.9453463554382324, "rewards/sentence_count_match_reward_logic/std": 0.11091519892215729, "step": 829 }, { "clip_ratio": 0.009487207978963852, "epoch": 0.2904128761371588, "grad_norm": 0.48225772375346254, "kl": 0.796875, "learning_rate": 9.938058956970132e-06, "loss": 0.0015, "step": 830 }, { "clip_ratio": 0.026467015966773033, "epoch": 0.2907627711686494, "grad_norm": 0.3849233023042576, "kl": 0.82421875, "learning_rate": 9.937578866809567e-06, "loss": -0.0005, "step": 831 }, { "clip_ratio": 0.039166081696748734, "epoch": 0.29111266620014, "grad_norm": 0.3960182833721903, "kl": 0.84375, "learning_rate": 9.937096934978819e-06, "loss": -0.0014, "step": 832 }, { "clip_ratio": 0.0024821471888571978, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.2914625612316305, "grad_norm": 0.6312272806912561, "kl": 0.6484375, "learning_rate": 9.936613161657639e-06, "loss": 0.0099, "max_completion_length": 256.0, "max_terminated_completion_length": 245.0, "mean_completion_length": 185.0357208251953, "mean_terminated_completion_length": 173.20834350585938, "min_completion_length": 91.0, "min_terminated_completion_length": 91.0, "num_tokens": 4210803.0, "reward": 2.384172201156616, "reward_std": 0.24843652546405792, "rewards/check_gptzero_func/mean": 0.9107142686843872, "rewards/check_gptzero_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.4913150668144226, "rewards/check_winston_local_func/std": 0.30528876185417175, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.04767312481999397, "step": 833 }, { "clip_ratio": 0.010346204042434692, "epoch": 0.2918124562631211, "grad_norm": 0.44286237567237324, "kl": 0.66015625, "learning_rate": 9.936127547026471e-06, "loss": 0.0074, "step": 834 }, { "clip_ratio": 0.023080997169017792, "epoch": 0.2921623512946116, "grad_norm": 0.45702714552472634, "kl": 0.671875, "learning_rate": 9.935640091266444e-06, "loss": 0.006, "step": 835 }, { "clip_ratio": 0.030351798981428146, "epoch": 0.2925122463261022, "grad_norm": 0.29296998866159807, "kl": 0.66015625, "learning_rate": 9.935150794559379e-06, "loss": 0.0046, "step": 836 }, { "clip_ratio": 0.005360517185181379, "clipped_completions_ratio": 0.0, "epoch": 0.2928621413575927, "grad_norm": 0.7440999973253173, "kl": 1.0234375, "learning_rate": 9.934659657087773e-06, "loss": 0.0101, "max_completion_length": 208.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 128.6607208251953, "mean_terminated_completion_length": 128.6607208251953, "min_completion_length": 63.0, "min_terminated_completion_length": 63.0, "num_tokens": 4227288.0, "reward": 1.8847278356552124, "reward_std": 0.25530606508255005, "rewards/check_gptzero_func/mean": 0.2857142984867096, "rewards/check_gptzero_func/std": 0.4558423161506653, "rewards/check_winston_local_func/mean": 0.6019896864891052, "rewards/check_winston_local_func/std": 0.33449438214302063, "rewards/sentence_count_match_reward_logic/mean": 0.9970237612724304, "rewards/sentence_count_match_reward_logic/std": 0.02227177284657955, "step": 837 }, { "clip_ratio": 0.010678277350962162, "epoch": 0.2932120363890833, "grad_norm": 0.5437832600210406, "kl": 1.0234375, "learning_rate": 9.93416667903482e-06, "loss": 0.0077, "step": 838 }, { "clip_ratio": 0.02542273886501789, "epoch": 0.29356193142057385, "grad_norm": 0.687431147958388, "kl": 1.0703125, "learning_rate": 9.933671860584394e-06, "loss": 0.0067, "step": 839 }, { "clip_ratio": 0.03637627884745598, "epoch": 0.2939118264520644, "grad_norm": 1.864539181645581, "kl": 1.0234375, "learning_rate": 9.933175201921057e-06, "loss": 0.0074, "step": 840 }, { "clip_ratio": 0.006015146151185036, "clipped_completions_ratio": 0.0, "epoch": 0.29426172148355495, "grad_norm": 0.8272352738891804, "kl": 0.85546875, "learning_rate": 9.93267670323006e-06, "loss": 0.0126, "max_completion_length": 188.0, "max_terminated_completion_length": 188.0, "mean_completion_length": 136.46429443359375, "mean_terminated_completion_length": 136.46429443359375, "min_completion_length": 85.0, "min_terminated_completion_length": 85.0, "num_tokens": 4244338.0, "reward": 1.8134739398956299, "reward_std": 0.2985120713710785, "rewards/check_gptzero_func/mean": 0.3035714328289032, "rewards/check_gptzero_func/std": 0.4639608860015869, "rewards/check_winston_local_func/mean": 0.5099024176597595, "rewards/check_winston_local_func/std": 0.34600207209587097, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 841 }, { "clip_ratio": 0.017436306923627853, "epoch": 0.29461161651504547, "grad_norm": 0.7249753382451578, "kl": 0.90625, "learning_rate": 9.932176364697337e-06, "loss": 0.0106, "step": 842 }, { "clip_ratio": 0.029856251552700996, "epoch": 0.29496151154653605, "grad_norm": 0.6759458352007216, "kl": 0.90625, "learning_rate": 9.93167418650951e-06, "loss": 0.0075, "step": 843 }, { "clip_ratio": 0.03839715197682381, "epoch": 0.29531140657802657, "grad_norm": 0.7373871745081169, "kl": 0.85546875, "learning_rate": 9.931170168853886e-06, "loss": 0.0065, "step": 844 }, { "clip_ratio": 0.003341892035678029, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.29566130160951715, "grad_norm": 0.5985212658813658, "kl": 0.65234375, "learning_rate": 9.93066431191846e-06, "loss": 0.0021, "max_completion_length": 256.0, "max_terminated_completion_length": 240.0, "mean_completion_length": 175.44644165039062, "mean_terminated_completion_length": 162.02084350585938, "min_completion_length": 96.0, "min_terminated_completion_length": 96.0, "num_tokens": 4265867.0, "reward": 2.0075607299804688, "reward_std": 0.24039222300052643, "rewards/check_gptzero_func/mean": 0.5178571343421936, "rewards/check_gptzero_func/std": 0.5042031407356262, "rewards/check_winston_local_func/mean": 0.5030964612960815, "rewards/check_winston_local_func/std": 0.29377204179763794, "rewards/sentence_count_match_reward_logic/mean": 0.9866071343421936, "rewards/sentence_count_match_reward_logic/std": 0.056801944971084595, "step": 845 }, { "clip_ratio": 0.010681996122002602, "epoch": 0.2960111966410077, "grad_norm": 0.5774520946778886, "kl": 0.66015625, "learning_rate": 9.93015661589191e-06, "loss": -0.0003, "step": 846 }, { "clip_ratio": 0.023544108495116234, "epoch": 0.29636109167249824, "grad_norm": 0.42259721567747943, "kl": 0.66796875, "learning_rate": 9.929647080963604e-06, "loss": -0.0021, "step": 847 }, { "clip_ratio": 0.03040950931608677, "epoch": 0.2967109867039888, "grad_norm": 0.3266071927838126, "kl": 0.6796875, "learning_rate": 9.929135707323592e-06, "loss": -0.0034, "step": 848 }, { "clip_ratio": 0.004898244980722666, "clipped_completions_ratio": 0.0, "epoch": 0.29706088173547934, "grad_norm": 0.6815396714684996, "kl": 0.7265625, "learning_rate": 9.928622495162613e-06, "loss": 0.009, "max_completion_length": 228.0, "max_terminated_completion_length": 228.0, "mean_completion_length": 158.71429443359375, "mean_terminated_completion_length": 158.71429443359375, "min_completion_length": 86.0, "min_terminated_completion_length": 86.0, "num_tokens": 4285867.0, "reward": 2.38148832321167, "reward_std": 0.2394341230392456, "rewards/check_gptzero_func/mean": 0.6607142686843872, "rewards/check_gptzero_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.7615901231765747, "rewards/check_winston_local_func/std": 0.2872723340988159, "rewards/sentence_count_match_reward_logic/mean": 0.9591836929321289, "rewards/sentence_count_match_reward_logic/std": 0.09020400047302246, "step": 849 }, { "clip_ratio": 0.011086399666965008, "epoch": 0.2974107767669699, "grad_norm": 0.6138379958816694, "kl": 0.7265625, "learning_rate": 9.92810744467209e-06, "loss": 0.0061, "step": 850 }, { "clip_ratio": 0.027584442868828773, "epoch": 0.29776067179846044, "grad_norm": 0.619620628865777, "kl": 0.72265625, "learning_rate": 9.927590556044132e-06, "loss": 0.003, "step": 851 }, { "clip_ratio": 0.044070057570934296, "epoch": 0.298110566829951, "grad_norm": 0.6075173716366078, "kl": 0.7734375, "learning_rate": 9.927071829471531e-06, "loss": 0.0015, "step": 852 }, { "clip_ratio": 0.00307409162633121, "clipped_completions_ratio": 0.0, "epoch": 0.2984604618614416, "grad_norm": 0.7154178295459771, "kl": 0.79296875, "learning_rate": 9.926551265147769e-06, "loss": 0.0037, "max_completion_length": 239.0, "max_terminated_completion_length": 239.0, "mean_completion_length": 155.3928680419922, "mean_terminated_completion_length": 155.3928680419922, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 4304769.0, "reward": 2.4925758838653564, "reward_std": 0.2384808212518692, "rewards/check_gptzero_func/mean": 0.7678571343421936, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.7247187495231628, "rewards/check_winston_local_func/std": 0.2189047634601593, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 853 }, { "clip_ratio": 0.011179240420460701, "epoch": 0.2988103568929321, "grad_norm": 0.5061486899797389, "kl": 0.796875, "learning_rate": 9.926028863267012e-06, "loss": 0.0007, "step": 854 }, { "clip_ratio": 0.026727380231022835, "epoch": 0.2991602519244227, "grad_norm": 0.3918522357673678, "kl": 0.8125, "learning_rate": 9.925504624024113e-06, "loss": -0.0016, "step": 855 }, { "clip_ratio": 0.03969233110547066, "epoch": 0.2995101469559132, "grad_norm": 0.32526607393044144, "kl": 0.82421875, "learning_rate": 9.924978547614604e-06, "loss": -0.0028, "step": 856 }, { "clip_ratio": 0.003997748252004385, "clipped_completions_ratio": 0.0, "epoch": 0.2998600419874038, "grad_norm": 0.6048875094570298, "kl": 0.75, "learning_rate": 9.924450634234708e-06, "loss": 0.0073, "max_completion_length": 205.0, "max_terminated_completion_length": 205.0, "mean_completion_length": 159.94644165039062, "mean_terminated_completion_length": 159.94644165039062, "min_completion_length": 115.0, "min_terminated_completion_length": 115.0, "num_tokens": 4324438.0, "reward": 2.3331496715545654, "reward_std": 0.23670341074466705, "rewards/check_gptzero_func/mean": 0.5535714030265808, "rewards/check_gptzero_func/std": 0.5016207695007324, "rewards/check_winston_local_func/mean": 0.7795780301094055, "rewards/check_winston_local_func/std": 0.25096896290779114, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 857 }, { "clip_ratio": 0.00885703694075346, "epoch": 0.3002099370188943, "grad_norm": 0.4445119582843271, "kl": 0.75390625, "learning_rate": 9.923920884081333e-06, "loss": 0.0054, "step": 858 }, { "clip_ratio": 0.01749413274228573, "epoch": 0.3005598320503849, "grad_norm": 0.3460557098042812, "kl": 0.76171875, "learning_rate": 9.92338929735207e-06, "loss": 0.0034, "step": 859 }, { "clip_ratio": 0.028394492343068123, "epoch": 0.3009097270818754, "grad_norm": 0.3027023167041744, "kl": 0.76953125, "learning_rate": 9.922855874245197e-06, "loss": 0.0025, "step": 860 }, { "clip_ratio": 0.0042757680639624596, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.301259622113366, "grad_norm": 0.7040135537498485, "kl": 0.78515625, "learning_rate": 9.922320614959672e-06, "loss": 0.0106, "max_completion_length": 256.0, "max_terminated_completion_length": 226.0, "mean_completion_length": 178.7678680419922, "mean_terminated_completion_length": 165.89584350585938, "min_completion_length": 114.0, "min_terminated_completion_length": 114.0, "num_tokens": 4346897.0, "reward": 2.689115047454834, "reward_std": 0.16766704618930817, "rewards/check_gptzero_func/mean": 0.9464285969734192, "rewards/check_gptzero_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.7656456828117371, "rewards/check_winston_local_func/std": 0.32836341857910156, "rewards/sentence_count_match_reward_logic/mean": 0.9770408272743225, "rewards/sentence_count_match_reward_logic/std": 0.05953926220536232, "step": 861 }, { "clip_ratio": 0.012717663310468197, "epoch": 0.30160951714485656, "grad_norm": 0.6684013053074803, "kl": 0.796875, "learning_rate": 9.921783519695144e-06, "loss": 0.0084, "step": 862 }, { "clip_ratio": 0.02439626306295395, "epoch": 0.3019594121763471, "grad_norm": 0.44203854170177287, "kl": 0.796875, "learning_rate": 9.921244588651946e-06, "loss": 0.0052, "step": 863 }, { "clip_ratio": 0.036076273769140244, "epoch": 0.30230930720783766, "grad_norm": 0.3518218825899825, "kl": 0.796875, "learning_rate": 9.920703822031094e-06, "loss": 0.0045, "step": 864 }, { "clip_ratio": 0.006942601874470711, "clipped_completions_ratio": 0.0, "epoch": 0.3026592022393282, "grad_norm": 1.0223460893330822, "kl": 1.1015625, "learning_rate": 9.920161220034286e-06, "loss": 0.0112, "max_completion_length": 196.0, "max_terminated_completion_length": 196.0, "mean_completion_length": 125.62500762939453, "mean_terminated_completion_length": 125.62500762939453, "min_completion_length": 74.0, "min_terminated_completion_length": 74.0, "num_tokens": 4363012.0, "reward": 2.2492547035217285, "reward_std": 0.31518852710723877, "rewards/check_gptzero_func/mean": 0.4464285671710968, "rewards/check_gptzero_func/std": 0.5016207695007324, "rewards/check_winston_local_func/mean": 0.8171117901802063, "rewards/check_winston_local_func/std": 0.2661481499671936, "rewards/sentence_count_match_reward_logic/mean": 0.985714316368103, "rewards/sentence_count_match_reward_logic/std": 0.051974017173051834, "step": 865 }, { "clip_ratio": 0.0274952445179224, "epoch": 0.30300909727081876, "grad_norm": 0.910845885020988, "kl": 1.1640625, "learning_rate": 9.919616782863911e-06, "loss": 0.0085, "step": 866 }, { "clip_ratio": 0.04205106943845749, "epoch": 0.3033589923023093, "grad_norm": 0.5942927484183402, "kl": 1.1484375, "learning_rate": 9.919070510723035e-06, "loss": 0.0045, "step": 867 }, { "clip_ratio": 0.056323472410440445, "epoch": 0.30370888733379986, "grad_norm": 1.0476475880713507, "kl": 1.1328125, "learning_rate": 9.918522403815414e-06, "loss": 0.0041, "step": 868 }, { "clip_ratio": 0.005845884792506695, "clipped_completions_ratio": 0.0, "epoch": 0.30405878236529044, "grad_norm": 0.6535911566841054, "kl": 0.78515625, "learning_rate": 9.917972462345488e-06, "loss": 0.0061, "max_completion_length": 220.0, "max_terminated_completion_length": 220.0, "mean_completion_length": 175.69644165039062, "mean_terminated_completion_length": 175.69644165039062, "min_completion_length": 126.0, "min_terminated_completion_length": 126.0, "num_tokens": 4384611.0, "reward": 2.277799606323242, "reward_std": 0.16614699363708496, "rewards/check_gptzero_func/mean": 0.5535714030265808, "rewards/check_gptzero_func/std": 0.5016207695007324, "rewards/check_winston_local_func/mean": 0.7264602780342102, "rewards/check_winston_local_func/std": 0.22645145654678345, "rewards/sentence_count_match_reward_logic/mean": 0.9977678656578064, "rewards/sentence_count_match_reward_logic/std": 0.016703827306628227, "step": 869 }, { "clip_ratio": 0.014692145399749279, "epoch": 0.30440867739678096, "grad_norm": 0.714934645756432, "kl": 0.80859375, "learning_rate": 9.917420686518378e-06, "loss": 0.0042, "step": 870 }, { "clip_ratio": 0.026284964755177498, "epoch": 0.30475857242827153, "grad_norm": 0.463569343622969, "kl": 0.8125, "learning_rate": 9.916867076539894e-06, "loss": 0.0014, "step": 871 }, { "clip_ratio": 0.04063590243458748, "epoch": 0.30510846745976206, "grad_norm": 0.4813996111376022, "kl": 0.80859375, "learning_rate": 9.916311632616525e-06, "loss": 0.0001, "step": 872 }, { "clip_ratio": 0.005378021392971277, "clipped_completions_ratio": 0.0, "epoch": 0.30545836249125263, "grad_norm": 0.8854889203091786, "kl": 0.9609375, "learning_rate": 9.915754354955445e-06, "loss": 0.0124, "max_completion_length": 203.0, "max_terminated_completion_length": 203.0, "mean_completion_length": 141.19644165039062, "mean_terminated_completion_length": 141.19644165039062, "min_completion_length": 67.0, "min_terminated_completion_length": 67.0, "num_tokens": 4402374.0, "reward": 2.841576337814331, "reward_std": 0.0904935821890831, "rewards/check_gptzero_func/mean": 0.9642857313156128, "rewards/check_gptzero_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9181066751480103, "rewards/check_winston_local_func/std": 0.16344119608402252, "rewards/sentence_count_match_reward_logic/mean": 0.9591836929321289, "rewards/sentence_count_match_reward_logic/std": 0.09020400047302246, "step": 873 }, { "clip_ratio": 0.020221464335918427, "epoch": 0.30580825752274315, "grad_norm": 0.5982543019469264, "kl": 0.97265625, "learning_rate": 9.915195243764514e-06, "loss": 0.0099, "step": 874 }, { "clip_ratio": 0.032543107867240906, "epoch": 0.30615815255423373, "grad_norm": 0.4751208382165145, "kl": 0.9765625, "learning_rate": 9.91463429925228e-06, "loss": 0.0075, "step": 875 }, { "clip_ratio": 0.044996634125709534, "epoch": 0.3065080475857243, "grad_norm": 0.40526692263644537, "kl": 0.9765625, "learning_rate": 9.914071521627964e-06, "loss": 0.0069, "step": 876 }, { "clip_ratio": 0.0038955160416662693, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.30685794261721483, "grad_norm": 0.7397041082013885, "kl": 0.80078125, "learning_rate": 9.91350691110148e-06, "loss": 0.0053, "max_completion_length": 256.0, "max_terminated_completion_length": 231.0, "mean_completion_length": 179.23214721679688, "mean_terminated_completion_length": 166.4375, "min_completion_length": 89.0, "min_terminated_completion_length": 89.0, "num_tokens": 4424699.0, "reward": 2.354552745819092, "reward_std": 0.11858876794576645, "rewards/check_gptzero_func/mean": 0.7142857313156128, "rewards/check_gptzero_func/std": 0.4558422863483429, "rewards/check_winston_local_func/mean": 0.6482034921646118, "rewards/check_winston_local_func/std": 0.30362194776535034, "rewards/sentence_count_match_reward_logic/mean": 0.9920635223388672, "rewards/sentence_count_match_reward_logic/std": 0.03421161696314812, "step": 877 }, { "clip_ratio": 0.011922070756554604, "epoch": 0.3072078376487054, "grad_norm": 0.5782162614251589, "kl": 0.7890625, "learning_rate": 9.912940467883421e-06, "loss": 0.003, "step": 878 }, { "clip_ratio": 0.024041883647441864, "epoch": 0.3075577326801959, "grad_norm": 0.4126496518760935, "kl": 0.80078125, "learning_rate": 9.912372192185064e-06, "loss": 0.0003, "step": 879 }, { "clip_ratio": 0.03605348989367485, "epoch": 0.3079076277116865, "grad_norm": 0.38844361910098735, "kl": 0.82421875, "learning_rate": 9.911802084218374e-06, "loss": -0.001, "step": 880 }, { "clip_ratio": 0.004703680519014597, "clipped_completions_ratio": 0.0714285714285714, "epoch": 0.308257522743177, "grad_norm": 0.9003996233741334, "kl": 0.99609375, "learning_rate": 9.911230144195992e-06, "loss": 0.0067, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 145.0178680419922, "mean_terminated_completion_length": 136.48077392578125, "min_completion_length": 73.0, "min_terminated_completion_length": 73.0, "num_tokens": 4442612.0, "reward": 2.0721328258514404, "reward_std": 0.19981519877910614, "rewards/check_gptzero_func/mean": 0.3392857015132904, "rewards/check_gptzero_func/std": 0.4777517318725586, "rewards/check_winston_local_func/mean": 0.7633529305458069, "rewards/check_winston_local_func/std": 0.2096220701932907, "rewards/sentence_count_match_reward_logic/mean": 0.9694939851760864, "rewards/sentence_count_match_reward_logic/std": 0.06281038373708725, "step": 881 }, { "clip_ratio": 0.01865055412054062, "epoch": 0.3086074177746676, "grad_norm": 0.5928445632120913, "kl": 1.0, "learning_rate": 9.910656372331246e-06, "loss": 0.0025, "step": 882 }, { "clip_ratio": 0.03731350600719452, "epoch": 0.3089573128061582, "grad_norm": 0.4585189524897681, "kl": 1.0234375, "learning_rate": 9.910080768838153e-06, "loss": 0.0014, "step": 883 }, { "clip_ratio": 0.04772050306200981, "epoch": 0.3093072078376487, "grad_norm": 0.42608112637163037, "kl": 1.0546875, "learning_rate": 9.909503333931402e-06, "loss": 0.0007, "step": 884 }, { "clip_ratio": 0.003945114091038704, "clipped_completions_ratio": 0.0, "epoch": 0.3096571028691393, "grad_norm": 0.6772398114102134, "kl": 0.953125, "learning_rate": 9.908924067826373e-06, "loss": 0.0118, "max_completion_length": 243.0, "max_terminated_completion_length": 243.0, "mean_completion_length": 163.23214721679688, "mean_terminated_completion_length": 163.23214721679688, "min_completion_length": 106.0, "min_terminated_completion_length": 106.0, "num_tokens": 4463001.0, "reward": 2.471806764602661, "reward_std": 0.0844702273607254, "rewards/check_gptzero_func/mean": 0.5714285969734192, "rewards/check_gptzero_func/std": 0.4993502199649811, "rewards/check_winston_local_func/mean": 0.9186603426933289, "rewards/check_winston_local_func/std": 0.09949959814548492, "rewards/sentence_count_match_reward_logic/mean": 0.9817176461219788, "rewards/sentence_count_match_reward_logic/std": 0.06795159727334976, "step": 885 }, { "clip_ratio": 0.011093233712017536, "epoch": 0.3100069979006298, "grad_norm": 0.5541477784634429, "kl": 0.984375, "learning_rate": 9.908342970739127e-06, "loss": 0.0095, "step": 886 }, { "clip_ratio": 0.023326493799686432, "epoch": 0.3103568929321204, "grad_norm": 0.48701142349260074, "kl": 0.9921875, "learning_rate": 9.907760042886406e-06, "loss": 0.0071, "step": 887 }, { "clip_ratio": 0.0336625762283802, "epoch": 0.3107067879636109, "grad_norm": 0.32098822184190695, "kl": 0.953125, "learning_rate": 9.90717528448564e-06, "loss": 0.0045, "step": 888 }, { "clip_ratio": 0.0038432925939559937, "clipped_completions_ratio": 0.0, "epoch": 0.3110566829951015, "grad_norm": 0.6210135348627654, "kl": 0.734375, "learning_rate": 9.906588695754933e-06, "loss": 0.0026, "max_completion_length": 246.0, "max_terminated_completion_length": 246.0, "mean_completion_length": 170.19644165039062, "mean_terminated_completion_length": 170.19644165039062, "min_completion_length": 121.0, "min_terminated_completion_length": 121.0, "num_tokens": 4483844.0, "reward": 2.4881014823913574, "reward_std": 0.18780750036239624, "rewards/check_gptzero_func/mean": 0.6785714030265808, "rewards/check_gptzero_func/std": 0.4712514281272888, "rewards/check_winston_local_func/mean": 0.8095301389694214, "rewards/check_winston_local_func/std": 0.213328555226326, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 889 }, { "clip_ratio": 0.00928865559399128, "epoch": 0.311406578026592, "grad_norm": 0.5277475815044218, "kl": 0.7265625, "learning_rate": 9.906000276913081e-06, "loss": 0.0002, "step": 890 }, { "clip_ratio": 0.023409387096762657, "epoch": 0.3117564730580826, "grad_norm": 0.48718335972555443, "kl": 0.734375, "learning_rate": 9.905410028179559e-06, "loss": -0.0018, "step": 891 }, { "clip_ratio": 0.038668230175971985, "epoch": 0.31210636808957315, "grad_norm": 0.4375365837413703, "kl": 0.75, "learning_rate": 9.904817949774524e-06, "loss": -0.0029, "step": 892 }, { "clip_ratio": 0.0030792905017733574, "clipped_completions_ratio": 0.1785714285714286, "epoch": 0.31245626312106367, "grad_norm": 0.591597104010393, "kl": 0.62109375, "learning_rate": 9.904224041918813e-06, "loss": 0.0067, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 191.69644165039062, "mean_terminated_completion_length": 177.71739196777344, "min_completion_length": 127.0, "min_terminated_completion_length": 127.0, "num_tokens": 4507523.0, "reward": 2.556955099105835, "reward_std": 0.19156593084335327, "rewards/check_gptzero_func/mean": 0.7678571343421936, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.8105263710021973, "rewards/check_winston_local_func/std": 0.2126593142747879, "rewards/sentence_count_match_reward_logic/mean": 0.9785714149475098, "rewards/sentence_count_match_reward_logic/std": 0.046570710837841034, "step": 893 }, { "clip_ratio": 0.008938318118453026, "epoch": 0.31280615815255425, "grad_norm": 0.4574499831438518, "kl": 0.6328125, "learning_rate": 9.90362830483395e-06, "loss": 0.0048, "step": 894 }, { "clip_ratio": 0.021221565082669258, "epoch": 0.31315605318404477, "grad_norm": 0.4300335374227883, "kl": 0.64453125, "learning_rate": 9.903030738742138e-06, "loss": 0.0022, "step": 895 }, { "clip_ratio": 0.031027832999825478, "epoch": 0.31350594821553535, "grad_norm": 0.3060578621480127, "kl": 0.6328125, "learning_rate": 9.902431343866266e-06, "loss": 0.0007, "step": 896 }, { "clip_ratio": 0.0045140027068555355, "clipped_completions_ratio": 0.0, "epoch": 0.31385584324702587, "grad_norm": 0.6954259525285015, "kl": 0.95703125, "learning_rate": 9.901830120429899e-06, "loss": 0.0136, "max_completion_length": 223.0, "max_terminated_completion_length": 223.0, "mean_completion_length": 139.60714721679688, "mean_terminated_completion_length": 139.60714721679688, "min_completion_length": 90.0, "min_terminated_completion_length": 90.0, "num_tokens": 4525117.0, "reward": 2.4727113246917725, "reward_std": 0.23622477054595947, "rewards/check_gptzero_func/mean": 0.625, "rewards/check_gptzero_func/std": 0.48850420117378235, "rewards/check_winston_local_func/mean": 0.8477113842964172, "rewards/check_winston_local_func/std": 0.2200753539800644, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 897 }, { "clip_ratio": 0.012766231782734394, "epoch": 0.31420573827851644, "grad_norm": 0.556245860285956, "kl": 0.95703125, "learning_rate": 9.901227068657292e-06, "loss": 0.0101, "step": 898 }, { "clip_ratio": 0.024819275364279747, "epoch": 0.314555633310007, "grad_norm": 0.4231530677249232, "kl": 0.9609375, "learning_rate": 9.900622188773374e-06, "loss": 0.0081, "step": 899 }, { "clip_ratio": 0.0396941602230072, "epoch": 0.31490552834149754, "grad_norm": 0.3984369414127261, "kl": 0.9765625, "learning_rate": 9.900015481003762e-06, "loss": 0.0069, "step": 900 }, { "clip_ratio": 0.003731941105797887, "clipped_completions_ratio": 0.125, "epoch": 0.3152554233729881, "grad_norm": 0.6826291065706623, "kl": 0.7890625, "learning_rate": 9.89940694557475e-06, "loss": 0.0037, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 161.83929443359375, "mean_terminated_completion_length": 148.38775634765625, "min_completion_length": 83.0, "min_terminated_completion_length": 83.0, "num_tokens": 4545188.0, "reward": 2.7463901042938232, "reward_std": 0.1716335415840149, "rewards/check_gptzero_func/mean": 0.9285714030265808, "rewards/check_gptzero_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.8426198959350586, "rewards/check_winston_local_func/std": 0.2138928920030594, "rewards/sentence_count_match_reward_logic/mean": 0.9751983880996704, "rewards/sentence_count_match_reward_logic/std": 0.05181296914815903, "step": 901 }, { "clip_ratio": 0.012126912362873554, "epoch": 0.31560531840447864, "grad_norm": 0.5087860437799461, "kl": 0.7890625, "learning_rate": 9.898796582713317e-06, "loss": 0.0008, "step": 902 }, { "clip_ratio": 0.027255095541477203, "epoch": 0.3159552134359692, "grad_norm": 0.3743102284714698, "kl": 0.796875, "learning_rate": 9.898184392647123e-06, "loss": -0.0017, "step": 903 }, { "clip_ratio": 0.037316977977752686, "epoch": 0.31630510846745974, "grad_norm": 0.3496340025887134, "kl": 0.8046875, "learning_rate": 9.897570375604508e-06, "loss": -0.003, "step": 904 }, { "clip_ratio": 0.00464384350925684, "clipped_completions_ratio": 0.0714285714285714, "epoch": 0.3166550034989503, "grad_norm": 0.5684180897498168, "kl": 0.6953125, "learning_rate": 9.896954531814496e-06, "loss": 0.0072, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 192.92857360839844, "mean_terminated_completion_length": 188.07693481445312, "min_completion_length": 103.0, "min_terminated_completion_length": 103.0, "num_tokens": 4568880.0, "reward": 2.582216739654541, "reward_std": 0.3466753661632538, "rewards/check_gptzero_func/mean": 0.75, "rewards/check_gptzero_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.8634666204452515, "rewards/check_winston_local_func/std": 0.1654520332813263, "rewards/sentence_count_match_reward_logic/mean": 0.96875, "rewards/sentence_count_match_reward_logic/std": 0.06418191641569138, "step": 905 }, { "clip_ratio": 0.007928028702735901, "epoch": 0.3170048985304409, "grad_norm": 0.5090495983308246, "kl": 0.6953125, "learning_rate": 9.89633686150679e-06, "loss": 0.0053, "step": 906 }, { "clip_ratio": 0.015656523406505585, "epoch": 0.3173547935619314, "grad_norm": 0.3610395965755186, "kl": 0.6953125, "learning_rate": 9.895717364911774e-06, "loss": 0.003, "step": 907 }, { "clip_ratio": 0.02591288648545742, "epoch": 0.317704688593422, "grad_norm": 0.3066418640369456, "kl": 0.69921875, "learning_rate": 9.895096042260517e-06, "loss": 0.0017, "step": 908 }, { "clip_ratio": 0.002318816026672721, "clipped_completions_ratio": 0.3214285714285714, "epoch": 0.3180545836249125, "grad_norm": 0.5147779667642478, "kl": 0.59375, "learning_rate": 9.894472893784766e-06, "loss": 0.0043, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 213.55357360839844, "mean_terminated_completion_length": 193.44737243652344, "min_completion_length": 105.0, "min_terminated_completion_length": 105.0, "num_tokens": 4596023.0, "reward": 2.0683350563049316, "reward_std": 0.27528801560401917, "rewards/check_gptzero_func/mean": 0.4107142984867096, "rewards/check_gptzero_func/std": 0.49641573429107666, "rewards/check_winston_local_func/mean": 0.7449222207069397, "rewards/check_winston_local_func/std": 0.22789479792118073, "rewards/sentence_count_match_reward_logic/mean": 0.9126983880996704, "rewards/sentence_count_match_reward_logic/std": 0.11932146549224854, "step": 909 }, { "clip_ratio": 0.005443432833999395, "epoch": 0.3184044786564031, "grad_norm": 0.41293839003280663, "kl": 0.59765625, "learning_rate": 9.893847919716948e-06, "loss": 0.0027, "step": 910 }, { "clip_ratio": 0.016589269042015076, "epoch": 0.3187543736878936, "grad_norm": 0.3845828739670669, "kl": 0.60546875, "learning_rate": 9.893221120290172e-06, "loss": 0.0014, "step": 911 }, { "clip_ratio": 0.028564833104610443, "epoch": 0.3191042687193842, "grad_norm": 0.34276215539022664, "kl": 0.61328125, "learning_rate": 9.892592495738229e-06, "loss": -0.0, "step": 912 }, { "clip_ratio": 0.004525037482380867, "clipped_completions_ratio": 0.017857142857142905, "epoch": 0.31945416375087476, "grad_norm": 0.7157882961886854, "kl": 1.046875, "learning_rate": 9.891962046295593e-06, "loss": 0.0047, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 149.9107208251953, "mean_terminated_completion_length": 147.9818115234375, "min_completion_length": 100.0, "min_terminated_completion_length": 100.0, "num_tokens": 4614522.0, "reward": 2.3774099349975586, "reward_std": 0.11898919939994812, "rewards/check_gptzero_func/mean": 0.5178571343421936, "rewards/check_gptzero_func/std": 0.5042031407356262, "rewards/check_winston_local_func/mean": 0.866695761680603, "rewards/check_winston_local_func/std": 0.17818060517311096, "rewards/sentence_count_match_reward_logic/mean": 0.9928570985794067, "rewards/sentence_count_match_reward_logic/std": 0.03745126724243164, "step": 913 }, { "clip_ratio": 0.011784140020608902, "epoch": 0.3198040587823653, "grad_norm": 0.6407625842714159, "kl": 1.046875, "learning_rate": 9.891329772197409e-06, "loss": 0.0018, "step": 914 }, { "clip_ratio": 0.029043113812804222, "epoch": 0.32015395381385586, "grad_norm": 0.4829117751832527, "kl": 1.0625, "learning_rate": 9.890695673679516e-06, "loss": -0.0006, "step": 915 }, { "clip_ratio": 0.04281826689839363, "epoch": 0.3205038488453464, "grad_norm": 0.5113341157106395, "kl": 1.0625, "learning_rate": 9.890059750978425e-06, "loss": -0.002, "step": 916 }, { "clip_ratio": 0.0030329341534525156, "clipped_completions_ratio": 0.0, "epoch": 0.32085374387683696, "grad_norm": 0.5824940615751932, "kl": 0.796875, "learning_rate": 9.889422004331326e-06, "loss": 0.0064, "max_completion_length": 232.0, "max_terminated_completion_length": 232.0, "mean_completion_length": 181.37501525878906, "mean_terminated_completion_length": 181.37501525878906, "min_completion_length": 140.0, "min_terminated_completion_length": 140.0, "num_tokens": 4636431.0, "reward": 1.9421368837356567, "reward_std": 0.2218908667564392, "rewards/check_gptzero_func/mean": 0.3571428656578064, "rewards/check_gptzero_func/std": 0.48349377512931824, "rewards/check_winston_local_func/mean": 0.5872259140014648, "rewards/check_winston_local_func/std": 0.3352111279964447, "rewards/sentence_count_match_reward_logic/mean": 0.9977678656578064, "rewards/sentence_count_match_reward_logic/std": 0.016703827306628227, "step": 917 }, { "clip_ratio": 0.0070825242437422276, "epoch": 0.3212036389083275, "grad_norm": 0.509364842839671, "kl": 0.796875, "learning_rate": 9.888782433976094e-06, "loss": 0.004, "step": 918 }, { "clip_ratio": 0.020670613273978233, "epoch": 0.32155353393981806, "grad_norm": 0.3713670752929556, "kl": 0.8203125, "learning_rate": 9.888141040151284e-06, "loss": 0.0019, "step": 919 }, { "clip_ratio": 0.031534772366285324, "epoch": 0.3219034289713086, "grad_norm": 0.3424248708586571, "kl": 0.83984375, "learning_rate": 9.88749782309613e-06, "loss": 0.0005, "step": 920 }, { "clip_ratio": 0.002900431863963604, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.32225332400279916, "grad_norm": 0.6597356886216599, "kl": 0.82421875, "learning_rate": 9.886852783050544e-06, "loss": 0.0103, "max_completion_length": 256.0, "max_terminated_completion_length": 221.0, "mean_completion_length": 175.98214721679688, "mean_terminated_completion_length": 162.64584350585938, "min_completion_length": 104.0, "min_terminated_completion_length": 104.0, "num_tokens": 4657846.0, "reward": 2.1084964275360107, "reward_std": 0.18297268450260162, "rewards/check_gptzero_func/mean": 0.3571428656578064, "rewards/check_gptzero_func/std": 0.48349377512931824, "rewards/check_winston_local_func/mean": 0.7625142931938171, "rewards/check_winston_local_func/std": 0.3232145309448242, "rewards/sentence_count_match_reward_logic/mean": 0.9888392686843872, "rewards/sentence_count_match_reward_logic/std": 0.03596704453229904, "step": 921 }, { "clip_ratio": 0.01098565198481083, "epoch": 0.32260321903428973, "grad_norm": 0.500059842385023, "kl": 0.83203125, "learning_rate": 9.886205920255123e-06, "loss": 0.0071, "step": 922 }, { "clip_ratio": 0.02701912447810173, "epoch": 0.32295311406578026, "grad_norm": 0.44241756399182275, "kl": 0.8359375, "learning_rate": 9.885557234951137e-06, "loss": 0.0049, "step": 923 }, { "clip_ratio": 0.03830842673778534, "epoch": 0.32330300909727083, "grad_norm": 0.35294905229529006, "kl": 0.8359375, "learning_rate": 9.88490672738054e-06, "loss": 0.0038, "step": 924 }, { "clip_ratio": 0.00479048490524292, "clipped_completions_ratio": 0.0, "epoch": 0.32365290412876135, "grad_norm": 0.828917146863201, "kl": 1.15625, "learning_rate": 9.884254397785966e-06, "loss": 0.0126, "max_completion_length": 245.0, "max_terminated_completion_length": 245.0, "mean_completion_length": 140.55357360839844, "mean_terminated_completion_length": 140.55357360839844, "min_completion_length": 65.0, "min_terminated_completion_length": 65.0, "num_tokens": 4675333.0, "reward": 2.5482473373413086, "reward_std": 0.2532906234264374, "rewards/check_gptzero_func/mean": 0.6607142686843872, "rewards/check_gptzero_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.8875328898429871, "rewards/check_winston_local_func/std": 0.16429398953914642, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 925 }, { "clip_ratio": 0.012946797534823418, "epoch": 0.32400279916025193, "grad_norm": 0.6241618318286197, "kl": 1.15625, "learning_rate": 9.883600246410729e-06, "loss": 0.0084, "step": 926 }, { "clip_ratio": 0.033739522099494934, "epoch": 0.32435269419174245, "grad_norm": 0.44786031143108657, "kl": 1.171875, "learning_rate": 9.882944273498821e-06, "loss": 0.0057, "step": 927 }, { "clip_ratio": 0.04805877432227135, "epoch": 0.32470258922323303, "grad_norm": 0.3351522638730905, "kl": 1.1875, "learning_rate": 9.882286479294911e-06, "loss": 0.0049, "step": 928 }, { "clip_ratio": 0.0022244821302592754, "clipped_completions_ratio": 0.125, "epoch": 0.3250524842547236, "grad_norm": 0.737656816473442, "kl": 0.9375, "learning_rate": 9.881626864044354e-06, "loss": 0.0095, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 173.3928680419922, "mean_terminated_completion_length": 161.59182739257812, "min_completion_length": 67.0, "min_terminated_completion_length": 67.0, "num_tokens": 4696571.0, "reward": 2.6246135234832764, "reward_std": 0.1689365953207016, "rewards/check_gptzero_func/mean": 0.8035714030265808, "rewards/check_gptzero_func/std": 0.40089187026023865, "rewards/check_winston_local_func/mean": 0.8210417628288269, "rewards/check_winston_local_func/std": 0.2381395548582077, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 929 }, { "clip_ratio": 0.013212600722908974, "epoch": 0.3254023792862141, "grad_norm": 0.4518413233825127, "kl": 0.9375, "learning_rate": 9.880965427993177e-06, "loss": 0.006, "step": 930 }, { "clip_ratio": 0.023299552500247955, "epoch": 0.3257522743177047, "grad_norm": 0.40505043729797624, "kl": 0.9453125, "learning_rate": 9.88030217138809e-06, "loss": 0.0049, "step": 931 }, { "clip_ratio": 0.033195145428180695, "epoch": 0.3261021693491952, "grad_norm": 0.35669103430345855, "kl": 0.95703125, "learning_rate": 9.879637094476482e-06, "loss": 0.0036, "step": 932 }, { "clip_ratio": 0.005186777561903, "clipped_completions_ratio": 0.0, "epoch": 0.3264520643806858, "grad_norm": 0.6362173446564342, "kl": 0.796875, "learning_rate": 9.87897019750642e-06, "loss": 0.0049, "max_completion_length": 213.0, "max_terminated_completion_length": 213.0, "mean_completion_length": 161.98214721679688, "mean_terminated_completion_length": 161.98214721679688, "min_completion_length": 123.0, "min_terminated_completion_length": 123.0, "num_tokens": 4716338.0, "reward": 2.5859880447387695, "reward_std": 0.22734864056110382, "rewards/check_gptzero_func/mean": 0.6785714030265808, "rewards/check_gptzero_func/std": 0.4712514281272888, "rewards/check_winston_local_func/mean": 0.9205966591835022, "rewards/check_winston_local_func/std": 0.07871120423078537, "rewards/sentence_count_match_reward_logic/mean": 0.9868196845054626, "rewards/sentence_count_match_reward_logic/std": 0.04257231950759888, "step": 933 }, { "clip_ratio": 0.010869575664401054, "epoch": 0.3268019594121763, "grad_norm": 0.5442442831241402, "kl": 0.80078125, "learning_rate": 9.878301480726654e-06, "loss": 0.0019, "step": 934 }, { "clip_ratio": 0.025299686938524246, "epoch": 0.3271518544436669, "grad_norm": 0.35865174490911256, "kl": 0.80078125, "learning_rate": 9.877630944386603e-06, "loss": -0.0008, "step": 935 }, { "clip_ratio": 0.03921305388212204, "epoch": 0.3275017494751575, "grad_norm": 0.3305245189047512, "kl": 0.8046875, "learning_rate": 9.876958588736371e-06, "loss": -0.0019, "step": 936 }, { "clip_ratio": 0.004507238976657391, "clipped_completions_ratio": 0.0, "epoch": 0.327851644506648, "grad_norm": 0.8376692242950105, "kl": 1.078125, "learning_rate": 9.876284414026747e-06, "loss": 0.0055, "max_completion_length": 230.0, "max_terminated_completion_length": 230.0, "mean_completion_length": 160.83929443359375, "mean_terminated_completion_length": 160.83929443359375, "min_completion_length": 67.0, "min_terminated_completion_length": 67.0, "num_tokens": 4736033.0, "reward": 2.569384813308716, "reward_std": 0.09965448826551437, "rewards/check_gptzero_func/mean": 0.6964285969734192, "rewards/check_gptzero_func/std": 0.4639609158039093, "rewards/check_winston_local_func/mean": 0.9099105000495911, "rewards/check_winston_local_func/std": 0.11965200304985046, "rewards/sentence_count_match_reward_logic/mean": 0.9630456566810608, "rewards/sentence_count_match_reward_logic/std": 0.06553526222705841, "step": 937 }, { "clip_ratio": 0.013318615965545177, "epoch": 0.3282015395381386, "grad_norm": 0.5783727913990948, "kl": 1.0390625, "learning_rate": 9.875608420509186e-06, "loss": 0.0034, "step": 938 }, { "clip_ratio": 0.02318425290286541, "epoch": 0.3285514345696291, "grad_norm": 0.5187880194380734, "kl": 1.046875, "learning_rate": 9.874930608435826e-06, "loss": 0.0016, "step": 939 }, { "clip_ratio": 0.031220905482769012, "epoch": 0.3289013296011197, "grad_norm": 0.506193151344545, "kl": 1.0859375, "learning_rate": 9.874250978059489e-06, "loss": 0.0003, "step": 940 }, { "clip_ratio": 0.0030795701313763857, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.3292512246326102, "grad_norm": 0.617227737492987, "kl": 0.83984375, "learning_rate": 9.873569529633671e-06, "loss": 0.004, "max_completion_length": 256.0, "max_terminated_completion_length": 211.0, "mean_completion_length": 177.37501525878906, "mean_terminated_completion_length": 164.27084350585938, "min_completion_length": 138.0, "min_terminated_completion_length": 138.0, "num_tokens": 4758902.0, "reward": 2.7814011573791504, "reward_std": 0.17446324229240417, "rewards/check_gptzero_func/mean": 0.9107142686843872, "rewards/check_gptzero_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9197937846183777, "rewards/check_winston_local_func/std": 0.1603282392024994, "rewards/sentence_count_match_reward_logic/mean": 0.9508928656578064, "rewards/sentence_count_match_reward_logic/std": 0.06747013330459595, "step": 941 }, { "clip_ratio": 0.008859723806381226, "epoch": 0.3296011196641008, "grad_norm": 0.4982288472473728, "kl": 0.84765625, "learning_rate": 9.87288626341254e-06, "loss": 0.0014, "step": 942 }, { "clip_ratio": 0.023553533479571342, "epoch": 0.32995101469559135, "grad_norm": 0.35861147747112276, "kl": 0.8671875, "learning_rate": 9.872201179650954e-06, "loss": -0.0006, "step": 943 }, { "clip_ratio": 0.03776406869292259, "epoch": 0.33030090972708187, "grad_norm": 0.3604634577710512, "kl": 0.87890625, "learning_rate": 9.871514278604439e-06, "loss": -0.0017, "step": 944 }, { "clip_ratio": 0.005135310348123312, "clipped_completions_ratio": 0.0, "epoch": 0.33065080475857245, "grad_norm": 0.7913045961129755, "kl": 0.98046875, "learning_rate": 9.870825560529205e-06, "loss": 0.0107, "max_completion_length": 234.0, "max_terminated_completion_length": 234.0, "mean_completion_length": 145.5, "mean_terminated_completion_length": 145.5, "min_completion_length": 74.0, "min_terminated_completion_length": 74.0, "num_tokens": 4777194.0, "reward": 2.5710930824279785, "reward_std": 0.20636114478111267, "rewards/check_gptzero_func/mean": 0.8214285969734192, "rewards/check_gptzero_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.7496641874313354, "rewards/check_winston_local_func/std": 0.2916586995124817, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 945 }, { "clip_ratio": 0.013874468393623829, "epoch": 0.33100069979006297, "grad_norm": 0.5809006186587881, "kl": 0.984375, "learning_rate": 9.870135025682135e-06, "loss": 0.007, "step": 946 }, { "clip_ratio": 0.03391823172569275, "epoch": 0.33135059482155355, "grad_norm": 0.5388082470739783, "kl": 0.98828125, "learning_rate": 9.869442674320792e-06, "loss": 0.005, "step": 947 }, { "clip_ratio": 0.047077469527721405, "epoch": 0.33170048985304407, "grad_norm": 0.3880096705820106, "kl": 1.0, "learning_rate": 9.86874850670342e-06, "loss": 0.004, "step": 948 }, { "clip_ratio": 0.00473359227180481, "clipped_completions_ratio": 0.1071428571428571, "epoch": 0.33205038488453464, "grad_norm": 0.7802402186377158, "kl": 0.9140625, "learning_rate": 9.868052523088934e-06, "loss": 0.0517, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 171.0357208251953, "mean_terminated_completion_length": 160.83999633789062, "min_completion_length": 102.0, "min_terminated_completion_length": 102.0, "num_tokens": 4797996.0, "reward": 2.361348867416382, "reward_std": 0.15517657995224, "rewards/check_gptzero_func/mean": 0.5714285969734192, "rewards/check_gptzero_func/std": 0.4993502199649811, "rewards/check_winston_local_func/mean": 0.8140274882316589, "rewards/check_winston_local_func/std": 0.22594289481639862, "rewards/sentence_count_match_reward_logic/mean": 0.9758929014205933, "rewards/sentence_count_match_reward_logic/std": 0.1087077185511589, "step": 949 }, { "clip_ratio": 0.016652243211865425, "epoch": 0.33240027991602517, "grad_norm": 0.5422508853581515, "kl": 0.93359375, "learning_rate": 9.867354723736928e-06, "loss": 0.0483, "step": 950 }, { "clip_ratio": 0.028719797730445862, "epoch": 0.33275017494751574, "grad_norm": 0.5858287271598188, "kl": 0.94921875, "learning_rate": 9.86665510890768e-06, "loss": 0.0463, "step": 951 }, { "clip_ratio": 0.033635616302490234, "epoch": 0.3331000699790063, "grad_norm": 0.35620300284770195, "kl": 0.9375, "learning_rate": 9.865953678862133e-06, "loss": 0.0444, "step": 952 }, { "clip_ratio": 0.003948185592889786, "clipped_completions_ratio": 0.0535714285714286, "epoch": 0.33344996501049684, "grad_norm": 0.6675783741314065, "kl": 1.0234375, "learning_rate": 9.865250433861917e-06, "loss": 0.0124, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 154.5357208251953, "mean_terminated_completion_length": 148.79244995117188, "min_completion_length": 72.0, "min_terminated_completion_length": 72.0, "num_tokens": 4817018.0, "reward": 2.5918586254119873, "reward_std": 0.018290195614099503, "rewards/check_gptzero_func/mean": 0.7142857313156128, "rewards/check_gptzero_func/std": 0.4558422863483429, "rewards/check_winston_local_func/mean": 0.8775725960731506, "rewards/check_winston_local_func/std": 0.2657261788845062, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 953 }, { "clip_ratio": 0.015778224915266037, "epoch": 0.3337998600419874, "grad_norm": 0.5139027559549204, "kl": 1.0234375, "learning_rate": 9.864545374169337e-06, "loss": 0.0106, "step": 954 }, { "clip_ratio": 0.030906083062291145, "epoch": 0.33414975507347794, "grad_norm": 0.5424306246698362, "kl": 1.0234375, "learning_rate": 9.863838500047372e-06, "loss": 0.0091, "step": 955 }, { "clip_ratio": 0.03817666321992874, "epoch": 0.3344996501049685, "grad_norm": 0.43130676271149787, "kl": 1.03125, "learning_rate": 9.863129811759678e-06, "loss": 0.0073, "step": 956 }, { "clip_ratio": 0.002848481060937047, "clipped_completions_ratio": 0.0, "epoch": 0.33484954513645904, "grad_norm": 0.6825452360529747, "kl": 0.984375, "learning_rate": 9.862419309570592e-06, "loss": 0.0059, "max_completion_length": 208.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 160.17857360839844, "mean_terminated_completion_length": 160.17857360839844, "min_completion_length": 83.0, "min_terminated_completion_length": 83.0, "num_tokens": 4836428.0, "reward": 2.6837527751922607, "reward_std": 0.13554024696350098, "rewards/check_gptzero_func/mean": 0.8214285969734192, "rewards/check_gptzero_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.8623241186141968, "rewards/check_winston_local_func/std": 0.24953186511993408, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 957 }, { "clip_ratio": 0.015884296968579292, "epoch": 0.3351994401679496, "grad_norm": 0.5987440989320459, "kl": 0.99609375, "learning_rate": 9.861706993745122e-06, "loss": 0.0042, "step": 958 }, { "clip_ratio": 0.028796423226594925, "epoch": 0.3355493351994402, "grad_norm": 0.5163587496786317, "kl": 0.99609375, "learning_rate": 9.860992864548957e-06, "loss": 0.0021, "step": 959 }, { "clip_ratio": 0.03512963652610779, "epoch": 0.3358992302309307, "grad_norm": 0.3342439705574596, "kl": 0.98828125, "learning_rate": 9.86027692224846e-06, "loss": 0.0008, "step": 960 }, { "clip_ratio": 0.00413516815751791, "clipped_completions_ratio": 0.0, "epoch": 0.3362491252624213, "grad_norm": 0.7519828908359064, "kl": 1.0390625, "learning_rate": 9.85955916711067e-06, "loss": 0.0103, "max_completion_length": 214.0, "max_terminated_completion_length": 214.0, "mean_completion_length": 152.33929443359375, "mean_terminated_completion_length": 152.33929443359375, "min_completion_length": 90.0, "min_terminated_completion_length": 90.0, "num_tokens": 4855095.0, "reward": 2.4198830127716064, "reward_std": 0.2537258267402649, "rewards/check_gptzero_func/mean": 0.5178571343421936, "rewards/check_gptzero_func/std": 0.5042031407356262, "rewards/check_winston_local_func/mean": 0.9326381087303162, "rewards/check_winston_local_func/std": 0.11749542504549026, "rewards/sentence_count_match_reward_logic/mean": 0.9693877100944519, "rewards/sentence_count_match_reward_logic/std": 0.06270122528076172, "step": 961 }, { "clip_ratio": 0.010105122812092304, "epoch": 0.3365990202939118, "grad_norm": 0.560563334645668, "kl": 1.0390625, "learning_rate": 9.858839599403303e-06, "loss": 0.0076, "step": 962 }, { "clip_ratio": 0.02492489665746689, "epoch": 0.3369489153254024, "grad_norm": 0.41494460596286314, "kl": 1.046875, "learning_rate": 9.858118219394753e-06, "loss": 0.0054, "step": 963 }, { "clip_ratio": 0.03801753744482994, "epoch": 0.3372988103568929, "grad_norm": 0.32446264621368365, "kl": 1.0546875, "learning_rate": 9.857395027354085e-06, "loss": 0.004, "step": 964 }, { "clip_ratio": 0.0025317284744232893, "clipped_completions_ratio": 0.0, "epoch": 0.3376487053883835, "grad_norm": 0.7268029045150597, "kl": 0.9765625, "learning_rate": 9.856670023551043e-06, "loss": 0.0053, "max_completion_length": 233.0, "max_terminated_completion_length": 233.0, "mean_completion_length": 161.55357360839844, "mean_terminated_completion_length": 161.55357360839844, "min_completion_length": 95.0, "min_terminated_completion_length": 95.0, "num_tokens": 4874782.0, "reward": 2.894972085952759, "reward_std": 0.1376635879278183, "rewards/check_gptzero_func/mean": 0.9464285969734192, "rewards/check_gptzero_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9530075788497925, "rewards/check_winston_local_func/std": 0.07913074642419815, "rewards/sentence_count_match_reward_logic/mean": 0.9955357313156128, "rewards/sentence_count_match_reward_logic/std": 0.023407040163874626, "step": 965 }, { "clip_ratio": 0.009013036265969276, "epoch": 0.33799860041987406, "grad_norm": 0.5479059607045215, "kl": 0.98046875, "learning_rate": 9.855943208256046e-06, "loss": 0.0024, "step": 966 }, { "clip_ratio": 0.026292534545063972, "epoch": 0.3383484954513646, "grad_norm": 0.389655719244212, "kl": 0.98828125, "learning_rate": 9.855214581740194e-06, "loss": 0.0003, "step": 967 }, { "clip_ratio": 0.04059991240501404, "epoch": 0.33869839048285516, "grad_norm": 0.36275301420131917, "kl": 0.99609375, "learning_rate": 9.854484144275254e-06, "loss": -0.0008, "step": 968 }, { "clip_ratio": 0.004123796708881855, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.3390482855143457, "grad_norm": 0.551427648093647, "kl": 0.76171875, "learning_rate": 9.853751896133672e-06, "loss": 0.0098, "max_completion_length": 256.0, "max_terminated_completion_length": 239.0, "mean_completion_length": 197.94644165039062, "mean_terminated_completion_length": 188.27084350585938, "min_completion_length": 128.0, "min_terminated_completion_length": 128.0, "num_tokens": 4898787.0, "reward": 2.4463117122650146, "reward_std": 0.22476541996002197, "rewards/check_gptzero_func/mean": 0.6964285969734192, "rewards/check_gptzero_func/std": 0.4639609158039093, "rewards/check_winston_local_func/mean": 0.7736925482749939, "rewards/check_winston_local_func/std": 0.2510150969028473, "rewards/sentence_count_match_reward_logic/mean": 0.976190447807312, "rewards/sentence_count_match_reward_logic/std": 0.058848995715379715, "step": 969 }, { "clip_ratio": 0.006471659522503614, "epoch": 0.33939818054583626, "grad_norm": 0.47965770378357975, "kl": 0.76171875, "learning_rate": 9.853017837588569e-06, "loss": 0.0074, "step": 970 }, { "clip_ratio": 0.016761742532253265, "epoch": 0.3397480755773268, "grad_norm": 0.4363677083497622, "kl": 0.76953125, "learning_rate": 9.852281968913743e-06, "loss": 0.0048, "step": 971 }, { "clip_ratio": 0.0307687409222126, "epoch": 0.34009797060881736, "grad_norm": 0.32822014472798716, "kl": 0.76953125, "learning_rate": 9.85154429038367e-06, "loss": 0.0032, "step": 972 }, { "clip_ratio": 0.0035974280908703804, "clipped_completions_ratio": 0.0, "epoch": 0.34044786564030793, "grad_norm": 0.8547772247595723, "kl": 1.0859375, "learning_rate": 9.85080480227349e-06, "loss": 0.0144, "max_completion_length": 248.0, "max_terminated_completion_length": 248.0, "mean_completion_length": 169.33929443359375, "mean_terminated_completion_length": 169.33929443359375, "min_completion_length": 60.0, "min_terminated_completion_length": 60.0, "num_tokens": 4919238.0, "reward": 2.593381881713867, "reward_std": 0.04327172413468361, "rewards/check_gptzero_func/mean": 0.7142857313156128, "rewards/check_gptzero_func/std": 0.4558422863483429, "rewards/check_winston_local_func/mean": 0.8816472291946411, "rewards/check_winston_local_func/std": 0.20295676589012146, "rewards/sentence_count_match_reward_logic/mean": 0.9974489808082581, "rewards/sentence_count_match_reward_logic/std": 0.019090088084340096, "step": 973 }, { "clip_ratio": 0.014175280928611755, "epoch": 0.34079776067179846, "grad_norm": 0.5856606739785767, "kl": 1.078125, "learning_rate": 9.85006350485903e-06, "loss": 0.0113, "step": 974 }, { "clip_ratio": 0.03442869335412979, "epoch": 0.34114765570328903, "grad_norm": 0.41396371829684775, "kl": 1.09375, "learning_rate": 9.849320398416784e-06, "loss": 0.0079, "step": 975 }, { "clip_ratio": 0.04698638245463371, "epoch": 0.34149755073477955, "grad_norm": 0.38192874911870556, "kl": 1.078125, "learning_rate": 9.848575483223925e-06, "loss": 0.0063, "step": 976 }, { "clip_ratio": 0.004062918480485678, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.34184744576627013, "grad_norm": 0.6187916199701984, "kl": 0.78515625, "learning_rate": 9.847828759558302e-06, "loss": 0.0015, "max_completion_length": 256.0, "max_terminated_completion_length": 243.0, "mean_completion_length": 207.6428680419922, "mean_terminated_completion_length": 199.58334350585938, "min_completion_length": 120.0, "min_terminated_completion_length": 120.0, "num_tokens": 4944178.0, "reward": 2.5914902687072754, "reward_std": 0.21761782467365265, "rewards/check_gptzero_func/mean": 0.7678571343421936, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.8346872329711914, "rewards/check_winston_local_func/std": 0.18017728626728058, "rewards/sentence_count_match_reward_logic/mean": 0.9889455437660217, "rewards/sentence_count_match_reward_logic/std": 0.0359317883849144, "step": 977 }, { "clip_ratio": 0.00806138850748539, "epoch": 0.34219734079776065, "grad_norm": 0.48660360047504864, "kl": 0.79296875, "learning_rate": 9.84708022769843e-06, "loss": -0.0008, "step": 978 }, { "clip_ratio": 0.02048332802951336, "epoch": 0.34254723582925123, "grad_norm": 0.4139621303635093, "kl": 0.80859375, "learning_rate": 9.84632988792351e-06, "loss": -0.0031, "step": 979 }, { "clip_ratio": 0.033752258867025375, "epoch": 0.34289713086074175, "grad_norm": 0.3780393957786283, "kl": 0.8125, "learning_rate": 9.845577740513409e-06, "loss": -0.0049, "step": 980 }, { "clip_ratio": 0.00337226502597332, "clipped_completions_ratio": 0.0, "epoch": 0.3432470258922323, "grad_norm": 0.5786314551241005, "kl": 0.7421875, "learning_rate": 9.844823785748669e-06, "loss": 0.0022, "max_completion_length": 237.0, "max_terminated_completion_length": 237.0, "mean_completion_length": 171.5357208251953, "mean_terminated_completion_length": 171.5357208251953, "min_completion_length": 146.0, "min_terminated_completion_length": 146.0, "num_tokens": 4964904.0, "reward": 2.7643532752990723, "reward_std": 0.040289122611284256, "rewards/check_gptzero_func/mean": 0.8571428656578064, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9357818365097046, "rewards/check_winston_local_func/std": 0.08150684088468552, "rewards/sentence_count_match_reward_logic/mean": 0.9714285731315613, "rewards/sentence_count_match_reward_logic/std": 0.07061877846717834, "step": 981 }, { "clip_ratio": 0.007594210095703602, "epoch": 0.3435969209237229, "grad_norm": 0.484372549447121, "kl": 0.74609375, "learning_rate": 9.844068023910512e-06, "loss": -0.0, "step": 982 }, { "clip_ratio": 0.01779591292142868, "epoch": 0.3439468159552134, "grad_norm": 0.34464205384999574, "kl": 0.75, "learning_rate": 9.843310455280828e-06, "loss": -0.0021, "step": 983 }, { "clip_ratio": 0.02997332066297531, "epoch": 0.344296710986704, "grad_norm": 0.30018237028337713, "kl": 0.75390625, "learning_rate": 9.842551080142182e-06, "loss": -0.0027, "step": 984 }, { "clip_ratio": 0.004151617176830769, "clipped_completions_ratio": 0.0, "epoch": 0.3446466060181945, "grad_norm": 0.8901058650936767, "kl": 1.1328125, "learning_rate": 9.841789898777817e-06, "loss": 0.0065, "max_completion_length": 214.0, "max_terminated_completion_length": 214.0, "mean_completion_length": 145.6428680419922, "mean_terminated_completion_length": 145.6428680419922, "min_completion_length": 78.0, "min_terminated_completion_length": 78.0, "num_tokens": 4982836.0, "reward": 2.3539254665374756, "reward_std": 0.10208884626626968, "rewards/check_gptzero_func/mean": 0.5535714030265808, "rewards/check_gptzero_func/std": 0.5016207695007324, "rewards/check_winston_local_func/mean": 0.8003543019294739, "rewards/check_winston_local_func/std": 0.317958265542984, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 985 }, { "clip_ratio": 0.011197238229215145, "epoch": 0.3449965010496851, "grad_norm": 0.9448833792280051, "kl": 1.2421875, "learning_rate": 9.84102691147164e-06, "loss": 0.0035, "step": 986 }, { "clip_ratio": 0.03370064124464989, "epoch": 0.3453463960811756, "grad_norm": 0.6664317357631265, "kl": 1.1640625, "learning_rate": 9.840262118508245e-06, "loss": -0.0001, "step": 987 }, { "clip_ratio": 0.05268578231334686, "epoch": 0.3456962911126662, "grad_norm": 0.4528927368829702, "kl": 1.1640625, "learning_rate": 9.83949552017289e-06, "loss": -0.0012, "step": 988 }, { "clip_ratio": 0.004026248585432768, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.3460461861441568, "grad_norm": 0.5978184005460149, "kl": 0.87890625, "learning_rate": 9.838727116751508e-06, "loss": 0.0063, "max_completion_length": 256.0, "max_terminated_completion_length": 233.0, "mean_completion_length": 178.58929443359375, "mean_terminated_completion_length": 165.6875, "min_completion_length": 102.0, "min_terminated_completion_length": 102.0, "num_tokens": 5004285.0, "reward": 2.7596616744995117, "reward_std": 0.04625946655869484, "rewards/check_gptzero_func/mean": 0.8571428656578064, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.933867871761322, "rewards/check_winston_local_func/std": 0.08203849196434021, "rewards/sentence_count_match_reward_logic/mean": 0.9686508178710938, "rewards/sentence_count_match_reward_logic/std": 0.07024431228637695, "step": 989 }, { "clip_ratio": 0.009626383893191814, "epoch": 0.3463960811756473, "grad_norm": 0.5032841384832489, "kl": 0.87109375, "learning_rate": 9.837956908530706e-06, "loss": 0.0035, "step": 990 }, { "clip_ratio": 0.019406918436288834, "epoch": 0.3467459762071379, "grad_norm": 0.4222172937891551, "kl": 0.875, "learning_rate": 9.837184895797765e-06, "loss": 0.0011, "step": 991 }, { "clip_ratio": 0.032765232026576996, "epoch": 0.3470958712386284, "grad_norm": 0.3054669091819159, "kl": 0.87890625, "learning_rate": 9.83641107884064e-06, "loss": -0.0008, "step": 992 }, { "clip_ratio": 0.004111025482416153, "clipped_completions_ratio": 0.0, "epoch": 0.34744576627011897, "grad_norm": 0.5936982506788407, "kl": 0.87109375, "learning_rate": 9.835635457947957e-06, "loss": 0.0073, "max_completion_length": 229.0, "max_terminated_completion_length": 229.0, "mean_completion_length": 165.48214721679688, "mean_terminated_completion_length": 165.48214721679688, "min_completion_length": 112.0, "min_terminated_completion_length": 112.0, "num_tokens": 5024520.0, "reward": 2.579582691192627, "reward_std": 0.2229398787021637, "rewards/check_gptzero_func/mean": 0.7142857313156128, "rewards/check_gptzero_func/std": 0.4558422863483429, "rewards/check_winston_local_func/mean": 0.9105562567710876, "rewards/check_winston_local_func/std": 0.13411632180213928, "rewards/sentence_count_match_reward_logic/mean": 0.9547406435012817, "rewards/sentence_count_match_reward_logic/std": 0.07327290624380112, "step": 993 }, { "clip_ratio": 0.008101382292807102, "epoch": 0.3477956613016095, "grad_norm": 0.5292498829140841, "kl": 0.8828125, "learning_rate": 9.834858033409012e-06, "loss": 0.0054, "step": 994 }, { "clip_ratio": 0.020942097529768944, "epoch": 0.34814555633310007, "grad_norm": 0.4375839464210657, "kl": 0.8828125, "learning_rate": 9.83407880551378e-06, "loss": 0.0031, "step": 995 }, { "clip_ratio": 0.030654750764369965, "epoch": 0.34849545136459065, "grad_norm": 0.36170879172838966, "kl": 0.8828125, "learning_rate": 9.833297774552905e-06, "loss": 0.002, "step": 996 }, { "clip_ratio": 0.00398838659748435, "clipped_completions_ratio": 0.0, "epoch": 0.34884534639608117, "grad_norm": 0.6882095712423868, "kl": 1.0859375, "learning_rate": 9.832514940817705e-06, "loss": 0.0091, "max_completion_length": 213.0, "max_terminated_completion_length": 213.0, "mean_completion_length": 158.85714721679688, "mean_terminated_completion_length": 158.85714721679688, "min_completion_length": 102.0, "min_terminated_completion_length": 102.0, "num_tokens": 5043912.0, "reward": 2.6730287075042725, "reward_std": 0.03902221471071243, "rewards/check_gptzero_func/mean": 0.7142857313156128, "rewards/check_gptzero_func/std": 0.4558422863483429, "rewards/check_winston_local_func/mean": 0.9587429761886597, "rewards/check_winston_local_func/std": 0.06203798949718475, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 997 }, { "clip_ratio": 0.01097427774220705, "epoch": 0.34919524142757175, "grad_norm": 0.472729375517795, "kl": 1.09375, "learning_rate": 9.83173030460017e-06, "loss": 0.0066, "step": 998 }, { "clip_ratio": 0.02613271214067936, "epoch": 0.34954513645906227, "grad_norm": 0.43088116967275497, "kl": 1.1015625, "learning_rate": 9.830943866192957e-06, "loss": 0.0049, "step": 999 }, { "clip_ratio": 0.03663092851638794, "epoch": 0.34989503149055284, "grad_norm": 0.34817849047410304, "kl": 1.109375, "learning_rate": 9.830155625889406e-06, "loss": 0.0035, "step": 1000 }, { "clip_ratio": 0.0021130992099642754, "clipped_completions_ratio": 0.0, "epoch": 0.35024492652204336, "grad_norm": 0.7788341236614141, "kl": 1.078125, "learning_rate": 9.829365583983518e-06, "loss": 0.0053, "max_completion_length": 193.0, "max_terminated_completion_length": 193.0, "mean_completion_length": 144.9107208251953, "mean_terminated_completion_length": 144.9107208251953, "min_completion_length": 65.0, "min_terminated_completion_length": 65.0, "num_tokens": 5061891.0, "reward": 2.552478313446045, "reward_std": 0.2334916591644287, "rewards/check_gptzero_func/mean": 0.6964285969734192, "rewards/check_gptzero_func/std": 0.4639609158039093, "rewards/check_winston_local_func/mean": 0.8590258359909058, "rewards/check_winston_local_func/std": 0.16801463067531586, "rewards/sentence_count_match_reward_logic/mean": 0.9970237612724304, "rewards/sentence_count_match_reward_logic/std": 0.02227177284657955, "step": 1001 }, { "clip_ratio": 0.011000622995197773, "epoch": 0.35059482155353394, "grad_norm": 0.4723779671006652, "kl": 1.09375, "learning_rate": 9.828573740769978e-06, "loss": 0.0024, "step": 1002 }, { "clip_ratio": 0.024035919457674026, "epoch": 0.3509447165850245, "grad_norm": 0.34031341728380815, "kl": 1.1171875, "learning_rate": 9.827780096544129e-06, "loss": 0.0012, "step": 1003 }, { "clip_ratio": 0.0364580936729908, "epoch": 0.35129461161651504, "grad_norm": 0.4033806171586014, "kl": 1.15625, "learning_rate": 9.826984651601998e-06, "loss": 0.0008, "step": 1004 }, { "clip_ratio": 0.004739958327263594, "clipped_completions_ratio": 0.0, "epoch": 0.3516445066480056, "grad_norm": 0.719778174448987, "kl": 1.078125, "learning_rate": 9.826187406240275e-06, "loss": 0.0124, "max_completion_length": 205.0, "max_terminated_completion_length": 205.0, "mean_completion_length": 142.0357208251953, "mean_terminated_completion_length": 142.0357208251953, "min_completion_length": 90.0, "min_terminated_completion_length": 90.0, "num_tokens": 5079365.0, "reward": 2.71061372756958, "reward_std": 0.14957712590694427, "rewards/check_gptzero_func/mean": 0.8035714030265808, "rewards/check_gptzero_func/std": 0.40089184045791626, "rewards/check_winston_local_func/mean": 0.9356135129928589, "rewards/check_winston_local_func/std": 0.09324430674314499, "rewards/sentence_count_match_reward_logic/mean": 0.9714285731315613, "rewards/sentence_count_match_reward_logic/std": 0.07061877846717834, "step": 1005 }, { "clip_ratio": 0.012946036644279957, "epoch": 0.35199440167949614, "grad_norm": 0.5374596379873889, "kl": 1.078125, "learning_rate": 9.825388360756326e-06, "loss": 0.0097, "step": 1006 }, { "clip_ratio": 0.02812086045742035, "epoch": 0.3523442967109867, "grad_norm": 0.3709904936062392, "kl": 1.078125, "learning_rate": 9.824587515448188e-06, "loss": 0.0077, "step": 1007 }, { "clip_ratio": 0.03732236102223396, "epoch": 0.35269419174247724, "grad_norm": 0.3547945491016646, "kl": 1.0859375, "learning_rate": 9.823784870614568e-06, "loss": 0.0074, "step": 1008 }, { "clip_ratio": 0.0034401551820337772, "clipped_completions_ratio": 0.0, "epoch": 0.3530440867739678, "grad_norm": 0.7626910260470886, "kl": 1.0859375, "learning_rate": 9.822980426554846e-06, "loss": 0.008, "max_completion_length": 205.0, "max_terminated_completion_length": 205.0, "mean_completion_length": 149.55357360839844, "mean_terminated_completion_length": 149.55357360839844, "min_completion_length": 81.0, "min_terminated_completion_length": 81.0, "num_tokens": 5097964.0, "reward": 2.6560544967651367, "reward_std": 0.29555603861808777, "rewards/check_gptzero_func/mean": 0.75, "rewards/check_gptzero_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.9060544967651367, "rewards/check_winston_local_func/std": 0.1416795551776886, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1009 }, { "clip_ratio": 0.011939933523535728, "epoch": 0.35339398180545833, "grad_norm": 0.5216228615484845, "kl": 1.09375, "learning_rate": 9.822174183569071e-06, "loss": 0.0055, "step": 1010 }, { "clip_ratio": 0.023967880755662918, "epoch": 0.3537438768369489, "grad_norm": 0.4537048284131478, "kl": 1.1015625, "learning_rate": 9.821366141957964e-06, "loss": 0.0035, "step": 1011 }, { "clip_ratio": 0.03829505667090416, "epoch": 0.3540937718684395, "grad_norm": 0.3516875339727975, "kl": 1.1015625, "learning_rate": 9.820556302022916e-06, "loss": 0.0017, "step": 1012 }, { "clip_ratio": 0.005968005862087011, "clipped_completions_ratio": 0.0, "epoch": 0.35444366689993, "grad_norm": 0.6902755111022826, "kl": 0.9296875, "learning_rate": 9.819744664065993e-06, "loss": 0.0101, "max_completion_length": 244.0, "max_terminated_completion_length": 244.0, "mean_completion_length": 188.69644165039062, "mean_terminated_completion_length": 188.69644165039062, "min_completion_length": 126.0, "min_terminated_completion_length": 126.0, "num_tokens": 5120971.0, "reward": 2.8152143955230713, "reward_std": 0.03145122528076172, "rewards/check_gptzero_func/mean": 0.8571428656578064, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9640235304832458, "rewards/check_winston_local_func/std": 0.04315488785505295, "rewards/sentence_count_match_reward_logic/mean": 0.9940475821495056, "rewards/sentence_count_match_reward_logic/std": 0.031209392473101616, "step": 1013 }, { "clip_ratio": 0.011332509107887745, "epoch": 0.3547935619314206, "grad_norm": 0.5600015250753794, "kl": 0.93359375, "learning_rate": 9.818931228389925e-06, "loss": 0.0082, "step": 1014 }, { "clip_ratio": 0.023984557017683983, "epoch": 0.3551434569629111, "grad_norm": 0.44939591302032794, "kl": 0.9375, "learning_rate": 9.818115995298117e-06, "loss": 0.0054, "step": 1015 }, { "clip_ratio": 0.03601376712322235, "epoch": 0.3554933519944017, "grad_norm": 0.3645944722696622, "kl": 0.94921875, "learning_rate": 9.817298965094644e-06, "loss": 0.0041, "step": 1016 }, { "clip_ratio": 0.004279031418263912, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.3558432470258922, "grad_norm": 0.8222160526352692, "kl": 0.96875, "learning_rate": 9.816480138084248e-06, "loss": 0.0109, "max_completion_length": 256.0, "max_terminated_completion_length": 199.0, "mean_completion_length": 154.55357360839844, "mean_terminated_completion_length": 137.64584350585938, "min_completion_length": 70.0, "min_terminated_completion_length": 70.0, "num_tokens": 5140498.0, "reward": 2.5659072399139404, "reward_std": 0.0999382957816124, "rewards/check_gptzero_func/mean": 0.7321428656578064, "rewards/check_gptzero_func/std": 0.44685041904449463, "rewards/check_winston_local_func/mean": 0.8925665616989136, "rewards/check_winston_local_func/std": 0.17331480979919434, "rewards/sentence_count_match_reward_logic/mean": 0.9411976933479309, "rewards/sentence_count_match_reward_logic/std": 0.11061926931142807, "step": 1017 }, { "clip_ratio": 0.016179976984858513, "epoch": 0.3561931420573828, "grad_norm": 0.5726359225502801, "kl": 0.984375, "learning_rate": 9.815659514572347e-06, "loss": 0.0084, "step": 1018 }, { "clip_ratio": 0.029918279498815536, "epoch": 0.35654303708887336, "grad_norm": 0.6299770130478196, "kl": 0.9921875, "learning_rate": 9.814837094865021e-06, "loss": 0.0069, "step": 1019 }, { "clip_ratio": 0.03593306615948677, "epoch": 0.3568929321203639, "grad_norm": 0.3650417498842024, "kl": 0.984375, "learning_rate": 9.814012879269031e-06, "loss": 0.0058, "step": 1020 }, { "clip_ratio": 0.003964471165090799, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.35724282715185446, "grad_norm": 0.6178198999706905, "kl": 0.90234375, "learning_rate": 9.813186868091799e-06, "loss": 0.0061, "max_completion_length": 256.0, "max_terminated_completion_length": 240.0, "mean_completion_length": 188.94644165039062, "mean_terminated_completion_length": 177.77084350585938, "min_completion_length": 117.0, "min_terminated_completion_length": 117.0, "num_tokens": 5163279.0, "reward": 2.5524559020996094, "reward_std": 0.22260595858097076, "rewards/check_gptzero_func/mean": 0.7142857313156128, "rewards/check_gptzero_func/std": 0.4558422863483429, "rewards/check_winston_local_func/mean": 0.9604917168617249, "rewards/check_winston_local_func/std": 0.043404243886470795, "rewards/sentence_count_match_reward_logic/mean": 0.8776785731315613, "rewards/sentence_count_match_reward_logic/std": 0.18313422799110413, "step": 1021 }, { "clip_ratio": 0.010191149078309536, "epoch": 0.357592722183345, "grad_norm": 0.6074758698768995, "kl": 0.90625, "learning_rate": 9.812359061641417e-06, "loss": 0.0035, "step": 1022 }, { "clip_ratio": 0.02694130502641201, "epoch": 0.35794261721483556, "grad_norm": 0.42556515563596703, "kl": 0.921875, "learning_rate": 9.811529460226651e-06, "loss": 0.0009, "step": 1023 }, { "clip_ratio": 0.03828049451112747, "epoch": 0.3582925122463261, "grad_norm": 0.3946742782985423, "kl": 0.921875, "learning_rate": 9.810698064156935e-06, "loss": -0.0004, "step": 1024 }, { "clip_ratio": 0.006175887770950794, "clipped_completions_ratio": 0.0, "epoch": 0.35864240727781665, "grad_norm": 0.7159976782134253, "kl": 0.84765625, "learning_rate": 9.809864873742372e-06, "loss": 0.0049, "max_completion_length": 232.0, "max_terminated_completion_length": 232.0, "mean_completion_length": 182.3928680419922, "mean_terminated_completion_length": 182.3928680419922, "min_completion_length": 126.0, "min_terminated_completion_length": 126.0, "num_tokens": 5185197.0, "reward": 2.67368483543396, "reward_std": 0.03772585093975067, "rewards/check_gptzero_func/mean": 0.7142857313156128, "rewards/check_gptzero_func/std": 0.4558422863483429, "rewards/check_winston_local_func/mean": 0.9865036606788635, "rewards/check_winston_local_func/std": 0.01055778656154871, "rewards/sentence_count_match_reward_logic/mean": 0.9728954434394836, "rewards/sentence_count_match_reward_logic/std": 0.05543208867311478, "step": 1025 }, { "clip_ratio": 0.010082010179758072, "epoch": 0.35899230230930723, "grad_norm": 0.5532236804645279, "kl": 0.84375, "learning_rate": 9.809029889293731e-06, "loss": 0.0016, "step": 1026 }, { "clip_ratio": 0.024589387699961662, "epoch": 0.35934219734079775, "grad_norm": 0.43790087361637486, "kl": 0.87109375, "learning_rate": 9.808193111122457e-06, "loss": -0.0002, "step": 1027 }, { "clip_ratio": 0.03787184879183769, "epoch": 0.35969209237228833, "grad_norm": 0.4172248446245142, "kl": 0.88671875, "learning_rate": 9.80735453954066e-06, "loss": -0.0018, "step": 1028 }, { "clip_ratio": 0.0035256510600447655, "clipped_completions_ratio": 0.0357142857142857, "epoch": 0.36004198740377885, "grad_norm": 0.918200415109547, "kl": 1.1640625, "learning_rate": 9.806514174861117e-06, "loss": 0.0128, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 136.94644165039062, "mean_terminated_completion_length": 132.5370330810547, "min_completion_length": 62.0, "min_terminated_completion_length": 62.0, "num_tokens": 5202242.0, "reward": 2.786620616912842, "reward_std": 0.20017196238040924, "rewards/check_gptzero_func/mean": 0.9107142686843872, "rewards/check_gptzero_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.911620557308197, "rewards/check_winston_local_func/std": 0.10721475630998611, "rewards/sentence_count_match_reward_logic/mean": 0.9642857313156128, "rewards/sentence_count_match_reward_logic/std": 0.08827348798513412, "step": 1029 }, { "clip_ratio": 0.015225419774651527, "epoch": 0.36039188243526943, "grad_norm": 0.5389368324846368, "kl": 1.171875, "learning_rate": 9.805672017397276e-06, "loss": 0.0101, "step": 1030 }, { "clip_ratio": 0.02768116444349289, "epoch": 0.36074177746675995, "grad_norm": 0.5225758360044308, "kl": 1.1953125, "learning_rate": 9.804828067463257e-06, "loss": 0.0092, "step": 1031 }, { "clip_ratio": 0.034805577248334885, "epoch": 0.3610916724982505, "grad_norm": 0.3754753224988853, "kl": 1.1953125, "learning_rate": 9.803982325373843e-06, "loss": 0.0079, "step": 1032 }, { "clip_ratio": 0.004946516826748848, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.3614415675297411, "grad_norm": 0.6813217497121985, "kl": 0.8671875, "learning_rate": 9.803134791444488e-06, "loss": 0.0025, "max_completion_length": 256.0, "max_terminated_completion_length": 209.0, "mean_completion_length": 174.6428680419922, "mean_terminated_completion_length": 161.08334350585938, "min_completion_length": 140.0, "min_terminated_completion_length": 140.0, "num_tokens": 5223990.0, "reward": 2.646001100540161, "reward_std": 0.08075606822967529, "rewards/check_gptzero_func/mean": 0.8571428656578064, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.8084160089492798, "rewards/check_winston_local_func/std": 0.32607969641685486, "rewards/sentence_count_match_reward_logic/mean": 0.9804421663284302, "rewards/sentence_count_match_reward_logic/std": 0.05238236114382744, "step": 1033 }, { "clip_ratio": 0.011250319890677929, "epoch": 0.3617914625612316, "grad_norm": 0.6154992638012529, "kl": 0.86328125, "learning_rate": 9.802285465991315e-06, "loss": 0.0001, "step": 1034 }, { "clip_ratio": 0.023676643148064613, "epoch": 0.3621413575927222, "grad_norm": 0.42385361685260425, "kl": 0.8671875, "learning_rate": 9.801434349331114e-06, "loss": -0.0031, "step": 1035 }, { "clip_ratio": 0.03842925280332565, "epoch": 0.3624912526242127, "grad_norm": 0.40048734158105925, "kl": 0.87109375, "learning_rate": 9.800581441781342e-06, "loss": -0.0049, "step": 1036 }, { "clip_ratio": 0.004111312795430422, "clipped_completions_ratio": 0.0, "epoch": 0.3628411476557033, "grad_norm": 0.5954320960808703, "kl": 0.89453125, "learning_rate": 9.799726743660128e-06, "loss": 0.0009, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 195.98214721679688, "mean_terminated_completion_length": 195.98214721679688, "min_completion_length": 103.0, "min_terminated_completion_length": 103.0, "num_tokens": 5247581.0, "reward": 2.5699524879455566, "reward_std": 0.1795409470796585, "rewards/check_gptzero_func/mean": 0.6607142686843872, "rewards/check_gptzero_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.9852585792541504, "rewards/check_winston_local_func/std": 0.010516049340367317, "rewards/sentence_count_match_reward_logic/mean": 0.923979640007019, "rewards/sentence_count_match_reward_logic/std": 0.17983929812908173, "step": 1037 }, { "clip_ratio": 0.007544501684606075, "epoch": 0.3631910426871938, "grad_norm": 0.5429375474792747, "kl": 0.89453125, "learning_rate": 9.798870255286262e-06, "loss": -0.0021, "step": 1038 }, { "clip_ratio": 0.021693162620067596, "epoch": 0.3635409377186844, "grad_norm": 0.410528356732804, "kl": 0.8984375, "learning_rate": 9.798011976979211e-06, "loss": -0.004, "step": 1039 }, { "clip_ratio": 0.034631870687007904, "epoch": 0.363890832750175, "grad_norm": 0.4130979253469296, "kl": 0.90234375, "learning_rate": 9.797151909059102e-06, "loss": -0.0052, "step": 1040 }, { "clip_ratio": 0.004220395814627409, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.3642407277816655, "grad_norm": 0.7701397795962057, "kl": 0.86328125, "learning_rate": 9.796290051846735e-06, "loss": 0.0052, "max_completion_length": 256.0, "max_terminated_completion_length": 199.0, "mean_completion_length": 169.71429443359375, "mean_terminated_completion_length": 155.33334350585938, "min_completion_length": 119.0, "min_terminated_completion_length": 119.0, "num_tokens": 5268597.0, "reward": 2.8666040897369385, "reward_std": 0.06277068704366684, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.8963657021522522, "rewards/check_winston_local_func/std": 0.09679704159498215, "rewards/sentence_count_match_reward_logic/mean": 0.9702380895614624, "rewards/sentence_count_match_reward_logic/std": 0.07854191213846207, "step": 1041 }, { "clip_ratio": 0.011724145151674747, "epoch": 0.3645906228131561, "grad_norm": 0.5495898899672776, "kl": 0.859375, "learning_rate": 9.795426405663571e-06, "loss": 0.0008, "step": 1042 }, { "clip_ratio": 0.026681307703256607, "epoch": 0.3649405178446466, "grad_norm": 0.41149957539729903, "kl": 0.86328125, "learning_rate": 9.79456097083174e-06, "loss": -0.001, "step": 1043 }, { "clip_ratio": 0.038695886731147766, "epoch": 0.36529041287613717, "grad_norm": 0.3176074074198517, "kl": 0.87890625, "learning_rate": 9.79369374767405e-06, "loss": -0.0019, "step": 1044 }, { "clip_ratio": 0.005156972445547581, "clipped_completions_ratio": 0.0357142857142857, "epoch": 0.3656403079076277, "grad_norm": 0.7497005199651836, "kl": 1.1015625, "learning_rate": 9.792824736513958e-06, "loss": 0.0123, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 162.6428680419922, "mean_terminated_completion_length": 159.1851806640625, "min_completion_length": 68.0, "min_terminated_completion_length": 68.0, "num_tokens": 5288537.0, "reward": 2.587907552719116, "reward_std": 0.14443224668502808, "rewards/check_gptzero_func/mean": 0.6607142686843872, "rewards/check_gptzero_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.9289790987968445, "rewards/check_winston_local_func/std": 0.11218193173408508, "rewards/sentence_count_match_reward_logic/mean": 0.9982143044471741, "rewards/sentence_count_match_reward_logic/std": 0.01336306519806385, "step": 1045 }, { "clip_ratio": 0.015749463811516762, "epoch": 0.36599020293911827, "grad_norm": 0.6209472760680838, "kl": 1.1171875, "learning_rate": 9.7919539376756e-06, "loss": 0.0086, "step": 1046 }, { "clip_ratio": 0.027046116068959236, "epoch": 0.3663400979706088, "grad_norm": 0.520325734842052, "kl": 1.1171875, "learning_rate": 9.791081351483777e-06, "loss": 0.0073, "step": 1047 }, { "clip_ratio": 0.038997139781713486, "epoch": 0.36668999300209937, "grad_norm": 0.3775683941147111, "kl": 1.1171875, "learning_rate": 9.790206978263955e-06, "loss": 0.0058, "step": 1048 }, { "clip_ratio": 0.0038740593008697033, "clipped_completions_ratio": 0.0, "epoch": 0.36703988803358994, "grad_norm": 0.7487170498478737, "kl": 0.99609375, "learning_rate": 9.789330818342266e-06, "loss": 0.0096, "max_completion_length": 227.0, "max_terminated_completion_length": 227.0, "mean_completion_length": 168.9107208251953, "mean_terminated_completion_length": 168.9107208251953, "min_completion_length": 119.0, "min_terminated_completion_length": 119.0, "num_tokens": 5309180.0, "reward": 2.716600179672241, "reward_std": 0.11443926393985748, "rewards/check_gptzero_func/mean": 0.8214285969734192, "rewards/check_gptzero_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.8951717019081116, "rewards/check_winston_local_func/std": 0.15504814684391022, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1049 }, { "clip_ratio": 0.01043405570089817, "epoch": 0.36738978306508047, "grad_norm": 0.5992036449391707, "kl": 0.984375, "learning_rate": 9.788452872045508e-06, "loss": 0.0064, "step": 1050 }, { "clip_ratio": 0.025407833978533745, "epoch": 0.36773967809657104, "grad_norm": 0.4646850098098831, "kl": 1.0, "learning_rate": 9.78757313970115e-06, "loss": 0.0047, "step": 1051 }, { "clip_ratio": 0.037016209214925766, "epoch": 0.36808957312806156, "grad_norm": 0.42231677218760405, "kl": 0.99609375, "learning_rate": 9.786691621637322e-06, "loss": 0.0038, "step": 1052 }, { "clip_ratio": 0.003559861332178116, "clipped_completions_ratio": 0.0, "epoch": 0.36843946815955214, "grad_norm": 0.724166793656295, "kl": 1.1484375, "learning_rate": 9.785808318182821e-06, "loss": 0.014, "max_completion_length": 199.0, "max_terminated_completion_length": 199.0, "mean_completion_length": 147.55357360839844, "mean_terminated_completion_length": 147.55357360839844, "min_completion_length": 103.0, "min_terminated_completion_length": 103.0, "num_tokens": 5327419.0, "reward": 2.945533275604248, "reward_std": 0.016819801181554794, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9633903503417969, "rewards/check_winston_local_func/std": 0.06517856568098068, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.04413674399256706, "step": 1053 }, { "clip_ratio": 0.011692943051457405, "epoch": 0.36878936319104266, "grad_norm": 0.5926575339439213, "kl": 1.1484375, "learning_rate": 9.784923229667113e-06, "loss": 0.0111, "step": 1054 }, { "clip_ratio": 0.027993595227599144, "epoch": 0.36913925822253324, "grad_norm": 0.3714402946730139, "kl": 1.15625, "learning_rate": 9.784036356420327e-06, "loss": 0.0084, "step": 1055 }, { "clip_ratio": 0.03938700631260872, "epoch": 0.3694891532540238, "grad_norm": 0.31453401256771824, "kl": 1.1640625, "learning_rate": 9.783147698773257e-06, "loss": 0.0077, "step": 1056 }, { "clip_ratio": 0.004382194485515356, "clipped_completions_ratio": 0.0535714285714286, "epoch": 0.36983904828551434, "grad_norm": 0.7269551258924888, "kl": 0.9140625, "learning_rate": 9.782257257057364e-06, "loss": 0.009, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 191.42857360839844, "mean_terminated_completion_length": 187.77359008789062, "min_completion_length": 127.0, "min_terminated_completion_length": 127.0, "num_tokens": 5350515.0, "reward": 2.81188702583313, "reward_std": 0.16453781723976135, "rewards/check_gptzero_func/mean": 0.8392857313156128, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.9726012349128723, "rewards/check_winston_local_func/std": 0.05377277359366417, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1057 }, { "clip_ratio": 0.012110810726881027, "epoch": 0.3701889433170049, "grad_norm": 0.5223482892384014, "kl": 0.92578125, "learning_rate": 9.781365031604776e-06, "loss": 0.006, "step": 1058 }, { "clip_ratio": 0.02760731428861618, "epoch": 0.37053883834849544, "grad_norm": 0.547269099501327, "kl": 0.94921875, "learning_rate": 9.780471022748284e-06, "loss": 0.0038, "step": 1059 }, { "clip_ratio": 0.03883157670497894, "epoch": 0.370888733379986, "grad_norm": 0.43793142341513186, "kl": 0.94921875, "learning_rate": 9.779575230821344e-06, "loss": 0.0021, "step": 1060 }, { "clip_ratio": 0.00520226638764143, "clipped_completions_ratio": 0.0, "epoch": 0.37123862841147653, "grad_norm": 0.8422436761654561, "kl": 1.09375, "learning_rate": 9.77867765615808e-06, "loss": 0.005, "max_completion_length": 208.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 152.69644165039062, "mean_terminated_completion_length": 152.69644165039062, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 5369106.0, "reward": 2.7020509243011475, "reward_std": 0.13440454006195068, "rewards/check_gptzero_func/mean": 0.7321428656578064, "rewards/check_gptzero_func/std": 0.446850448846817, "rewards/check_winston_local_func/mean": 0.9699080586433411, "rewards/check_winston_local_func/std": 0.041938938200473785, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1061 }, { "clip_ratio": 0.01609143055975437, "epoch": 0.3715885234429671, "grad_norm": 0.5724042256818541, "kl": 1.09375, "learning_rate": 9.777778299093274e-06, "loss": 0.0019, "step": 1062 }, { "clip_ratio": 0.03135962411761284, "epoch": 0.3719384184744577, "grad_norm": 0.4372510483630423, "kl": 1.1015625, "learning_rate": 9.776877159962384e-06, "loss": -0.0004, "step": 1063 }, { "clip_ratio": 0.04646845906972885, "epoch": 0.3722883135059482, "grad_norm": 0.44734503539627946, "kl": 1.1171875, "learning_rate": 9.775974239101522e-06, "loss": -0.0014, "step": 1064 }, { "clip_ratio": 0.005415027495473623, "clipped_completions_ratio": 0.0, "epoch": 0.3726382085374388, "grad_norm": 0.8278678996912758, "kl": 0.953125, "learning_rate": 9.77506953684747e-06, "loss": 0.0061, "max_completion_length": 236.0, "max_terminated_completion_length": 236.0, "mean_completion_length": 157.35714721679688, "mean_terminated_completion_length": 157.35714721679688, "min_completion_length": 111.0, "min_terminated_completion_length": 111.0, "num_tokens": 5388382.0, "reward": 2.6385445594787598, "reward_std": 0.15305311977863312, "rewards/check_gptzero_func/mean": 0.8035714030265808, "rewards/check_gptzero_func/std": 0.40089184045791626, "rewards/check_winston_local_func/mean": 0.8869072794914246, "rewards/check_winston_local_func/std": 0.20218528807163239, "rewards/sentence_count_match_reward_logic/mean": 0.948065459728241, "rewards/sentence_count_match_reward_logic/std": 0.08413240313529968, "step": 1065 }, { "clip_ratio": 0.01586555503308773, "epoch": 0.3729881035689293, "grad_norm": 0.5789253310457346, "kl": 0.95703125, "learning_rate": 9.774163053537675e-06, "loss": 0.0027, "step": 1066 }, { "clip_ratio": 0.033481620252132416, "epoch": 0.3733379986004199, "grad_norm": 0.49616844277776034, "kl": 0.9609375, "learning_rate": 9.773254789510244e-06, "loss": 0.0001, "step": 1067 }, { "clip_ratio": 0.04923311993479729, "epoch": 0.3736878936319104, "grad_norm": 0.37040334517145634, "kl": 0.96484375, "learning_rate": 9.772344745103955e-06, "loss": -0.0018, "step": 1068 }, { "clip_ratio": 0.00463307136669755, "clipped_completions_ratio": 0.0, "epoch": 0.374037788663401, "grad_norm": 0.7146084682930564, "kl": 1.015625, "learning_rate": 9.77143292065824e-06, "loss": 0.0042, "max_completion_length": 247.0, "max_terminated_completion_length": 247.0, "mean_completion_length": 188.25001525878906, "mean_terminated_completion_length": 188.25001525878906, "min_completion_length": 103.0, "min_terminated_completion_length": 103.0, "num_tokens": 5411244.0, "reward": 2.460357427597046, "reward_std": 0.2941552996635437, "rewards/check_gptzero_func/mean": 0.5, "rewards/check_gptzero_func/std": 0.5045249462127686, "rewards/check_winston_local_func/mean": 0.9603572487831116, "rewards/check_winston_local_func/std": 0.05094347149133682, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1069 }, { "clip_ratio": 0.01231194194406271, "epoch": 0.37438768369489156, "grad_norm": 0.5359259880564162, "kl": 1.0390625, "learning_rate": 9.770519316513205e-06, "loss": 0.0016, "step": 1070 }, { "clip_ratio": 0.024154234677553177, "epoch": 0.3747375787263821, "grad_norm": 0.5855450999256684, "kl": 1.046875, "learning_rate": 9.769603933009614e-06, "loss": 0.0001, "step": 1071 }, { "clip_ratio": 0.03279034048318863, "epoch": 0.37508747375787266, "grad_norm": 0.8608286993023648, "kl": 1.0078125, "learning_rate": 9.7686867704889e-06, "loss": -0.0007, "step": 1072 }, { "clip_ratio": 0.005127421580255032, "clipped_completions_ratio": 0.0, "epoch": 0.3754373687893632, "grad_norm": 0.65084220272849, "kl": 0.9453125, "learning_rate": 9.767767829293151e-06, "loss": 0.004, "max_completion_length": 235.0, "max_terminated_completion_length": 235.0, "mean_completion_length": 171.05357360839844, "mean_terminated_completion_length": 171.05357360839844, "min_completion_length": 90.0, "min_terminated_completion_length": 90.0, "num_tokens": 5432207.0, "reward": 2.7668497562408447, "reward_std": 0.1152038425207138, "rewards/check_gptzero_func/mean": 0.8035714030265808, "rewards/check_gptzero_func/std": 0.40089184045791626, "rewards/check_winston_local_func/mean": 0.9709309339523315, "rewards/check_winston_local_func/std": 0.06079000234603882, "rewards/sentence_count_match_reward_logic/mean": 0.9923468828201294, "rewards/sentence_count_match_reward_logic/std": 0.03245825320482254, "step": 1073 }, { "clip_ratio": 0.01059640385210514, "epoch": 0.37578726382085376, "grad_norm": 0.5608430414657603, "kl": 0.95703125, "learning_rate": 9.766847109765126e-06, "loss": 0.0018, "step": 1074 }, { "clip_ratio": 0.0217194315046072, "epoch": 0.3761371588523443, "grad_norm": 0.4884798460547232, "kl": 0.95703125, "learning_rate": 9.765924612248245e-06, "loss": -0.0001, "step": 1075 }, { "clip_ratio": 0.032074376940727234, "epoch": 0.37648705388383485, "grad_norm": 0.4020028027784553, "kl": 0.94921875, "learning_rate": 9.76500033708659e-06, "loss": -0.0017, "step": 1076 }, { "clip_ratio": 0.00507169496268034, "clipped_completions_ratio": 0.0, "epoch": 0.3768369489153254, "grad_norm": 0.7461602715721968, "kl": 0.86328125, "learning_rate": 9.764074284624904e-06, "loss": 0.0015, "max_completion_length": 228.0, "max_terminated_completion_length": 228.0, "mean_completion_length": 183.2857208251953, "mean_terminated_completion_length": 183.2857208251953, "min_completion_length": 129.0, "min_terminated_completion_length": 129.0, "num_tokens": 5454639.0, "reward": 2.7158701419830322, "reward_std": 0.1986466944217682, "rewards/check_gptzero_func/mean": 0.8035714030265808, "rewards/check_gptzero_func/std": 0.40089187026023865, "rewards/check_winston_local_func/mean": 0.9378085136413574, "rewards/check_winston_local_func/std": 0.07433735579252243, "rewards/sentence_count_match_reward_logic/mean": 0.9744897484779358, "rewards/sentence_count_match_reward_logic/std": 0.13375452160835266, "step": 1077 }, { "clip_ratio": 0.009900106117129326, "epoch": 0.37718684394681595, "grad_norm": 0.5916964204265288, "kl": 0.86328125, "learning_rate": 9.763146455208602e-06, "loss": -0.0017, "step": 1078 }, { "clip_ratio": 0.026054123416543007, "epoch": 0.37753673897830653, "grad_norm": 0.441617664243434, "kl": 0.86328125, "learning_rate": 9.76221684918375e-06, "loss": -0.005, "step": 1079 }, { "clip_ratio": 0.03976001590490341, "epoch": 0.37788663400979705, "grad_norm": 0.4175826346758222, "kl": 0.8671875, "learning_rate": 9.761285466897086e-06, "loss": -0.0066, "step": 1080 }, { "clip_ratio": 0.004911141004413366, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.37823652904128763, "grad_norm": 0.9288491863285572, "kl": 1.140625, "learning_rate": 9.760352308696005e-06, "loss": 0.0061, "max_completion_length": 256.0, "max_terminated_completion_length": 199.0, "mean_completion_length": 158.92857360839844, "mean_terminated_completion_length": 142.75, "min_completion_length": 85.0, "min_terminated_completion_length": 85.0, "num_tokens": 5474491.0, "reward": 2.6999754905700684, "reward_std": 0.28688573837280273, "rewards/check_gptzero_func/mean": 0.7678571343421936, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.971552848815918, "rewards/check_winston_local_func/std": 0.058001644909381866, "rewards/sentence_count_match_reward_logic/mean": 0.960565447807312, "rewards/sentence_count_match_reward_logic/std": 0.06667596846818924, "step": 1081 }, { "clip_ratio": 0.017348594963550568, "epoch": 0.37858642407277815, "grad_norm": 0.6780197293252191, "kl": 1.140625, "learning_rate": 9.759417374928566e-06, "loss": 0.0018, "step": 1082 }, { "clip_ratio": 0.037097055464982986, "epoch": 0.3789363191042687, "grad_norm": 0.47005208976360763, "kl": 1.1484375, "learning_rate": 9.758480665943492e-06, "loss": -0.0006, "step": 1083 }, { "clip_ratio": 0.049667105078697205, "epoch": 0.37928621413575925, "grad_norm": 0.42252972356175605, "kl": 1.15625, "learning_rate": 9.757542182090165e-06, "loss": -0.0022, "step": 1084 }, { "clip_ratio": 0.005188434850424528, "clipped_completions_ratio": 0.0, "epoch": 0.3796361091672498, "grad_norm": 0.8700422118253077, "kl": 1.234375, "learning_rate": 9.756601923718631e-06, "loss": 0.0114, "max_completion_length": 221.0, "max_terminated_completion_length": 221.0, "mean_completion_length": 144.05357360839844, "mean_terminated_completion_length": 144.05357360839844, "min_completion_length": 75.0, "min_terminated_completion_length": 75.0, "num_tokens": 5492502.0, "reward": 2.700252056121826, "reward_std": 0.19796186685562134, "rewards/check_gptzero_func/mean": 0.75, "rewards/check_gptzero_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.9740613102912903, "rewards/check_winston_local_func/std": 0.03876729682087898, "rewards/sentence_count_match_reward_logic/mean": 0.976190447807312, "rewards/sentence_count_match_reward_logic/std": 0.07405002415180206, "step": 1085 }, { "clip_ratio": 0.018700743094086647, "epoch": 0.3799860041987404, "grad_norm": 0.6255424365841153, "kl": 1.234375, "learning_rate": 9.755659891179598e-06, "loss": 0.0076, "step": 1086 }, { "clip_ratio": 0.032779011875391006, "epoch": 0.3803358992302309, "grad_norm": 0.44958874137537, "kl": 1.2421875, "learning_rate": 9.754716084824434e-06, "loss": 0.0053, "step": 1087 }, { "clip_ratio": 0.04521270468831062, "epoch": 0.3806857942617215, "grad_norm": 0.3810957948216863, "kl": 1.25, "learning_rate": 9.753770505005171e-06, "loss": 0.004, "step": 1088 }, { "clip_ratio": 0.003971710801124573, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.381035689293212, "grad_norm": 0.709851168627604, "kl": 1.0546875, "learning_rate": 9.7528231520745e-06, "loss": 0.0123, "max_completion_length": 256.0, "max_terminated_completion_length": 239.0, "mean_completion_length": 190.35714721679688, "mean_terminated_completion_length": 179.4166717529297, "min_completion_length": 84.0, "min_terminated_completion_length": 84.0, "num_tokens": 5515650.0, "reward": 2.7405107021331787, "reward_std": 0.10067899525165558, "rewards/check_gptzero_func/mean": 0.7857142686843872, "rewards/check_gptzero_func/std": 0.4140393137931824, "rewards/check_winston_local_func/mean": 0.9547962546348572, "rewards/check_winston_local_func/std": 0.0676790252327919, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1089 }, { "clip_ratio": 0.01284459512680769, "epoch": 0.3813855843247026, "grad_norm": 0.6043097362958901, "kl": 1.046875, "learning_rate": 9.751874026385777e-06, "loss": 0.0093, "step": 1090 }, { "clip_ratio": 0.02631344087421894, "epoch": 0.3817354793561931, "grad_norm": 0.42852540741017076, "kl": 1.046875, "learning_rate": 9.750923128293016e-06, "loss": 0.0067, "step": 1091 }, { "clip_ratio": 0.03851550817489624, "epoch": 0.3820853743876837, "grad_norm": 0.3787138165727088, "kl": 1.0703125, "learning_rate": 9.749970458150893e-06, "loss": 0.0055, "step": 1092 }, { "clip_ratio": 0.006286031100898981, "clipped_completions_ratio": 0.0, "epoch": 0.3824352694191743, "grad_norm": 0.719646775110094, "kl": 0.9921875, "learning_rate": 9.749016016314744e-06, "loss": 0.011, "max_completion_length": 227.0, "max_terminated_completion_length": 227.0, "mean_completion_length": 168.83929443359375, "mean_terminated_completion_length": 168.83929443359375, "min_completion_length": 127.0, "min_terminated_completion_length": 127.0, "num_tokens": 5536881.0, "reward": 2.683321714401245, "reward_std": 0.17184145748615265, "rewards/check_gptzero_func/mean": 0.7678571343421936, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.9482022523880005, "rewards/check_winston_local_func/std": 0.08370424062013626, "rewards/sentence_count_match_reward_logic/mean": 0.9672618508338928, "rewards/sentence_count_match_reward_logic/std": 0.06391790509223938, "step": 1093 }, { "clip_ratio": 0.012745331972837448, "epoch": 0.3827851644506648, "grad_norm": 0.5618705976422878, "kl": 0.9921875, "learning_rate": 9.748059803140564e-06, "loss": 0.0086, "step": 1094 }, { "clip_ratio": 0.02800343558192253, "epoch": 0.38313505948215537, "grad_norm": 0.3751764788071942, "kl": 0.9921875, "learning_rate": 9.747101818985018e-06, "loss": 0.0061, "step": 1095 }, { "clip_ratio": 0.04025520011782646, "epoch": 0.3834849545136459, "grad_norm": 0.3691557793093258, "kl": 0.9921875, "learning_rate": 9.746142064205422e-06, "loss": 0.0039, "step": 1096 }, { "clip_ratio": 0.004309240262955427, "clipped_completions_ratio": 0.0, "epoch": 0.38383484954513647, "grad_norm": 0.8192452162026552, "kl": 1.0625, "learning_rate": 9.745180539159754e-06, "loss": 0.005, "max_completion_length": 226.0, "max_terminated_completion_length": 226.0, "mean_completion_length": 149.375, "mean_terminated_completion_length": 149.375, "min_completion_length": 76.0, "min_terminated_completion_length": 76.0, "num_tokens": 5555798.0, "reward": 2.7453887462615967, "reward_std": 0.15893959999084473, "rewards/check_gptzero_func/mean": 0.8035714030265808, "rewards/check_gptzero_func/std": 0.40089187026023865, "rewards/check_winston_local_func/mean": 0.9624024629592896, "rewards/check_winston_local_func/std": 0.058172617107629776, "rewards/sentence_count_match_reward_logic/mean": 0.9794146418571472, "rewards/sentence_count_match_reward_logic/std": 0.04463273286819458, "step": 1097 }, { "clip_ratio": 0.014437246136367321, "epoch": 0.384184744576627, "grad_norm": 0.525189239511599, "kl": 1.0703125, "learning_rate": 9.744217244206655e-06, "loss": 0.0021, "step": 1098 }, { "clip_ratio": 0.03248784691095352, "epoch": 0.38453463960811757, "grad_norm": 0.4972855301396055, "kl": 1.0859375, "learning_rate": 9.743252179705425e-06, "loss": 0.0003, "step": 1099 }, { "clip_ratio": 0.04403135925531387, "epoch": 0.38488453463960814, "grad_norm": 0.38909029824298325, "kl": 1.09375, "learning_rate": 9.742285346016024e-06, "loss": -0.0012, "step": 1100 }, { "clip_ratio": 0.005043720826506615, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.38523442967109867, "grad_norm": 0.9176579780611755, "kl": 1.1875, "learning_rate": 9.741316743499071e-06, "loss": 0.0159, "max_completion_length": 256.0, "max_terminated_completion_length": 245.0, "mean_completion_length": 164.05357360839844, "mean_terminated_completion_length": 148.7291717529297, "min_completion_length": 67.0, "min_terminated_completion_length": 67.0, "num_tokens": 5575953.0, "reward": 2.8548617362976074, "reward_std": 0.17553773522377014, "rewards/check_gptzero_func/mean": 0.8928571343421936, "rewards/check_gptzero_func/std": 0.31209391355514526, "rewards/check_winston_local_func/mean": 0.9835819005966187, "rewards/check_winston_local_func/std": 0.011163093149662018, "rewards/sentence_count_match_reward_logic/mean": 0.9784225821495056, "rewards/sentence_count_match_reward_logic/std": 0.060506630688905716, "step": 1101 }, { "clip_ratio": 0.015082002617418766, "epoch": 0.38558432470258924, "grad_norm": 0.7303118106527964, "kl": 1.203125, "learning_rate": 9.740346372515847e-06, "loss": 0.0121, "step": 1102 }, { "clip_ratio": 0.03734833002090454, "epoch": 0.38593421973407976, "grad_norm": 0.5790975928854428, "kl": 1.1953125, "learning_rate": 9.73937423342829e-06, "loss": 0.0085, "step": 1103 }, { "clip_ratio": 0.05220363661646843, "epoch": 0.38628411476557034, "grad_norm": 0.5228079159470546, "kl": 1.1875, "learning_rate": 9.738400326599e-06, "loss": 0.0072, "step": 1104 }, { "clip_ratio": 0.0066772042773664, "clipped_completions_ratio": 0.0, "epoch": 0.38663400979706086, "grad_norm": 0.7802925275534724, "kl": 0.96875, "learning_rate": 9.737424652391232e-06, "loss": 0.0017, "max_completion_length": 222.0, "max_terminated_completion_length": 222.0, "mean_completion_length": 167.94644165039062, "mean_terminated_completion_length": 167.94644165039062, "min_completion_length": 101.0, "min_terminated_completion_length": 101.0, "num_tokens": 5596294.0, "reward": 2.884705066680908, "reward_std": 0.12229883670806885, "rewards/check_gptzero_func/mean": 0.9285714030265808, "rewards/check_gptzero_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9591096043586731, "rewards/check_winston_local_func/std": 0.07589443027973175, "rewards/sentence_count_match_reward_logic/mean": 0.9970237612724304, "rewards/sentence_count_match_reward_logic/std": 0.02227177284657955, "step": 1105 }, { "clip_ratio": 0.011903305537998676, "epoch": 0.38698390482855144, "grad_norm": 0.5391621521115709, "kl": 0.98046875, "learning_rate": 9.736447211168908e-06, "loss": -0.0005, "step": 1106 }, { "clip_ratio": 0.02417854778468609, "epoch": 0.38733379986004196, "grad_norm": 0.5037533006403608, "kl": 1.0078125, "learning_rate": 9.735468003296599e-06, "loss": -0.0035, "step": 1107 }, { "clip_ratio": 0.04044731706380844, "epoch": 0.38768369489153254, "grad_norm": 0.40013322926688233, "kl": 1.0234375, "learning_rate": 9.734487029139544e-06, "loss": -0.0047, "step": 1108 }, { "clip_ratio": 0.005941012408584356, "clipped_completions_ratio": 0.0, "epoch": 0.3880335899230231, "grad_norm": 0.8979859809086581, "kl": 1.375, "learning_rate": 9.733504289063636e-06, "loss": 0.0144, "max_completion_length": 208.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 120.58929443359375, "mean_terminated_completion_length": 120.58929443359375, "min_completion_length": 67.0, "min_terminated_completion_length": 67.0, "num_tokens": 5611671.0, "reward": 2.7821738719940186, "reward_std": 0.24869269132614136, "rewards/check_gptzero_func/mean": 0.8214285969734192, "rewards/check_gptzero_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.9607451558113098, "rewards/check_winston_local_func/std": 0.05338210240006447, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1109 }, { "clip_ratio": 0.014902492053806782, "epoch": 0.38838348495451364, "grad_norm": 0.6666853249007287, "kl": 1.390625, "learning_rate": 9.732519783435427e-06, "loss": 0.0105, "step": 1110 }, { "clip_ratio": 0.03164752200245857, "epoch": 0.3887333799860042, "grad_norm": 0.47760793632268367, "kl": 1.390625, "learning_rate": 9.731533512622129e-06, "loss": 0.0075, "step": 1111 }, { "clip_ratio": 0.048436667770147324, "epoch": 0.38908327501749473, "grad_norm": 0.43262698199305144, "kl": 1.3984375, "learning_rate": 9.730545476991613e-06, "loss": 0.0065, "step": 1112 }, { "clip_ratio": 0.004382933024317026, "clipped_completions_ratio": 0.1785714285714286, "epoch": 0.3894331700489853, "grad_norm": 0.7329089293073344, "kl": 1.0390625, "learning_rate": 9.729555676912405e-06, "loss": 0.0065, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 168.8928680419922, "mean_terminated_completion_length": 149.95652770996094, "min_completion_length": 80.0, "min_terminated_completion_length": 80.0, "num_tokens": 5633489.0, "reward": 2.550112247467041, "reward_std": 0.09773682057857513, "rewards/check_gptzero_func/mean": 0.6428571343421936, "rewards/check_gptzero_func/std": 0.48349377512931824, "rewards/check_winston_local_func/mean": 0.9712429046630859, "rewards/check_winston_local_func/std": 0.022549327462911606, "rewards/sentence_count_match_reward_logic/mean": 0.9360118508338928, "rewards/sentence_count_match_reward_logic/std": 0.15969595313072205, "step": 1113 }, { "clip_ratio": 0.0105082206428051, "epoch": 0.38978306508047583, "grad_norm": 0.578054082303487, "kl": 1.046875, "learning_rate": 9.72856411275369e-06, "loss": 0.0039, "step": 1114 }, { "clip_ratio": 0.02675445005297661, "epoch": 0.3901329601119664, "grad_norm": 0.49663099672629285, "kl": 1.0546875, "learning_rate": 9.727570784885316e-06, "loss": 0.0025, "step": 1115 }, { "clip_ratio": 0.03979587182402611, "epoch": 0.390482855143457, "grad_norm": 0.4393814090874488, "kl": 1.0625, "learning_rate": 9.726575693677782e-06, "loss": 0.0005, "step": 1116 }, { "clip_ratio": 0.004095335025340319, "clipped_completions_ratio": 0.0, "epoch": 0.3908327501749475, "grad_norm": 0.7772948590151608, "kl": 1.0546875, "learning_rate": 9.725578839502249e-06, "loss": 0.0082, "max_completion_length": 226.0, "max_terminated_completion_length": 226.0, "mean_completion_length": 152.73214721679688, "mean_terminated_completion_length": 152.73214721679688, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 5652290.0, "reward": 2.79420804977417, "reward_std": 0.20359869301319122, "rewards/check_gptzero_func/mean": 0.8214285969734192, "rewards/check_gptzero_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.972779393196106, "rewards/check_winston_local_func/std": 0.037798356264829636, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1117 }, { "clip_ratio": 0.011406996287405491, "epoch": 0.3911826452064381, "grad_norm": 0.6019218664182052, "kl": 1.0546875, "learning_rate": 9.724580222730533e-06, "loss": 0.0049, "step": 1118 }, { "clip_ratio": 0.027931977063417435, "epoch": 0.3915325402379286, "grad_norm": 0.46343003506531866, "kl": 1.0625, "learning_rate": 9.723579843735108e-06, "loss": 0.0028, "step": 1119 }, { "clip_ratio": 0.046310342848300934, "epoch": 0.3918824352694192, "grad_norm": 0.41266561991720785, "kl": 1.0625, "learning_rate": 9.722577702889106e-06, "loss": 0.0017, "step": 1120 }, { "clip_ratio": 0.0037180325016379356, "clipped_completions_ratio": 0.0, "epoch": 0.3922323303009097, "grad_norm": 0.7403108525889172, "kl": 1.0234375, "learning_rate": 9.72157380056632e-06, "loss": 0.0118, "max_completion_length": 244.0, "max_terminated_completion_length": 244.0, "mean_completion_length": 168.8928680419922, "mean_terminated_completion_length": 168.8928680419922, "min_completion_length": 90.0, "min_terminated_completion_length": 90.0, "num_tokens": 5672956.0, "reward": 2.770559787750244, "reward_std": 0.09011400490999222, "rewards/check_gptzero_func/mean": 0.8392857313156128, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.9491307139396667, "rewards/check_winston_local_func/std": 0.05966741964221001, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.05201565474271774, "step": 1121 }, { "clip_ratio": 0.011526421643793583, "epoch": 0.3925822253324003, "grad_norm": 0.5432931618349366, "kl": 1.0234375, "learning_rate": 9.72056813714119e-06, "loss": 0.0091, "step": 1122 }, { "clip_ratio": 0.023660745471715927, "epoch": 0.39293212036389086, "grad_norm": 0.414268143143153, "kl": 1.03125, "learning_rate": 9.719560712988824e-06, "loss": 0.0073, "step": 1123 }, { "clip_ratio": 0.03901759907603264, "epoch": 0.3932820153953814, "grad_norm": 0.3952575997793045, "kl": 1.0390625, "learning_rate": 9.718551528484979e-06, "loss": 0.006, "step": 1124 }, { "clip_ratio": 0.004956573247909546, "clipped_completions_ratio": 0.0, "epoch": 0.39363191042687196, "grad_norm": 0.9136836560685698, "kl": 1.15625, "learning_rate": 9.717540584006074e-06, "loss": 0.0159, "max_completion_length": 244.0, "max_terminated_completion_length": 244.0, "mean_completion_length": 152.19644165039062, "mean_terminated_completion_length": 152.19644165039062, "min_completion_length": 83.0, "min_terminated_completion_length": 83.0, "num_tokens": 5691511.0, "reward": 2.671863079071045, "reward_std": 0.11589551717042923, "rewards/check_gptzero_func/mean": 0.7678571343421936, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.9040058851242065, "rewards/check_winston_local_func/std": 0.13749319314956665, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1125 }, { "clip_ratio": 0.016265379264950752, "epoch": 0.3939818054583625, "grad_norm": 0.5989084819926221, "kl": 1.15625, "learning_rate": 9.716527879929176e-06, "loss": 0.0125, "step": 1126 }, { "clip_ratio": 0.03527113422751427, "epoch": 0.39433170048985305, "grad_norm": 0.48221144032966506, "kl": 1.171875, "learning_rate": 9.71551341663202e-06, "loss": 0.0101, "step": 1127 }, { "clip_ratio": 0.05027706176042557, "epoch": 0.3946815955213436, "grad_norm": 0.43276363964334874, "kl": 1.1953125, "learning_rate": 9.714497194492988e-06, "loss": 0.0093, "step": 1128 }, { "clip_ratio": 0.003935065120458603, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.39503149055283415, "grad_norm": 0.6615551981043055, "kl": 1.125, "learning_rate": 9.713479213891122e-06, "loss": 0.0065, "max_completion_length": 256.0, "max_terminated_completion_length": 184.0, "mean_completion_length": 152.82144165039062, "mean_terminated_completion_length": 135.625, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 5711197.0, "reward": 2.835932731628418, "reward_std": 0.11694423854351044, "rewards/check_gptzero_func/mean": 0.875, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.9854860305786133, "rewards/check_winston_local_func/std": 0.008078367449343204, "rewards/sentence_count_match_reward_logic/mean": 0.9754464030265808, "rewards/sentence_count_match_reward_logic/std": 0.06155947968363762, "step": 1129 }, { "clip_ratio": 0.011293801479041576, "epoch": 0.39538138558432473, "grad_norm": 0.4994667604584736, "kl": 1.140625, "learning_rate": 9.712459475206119e-06, "loss": 0.0047, "step": 1130 }, { "clip_ratio": 0.02514897659420967, "epoch": 0.39573128061581525, "grad_norm": 0.5028439198381996, "kl": 1.171875, "learning_rate": 9.711437978818332e-06, "loss": 0.0029, "step": 1131 }, { "clip_ratio": 0.03577624261379242, "epoch": 0.3960811756473058, "grad_norm": 0.3962845450743122, "kl": 1.171875, "learning_rate": 9.710414725108771e-06, "loss": 0.0015, "step": 1132 }, { "clip_ratio": 0.004156992770731449, "clipped_completions_ratio": 0.0, "epoch": 0.39643107067879635, "grad_norm": 0.7468822583268294, "kl": 1.1015625, "learning_rate": 9.709389714459098e-06, "loss": 0.01, "max_completion_length": 218.0, "max_terminated_completion_length": 218.0, "mean_completion_length": 165.82144165039062, "mean_terminated_completion_length": 165.82144165039062, "min_completion_length": 91.0, "min_terminated_completion_length": 91.0, "num_tokens": 5731707.0, "reward": 2.669491767883301, "reward_std": 0.03528378903865814, "rewards/check_gptzero_func/mean": 0.7142857313156128, "rewards/check_gptzero_func/std": 0.4558422863483429, "rewards/check_winston_local_func/mean": 0.9611583352088928, "rewards/check_winston_local_func/std": 0.05122165381908417, "rewards/sentence_count_match_reward_logic/mean": 0.9940475821495056, "rewards/sentence_count_match_reward_logic/std": 0.02524530701339245, "step": 1133 }, { "clip_ratio": 0.014389369636774063, "epoch": 0.3967809657102869, "grad_norm": 0.7974784166558013, "kl": 1.09375, "learning_rate": 9.708362947251632e-06, "loss": 0.0078, "step": 1134 }, { "clip_ratio": 0.028249401599168777, "epoch": 0.39713086074177745, "grad_norm": 0.4383858477646339, "kl": 1.109375, "learning_rate": 9.707334423869348e-06, "loss": 0.0046, "step": 1135 }, { "clip_ratio": 0.040690407156944275, "epoch": 0.397480755773268, "grad_norm": 0.47408242059043376, "kl": 1.140625, "learning_rate": 9.706304144695877e-06, "loss": 0.0026, "step": 1136 }, { "clip_ratio": 0.0055810376070439816, "clipped_completions_ratio": 0.1071428571428571, "epoch": 0.39783065080475855, "grad_norm": 0.6730335209494832, "kl": 1.046875, "learning_rate": 9.705272110115503e-06, "loss": 0.0097, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 174.2857208251953, "mean_terminated_completion_length": 164.47999572753906, "min_completion_length": 106.0, "min_terminated_completion_length": 106.0, "num_tokens": 5752771.0, "reward": 2.8179752826690674, "reward_std": 0.17529107630252838, "rewards/check_gptzero_func/mean": 0.875, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.9429751634597778, "rewards/check_winston_local_func/std": 0.08858190476894379, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1137 }, { "clip_ratio": 0.013418917544186115, "epoch": 0.3981805458362491, "grad_norm": 0.6018931632836488, "kl": 1.0546875, "learning_rate": 9.704238320513162e-06, "loss": 0.0079, "step": 1138 }, { "clip_ratio": 0.026148222386837006, "epoch": 0.3985304408677397, "grad_norm": 0.4621677350085179, "kl": 1.0625, "learning_rate": 9.703202776274452e-06, "loss": 0.0057, "step": 1139 }, { "clip_ratio": 0.036049120128154755, "epoch": 0.3988803358992302, "grad_norm": 0.43194559667215277, "kl": 1.078125, "learning_rate": 9.702165477785618e-06, "loss": 0.0051, "step": 1140 }, { "clip_ratio": 0.00517693068832159, "clipped_completions_ratio": 0.0, "epoch": 0.3992302309307208, "grad_norm": 0.9066891807420514, "kl": 1.15625, "learning_rate": 9.701126425433564e-06, "loss": 0.0105, "max_completion_length": 253.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 161.71429443359375, "mean_terminated_completion_length": 161.71429443359375, "min_completion_length": 107.0, "min_terminated_completion_length": 107.0, "num_tokens": 5772747.0, "reward": 2.81735897064209, "reward_std": 0.1649254858493805, "rewards/check_gptzero_func/mean": 0.9107142686843872, "rewards/check_gptzero_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9270526766777039, "rewards/check_winston_local_func/std": 0.10915977507829666, "rewards/sentence_count_match_reward_logic/mean": 0.9795918464660645, "rewards/sentence_count_match_reward_logic/std": 0.050441987812519073, "step": 1141 }, { "clip_ratio": 0.015887660905718803, "epoch": 0.3995801259622113, "grad_norm": 0.6189693738491189, "kl": 1.1640625, "learning_rate": 9.700085619605846e-06, "loss": 0.0066, "step": 1142 }, { "clip_ratio": 0.03273956477642059, "epoch": 0.3999300209937019, "grad_norm": 0.5135818784364484, "kl": 1.1796875, "learning_rate": 9.699043060690675e-06, "loss": 0.0035, "step": 1143 }, { "clip_ratio": 0.047997452318668365, "epoch": 0.4002799160251924, "grad_norm": 0.3843819540195967, "kl": 1.171875, "learning_rate": 9.697998749076916e-06, "loss": 0.0016, "step": 1144 }, { "clip_ratio": 0.005548373330384493, "clipped_completions_ratio": 0.0535714285714286, "epoch": 0.400629811056683, "grad_norm": 0.9663762949839353, "kl": 1.234375, "learning_rate": 9.696952685154088e-06, "loss": 0.0097, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 157.5357208251953, "mean_terminated_completion_length": 151.96226501464844, "min_completion_length": 49.0, "min_terminated_completion_length": 49.0, "num_tokens": 5792233.0, "reward": 2.5069212913513184, "reward_std": 0.22613555192947388, "rewards/check_gptzero_func/mean": 0.7142857313156128, "rewards/check_gptzero_func/std": 0.4558422863483429, "rewards/check_winston_local_func/mean": 0.8374484777450562, "rewards/check_winston_local_func/std": 0.26764026284217834, "rewards/sentence_count_match_reward_logic/mean": 0.9551870226860046, "rewards/sentence_count_match_reward_logic/std": 0.07890079915523529, "step": 1145 }, { "clip_ratio": 0.016072982922196388, "epoch": 0.40097970608817357, "grad_norm": 0.6104007077704664, "kl": 1.25, "learning_rate": 9.69590486931236e-06, "loss": 0.0066, "step": 1146 }, { "clip_ratio": 0.03168664872646332, "epoch": 0.4013296011196641, "grad_norm": 0.4664927027229104, "kl": 1.2734375, "learning_rate": 9.694855301942563e-06, "loss": 0.0044, "step": 1147 }, { "clip_ratio": 0.040621157735586166, "epoch": 0.40167949615115467, "grad_norm": 0.435863546504357, "kl": 1.3046875, "learning_rate": 9.69380398343617e-06, "loss": 0.0036, "step": 1148 }, { "clip_ratio": 0.0035596443340182304, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.4020293911826452, "grad_norm": 0.7498168639925179, "kl": 0.94921875, "learning_rate": 9.692750914185314e-06, "loss": 0.0078, "max_completion_length": 256.0, "max_terminated_completion_length": 191.0, "mean_completion_length": 170.9107208251953, "mean_terminated_completion_length": 156.7291717529297, "min_completion_length": 126.0, "min_terminated_completion_length": 126.0, "num_tokens": 5813516.0, "reward": 2.7458345890045166, "reward_std": 0.07690517604351044, "rewards/check_gptzero_func/mean": 0.8392857313156128, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.947875440120697, "rewards/check_winston_local_func/std": 0.0841207504272461, "rewards/sentence_count_match_reward_logic/mean": 0.9586734771728516, "rewards/sentence_count_match_reward_logic/std": 0.07703592628240585, "step": 1149 }, { "clip_ratio": 0.009870666079223156, "epoch": 0.40237928621413577, "grad_norm": 0.5701968738772137, "kl": 0.94921875, "learning_rate": 9.691696094582782e-06, "loss": 0.0042, "step": 1150 }, { "clip_ratio": 0.028481092303991318, "epoch": 0.4027291812456263, "grad_norm": 0.3715062569886875, "kl": 0.94921875, "learning_rate": 9.69063952502201e-06, "loss": 0.0023, "step": 1151 }, { "clip_ratio": 0.04139386862516403, "epoch": 0.40307907627711687, "grad_norm": 0.3644869783192514, "kl": 0.95703125, "learning_rate": 9.68958120589709e-06, "loss": 0.0011, "step": 1152 }, { "clip_ratio": 0.004296443425118923, "clipped_completions_ratio": 0.0, "epoch": 0.40342897130860744, "grad_norm": 0.7701028226221898, "kl": 1.09375, "learning_rate": 9.688521137602763e-06, "loss": 0.0141, "max_completion_length": 226.0, "max_terminated_completion_length": 226.0, "mean_completion_length": 166.2857208251953, "mean_terminated_completion_length": 166.2857208251953, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 5833764.0, "reward": 2.605071783065796, "reward_std": 0.14633804559707642, "rewards/check_gptzero_func/mean": 0.6607142686843872, "rewards/check_gptzero_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.9736942648887634, "rewards/check_winston_local_func/std": 0.04016333818435669, "rewards/sentence_count_match_reward_logic/mean": 0.9706632494926453, "rewards/sentence_count_match_reward_logic/std": 0.05682016909122467, "step": 1153 }, { "clip_ratio": 0.014957329258322716, "epoch": 0.40377886634009796, "grad_norm": 0.6556601523487294, "kl": 1.109375, "learning_rate": 9.687459320534428e-06, "loss": 0.0107, "step": 1154 }, { "clip_ratio": 0.029423872008919716, "epoch": 0.40412876137158854, "grad_norm": 0.4959048970549257, "kl": 1.109375, "learning_rate": 9.686395755088128e-06, "loss": 0.0079, "step": 1155 }, { "clip_ratio": 0.04111381247639656, "epoch": 0.40447865640307906, "grad_norm": 0.4864541158534679, "kl": 1.1015625, "learning_rate": 9.685330441660564e-06, "loss": 0.0067, "step": 1156 }, { "clip_ratio": 0.004164672456681728, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.40482855143456964, "grad_norm": 0.7092755171167853, "kl": 1.0859375, "learning_rate": 9.68426338064909e-06, "loss": 0.0064, "max_completion_length": 256.0, "max_terminated_completion_length": 205.0, "mean_completion_length": 154.19644165039062, "mean_terminated_completion_length": 137.2291717529297, "min_completion_length": 81.0, "min_terminated_completion_length": 81.0, "num_tokens": 5852831.0, "reward": 2.884045362472534, "reward_std": 0.10895834118127823, "rewards/check_gptzero_func/mean": 0.9642857313156128, "rewards/check_gptzero_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9492236375808716, "rewards/check_winston_local_func/std": 0.08589622378349304, "rewards/sentence_count_match_reward_logic/mean": 0.9705356955528259, "rewards/sentence_count_match_reward_logic/std": 0.07309817522764206, "step": 1157 }, { "clip_ratio": 0.010128030553460121, "epoch": 0.40517844646606016, "grad_norm": 0.592603202870978, "kl": 1.078125, "learning_rate": 9.68319457245171e-06, "loss": 0.0035, "step": 1158 }, { "clip_ratio": 0.023250762373209, "epoch": 0.40552834149755074, "grad_norm": 0.45878658922919163, "kl": 1.078125, "learning_rate": 9.682124017467075e-06, "loss": 0.0015, "step": 1159 }, { "clip_ratio": 0.03507550433278084, "epoch": 0.4058782365290413, "grad_norm": 0.3437802593025265, "kl": 1.0859375, "learning_rate": 9.681051716094497e-06, "loss": 0.001, "step": 1160 }, { "clip_ratio": 0.004734461195766926, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.40622813156053184, "grad_norm": 0.7611407906344207, "kl": 1.03125, "learning_rate": 9.67997766873393e-06, "loss": 0.0069, "max_completion_length": 256.0, "max_terminated_completion_length": 198.0, "mean_completion_length": 175.10714721679688, "mean_terminated_completion_length": 161.625, "min_completion_length": 121.0, "min_terminated_completion_length": 121.0, "num_tokens": 5874565.0, "reward": 2.5734524726867676, "reward_std": 0.18420031666755676, "rewards/check_gptzero_func/mean": 0.6071428656578064, "rewards/check_gptzero_func/std": 0.4928053915500641, "rewards/check_winston_local_func/mean": 0.9663094282150269, "rewards/check_winston_local_func/std": 0.06301653385162354, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1161 }, { "clip_ratio": 0.01218318846076727, "epoch": 0.4065780265920224, "grad_norm": 0.5736255183274213, "kl": 1.0390625, "learning_rate": 9.678901875785986e-06, "loss": 0.0037, "step": 1162 }, { "clip_ratio": 0.02953297272324562, "epoch": 0.40692792162351293, "grad_norm": 0.5136595791684997, "kl": 1.0703125, "learning_rate": 9.677824337651927e-06, "loss": 0.0018, "step": 1163 }, { "clip_ratio": 0.0417758971452713, "epoch": 0.4072778166550035, "grad_norm": 0.3544924506413501, "kl": 1.046875, "learning_rate": 9.676745054733661e-06, "loss": 0.0002, "step": 1164 }, { "clip_ratio": 0.005719998385757208, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.40762771168649403, "grad_norm": 0.9692286774559156, "kl": 0.95703125, "learning_rate": 9.675664027433753e-06, "loss": 0.0025, "max_completion_length": 256.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 189.44644165039062, "mean_terminated_completion_length": 178.3541717529297, "min_completion_length": 141.0, "min_terminated_completion_length": 141.0, "num_tokens": 5897246.0, "reward": 2.853776216506958, "reward_std": 0.07130862772464752, "rewards/check_gptzero_func/mean": 0.875, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.9879031181335449, "rewards/check_winston_local_func/std": 0.0074660321697592735, "rewards/sentence_count_match_reward_logic/mean": 0.9908730387687683, "rewards/sentence_count_match_reward_logic/std": 0.03992738202214241, "step": 1165 }, { "clip_ratio": 0.009079933166503906, "epoch": 0.4079776067179846, "grad_norm": 2.7085930436501173, "kl": 1.375, "learning_rate": 9.674581256155415e-06, "loss": 0.0042, "step": 1166 }, { "clip_ratio": 0.018451137468218803, "epoch": 0.40832750174947513, "grad_norm": 0.49924643932039325, "kl": 0.9765625, "learning_rate": 9.673496741302509e-06, "loss": -0.0016, "step": 1167 }, { "clip_ratio": 0.02731625735759735, "epoch": 0.4086773967809657, "grad_norm": 0.4894010524976285, "kl": 0.94921875, "learning_rate": 9.67241048327955e-06, "loss": -0.0032, "step": 1168 }, { "clip_ratio": 0.006465758662670851, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.4090272918124563, "grad_norm": 0.6950358820233005, "kl": 1.078125, "learning_rate": 9.671322482491704e-06, "loss": 0.011, "max_completion_length": 256.0, "max_terminated_completion_length": 223.0, "mean_completion_length": 179.3928680419922, "mean_terminated_completion_length": 166.625, "min_completion_length": 102.0, "min_terminated_completion_length": 102.0, "num_tokens": 5919068.0, "reward": 2.7159485816955566, "reward_std": 0.09136351943016052, "rewards/check_gptzero_func/mean": 0.8392857313156128, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.8766628503799438, "rewards/check_winston_local_func/std": 0.2721659243106842, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1169 }, { "clip_ratio": 0.012349150143563747, "epoch": 0.4093771868439468, "grad_norm": 0.6511306318832686, "kl": 1.1171875, "learning_rate": 9.67023273934478e-06, "loss": 0.0086, "step": 1170 }, { "clip_ratio": 0.023004190996289253, "epoch": 0.4097270818754374, "grad_norm": 0.4085853796291751, "kl": 1.109375, "learning_rate": 9.669141254245248e-06, "loss": 0.0063, "step": 1171 }, { "clip_ratio": 0.03560462221503258, "epoch": 0.4100769769069279, "grad_norm": 0.4578871853828771, "kl": 1.109375, "learning_rate": 9.668048027600217e-06, "loss": 0.0049, "step": 1172 }, { "clip_ratio": 0.004614962264895439, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.4104268719384185, "grad_norm": 0.7400992533435928, "kl": 1.140625, "learning_rate": 9.666953059817452e-06, "loss": 0.0075, "max_completion_length": 256.0, "max_terminated_completion_length": 224.0, "mean_completion_length": 165.55357360839844, "mean_terminated_completion_length": 150.4791717529297, "min_completion_length": 92.0, "min_terminated_completion_length": 92.0, "num_tokens": 5939547.0, "reward": 2.825406789779663, "reward_std": 0.10499919950962067, "rewards/check_gptzero_func/mean": 0.9107142686843872, "rewards/check_gptzero_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9146924614906311, "rewards/check_winston_local_func/std": 0.16741043329238892, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1173 }, { "clip_ratio": 0.014195221476256847, "epoch": 0.410776766969909, "grad_norm": 0.5466555066340155, "kl": 1.1484375, "learning_rate": 9.665856351305365e-06, "loss": 0.0042, "step": 1174 }, { "clip_ratio": 0.027463356032967567, "epoch": 0.4111266620013996, "grad_norm": 0.4794662643249777, "kl": 1.1796875, "learning_rate": 9.664757902473019e-06, "loss": 0.0027, "step": 1175 }, { "clip_ratio": 0.0348774679005146, "epoch": 0.41147655703289016, "grad_norm": 0.4294521829325571, "kl": 1.1796875, "learning_rate": 9.663657713730123e-06, "loss": 0.002, "step": 1176 }, { "clip_ratio": 0.005729080177843571, "clipped_completions_ratio": 0.0, "epoch": 0.4118264520643807, "grad_norm": 1.0418593835783052, "kl": 1.078125, "learning_rate": 9.66255578548704e-06, "loss": 0.0048, "max_completion_length": 208.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 169.8928680419922, "mean_terminated_completion_length": 169.8928680419922, "min_completion_length": 120.0, "min_terminated_completion_length": 120.0, "num_tokens": 5960213.0, "reward": 2.4984679222106934, "reward_std": 0.042316898703575134, "rewards/check_gptzero_func/mean": 0.5714285969734192, "rewards/check_gptzero_func/std": 0.4993502199649811, "rewards/check_winston_local_func/mean": 0.9335868954658508, "rewards/check_winston_local_func/std": 0.1329740732908249, "rewards/sentence_count_match_reward_logic/mean": 0.9934523701667786, "rewards/sentence_count_match_reward_logic/std": 0.03447712957859039, "step": 1177 }, { "clip_ratio": 0.012147036381065845, "epoch": 0.41217634709587125, "grad_norm": 6707.031613459821, "kl": 364.0, "learning_rate": 9.661452118154777e-06, "loss": 3.6335, "step": 1178 }, { "clip_ratio": 0.02162834256887436, "epoch": 0.4125262421273618, "grad_norm": 4.381937946486746, "kl": 1.3671875, "learning_rate": 9.66034671214499e-06, "loss": 0.004, "step": 1179 }, { "clip_ratio": 0.03019651025533676, "epoch": 0.41287613715885235, "grad_norm": 1.150868743060567, "kl": 1.078125, "learning_rate": 9.659239567869989e-06, "loss": 0.0163, "step": 1180 }, { "clip_ratio": 0.0038281057495623827, "clipped_completions_ratio": 0.0, "epoch": 0.4132260321903429, "grad_norm": 0.6346144197377235, "kl": 0.9296875, "learning_rate": 9.658130685742724e-06, "loss": 0.0063, "max_completion_length": 233.0, "max_terminated_completion_length": 233.0, "mean_completion_length": 183.07144165039062, "mean_terminated_completion_length": 183.07144165039062, "min_completion_length": 98.0, "min_terminated_completion_length": 98.0, "num_tokens": 5982497.0, "reward": 2.575972318649292, "reward_std": 0.18434317409992218, "rewards/check_gptzero_func/mean": 0.6785714030265808, "rewards/check_gptzero_func/std": 0.4712514281272888, "rewards/check_winston_local_func/mean": 0.9418520927429199, "rewards/check_winston_local_func/std": 0.10167888551950455, "rewards/sentence_count_match_reward_logic/mean": 0.9555484652519226, "rewards/sentence_count_match_reward_logic/std": 0.07971690595149994, "step": 1181 }, { "clip_ratio": 0.012845356948673725, "epoch": 0.41357592722183345, "grad_norm": 0.6070362048253173, "kl": 0.93359375, "learning_rate": 9.6570200661768e-06, "loss": 0.0046, "step": 1182 }, { "clip_ratio": 0.022804638370871544, "epoch": 0.413925822253324, "grad_norm": 0.6123692551551819, "kl": 0.9375, "learning_rate": 9.655907709586465e-06, "loss": 0.002, "step": 1183 }, { "clip_ratio": 0.030333617702126503, "epoch": 0.41427571728481455, "grad_norm": 0.3486565060731845, "kl": 0.9375, "learning_rate": 9.654793616386621e-06, "loss": -0.0004, "step": 1184 }, { "clip_ratio": 0.00530208507552743, "clipped_completions_ratio": 0.0, "epoch": 0.4146256123163051, "grad_norm": 0.6729685030477239, "kl": 0.9375, "learning_rate": 9.653677786992815e-06, "loss": 0.0003, "max_completion_length": 244.0, "max_terminated_completion_length": 244.0, "mean_completion_length": 185.7857208251953, "mean_terminated_completion_length": 185.7857208251953, "min_completion_length": 98.0, "min_terminated_completion_length": 98.0, "num_tokens": 6005165.0, "reward": 2.6382153034210205, "reward_std": 0.2216976135969162, "rewards/check_gptzero_func/mean": 0.7142857313156128, "rewards/check_gptzero_func/std": 0.4558422863483429, "rewards/check_winston_local_func/mean": 0.958368182182312, "rewards/check_winston_local_func/std": 0.05874758958816528, "rewards/sentence_count_match_reward_logic/mean": 0.9655612111091614, "rewards/sentence_count_match_reward_logic/std": 0.0819985419511795, "step": 1185 }, { "clip_ratio": 0.008913205936551094, "epoch": 0.41497550734779565, "grad_norm": 0.5325575824625612, "kl": 0.94140625, "learning_rate": 9.652560221821234e-06, "loss": -0.0018, "step": 1186 }, { "clip_ratio": 0.02162860706448555, "epoch": 0.4153254023792862, "grad_norm": 0.39296758796553577, "kl": 0.9453125, "learning_rate": 9.651440921288727e-06, "loss": -0.0035, "step": 1187 }, { "clip_ratio": 0.033414267003536224, "epoch": 0.41567529741077675, "grad_norm": 0.5009212344034152, "kl": 0.953125, "learning_rate": 9.650319885812777e-06, "loss": -0.0048, "step": 1188 }, { "clip_ratio": 0.0029352239798754454, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.4160251924422673, "grad_norm": 5.713600806005514, "kl": 0.91796875, "learning_rate": 9.649197115811524e-06, "loss": 0.014, "max_completion_length": 256.0, "max_terminated_completion_length": 252.0, "mean_completion_length": 191.46429443359375, "mean_terminated_completion_length": 180.70834350585938, "min_completion_length": 85.0, "min_terminated_completion_length": 85.0, "num_tokens": 6028567.0, "reward": 2.672349214553833, "reward_std": 0.10768268257379532, "rewards/check_gptzero_func/mean": 0.8392857313156128, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.8646109700202942, "rewards/check_winston_local_func/std": 0.22330068051815033, "rewards/sentence_count_match_reward_logic/mean": 0.9684523940086365, "rewards/sentence_count_match_reward_logic/std": 0.0728689506649971, "step": 1189 }, { "clip_ratio": 0.004890720825642347, "epoch": 0.4163750874737579, "grad_norm": 0.7804162193311898, "kl": 0.91015625, "learning_rate": 9.648072611703749e-06, "loss": 0.0133, "step": 1190 }, { "clip_ratio": 0.010733587667346, "epoch": 0.4167249825052484, "grad_norm": 0.4922453115389626, "kl": 0.91796875, "learning_rate": 9.64694637390888e-06, "loss": 0.0107, "step": 1191 }, { "clip_ratio": 0.024764282628893852, "epoch": 0.417074877536739, "grad_norm": 0.41933357738283544, "kl": 0.94140625, "learning_rate": 9.645818402846992e-06, "loss": 0.009, "step": 1192 }, { "clip_ratio": 0.00558991776779294, "clipped_completions_ratio": 0.0, "epoch": 0.4174247725682295, "grad_norm": 0.7591015640129517, "kl": 1.265625, "learning_rate": 9.644688698938811e-06, "loss": 0.0136, "max_completion_length": 218.0, "max_terminated_completion_length": 218.0, "mean_completion_length": 140.92857360839844, "mean_terminated_completion_length": 140.92857360839844, "min_completion_length": 67.0, "min_terminated_completion_length": 67.0, "num_tokens": 6045995.0, "reward": 2.41753888130188, "reward_std": 0.2445993572473526, "rewards/check_gptzero_func/mean": 0.4642857015132904, "rewards/check_gptzero_func/std": 0.5032362341880798, "rewards/check_winston_local_func/mean": 0.9651578068733215, "rewards/check_winston_local_func/std": 0.049831073731184006, "rewards/sentence_count_match_reward_logic/mean": 0.9880952835083008, "rewards/sentence_count_match_reward_logic/std": 0.062418777495622635, "step": 1193 }, { "clip_ratio": 0.013598439283668995, "epoch": 0.4177746675997201, "grad_norm": 0.5863577346239804, "kl": 1.265625, "learning_rate": 9.643557262605704e-06, "loss": 0.0102, "step": 1194 }, { "clip_ratio": 0.02846682257950306, "epoch": 0.4181245626312106, "grad_norm": 0.5160154101730952, "kl": 1.28125, "learning_rate": 9.642424094269685e-06, "loss": 0.0084, "step": 1195 }, { "clip_ratio": 0.040356505662202835, "epoch": 0.4184744576627012, "grad_norm": 0.39712229347690736, "kl": 1.2890625, "learning_rate": 9.641289194353418e-06, "loss": 0.0072, "step": 1196 }, { "clip_ratio": 0.0048699104227125645, "clipped_completions_ratio": 0.0, "epoch": 0.4188243526941917, "grad_norm": 0.6007556490208125, "kl": 1.1328125, "learning_rate": 9.640152563280207e-06, "loss": 0.0136, "max_completion_length": 226.0, "max_terminated_completion_length": 226.0, "mean_completion_length": 162.08929443359375, "mean_terminated_completion_length": 162.08929443359375, "min_completion_length": 78.0, "min_terminated_completion_length": 78.0, "num_tokens": 6065808.0, "reward": 2.8652584552764893, "reward_std": 0.1473524570465088, "rewards/check_gptzero_func/mean": 0.9285714030265808, "rewards/check_gptzero_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9775030016899109, "rewards/check_winston_local_func/std": 0.032816726714372635, "rewards/sentence_count_match_reward_logic/mean": 0.9591836929321289, "rewards/sentence_count_match_reward_logic/std": 0.06512032449245453, "step": 1197 }, { "clip_ratio": 0.007405196316540241, "epoch": 0.4191742477256823, "grad_norm": 0.5316115698290758, "kl": 1.1328125, "learning_rate": 9.639014201474009e-06, "loss": 0.0115, "step": 1198 }, { "clip_ratio": 0.018656382337212563, "epoch": 0.41952414275717287, "grad_norm": 0.4107923518221337, "kl": 1.1328125, "learning_rate": 9.637874109359416e-06, "loss": 0.0091, "step": 1199 }, { "clip_ratio": 0.03156377002596855, "epoch": 0.4198740377886634, "grad_norm": 0.32898239182980893, "kl": 1.1328125, "learning_rate": 9.636732287361675e-06, "loss": 0.0072, "step": 1200 }, { "clip_ratio": 0.004945687483996153, "clipped_completions_ratio": 0.0, "epoch": 0.42022393282015397, "grad_norm": 0.6649327987047489, "kl": 1.0859375, "learning_rate": 9.635588735906675e-06, "loss": 0.0153, "max_completion_length": 199.0, "max_terminated_completion_length": 199.0, "mean_completion_length": 155.69644165039062, "mean_terminated_completion_length": 155.69644165039062, "min_completion_length": 116.0, "min_terminated_completion_length": 116.0, "num_tokens": 18815.0, "reward": 2.569572925567627, "reward_std": 0.28890499472618103, "rewards/check_originality_func/mean": 0.625, "rewards/check_originality_func/std": 0.48850420117378235, "rewards/check_winston_local_func/mean": 0.9811800122261047, "rewards/check_winston_local_func/std": 0.02085181511938572, "rewards/sentence_count_match_reward_logic/mean": 0.9633928537368774, "rewards/sentence_count_match_reward_logic/std": 0.08500859141349792, "step": 1201 }, { "clip_ratio": 0.010148798115551472, "epoch": 0.4205738278516445, "grad_norm": 0.5448849122146231, "kl": 1.1015625, "learning_rate": 9.63444345542095e-06, "loss": 0.0131, "step": 1202 }, { "clip_ratio": 0.020778683945536613, "epoch": 0.42092372288313507, "grad_norm": 0.4425003454867359, "kl": 1.1015625, "learning_rate": 9.633296446331678e-06, "loss": 0.0109, "step": 1203 }, { "clip_ratio": 0.03452986478805542, "epoch": 0.4212736179146256, "grad_norm": 0.3620754411957497, "kl": 1.09375, "learning_rate": 9.632147709066682e-06, "loss": 0.0099, "step": 1204 }, { "clip_ratio": 0.0027869483456015587, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.42162351294611616, "grad_norm": 0.6383098387659161, "kl": 0.8671875, "learning_rate": 9.630997244054429e-06, "loss": 0.0125, "max_completion_length": 256.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 180.5178680419922, "mean_terminated_completion_length": 167.9375, "min_completion_length": 108.0, "min_terminated_completion_length": 108.0, "num_tokens": 41116.0, "reward": 2.0268943309783936, "reward_std": 0.3536655008792877, "rewards/check_originality_func/mean": 0.3035714328289032, "rewards/check_originality_func/std": 0.4639608860015869, "rewards/check_winston_local_func/mean": 0.7309759855270386, "rewards/check_winston_local_func/std": 0.3141782879829407, "rewards/sentence_count_match_reward_logic/mean": 0.9923468828201294, "rewards/sentence_count_match_reward_logic/std": 0.03245825320482254, "step": 1205 }, { "clip_ratio": 0.005318230018019676, "epoch": 0.42197340797760674, "grad_norm": 0.5415576026899201, "kl": 0.87109375, "learning_rate": 9.629845051724037e-06, "loss": 0.0101, "step": 1206 }, { "clip_ratio": 0.016477925702929497, "epoch": 0.42232330300909726, "grad_norm": 0.39481499641471957, "kl": 0.87890625, "learning_rate": 9.628691132505257e-06, "loss": 0.0077, "step": 1207 }, { "clip_ratio": 0.030839508399367332, "epoch": 0.42267319804058784, "grad_norm": 0.3607174450266309, "kl": 0.88671875, "learning_rate": 9.627535486828491e-06, "loss": 0.0059, "step": 1208 }, { "clip_ratio": 0.003992937039583921, "clipped_completions_ratio": 0.0, "epoch": 0.42302309307207836, "grad_norm": 0.649004475330097, "kl": 1.015625, "learning_rate": 9.626378115124788e-06, "loss": 0.0081, "max_completion_length": 234.0, "max_terminated_completion_length": 234.0, "mean_completion_length": 188.58929443359375, "mean_terminated_completion_length": 188.58929443359375, "min_completion_length": 112.0, "min_terminated_completion_length": 112.0, "num_tokens": 63533.0, "reward": 2.3369786739349365, "reward_std": 0.32879209518432617, "rewards/check_originality_func/mean": 0.375, "rewards/check_originality_func/std": 0.48850420117378235, "rewards/check_winston_local_func/mean": 0.986851155757904, "rewards/check_winston_local_func/std": 0.009674403816461563, "rewards/sentence_count_match_reward_logic/mean": 0.9751275777816772, "rewards/sentence_count_match_reward_logic/std": 0.0654723197221756, "step": 1209 }, { "clip_ratio": 0.00877948384732008, "epoch": 0.42337298810356894, "grad_norm": 0.5686537545249353, "kl": 1.015625, "learning_rate": 9.625219017825833e-06, "loss": 0.0056, "step": 1210 }, { "clip_ratio": 0.02011043392121792, "epoch": 0.42372288313505946, "grad_norm": 0.4421670167809315, "kl": 1.0234375, "learning_rate": 9.624058195363958e-06, "loss": 0.0028, "step": 1211 }, { "clip_ratio": 0.03441622480750084, "epoch": 0.42407277816655004, "grad_norm": 0.40469100021898685, "kl": 1.0390625, "learning_rate": 9.622895648172141e-06, "loss": 0.0014, "step": 1212 }, { "clip_ratio": 0.00405740924179554, "clipped_completions_ratio": 0.0714285714285714, "epoch": 0.4244226731980406, "grad_norm": 0.7747203828694988, "kl": 1.0234375, "learning_rate": 9.621731376683998e-06, "loss": 0.0127, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 184.2857208251953, "mean_terminated_completion_length": 178.7692413330078, "min_completion_length": 129.0, "min_terminated_completion_length": 129.0, "num_tokens": 85637.0, "reward": 2.0465331077575684, "reward_std": 0.3159066140651703, "rewards/check_originality_func/mean": 0.1607142835855484, "rewards/check_originality_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.9126043319702148, "rewards/check_winston_local_func/std": 0.09127394109964371, "rewards/sentence_count_match_reward_logic/mean": 0.9732142686843872, "rewards/sentence_count_match_reward_logic/std": 0.07802347093820572, "step": 1213 }, { "clip_ratio": 0.010928383097052574, "epoch": 0.42477256822953113, "grad_norm": 0.594250319602429, "kl": 1.0390625, "learning_rate": 9.620565381333796e-06, "loss": 0.0093, "step": 1214 }, { "clip_ratio": 0.02934846840798855, "epoch": 0.4251224632610217, "grad_norm": 0.43640515659411167, "kl": 1.0625, "learning_rate": 9.619397662556434e-06, "loss": 0.0069, "step": 1215 }, { "clip_ratio": 0.042632803320884705, "epoch": 0.42547235829251223, "grad_norm": 0.36252406450841845, "kl": 1.0703125, "learning_rate": 9.618228220787466e-06, "loss": 0.0057, "step": 1216 }, { "clip_ratio": 0.0037426913622766733, "clipped_completions_ratio": 0.2321428571428571, "epoch": 0.4258222533240028, "grad_norm": 0.7563080271439762, "kl": 1.0390625, "learning_rate": 9.61705705646308e-06, "loss": -0.0009, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 180.4107208251953, "mean_terminated_completion_length": 157.55813598632812, "min_completion_length": 79.0, "min_terminated_completion_length": 79.0, "num_tokens": 107812.0, "reward": 2.1108717918395996, "reward_std": 0.2650800347328186, "rewards/check_originality_func/mean": 0.1607142835855484, "rewards/check_originality_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.9571729302406311, "rewards/check_winston_local_func/std": 0.0741693452000618, "rewards/sentence_count_match_reward_logic/mean": 0.9929847121238708, "rewards/sentence_count_match_reward_logic/std": 0.029818281531333923, "step": 1217 }, { "clip_ratio": 0.01173780020326376, "epoch": 0.42617214835549333, "grad_norm": 0.5490233833349492, "kl": 1.0390625, "learning_rate": 9.615884170020107e-06, "loss": -0.0034, "step": 1218 }, { "clip_ratio": 0.026484357193112373, "epoch": 0.4265220433869839, "grad_norm": 0.4744627637578883, "kl": 1.0546875, "learning_rate": 9.614709561896027e-06, "loss": -0.0062, "step": 1219 }, { "clip_ratio": 0.03941657394170761, "epoch": 0.4268719384184745, "grad_norm": 0.3785111572860761, "kl": 1.0546875, "learning_rate": 9.613533232528956e-06, "loss": -0.0078, "step": 1220 }, { "clip_ratio": 0.0029473358299583197, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.427221833449965, "grad_norm": 0.5847438070197079, "kl": 0.8046875, "learning_rate": 9.612355182357654e-06, "loss": 0.0036, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 221.32144165039062, "mean_terminated_completion_length": 207.4499969482422, "min_completion_length": 172.0, "min_terminated_completion_length": 172.0, "num_tokens": 134526.0, "reward": 2.2126877307891846, "reward_std": 0.27451246976852417, "rewards/check_originality_func/mean": 0.3928571343421936, "rewards/check_originality_func/std": 0.4928053617477417, "rewards/check_winston_local_func/mean": 0.8531850576400757, "rewards/check_winston_local_func/std": 0.27526071667671204, "rewards/sentence_count_match_reward_logic/mean": 0.966645359992981, "rewards/sentence_count_match_reward_logic/std": 0.07307922095060349, "step": 1221 }, { "clip_ratio": 0.006831125821918249, "epoch": 0.4275717284814556, "grad_norm": 0.44902167890146943, "kl": 0.80859375, "learning_rate": 9.611175411821519e-06, "loss": 0.0018, "step": 1222 }, { "clip_ratio": 0.016120564192533493, "epoch": 0.4279216235129461, "grad_norm": 0.3709412303705722, "kl": 0.81640625, "learning_rate": 9.609993921360599e-06, "loss": 0.0004, "step": 1223 }, { "clip_ratio": 0.02601804956793785, "epoch": 0.4282715185444367, "grad_norm": 0.34934767435557856, "kl": 0.82421875, "learning_rate": 9.608810711415577e-06, "loss": -0.0009, "step": 1224 }, { "clip_ratio": 0.002861512592062354, "clipped_completions_ratio": 0.0, "epoch": 0.4286214135759272, "grad_norm": 0.7528187208561833, "kl": 1.234375, "learning_rate": 9.607625782427779e-06, "loss": 0.0149, "max_completion_length": 243.0, "max_terminated_completion_length": 243.0, "mean_completion_length": 166.3928680419922, "mean_terminated_completion_length": 166.3928680419922, "min_completion_length": 103.0, "min_terminated_completion_length": 103.0, "num_tokens": 154428.0, "reward": 2.1673731803894043, "reward_std": 0.3161282539367676, "rewards/check_originality_func/mean": 0.2321428507566452, "rewards/check_originality_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.9590397477149963, "rewards/check_winston_local_func/std": 0.09004151076078415, "rewards/sentence_count_match_reward_logic/mean": 0.976190447807312, "rewards/sentence_count_match_reward_logic/std": 0.058848995715379715, "step": 1225 }, { "clip_ratio": 0.011914136819541454, "epoch": 0.4289713086074178, "grad_norm": 0.6143517714043734, "kl": 1.2265625, "learning_rate": 9.606439134839172e-06, "loss": 0.0123, "step": 1226 }, { "clip_ratio": 0.026783408597111702, "epoch": 0.42932120363890836, "grad_norm": 0.4452986399817623, "kl": 1.2265625, "learning_rate": 9.605250769092364e-06, "loss": 0.0096, "step": 1227 }, { "clip_ratio": 0.04360569640994072, "epoch": 0.4296710986703989, "grad_norm": 0.4575090308804134, "kl": 1.25, "learning_rate": 9.604060685630608e-06, "loss": 0.0081, "step": 1228 }, { "clip_ratio": 0.005140876397490501, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.43002099370188945, "grad_norm": 0.7988052953318306, "kl": 1.1484375, "learning_rate": 9.602868884897787e-06, "loss": 0.0104, "max_completion_length": 256.0, "max_terminated_completion_length": 190.0, "mean_completion_length": 162.9107208251953, "mean_terminated_completion_length": 147.39584350585938, "min_completion_length": 96.0, "min_terminated_completion_length": 96.0, "num_tokens": 174303.0, "reward": 2.6430413722991943, "reward_std": 0.1277996003627777, "rewards/check_originality_func/mean": 0.6607142686843872, "rewards/check_originality_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.98679119348526, "rewards/check_winston_local_func/std": 0.007846014574170113, "rewards/sentence_count_match_reward_logic/mean": 0.9955357313156128, "rewards/sentence_count_match_reward_logic/std": 0.023407040163874626, "step": 1229 }, { "clip_ratio": 0.014598974958062172, "epoch": 0.43037088873338, "grad_norm": 0.5620297327043807, "kl": 1.1640625, "learning_rate": 9.601675367338436e-06, "loss": 0.0078, "step": 1230 }, { "clip_ratio": 0.025540070608258247, "epoch": 0.43072078376487055, "grad_norm": 0.46989298546250174, "kl": 1.171875, "learning_rate": 9.600480133397726e-06, "loss": 0.0055, "step": 1231 }, { "clip_ratio": 0.035120248794555664, "epoch": 0.4310706787963611, "grad_norm": 0.3407946303556918, "kl": 1.15625, "learning_rate": 9.599283183521467e-06, "loss": 0.005, "step": 1232 }, { "clip_ratio": 0.003660990623757243, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.43142057382785165, "grad_norm": 0.7506820895089269, "kl": 1.171875, "learning_rate": 9.598084518156107e-06, "loss": 0.0127, "max_completion_length": 256.0, "max_terminated_completion_length": 200.0, "mean_completion_length": 166.2857208251953, "mean_terminated_completion_length": 151.33334350585938, "min_completion_length": 91.0, "min_terminated_completion_length": 91.0, "num_tokens": 194711.0, "reward": 2.3271195888519287, "reward_std": 0.23566605150699615, "rewards/check_originality_func/mean": 0.375, "rewards/check_originality_func/std": 0.48850420117378235, "rewards/check_winston_local_func/mean": 0.9664053320884705, "rewards/check_winston_local_func/std": 0.0598401241004467, "rewards/sentence_count_match_reward_logic/mean": 0.985714316368103, "rewards/sentence_count_match_reward_logic/std": 0.035309404134750366, "step": 1233 }, { "clip_ratio": 0.010144869796931744, "epoch": 0.43177046885934217, "grad_norm": 0.6067015872455033, "kl": 1.1796875, "learning_rate": 9.596884137748744e-06, "loss": 0.0094, "step": 1234 }, { "clip_ratio": 0.02563832886517048, "epoch": 0.43212036389083275, "grad_norm": 0.4880019651105491, "kl": 1.1875, "learning_rate": 9.5956820427471e-06, "loss": 0.0071, "step": 1235 }, { "clip_ratio": 0.04023320600390434, "epoch": 0.4324702589223233, "grad_norm": 0.41887601777275757, "kl": 1.2109375, "learning_rate": 9.594478233599551e-06, "loss": 0.006, "step": 1236 }, { "clip_ratio": 0.0038009292911738157, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.43282015395381385, "grad_norm": 0.7755405203400451, "kl": 1.2421875, "learning_rate": 9.593272710755104e-06, "loss": 0.0141, "max_completion_length": 256.0, "max_terminated_completion_length": 232.0, "mean_completion_length": 163.07144165039062, "mean_terminated_completion_length": 147.58334350585938, "min_completion_length": 83.0, "min_terminated_completion_length": 83.0, "num_tokens": 214931.0, "reward": 2.2484776973724365, "reward_std": 0.31616583466529846, "rewards/check_originality_func/mean": 0.3928571343421936, "rewards/check_originality_func/std": 0.4928053617477417, "rewards/check_winston_local_func/mean": 0.8922914862632751, "rewards/check_winston_local_func/std": 0.2147950679063797, "rewards/sentence_count_match_reward_logic/mean": 0.963329017162323, "rewards/sentence_count_match_reward_logic/std": 0.07923419028520584, "step": 1237 }, { "clip_ratio": 0.010914744809269905, "epoch": 0.4331700489853044, "grad_norm": 0.5328830387321607, "kl": 1.2421875, "learning_rate": 9.59206547466341e-06, "loss": 0.0115, "step": 1238 }, { "clip_ratio": 0.025266828015446663, "epoch": 0.43351994401679494, "grad_norm": 0.5344456111111252, "kl": 1.2578125, "learning_rate": 9.590856525774753e-06, "loss": 0.0095, "step": 1239 }, { "clip_ratio": 0.033671729266643524, "epoch": 0.4338698390482855, "grad_norm": 0.39186481194772693, "kl": 1.25, "learning_rate": 9.589645864540061e-06, "loss": 0.0085, "step": 1240 }, { "clip_ratio": 0.0031417107675224543, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.43421973407977604, "grad_norm": 0.7132119526045112, "kl": 1.21875, "learning_rate": 9.5884334914109e-06, "loss": 0.008, "max_completion_length": 256.0, "max_terminated_completion_length": 201.0, "mean_completion_length": 170.69644165039062, "mean_terminated_completion_length": 156.4791717529297, "min_completion_length": 106.0, "min_terminated_completion_length": 106.0, "num_tokens": 235946.0, "reward": 2.4016690254211426, "reward_std": 0.20884555578231812, "rewards/check_originality_func/mean": 0.4285714328289032, "rewards/check_originality_func/std": 0.4993502199649811, "rewards/check_winston_local_func/mean": 0.9869862794876099, "rewards/check_winston_local_func/std": 0.007913396693766117, "rewards/sentence_count_match_reward_logic/mean": 0.9861111044883728, "rewards/sentence_count_match_reward_logic/std": 0.0370790958404541, "step": 1241 }, { "clip_ratio": 0.012192683294415474, "epoch": 0.4345696291112666, "grad_norm": 0.5768246193023068, "kl": 1.2109375, "learning_rate": 9.587219406839475e-06, "loss": 0.0054, "step": 1242 }, { "clip_ratio": 0.024568801745772362, "epoch": 0.4349195241427572, "grad_norm": 0.4256909876134551, "kl": 1.21875, "learning_rate": 9.586003611278625e-06, "loss": 0.0025, "step": 1243 }, { "clip_ratio": 0.036808744072914124, "epoch": 0.4352694191742477, "grad_norm": 0.3390456090688815, "kl": 1.2265625, "learning_rate": 9.584786105181831e-06, "loss": 0.0008, "step": 1244 }, { "clip_ratio": 0.003803929779678583, "clipped_completions_ratio": 0.0, "epoch": 0.4356193142057383, "grad_norm": 0.7511976129268647, "kl": 1.078125, "learning_rate": 9.583566889003212e-06, "loss": 0.0116, "max_completion_length": 221.0, "max_terminated_completion_length": 221.0, "mean_completion_length": 170.6607208251953, "mean_terminated_completion_length": 170.6607208251953, "min_completion_length": 112.0, "min_terminated_completion_length": 112.0, "num_tokens": 256471.0, "reward": 2.307236909866333, "reward_std": 0.1790280044078827, "rewards/check_originality_func/mean": 0.3392857015132904, "rewards/check_originality_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.9883591532707214, "rewards/check_winston_local_func/std": 0.010489200241863728, "rewards/sentence_count_match_reward_logic/mean": 0.9795918464660645, "rewards/sentence_count_match_reward_logic/std": 0.050441987812519073, "step": 1245 }, { "clip_ratio": 0.012136963196098804, "epoch": 0.4359692092372288, "grad_norm": 0.5807892276119717, "kl": 1.0859375, "learning_rate": 9.582345963197525e-06, "loss": 0.0079, "step": 1246 }, { "clip_ratio": 0.026517491787672043, "epoch": 0.4363191042687194, "grad_norm": 0.3995238885020929, "kl": 1.0859375, "learning_rate": 9.581123328220161e-06, "loss": 0.0056, "step": 1247 }, { "clip_ratio": 0.03770041465759277, "epoch": 0.4366689993002099, "grad_norm": 0.326508967114945, "kl": 1.0859375, "learning_rate": 9.579898984527154e-06, "loss": 0.0044, "step": 1248 }, { "clip_ratio": 0.0038656913675367832, "clipped_completions_ratio": 0.0, "epoch": 0.4370188943317005, "grad_norm": 0.7735675682202906, "kl": 1.1171875, "learning_rate": 9.578672932575173e-06, "loss": 0.0157, "max_completion_length": 183.0, "max_terminated_completion_length": 183.0, "mean_completion_length": 151.0, "mean_terminated_completion_length": 151.0, "min_completion_length": 103.0, "min_terminated_completion_length": 103.0, "num_tokens": 274719.0, "reward": 2.5510401725769043, "reward_std": 0.27732041478157043, "rewards/check_originality_func/mean": 0.6071428656578064, "rewards/check_originality_func/std": 0.4928053617477417, "rewards/check_winston_local_func/mean": 0.9438976049423218, "rewards/check_winston_local_func/std": 0.08900821954011917, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1249 }, { "clip_ratio": 0.011020035482943058, "epoch": 0.43736878936319107, "grad_norm": 0.5816901377601312, "kl": 1.1171875, "learning_rate": 9.57744517282152e-06, "loss": 0.0131, "step": 1250 }, { "clip_ratio": 0.026682114228606224, "epoch": 0.4377186843946816, "grad_norm": 0.544297832660078, "kl": 1.125, "learning_rate": 9.57621570572414e-06, "loss": 0.0103, "step": 1251 }, { "clip_ratio": 0.04312824457883835, "epoch": 0.43806857942617217, "grad_norm": 0.35137834420692887, "kl": 1.125, "learning_rate": 9.574984531741613e-06, "loss": 0.0086, "step": 1252 }, { "clip_ratio": 0.005015516187995672, "clipped_completions_ratio": 0.0, "epoch": 0.4384184744576627, "grad_norm": 0.959938043217486, "kl": 1.34375, "learning_rate": 9.573751651333156e-06, "loss": 0.0139, "max_completion_length": 215.0, "max_terminated_completion_length": 215.0, "mean_completion_length": 128.07144165039062, "mean_terminated_completion_length": 128.07144165039062, "min_completion_length": 76.0, "min_terminated_completion_length": 76.0, "num_tokens": 290731.0, "reward": 2.1331582069396973, "reward_std": 0.2994512617588043, "rewards/check_originality_func/mean": 0.25, "rewards/check_originality_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.9593486785888672, "rewards/check_winston_local_func/std": 0.06249568983912468, "rewards/sentence_count_match_reward_logic/mean": 0.9238095283508301, "rewards/sentence_count_match_reward_logic/std": 0.12676459550857544, "step": 1253 }, { "clip_ratio": 0.0147964833304286, "epoch": 0.43876836948915326, "grad_norm": 0.6718652439505055, "kl": 1.34375, "learning_rate": 9.57251706495862e-06, "loss": 0.0105, "step": 1254 }, { "clip_ratio": 0.03725993633270264, "epoch": 0.4391182645206438, "grad_norm": 0.4749530544900867, "kl": 1.3515625, "learning_rate": 9.571280773078495e-06, "loss": 0.008, "step": 1255 }, { "clip_ratio": 0.05197941139340401, "epoch": 0.43946815955213436, "grad_norm": 0.44807307655388207, "kl": 1.359375, "learning_rate": 9.570042776153904e-06, "loss": 0.0072, "step": 1256 }, { "clip_ratio": 0.0033256090246140957, "clipped_completions_ratio": 0.0, "epoch": 0.43981805458362494, "grad_norm": 0.689762592794427, "kl": 1.125, "learning_rate": 9.568803074646614e-06, "loss": 0.0042, "max_completion_length": 235.0, "max_terminated_completion_length": 235.0, "mean_completion_length": 159.7857208251953, "mean_terminated_completion_length": 159.7857208251953, "min_completion_length": 109.0, "min_terminated_completion_length": 109.0, "num_tokens": 309991.0, "reward": 2.711996555328369, "reward_std": 0.09135549515485764, "rewards/check_originality_func/mean": 0.7678571343421936, "rewards/check_originality_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.9597641825675964, "rewards/check_winston_local_func/std": 0.06763239949941635, "rewards/sentence_count_match_reward_logic/mean": 0.984375, "rewards/sentence_count_match_reward_logic/std": 0.04171398654580116, "step": 1257 }, { "clip_ratio": 0.010653922334313393, "epoch": 0.44016794961511546, "grad_norm": 0.5147195490953638, "kl": 1.125, "learning_rate": 9.567561669019014e-06, "loss": 0.001, "step": 1258 }, { "clip_ratio": 0.0239418838173151, "epoch": 0.44051784464660604, "grad_norm": 0.41003695211420876, "kl": 1.1328125, "learning_rate": 9.566318559734142e-06, "loss": -0.0008, "step": 1259 }, { "clip_ratio": 0.03411407023668289, "epoch": 0.44086773967809656, "grad_norm": 0.3404125205684113, "kl": 1.140625, "learning_rate": 9.565073747255665e-06, "loss": -0.0022, "step": 1260 }, { "clip_ratio": 0.0034394115209579468, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.44121763470958714, "grad_norm": 0.6345722513652162, "kl": 1.0546875, "learning_rate": 9.563827232047885e-06, "loss": 0.0074, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 198.17857360839844, "mean_terminated_completion_length": 188.5416717529297, "min_completion_length": 130.0, "min_terminated_completion_length": 130.0, "num_tokens": 333833.0, "reward": 2.7333037853240967, "reward_std": 0.2698856294155121, "rewards/check_originality_func/mean": 0.75, "rewards/check_originality_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.9904465079307556, "rewards/check_winston_local_func/std": 0.0032766335643827915, "rewards/sentence_count_match_reward_logic/mean": 0.9928570985794067, "rewards/sentence_count_match_reward_logic/std": 0.032232917845249176, "step": 1261 }, { "clip_ratio": 0.007745531387627125, "epoch": 0.44156752974107766, "grad_norm": 0.5316704133781882, "kl": 1.0546875, "learning_rate": 9.562579014575741e-06, "loss": 0.0049, "step": 1262 }, { "clip_ratio": 0.02136557176709175, "epoch": 0.44191742477256823, "grad_norm": 0.43846215885447704, "kl": 1.0625, "learning_rate": 9.561329095304805e-06, "loss": 0.0022, "step": 1263 }, { "clip_ratio": 0.03498087078332901, "epoch": 0.44226731980405876, "grad_norm": 0.36566172865339336, "kl": 1.0625, "learning_rate": 9.56007747470129e-06, "loss": 0.0004, "step": 1264 }, { "clip_ratio": 0.003668367164209485, "clipped_completions_ratio": 0.0, "epoch": 0.44261721483554933, "grad_norm": 0.810167614171391, "kl": 1.2421875, "learning_rate": 9.558824153232033e-06, "loss": 0.0155, "max_completion_length": 238.0, "max_terminated_completion_length": 238.0, "mean_completion_length": 163.42857360839844, "mean_terminated_completion_length": 163.42857360839844, "min_completion_length": 91.0, "min_terminated_completion_length": 91.0, "num_tokens": 353993.0, "reward": 2.348903179168701, "reward_std": 0.2895905375480652, "rewards/check_originality_func/mean": 0.375, "rewards/check_originality_func/std": 0.48850420117378235, "rewards/check_winston_local_func/mean": 0.9739031195640564, "rewards/check_winston_local_func/std": 0.0509297139942646, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1265 }, { "clip_ratio": 0.014734757132828236, "epoch": 0.4429671098670399, "grad_norm": 0.584190530393976, "kl": 1.2421875, "learning_rate": 9.557569131364512e-06, "loss": 0.0119, "step": 1266 }, { "clip_ratio": 0.030985038727521896, "epoch": 0.44331700489853043, "grad_norm": 0.5900508075420395, "kl": 1.2421875, "learning_rate": 9.556312409566842e-06, "loss": 0.0094, "step": 1267 }, { "clip_ratio": 0.04220619052648544, "epoch": 0.443666899930021, "grad_norm": 0.35631264705158017, "kl": 1.25, "learning_rate": 9.555053988307764e-06, "loss": 0.0076, "step": 1268 }, { "clip_ratio": 0.005313123110681772, "clipped_completions_ratio": 0.0, "epoch": 0.44401679496151153, "grad_norm": 1.0290011839780902, "kl": 1.4453125, "learning_rate": 9.55379386805666e-06, "loss": 0.0204, "max_completion_length": 220.0, "max_terminated_completion_length": 220.0, "mean_completion_length": 132.98214721679688, "mean_terminated_completion_length": 132.98214721679688, "min_completion_length": 76.0, "min_terminated_completion_length": 76.0, "num_tokens": 370624.0, "reward": 2.5767927169799805, "reward_std": 0.15038247406482697, "rewards/check_originality_func/mean": 0.625, "rewards/check_originality_func/std": 0.48850420117378235, "rewards/check_winston_local_func/mean": 0.9562568664550781, "rewards/check_winston_local_func/std": 0.05154391750693321, "rewards/sentence_count_match_reward_logic/mean": 0.9955357313156128, "rewards/sentence_count_match_reward_logic/std": 0.033407654613256454, "step": 1269 }, { "clip_ratio": 0.01846202090382576, "epoch": 0.4443666899930021, "grad_norm": 0.7578242689256316, "kl": 1.4453125, "learning_rate": 9.552532049283541e-06, "loss": 0.0161, "step": 1270 }, { "clip_ratio": 0.04070534184575081, "epoch": 0.4447165850244926, "grad_norm": 0.5255418951342015, "kl": 1.4609375, "learning_rate": 9.551268532459055e-06, "loss": 0.0135, "step": 1271 }, { "clip_ratio": 0.052867576479911804, "epoch": 0.4450664800559832, "grad_norm": 0.4518997774758263, "kl": 1.46875, "learning_rate": 9.550003318054482e-06, "loss": 0.0124, "step": 1272 }, { "clip_ratio": 0.00432349368929863, "clipped_completions_ratio": 0.0, "epoch": 0.4454163750874738, "grad_norm": 1.1076257187491925, "kl": 1.5234375, "learning_rate": 9.548736406541732e-06, "loss": 0.0131, "max_completion_length": 205.0, "max_terminated_completion_length": 205.0, "mean_completion_length": 138.05357360839844, "mean_terminated_completion_length": 138.05357360839844, "min_completion_length": 74.0, "min_terminated_completion_length": 74.0, "num_tokens": 387955.0, "reward": 2.6302387714385986, "reward_std": 0.3258107304573059, "rewards/check_originality_func/mean": 0.6428571343421936, "rewards/check_originality_func/std": 0.48349374532699585, "rewards/check_winston_local_func/mean": 0.9873815774917603, "rewards/check_winston_local_func/std": 0.008512543514370918, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1273 }, { "clip_ratio": 0.015448124147951603, "epoch": 0.4457662701189643, "grad_norm": 0.6364452268544082, "kl": 1.5390625, "learning_rate": 9.547467798393355e-06, "loss": 0.0091, "step": 1274 }, { "clip_ratio": 0.034359920769929886, "epoch": 0.4461161651504549, "grad_norm": 0.559450071417137, "kl": 1.5703125, "learning_rate": 9.546197494082528e-06, "loss": 0.007, "step": 1275 }, { "clip_ratio": 0.04819007217884064, "epoch": 0.4464660601819454, "grad_norm": 0.5590761446211638, "kl": 1.5859375, "learning_rate": 9.544925494083062e-06, "loss": 0.0056, "step": 1276 }, { "clip_ratio": 0.004598166793584824, "clipped_completions_ratio": 0.0, "epoch": 0.446815955213436, "grad_norm": 0.6499697062829881, "kl": 1.15625, "learning_rate": 9.5436517988694e-06, "loss": 0.02, "max_completion_length": 230.0, "max_terminated_completion_length": 230.0, "mean_completion_length": 187.44644165039062, "mean_terminated_completion_length": 187.44644165039062, "min_completion_length": 119.0, "min_terminated_completion_length": 119.0, "num_tokens": 410620.0, "reward": 2.30145001411438, "reward_std": 0.14173991978168488, "rewards/check_originality_func/mean": 0.3571428656578064, "rewards/check_originality_func/std": 0.48349377512931824, "rewards/check_winston_local_func/mean": 0.9895169138908386, "rewards/check_winston_local_func/std": 0.006815581116825342, "rewards/sentence_count_match_reward_logic/mean": 0.9547902345657349, "rewards/sentence_count_match_reward_logic/std": 0.06479400396347046, "step": 1277 }, { "clip_ratio": 0.007670018821954727, "epoch": 0.4471658502449265, "grad_norm": 0.584979563064615, "kl": 1.15625, "learning_rate": 9.54237640891662e-06, "loss": 0.0182, "step": 1278 }, { "clip_ratio": 0.019631944596767426, "epoch": 0.4475157452764171, "grad_norm": 0.4895353796345041, "kl": 1.15625, "learning_rate": 9.541099324700432e-06, "loss": 0.0147, "step": 1279 }, { "clip_ratio": 0.031071491539478302, "epoch": 0.44786564030790765, "grad_norm": 0.38495127309399163, "kl": 1.15625, "learning_rate": 9.539820546697175e-06, "loss": 0.0128, "step": 1280 }, { "clip_ratio": 0.005276786629110575, "clipped_completions_ratio": 0.017857142857142905, "epoch": 0.4482155353393982, "grad_norm": 0.86535437846101, "kl": 1.2890625, "learning_rate": 9.53854007538382e-06, "loss": 0.0205, "max_completion_length": 256.0, "max_terminated_completion_length": 250.0, "mean_completion_length": 165.92857360839844, "mean_terminated_completion_length": 164.29090881347656, "min_completion_length": 66.0, "min_terminated_completion_length": 66.0, "num_tokens": 430688.0, "reward": 2.784507989883423, "reward_std": 0.18371953070163727, "rewards/check_originality_func/mean": 0.8214285969734192, "rewards/check_originality_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.9696268439292908, "rewards/check_winston_local_func/std": 0.05566810443997383, "rewards/sentence_count_match_reward_logic/mean": 0.9934523701667786, "rewards/sentence_count_match_reward_logic/std": 0.03447712957859039, "step": 1281 }, { "clip_ratio": 0.01427333801984787, "epoch": 0.44856543037088875, "grad_norm": 0.5923901423422924, "kl": 1.296875, "learning_rate": 9.537257911237968e-06, "loss": 0.0165, "step": 1282 }, { "clip_ratio": 0.028302432969212532, "epoch": 0.4489153254023793, "grad_norm": 0.5115033624488718, "kl": 1.3125, "learning_rate": 9.53597405473786e-06, "loss": 0.0145, "step": 1283 }, { "clip_ratio": 0.04026639461517334, "epoch": 0.44926522043386985, "grad_norm": 0.3898677687423044, "kl": 1.3046875, "learning_rate": 9.53468850636236e-06, "loss": 0.0131, "step": 1284 }, { "clip_ratio": 0.005937982816249132, "clipped_completions_ratio": 0.0, "epoch": 0.44961511546536037, "grad_norm": 0.9817757998775287, "kl": 1.3671875, "learning_rate": 9.533401266590964e-06, "loss": 0.0132, "max_completion_length": 183.0, "max_terminated_completion_length": 183.0, "mean_completion_length": 141.23214721679688, "mean_terminated_completion_length": 141.23214721679688, "min_completion_length": 103.0, "min_terminated_completion_length": 103.0, "num_tokens": 448557.0, "reward": 2.633331775665283, "reward_std": 0.37493059039115906, "rewards/check_originality_func/mean": 0.6607142686843872, "rewards/check_originality_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.9874981641769409, "rewards/check_winston_local_func/std": 0.006563503295183182, "rewards/sentence_count_match_reward_logic/mean": 0.9851189851760864, "rewards/sentence_count_match_reward_logic/std": 0.047956064343452454, "step": 1285 }, { "clip_ratio": 0.01669508032500744, "epoch": 0.44996501049685095, "grad_norm": 0.6621327566632875, "kl": 1.3671875, "learning_rate": 9.532112335903803e-06, "loss": 0.009, "step": 1286 }, { "clip_ratio": 0.03360385075211525, "epoch": 0.4503149055283415, "grad_norm": 0.5106686132610343, "kl": 1.375, "learning_rate": 9.530821714781632e-06, "loss": 0.0057, "step": 1287 }, { "clip_ratio": 0.053288597613573074, "epoch": 0.45066480055983205, "grad_norm": 0.5954706771522402, "kl": 1.390625, "learning_rate": 9.529529403705844e-06, "loss": 0.0045, "step": 1288 }, { "clip_ratio": 0.005189760122448206, "clipped_completions_ratio": 0.0, "epoch": 0.4510146955913226, "grad_norm": 0.7841932423838146, "kl": 1.265625, "learning_rate": 9.528235403158457e-06, "loss": 0.0132, "max_completion_length": 238.0, "max_terminated_completion_length": 238.0, "mean_completion_length": 174.25001525878906, "mean_terminated_completion_length": 174.25001525878906, "min_completion_length": 100.0, "min_terminated_completion_length": 100.0, "num_tokens": 469755.0, "reward": 2.7004597187042236, "reward_std": 0.22042909264564514, "rewards/check_originality_func/mean": 0.7678571343421936, "rewards/check_originality_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.9802217483520508, "rewards/check_winston_local_func/std": 0.03194750100374222, "rewards/sentence_count_match_reward_logic/mean": 0.9523809552192688, "rewards/sentence_count_match_reward_logic/std": 0.08763545751571655, "step": 1289 }, { "clip_ratio": 0.011222575791180134, "epoch": 0.45136459062281314, "grad_norm": 0.6839891018877833, "kl": 1.2578125, "learning_rate": 9.526939713622121e-06, "loss": 0.01, "step": 1290 }, { "clip_ratio": 0.025107955560088158, "epoch": 0.4517144856543037, "grad_norm": 0.47211041605735005, "kl": 1.265625, "learning_rate": 9.525642335580116e-06, "loss": 0.0064, "step": 1291 }, { "clip_ratio": 0.04007402062416077, "epoch": 0.45206438068579424, "grad_norm": 0.5486782920996728, "kl": 1.28125, "learning_rate": 9.524343269516354e-06, "loss": 0.0044, "step": 1292 }, { "clip_ratio": 0.0037617222405970097, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.4524142757172848, "grad_norm": 0.8714242619404969, "kl": 1.609375, "learning_rate": 9.523042515915371e-06, "loss": 0.0204, "max_completion_length": 256.0, "max_terminated_completion_length": 216.0, "mean_completion_length": 148.82144165039062, "mean_terminated_completion_length": 130.95834350585938, "min_completion_length": 73.0, "min_terminated_completion_length": 73.0, "num_tokens": 487921.0, "reward": 2.490084409713745, "reward_std": 0.12775734066963196, "rewards/check_originality_func/mean": 0.5178571343421936, "rewards/check_originality_func/std": 0.5042031407356262, "rewards/check_winston_local_func/mean": 0.9742112755775452, "rewards/check_winston_local_func/std": 0.039171002805233, "rewards/sentence_count_match_reward_logic/mean": 0.9980158805847168, "rewards/sentence_count_match_reward_logic/std": 0.014847845770418644, "step": 1293 }, { "clip_ratio": 0.012130710296332836, "epoch": 0.45276417074877534, "grad_norm": 0.5886252224909317, "kl": 1.6171875, "learning_rate": 9.521740075262338e-06, "loss": 0.0174, "step": 1294 }, { "clip_ratio": 0.030246471986174583, "epoch": 0.4531140657802659, "grad_norm": 0.4892440804810327, "kl": 1.640625, "learning_rate": 9.520435948043051e-06, "loss": 0.0146, "step": 1295 }, { "clip_ratio": 0.044631581753492355, "epoch": 0.4534639608117565, "grad_norm": 0.4233907914021122, "kl": 1.65625, "learning_rate": 9.519130134743938e-06, "loss": 0.0132, "step": 1296 }, { "clip_ratio": 0.004249983001500368, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.453813855843247, "grad_norm": 0.7652443087265592, "kl": 1.4296875, "learning_rate": 9.517822635852059e-06, "loss": 0.0146, "max_completion_length": 256.0, "max_terminated_completion_length": 245.0, "mean_completion_length": 165.0, "mean_terminated_completion_length": 149.83334350585938, "min_completion_length": 81.0, "min_terminated_completion_length": 81.0, "num_tokens": 508401.0, "reward": 2.5840024948120117, "reward_std": 0.19739092886447906, "rewards/check_originality_func/mean": 0.6428571343421936, "rewards/check_originality_func/std": 0.48349377512931824, "rewards/check_winston_local_func/mean": 0.9859864115715027, "rewards/check_winston_local_func/std": 0.004980749450623989, "rewards/sentence_count_match_reward_logic/mean": 0.9551587104797363, "rewards/sentence_count_match_reward_logic/std": 0.07865206152200699, "step": 1297 }, { "clip_ratio": 0.01070801354944706, "epoch": 0.4541637508747376, "grad_norm": 0.601911595770609, "kl": 1.4296875, "learning_rate": 9.516513451855093e-06, "loss": 0.0121, "step": 1298 }, { "clip_ratio": 0.02629636973142624, "epoch": 0.4545136459062281, "grad_norm": 0.4583584594739605, "kl": 1.4375, "learning_rate": 9.515202583241355e-06, "loss": 0.0096, "step": 1299 }, { "clip_ratio": 0.035579048097133636, "epoch": 0.4548635409377187, "grad_norm": 0.3823380460077371, "kl": 1.4375, "learning_rate": 9.513890030499786e-06, "loss": 0.0089, "step": 1300 }, { "clip_ratio": 0.003910442348569632, "clipped_completions_ratio": 0.0, "epoch": 0.4552134359692092, "grad_norm": 0.9094960652071589, "kl": 1.4140625, "learning_rate": 9.512575794119957e-06, "loss": 0.0097, "max_completion_length": 218.0, "max_terminated_completion_length": 218.0, "mean_completion_length": 147.94644165039062, "mean_terminated_completion_length": 147.94644165039062, "min_completion_length": 111.0, "min_terminated_completion_length": 111.0, "num_tokens": 526702.0, "reward": 2.6958417892456055, "reward_std": 0.34439489245414734, "rewards/check_originality_func/mean": 0.7142857313156128, "rewards/check_originality_func/std": 0.4558422863483429, "rewards/check_winston_local_func/mean": 0.9815558195114136, "rewards/check_winston_local_func/std": 0.020014598965644836, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1301 }, { "clip_ratio": 0.012272720225155354, "epoch": 0.4555633310006998, "grad_norm": 0.6915618742424354, "kl": 1.4140625, "learning_rate": 9.511259874592067e-06, "loss": 0.0052, "step": 1302 }, { "clip_ratio": 0.0320856086909771, "epoch": 0.45591322603219037, "grad_norm": 0.5173037704635143, "kl": 1.421875, "learning_rate": 9.509942272406938e-06, "loss": 0.0023, "step": 1303 }, { "clip_ratio": 0.04726675897836685, "epoch": 0.4562631210636809, "grad_norm": 0.49887408352522844, "kl": 1.4296875, "learning_rate": 9.508622988056026e-06, "loss": 0.0004, "step": 1304 }, { "clip_ratio": 0.003021246986463666, "clipped_completions_ratio": 0.0, "epoch": 0.45661301609517146, "grad_norm": 0.895510570329785, "kl": 1.359375, "learning_rate": 9.50730202203141e-06, "loss": 0.026, "max_completion_length": 254.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 162.25, "mean_terminated_completion_length": 162.25, "min_completion_length": 68.0, "min_terminated_completion_length": 68.0, "num_tokens": 546044.0, "reward": 2.687197685241699, "reward_std": 0.2147466540336609, "rewards/check_originality_func/mean": 0.7142857313156128, "rewards/check_originality_func/std": 0.4558422863483429, "rewards/check_winston_local_func/mean": 0.9848164916038513, "rewards/check_winston_local_func/std": 0.023524483665823936, "rewards/sentence_count_match_reward_logic/mean": 0.988095223903656, "rewards/sentence_count_match_reward_logic/std": 0.04331168904900551, "step": 1305 }, { "clip_ratio": 0.011753843165934086, "epoch": 0.456962911126662, "grad_norm": 0.7083850249187466, "kl": 1.359375, "learning_rate": 9.5059793748258e-06, "loss": 0.0229, "step": 1306 }, { "clip_ratio": 0.022651810199022293, "epoch": 0.45731280615815256, "grad_norm": 0.4939991632986404, "kl": 1.359375, "learning_rate": 9.504655046932528e-06, "loss": 0.0203, "step": 1307 }, { "clip_ratio": 0.041712645441293716, "epoch": 0.4576627011896431, "grad_norm": 0.5449899682391031, "kl": 1.375, "learning_rate": 9.503329038845556e-06, "loss": 0.0194, "step": 1308 }, { "clip_ratio": 0.004401960410177708, "clipped_completions_ratio": 0.0, "epoch": 0.45801259622113366, "grad_norm": 0.7359182535703118, "kl": 1.203125, "learning_rate": 9.502001351059477e-06, "loss": 0.0209, "max_completion_length": 214.0, "max_terminated_completion_length": 214.0, "mean_completion_length": 170.58929443359375, "mean_terminated_completion_length": 170.58929443359375, "min_completion_length": 92.0, "min_terminated_completion_length": 92.0, "num_tokens": 566661.0, "reward": 2.469944477081299, "reward_std": 0.4260061979293823, "rewards/check_originality_func/mean": 0.5, "rewards/check_originality_func/std": 0.5045249462127686, "rewards/check_winston_local_func/mean": 0.9750462770462036, "rewards/check_winston_local_func/std": 0.0369475893676281, "rewards/sentence_count_match_reward_logic/mean": 0.9948979616165161, "rewards/sentence_count_match_reward_logic/std": 0.026750901713967323, "step": 1309 }, { "clip_ratio": 0.012233609333634377, "epoch": 0.45836249125262424, "grad_norm": 0.5735920429998637, "kl": 1.1953125, "learning_rate": 9.500671984069501e-06, "loss": 0.0177, "step": 1310 }, { "clip_ratio": 0.026229960843920708, "epoch": 0.45871238628411476, "grad_norm": 0.456159413657815, "kl": 1.1953125, "learning_rate": 9.49934093837147e-06, "loss": 0.0148, "step": 1311 }, { "clip_ratio": 0.03790070489048958, "epoch": 0.45906228131560534, "grad_norm": 0.36720191591842505, "kl": 1.1953125, "learning_rate": 9.498008214461854e-06, "loss": 0.0129, "step": 1312 }, { "clip_ratio": 0.004377963487058878, "clipped_completions_ratio": 0.0, "epoch": 0.45941217634709586, "grad_norm": 0.8590687350945657, "kl": 1.5546875, "learning_rate": 9.496673812837742e-06, "loss": 0.0209, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 158.125, "mean_terminated_completion_length": 158.125, "min_completion_length": 59.0, "min_terminated_completion_length": 59.0, "num_tokens": 585884.0, "reward": 2.4171595573425293, "reward_std": 0.16238899528980255, "rewards/check_originality_func/mean": 0.4464285671710968, "rewards/check_originality_func/std": 0.5016207695007324, "rewards/check_winston_local_func/mean": 0.9707310795783997, "rewards/check_winston_local_func/std": 0.040908992290496826, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1313 }, { "clip_ratio": 0.010971980169415474, "epoch": 0.45976207137858643, "grad_norm": 0.5273473236858353, "kl": 1.5625, "learning_rate": 9.495337733996858e-06, "loss": 0.0178, "step": 1314 }, { "clip_ratio": 0.021525155752897263, "epoch": 0.46011196641007696, "grad_norm": 0.482458779291895, "kl": 1.5703125, "learning_rate": 9.493999978437544e-06, "loss": 0.016, "step": 1315 }, { "clip_ratio": 0.03224235773086548, "epoch": 0.46046186144156753, "grad_norm": 0.5989915932226973, "kl": 1.609375, "learning_rate": 9.492660546658771e-06, "loss": 0.015, "step": 1316 }, { "clip_ratio": 0.0036322709638625383, "clipped_completions_ratio": 0.0, "epoch": 0.4608117564730581, "grad_norm": 0.7706652110061761, "kl": 1.4375, "learning_rate": 9.491319439160134e-06, "loss": 0.0145, "max_completion_length": 189.0, "max_terminated_completion_length": 189.0, "mean_completion_length": 136.60714721679688, "mean_terminated_completion_length": 136.60714721679688, "min_completion_length": 82.0, "min_terminated_completion_length": 82.0, "num_tokens": 602902.0, "reward": 2.5571961402893066, "reward_std": 0.10538371652364731, "rewards/check_originality_func/mean": 0.5714285969734192, "rewards/check_originality_func/std": 0.4993502199649811, "rewards/check_winston_local_func/mean": 0.9883185029029846, "rewards/check_winston_local_func/std": 0.006284718867391348, "rewards/sentence_count_match_reward_logic/mean": 0.9974489808082581, "rewards/sentence_count_match_reward_logic/std": 0.019090088084340096, "step": 1317 }, { "clip_ratio": 0.011972112581133842, "epoch": 0.46116165150454863, "grad_norm": 0.6579766238057287, "kl": 1.4375, "learning_rate": 9.489976656441855e-06, "loss": 0.0113, "step": 1318 }, { "clip_ratio": 0.022324852645397186, "epoch": 0.4615115465360392, "grad_norm": 0.4805917305809053, "kl": 1.4453125, "learning_rate": 9.488632199004777e-06, "loss": 0.0083, "step": 1319 }, { "clip_ratio": 0.03867022693157196, "epoch": 0.46186144156752973, "grad_norm": 0.40687769136754154, "kl": 1.4609375, "learning_rate": 9.48728606735037e-06, "loss": 0.0072, "step": 1320 }, { "clip_ratio": 0.0032251335214823484, "clipped_completions_ratio": 0.0, "epoch": 0.4622113365990203, "grad_norm": 0.8579357686988761, "kl": 1.3828125, "learning_rate": 9.48593826198073e-06, "loss": 0.017, "max_completion_length": 218.0, "max_terminated_completion_length": 218.0, "mean_completion_length": 143.58929443359375, "mean_terminated_completion_length": 143.58929443359375, "min_completion_length": 100.0, "min_terminated_completion_length": 100.0, "num_tokens": 620951.0, "reward": 2.34468936920166, "reward_std": 0.36853015422821045, "rewards/check_originality_func/mean": 0.3928571343421936, "rewards/check_originality_func/std": 0.4928053617477417, "rewards/check_winston_local_func/mean": 0.9819340109825134, "rewards/check_winston_local_func/std": 0.02047080732882023, "rewards/sentence_count_match_reward_logic/mean": 0.9698979258537292, "rewards/sentence_count_match_reward_logic/std": 0.0797807052731514, "step": 1321 }, { "clip_ratio": 0.014996033161878586, "epoch": 0.4625612316305108, "grad_norm": 0.6991600995040256, "kl": 1.390625, "learning_rate": 9.484588783398573e-06, "loss": 0.013, "step": 1322 }, { "clip_ratio": 0.033640045672655106, "epoch": 0.4629111266620014, "grad_norm": 0.5200408620027235, "kl": 1.3828125, "learning_rate": 9.483237632107245e-06, "loss": 0.0094, "step": 1323 }, { "clip_ratio": 0.05326118692755699, "epoch": 0.4632610216934919, "grad_norm": 0.47329117232058693, "kl": 1.3828125, "learning_rate": 9.481884808610712e-06, "loss": 0.0076, "step": 1324 }, { "clip_ratio": 0.0035649677738547325, "clipped_completions_ratio": 0.0, "epoch": 0.4636109167249825, "grad_norm": 0.6322116368987786, "kl": 1.046875, "learning_rate": 9.480530313413563e-06, "loss": 0.0086, "max_completion_length": 242.0, "max_terminated_completion_length": 242.0, "mean_completion_length": 186.62501525878906, "mean_terminated_completion_length": 186.62501525878906, "min_completion_length": 106.0, "min_terminated_completion_length": 106.0, "num_tokens": 643514.0, "reward": 2.337578535079956, "reward_std": 0.22975818812847137, "rewards/check_originality_func/mean": 0.3928571343421936, "rewards/check_originality_func/std": 0.4928053617477417, "rewards/check_winston_local_func/mean": 0.9615863561630249, "rewards/check_winston_local_func/std": 0.050028663128614426, "rewards/sentence_count_match_reward_logic/mean": 0.983134925365448, "rewards/sentence_count_match_reward_logic/std": 0.04959956929087639, "step": 1325 }, { "clip_ratio": 0.00928418803960085, "epoch": 0.4639608117564731, "grad_norm": 0.48796042316650406, "kl": 1.0546875, "learning_rate": 9.47917414702101e-06, "loss": 0.0069, "step": 1326 }, { "clip_ratio": 0.019918793812394142, "epoch": 0.4643107067879636, "grad_norm": 0.4923942269632059, "kl": 1.0546875, "learning_rate": 9.477816309938896e-06, "loss": 0.0046, "step": 1327 }, { "clip_ratio": 0.032722145318984985, "epoch": 0.4646606018194542, "grad_norm": 0.38901911722809634, "kl": 1.0625, "learning_rate": 9.476456802673677e-06, "loss": 0.0028, "step": 1328 }, { "clip_ratio": 0.005095984321087599, "clipped_completions_ratio": 0.0, "epoch": 0.4650104968509447, "grad_norm": 0.7185775784251118, "kl": 1.265625, "learning_rate": 9.475095625732437e-06, "loss": 0.0138, "max_completion_length": 220.0, "max_terminated_completion_length": 220.0, "mean_completion_length": 156.69644165039062, "mean_terminated_completion_length": 156.69644165039062, "min_completion_length": 107.0, "min_terminated_completion_length": 107.0, "num_tokens": 662769.0, "reward": 2.817841053009033, "reward_std": 0.2572782635688782, "rewards/check_originality_func/mean": 0.8571428656578064, "rewards/check_originality_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9892696142196655, "rewards/check_winston_local_func/std": 0.0035505364648997784, "rewards/sentence_count_match_reward_logic/mean": 0.9714285731315613, "rewards/sentence_count_match_reward_logic/std": 0.07061877846717834, "step": 1329 }, { "clip_ratio": 0.009261779487133026, "epoch": 0.4653603918824353, "grad_norm": 0.5881414633421497, "kl": 1.265625, "learning_rate": 9.473732779622882e-06, "loss": 0.0114, "step": 1330 }, { "clip_ratio": 0.024891309440135956, "epoch": 0.4657102869139258, "grad_norm": 0.4542368835995394, "kl": 1.265625, "learning_rate": 9.472368264853342e-06, "loss": 0.0086, "step": 1331 }, { "clip_ratio": 0.03897099941968918, "epoch": 0.4660601819454164, "grad_norm": 0.3760100837077723, "kl": 1.2734375, "learning_rate": 9.471002081932767e-06, "loss": 0.0067, "step": 1332 }, { "clip_ratio": 0.004185998812317848, "clipped_completions_ratio": 0.0, "epoch": 0.46641007697690695, "grad_norm": 0.8790539173087842, "kl": 1.4296875, "learning_rate": 9.46963423137073e-06, "loss": 0.0093, "max_completion_length": 250.0, "max_terminated_completion_length": 250.0, "mean_completion_length": 157.9107208251953, "mean_terminated_completion_length": 157.9107208251953, "min_completion_length": 112.0, "min_terminated_completion_length": 112.0, "num_tokens": 682300.0, "reward": 2.559109926223755, "reward_std": 0.19852279126644135, "rewards/check_originality_func/mean": 0.5714285969734192, "rewards/check_originality_func/std": 0.4993502199649811, "rewards/check_winston_local_func/mean": 0.989913284778595, "rewards/check_winston_local_func/std": 0.003824800020083785, "rewards/sentence_count_match_reward_logic/mean": 0.9977678656578064, "rewards/sentence_count_match_reward_logic/std": 0.016703827306628227, "step": 1333 }, { "clip_ratio": 0.014519920572638512, "epoch": 0.4667599720083975, "grad_norm": 0.6844643260297508, "kl": 1.4296875, "learning_rate": 9.468264713677427e-06, "loss": 0.0047, "step": 1334 }, { "clip_ratio": 0.03480134159326553, "epoch": 0.46710986703988805, "grad_norm": 0.5536619921147892, "kl": 1.4296875, "learning_rate": 9.466893529363676e-06, "loss": 0.0018, "step": 1335 }, { "clip_ratio": 0.05094856396317482, "epoch": 0.46745976207137857, "grad_norm": 0.4782244334505987, "kl": 1.4375, "learning_rate": 9.465520678940913e-06, "loss": -0.0003, "step": 1336 }, { "clip_ratio": 0.003921383526176214, "clipped_completions_ratio": 0.0, "epoch": 0.46780965710286915, "grad_norm": 0.7860898497015992, "kl": 1.2421875, "learning_rate": 9.464146162921201e-06, "loss": 0.0042, "max_completion_length": 225.0, "max_terminated_completion_length": 225.0, "mean_completion_length": 155.69644165039062, "mean_terminated_completion_length": 155.69644165039062, "min_completion_length": 108.0, "min_terminated_completion_length": 108.0, "num_tokens": 701387.0, "reward": 2.3558502197265625, "reward_std": 0.2196071892976761, "rewards/check_originality_func/mean": 0.3928571343421936, "rewards/check_originality_func/std": 0.4928053617477417, "rewards/check_winston_local_func/mean": 0.983231246471405, "rewards/check_winston_local_func/std": 0.011102966964244843, "rewards/sentence_count_match_reward_logic/mean": 0.9797619581222534, "rewards/sentence_count_match_reward_logic/std": 0.05917908996343613, "step": 1337 }, { "clip_ratio": 0.012475614435970783, "epoch": 0.46815955213435967, "grad_norm": 0.5793233431384224, "kl": 1.234375, "learning_rate": 9.462769981817221e-06, "loss": 0.0006, "step": 1338 }, { "clip_ratio": 0.02834147773683071, "epoch": 0.46850944716585025, "grad_norm": 0.43263599911018036, "kl": 1.2421875, "learning_rate": 9.461392136142276e-06, "loss": -0.0021, "step": 1339 }, { "clip_ratio": 0.042821917682886124, "epoch": 0.4688593421973408, "grad_norm": 0.37055855504627055, "kl": 1.25, "learning_rate": 9.460012626410286e-06, "loss": -0.0033, "step": 1340 }, { "clip_ratio": 0.004471904598176479, "clipped_completions_ratio": 0.0, "epoch": 0.46920923722883134, "grad_norm": 0.7462258626401054, "kl": 1.15625, "learning_rate": 9.458631453135799e-06, "loss": 0.0049, "max_completion_length": 248.0, "max_terminated_completion_length": 248.0, "mean_completion_length": 186.35714721679688, "mean_terminated_completion_length": 186.35714721679688, "min_completion_length": 98.0, "min_terminated_completion_length": 98.0, "num_tokens": 723599.0, "reward": 2.7579946517944336, "reward_std": 0.11493153870105743, "rewards/check_originality_func/mean": 0.8214285969734192, "rewards/check_originality_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.9623079299926758, "rewards/check_winston_local_func/std": 0.07847212255001068, "rewards/sentence_count_match_reward_logic/mean": 0.9742578864097595, "rewards/sentence_count_match_reward_logic/std": 0.05589991435408592, "step": 1341 }, { "clip_ratio": 0.014571805484592915, "epoch": 0.4695591322603219, "grad_norm": 0.55394728799414, "kl": 1.1640625, "learning_rate": 9.457248616833978e-06, "loss": 0.0017, "step": 1342 }, { "clip_ratio": 0.027077946811914444, "epoch": 0.46990902729181244, "grad_norm": 0.49607628170990564, "kl": 1.171875, "learning_rate": 9.455864118020607e-06, "loss": -0.0005, "step": 1343 }, { "clip_ratio": 0.0377073809504509, "epoch": 0.470258922323303, "grad_norm": 0.3955361300162301, "kl": 1.171875, "learning_rate": 9.454477957212092e-06, "loss": -0.0017, "step": 1344 }, { "clip_ratio": 0.0053015886805951595, "clipped_completions_ratio": 0.1071428571428571, "epoch": 0.47060881735479354, "grad_norm": 0.7734427545975269, "kl": 1.28125, "learning_rate": 9.453090134925459e-06, "loss": 0.0081, "max_completion_length": 256.0, "max_terminated_completion_length": 247.0, "mean_completion_length": 179.07144165039062, "mean_terminated_completion_length": 169.83999633789062, "min_completion_length": 88.0, "min_terminated_completion_length": 88.0, "num_tokens": 745035.0, "reward": 2.4747517108917236, "reward_std": 0.3072446584701538, "rewards/check_originality_func/mean": 0.5, "rewards/check_originality_func/std": 0.5045249462127686, "rewards/check_winston_local_func/mean": 0.9792870283126831, "rewards/check_winston_local_func/std": 0.047746628522872925, "rewards/sentence_count_match_reward_logic/mean": 0.9954648613929749, "rewards/sentence_count_match_reward_logic/std": 0.023970454931259155, "step": 1345 }, { "clip_ratio": 0.012192826718091965, "epoch": 0.4709587123862841, "grad_norm": 0.596825276903608, "kl": 1.2890625, "learning_rate": 9.45170065167835e-06, "loss": 0.0048, "step": 1346 }, { "clip_ratio": 0.027589071542024612, "epoch": 0.4713086074177747, "grad_norm": 0.5457720473666801, "kl": 1.3046875, "learning_rate": 9.450309507989031e-06, "loss": 0.0031, "step": 1347 }, { "clip_ratio": 0.03917095810174942, "epoch": 0.4716585024492652, "grad_norm": 0.4884975312752631, "kl": 1.3046875, "learning_rate": 9.448916704376384e-06, "loss": 0.0007, "step": 1348 }, { "clip_ratio": 0.004472489468753338, "clipped_completions_ratio": 0.0, "epoch": 0.4720083974807558, "grad_norm": 0.8807717400165987, "kl": 1.5, "learning_rate": 9.447522241359911e-06, "loss": 0.0299, "max_completion_length": 238.0, "max_terminated_completion_length": 238.0, "mean_completion_length": 168.46429443359375, "mean_terminated_completion_length": 168.46429443359375, "min_completion_length": 76.0, "min_terminated_completion_length": 76.0, "num_tokens": 765069.0, "reward": 2.8977320194244385, "reward_std": 0.1497306078672409, "rewards/check_originality_func/mean": 0.9107142686843872, "rewards/check_originality_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9899935126304626, "rewards/check_winston_local_func/std": 0.00443227868527174, "rewards/sentence_count_match_reward_logic/mean": 0.9970237612724304, "rewards/sentence_count_match_reward_logic/std": 0.02227177284657955, "step": 1349 }, { "clip_ratio": 0.011073804460465908, "epoch": 0.4723582925122463, "grad_norm": 0.7092099780226879, "kl": 1.515625, "learning_rate": 9.446126119459739e-06, "loss": 0.0264, "step": 1350 }, { "clip_ratio": 0.028553375974297523, "epoch": 0.4727081875437369, "grad_norm": 0.5735663909936536, "kl": 1.5234375, "learning_rate": 9.444728339196601e-06, "loss": 0.0234, "step": 1351 }, { "clip_ratio": 0.04727425053715706, "epoch": 0.4730580825752274, "grad_norm": 0.4887335623234053, "kl": 1.515625, "learning_rate": 9.44332890109186e-06, "loss": 0.0219, "step": 1352 }, { "clip_ratio": 0.0041747684590518475, "clipped_completions_ratio": 0.0357142857142857, "epoch": 0.473407977606718, "grad_norm": 0.7168864838565705, "kl": 1.21875, "learning_rate": 9.441927805667493e-06, "loss": 0.0094, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 180.83929443359375, "mean_terminated_completion_length": 178.05555725097656, "min_completion_length": 117.0, "min_terminated_completion_length": 117.0, "num_tokens": 786644.0, "reward": 2.5804409980773926, "reward_std": 0.16985104978084564, "rewards/check_originality_func/mean": 0.6428571343421936, "rewards/check_originality_func/std": 0.48349377512931824, "rewards/check_winston_local_func/mean": 0.9865633249282837, "rewards/check_winston_local_func/std": 0.0071238018572330475, "rewards/sentence_count_match_reward_logic/mean": 0.951020359992981, "rewards/sentence_count_match_reward_logic/std": 0.07964937388896942, "step": 1353 }, { "clip_ratio": 0.00875094998627901, "epoch": 0.4737578726382085, "grad_norm": 0.5786658923725011, "kl": 1.21875, "learning_rate": 9.440525053446092e-06, "loss": 0.0065, "step": 1354 }, { "clip_ratio": 0.022995656356215477, "epoch": 0.4741077676696991, "grad_norm": 0.4504493986376824, "kl": 1.21875, "learning_rate": 9.439120644950875e-06, "loss": 0.0043, "step": 1355 }, { "clip_ratio": 0.034613173454999924, "epoch": 0.47445766270118966, "grad_norm": 0.39407006354157725, "kl": 1.2265625, "learning_rate": 9.437714580705671e-06, "loss": 0.0029, "step": 1356 }, { "clip_ratio": 0.003572471672669053, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.4748075577326802, "grad_norm": 0.6417276673364549, "kl": 1.0625, "learning_rate": 9.436306861234926e-06, "loss": 0.0134, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 194.44644165039062, "mean_terminated_completion_length": 184.1875, "min_completion_length": 133.0, "min_terminated_completion_length": 133.0, "num_tokens": 810125.0, "reward": 2.6470279693603516, "reward_std": 0.20109061896800995, "rewards/check_originality_func/mean": 0.6964285969734192, "rewards/check_originality_func/std": 0.4639608860015869, "rewards/check_winston_local_func/mean": 0.9898850321769714, "rewards/check_winston_local_func/std": 0.0031978818587958813, "rewards/sentence_count_match_reward_logic/mean": 0.9607142806053162, "rewards/sentence_count_match_reward_logic/std": 0.06885214895009995, "step": 1357 }, { "clip_ratio": 0.007921296171844006, "epoch": 0.47515745276417076, "grad_norm": 0.5568856334823931, "kl": 1.0703125, "learning_rate": 9.43489748706371e-06, "loss": 0.0113, "step": 1358 }, { "clip_ratio": 0.01946430280804634, "epoch": 0.4755073477956613, "grad_norm": 0.45094552026967905, "kl": 1.078125, "learning_rate": 9.433486458717705e-06, "loss": 0.009, "step": 1359 }, { "clip_ratio": 0.03241975978016853, "epoch": 0.47585724282715186, "grad_norm": 0.3920172038970689, "kl": 1.078125, "learning_rate": 9.43207377672321e-06, "loss": 0.0071, "step": 1360 }, { "clip_ratio": 0.006052381359040737, "clipped_completions_ratio": 0.0, "epoch": 0.4762071378586424, "grad_norm": 0.7243306910024909, "kl": 1.375, "learning_rate": 9.430659441607145e-06, "loss": 0.0182, "max_completion_length": 248.0, "max_terminated_completion_length": 248.0, "mean_completion_length": 166.32144165039062, "mean_terminated_completion_length": 166.32144165039062, "min_completion_length": 110.0, "min_terminated_completion_length": 110.0, "num_tokens": 830559.0, "reward": 2.7162258625030518, "reward_std": 0.298006147146225, "rewards/check_originality_func/mean": 0.7678571343421936, "rewards/check_originality_func/std": 0.4260205626487732, "rewards/check_winston_local_func/mean": 0.982459545135498, "rewards/check_winston_local_func/std": 0.014816895127296448, "rewards/sentence_count_match_reward_logic/mean": 0.9659090638160706, "rewards/sentence_count_match_reward_logic/std": 0.06629623472690582, "step": 1361 }, { "clip_ratio": 0.0100498516112566, "epoch": 0.47655703289013296, "grad_norm": 0.568823992741433, "kl": 1.3828125, "learning_rate": 9.429243453897039e-06, "loss": 0.0148, "step": 1362 }, { "clip_ratio": 0.024077530950307846, "epoch": 0.47690692792162354, "grad_norm": 0.4744537243542723, "kl": 1.390625, "learning_rate": 9.427825814121047e-06, "loss": 0.0125, "step": 1363 }, { "clip_ratio": 0.03367314487695694, "epoch": 0.47725682295311406, "grad_norm": 0.3891600151717495, "kl": 1.3828125, "learning_rate": 9.426406522807932e-06, "loss": 0.0109, "step": 1364 }, { "clip_ratio": 0.0043685720302164555, "clipped_completions_ratio": 0.25, "epoch": 0.47760671798460463, "grad_norm": 0.6804786808281096, "kl": 1.3125, "learning_rate": 9.424985580487075e-06, "loss": 0.0125, "max_completion_length": 256.0, "max_terminated_completion_length": 249.0, "mean_completion_length": 193.0357208251953, "mean_terminated_completion_length": 172.04762268066406, "min_completion_length": 95.0, "min_terminated_completion_length": 95.0, "num_tokens": 853873.0, "reward": 2.4968819618225098, "reward_std": 0.32764649391174316, "rewards/check_originality_func/mean": 0.5357142686843872, "rewards/check_originality_func/std": 0.5032362937927246, "rewards/check_winston_local_func/mean": 0.9888883829116821, "rewards/check_winston_local_func/std": 0.004455724731087685, "rewards/sentence_count_match_reward_logic/mean": 0.972278892993927, "rewards/sentence_count_match_reward_logic/std": 0.05541476234793663, "step": 1365 }, { "clip_ratio": 0.00925582367926836, "epoch": 0.47795661301609516, "grad_norm": 0.5518886083540722, "kl": 1.3125, "learning_rate": 9.423562987688479e-06, "loss": 0.0104, "step": 1366 }, { "clip_ratio": 0.02306811884045601, "epoch": 0.47830650804758573, "grad_norm": 0.4291994460635828, "kl": 1.3203125, "learning_rate": 9.422138744942753e-06, "loss": 0.0077, "step": 1367 }, { "clip_ratio": 0.03930608183145523, "epoch": 0.47865640307907625, "grad_norm": 0.42240222880393113, "kl": 1.328125, "learning_rate": 9.420712852781129e-06, "loss": 0.0059, "step": 1368 }, { "clip_ratio": 0.004101386293768883, "clipped_completions_ratio": 0.1071428571428571, "epoch": 0.47900629811056683, "grad_norm": 0.6846977965596028, "kl": 1.375, "learning_rate": 9.419285311735449e-06, "loss": 0.0114, "max_completion_length": 256.0, "max_terminated_completion_length": 251.0, "mean_completion_length": 172.42857360839844, "mean_terminated_completion_length": 162.39999389648438, "min_completion_length": 99.0, "min_terminated_completion_length": 99.0, "num_tokens": 874369.0, "reward": 2.826442241668701, "reward_std": 0.13908828794956207, "rewards/check_originality_func/mean": 0.8392857313156128, "rewards/check_originality_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.9916208982467651, "rewards/check_winston_local_func/std": 0.005402315873652697, "rewards/sentence_count_match_reward_logic/mean": 0.9955357313156128, "rewards/sentence_count_match_reward_logic/std": 0.033407654613256454, "step": 1369 }, { "clip_ratio": 0.011546188965439796, "epoch": 0.4793561931420574, "grad_norm": 0.5926680479997174, "kl": 1.375, "learning_rate": 9.417856122338174e-06, "loss": 0.0083, "step": 1370 }, { "clip_ratio": 0.02504243701696396, "epoch": 0.47970608817354793, "grad_norm": 0.5222399912499958, "kl": 1.375, "learning_rate": 9.416425285122376e-06, "loss": 0.0056, "step": 1371 }, { "clip_ratio": 0.03869848698377609, "epoch": 0.4800559832050385, "grad_norm": 0.43175993497592247, "kl": 1.3828125, "learning_rate": 9.414992800621749e-06, "loss": 0.0037, "step": 1372 }, { "clip_ratio": 0.006825873162597418, "clipped_completions_ratio": 0.0, "epoch": 0.480405878236529, "grad_norm": 0.8396611385089447, "kl": 1.4609375, "learning_rate": 9.41355866937059e-06, "loss": 0.0207, "max_completion_length": 249.0, "max_terminated_completion_length": 249.0, "mean_completion_length": 160.33929443359375, "mean_terminated_completion_length": 160.33929443359375, "min_completion_length": 81.0, "min_terminated_completion_length": 81.0, "num_tokens": 894140.0, "reward": 2.9393317699432373, "reward_std": 0.07409761846065521, "rewards/check_originality_func/mean": 0.9821428656578064, "rewards/check_originality_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9886513948440552, "rewards/check_winston_local_func/std": 0.012845483608543873, "rewards/sentence_count_match_reward_logic/mean": 0.968537449836731, "rewards/sentence_count_match_reward_logic/std": 0.06674502789974213, "step": 1373 }, { "clip_ratio": 0.01781044341623783, "epoch": 0.4807557732680196, "grad_norm": 0.7345723174949004, "kl": 1.4609375, "learning_rate": 9.412122891903819e-06, "loss": 0.0166, "step": 1374 }, { "clip_ratio": 0.031654488295316696, "epoch": 0.4811056682995101, "grad_norm": 0.5018443047253941, "kl": 1.46875, "learning_rate": 9.410685468756966e-06, "loss": 0.0127, "step": 1375 }, { "clip_ratio": 0.0479559563100338, "epoch": 0.4814555633310007, "grad_norm": 0.5077520184604717, "kl": 1.4765625, "learning_rate": 9.409246400466178e-06, "loss": 0.0107, "step": 1376 }, { "clip_ratio": 0.0036418961826711893, "clipped_completions_ratio": 0.0, "epoch": 0.4818054583624913, "grad_norm": 0.7830849217679701, "kl": 1.3828125, "learning_rate": 9.407805687568214e-06, "loss": 0.017, "max_completion_length": 239.0, "max_terminated_completion_length": 239.0, "mean_completion_length": 167.30357360839844, "mean_terminated_completion_length": 167.30357360839844, "min_completion_length": 102.0, "min_terminated_completion_length": 102.0, "num_tokens": 914325.0, "reward": 2.5621399879455566, "reward_std": 0.29893743991851807, "rewards/check_originality_func/mean": 0.6607142686843872, "rewards/check_originality_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.9881601333618164, "rewards/check_winston_local_func/std": 0.008323641493916512, "rewards/sentence_count_match_reward_logic/mean": 0.9132653474807739, "rewards/sentence_count_match_reward_logic/std": 0.11852078884840012, "step": 1377 }, { "clip_ratio": 0.011347892694175243, "epoch": 0.4821553533939818, "grad_norm": 0.6416940994794992, "kl": 1.3828125, "learning_rate": 9.406363330600445e-06, "loss": 0.0134, "step": 1378 }, { "clip_ratio": 0.02790648676455021, "epoch": 0.4825052484254724, "grad_norm": 0.5879243579231662, "kl": 1.390625, "learning_rate": 9.404919330100858e-06, "loss": 0.0096, "step": 1379 }, { "clip_ratio": 0.047405023127794266, "epoch": 0.4828551434569629, "grad_norm": 0.4396951544863993, "kl": 1.3984375, "learning_rate": 9.40347368660805e-06, "loss": 0.0068, "step": 1380 }, { "clip_ratio": 0.004021116998046637, "clipped_completions_ratio": 0.0, "epoch": 0.4832050384884535, "grad_norm": 0.9692590953904896, "kl": 1.484375, "learning_rate": 9.402026400661233e-06, "loss": 0.0174, "max_completion_length": 212.0, "max_terminated_completion_length": 212.0, "mean_completion_length": 133.98214721679688, "mean_terminated_completion_length": 133.98214721679688, "min_completion_length": 63.0, "min_terminated_completion_length": 63.0, "num_tokens": 930956.0, "reward": 2.6256215572357178, "reward_std": 0.1868075430393219, "rewards/check_originality_func/mean": 0.6785714030265808, "rewards/check_originality_func/std": 0.4712514281272888, "rewards/check_winston_local_func/mean": 0.9708596467971802, "rewards/check_winston_local_func/std": 0.07026456296443939, "rewards/sentence_count_match_reward_logic/mean": 0.976190447807312, "rewards/sentence_count_match_reward_logic/std": 0.058848991990089417, "step": 1381 }, { "clip_ratio": 0.01631203666329384, "epoch": 0.483554933519944, "grad_norm": 0.719504480721516, "kl": 1.484375, "learning_rate": 9.40057747280023e-06, "loss": 0.0142, "step": 1382 }, { "clip_ratio": 0.03469136357307434, "epoch": 0.4839048285514346, "grad_norm": 0.6759129581332399, "kl": 1.4921875, "learning_rate": 9.39912690356548e-06, "loss": 0.0113, "step": 1383 }, { "clip_ratio": 0.04884995147585869, "epoch": 0.4842547235829251, "grad_norm": 0.43902487184317435, "kl": 1.515625, "learning_rate": 9.39767469349803e-06, "loss": 0.0096, "step": 1384 }, { "clip_ratio": 0.004530398640781641, "clipped_completions_ratio": 0.0, "epoch": 0.48460461861441567, "grad_norm": 0.766316761509495, "kl": 1.25, "learning_rate": 9.396220843139538e-06, "loss": 0.0085, "max_completion_length": 246.0, "max_terminated_completion_length": 246.0, "mean_completion_length": 181.2678680419922, "mean_terminated_completion_length": 181.2678680419922, "min_completion_length": 103.0, "min_terminated_completion_length": 103.0, "num_tokens": 952603.0, "reward": 2.8849685192108154, "reward_std": 0.18366782367229462, "rewards/check_originality_func/mean": 0.9285714030265808, "rewards/check_originality_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9717031121253967, "rewards/check_winston_local_func/std": 0.044578347355127335, "rewards/sentence_count_match_reward_logic/mean": 0.9846938848495483, "rewards/sentence_count_match_reward_logic/std": 0.044584840536117554, "step": 1385 }, { "clip_ratio": 0.011788149364292622, "epoch": 0.48495451364590625, "grad_norm": 0.6268287417349827, "kl": 1.265625, "learning_rate": 9.39476535303228e-06, "loss": 0.0056, "step": 1386 }, { "clip_ratio": 0.023844558745622635, "epoch": 0.48530440867739677, "grad_norm": 0.5848960801037895, "kl": 1.2734375, "learning_rate": 9.393308223719139e-06, "loss": 0.0027, "step": 1387 }, { "clip_ratio": 0.035454846918582916, "epoch": 0.48565430370888735, "grad_norm": 0.42322740767996797, "kl": 1.265625, "learning_rate": 9.39184945574361e-06, "loss": 0.0003, "step": 1388 }, { "clip_ratio": 0.005325273610651493, "clipped_completions_ratio": 0.0, "epoch": 0.48600419874037787, "grad_norm": 0.8705842798211832, "kl": 1.5859375, "learning_rate": 9.3903890496498e-06, "loss": 0.0127, "max_completion_length": 224.0, "max_terminated_completion_length": 224.0, "mean_completion_length": 143.25, "mean_terminated_completion_length": 143.25, "min_completion_length": 85.0, "min_terminated_completion_length": 85.0, "num_tokens": 970273.0, "reward": 2.622519016265869, "reward_std": 0.1872907280921936, "rewards/check_originality_func/mean": 0.6607142686843872, "rewards/check_originality_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.9618043899536133, "rewards/check_winston_local_func/std": 0.058640819042921066, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1389 }, { "clip_ratio": 0.016721010208129883, "epoch": 0.48635409377186845, "grad_norm": 0.6724263844164737, "kl": 1.5859375, "learning_rate": 9.388927005982427e-06, "loss": 0.0091, "step": 1390 }, { "clip_ratio": 0.03477465733885765, "epoch": 0.48670398880335897, "grad_norm": 0.541148810205413, "kl": 1.5859375, "learning_rate": 9.387463325286822e-06, "loss": 0.0056, "step": 1391 }, { "clip_ratio": 0.050720419734716415, "epoch": 0.48705388383484954, "grad_norm": 0.4752550059731569, "kl": 1.59375, "learning_rate": 9.385998008108917e-06, "loss": 0.0045, "step": 1392 }, { "clip_ratio": 0.004530536942183971, "clipped_completions_ratio": 0.0, "epoch": 0.4874037788663401, "grad_norm": 0.6736241230959708, "kl": 1.21875, "learning_rate": 9.384531054995267e-06, "loss": 0.0102, "max_completion_length": 224.0, "max_terminated_completion_length": 224.0, "mean_completion_length": 173.25001525878906, "mean_terminated_completion_length": 173.25001525878906, "min_completion_length": 113.0, "min_terminated_completion_length": 113.0, "num_tokens": 991439.0, "reward": 2.7887961864471436, "reward_std": 0.17834682762622833, "rewards/check_originality_func/mean": 0.8214285969734192, "rewards/check_originality_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.9739149212837219, "rewards/check_winston_local_func/std": 0.08023788034915924, "rewards/sentence_count_match_reward_logic/mean": 0.9934523701667786, "rewards/sentence_count_match_reward_logic/std": 0.03447712957859039, "step": 1393 }, { "clip_ratio": 0.007348640356212854, "epoch": 0.48775367389783064, "grad_norm": 0.5731789891238873, "kl": 1.21875, "learning_rate": 9.383062466493029e-06, "loss": 0.008, "step": 1394 }, { "clip_ratio": 0.020098235458135605, "epoch": 0.4881035689293212, "grad_norm": 0.5424672551017449, "kl": 1.21875, "learning_rate": 9.381592243149976e-06, "loss": 0.0052, "step": 1395 }, { "clip_ratio": 0.036357998847961426, "epoch": 0.48845346396081174, "grad_norm": 0.3959857923139987, "kl": 1.21875, "learning_rate": 9.380120385514484e-06, "loss": 0.0028, "step": 1396 }, { "clip_ratio": 0.003939241170883179, "clipped_completions_ratio": 0.1964285714285714, "epoch": 0.4888033589923023, "grad_norm": 0.6384550328793578, "kl": 1.140625, "learning_rate": 9.378646894135544e-06, "loss": 0.0028, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 204.55357360839844, "mean_terminated_completion_length": 191.977783203125, "min_completion_length": 126.0, "min_terminated_completion_length": 126.0, "num_tokens": 1016830.0, "reward": 2.690425395965576, "reward_std": 0.3482080399990082, "rewards/check_originality_func/mean": 0.7857142686843872, "rewards/check_originality_func/std": 0.41403928399086, "rewards/check_winston_local_func/mean": 0.9775851964950562, "rewards/check_winston_local_func/std": 0.06275302171707153, "rewards/sentence_count_match_reward_logic/mean": 0.9271258115768433, "rewards/sentence_count_match_reward_logic/std": 0.14766104519367218, "step": 1397 }, { "clip_ratio": 0.010631696321070194, "epoch": 0.48915325402379284, "grad_norm": 0.5752798213380272, "kl": 1.140625, "learning_rate": 9.377171769562751e-06, "loss": 0.0002, "step": 1398 }, { "clip_ratio": 0.023573165759444237, "epoch": 0.4895031490552834, "grad_norm": 0.5247740988565089, "kl": 1.140625, "learning_rate": 9.375695012346319e-06, "loss": -0.0022, "step": 1399 }, { "clip_ratio": 0.03351950645446777, "epoch": 0.489853044086774, "grad_norm": 0.4117267036093235, "kl": 1.140625, "learning_rate": 9.374216623037057e-06, "loss": -0.0048, "step": 1400 }, { "clip_ratio": 0.003518729005008936, "clipped_completions_ratio": 0.1607142857142857, "epoch": 0.4902029391182645, "grad_norm": 0.8077832754346402, "kl": 1.328125, "learning_rate": 9.372736602186396e-06, "loss": 0.0129, "max_completion_length": 256.0, "max_terminated_completion_length": 246.0, "mean_completion_length": 171.85714721679688, "mean_terminated_completion_length": 155.7446746826172, "min_completion_length": 82.0, "min_terminated_completion_length": 82.0, "num_tokens": 1037782.0, "reward": 2.580191135406494, "reward_std": 0.246234729886055, "rewards/check_originality_func/mean": 0.625, "rewards/check_originality_func/std": 0.48850420117378235, "rewards/check_winston_local_func/mean": 0.9888643026351929, "rewards/check_winston_local_func/std": 0.005151070654392242, "rewards/sentence_count_match_reward_logic/mean": 0.9663265347480774, "rewards/sentence_count_match_reward_logic/std": 0.07352399080991745, "step": 1401 }, { "clip_ratio": 0.012066296301782131, "epoch": 0.4905528341497551, "grad_norm": 0.6819958274064147, "kl": 1.3359375, "learning_rate": 9.371254950346366e-06, "loss": 0.0097, "step": 1402 }, { "clip_ratio": 0.03126336261630058, "epoch": 0.4909027291812456, "grad_norm": 0.48472546707742814, "kl": 1.34375, "learning_rate": 9.369771668069612e-06, "loss": 0.0056, "step": 1403 }, { "clip_ratio": 0.05108288303017616, "epoch": 0.4912526242127362, "grad_norm": 0.4190778449955663, "kl": 1.34375, "learning_rate": 9.368286755909383e-06, "loss": 0.0035, "step": 1404 }, { "clip_ratio": 0.005661581642925739, "clipped_completions_ratio": 0.0, "epoch": 0.4916025192442267, "grad_norm": 0.9512467942770071, "kl": 1.5078125, "learning_rate": 9.366800214419536e-06, "loss": 0.0136, "max_completion_length": 200.0, "max_terminated_completion_length": 200.0, "mean_completion_length": 129.2678680419922, "mean_terminated_completion_length": 129.2678680419922, "min_completion_length": 91.0, "min_terminated_completion_length": 91.0, "num_tokens": 1053821.0, "reward": 2.593280076980591, "reward_std": 0.20178773999214172, "rewards/check_originality_func/mean": 0.6428571343421936, "rewards/check_originality_func/std": 0.48349377512931824, "rewards/check_winston_local_func/mean": 0.9846066832542419, "rewards/check_winston_local_func/std": 0.02565445564687252, "rewards/sentence_count_match_reward_logic/mean": 0.9658163785934448, "rewards/sentence_count_match_reward_logic/std": 0.07091652601957321, "step": 1405 }, { "clip_ratio": 0.016918659210205078, "epoch": 0.4919524142757173, "grad_norm": 0.6594174819787052, "kl": 1.5078125, "learning_rate": 9.36531204415454e-06, "loss": 0.0091, "step": 1406 }, { "clip_ratio": 0.03670854866504669, "epoch": 0.49230230930720786, "grad_norm": 0.49481090824801954, "kl": 1.515625, "learning_rate": 9.363822245669462e-06, "loss": 0.0068, "step": 1407 }, { "clip_ratio": 0.047799400985240936, "epoch": 0.4926522043386984, "grad_norm": 0.42561050829445435, "kl": 1.5234375, "learning_rate": 9.362330819519991e-06, "loss": 0.0053, "step": 1408 }, { "clip_ratio": 0.0038009854033589363, "clipped_completions_ratio": 0.1071428571428571, "epoch": 0.49300209937018896, "grad_norm": 0.7163774254137132, "kl": 1.265625, "learning_rate": 9.36083776626241e-06, "loss": 0.0127, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 181.2857208251953, "mean_terminated_completion_length": 172.3199920654297, "min_completion_length": 140.0, "min_terminated_completion_length": 140.0, "num_tokens": 1076061.0, "reward": 2.728226900100708, "reward_std": 0.2851127088069916, "rewards/check_originality_func/mean": 0.7857142686843872, "rewards/check_originality_func/std": 0.4140393137931824, "rewards/check_winston_local_func/mean": 0.9887496829032898, "rewards/check_winston_local_func/std": 0.007458928972482681, "rewards/sentence_count_match_reward_logic/mean": 0.9537627100944519, "rewards/sentence_count_match_reward_logic/std": 0.08930367976427078, "step": 1409 }, { "clip_ratio": 0.010284614749252796, "epoch": 0.4933519944016795, "grad_norm": 0.5705910869332796, "kl": 1.265625, "learning_rate": 9.359343086453613e-06, "loss": 0.0098, "step": 1410 }, { "clip_ratio": 0.024916376918554306, "epoch": 0.49370188943317006, "grad_norm": 0.4734194347879802, "kl": 1.265625, "learning_rate": 9.357846780651105e-06, "loss": 0.0071, "step": 1411 }, { "clip_ratio": 0.03872774541378021, "epoch": 0.4940517844646606, "grad_norm": 0.4008828549190635, "kl": 1.2734375, "learning_rate": 9.356348849412991e-06, "loss": 0.0054, "step": 1412 }, { "clip_ratio": 0.005494688171893358, "clipped_completions_ratio": 0.0, "epoch": 0.49440167949615116, "grad_norm": 0.8235509579134217, "kl": 1.5, "learning_rate": 9.354849293297987e-06, "loss": 0.0127, "max_completion_length": 194.0, "max_terminated_completion_length": 194.0, "mean_completion_length": 141.0357208251953, "mean_terminated_completion_length": 141.0357208251953, "min_completion_length": 101.0, "min_terminated_completion_length": 101.0, "num_tokens": 1093535.0, "reward": 2.9005379676818848, "reward_std": 0.14071138203144073, "rewards/check_originality_func/mean": 0.9285714030265808, "rewards/check_originality_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9898233413696289, "rewards/check_winston_local_func/std": 0.0043550594709813595, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.04767312481999397, "step": 1413 }, { "clip_ratio": 0.015563366934657097, "epoch": 0.4947515745276417, "grad_norm": 0.6930641707838632, "kl": 1.5, "learning_rate": 9.353348112865413e-06, "loss": 0.0086, "step": 1414 }, { "clip_ratio": 0.030912183225154877, "epoch": 0.49510146955913226, "grad_norm": 0.4828293759270495, "kl": 1.5, "learning_rate": 9.351845308675193e-06, "loss": 0.0054, "step": 1415 }, { "clip_ratio": 0.04826159402728081, "epoch": 0.49545136459062283, "grad_norm": 0.4518671711586982, "kl": 1.5078125, "learning_rate": 9.350340881287861e-06, "loss": 0.0027, "step": 1416 }, { "clip_ratio": 0.004885665141046047, "clipped_completions_ratio": 0.0, "epoch": 0.49580125962211335, "grad_norm": 0.9186265037413232, "kl": 1.5390625, "learning_rate": 9.348834831264552e-06, "loss": 0.0023, "max_completion_length": 219.0, "max_terminated_completion_length": 219.0, "mean_completion_length": 157.0, "mean_terminated_completion_length": 157.0, "min_completion_length": 74.0, "min_terminated_completion_length": 74.0, "num_tokens": 1113039.0, "reward": 2.957613945007324, "reward_std": 0.07599345594644547, "rewards/check_originality_func/mean": 0.9821428656578064, "rewards/check_originality_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9869502186775208, "rewards/check_winston_local_func/std": 0.011208808980882168, "rewards/sentence_count_match_reward_logic/mean": 0.9885204434394836, "rewards/sentence_count_match_reward_logic/std": 0.0418572835624218, "step": 1417 }, { "clip_ratio": 0.013121504336595535, "epoch": 0.49615115465360393, "grad_norm": 0.6165765340169541, "kl": 1.5390625, "learning_rate": 9.347327159167013e-06, "loss": -0.0014, "step": 1418 }, { "clip_ratio": 0.027954379096627235, "epoch": 0.49650104968509445, "grad_norm": 0.4585534733188691, "kl": 1.5390625, "learning_rate": 9.345817865557584e-06, "loss": -0.004, "step": 1419 }, { "clip_ratio": 0.040822792798280716, "epoch": 0.49685094471658503, "grad_norm": 0.42389915100023223, "kl": 1.5390625, "learning_rate": 9.344306950999226e-06, "loss": -0.0056, "step": 1420 }, { "clip_ratio": 0.003152207238599658, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.49720083974807555, "grad_norm": 0.7870119954237691, "kl": 1.21875, "learning_rate": 9.342794416055489e-06, "loss": 0.0172, "max_completion_length": 256.0, "max_terminated_completion_length": 188.0, "mean_completion_length": 189.42857360839844, "mean_terminated_completion_length": 162.8000030517578, "min_completion_length": 135.0, "min_terminated_completion_length": 135.0, "num_tokens": 1137151.0, "reward": 2.439150333404541, "reward_std": 0.25542086362838745, "rewards/check_originality_func/mean": 0.4821428656578064, "rewards/check_originality_func/std": 0.5042031407356262, "rewards/check_winston_local_func/mean": 0.988576352596283, "rewards/check_winston_local_func/std": 0.003963882103562355, "rewards/sentence_count_match_reward_logic/mean": 0.9684311151504517, "rewards/sentence_count_match_reward_logic/std": 0.05808786675333977, "step": 1421 }, { "clip_ratio": 0.011629407294094563, "epoch": 0.49755073477956613, "grad_norm": 0.6275235174210525, "kl": 1.2265625, "learning_rate": 9.341280261290537e-06, "loss": 0.0134, "step": 1422 }, { "clip_ratio": 0.030694596469402313, "epoch": 0.4979006298110567, "grad_norm": 0.540229476742146, "kl": 1.234375, "learning_rate": 9.339764487269133e-06, "loss": 0.01, "step": 1423 }, { "clip_ratio": 0.048162318766117096, "epoch": 0.4982505248425472, "grad_norm": 0.42645504316244465, "kl": 1.234375, "learning_rate": 9.338247094556651e-06, "loss": 0.0076, "step": 1424 }, { "clip_ratio": 0.005130842328071594, "clipped_completions_ratio": 0.0535714285714286, "epoch": 0.4986004198740378, "grad_norm": 0.7427838951306649, "kl": 1.2109375, "learning_rate": 9.33672808371906e-06, "loss": 0.0115, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 181.96429443359375, "mean_terminated_completion_length": 177.77359008789062, "min_completion_length": 126.0, "min_terminated_completion_length": 126.0, "num_tokens": 1159269.0, "reward": 2.831312656402588, "reward_std": 0.20752254128456116, "rewards/check_originality_func/mean": 0.8571428656578064, "rewards/check_originality_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9848838448524475, "rewards/check_winston_local_func/std": 0.01349086593836546, "rewards/sentence_count_match_reward_logic/mean": 0.9892857670783997, "rewards/sentence_count_match_reward_logic/std": 0.045441556721925735, "step": 1425 }, { "clip_ratio": 0.014679042622447014, "epoch": 0.4989503149055283, "grad_norm": 0.6113613439918761, "kl": 1.2109375, "learning_rate": 9.33520745532294e-06, "loss": 0.0089, "step": 1426 }, { "clip_ratio": 0.02858438529074192, "epoch": 0.4993002099370189, "grad_norm": 0.5881353669515371, "kl": 1.2109375, "learning_rate": 9.333685209935465e-06, "loss": 0.0059, "step": 1427 }, { "clip_ratio": 0.040182698518037796, "epoch": 0.4996501049685094, "grad_norm": 0.42841147613325514, "kl": 1.21875, "learning_rate": 9.332161348124426e-06, "loss": 0.0042, "step": 1428 }, { "clip_ratio": 0.00294986879453063, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.5, "grad_norm": 0.6606303131157211, "kl": 1.125, "learning_rate": 9.330635870458205e-06, "loss": 0.0128, "max_completion_length": 256.0, "max_terminated_completion_length": 245.0, "mean_completion_length": 182.60714721679688, "mean_terminated_completion_length": 170.375, "min_completion_length": 83.0, "min_terminated_completion_length": 83.0, "num_tokens": 1181623.0, "reward": 2.8451905250549316, "reward_std": 0.09242739528417587, "rewards/check_originality_func/mean": 0.8928571343421936, "rewards/check_originality_func/std": 0.31209391355514526, "rewards/check_winston_local_func/mean": 0.9594760537147522, "rewards/check_winston_local_func/std": 0.0600665882229805, "rewards/sentence_count_match_reward_logic/mean": 0.9928570985794067, "rewards/sentence_count_match_reward_logic/std": 0.03745126724243164, "step": 1429 }, { "clip_ratio": 0.009517277590930462, "epoch": 0.5003498950314905, "grad_norm": 0.5155403387077422, "kl": 1.1328125, "learning_rate": 9.329108777505788e-06, "loss": 0.0102, "step": 1430 }, { "clip_ratio": 0.01974802277982235, "epoch": 0.5006997900629812, "grad_norm": 0.44553465832789646, "kl": 1.140625, "learning_rate": 9.327580069836773e-06, "loss": 0.0081, "step": 1431 }, { "clip_ratio": 0.03211156278848648, "epoch": 0.5010496850944717, "grad_norm": 0.36051828464803565, "kl": 1.140625, "learning_rate": 9.326049748021348e-06, "loss": 0.0065, "step": 1432 }, { "clip_ratio": 0.004453746136277914, "clipped_completions_ratio": 0.0, "epoch": 0.5013995801259622, "grad_norm": 0.9891611152794066, "kl": 1.59375, "learning_rate": 9.32451781263031e-06, "loss": 0.0161, "max_completion_length": 245.0, "max_terminated_completion_length": 245.0, "mean_completion_length": 160.05357360839844, "mean_terminated_completion_length": 160.05357360839844, "min_completion_length": 67.0, "min_terminated_completion_length": 67.0, "num_tokens": 1201130.0, "reward": 2.7230632305145264, "reward_std": 0.18728001415729523, "rewards/check_originality_func/mean": 0.75, "rewards/check_originality_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.9873486161231995, "rewards/check_winston_local_func/std": 0.012289319187402725, "rewards/sentence_count_match_reward_logic/mean": 0.985714316368103, "rewards/sentence_count_match_reward_logic/std": 0.051974017173051834, "step": 1433 }, { "clip_ratio": 0.013716035522520542, "epoch": 0.5017494751574527, "grad_norm": 2.0808002583806244, "kl": 1.59375, "learning_rate": 9.322984264235055e-06, "loss": 0.014, "step": 1434 }, { "clip_ratio": 0.028997980058193207, "epoch": 0.5020993701889434, "grad_norm": 13.043354402198231, "kl": 3.109375, "learning_rate": 9.321449103407587e-06, "loss": 0.0241, "step": 1435 }, { "clip_ratio": 0.03770667314529419, "epoch": 0.5024492652204339, "grad_norm": 1.613826604238982, "kl": 1.796875, "learning_rate": 9.319912330720502e-06, "loss": 0.0093, "step": 1436 }, { "clip_ratio": 0.005521066021174192, "clipped_completions_ratio": 0.0, "epoch": 0.5027991602519244, "grad_norm": 0.9264485179252486, "kl": 1.671875, "learning_rate": 9.318373946747e-06, "loss": 0.0216, "max_completion_length": 246.0, "max_terminated_completion_length": 246.0, "mean_completion_length": 143.5357208251953, "mean_terminated_completion_length": 143.5357208251953, "min_completion_length": 98.0, "min_terminated_completion_length": 98.0, "num_tokens": 1218544.0, "reward": 2.896437168121338, "reward_std": 0.08471586555242538, "rewards/check_originality_func/mean": 0.9107142686843872, "rewards/check_originality_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9892939329147339, "rewards/check_winston_local_func/std": 0.009619462303817272, "rewards/sentence_count_match_reward_logic/mean": 0.9964285492897034, "rewards/sentence_count_match_reward_logic/std": 0.026726121082901955, "step": 1437 }, { "clip_ratio": 0.013302263803780079, "epoch": 0.503149055283415, "grad_norm": 0.7101853135104974, "kl": 1.671875, "learning_rate": 9.316833952060891e-06, "loss": 0.0179, "step": 1438 }, { "clip_ratio": 0.037099748849868774, "epoch": 0.5034989503149055, "grad_norm": 0.5730601712173613, "kl": 1.671875, "learning_rate": 9.315292347236571e-06, "loss": 0.0141, "step": 1439 }, { "clip_ratio": 0.05295026674866676, "epoch": 0.5038488453463961, "grad_norm": 0.4744171421383945, "kl": 1.6875, "learning_rate": 9.313749132849048e-06, "loss": 0.0122, "step": 1440 }, { "clip_ratio": 0.0044950516894459724, "clipped_completions_ratio": 0.0535714285714286, "epoch": 0.5041987403778866, "grad_norm": 0.8030585467092205, "kl": 1.3671875, "learning_rate": 9.312204309473924e-06, "loss": 0.016, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 169.73214721679688, "mean_terminated_completion_length": 164.84906005859375, "min_completion_length": 105.0, "min_terminated_completion_length": 105.0, "num_tokens": 1239241.0, "reward": 2.7164804935455322, "reward_std": 0.31750982999801636, "rewards/check_originality_func/mean": 0.7321428656578064, "rewards/check_originality_func/std": 0.44685041904449463, "rewards/check_winston_local_func/mean": 0.986888587474823, "rewards/check_winston_local_func/std": 0.008252541534602642, "rewards/sentence_count_match_reward_logic/mean": 0.9974489808082581, "rewards/sentence_count_match_reward_logic/std": 0.019090088084340096, "step": 1441 }, { "clip_ratio": 0.012187430635094643, "epoch": 0.5045486354093772, "grad_norm": 0.6840638760974918, "kl": 1.3671875, "learning_rate": 9.310657877687406e-06, "loss": 0.0128, "step": 1442 }, { "clip_ratio": 0.02995995432138443, "epoch": 0.5048985304408677, "grad_norm": 0.4925234963628876, "kl": 1.375, "learning_rate": 9.309109838066297e-06, "loss": 0.0099, "step": 1443 }, { "clip_ratio": 0.04540598765015602, "epoch": 0.5052484254723583, "grad_norm": 0.4772296313098869, "kl": 1.3828125, "learning_rate": 9.307560191188e-06, "loss": 0.0081, "step": 1444 }, { "clip_ratio": 0.004167091101408005, "clipped_completions_ratio": 0.0, "epoch": 0.5055983205038489, "grad_norm": 0.8346844944913436, "kl": 1.4453125, "learning_rate": 9.306008937630523e-06, "loss": 0.0141, "max_completion_length": 200.0, "max_terminated_completion_length": 200.0, "mean_completion_length": 160.25, "mean_terminated_completion_length": 160.25, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 1259063.0, "reward": 2.7971129417419434, "reward_std": 0.01663622073829174, "rewards/check_originality_func/mean": 0.8571428656578064, "rewards/check_originality_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.991628110408783, "rewards/check_winston_local_func/std": 0.0021461769938468933, "rewards/sentence_count_match_reward_logic/mean": 0.9483418464660645, "rewards/sentence_count_match_reward_logic/std": 0.12356216460466385, "step": 1445 }, { "clip_ratio": 0.012172210961580276, "epoch": 0.5059482155353394, "grad_norm": 0.6422337109109187, "kl": 1.4453125, "learning_rate": 9.304456077972463e-06, "loss": 0.01, "step": 1446 }, { "clip_ratio": 0.029804524034261703, "epoch": 0.5062981105668299, "grad_norm": 0.5056677369525304, "kl": 1.4453125, "learning_rate": 9.302901612793028e-06, "loss": 0.0066, "step": 1447 }, { "clip_ratio": 0.04307251796126366, "epoch": 0.5066480055983205, "grad_norm": 0.44301407804066184, "kl": 1.4375, "learning_rate": 9.301345542672012e-06, "loss": 0.005, "step": 1448 }, { "clip_ratio": 0.005817108787596226, "clipped_completions_ratio": 0.0, "epoch": 0.5069979006298111, "grad_norm": 0.8304735950740306, "kl": 1.59375, "learning_rate": 9.299787868189821e-06, "loss": 0.0121, "max_completion_length": 221.0, "max_terminated_completion_length": 221.0, "mean_completion_length": 143.71429443359375, "mean_terminated_completion_length": 143.71429443359375, "min_completion_length": 80.0, "min_terminated_completion_length": 80.0, "num_tokens": 1276735.0, "reward": 2.7951323986053467, "reward_std": 0.26261115074157715, "rewards/check_originality_func/mean": 0.8214285969734192, "rewards/check_originality_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.9915607571601868, "rewards/check_winston_local_func/std": 0.0021742191165685654, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.05754726752638817, "step": 1449 }, { "clip_ratio": 0.013011666014790535, "epoch": 0.5073477956613016, "grad_norm": 0.5800389320836988, "kl": 1.59375, "learning_rate": 9.298228589927446e-06, "loss": 0.0083, "step": 1450 }, { "clip_ratio": 0.028852086514234543, "epoch": 0.5076976906927921, "grad_norm": 0.4563609765969615, "kl": 1.59375, "learning_rate": 9.29666770846649e-06, "loss": 0.0053, "step": 1451 }, { "clip_ratio": 0.04148577153682709, "epoch": 0.5080475857242828, "grad_norm": 0.46633875140061304, "kl": 1.59375, "learning_rate": 9.295105224389144e-06, "loss": 0.0041, "step": 1452 }, { "clip_ratio": 0.0059488811530172825, "clipped_completions_ratio": 0.0892857142857143, "epoch": 0.5083974807557733, "grad_norm": 0.9719462901643148, "kl": 1.4765625, "learning_rate": 9.293541138278199e-06, "loss": 0.0152, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 170.6607208251953, "mean_terminated_completion_length": 162.29412841796875, "min_completion_length": 91.0, "min_terminated_completion_length": 91.0, "num_tokens": 1297244.0, "reward": 2.7482359409332275, "reward_std": 0.23402626812458038, "rewards/check_originality_func/mean": 0.7678571343421936, "rewards/check_originality_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.9891582131385803, "rewards/check_winston_local_func/std": 0.00858743954449892, "rewards/sentence_count_match_reward_logic/mean": 0.9912201762199402, "rewards/sentence_count_match_reward_logic/std": 0.0379200279712677, "step": 1453 }, { "clip_ratio": 0.016879448667168617, "epoch": 0.5087473757872638, "grad_norm": 0.724678001833337, "kl": 1.4765625, "learning_rate": 9.291975450717043e-06, "loss": 0.0105, "step": 1454 }, { "clip_ratio": 0.03793760761618614, "epoch": 0.5090972708187543, "grad_norm": 0.5360337063968276, "kl": 1.484375, "learning_rate": 9.290408162289668e-06, "loss": 0.0073, "step": 1455 }, { "clip_ratio": 0.05230187252163887, "epoch": 0.509447165850245, "grad_norm": 0.5235221065268358, "kl": 1.4921875, "learning_rate": 9.288839273580652e-06, "loss": 0.0053, "step": 1456 }, { "clip_ratio": 0.006877355743199587, "clipped_completions_ratio": 0.0, "epoch": 0.5097970608817355, "grad_norm": 0.8969599625886013, "kl": 1.40625, "learning_rate": 9.28726878517518e-06, "loss": 0.0087, "max_completion_length": 185.0, "max_terminated_completion_length": 185.0, "mean_completion_length": 158.60714721679688, "mean_terminated_completion_length": 158.60714721679688, "min_completion_length": 127.0, "min_terminated_completion_length": 127.0, "num_tokens": 1316742.0, "reward": 2.9059884548187256, "reward_std": 0.08391585201025009, "rewards/check_originality_func/mean": 0.9821428656578064, "rewards/check_originality_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9902569651603699, "rewards/check_winston_local_func/std": 0.005321171600371599, "rewards/sentence_count_match_reward_logic/mean": 0.933588445186615, "rewards/sentence_count_match_reward_logic/std": 0.08480722457170486, "step": 1457 }, { "clip_ratio": 0.015680495649576187, "epoch": 0.510146955913226, "grad_norm": 0.7143958430105737, "kl": 1.40625, "learning_rate": 9.285696697659026e-06, "loss": 0.0047, "step": 1458 }, { "clip_ratio": 0.035206060856580734, "epoch": 0.5104968509447166, "grad_norm": 0.6411528388740905, "kl": 1.40625, "learning_rate": 9.284123011618564e-06, "loss": 0.0003, "step": 1459 }, { "clip_ratio": 0.05151151865720749, "epoch": 0.5108467459762072, "grad_norm": 0.5230378302540742, "kl": 1.4140625, "learning_rate": 9.282547727640767e-06, "loss": -0.0026, "step": 1460 }, { "clip_ratio": 0.0059289466589689255, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.5111966410076977, "grad_norm": 0.6600850549284119, "kl": 1.2421875, "learning_rate": 9.280970846313199e-06, "loss": 0.0098, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 197.5178680419922, "mean_terminated_completion_length": 187.77084350585938, "min_completion_length": 122.0, "min_terminated_completion_length": 122.0, "num_tokens": 1341235.0, "reward": 2.812070369720459, "reward_std": 0.1348467767238617, "rewards/check_originality_func/mean": 0.8928571343421936, "rewards/check_originality_func/std": 0.3120938837528229, "rewards/check_winston_local_func/mean": 0.9846892356872559, "rewards/check_winston_local_func/std": 0.018404025584459305, "rewards/sentence_count_match_reward_logic/mean": 0.9345237612724304, "rewards/sentence_count_match_reward_logic/std": 0.13457216322422028, "step": 1461 }, { "clip_ratio": 0.007704389281570911, "epoch": 0.5115465360391882, "grad_norm": 0.6027774207822079, "kl": 1.25, "learning_rate": 9.279392368224022e-06, "loss": 0.0075, "step": 1462 }, { "clip_ratio": 0.021188801154494286, "epoch": 0.5118964310706788, "grad_norm": 0.4800690880959666, "kl": 1.25, "learning_rate": 9.277812293961992e-06, "loss": 0.0043, "step": 1463 }, { "clip_ratio": 0.035752277821302414, "epoch": 0.5122463261021694, "grad_norm": 0.4365213345791739, "kl": 1.2578125, "learning_rate": 9.276230624116464e-06, "loss": 0.0023, "step": 1464 }, { "clip_ratio": 0.0055562760680913925, "clipped_completions_ratio": 0.0892857142857143, "epoch": 0.5125962211336599, "grad_norm": 0.7008885095348244, "kl": 1.2265625, "learning_rate": 9.274647359277386e-06, "loss": 0.0053, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 188.30357360839844, "mean_terminated_completion_length": 181.6666717529297, "min_completion_length": 132.0, "min_terminated_completion_length": 132.0, "num_tokens": 1364100.0, "reward": 2.8745625019073486, "reward_std": 0.13425877690315247, "rewards/check_originality_func/mean": 0.9285714030265808, "rewards/check_originality_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.983328640460968, "rewards/check_winston_local_func/std": 0.018054179847240448, "rewards/sentence_count_match_reward_logic/mean": 0.9626623392105103, "rewards/sentence_count_match_reward_logic/std": 0.07067461311817169, "step": 1465 }, { "clip_ratio": 0.009032802656292915, "epoch": 0.5129461161651504, "grad_norm": 0.6115023047032422, "kl": 1.2265625, "learning_rate": 9.273062500035298e-06, "loss": 0.0027, "step": 1466 }, { "clip_ratio": 0.02329397387802601, "epoch": 0.513296011196641, "grad_norm": 0.469055448005693, "kl": 1.2265625, "learning_rate": 9.27147604698134e-06, "loss": -0.0007, "step": 1467 }, { "clip_ratio": 0.03668951988220215, "epoch": 0.5136459062281316, "grad_norm": 0.3934721175052143, "kl": 1.234375, "learning_rate": 9.269888000707243e-06, "loss": -0.0027, "step": 1468 }, { "clip_ratio": 0.005261340644210577, "clipped_completions_ratio": 0.0, "epoch": 0.5139958012596221, "grad_norm": 0.7280990965619474, "kl": 1.390625, "learning_rate": 9.268298361805333e-06, "loss": 0.0133, "max_completion_length": 240.0, "max_terminated_completion_length": 240.0, "mean_completion_length": 175.2678680419922, "mean_terminated_completion_length": 175.2678680419922, "min_completion_length": 149.0, "min_terminated_completion_length": 149.0, "num_tokens": 1385555.0, "reward": 2.9257287979125977, "reward_std": 0.1437358260154724, "rewards/check_originality_func/mean": 0.9464285969734192, "rewards/check_originality_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9912047386169434, "rewards/check_winston_local_func/std": 0.004251287318766117, "rewards/sentence_count_match_reward_logic/mean": 0.988095223903656, "rewards/sentence_count_match_reward_logic/std": 0.053721532225608826, "step": 1469 }, { "clip_ratio": 0.010916388593614101, "epoch": 0.5143456962911127, "grad_norm": 0.6440013357947247, "kl": 1.390625, "learning_rate": 9.266707130868531e-06, "loss": 0.0105, "step": 1470 }, { "clip_ratio": 0.026537476107478142, "epoch": 0.5146955913226032, "grad_norm": 0.4956228115658322, "kl": 1.390625, "learning_rate": 9.265114308490352e-06, "loss": 0.0065, "step": 1471 }, { "clip_ratio": 0.03923932462930679, "epoch": 0.5150454863540938, "grad_norm": 0.43873336499350984, "kl": 1.3984375, "learning_rate": 9.263519895264901e-06, "loss": 0.0037, "step": 1472 }, { "clip_ratio": 0.005549679975956678, "clipped_completions_ratio": 0.0, "epoch": 0.5153953813855843, "grad_norm": 0.8513388454869447, "kl": 1.4375, "learning_rate": 9.261923891786881e-06, "loss": 0.0082, "max_completion_length": 251.0, "max_terminated_completion_length": 251.0, "mean_completion_length": 171.05357360839844, "mean_terminated_completion_length": 171.05357360839844, "min_completion_length": 92.0, "min_terminated_completion_length": 92.0, "num_tokens": 1406190.0, "reward": 2.8644497394561768, "reward_std": 0.13910835981369019, "rewards/check_originality_func/mean": 0.8928571343421936, "rewards/check_originality_func/std": 0.31209391355514526, "rewards/check_winston_local_func/mean": 0.9899457097053528, "rewards/check_winston_local_func/std": 0.003984814044088125, "rewards/sentence_count_match_reward_logic/mean": 0.9816468358039856, "rewards/sentence_count_match_reward_logic/std": 0.042384978383779526, "step": 1473 }, { "clip_ratio": 0.014496900141239166, "epoch": 0.5157452764170749, "grad_norm": 0.6777644557895867, "kl": 1.4453125, "learning_rate": 9.260326298651586e-06, "loss": 0.0045, "step": 1474 }, { "clip_ratio": 0.030454600229859352, "epoch": 0.5160951714485654, "grad_norm": 0.6224180575239157, "kl": 1.4375, "learning_rate": 9.258727116454901e-06, "loss": 0.0014, "step": 1475 }, { "clip_ratio": 0.04271722584962845, "epoch": 0.516445066480056, "grad_norm": 0.4335320529409752, "kl": 1.4375, "learning_rate": 9.25712634579331e-06, "loss": -0.0012, "step": 1476 }, { "clip_ratio": 0.005454913713037968, "clipped_completions_ratio": 0.0714285714285714, "epoch": 0.5167949615115466, "grad_norm": 1.424837285628361, "kl": 1.921875, "learning_rate": 9.255523987263884e-06, "loss": 0.016, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 134.1607208251953, "mean_terminated_completion_length": 124.78846740722656, "min_completion_length": 70.0, "min_terminated_completion_length": 70.0, "num_tokens": 1422911.0, "reward": 2.838676691055298, "reward_std": 0.23329193890094757, "rewards/check_originality_func/mean": 0.8571428656578064, "rewards/check_originality_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9880812764167786, "rewards/check_winston_local_func/std": 0.008850513957440853, "rewards/sentence_count_match_reward_logic/mean": 0.9934523701667786, "rewards/sentence_count_match_reward_logic/std": 0.03447712957859039, "step": 1477 }, { "clip_ratio": 0.01478518359363079, "epoch": 0.5171448565430371, "grad_norm": 0.7836342243623446, "kl": 1.9140625, "learning_rate": 9.253920041464283e-06, "loss": 0.0112, "step": 1478 }, { "clip_ratio": 0.033867448568344116, "epoch": 0.5174947515745276, "grad_norm": 0.6505319593972854, "kl": 1.9140625, "learning_rate": 9.25231450899277e-06, "loss": 0.009, "step": 1479 }, { "clip_ratio": 0.0472593829035759, "epoch": 0.5178446466060181, "grad_norm": 0.6148143311247064, "kl": 1.953125, "learning_rate": 9.250707390448187e-06, "loss": 0.0072, "step": 1480 }, { "clip_ratio": 0.004880478605628014, "clipped_completions_ratio": 0.3035714285714286, "epoch": 0.5181945416375088, "grad_norm": 0.8142664261729899, "kl": 1.28125, "learning_rate": 9.249098686429983e-06, "loss": 0.0077, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 204.17857360839844, "mean_terminated_completion_length": 181.58975219726562, "min_completion_length": 71.0, "min_terminated_completion_length": 71.0, "num_tokens": 1447329.0, "reward": 2.7225868701934814, "reward_std": 0.2115725576877594, "rewards/check_originality_func/mean": 0.7678571343421936, "rewards/check_originality_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.9874676465988159, "rewards/check_winston_local_func/std": 0.005710007157176733, "rewards/sentence_count_match_reward_logic/mean": 0.9672618508338928, "rewards/sentence_count_match_reward_logic/std": 0.07746276259422302, "step": 1481 }, { "clip_ratio": 0.01185649074614048, "epoch": 0.5185444366689993, "grad_norm": 0.7186655760911294, "kl": 1.2734375, "learning_rate": 9.24748839753818e-06, "loss": 0.0046, "step": 1482 }, { "clip_ratio": 0.03250127658247948, "epoch": 0.5188943317004898, "grad_norm": 0.5735332950354983, "kl": 1.2734375, "learning_rate": 9.245876524373405e-06, "loss": 0.0011, "step": 1483 }, { "clip_ratio": 0.0476921908557415, "epoch": 0.5192442267319805, "grad_norm": 0.4857802481323066, "kl": 1.2734375, "learning_rate": 9.244263067536872e-06, "loss": -0.0017, "step": 1484 }, { "clip_ratio": 0.004942556377500296, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.519594121763471, "grad_norm": 0.8747245470420937, "kl": 1.359375, "learning_rate": 9.242648027630382e-06, "loss": 0.0158, "max_completion_length": 256.0, "max_terminated_completion_length": 215.0, "mean_completion_length": 176.96429443359375, "mean_terminated_completion_length": 163.7916717529297, "min_completion_length": 103.0, "min_terminated_completion_length": 103.0, "num_tokens": 1468591.0, "reward": 2.7218172550201416, "reward_std": 0.21798190474510193, "rewards/check_originality_func/mean": 0.8571428656578064, "rewards/check_originality_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9503035545349121, "rewards/check_winston_local_func/std": 0.10528495162725449, "rewards/sentence_count_match_reward_logic/mean": 0.9143707156181335, "rewards/sentence_count_match_reward_logic/std": 0.1459280103445053, "step": 1485 }, { "clip_ratio": 0.014659041538834572, "epoch": 0.5199440167949615, "grad_norm": 0.6732862398981798, "kl": 1.3359375, "learning_rate": 9.241031405256334e-06, "loss": 0.0122, "step": 1486 }, { "clip_ratio": 0.030539344996213913, "epoch": 0.520293911826452, "grad_norm": 0.5534129280273881, "kl": 1.34375, "learning_rate": 9.239413201017709e-06, "loss": 0.0089, "step": 1487 }, { "clip_ratio": 0.046452198177576065, "epoch": 0.5206438068579426, "grad_norm": 0.4982167822112026, "kl": 1.359375, "learning_rate": 9.237793415518083e-06, "loss": 0.0066, "step": 1488 }, { "clip_ratio": 0.006105160806328058, "clipped_completions_ratio": 0.0, "epoch": 0.5209937018894332, "grad_norm": 0.9027681852976048, "kl": 1.578125, "learning_rate": 9.23617204936162e-06, "loss": 0.0055, "max_completion_length": 251.0, "max_terminated_completion_length": 251.0, "mean_completion_length": 151.625, "mean_terminated_completion_length": 151.625, "min_completion_length": 82.0, "min_terminated_completion_length": 82.0, "num_tokens": 1487034.0, "reward": 2.8447353839874268, "reward_std": 0.18321481347084045, "rewards/check_originality_func/mean": 0.9107142686843872, "rewards/check_originality_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9899736046791077, "rewards/check_winston_local_func/std": 0.006083894520998001, "rewards/sentence_count_match_reward_logic/mean": 0.9440476298332214, "rewards/sentence_count_match_reward_logic/std": 0.08601318299770355, "step": 1489 }, { "clip_ratio": 0.016611037775874138, "epoch": 0.5213435969209237, "grad_norm": 0.623360763861033, "kl": 1.6015625, "learning_rate": 9.234549103153077e-06, "loss": 0.0021, "step": 1490 }, { "clip_ratio": 0.029035862535238266, "epoch": 0.5216934919524143, "grad_norm": 0.6385342210557797, "kl": 1.6328125, "learning_rate": 9.232924577497797e-06, "loss": 0.0002, "step": 1491 }, { "clip_ratio": 0.04268219694495201, "epoch": 0.5220433869839048, "grad_norm": 0.47993792079564435, "kl": 1.5859375, "learning_rate": 9.23129847300171e-06, "loss": -0.0014, "step": 1492 }, { "clip_ratio": 0.005223024170845747, "clipped_completions_ratio": 0.0, "epoch": 0.5223932820153954, "grad_norm": 0.8232148368138872, "kl": 1.3359375, "learning_rate": 9.229670790271339e-06, "loss": 0.0186, "max_completion_length": 223.0, "max_terminated_completion_length": 223.0, "mean_completion_length": 168.08929443359375, "mean_terminated_completion_length": 168.08929443359375, "min_completion_length": 89.0, "min_terminated_completion_length": 89.0, "num_tokens": 1507607.0, "reward": 2.7398011684417725, "reward_std": 0.212740957736969, "rewards/check_originality_func/mean": 0.7678571343421936, "rewards/check_originality_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.9880151748657227, "rewards/check_winston_local_func/std": 0.007480742875486612, "rewards/sentence_count_match_reward_logic/mean": 0.9839285612106323, "rewards/sentence_count_match_reward_logic/std": 0.05202605202794075, "step": 1493 }, { "clip_ratio": 0.013942261226475239, "epoch": 0.5227431770468859, "grad_norm": 0.6378789737347056, "kl": 1.328125, "learning_rate": 9.228041529913794e-06, "loss": 0.0145, "step": 1494 }, { "clip_ratio": 0.026751315221190453, "epoch": 0.5230930720783765, "grad_norm": 0.5174645822837256, "kl": 1.328125, "learning_rate": 9.226410692536773e-06, "loss": 0.0117, "step": 1495 }, { "clip_ratio": 0.044698528945446014, "epoch": 0.523442967109867, "grad_norm": 0.4072282972353224, "kl": 1.328125, "learning_rate": 9.224778278748567e-06, "loss": 0.0098, "step": 1496 }, { "clip_ratio": 0.007186316419392824, "clipped_completions_ratio": 0.017857142857142905, "epoch": 0.5237928621413576, "grad_norm": 1.2055958186339402, "kl": 1.8984375, "learning_rate": 9.223144289158046e-06, "loss": 0.0193, "max_completion_length": 256.0, "max_terminated_completion_length": 251.0, "mean_completion_length": 136.55357360839844, "mean_terminated_completion_length": 134.38182067871094, "min_completion_length": 65.0, "min_terminated_completion_length": 65.0, "num_tokens": 1524342.0, "reward": 2.834303379058838, "reward_std": 0.1999741941690445, "rewards/check_originality_func/mean": 0.8571428656578064, "rewards/check_originality_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9924876093864441, "rewards/check_winston_local_func/std": 0.0020545809529721737, "rewards/sentence_count_match_reward_logic/mean": 0.9846726655960083, "rewards/sentence_count_match_reward_logic/std": 0.05932653695344925, "step": 1497 }, { "clip_ratio": 0.01947002485394478, "epoch": 0.5241427571728482, "grad_norm": 0.8134405355246377, "kl": 1.90625, "learning_rate": 9.221508724374674e-06, "loss": 0.0139, "step": 1498 }, { "clip_ratio": 0.04135635122656822, "epoch": 0.5244926522043387, "grad_norm": 0.7312272671346759, "kl": 1.921875, "learning_rate": 9.219871585008503e-06, "loss": 0.0101, "step": 1499 }, { "clip_ratio": 0.057328712195158005, "epoch": 0.5248425472358292, "grad_norm": 0.6645532166392808, "kl": 1.921875, "learning_rate": 9.218232871670168e-06, "loss": 0.0072, "step": 1500 }, { "clip_ratio": 0.005753465928137302, "clipped_completions_ratio": 0.0, "epoch": 0.5251924422673198, "grad_norm": 0.9947067159791143, "kl": 1.890625, "learning_rate": 9.216592584970893e-06, "loss": 0.0179, "max_completion_length": 251.0, "max_terminated_completion_length": 251.0, "mean_completion_length": 140.7857208251953, "mean_terminated_completion_length": 140.7857208251953, "min_completion_length": 78.0, "min_terminated_completion_length": 78.0, "num_tokens": 1541738.0, "reward": 2.785485029220581, "reward_std": 0.2638635039329529, "rewards/check_originality_func/mean": 0.8214285969734192, "rewards/check_originality_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.9895662665367126, "rewards/check_winston_local_func/std": 0.005101821385324001, "rewards/sentence_count_match_reward_logic/mean": 0.9744897484779358, "rewards/sentence_count_match_reward_logic/std": 0.080691859126091, "step": 1501 }, { "clip_ratio": 0.01860940083861351, "epoch": 0.5255423372988104, "grad_norm": 0.696643273513937, "kl": 1.8984375, "learning_rate": 9.214950725522495e-06, "loss": 0.0137, "step": 1502 }, { "clip_ratio": 0.037208814173936844, "epoch": 0.5258922323303009, "grad_norm": 0.6182450340635984, "kl": 1.8984375, "learning_rate": 9.213307293937366e-06, "loss": 0.011, "step": 1503 }, { "clip_ratio": 0.04938323050737381, "epoch": 0.5262421273617914, "grad_norm": 0.47909851752694355, "kl": 1.8984375, "learning_rate": 9.211662290828493e-06, "loss": 0.0084, "step": 1504 }, { "clip_ratio": 0.005369211081415415, "clipped_completions_ratio": 0.0, "epoch": 0.5265920223932821, "grad_norm": 0.8037566904879928, "kl": 1.4140625, "learning_rate": 9.210015716809446e-06, "loss": 0.0066, "max_completion_length": 246.0, "max_terminated_completion_length": 246.0, "mean_completion_length": 170.10714721679688, "mean_terminated_completion_length": 170.10714721679688, "min_completion_length": 92.0, "min_terminated_completion_length": 92.0, "num_tokens": 1562440.0, "reward": 2.725799798965454, "reward_std": 0.16676516830921173, "rewards/check_originality_func/mean": 0.75, "rewards/check_originality_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.9817518591880798, "rewards/check_winston_local_func/std": 0.027812324464321136, "rewards/sentence_count_match_reward_logic/mean": 0.9940475821495056, "rewards/sentence_count_match_reward_logic/std": 0.031209392473101616, "step": 1505 }, { "clip_ratio": 0.013752736151218414, "epoch": 0.5269419174247726, "grad_norm": 0.6524393318852013, "kl": 1.4140625, "learning_rate": 9.208367572494381e-06, "loss": 0.0032, "step": 1506 }, { "clip_ratio": 0.026316510513424873, "epoch": 0.5272918124562631, "grad_norm": 0.48069457472176064, "kl": 1.4140625, "learning_rate": 9.206717858498042e-06, "loss": -0.0005, "step": 1507 }, { "clip_ratio": 0.03742130473256111, "epoch": 0.5276417074877536, "grad_norm": 0.39411795554425405, "kl": 1.4140625, "learning_rate": 9.205066575435754e-06, "loss": -0.0021, "step": 1508 }, { "clip_ratio": 0.005584368482232094, "clipped_completions_ratio": 0.0, "epoch": 0.5279916025192443, "grad_norm": 1.015707603902313, "kl": 1.6484375, "learning_rate": 9.203413723923433e-06, "loss": 0.0047, "max_completion_length": 211.0, "max_terminated_completion_length": 211.0, "mean_completion_length": 143.0357208251953, "mean_terminated_completion_length": 143.0357208251953, "min_completion_length": 73.0, "min_terminated_completion_length": 73.0, "num_tokens": 1580154.0, "reward": 2.7189342975616455, "reward_std": 0.3049834668636322, "rewards/check_originality_func/mean": 0.7678571343421936, "rewards/check_originality_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.9893211722373962, "rewards/check_winston_local_func/std": 0.005348391830921173, "rewards/sentence_count_match_reward_logic/mean": 0.9617558717727661, "rewards/sentence_count_match_reward_logic/std": 0.068171426653862, "step": 1509 }, { "clip_ratio": 0.01883404329419136, "epoch": 0.5283414975507348, "grad_norm": 0.711415917410841, "kl": 1.640625, "learning_rate": 9.201759304577576e-06, "loss": 0.0001, "step": 1510 }, { "clip_ratio": 0.040490735322237015, "epoch": 0.5286913925822253, "grad_norm": 0.5260083266810285, "kl": 1.640625, "learning_rate": 9.200103318015266e-06, "loss": -0.0022, "step": 1511 }, { "clip_ratio": 0.055699367076158524, "epoch": 0.5290412876137159, "grad_norm": 0.4188204394096767, "kl": 1.640625, "learning_rate": 9.198445764854166e-06, "loss": -0.0042, "step": 1512 }, { "clip_ratio": 0.003750074887648225, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.5293911826452065, "grad_norm": 0.7273364275873615, "kl": 1.234375, "learning_rate": 9.196786645712535e-06, "loss": 0.0096, "max_completion_length": 256.0, "max_terminated_completion_length": 245.0, "mean_completion_length": 201.7857208251953, "mean_terminated_completion_length": 192.75, "min_completion_length": 132.0, "min_terminated_completion_length": 132.0, "num_tokens": 1604534.0, "reward": 2.702334403991699, "reward_std": 0.06884507834911346, "rewards/check_originality_func/mean": 0.7321428656578064, "rewards/check_originality_func/std": 0.44685041904449463, "rewards/check_winston_local_func/mean": 0.9886439442634583, "rewards/check_winston_local_func/std": 0.010940050706267357, "rewards/sentence_count_match_reward_logic/mean": 0.9815476536750793, "rewards/sentence_count_match_reward_logic/std": 0.05390588566660881, "step": 1513 }, { "clip_ratio": 0.008017930202186108, "epoch": 0.529741077676697, "grad_norm": 0.6212528608145331, "kl": 1.234375, "learning_rate": 9.195125961209203e-06, "loss": 0.007, "step": 1514 }, { "clip_ratio": 0.020103000104427338, "epoch": 0.5300909727081875, "grad_norm": 0.4953610576482522, "kl": 1.234375, "learning_rate": 9.193463711963593e-06, "loss": 0.0044, "step": 1515 }, { "clip_ratio": 0.034805554896593094, "epoch": 0.5304408677396781, "grad_norm": 0.44899486005567824, "kl": 1.234375, "learning_rate": 9.191799898595706e-06, "loss": 0.0016, "step": 1516 }, { "clip_ratio": 0.005804464686661959, "clipped_completions_ratio": 0.0, "epoch": 0.5307907627711687, "grad_norm": 0.7249418847651158, "kl": 1.40625, "learning_rate": 9.19013452172613e-06, "loss": 0.0073, "max_completion_length": 212.0, "max_terminated_completion_length": 212.0, "mean_completion_length": 175.82144165039062, "mean_terminated_completion_length": 175.82144165039062, "min_completion_length": 133.0, "min_terminated_completion_length": 133.0, "num_tokens": 1625468.0, "reward": 2.9162018299102783, "reward_std": 0.1393963247537613, "rewards/check_originality_func/mean": 0.9285714030265808, "rewards/check_originality_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9927321672439575, "rewards/check_winston_local_func/std": 0.0028623356483876705, "rewards/sentence_count_match_reward_logic/mean": 0.9948979616165161, "rewards/sentence_count_match_reward_logic/std": 0.026750901713967323, "step": 1517 }, { "clip_ratio": 0.009809721261262894, "epoch": 0.5311406578026592, "grad_norm": 0.6639698952436639, "kl": 1.40625, "learning_rate": 9.188467581976035e-06, "loss": 0.0052, "step": 1518 }, { "clip_ratio": 0.024976126849651337, "epoch": 0.5314905528341498, "grad_norm": 0.4816063785918628, "kl": 1.4140625, "learning_rate": 9.18679907996717e-06, "loss": 0.0011, "step": 1519 }, { "clip_ratio": 0.04057883471250534, "epoch": 0.5318404478656403, "grad_norm": 0.4208483880507387, "kl": 1.4140625, "learning_rate": 9.185129016321877e-06, "loss": -0.0007, "step": 1520 }, { "clip_ratio": 0.004509320016950369, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.5321903428971309, "grad_norm": 0.893005665220823, "kl": 1.421875, "learning_rate": 9.183457391663068e-06, "loss": 0.0143, "max_completion_length": 256.0, "max_terminated_completion_length": 228.0, "mean_completion_length": 169.7857208251953, "mean_terminated_completion_length": 155.4166717529297, "min_completion_length": 105.0, "min_terminated_completion_length": 105.0, "num_tokens": 1646600.0, "reward": 2.744330883026123, "reward_std": 0.15185831487178802, "rewards/check_originality_func/mean": 0.7857142686843872, "rewards/check_originality_func/std": 0.41403934359550476, "rewards/check_winston_local_func/mean": 0.992205023765564, "rewards/check_winston_local_func/std": 0.002301053609699011, "rewards/sentence_count_match_reward_logic/mean": 0.9664115905761719, "rewards/sentence_count_match_reward_logic/std": 0.05902854725718498, "step": 1521 }, { "clip_ratio": 0.012030145153403282, "epoch": 0.5325402379286214, "grad_norm": 0.6361978517758092, "kl": 1.421875, "learning_rate": 9.181784206614247e-06, "loss": 0.0106, "step": 1522 }, { "clip_ratio": 0.02671997621655464, "epoch": 0.532890132960112, "grad_norm": 0.4778765789255451, "kl": 1.4296875, "learning_rate": 9.180109461799493e-06, "loss": 0.0078, "step": 1523 }, { "clip_ratio": 0.041247863322496414, "epoch": 0.5332400279916025, "grad_norm": 0.4137967977639336, "kl": 1.4375, "learning_rate": 9.178433157843474e-06, "loss": 0.0058, "step": 1524 }, { "clip_ratio": 0.0030652384739369154, "clipped_completions_ratio": 0.4107142857142857, "epoch": 0.533589923023093, "grad_norm": 0.4955148505030735, "kl": 1.046875, "learning_rate": 9.176755295371433e-06, "loss": 0.0101, "max_completion_length": 256.0, "max_terminated_completion_length": 251.0, "mean_completion_length": 212.1428680419922, "mean_terminated_completion_length": 181.5757598876953, "min_completion_length": 134.0, "min_terminated_completion_length": 134.0, "num_tokens": 1672480.0, "reward": 2.7209441661834717, "reward_std": 0.2129743993282318, "rewards/check_originality_func/mean": 0.7678571343421936, "rewards/check_originality_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.9693205952644348, "rewards/check_winston_local_func/std": 0.05922117829322815, "rewards/sentence_count_match_reward_logic/mean": 0.9837661981582642, "rewards/sentence_count_match_reward_logic/std": 0.04284103587269783, "step": 1525 }, { "clip_ratio": 0.005768115166574717, "epoch": 0.5339398180545836, "grad_norm": 0.4738893464300683, "kl": 1.046875, "learning_rate": 9.175075875009197e-06, "loss": 0.0089, "step": 1526 }, { "clip_ratio": 0.012836823239922523, "epoch": 0.5342897130860742, "grad_norm": 0.4051418279197384, "kl": 1.046875, "learning_rate": 9.173394897383172e-06, "loss": 0.0069, "step": 1527 }, { "clip_ratio": 0.02330399677157402, "epoch": 0.5346396081175647, "grad_norm": 0.33551166796086307, "kl": 1.0546875, "learning_rate": 9.171712363120351e-06, "loss": 0.0053, "step": 1528 }, { "clip_ratio": 0.003385162679478526, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.5349895031490552, "grad_norm": 1.1059759808565321, "kl": 1.0859375, "learning_rate": 9.1700282728483e-06, "loss": 0.0141, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 212.69644165039062, "mean_terminated_completion_length": 195.375, "min_completion_length": 167.0, "min_terminated_completion_length": 167.0, "num_tokens": 1698135.0, "reward": 2.7624828815460205, "reward_std": 0.24186362326145172, "rewards/check_originality_func/mean": 0.8035714030265808, "rewards/check_originality_func/std": 0.40089187026023865, "rewards/check_winston_local_func/mean": 0.9708160758018494, "rewards/check_winston_local_func/std": 0.05854424461722374, "rewards/sentence_count_match_reward_logic/mean": 0.988095223903656, "rewards/sentence_count_match_reward_logic/std": 0.03467709571123123, "step": 1529 }, { "clip_ratio": 0.009321048855781555, "epoch": 0.5353393981805459, "grad_norm": 0.583322855247045, "kl": 1.0625, "learning_rate": 9.16834262719517e-06, "loss": 0.0106, "step": 1530 }, { "clip_ratio": 0.01985154300928116, "epoch": 0.5356892932120364, "grad_norm": 0.44576490605353775, "kl": 1.0625, "learning_rate": 9.166655426789692e-06, "loss": 0.0085, "step": 1531 }, { "clip_ratio": 0.03381825610995293, "epoch": 0.5360391882435269, "grad_norm": 0.410992384614992, "kl": 1.0625, "learning_rate": 9.164966672261171e-06, "loss": 0.0065, "step": 1532 }, { "clip_ratio": 0.0045369877479970455, "clipped_completions_ratio": 0.0357142857142857, "epoch": 0.5363890832750174, "grad_norm": 0.7768020161079819, "kl": 1.3359375, "learning_rate": 9.163276364239504e-06, "loss": 0.0525, "max_completion_length": 256.0, "max_terminated_completion_length": 248.0, "mean_completion_length": 191.1607208251953, "mean_terminated_completion_length": 188.75926208496094, "min_completion_length": 127.0, "min_terminated_completion_length": 127.0, "num_tokens": 1720672.0, "reward": 2.9226810932159424, "reward_std": 0.10286673903465271, "rewards/check_originality_func/mean": 0.9642857313156128, "rewards/check_originality_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.991877555847168, "rewards/check_winston_local_func/std": 0.004529962781816721, "rewards/sentence_count_match_reward_logic/mean": 0.9665178656578064, "rewards/sentence_count_match_reward_logic/std": 0.08278703689575195, "step": 1533 }, { "clip_ratio": 0.010418240912258625, "epoch": 0.5367389783065081, "grad_norm": 0.6635015929295419, "kl": 1.328125, "learning_rate": 9.161584503355155e-06, "loss": 0.0494, "step": 1534 }, { "clip_ratio": 0.02402973361313343, "epoch": 0.5370888733379986, "grad_norm": 0.5347004882732016, "kl": 1.328125, "learning_rate": 9.15989109023917e-06, "loss": 0.045, "step": 1535 }, { "clip_ratio": 0.040735628455877304, "epoch": 0.5374387683694891, "grad_norm": 0.44622854674620305, "kl": 1.328125, "learning_rate": 9.158196125523182e-06, "loss": 0.0419, "step": 1536 }, { "clip_ratio": 0.004594913683831692, "clipped_completions_ratio": 0.0, "epoch": 0.5377886634009797, "grad_norm": 0.8960733268192868, "kl": 1.4453125, "learning_rate": 9.156499609839394e-06, "loss": 0.0161, "max_completion_length": 235.0, "max_terminated_completion_length": 235.0, "mean_completion_length": 170.87501525878906, "mean_terminated_completion_length": 170.87501525878906, "min_completion_length": 102.0, "min_terminated_completion_length": 102.0, "num_tokens": 1741705.0, "reward": 2.7700507640838623, "reward_std": 0.21730029582977295, "rewards/check_originality_func/mean": 0.7857142686843872, "rewards/check_originality_func/std": 0.4140393137931824, "rewards/check_winston_local_func/mean": 0.9910327196121216, "rewards/check_winston_local_func/std": 0.005487225018441677, "rewards/sentence_count_match_reward_logic/mean": 0.9933035969734192, "rewards/sentence_count_match_reward_logic/std": 0.028400972485542297, "step": 1537 }, { "clip_ratio": 0.017735537141561508, "epoch": 0.5381385584324703, "grad_norm": 0.668765996452728, "kl": 1.453125, "learning_rate": 9.154801543820589e-06, "loss": 0.0124, "step": 1538 }, { "clip_ratio": 0.03310443088412285, "epoch": 0.5384884534639608, "grad_norm": 0.557879955258996, "kl": 1.4609375, "learning_rate": 9.15310192810013e-06, "loss": 0.0087, "step": 1539 }, { "clip_ratio": 0.04835443198680878, "epoch": 0.5388383484954513, "grad_norm": 0.44398096023095834, "kl": 1.4609375, "learning_rate": 9.151400763311958e-06, "loss": 0.0062, "step": 1540 }, { "clip_ratio": 0.006839794106781483, "clipped_completions_ratio": 0.0, "epoch": 0.539188243526942, "grad_norm": 1.045623421076277, "kl": 1.6796875, "learning_rate": 9.149698050090594e-06, "loss": 0.0216, "max_completion_length": 166.0, "max_terminated_completion_length": 166.0, "mean_completion_length": 137.5, "mean_terminated_completion_length": 137.5, "min_completion_length": 74.0, "min_terminated_completion_length": 74.0, "num_tokens": 1758437.0, "reward": 2.794018268585205, "reward_std": 0.31816336512565613, "rewards/check_originality_func/mean": 0.8035714030265808, "rewards/check_originality_func/std": 0.40089184045791626, "rewards/check_winston_local_func/mean": 0.9904466271400452, "rewards/check_winston_local_func/std": 0.0026296896394342184, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1541 }, { "clip_ratio": 0.019307607784867287, "epoch": 0.5395381385584325, "grad_norm": 0.7490489304098684, "kl": 1.6796875, "learning_rate": 9.147993789071128e-06, "loss": 0.0164, "step": 1542 }, { "clip_ratio": 0.040307022631168365, "epoch": 0.539888033589923, "grad_norm": 0.573915514773464, "kl": 1.6796875, "learning_rate": 9.146287980889239e-06, "loss": 0.0129, "step": 1543 }, { "clip_ratio": 0.05339236557483673, "epoch": 0.5402379286214136, "grad_norm": 0.46616234364669207, "kl": 1.6796875, "learning_rate": 9.144580626181176e-06, "loss": 0.0112, "step": 1544 }, { "clip_ratio": 0.00584153737872839, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.5405878236529041, "grad_norm": 0.6582232036778538, "kl": 1.21875, "learning_rate": 9.142871725583763e-06, "loss": 0.0202, "max_completion_length": 256.0, "max_terminated_completion_length": 239.0, "mean_completion_length": 206.96429443359375, "mean_terminated_completion_length": 198.7916717529297, "min_completion_length": 146.0, "min_terminated_completion_length": 146.0, "num_tokens": 1783771.0, "reward": 2.8944618701934814, "reward_std": 0.11621589958667755, "rewards/check_originality_func/mean": 0.9642857313156128, "rewards/check_originality_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9874676465988159, "rewards/check_winston_local_func/std": 0.029206395149230957, "rewards/sentence_count_match_reward_logic/mean": 0.9427083730697632, "rewards/sentence_count_match_reward_logic/std": 0.13285008072853088, "step": 1545 }, { "clip_ratio": 0.010153188370168209, "epoch": 0.5409377186843947, "grad_norm": 0.6401935412742291, "kl": 1.21875, "learning_rate": 9.141161279734408e-06, "loss": 0.0174, "step": 1546 }, { "clip_ratio": 0.019960373640060425, "epoch": 0.5412876137158852, "grad_norm": 0.5139190725280004, "kl": 1.21875, "learning_rate": 9.139449289271087e-06, "loss": 0.0146, "step": 1547 }, { "clip_ratio": 0.03439062088727951, "epoch": 0.5416375087473758, "grad_norm": 0.48175427495514334, "kl": 1.2421875, "learning_rate": 9.13773575483236e-06, "loss": 0.0112, "step": 1548 }, { "clip_ratio": 0.005212484858930111, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.5419874037788663, "grad_norm": 0.8566149317894173, "kl": 1.3671875, "learning_rate": 9.136020677057357e-06, "loss": 0.0103, "max_completion_length": 256.0, "max_terminated_completion_length": 190.0, "mean_completion_length": 180.23214721679688, "mean_terminated_completion_length": 149.9250030517578, "min_completion_length": 73.0, "min_terminated_completion_length": 73.0, "num_tokens": 1806400.0, "reward": 2.8808228969573975, "reward_std": 0.17296501994132996, "rewards/check_originality_func/mean": 0.9107142686843872, "rewards/check_originality_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9899497628211975, "rewards/check_winston_local_func/std": 0.0034715975634753704, "rewards/sentence_count_match_reward_logic/mean": 0.9801587462425232, "rewards/sentence_count_match_reward_logic/std": 0.05236126855015755, "step": 1549 }, { "clip_ratio": 0.011629616841673851, "epoch": 0.5423372988103569, "grad_norm": 0.5738632970021165, "kl": 1.3671875, "learning_rate": 9.134304056585786e-06, "loss": 0.0072, "step": 1550 }, { "clip_ratio": 0.02410878799855709, "epoch": 0.5426871938418475, "grad_norm": 0.45157135311946284, "kl": 1.3671875, "learning_rate": 9.13258589405793e-06, "loss": 0.0044, "step": 1551 }, { "clip_ratio": 0.03611298277974129, "epoch": 0.543037088873338, "grad_norm": 0.3978747716978332, "kl": 1.375, "learning_rate": 9.130866190114649e-06, "loss": 0.0031, "step": 1552 }, { "clip_ratio": 0.0065942187793552876, "clipped_completions_ratio": 0.0, "epoch": 0.5433869839048285, "grad_norm": 0.7388766448253058, "kl": 1.25, "learning_rate": 9.129144945397373e-06, "loss": 0.0064, "max_completion_length": 232.0, "max_terminated_completion_length": 232.0, "mean_completion_length": 193.25001525878906, "mean_terminated_completion_length": 193.25001525878906, "min_completion_length": 140.0, "min_terminated_completion_length": 140.0, "num_tokens": 1829750.0, "reward": 2.9926371574401855, "reward_std": 0.0018219754565507174, "rewards/check_originality_func/mean": 1.0, "rewards/check_originality_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9926370978355408, "rewards/check_winston_local_func/std": 0.0021702521480619907, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1553 }, { "clip_ratio": 0.01065507996827364, "epoch": 0.5437368789363191, "grad_norm": 0.6692455017063372, "kl": 1.25, "learning_rate": 9.127422160548114e-06, "loss": 0.0031, "step": 1554 }, { "clip_ratio": 0.025276340544223785, "epoch": 0.5440867739678097, "grad_norm": 2.52166320680458, "kl": 1.390625, "learning_rate": 9.12569783620945e-06, "loss": 0.0014, "step": 1555 }, { "clip_ratio": 0.0384550616145134, "epoch": 0.5444366689993002, "grad_norm": 0.48031893912000995, "kl": 1.25, "learning_rate": 9.123971973024543e-06, "loss": -0.0011, "step": 1556 }, { "clip_ratio": 0.0055661131627857685, "clipped_completions_ratio": 0.1785714285714286, "epoch": 0.5447865640307907, "grad_norm": 0.796944772913187, "kl": 1.359375, "learning_rate": 9.12224457163712e-06, "loss": 0.0229, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 218.46429443359375, "mean_terminated_completion_length": 210.30435180664062, "min_completion_length": 172.0, "min_terminated_completion_length": 172.0, "num_tokens": 1855104.0, "reward": 2.802607297897339, "reward_std": 0.2839454114437103, "rewards/check_originality_func/mean": 0.8214285969734192, "rewards/check_originality_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.9923392534255981, "rewards/check_winston_local_func/std": 0.0018493032548576593, "rewards/sentence_count_match_reward_logic/mean": 0.9888392686843872, "rewards/sentence_count_match_reward_logic/std": 0.03781523555517197, "step": 1557 }, { "clip_ratio": 0.008703131228685379, "epoch": 0.5451364590622814, "grad_norm": 0.687687281846461, "kl": 1.359375, "learning_rate": 9.120515632691486e-06, "loss": 0.0193, "step": 1558 }, { "clip_ratio": 0.02192872017621994, "epoch": 0.5454863540937719, "grad_norm": 0.5299709338361266, "kl": 1.359375, "learning_rate": 9.118785156832518e-06, "loss": 0.0154, "step": 1559 }, { "clip_ratio": 0.03931228071451187, "epoch": 0.5458362491252624, "grad_norm": 0.4469949178496335, "kl": 1.3515625, "learning_rate": 9.11705314470567e-06, "loss": 0.0118, "step": 1560 }, { "clip_ratio": 0.0049471138045191765, "clipped_completions_ratio": 0.0, "epoch": 0.5461861441567529, "grad_norm": 0.8291084778908344, "kl": 1.5078125, "learning_rate": 9.115319596956962e-06, "loss": 0.0036, "max_completion_length": 207.0, "max_terminated_completion_length": 207.0, "mean_completion_length": 156.85714721679688, "mean_terminated_completion_length": 156.85714721679688, "min_completion_length": 73.0, "min_terminated_completion_length": 73.0, "num_tokens": 1874144.0, "reward": 2.802393913269043, "reward_std": 0.13944107294082642, "rewards/check_originality_func/mean": 0.8214285969734192, "rewards/check_originality_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.9904891848564148, "rewards/check_winston_local_func/std": 0.004385435488075018, "rewards/sentence_count_match_reward_logic/mean": 0.9904761910438538, "rewards/sentence_count_match_reward_logic/std": 0.040558867156505585, "step": 1561 }, { "clip_ratio": 0.010923783294856548, "epoch": 0.5465360391882436, "grad_norm": 0.665568978164375, "kl": 1.5078125, "learning_rate": 9.113584514232994e-06, "loss": 0.0001, "step": 1562 }, { "clip_ratio": 0.02882484719157219, "epoch": 0.5468859342197341, "grad_norm": 0.5144744264765384, "kl": 1.515625, "learning_rate": 9.111847897180935e-06, "loss": -0.0034, "step": 1563 }, { "clip_ratio": 0.046950701624155045, "epoch": 0.5472358292512246, "grad_norm": 0.4729282926299081, "kl": 1.5234375, "learning_rate": 9.110109746448527e-06, "loss": -0.0064, "step": 1564 }, { "clip_ratio": 0.004134749993681908, "clipped_completions_ratio": 0.0892857142857143, "epoch": 0.5475857242827152, "grad_norm": 0.7829901836997144, "kl": 1.2890625, "learning_rate": 9.108370062684082e-06, "loss": 0.0118, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 196.42857360839844, "mean_terminated_completion_length": 190.58824157714844, "min_completion_length": 149.0, "min_terminated_completion_length": 149.0, "num_tokens": 1897784.0, "reward": 2.860138177871704, "reward_std": 0.14940373599529266, "rewards/check_originality_func/mean": 0.875, "rewards/check_originality_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.9889078736305237, "rewards/check_winston_local_func/std": 0.006543994881212711, "rewards/sentence_count_match_reward_logic/mean": 0.9962301254272461, "rewards/sentence_count_match_reward_logic/std": 0.019794318825006485, "step": 1565 }, { "clip_ratio": 0.01178108248859644, "epoch": 0.5479356193142058, "grad_norm": 0.6509798224976052, "kl": 1.2890625, "learning_rate": 9.106628846536488e-06, "loss": 0.0092, "step": 1566 }, { "clip_ratio": 0.026878587901592255, "epoch": 0.5482855143456963, "grad_norm": 0.49067668645033474, "kl": 1.296875, "learning_rate": 9.1048860986552e-06, "loss": 0.0055, "step": 1567 }, { "clip_ratio": 0.04357193037867546, "epoch": 0.5486354093771868, "grad_norm": 0.43471066656636, "kl": 1.296875, "learning_rate": 9.103141819690246e-06, "loss": 0.003, "step": 1568 }, { "clip_ratio": 0.006464312318712473, "clipped_completions_ratio": 0.0, "epoch": 0.5489853044086774, "grad_norm": 0.8666438622012316, "kl": 1.59375, "learning_rate": 9.101396010292228e-06, "loss": 0.0189, "max_completion_length": 240.0, "max_terminated_completion_length": 240.0, "mean_completion_length": 159.21429443359375, "mean_terminated_completion_length": 159.21429443359375, "min_completion_length": 102.0, "min_terminated_completion_length": 102.0, "num_tokens": 1916932.0, "reward": 2.9274697303771973, "reward_std": 0.0954056829214096, "rewards/check_originality_func/mean": 0.9464285969734192, "rewards/check_originality_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9920955300331116, "rewards/check_winston_local_func/std": 0.002017445396631956, "rewards/sentence_count_match_reward_logic/mean": 0.9889455437660217, "rewards/sentence_count_match_reward_logic/std": 0.0403459295630455, "step": 1569 }, { "clip_ratio": 0.01277988962829113, "epoch": 0.549335199440168, "grad_norm": 0.7175695342983458, "kl": 1.59375, "learning_rate": 9.099648671112315e-06, "loss": 0.0149, "step": 1570 }, { "clip_ratio": 0.03326551616191864, "epoch": 0.5496850944716585, "grad_norm": 0.5579649982797353, "kl": 1.59375, "learning_rate": 9.097899802802247e-06, "loss": 0.0104, "step": 1571 }, { "clip_ratio": 0.05257796496152878, "epoch": 0.5500349895031491, "grad_norm": 0.4640983012506353, "kl": 1.59375, "learning_rate": 9.09614940601434e-06, "loss": 0.008, "step": 1572 }, { "clip_ratio": 0.00510165560990572, "clipped_completions_ratio": 0.0, "epoch": 0.5503848845346396, "grad_norm": 0.834548753460458, "kl": 1.4765625, "learning_rate": 9.09439748140147e-06, "loss": 0.0116, "max_completion_length": 253.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 193.44644165039062, "mean_terminated_completion_length": 193.44644165039062, "min_completion_length": 83.0, "min_terminated_completion_length": 83.0, "num_tokens": 1940029.0, "reward": 2.9747183322906494, "reward_std": 0.05176786333322525, "rewards/check_originality_func/mean": 0.9821428656578064, "rewards/check_originality_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9925752282142639, "rewards/check_winston_local_func/std": 0.0021363261621445417, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1573 }, { "clip_ratio": 0.011353236623108387, "epoch": 0.5507347795661302, "grad_norm": 0.7055357966251614, "kl": 1.46875, "learning_rate": 9.092644029617088e-06, "loss": 0.0075, "step": 1574 }, { "clip_ratio": 0.023534677922725677, "epoch": 0.5510846745976207, "grad_norm": 0.4956783214061487, "kl": 1.4765625, "learning_rate": 9.090889051315222e-06, "loss": 0.0049, "step": 1575 }, { "clip_ratio": 0.04111763462424278, "epoch": 0.5514345696291113, "grad_norm": 0.5222923906728029, "kl": 1.484375, "learning_rate": 9.089132547150453e-06, "loss": 0.0029, "step": 1576 }, { "clip_ratio": 0.0058310311287641525, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.5517844646606018, "grad_norm": 1.3947094830307276, "kl": 1.4375, "learning_rate": 9.087374517777947e-06, "loss": 0.0124, "max_completion_length": 256.0, "max_terminated_completion_length": 228.0, "mean_completion_length": 186.67857360839844, "mean_terminated_completion_length": 175.125, "min_completion_length": 93.0, "min_terminated_completion_length": 93.0, "num_tokens": 1962435.0, "reward": 2.8614962100982666, "reward_std": 0.04869551956653595, "rewards/check_originality_func/mean": 0.875, "rewards/check_originality_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.9924481511116028, "rewards/check_winston_local_func/std": 0.002178777474910021, "rewards/sentence_count_match_reward_logic/mean": 0.9940475821495056, "rewards/sentence_count_match_reward_logic/std": 0.02524530701339245, "step": 1577 }, { "clip_ratio": 0.011433946900069714, "epoch": 0.5521343596920923, "grad_norm": 110609.46905897027, "kl": 11392.0, "learning_rate": 9.085614963853429e-06, "loss": 114.5252, "step": 1578 }, { "clip_ratio": 0.01696484163403511, "epoch": 0.552484254723583, "grad_norm": 2813.0484254609914, "kl": 239.0, "learning_rate": 9.083853886033198e-06, "loss": 2.3801, "step": 1579 }, { "clip_ratio": 0.025260325521230698, "epoch": 0.5528341497550735, "grad_norm": 3.9594895310336784, "kl": 1.4375, "learning_rate": 9.08209128497412e-06, "loss": 0.0098, "step": 1580 }, { "clip_ratio": 0.00539001589640975, "clipped_completions_ratio": 0.0, "epoch": 0.553184044786564, "grad_norm": 0.9105480713009401, "kl": 1.5078125, "learning_rate": 9.080327161333624e-06, "loss": 0.0101, "max_completion_length": 201.0, "max_terminated_completion_length": 201.0, "mean_completion_length": 154.9107208251953, "mean_terminated_completion_length": 154.9107208251953, "min_completion_length": 100.0, "min_terminated_completion_length": 100.0, "num_tokens": 1981078.0, "reward": 2.823596239089966, "reward_std": 0.1956283003091812, "rewards/check_originality_func/mean": 0.8392857313156128, "rewards/check_originality_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.9843102693557739, "rewards/check_winston_local_func/std": 0.03185805678367615, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1581 }, { "clip_ratio": 0.011245991103351116, "epoch": 0.5535339398180545, "grad_norm": 0.6180654722271787, "kl": 1.515625, "learning_rate": 9.078561515769717e-06, "loss": 0.0071, "step": 1582 }, { "clip_ratio": 0.030870363116264343, "epoch": 0.5538838348495452, "grad_norm": 0.5738731509746539, "kl": 1.5234375, "learning_rate": 9.076794348940965e-06, "loss": 0.0039, "step": 1583 }, { "clip_ratio": 0.044801969081163406, "epoch": 0.5542337298810357, "grad_norm": 0.47328859807572393, "kl": 1.53125, "learning_rate": 9.075025661506505e-06, "loss": 0.002, "step": 1584 }, { "clip_ratio": 0.005766505841165781, "clipped_completions_ratio": 0.0, "epoch": 0.5545836249125262, "grad_norm": 0.8657211658016691, "kl": 1.6015625, "learning_rate": 9.07325545412604e-06, "loss": 0.0177, "max_completion_length": 204.0, "max_terminated_completion_length": 204.0, "mean_completion_length": 161.1428680419922, "mean_terminated_completion_length": 161.1428680419922, "min_completion_length": 118.0, "min_terminated_completion_length": 118.0, "num_tokens": 2000366.0, "reward": 2.896683931350708, "reward_std": 0.23113462328910828, "rewards/check_originality_func/mean": 0.9107142686843872, "rewards/check_originality_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9931122660636902, "rewards/check_winston_local_func/std": 0.0016329489881172776, "rewards/sentence_count_match_reward_logic/mean": 0.9928570985794067, "rewards/sentence_count_match_reward_logic/std": 0.03745126724243164, "step": 1585 }, { "clip_ratio": 0.01735108159482479, "epoch": 0.5549335199440167, "grad_norm": 0.7564138619059841, "kl": 1.59375, "learning_rate": 9.071483727459842e-06, "loss": 0.0144, "step": 1586 }, { "clip_ratio": 0.02800741419196129, "epoch": 0.5552834149755074, "grad_norm": 0.5894090820524737, "kl": 1.6015625, "learning_rate": 9.069710482168746e-06, "loss": 0.0121, "step": 1587 }, { "clip_ratio": 0.036324407905340195, "epoch": 0.5556333100069979, "grad_norm": 0.4076000273004046, "kl": 1.6015625, "learning_rate": 9.06793571891416e-06, "loss": 0.0098, "step": 1588 }, { "clip_ratio": 0.007824153639376163, "clipped_completions_ratio": 0.0, "epoch": 0.5559832050384884, "grad_norm": 0.928385994159017, "kl": 1.8046875, "learning_rate": 9.066159438358049e-06, "loss": 0.0195, "max_completion_length": 210.0, "max_terminated_completion_length": 210.0, "mean_completion_length": 128.7678680419922, "mean_terminated_completion_length": 128.7678680419922, "min_completion_length": 83.0, "min_terminated_completion_length": 83.0, "num_tokens": 2016281.0, "reward": 2.8737919330596924, "reward_std": 0.14884476363658905, "rewards/check_originality_func/mean": 0.9107142686843872, "rewards/check_originality_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9630775451660156, "rewards/check_winston_local_func/std": 0.08463993668556213, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1589 }, { "clip_ratio": 0.016867322847247124, "epoch": 0.556333100069979, "grad_norm": 0.8933800723862566, "kl": 1.8125, "learning_rate": 9.064381641162952e-06, "loss": 0.0156, "step": 1590 }, { "clip_ratio": 0.037668392062187195, "epoch": 0.5566829951014696, "grad_norm": 0.605145788802844, "kl": 1.8125, "learning_rate": 9.06260232799197e-06, "loss": 0.0121, "step": 1591 }, { "clip_ratio": 0.05397147685289383, "epoch": 0.5570328901329601, "grad_norm": 0.525373850491048, "kl": 1.8125, "learning_rate": 9.060821499508769e-06, "loss": 0.0099, "step": 1592 }, { "clip_ratio": 0.004773738328367472, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.5573827851644506, "grad_norm": 0.7935802007831887, "kl": 1.5078125, "learning_rate": 9.05903915637758e-06, "loss": 0.0174, "max_completion_length": 256.0, "max_terminated_completion_length": 236.0, "mean_completion_length": 174.19644165039062, "mean_terminated_completion_length": 160.5625, "min_completion_length": 94.0, "min_terminated_completion_length": 94.0, "num_tokens": 2037636.0, "reward": 2.756836175918579, "reward_std": 0.08984281122684479, "rewards/check_originality_func/mean": 0.8035714030265808, "rewards/check_originality_func/std": 0.40089187026023865, "rewards/check_winston_local_func/mean": 0.9822825193405151, "rewards/check_winston_local_func/std": 0.027312111109495163, "rewards/sentence_count_match_reward_logic/mean": 0.9709821343421936, "rewards/sentence_count_match_reward_logic/std": 0.07534417510032654, "step": 1593 }, { "clip_ratio": 0.013190227560698986, "epoch": 0.5577326801959412, "grad_norm": 0.6457249602098923, "kl": 1.515625, "learning_rate": 9.057255299263204e-06, "loss": 0.0139, "step": 1594 }, { "clip_ratio": 0.029817940667271614, "epoch": 0.5580825752274318, "grad_norm": 0.5611092509274587, "kl": 1.53125, "learning_rate": 9.055469928831e-06, "loss": 0.0109, "step": 1595 }, { "clip_ratio": 0.046930942684412, "epoch": 0.5584324702589223, "grad_norm": 0.4971312300019001, "kl": 1.53125, "learning_rate": 9.053683045746897e-06, "loss": 0.0091, "step": 1596 }, { "clip_ratio": 0.004531575832515955, "clipped_completions_ratio": 0.125, "epoch": 0.5587823652904129, "grad_norm": 0.9159035160358326, "kl": 1.609375, "learning_rate": 9.05189465067738e-06, "loss": 0.0133, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 147.55357360839844, "mean_terminated_completion_length": 132.06121826171875, "min_completion_length": 79.0, "min_terminated_completion_length": 79.0, "num_tokens": 2055771.0, "reward": 2.953636884689331, "reward_std": 0.06677519530057907, "rewards/check_originality_func/mean": 0.9821428656578064, "rewards/check_originality_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9857794642448425, "rewards/check_winston_local_func/std": 0.033251725137233734, "rewards/sentence_count_match_reward_logic/mean": 0.985714316368103, "rewards/sentence_count_match_reward_logic/std": 0.051974013447761536, "step": 1597 }, { "clip_ratio": 0.011264722794294357, "epoch": 0.5591322603219034, "grad_norm": 0.6988930505345462, "kl": 1.609375, "learning_rate": 9.05010474428951e-06, "loss": 0.01, "step": 1598 }, { "clip_ratio": 0.027086416259407997, "epoch": 0.559482155353394, "grad_norm": 0.5310295311057197, "kl": 1.609375, "learning_rate": 9.0483133272509e-06, "loss": 0.007, "step": 1599 }, { "clip_ratio": 0.04260743036866188, "epoch": 0.5598320503848845, "grad_norm": 0.4702339240526421, "kl": 1.6171875, "learning_rate": 9.046520400229734e-06, "loss": 0.0055, "step": 1600 }, { "clip_ratio": 0.0060084969736635685, "clipped_completions_ratio": 0.0, "epoch": 0.5601819454163751, "grad_norm": 0.7962448373063161, "kl": 1.4375, "learning_rate": 9.044725963894756e-06, "loss": 0.012, "max_completion_length": 205.0, "max_terminated_completion_length": 205.0, "mean_completion_length": 168.17857360839844, "mean_terminated_completion_length": 168.17857360839844, "min_completion_length": 112.0, "min_terminated_completion_length": 112.0, "num_tokens": 2075669.0, "reward": 2.9199414253234863, "reward_std": 0.0773719772696495, "rewards/check_originality_func/mean": 0.9285714030265808, "rewards/check_originality_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9913698434829712, "rewards/check_winston_local_func/std": 0.002016786253079772, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1601 }, { "clip_ratio": 0.012303329072892666, "epoch": 0.5605318404478656, "grad_norm": 0.7397105357171162, "kl": 1.4375, "learning_rate": 9.042930018915274e-06, "loss": 0.0095, "step": 1602 }, { "clip_ratio": 0.029621955007314682, "epoch": 0.5608817354793562, "grad_norm": 0.5561872390533473, "kl": 1.4375, "learning_rate": 9.041132565961159e-06, "loss": 0.0055, "step": 1603 }, { "clip_ratio": 0.04558155685663223, "epoch": 0.5612316305108468, "grad_norm": 0.42715498086554843, "kl": 1.4453125, "learning_rate": 9.039333605702844e-06, "loss": 0.0031, "step": 1604 }, { "clip_ratio": 0.006680475547909737, "clipped_completions_ratio": 0.0, "epoch": 0.5615815255423373, "grad_norm": 0.984279326082729, "kl": 1.7109375, "learning_rate": 9.037533138811325e-06, "loss": 0.0133, "max_completion_length": 245.0, "max_terminated_completion_length": 245.0, "mean_completion_length": 156.85714721679688, "mean_terminated_completion_length": 156.85714721679688, "min_completion_length": 91.0, "min_terminated_completion_length": 91.0, "num_tokens": 2094677.0, "reward": 2.808915138244629, "reward_std": 0.20249442756175995, "rewards/check_originality_func/mean": 0.8214285969734192, "rewards/check_originality_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.9874865412712097, "rewards/check_winston_local_func/std": 0.00894214678555727, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1605 }, { "clip_ratio": 0.014491247944533825, "epoch": 0.5619314205738278, "grad_norm": 0.7567449248747393, "kl": 1.7109375, "learning_rate": 9.035731165958156e-06, "loss": 0.0088, "step": 1606 }, { "clip_ratio": 0.037973448634147644, "epoch": 0.5622813156053184, "grad_norm": 0.6801607198947657, "kl": 1.71875, "learning_rate": 9.033927687815458e-06, "loss": 0.0058, "step": 1607 }, { "clip_ratio": 0.05554327368736267, "epoch": 0.562631210636809, "grad_norm": 0.5270583452684391, "kl": 1.7265625, "learning_rate": 9.032122705055912e-06, "loss": 0.0034, "step": 1608 }, { "clip_ratio": 0.00604108115658164, "clipped_completions_ratio": 0.0714285714285714, "epoch": 0.5629811056682995, "grad_norm": 0.8525636985767783, "kl": 1.375, "learning_rate": 9.030316218352757e-06, "loss": 0.0133, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 182.44644165039062, "mean_terminated_completion_length": 176.78846740722656, "min_completion_length": 125.0, "min_terminated_completion_length": 125.0, "num_tokens": 2116750.0, "reward": 2.824892997741699, "reward_std": 0.14915511012077332, "rewards/check_originality_func/mean": 0.8392857313156128, "rewards/check_originality_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.9891785383224487, "rewards/check_winston_local_func/std": 0.008084534667432308, "rewards/sentence_count_match_reward_logic/mean": 0.9964285492897034, "rewards/sentence_count_match_reward_logic/std": 0.026726122945547104, "step": 1609 }, { "clip_ratio": 0.013154539279639721, "epoch": 0.56333100069979, "grad_norm": 0.7121687288960417, "kl": 1.375, "learning_rate": 9.028508228379798e-06, "loss": 0.0097, "step": 1610 }, { "clip_ratio": 0.03067266196012497, "epoch": 0.5636808957312807, "grad_norm": 0.5664683114314586, "kl": 1.375, "learning_rate": 9.026698735811395e-06, "loss": 0.0066, "step": 1611 }, { "clip_ratio": 0.044532861560583115, "epoch": 0.5640307907627712, "grad_norm": 0.4295546736437786, "kl": 1.390625, "learning_rate": 9.024887741322475e-06, "loss": 0.0041, "step": 1612 }, { "clip_ratio": 0.006637147627770901, "clipped_completions_ratio": 0.0, "epoch": 0.5643806857942617, "grad_norm": 1.0749198217020264, "kl": 1.984375, "learning_rate": 9.023075245588521e-06, "loss": 0.0158, "max_completion_length": 254.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 134.7678680419922, "mean_terminated_completion_length": 134.7678680419922, "min_completion_length": 77.0, "min_terminated_completion_length": 77.0, "num_tokens": 2133321.0, "reward": 2.9679360389709473, "reward_std": 0.06328929960727692, "rewards/check_originality_func/mean": 0.9821428656578064, "rewards/check_originality_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9887691140174866, "rewards/check_winston_local_func/std": 0.012815999798476696, "rewards/sentence_count_match_reward_logic/mean": 0.9970237612724304, "rewards/sentence_count_match_reward_logic/std": 0.02227177284657955, "step": 1613 }, { "clip_ratio": 0.017308542504906654, "epoch": 0.5647305808257522, "grad_norm": 0.7552102988395952, "kl": 1.9609375, "learning_rate": 9.021261249285575e-06, "loss": 0.0106, "step": 1614 }, { "clip_ratio": 0.035466425120830536, "epoch": 0.5650804758572429, "grad_norm": 0.558949030938833, "kl": 1.9609375, "learning_rate": 9.019445753090243e-06, "loss": 0.0072, "step": 1615 }, { "clip_ratio": 0.05300519987940788, "epoch": 0.5654303708887334, "grad_norm": 0.4502454260026967, "kl": 1.9609375, "learning_rate": 9.017628757679685e-06, "loss": 0.0049, "step": 1616 }, { "clip_ratio": 0.007156444247812033, "clipped_completions_ratio": 0.0, "epoch": 0.5657802659202239, "grad_norm": 1.474198063119226, "kl": 1.890625, "learning_rate": 9.015810263731626e-06, "loss": 0.0267, "max_completion_length": 253.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 135.375, "mean_terminated_completion_length": 135.375, "min_completion_length": 52.0, "min_terminated_completion_length": 52.0, "num_tokens": 2149734.0, "reward": 2.8479464054107666, "reward_std": 0.10274534672498703, "rewards/check_originality_func/mean": 0.8571428656578064, "rewards/check_originality_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9908033013343811, "rewards/check_winston_local_func/std": 0.003913251683115959, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1617 }, { "clip_ratio": 0.019359517842531204, "epoch": 0.5661301609517145, "grad_norm": 0.937217349240316, "kl": 1.8984375, "learning_rate": 9.013990271924345e-06, "loss": 0.0224, "step": 1618 }, { "clip_ratio": 0.0356641486287117, "epoch": 0.566480055983205, "grad_norm": 0.6717074771307143, "kl": 1.8984375, "learning_rate": 9.012168782936684e-06, "loss": 0.0194, "step": 1619 }, { "clip_ratio": 0.054757989943027496, "epoch": 0.5668299510146956, "grad_norm": 0.6275632116890312, "kl": 1.9296875, "learning_rate": 9.010345797448037e-06, "loss": 0.0171, "step": 1620 }, { "clip_ratio": 0.007541895378381014, "clipped_completions_ratio": 0.0, "epoch": 0.5671798460461861, "grad_norm": 0.8847144317077913, "kl": 1.4140625, "learning_rate": 9.008521316138364e-06, "loss": 0.0126, "max_completion_length": 251.0, "max_terminated_completion_length": 251.0, "mean_completion_length": 178.46429443359375, "mean_terminated_completion_length": 178.46429443359375, "min_completion_length": 122.0, "min_terminated_completion_length": 122.0, "num_tokens": 2171144.0, "reward": 2.809156894683838, "reward_std": 0.20420916378498077, "rewards/check_originality_func/mean": 0.875, "rewards/check_originality_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.988876223564148, "rewards/check_winston_local_func/std": 0.006020308937877417, "rewards/sentence_count_match_reward_logic/mean": 0.9452806115150452, "rewards/sentence_count_match_reward_logic/std": 0.0896320715546608, "step": 1621 }, { "clip_ratio": 0.018132943660020828, "epoch": 0.5675297410776767, "grad_norm": 0.6467760815643242, "kl": 1.421875, "learning_rate": 9.006695339688179e-06, "loss": 0.0091, "step": 1622 }, { "clip_ratio": 0.03228289633989334, "epoch": 0.5678796361091673, "grad_norm": 0.5859327265874971, "kl": 1.4296875, "learning_rate": 9.00486786877855e-06, "loss": 0.0066, "step": 1623 }, { "clip_ratio": 0.04546486586332321, "epoch": 0.5682295311406578, "grad_norm": 0.42768866260686556, "kl": 1.4296875, "learning_rate": 9.003038904091113e-06, "loss": 0.0051, "step": 1624 }, { "clip_ratio": 0.0071746548637747765, "clipped_completions_ratio": 0.1785714285714286, "epoch": 0.5685794261721484, "grad_norm": 0.9496216315922223, "kl": 1.6015625, "learning_rate": 9.00120844630805e-06, "loss": 0.0102, "max_completion_length": 256.0, "max_terminated_completion_length": 250.0, "mean_completion_length": 168.32144165039062, "mean_terminated_completion_length": 149.26087951660156, "min_completion_length": 110.0, "min_terminated_completion_length": 110.0, "num_tokens": 2191218.0, "reward": 2.9012818336486816, "reward_std": 0.1281745880842209, "rewards/check_originality_func/mean": 0.9107142686843872, "rewards/check_originality_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9905673265457153, "rewards/check_winston_local_func/std": 0.00479535385966301, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1625 }, { "clip_ratio": 0.014028368517756462, "epoch": 0.5689293212036389, "grad_norm": 0.7643417426870875, "kl": 1.6015625, "learning_rate": 8.999376496112105e-06, "loss": 0.007, "step": 1626 }, { "clip_ratio": 0.03309495374560356, "epoch": 0.5692792162351294, "grad_norm": 0.7318347777069477, "kl": 1.6171875, "learning_rate": 8.997543054186577e-06, "loss": 0.004, "step": 1627 }, { "clip_ratio": 0.04518233239650726, "epoch": 0.56962911126662, "grad_norm": 0.48184912114254014, "kl": 1.640625, "learning_rate": 8.995708121215325e-06, "loss": 0.0016, "step": 1628 }, { "clip_ratio": 0.006935488898307085, "clipped_completions_ratio": 0.0, "epoch": 0.5699790062981106, "grad_norm": 0.8551113687349602, "kl": 1.4921875, "learning_rate": 8.99387169788276e-06, "loss": 0.0108, "max_completion_length": 227.0, "max_terminated_completion_length": 227.0, "mean_completion_length": 172.12501525878906, "mean_terminated_completion_length": 172.12501525878906, "min_completion_length": 65.0, "min_terminated_completion_length": 65.0, "num_tokens": 2211657.0, "reward": 2.881596326828003, "reward_std": 0.12653851509094238, "rewards/check_originality_func/mean": 0.8928571343421936, "rewards/check_originality_func/std": 0.31209391355514526, "rewards/check_winston_local_func/mean": 0.988739013671875, "rewards/check_winston_local_func/std": 0.005683646537363529, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1629 }, { "clip_ratio": 0.013090456835925579, "epoch": 0.5703289013296011, "grad_norm": 0.7536955673784689, "kl": 1.4921875, "learning_rate": 8.992033784873854e-06, "loss": 0.0082, "step": 1630 }, { "clip_ratio": 0.02407756820321083, "epoch": 0.5706787963610916, "grad_norm": 0.5235108172315824, "kl": 1.5, "learning_rate": 8.990194382874126e-06, "loss": 0.0051, "step": 1631 }, { "clip_ratio": 0.04114918410778046, "epoch": 0.5710286913925823, "grad_norm": 0.5017185697138573, "kl": 1.5078125, "learning_rate": 8.988353492569657e-06, "loss": 0.0028, "step": 1632 }, { "clip_ratio": 0.007690296042710543, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.5713785864240728, "grad_norm": 0.8874505449610227, "kl": 1.4375, "learning_rate": 8.986511114647085e-06, "loss": 0.0109, "max_completion_length": 256.0, "max_terminated_completion_length": 238.0, "mean_completion_length": 193.4107208251953, "mean_terminated_completion_length": 182.9791717529297, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 2234832.0, "reward": 2.5466549396514893, "reward_std": 0.19667057693004608, "rewards/check_originality_func/mean": 0.5892857313156128, "rewards/check_originality_func/std": 0.49641573429107666, "rewards/check_winston_local_func/mean": 0.9871311187744141, "rewards/check_winston_local_func/std": 0.00891741644591093, "rewards/sentence_count_match_reward_logic/mean": 0.9702380895614624, "rewards/sentence_count_match_reward_logic/std": 0.07525809109210968, "step": 1633 }, { "clip_ratio": 0.01608034037053585, "epoch": 0.5717284814555633, "grad_norm": 0.7039674235943476, "kl": 1.4296875, "learning_rate": 8.984667249793594e-06, "loss": 0.0071, "step": 1634 }, { "clip_ratio": 0.03067627176642418, "epoch": 0.5720783764870538, "grad_norm": 0.587492628871218, "kl": 1.4296875, "learning_rate": 8.982821898696933e-06, "loss": 0.0046, "step": 1635 }, { "clip_ratio": 0.046167220920324326, "epoch": 0.5724282715185445, "grad_norm": 0.5520475237264656, "kl": 1.4375, "learning_rate": 8.980975062045398e-06, "loss": 0.0025, "step": 1636 }, { "clip_ratio": 0.007724837865680456, "clipped_completions_ratio": 0.0535714285714286, "epoch": 0.572778166550035, "grad_norm": 1.035532344157047, "kl": 1.75, "learning_rate": 8.979126740527842e-06, "loss": 0.0217, "max_completion_length": 256.0, "max_terminated_completion_length": 245.0, "mean_completion_length": 165.98214721679688, "mean_terminated_completion_length": 160.8867950439453, "min_completion_length": 81.0, "min_terminated_completion_length": 81.0, "num_tokens": 2254495.0, "reward": 2.843907594680786, "reward_std": 0.19484013319015503, "rewards/check_originality_func/mean": 0.9107142686843872, "rewards/check_originality_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9919729232788086, "rewards/check_winston_local_func/std": 0.0019984389655292034, "rewards/sentence_count_match_reward_logic/mean": 0.9412202835083008, "rewards/sentence_count_match_reward_logic/std": 0.1201789453625679, "step": 1637 }, { "clip_ratio": 0.01883881911635399, "epoch": 0.5731280615815255, "grad_norm": 0.8197359616127504, "kl": 1.7421875, "learning_rate": 8.977276934833673e-06, "loss": 0.0161, "step": 1638 }, { "clip_ratio": 0.03778909146785736, "epoch": 0.5734779566130161, "grad_norm": 0.6264600165753856, "kl": 1.7421875, "learning_rate": 8.975425645652848e-06, "loss": 0.0121, "step": 1639 }, { "clip_ratio": 0.05712758004665375, "epoch": 0.5738278516445067, "grad_norm": 0.5494252325773995, "kl": 1.7421875, "learning_rate": 8.973572873675882e-06, "loss": 0.0093, "step": 1640 }, { "clip_ratio": 0.005564040504395962, "clipped_completions_ratio": 0.0357142857142857, "epoch": 0.5741777466759972, "grad_norm": 0.9292138450723547, "kl": 1.3359375, "learning_rate": 8.971718619593844e-06, "loss": 0.0246, "max_completion_length": 256.0, "max_terminated_completion_length": 220.0, "mean_completion_length": 177.08929443359375, "mean_terminated_completion_length": 174.1666717529297, "min_completion_length": 116.0, "min_terminated_completion_length": 116.0, "num_tokens": 2275212.0, "reward": 2.8968729972839355, "reward_std": 0.1930885761976242, "rewards/check_originality_func/mean": 0.9107142686843872, "rewards/check_originality_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9897302985191345, "rewards/check_winston_local_func/std": 0.007466427516192198, "rewards/sentence_count_match_reward_logic/mean": 0.9964285492897034, "rewards/sentence_count_match_reward_logic/std": 0.026726121082901955, "step": 1641 }, { "clip_ratio": 0.016077084466814995, "epoch": 0.5745276417074877, "grad_norm": 0.7817456990629856, "kl": 1.34375, "learning_rate": 8.969862884098346e-06, "loss": 0.0208, "step": 1642 }, { "clip_ratio": 0.02775220200419426, "epoch": 0.5748775367389783, "grad_norm": 0.574949582209816, "kl": 1.34375, "learning_rate": 8.968005667881567e-06, "loss": 0.0182, "step": 1643 }, { "clip_ratio": 0.04486168920993805, "epoch": 0.5752274317704689, "grad_norm": 0.4996273853121022, "kl": 1.359375, "learning_rate": 8.96614697163623e-06, "loss": 0.0152, "step": 1644 }, { "clip_ratio": 0.0063152615912258625, "clipped_completions_ratio": 0.125, "epoch": 0.5755773268019594, "grad_norm": 0.874210324831151, "kl": 1.5703125, "learning_rate": 8.964286796055608e-06, "loss": 0.0139, "max_completion_length": 256.0, "max_terminated_completion_length": 245.0, "mean_completion_length": 174.00001525878906, "mean_terminated_completion_length": 162.28570556640625, "min_completion_length": 94.0, "min_terminated_completion_length": 94.0, "num_tokens": 2295828.0, "reward": 2.78255033493042, "reward_std": 0.08441735059022903, "rewards/check_originality_func/mean": 0.8035714030265808, "rewards/check_originality_func/std": 0.40089187026023865, "rewards/check_winston_local_func/mean": 0.9901396036148071, "rewards/check_winston_local_func/std": 0.004523094743490219, "rewards/sentence_count_match_reward_logic/mean": 0.9888392686843872, "rewards/sentence_count_match_reward_logic/std": 0.03596704453229904, "step": 1645 }, { "clip_ratio": 0.013074547983705997, "epoch": 0.57592722183345, "grad_norm": 0.6938630416150655, "kl": 1.5703125, "learning_rate": 8.96242514183353e-06, "loss": 0.0104, "step": 1646 }, { "clip_ratio": 0.030956188216805458, "epoch": 0.5762771168649405, "grad_norm": 0.5626246285142942, "kl": 1.5703125, "learning_rate": 8.960562009664376e-06, "loss": 0.0075, "step": 1647 }, { "clip_ratio": 0.04436171427369118, "epoch": 0.5766270118964311, "grad_norm": 0.5327871890577357, "kl": 1.578125, "learning_rate": 8.958697400243077e-06, "loss": 0.0059, "step": 1648 }, { "clip_ratio": 0.005450903438031673, "clipped_completions_ratio": 0.0, "epoch": 0.5769769069279216, "grad_norm": 0.871342907958747, "kl": 1.484375, "learning_rate": 8.956831314265116e-06, "loss": 0.0105, "max_completion_length": 226.0, "max_terminated_completion_length": 226.0, "mean_completion_length": 154.875, "mean_terminated_completion_length": 154.875, "min_completion_length": 102.0, "min_terminated_completion_length": 102.0, "num_tokens": 2314517.0, "reward": 2.794548988342285, "reward_std": 0.1431834101676941, "rewards/check_originality_func/mean": 0.8214285969734192, "rewards/check_originality_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.973120391368866, "rewards/check_winston_local_func/std": 0.0493166483938694, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1649 }, { "clip_ratio": 0.01398524735122919, "epoch": 0.5773268019594122, "grad_norm": 0.6561696602416757, "kl": 1.484375, "learning_rate": 8.95496375242652e-06, "loss": 0.0075, "step": 1650 }, { "clip_ratio": 0.028321417048573494, "epoch": 0.5776766969909027, "grad_norm": 0.5569800863688084, "kl": 1.4921875, "learning_rate": 8.953094715423878e-06, "loss": 0.0055, "step": 1651 }, { "clip_ratio": 0.04042942076921463, "epoch": 0.5780265920223933, "grad_norm": 0.42392228442652813, "kl": 1.4921875, "learning_rate": 8.95122420395432e-06, "loss": 0.0037, "step": 1652 }, { "clip_ratio": 0.006455504801124334, "clipped_completions_ratio": 0.0, "epoch": 0.5783764870538838, "grad_norm": 0.737809333338609, "kl": 1.359375, "learning_rate": 8.94935221871553e-06, "loss": 0.0016, "max_completion_length": 223.0, "max_terminated_completion_length": 223.0, "mean_completion_length": 186.3928680419922, "mean_terminated_completion_length": 186.3928680419922, "min_completion_length": 133.0, "min_terminated_completion_length": 133.0, "num_tokens": 2336363.0, "reward": 2.7998061180114746, "reward_std": 0.07834573090076447, "rewards/check_originality_func/mean": 0.8214285969734192, "rewards/check_originality_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.9911325573921204, "rewards/check_winston_local_func/std": 0.0036202638875693083, "rewards/sentence_count_match_reward_logic/mean": 0.9872449040412903, "rewards/sentence_count_match_reward_logic/std": 0.04110519215464592, "step": 1653 }, { "clip_ratio": 0.013165691867470741, "epoch": 0.5787263820853744, "grad_norm": 0.6633063743415724, "kl": 1.359375, "learning_rate": 8.94747876040574e-06, "loss": -0.0013, "step": 1654 }, { "clip_ratio": 0.026692604646086693, "epoch": 0.5790762771168649, "grad_norm": 0.4886418772038259, "kl": 1.359375, "learning_rate": 8.945603829723732e-06, "loss": -0.0047, "step": 1655 }, { "clip_ratio": 0.039010755717754364, "epoch": 0.5794261721483555, "grad_norm": 0.41680435917979286, "kl": 1.3671875, "learning_rate": 8.94372742736884e-06, "loss": -0.0065, "step": 1656 }, { "clip_ratio": 0.006179118063300848, "clipped_completions_ratio": 0.0, "epoch": 0.5797760671798461, "grad_norm": 0.9107975217420006, "kl": 1.5859375, "learning_rate": 8.94184955404094e-06, "loss": 0.0219, "max_completion_length": 252.0, "max_terminated_completion_length": 252.0, "mean_completion_length": 181.1607208251953, "mean_terminated_completion_length": 181.1607208251953, "min_completion_length": 85.0, "min_terminated_completion_length": 85.0, "num_tokens": 2357516.0, "reward": 2.8801662921905518, "reward_std": 0.23359334468841553, "rewards/check_originality_func/mean": 0.8928571343421936, "rewards/check_originality_func/std": 0.31209391355514526, "rewards/check_winston_local_func/mean": 0.989859938621521, "rewards/check_winston_local_func/std": 0.005025110673159361, "rewards/sentence_count_match_reward_logic/mean": 0.9974489808082581, "rewards/sentence_count_match_reward_logic/std": 0.019090088084340096, "step": 1657 }, { "clip_ratio": 0.01621832326054573, "epoch": 0.5801259622113366, "grad_norm": 0.6924992718634267, "kl": 1.59375, "learning_rate": 8.939970210440466e-06, "loss": 0.0184, "step": 1658 }, { "clip_ratio": 0.031080463901162148, "epoch": 0.5804758572428271, "grad_norm": 0.5133949221195692, "kl": 1.59375, "learning_rate": 8.93808939726839e-06, "loss": 0.0158, "step": 1659 }, { "clip_ratio": 0.04217400774359703, "epoch": 0.5808257522743177, "grad_norm": 0.4275355439614787, "kl": 1.6015625, "learning_rate": 8.936207115226242e-06, "loss": 0.0145, "step": 1660 }, { "clip_ratio": 0.00480048730969429, "clipped_completions_ratio": 0.1785714285714286, "epoch": 0.5811756473058083, "grad_norm": 0.7816688468381102, "kl": 1.1640625, "learning_rate": 8.934323365016094e-06, "loss": 0.0134, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 197.46429443359375, "mean_terminated_completion_length": 184.7391357421875, "min_completion_length": 112.0, "min_terminated_completion_length": 112.0, "num_tokens": 2382086.0, "reward": 2.465625762939453, "reward_std": 0.3318251967430115, "rewards/check_originality_func/mean": 0.5892857313156128, "rewards/check_originality_func/std": 0.49641573429107666, "rewards/check_winston_local_func/mean": 0.8961814045906067, "rewards/check_winston_local_func/std": 0.22877685725688934, "rewards/sentence_count_match_reward_logic/mean": 0.9801587462425232, "rewards/sentence_count_match_reward_logic/std": 0.04578602686524391, "step": 1661 }, { "clip_ratio": 0.010009455494582653, "epoch": 0.5815255423372988, "grad_norm": 0.6192290887944883, "kl": 1.171875, "learning_rate": 8.932438147340565e-06, "loss": 0.0112, "step": 1662 }, { "clip_ratio": 0.023978622630238533, "epoch": 0.5818754373687893, "grad_norm": 0.4249754381586524, "kl": 1.171875, "learning_rate": 8.930551462902826e-06, "loss": 0.0087, "step": 1663 }, { "clip_ratio": 0.0345783494412899, "epoch": 0.58222533240028, "grad_norm": 0.4149863172905822, "kl": 1.171875, "learning_rate": 8.928663312406593e-06, "loss": 0.007, "step": 1664 }, { "clip_ratio": 0.006970426067709923, "clipped_completions_ratio": 0.0714285714285714, "epoch": 0.5825752274317705, "grad_norm": 0.9102715816757105, "kl": 1.5703125, "learning_rate": 8.926773696556125e-06, "loss": 0.014, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 168.1607208251953, "mean_terminated_completion_length": 161.4038543701172, "min_completion_length": 77.0, "min_terminated_completion_length": 77.0, "num_tokens": 2402063.0, "reward": 2.7389416694641113, "reward_std": 0.29873424768447876, "rewards/check_originality_func/mean": 0.75, "rewards/check_originality_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.988941490650177, "rewards/check_winston_local_func/std": 0.010642393492162228, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1665 }, { "clip_ratio": 0.01186033058911562, "epoch": 0.582925122463261, "grad_norm": 0.7001556740082777, "kl": 1.578125, "learning_rate": 8.924882616056231e-06, "loss": 0.0106, "step": 1666 }, { "clip_ratio": 0.02923380769789219, "epoch": 0.5832750174947515, "grad_norm": 0.5249336522075201, "kl": 1.578125, "learning_rate": 8.92299007161227e-06, "loss": 0.0083, "step": 1667 }, { "clip_ratio": 0.044158775359392166, "epoch": 0.5836249125262422, "grad_norm": 0.44822928875278795, "kl": 1.6015625, "learning_rate": 8.921096063930141e-06, "loss": 0.0074, "step": 1668 }, { "clip_ratio": 0.006197282578796148, "clipped_completions_ratio": 0.0, "epoch": 0.5839748075577327, "grad_norm": 0.8615195056507845, "kl": 1.4296875, "learning_rate": 8.91920059371629e-06, "loss": 0.0061, "max_completion_length": 232.0, "max_terminated_completion_length": 232.0, "mean_completion_length": 183.55357360839844, "mean_terminated_completion_length": 183.55357360839844, "min_completion_length": 126.0, "min_terminated_completion_length": 126.0, "num_tokens": 2423598.0, "reward": 2.900864839553833, "reward_std": 0.0765240490436554, "rewards/check_originality_func/mean": 0.9107142686843872, "rewards/check_originality_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9901503920555115, "rewards/check_winston_local_func/std": 0.003384185256436467, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1669 }, { "clip_ratio": 0.011492881923913956, "epoch": 0.5843247025892232, "grad_norm": 0.6705467522104103, "kl": 1.4296875, "learning_rate": 8.917303661677712e-06, "loss": 0.0034, "step": 1670 }, { "clip_ratio": 0.023280994966626167, "epoch": 0.5846745976207138, "grad_norm": 0.6306357350794807, "kl": 1.4296875, "learning_rate": 8.91540526852194e-06, "loss": 0.0008, "step": 1671 }, { "clip_ratio": 0.037305187433958054, "epoch": 0.5850244926522044, "grad_norm": 0.4487354350209113, "kl": 1.4375, "learning_rate": 8.91350541495706e-06, "loss": -0.0015, "step": 1672 }, { "clip_ratio": 0.004919210448861122, "clipped_completions_ratio": 0.0535714285714286, "epoch": 0.5853743876836949, "grad_norm": 0.9391896185962303, "kl": 1.4375, "learning_rate": 8.9116041016917e-06, "loss": 0.0236, "max_completion_length": 256.0, "max_terminated_completion_length": 251.0, "mean_completion_length": 168.2857208251953, "mean_terminated_completion_length": 163.3207550048828, "min_completion_length": 110.0, "min_terminated_completion_length": 110.0, "num_tokens": 2443422.0, "reward": 2.585414409637451, "reward_std": 0.2385113686323166, "rewards/check_originality_func/mean": 0.6428571343421936, "rewards/check_originality_func/std": 0.48349374532699585, "rewards/check_winston_local_func/mean": 0.9589262008666992, "rewards/check_winston_local_func/std": 0.08359228074550629, "rewards/sentence_count_match_reward_logic/mean": 0.9836309552192688, "rewards/sentence_count_match_reward_logic/std": 0.060343384742736816, "step": 1673 }, { "clip_ratio": 0.015423928387463093, "epoch": 0.5857242827151854, "grad_norm": 0.7156300448208848, "kl": 1.4375, "learning_rate": 8.909701329435032e-06, "loss": 0.0199, "step": 1674 }, { "clip_ratio": 0.031746409833431244, "epoch": 0.586074177746676, "grad_norm": 0.5169027708329065, "kl": 1.4375, "learning_rate": 8.907797098896768e-06, "loss": 0.017, "step": 1675 }, { "clip_ratio": 0.0438484326004982, "epoch": 0.5864240727781665, "grad_norm": 0.43574699638489117, "kl": 1.4375, "learning_rate": 8.905891410787174e-06, "loss": 0.0154, "step": 1676 }, { "clip_ratio": 0.006366013549268246, "clipped_completions_ratio": 0.0, "epoch": 0.5867739678096571, "grad_norm": 1.1374016678081111, "kl": 1.828125, "learning_rate": 8.903984265817048e-06, "loss": 0.0125, "max_completion_length": 242.0, "max_terminated_completion_length": 242.0, "mean_completion_length": 151.55357360839844, "mean_terminated_completion_length": 151.55357360839844, "min_completion_length": 58.0, "min_terminated_completion_length": 58.0, "num_tokens": 2461989.0, "reward": 2.932560920715332, "reward_std": 0.08118584752082825, "rewards/check_originality_func/mean": 0.9464285969734192, "rewards/check_originality_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9881162643432617, "rewards/check_winston_local_func/std": 0.006371701136231422, "rewards/sentence_count_match_reward_logic/mean": 0.9980158805847168, "rewards/sentence_count_match_reward_logic/std": 0.014847845770418644, "step": 1677 }, { "clip_ratio": 0.017781708389520645, "epoch": 0.5871238628411477, "grad_norm": 0.7156318561829134, "kl": 1.8359375, "learning_rate": 8.90207566469774e-06, "loss": 0.0091, "step": 1678 }, { "clip_ratio": 0.03388269245624542, "epoch": 0.5874737578726382, "grad_norm": 0.6386930470645481, "kl": 1.859375, "learning_rate": 8.90016560814114e-06, "loss": 0.0069, "step": 1679 }, { "clip_ratio": 0.04627849534153938, "epoch": 0.5878236529041287, "grad_norm": 0.5650239454999542, "kl": 1.8671875, "learning_rate": 8.898254096859681e-06, "loss": 0.0054, "step": 1680 }, { "clip_ratio": 0.006507327780127525, "clipped_completions_ratio": 0.4285714285714286, "epoch": 0.5881735479356193, "grad_norm": 0.9445275239856761, "kl": 1.3828125, "learning_rate": 8.89634113156634e-06, "loss": 0.0118, "max_completion_length": 256.0, "max_terminated_completion_length": 226.0, "mean_completion_length": 205.19644165039062, "mean_terminated_completion_length": 167.09375, "min_completion_length": 106.0, "min_terminated_completion_length": 106.0, "num_tokens": 2486624.0, "reward": 2.698808431625366, "reward_std": 0.32877644896507263, "rewards/check_originality_func/mean": 0.7321428656578064, "rewards/check_originality_func/std": 0.44685041904449463, "rewards/check_winston_local_func/mean": 0.9896246194839478, "rewards/check_winston_local_func/std": 0.009998152032494545, "rewards/sentence_count_match_reward_logic/mean": 0.9770408272743225, "rewards/sentence_count_match_reward_logic/std": 0.05953926220536232, "step": 1681 }, { "clip_ratio": 0.012675286270678043, "epoch": 0.5885234429671099, "grad_norm": 0.7726843039662525, "kl": 1.3828125, "learning_rate": 8.89442671297463e-06, "loss": 0.0096, "step": 1682 }, { "clip_ratio": 0.025160925462841988, "epoch": 0.5888733379986004, "grad_norm": 0.5757302916723921, "kl": 1.390625, "learning_rate": 8.892510841798616e-06, "loss": 0.0053, "step": 1683 }, { "clip_ratio": 0.039766691625118256, "epoch": 0.5892232330300909, "grad_norm": 0.4152440269074419, "kl": 1.390625, "learning_rate": 8.8905935187529e-06, "loss": 0.0036, "step": 1684 }, { "clip_ratio": 0.005948531441390514, "clipped_completions_ratio": 0.3214285714285714, "epoch": 0.5895731280615816, "grad_norm": 0.8347496334284319, "kl": 1.234375, "learning_rate": 8.888674744552619e-06, "loss": 0.0203, "max_completion_length": 256.0, "max_terminated_completion_length": 246.0, "mean_completion_length": 184.6607208251953, "mean_terminated_completion_length": 150.86842346191406, "min_completion_length": 106.0, "min_terminated_completion_length": 106.0, "num_tokens": 2508781.0, "reward": 2.5770249366760254, "reward_std": 0.20972700417041779, "rewards/check_originality_func/mean": 0.6071428656578064, "rewards/check_originality_func/std": 0.4928053617477417, "rewards/check_winston_local_func/mean": 0.9851881861686707, "rewards/check_winston_local_func/std": 0.012268810532987118, "rewards/sentence_count_match_reward_logic/mean": 0.9846938848495483, "rewards/sentence_count_match_reward_logic/std": 0.044584840536117554, "step": 1685 }, { "clip_ratio": 0.01131476927548647, "epoch": 0.5899230230930721, "grad_norm": 0.630891459785606, "kl": 1.234375, "learning_rate": 8.886754519913465e-06, "loss": 0.0171, "step": 1686 }, { "clip_ratio": 0.026742825284600258, "epoch": 0.5902729181245626, "grad_norm": 0.4917391768713056, "kl": 1.2421875, "learning_rate": 8.88483284555166e-06, "loss": 0.0149, "step": 1687 }, { "clip_ratio": 0.037426646798849106, "epoch": 0.5906228131560531, "grad_norm": 0.4341985477244067, "kl": 1.234375, "learning_rate": 8.882909722183973e-06, "loss": 0.0135, "step": 1688 }, { "clip_ratio": 0.007315327879041433, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.5909727081875438, "grad_norm": 1.0585195362465574, "kl": 1.5390625, "learning_rate": 8.880985150527705e-06, "loss": 0.0196, "max_completion_length": 256.0, "max_terminated_completion_length": 217.0, "mean_completion_length": 171.33929443359375, "mean_terminated_completion_length": 157.2291717529297, "min_completion_length": 92.0, "min_terminated_completion_length": 92.0, "num_tokens": 2529560.0, "reward": 2.736982822418213, "reward_std": 0.174305722117424, "rewards/check_originality_func/mean": 0.8035714030265808, "rewards/check_originality_func/std": 0.40089184045791626, "rewards/check_winston_local_func/mean": 0.9693379998207092, "rewards/check_winston_local_func/std": 0.059496961534023285, "rewards/sentence_count_match_reward_logic/mean": 0.964073121547699, "rewards/sentence_count_match_reward_logic/std": 0.07053887844085693, "step": 1689 }, { "clip_ratio": 0.016426891088485718, "epoch": 0.5913226032190343, "grad_norm": 0.7555592536940304, "kl": 1.5390625, "learning_rate": 8.879059131300709e-06, "loss": 0.0155, "step": 1690 }, { "clip_ratio": 0.03215109184384346, "epoch": 0.5916724982505248, "grad_norm": 0.5802148809466472, "kl": 1.546875, "learning_rate": 8.87713166522137e-06, "loss": 0.0124, "step": 1691 }, { "clip_ratio": 0.04554247111082077, "epoch": 0.5920223932820154, "grad_norm": 0.5313541985861503, "kl": 1.5546875, "learning_rate": 8.875202753008614e-06, "loss": 0.0109, "step": 1692 }, { "clip_ratio": 0.006670800037682056, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.592372288313506, "grad_norm": 0.9067966853869223, "kl": 1.453125, "learning_rate": 8.873272395381907e-06, "loss": 0.0163, "max_completion_length": 256.0, "max_terminated_completion_length": 187.0, "mean_completion_length": 167.44644165039062, "mean_terminated_completion_length": 152.6875, "min_completion_length": 104.0, "min_terminated_completion_length": 104.0, "num_tokens": 2549585.0, "reward": 2.854673147201538, "reward_std": 0.05106176808476448, "rewards/check_originality_func/mean": 0.875, "rewards/check_originality_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.9852002263069153, "rewards/check_winston_local_func/std": 0.018124090507626534, "rewards/sentence_count_match_reward_logic/mean": 0.9944728016853333, "rewards/sentence_count_match_reward_logic/std": 0.029068926349282265, "step": 1693 }, { "clip_ratio": 0.014561831951141357, "epoch": 0.5927221833449965, "grad_norm": 0.7560174685231327, "kl": 1.453125, "learning_rate": 8.871340593061255e-06, "loss": 0.0129, "step": 1694 }, { "clip_ratio": 0.030186878517270088, "epoch": 0.593072078376487, "grad_norm": 0.6039750222155129, "kl": 1.453125, "learning_rate": 8.869407346767202e-06, "loss": 0.0091, "step": 1695 }, { "clip_ratio": 0.04421890527009964, "epoch": 0.5934219734079776, "grad_norm": 0.4744609362865127, "kl": 1.453125, "learning_rate": 8.867472657220829e-06, "loss": 0.0071, "step": 1696 }, { "clip_ratio": 0.006785286590456963, "clipped_completions_ratio": 0.1785714285714286, "epoch": 0.5937718684394682, "grad_norm": 0.8218516572457162, "kl": 1.265625, "learning_rate": 8.865536525143758e-06, "loss": 0.0119, "max_completion_length": 256.0, "max_terminated_completion_length": 245.0, "mean_completion_length": 183.0178680419922, "mean_terminated_completion_length": 167.1521759033203, "min_completion_length": 132.0, "min_terminated_completion_length": 132.0, "num_tokens": 2571274.0, "reward": 2.768963098526001, "reward_std": 0.20574454963207245, "rewards/check_originality_func/mean": 0.8214285969734192, "rewards/check_originality_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.991943359375, "rewards/check_winston_local_func/std": 0.001869482221081853, "rewards/sentence_count_match_reward_logic/mean": 0.9555909037590027, "rewards/sentence_count_match_reward_logic/std": 0.07490973174571991, "step": 1697 }, { "clip_ratio": 0.01142541877925396, "epoch": 0.5941217634709587, "grad_norm": 0.6200123209664288, "kl": 1.265625, "learning_rate": 8.86359895125815e-06, "loss": 0.0095, "step": 1698 }, { "clip_ratio": 0.023820945993065834, "epoch": 0.5944716585024493, "grad_norm": 0.47750059082720187, "kl": 1.2734375, "learning_rate": 8.861659936286698e-06, "loss": 0.0064, "step": 1699 }, { "clip_ratio": 0.03576916083693504, "epoch": 0.5948215535339398, "grad_norm": 0.4203749759196097, "kl": 1.2890625, "learning_rate": 8.859719480952637e-06, "loss": 0.0049, "step": 1700 }, { "clip_ratio": 0.007588852662593126, "clipped_completions_ratio": 0.0, "epoch": 0.5951714485654304, "grad_norm": 0.8441624325931054, "kl": 1.375, "learning_rate": 8.85777758597974e-06, "loss": 0.0141, "max_completion_length": 218.0, "max_terminated_completion_length": 218.0, "mean_completion_length": 171.37501525878906, "mean_terminated_completion_length": 171.37501525878906, "min_completion_length": 100.0, "min_terminated_completion_length": 100.0, "num_tokens": 2591567.0, "reward": 2.826693058013916, "reward_std": 0.2730238139629364, "rewards/check_originality_func/mean": 0.8392857313156128, "rewards/check_originality_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.9874071478843689, "rewards/check_winston_local_func/std": 0.00635100482031703, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1701 }, { "clip_ratio": 0.014429425820708275, "epoch": 0.5955213435969209, "grad_norm": 0.6605135061068431, "kl": 1.375, "learning_rate": 8.855834252092315e-06, "loss": 0.011, "step": 1702 }, { "clip_ratio": 0.02850094810128212, "epoch": 0.5958712386284115, "grad_norm": 0.5371600178890484, "kl": 1.3828125, "learning_rate": 8.853889480015208e-06, "loss": 0.009, "step": 1703 }, { "clip_ratio": 0.042572662234306335, "epoch": 0.596221133659902, "grad_norm": 0.4400516565593935, "kl": 1.390625, "learning_rate": 8.851943270473797e-06, "loss": 0.0067, "step": 1704 }, { "clip_ratio": 0.006859803106635809, "clipped_completions_ratio": 0.0, "epoch": 0.5965710286913926, "grad_norm": 0.9768931216936146, "kl": 1.7734375, "learning_rate": 8.849995624194006e-06, "loss": 0.0278, "max_completion_length": 199.0, "max_terminated_completion_length": 199.0, "mean_completion_length": 141.1607208251953, "mean_terminated_completion_length": 141.1607208251953, "min_completion_length": 70.0, "min_terminated_completion_length": 70.0, "num_tokens": 2608600.0, "reward": 2.9102399349212646, "reward_std": 0.16914087533950806, "rewards/check_originality_func/mean": 0.9464285969734192, "rewards/check_originality_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9876208901405334, "rewards/check_winston_local_func/std": 0.013894368894398212, "rewards/sentence_count_match_reward_logic/mean": 0.976190447807312, "rewards/sentence_count_match_reward_logic/std": 0.058848995715379715, "step": 1705 }, { "clip_ratio": 0.013511528261005878, "epoch": 0.5969209237228832, "grad_norm": 0.6980664555853288, "kl": 1.7734375, "learning_rate": 8.84804654190228e-06, "loss": 0.0241, "step": 1706 }, { "clip_ratio": 0.031002560630440712, "epoch": 0.5972708187543737, "grad_norm": 0.5055921657950802, "kl": 1.7890625, "learning_rate": 8.846096024325616e-06, "loss": 0.0213, "step": 1707 }, { "clip_ratio": 0.04710895195603371, "epoch": 0.5976207137858642, "grad_norm": 0.5080652169076644, "kl": 1.8046875, "learning_rate": 8.844144072191537e-06, "loss": 0.0196, "step": 1708 }, { "clip_ratio": 0.0050065964460372925, "clipped_completions_ratio": 0.2321428571428571, "epoch": 0.5979706088173548, "grad_norm": 0.8900920851974606, "kl": 1.296875, "learning_rate": 8.842190686228098e-06, "loss": 0.0153, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 181.07144165039062, "mean_terminated_completion_length": 158.41860961914062, "min_completion_length": 65.0, "min_terminated_completion_length": 65.0, "num_tokens": 2630348.0, "reward": 2.638150930404663, "reward_std": 0.21393997967243195, "rewards/check_originality_func/mean": 0.6607142686843872, "rewards/check_originality_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.9889447093009949, "rewards/check_winston_local_func/std": 0.006158861331641674, "rewards/sentence_count_match_reward_logic/mean": 0.9884920120239258, "rewards/sentence_count_match_reward_logic/std": 0.03860439732670784, "step": 1709 }, { "clip_ratio": 0.014122048392891884, "epoch": 0.5983205038488454, "grad_norm": 0.6327721372055422, "kl": 1.3046875, "learning_rate": 8.8402358671639e-06, "loss": 0.0124, "step": 1710 }, { "clip_ratio": 0.02559087984263897, "epoch": 0.5986703988803359, "grad_norm": 0.5178473402642506, "kl": 1.328125, "learning_rate": 8.838279615728067e-06, "loss": 0.0106, "step": 1711 }, { "clip_ratio": 0.04101797565817833, "epoch": 0.5990202939118264, "grad_norm": 0.4760229703492022, "kl": 1.3515625, "learning_rate": 8.836321932650266e-06, "loss": 0.0091, "step": 1712 }, { "clip_ratio": 0.005229326896369457, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.599370188943317, "grad_norm": 0.8616651830081272, "kl": 1.2890625, "learning_rate": 8.834362818660692e-06, "loss": 0.0136, "max_completion_length": 256.0, "max_terminated_completion_length": 221.0, "mean_completion_length": 178.67857360839844, "mean_terminated_completion_length": 165.7916717529297, "min_completion_length": 110.0, "min_terminated_completion_length": 110.0, "num_tokens": 2651714.0, "reward": 2.6959993839263916, "reward_std": 0.13851694762706757, "rewards/check_originality_func/mean": 0.75, "rewards/check_originality_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.9740601778030396, "rewards/check_winston_local_func/std": 0.05516791716217995, "rewards/sentence_count_match_reward_logic/mean": 0.9719387888908386, "rewards/sentence_count_match_reward_logic/std": 0.0742039605975151, "step": 1713 }, { "clip_ratio": 0.014064209535717964, "epoch": 0.5997200839748076, "grad_norm": 0.6467994160231182, "kl": 1.296875, "learning_rate": 8.832402274490075e-06, "loss": 0.0101, "step": 1714 }, { "clip_ratio": 0.026781940832734108, "epoch": 0.6000699790062981, "grad_norm": 0.5741011837599668, "kl": 1.296875, "learning_rate": 8.830440300869681e-06, "loss": 0.0077, "step": 1715 }, { "clip_ratio": 0.040404729545116425, "epoch": 0.6004198740377886, "grad_norm": 0.39371967484201337, "kl": 1.296875, "learning_rate": 8.828476898531308e-06, "loss": 0.0054, "step": 1716 }, { "clip_ratio": 0.005550582893192768, "clipped_completions_ratio": 0.25, "epoch": 0.6007697690692793, "grad_norm": 0.8459838990952764, "kl": 1.359375, "learning_rate": 8.826512068207286e-06, "loss": 0.0127, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 173.19644165039062, "mean_terminated_completion_length": 145.59524536132812, "min_completion_length": 107.0, "min_terminated_completion_length": 107.0, "num_tokens": 2672541.0, "reward": 2.6193556785583496, "reward_std": 0.1883695423603058, "rewards/check_originality_func/mean": 0.6428571343421936, "rewards/check_originality_func/std": 0.48349377512931824, "rewards/check_winston_local_func/mean": 0.986702561378479, "rewards/check_winston_local_func/std": 0.010490351356565952, "rewards/sentence_count_match_reward_logic/mean": 0.989795982837677, "rewards/sentence_count_match_reward_logic/std": 0.03712429851293564, "step": 1717 }, { "clip_ratio": 0.011312580667436123, "epoch": 0.6011196641007698, "grad_norm": 0.6858992972799616, "kl": 1.359375, "learning_rate": 8.824545810630478e-06, "loss": 0.0099, "step": 1718 }, { "clip_ratio": 0.028359852731227875, "epoch": 0.6014695591322603, "grad_norm": 0.4634426261644045, "kl": 1.359375, "learning_rate": 8.822578126534278e-06, "loss": 0.007, "step": 1719 }, { "clip_ratio": 0.04222244396805763, "epoch": 0.6018194541637508, "grad_norm": 0.4286546025854166, "kl": 1.375, "learning_rate": 8.820609016652616e-06, "loss": 0.0057, "step": 1720 }, { "clip_ratio": 0.007308235391974449, "clipped_completions_ratio": 0.0, "epoch": 0.6021693491952415, "grad_norm": 1.1159723764168383, "kl": 1.8203125, "learning_rate": 8.818638481719952e-06, "loss": 0.0162, "max_completion_length": 213.0, "max_terminated_completion_length": 213.0, "mean_completion_length": 151.82144165039062, "mean_terminated_completion_length": 151.82144165039062, "min_completion_length": 57.0, "min_terminated_completion_length": 57.0, "num_tokens": 2690915.0, "reward": 2.8630869388580322, "reward_std": 0.17553813755512238, "rewards/check_originality_func/mean": 0.875, "rewards/check_originality_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.9880867004394531, "rewards/check_winston_local_func/std": 0.006147105246782303, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1721 }, { "clip_ratio": 0.016035879030823708, "epoch": 0.602519244226732, "grad_norm": 0.7653209725055296, "kl": 1.84375, "learning_rate": 8.816666522471274e-06, "loss": 0.0123, "step": 1722 }, { "clip_ratio": 0.033652789890766144, "epoch": 0.6028691392582225, "grad_norm": 0.6902736836583356, "kl": 1.8828125, "learning_rate": 8.814693139642105e-06, "loss": 0.0101, "step": 1723 }, { "clip_ratio": 0.04624111205339432, "epoch": 0.6032190342897131, "grad_norm": 0.571662215834823, "kl": 1.859375, "learning_rate": 8.812718333968498e-06, "loss": 0.0085, "step": 1724 }, { "clip_ratio": 0.008241831324994564, "clipped_completions_ratio": 0.1964285714285714, "epoch": 0.6035689293212037, "grad_norm": 0.8326345198291025, "kl": 1.3828125, "learning_rate": 8.810742106187039e-06, "loss": 0.0083, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 189.9107208251953, "mean_terminated_completion_length": 173.75555419921875, "min_completion_length": 108.0, "min_terminated_completion_length": 108.0, "num_tokens": 2713206.0, "reward": 2.7745401859283447, "reward_std": 0.1320342868566513, "rewards/check_originality_func/mean": 0.7857142686843872, "rewards/check_originality_func/std": 0.41403934359550476, "rewards/check_winston_local_func/mean": 0.9910576939582825, "rewards/check_winston_local_func/std": 0.0040301610715687275, "rewards/sentence_count_match_reward_logic/mean": 0.9977678656578064, "rewards/sentence_count_match_reward_logic/std": 0.016703827306628227, "step": 1725 }, { "clip_ratio": 0.013592597097158432, "epoch": 0.6039188243526942, "grad_norm": 0.6758358960807651, "kl": 1.375, "learning_rate": 8.808764457034839e-06, "loss": 0.0049, "step": 1726 }, { "clip_ratio": 0.02745046466588974, "epoch": 0.6042687193841847, "grad_norm": 0.49378502304033384, "kl": 1.375, "learning_rate": 8.806785387249546e-06, "loss": 0.0024, "step": 1727 }, { "clip_ratio": 0.0408182330429554, "epoch": 0.6046186144156753, "grad_norm": 0.42033327629258027, "kl": 1.3828125, "learning_rate": 8.80480489756933e-06, "loss": 0.0006, "step": 1728 }, { "clip_ratio": 0.006081518717110157, "clipped_completions_ratio": 0.017857142857142905, "epoch": 0.6049685094471658, "grad_norm": 0.9583518123040718, "kl": 1.6328125, "learning_rate": 8.8028229887329e-06, "loss": 0.0395, "max_completion_length": 256.0, "max_terminated_completion_length": 251.0, "mean_completion_length": 153.2678680419922, "mean_terminated_completion_length": 151.39999389648438, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 2731525.0, "reward": 2.9124269485473633, "reward_std": 0.09352434426546097, "rewards/check_originality_func/mean": 0.9464285969734192, "rewards/check_originality_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9659983515739441, "rewards/check_winston_local_func/std": 0.0686015784740448, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1729 }, { "clip_ratio": 0.016275713220238686, "epoch": 0.6053184044786564, "grad_norm": 0.8287914975630107, "kl": 1.6328125, "learning_rate": 8.800839661479483e-06, "loss": 0.0363, "step": 1730 }, { "clip_ratio": 0.03257225081324577, "epoch": 0.605668299510147, "grad_norm": 1.0411726107339965, "kl": 1.71875, "learning_rate": 8.798854916548847e-06, "loss": 0.034, "step": 1731 }, { "clip_ratio": 0.0447259359061718, "epoch": 0.6060181945416375, "grad_norm": 0.5698945675873212, "kl": 1.6484375, "learning_rate": 8.79686875468128e-06, "loss": 0.0319, "step": 1732 }, { "clip_ratio": 0.007722186390310526, "clipped_completions_ratio": 0.0, "epoch": 0.606368089573128, "grad_norm": 0.9607848294491002, "kl": 1.4609375, "learning_rate": 8.794881176617602e-06, "loss": 0.0178, "max_completion_length": 217.0, "max_terminated_completion_length": 217.0, "mean_completion_length": 171.58929443359375, "mean_terminated_completion_length": 171.58929443359375, "min_completion_length": 77.0, "min_terminated_completion_length": 77.0, "num_tokens": 2751830.0, "reward": 2.979430675506592, "reward_std": 0.01747554913163185, "rewards/check_originality_func/mean": 1.0, "rewards/check_originality_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9824066162109375, "rewards/check_winston_local_func/std": 0.024049151688814163, "rewards/sentence_count_match_reward_logic/mean": 0.9970237612724304, "rewards/sentence_count_match_reward_logic/std": 0.02227177284657955, "step": 1733 }, { "clip_ratio": 0.019516605883836746, "epoch": 0.6067179846046186, "grad_norm": 0.87632715517645, "kl": 1.4609375, "learning_rate": 8.792892183099162e-06, "loss": 0.0146, "step": 1734 }, { "clip_ratio": 0.03306352347135544, "epoch": 0.6070678796361092, "grad_norm": 0.6902696726606252, "kl": 1.4609375, "learning_rate": 8.790901774867834e-06, "loss": 0.0112, "step": 1735 }, { "clip_ratio": 0.04468070715665817, "epoch": 0.6074177746675997, "grad_norm": 0.46212027049659676, "kl": 1.4765625, "learning_rate": 8.788909952666024e-06, "loss": 0.0091, "step": 1736 }, { "clip_ratio": 0.005154935643076897, "clipped_completions_ratio": 0.0, "epoch": 0.6077676696990902, "grad_norm": 0.8089263513934158, "kl": 1.3125, "learning_rate": 8.78691671723666e-06, "loss": 0.0136, "max_completion_length": 243.0, "max_terminated_completion_length": 243.0, "mean_completion_length": 192.1428680419922, "mean_terminated_completion_length": 192.1428680419922, "min_completion_length": 77.0, "min_terminated_completion_length": 77.0, "num_tokens": 2774254.0, "reward": 2.7688920497894287, "reward_std": 0.14026899635791779, "rewards/check_originality_func/mean": 0.7857142686843872, "rewards/check_originality_func/std": 0.4140393137931824, "rewards/check_winston_local_func/mean": 0.9831779599189758, "rewards/check_winston_local_func/std": 0.024832209572196007, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1737 }, { "clip_ratio": 0.014384102076292038, "epoch": 0.6081175647305809, "grad_norm": 0.7122502686259345, "kl": 1.3125, "learning_rate": 8.784922069323202e-06, "loss": 0.0107, "step": 1738 }, { "clip_ratio": 0.026090925559401512, "epoch": 0.6084674597620714, "grad_norm": 0.6230721054747376, "kl": 1.3125, "learning_rate": 8.782926009669635e-06, "loss": 0.0085, "step": 1739 }, { "clip_ratio": 0.034793294966220856, "epoch": 0.6088173547935619, "grad_norm": 0.46779112178316773, "kl": 1.3125, "learning_rate": 8.780928539020467e-06, "loss": 0.0063, "step": 1740 }, { "clip_ratio": 0.0047789751552045345, "clipped_completions_ratio": 0.125, "epoch": 0.6091672498250524, "grad_norm": 0.9347672764022856, "kl": 1.4609375, "learning_rate": 8.77892965812074e-06, "loss": 0.006, "max_completion_length": 256.0, "max_terminated_completion_length": 245.0, "mean_completion_length": 183.69644165039062, "mean_terminated_completion_length": 173.36734008789062, "min_completion_length": 114.0, "min_terminated_completion_length": 114.0, "num_tokens": 2795669.0, "reward": 2.865147829055786, "reward_std": 0.05406028777360916, "rewards/check_originality_func/mean": 0.875, "rewards/check_originality_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.9901474714279175, "rewards/check_winston_local_func/std": 0.005919365677982569, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1741 }, { "clip_ratio": 0.012203969992697239, "epoch": 0.6095171448565431, "grad_norm": 0.797173726521578, "kl": 1.453125, "learning_rate": 8.776929367716013e-06, "loss": 0.0022, "step": 1742 }, { "clip_ratio": 0.02815239503979683, "epoch": 0.6098670398880336, "grad_norm": 0.5721350452257146, "kl": 1.4609375, "learning_rate": 8.77492766855238e-06, "loss": -0.0021, "step": 1743 }, { "clip_ratio": 0.046114400029182434, "epoch": 0.6102169349195241, "grad_norm": 0.4716002581501036, "kl": 1.46875, "learning_rate": 8.772924561376454e-06, "loss": -0.0045, "step": 1744 }, { "clip_ratio": 0.0037464716006070375, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.6105668299510147, "grad_norm": 0.7575524057301248, "kl": 1.3828125, "learning_rate": 8.770920046935374e-06, "loss": 0.0067, "max_completion_length": 256.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 179.55357360839844, "mean_terminated_completion_length": 166.8125, "min_completion_length": 85.0, "min_terminated_completion_length": 85.0, "num_tokens": 2817220.0, "reward": 2.7395198345184326, "reward_std": 0.09486033022403717, "rewards/check_originality_func/mean": 0.8035714030265808, "rewards/check_originality_func/std": 0.40089187026023865, "rewards/check_winston_local_func/mean": 0.9835672378540039, "rewards/check_winston_local_func/std": 0.02249489165842533, "rewards/sentence_count_match_reward_logic/mean": 0.9523809552192688, "rewards/sentence_count_match_reward_logic/std": 0.07597372680902481, "step": 1745 }, { "clip_ratio": 0.009152519516646862, "epoch": 0.6109167249825053, "grad_norm": 0.6168415680505156, "kl": 1.3828125, "learning_rate": 8.768914125976806e-06, "loss": 0.0045, "step": 1746 }, { "clip_ratio": 0.020170072093605995, "epoch": 0.6112666200139958, "grad_norm": 0.4925876209219422, "kl": 1.390625, "learning_rate": 8.76690679924894e-06, "loss": 0.0025, "step": 1747 }, { "clip_ratio": 0.03445278853178024, "epoch": 0.6116165150454863, "grad_norm": 0.413595625841306, "kl": 1.3984375, "learning_rate": 8.764898067500488e-06, "loss": 0.0006, "step": 1748 }, { "clip_ratio": 0.0046190740540623665, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.6119664100769769, "grad_norm": 0.7493149798267321, "kl": 1.1796875, "learning_rate": 8.76288793148069e-06, "loss": 0.0121, "max_completion_length": 256.0, "max_terminated_completion_length": 246.0, "mean_completion_length": 207.8928680419922, "mean_terminated_completion_length": 199.875, "min_completion_length": 154.0, "min_terminated_completion_length": 154.0, "num_tokens": 2841854.0, "reward": 2.8009543418884277, "reward_std": 0.07382646948099136, "rewards/check_originality_func/mean": 0.8392857313156128, "rewards/check_originality_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.9683650732040405, "rewards/check_winston_local_func/std": 0.06377780437469482, "rewards/sentence_count_match_reward_logic/mean": 0.9933035969734192, "rewards/sentence_count_match_reward_logic/std": 0.028400972485542297, "step": 1749 }, { "clip_ratio": 0.008009464479982853, "epoch": 0.6123163051084675, "grad_norm": 0.6334997895669491, "kl": 1.1875, "learning_rate": 8.760876391939308e-06, "loss": 0.0097, "step": 1750 }, { "clip_ratio": 0.021253101527690887, "epoch": 0.612666200139958, "grad_norm": 0.5094752297446196, "kl": 1.1875, "learning_rate": 8.758863449626624e-06, "loss": 0.0064, "step": 1751 }, { "clip_ratio": 0.03500990942120552, "epoch": 0.6130160951714486, "grad_norm": 0.5032203071795722, "kl": 1.1953125, "learning_rate": 8.756849105293447e-06, "loss": 0.0044, "step": 1752 }, { "clip_ratio": 0.005101894028484821, "clipped_completions_ratio": 0.0357142857142857, "epoch": 0.6133659902029391, "grad_norm": 0.9835811384070715, "kl": 1.5546875, "learning_rate": 8.754833359691115e-06, "loss": 0.0154, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 173.6428680419922, "mean_terminated_completion_length": 170.59259033203125, "min_completion_length": 72.0, "min_terminated_completion_length": 72.0, "num_tokens": 2862610.0, "reward": 2.9417545795440674, "reward_std": 0.11306533962488174, "rewards/check_originality_func/mean": 0.9642857313156128, "rewards/check_originality_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9774689078330994, "rewards/check_winston_local_func/std": 0.04060358181595802, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1753 }, { "clip_ratio": 0.015637299045920372, "epoch": 0.6137158852344297, "grad_norm": 0.6752716685790238, "kl": 1.5546875, "learning_rate": 8.752816213571474e-06, "loss": 0.0117, "step": 1754 }, { "clip_ratio": 0.029562121257185936, "epoch": 0.6140657802659202, "grad_norm": 0.5016572822797974, "kl": 1.5546875, "learning_rate": 8.750797667686902e-06, "loss": 0.009, "step": 1755 }, { "clip_ratio": 0.04391546919941902, "epoch": 0.6144156752974108, "grad_norm": 0.4382002071673074, "kl": 1.5625, "learning_rate": 8.7487777227903e-06, "loss": 0.0072, "step": 1756 }, { "clip_ratio": 0.005176561418920755, "clipped_completions_ratio": 0.0714285714285714, "epoch": 0.6147655703289013, "grad_norm": 0.8642671928060796, "kl": 1.4140625, "learning_rate": 8.746756379635088e-06, "loss": 0.0084, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 176.4107208251953, "mean_terminated_completion_length": 170.28846740722656, "min_completion_length": 88.0, "min_terminated_completion_length": 88.0, "num_tokens": 2883689.0, "reward": 2.67232346534729, "reward_std": 0.2138243317604065, "rewards/check_originality_func/mean": 0.6964285969734192, "rewards/check_originality_func/std": 0.4639609456062317, "rewards/check_winston_local_func/mean": 0.9900315403938293, "rewards/check_winston_local_func/std": 0.005371515639126301, "rewards/sentence_count_match_reward_logic/mean": 0.9858631491661072, "rewards/sentence_count_match_reward_logic/std": 0.06419508159160614, "step": 1757 }, { "clip_ratio": 0.011888760142028332, "epoch": 0.6151154653603919, "grad_norm": 0.6736973926547318, "kl": 1.4140625, "learning_rate": 8.744733638975206e-06, "loss": 0.0058, "step": 1758 }, { "clip_ratio": 0.02517446130514145, "epoch": 0.6154653603918825, "grad_norm": 0.526464471085974, "kl": 1.421875, "learning_rate": 8.742709501565118e-06, "loss": 0.0028, "step": 1759 }, { "clip_ratio": 0.04132445529103279, "epoch": 0.615815255423373, "grad_norm": 0.4911910004392961, "kl": 1.4296875, "learning_rate": 8.740683968159808e-06, "loss": 0.0004, "step": 1760 }, { "clip_ratio": 0.004815211519598961, "clipped_completions_ratio": 0.0357142857142857, "epoch": 0.6161651504548635, "grad_norm": 0.7977946855079785, "kl": 1.3203125, "learning_rate": 8.738657039514782e-06, "loss": 0.0161, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 178.71429443359375, "mean_terminated_completion_length": 175.8518524169922, "min_completion_length": 114.0, "min_terminated_completion_length": 114.0, "num_tokens": 2904913.0, "reward": 2.773306369781494, "reward_std": 0.2150561511516571, "rewards/check_originality_func/mean": 0.7857142686843872, "rewards/check_originality_func/std": 0.4140393137931824, "rewards/check_winston_local_func/mean": 0.9875917434692383, "rewards/check_winston_local_func/std": 0.011092539876699448, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1761 }, { "clip_ratio": 0.010541087947785854, "epoch": 0.616515045486354, "grad_norm": 0.7297763156892245, "kl": 1.3203125, "learning_rate": 8.736628716386064e-06, "loss": 0.0135, "step": 1762 }, { "clip_ratio": 0.022914845496416092, "epoch": 0.6168649405178447, "grad_norm": 0.5048220159562997, "kl": 1.3203125, "learning_rate": 8.7345989995302e-06, "loss": 0.0105, "step": 1763 }, { "clip_ratio": 0.036881472915410995, "epoch": 0.6172148355493352, "grad_norm": 0.3851114305079715, "kl": 1.328125, "learning_rate": 8.732567889704253e-06, "loss": 0.0092, "step": 1764 }, { "clip_ratio": 0.005940109025686979, "clipped_completions_ratio": 0.25, "epoch": 0.6175647305808257, "grad_norm": 0.8520461192295768, "kl": 1.375, "learning_rate": 8.730535387665812e-06, "loss": 0.0178, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 190.46429443359375, "mean_terminated_completion_length": 168.61904907226562, "min_completion_length": 72.0, "min_terminated_completion_length": 72.0, "num_tokens": 2927667.0, "reward": 2.916045665740967, "reward_std": 0.1676560789346695, "rewards/check_originality_func/mean": 0.9285714030265808, "rewards/check_originality_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9900252223014832, "rewards/check_winston_local_func/std": 0.004190464504063129, "rewards/sentence_count_match_reward_logic/mean": 0.9974489808082581, "rewards/sentence_count_match_reward_logic/std": 0.019090088084340096, "step": 1765 }, { "clip_ratio": 0.011693121865391731, "epoch": 0.6179146256123164, "grad_norm": 0.6797206800837965, "kl": 1.375, "learning_rate": 8.728501494172976e-06, "loss": 0.0138, "step": 1766 }, { "clip_ratio": 0.026654334738850594, "epoch": 0.6182645206438069, "grad_norm": 0.5649889863050477, "kl": 1.3828125, "learning_rate": 8.726466209984372e-06, "loss": 0.0117, "step": 1767 }, { "clip_ratio": 0.03682331740856171, "epoch": 0.6186144156752974, "grad_norm": 0.4108342082000628, "kl": 1.375, "learning_rate": 8.72442953585914e-06, "loss": 0.0101, "step": 1768 }, { "clip_ratio": 0.006207895930856466, "clipped_completions_ratio": 0.0, "epoch": 0.6189643107067879, "grad_norm": 0.944902559826109, "kl": 1.21875, "learning_rate": 8.72239147255694e-06, "loss": 0.0112, "max_completion_length": 234.0, "max_terminated_completion_length": 234.0, "mean_completion_length": 210.85714721679688, "mean_terminated_completion_length": 210.85714721679688, "min_completion_length": 180.0, "min_terminated_completion_length": 180.0, "num_tokens": 2952299.0, "reward": 2.785768985748291, "reward_std": 0.130145862698555, "rewards/check_originality_func/mean": 0.8035714030265808, "rewards/check_originality_func/std": 0.40089184045791626, "rewards/check_winston_local_func/mean": 0.9881496429443359, "rewards/check_winston_local_func/std": 0.006414490751922131, "rewards/sentence_count_match_reward_logic/mean": 0.9940475821495056, "rewards/sentence_count_match_reward_logic/std": 0.031209392473101616, "step": 1769 }, { "clip_ratio": 0.009423806332051754, "epoch": 0.6193142057382786, "grad_norm": 15.03038626752625, "kl": 2.375, "learning_rate": 8.720352020837951e-06, "loss": 0.0203, "step": 1770 }, { "clip_ratio": 0.0151955746114254, "epoch": 0.6196641007697691, "grad_norm": 0.6134429967649532, "kl": 1.21875, "learning_rate": 8.71831118146287e-06, "loss": 0.0073, "step": 1771 }, { "clip_ratio": 0.02379613183438778, "epoch": 0.6200139958012596, "grad_norm": 0.4788284402850803, "kl": 1.21875, "learning_rate": 8.716268955192908e-06, "loss": 0.0074, "step": 1772 }, { "clip_ratio": 0.006239636801183224, "clipped_completions_ratio": 0.125, "epoch": 0.6203638908327501, "grad_norm": 0.7916048168751694, "kl": 1.3046875, "learning_rate": 8.714225342789799e-06, "loss": 0.0139, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 183.5357208251953, "mean_terminated_completion_length": 173.1836700439453, "min_completion_length": 114.0, "min_terminated_completion_length": 114.0, "num_tokens": 2973713.0, "reward": 2.987633228302002, "reward_std": 0.002636364195495844, "rewards/check_originality_func/mean": 1.0, "rewards/check_originality_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9876329302787781, "rewards/check_winston_local_func/std": 0.008234072476625443, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1773 }, { "clip_ratio": 0.009267712943255901, "epoch": 0.6207137858642408, "grad_norm": 0.7152447232093754, "kl": 1.3046875, "learning_rate": 8.712180345015789e-06, "loss": 0.0113, "step": 1774 }, { "clip_ratio": 0.02234381064772606, "epoch": 0.6210636808957313, "grad_norm": 0.4887616526008087, "kl": 1.3046875, "learning_rate": 8.710133962633644e-06, "loss": 0.0081, "step": 1775 }, { "clip_ratio": 0.03893901780247688, "epoch": 0.6214135759272218, "grad_norm": 0.3907748091406294, "kl": 1.3125, "learning_rate": 8.708086196406646e-06, "loss": 0.0068, "step": 1776 }, { "clip_ratio": 0.005183732137084007, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.6217634709587124, "grad_norm": 0.7214288220677049, "kl": 1.203125, "learning_rate": 8.70603704709859e-06, "loss": 0.0113, "max_completion_length": 256.0, "max_terminated_completion_length": 246.0, "mean_completion_length": 192.57144165039062, "mean_terminated_completion_length": 182.0, "min_completion_length": 112.0, "min_terminated_completion_length": 112.0, "num_tokens": 2996801.0, "reward": 2.750422239303589, "reward_std": 0.20233184099197388, "rewards/check_originality_func/mean": 0.7857142686843872, "rewards/check_originality_func/std": 0.4140393137931824, "rewards/check_winston_local_func/mean": 0.9774629473686218, "rewards/check_winston_local_func/std": 0.04489027336239815, "rewards/sentence_count_match_reward_logic/mean": 0.9872449040412903, "rewards/sentence_count_match_reward_logic/std": 0.04110519215464592, "step": 1777 }, { "clip_ratio": 0.008419109508395195, "epoch": 0.622113365990203, "grad_norm": 0.6389030771508039, "kl": 1.2109375, "learning_rate": 8.70398651547379e-06, "loss": 0.0092, "step": 1778 }, { "clip_ratio": 0.018708301708102226, "epoch": 0.6224632610216935, "grad_norm": 0.4815107299319901, "kl": 1.21875, "learning_rate": 8.701934602297077e-06, "loss": 0.0067, "step": 1779 }, { "clip_ratio": 0.03201255202293396, "epoch": 0.622813156053184, "grad_norm": 0.3917073277448687, "kl": 1.21875, "learning_rate": 8.699881308333794e-06, "loss": 0.0051, "step": 1780 }, { "clip_ratio": 0.006495320238173008, "clipped_completions_ratio": 0.0, "epoch": 0.6231630510846746, "grad_norm": 0.886446918452202, "kl": 1.4375, "learning_rate": 8.697826634349799e-06, "loss": 0.0138, "max_completion_length": 250.0, "max_terminated_completion_length": 250.0, "mean_completion_length": 191.73214721679688, "mean_terminated_completion_length": 191.73214721679688, "min_completion_length": 100.0, "min_terminated_completion_length": 100.0, "num_tokens": 3019418.0, "reward": 2.9474587440490723, "reward_std": 0.07845812290906906, "rewards/check_originality_func/mean": 0.9642857313156128, "rewards/check_originality_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9831729531288147, "rewards/check_winston_local_func/std": 0.026627248153090477, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1781 }, { "clip_ratio": 0.012645876966416836, "epoch": 0.6235129461161651, "grad_norm": 0.6412085848043543, "kl": 1.4375, "learning_rate": 8.695770581111467e-06, "loss": 0.0115, "step": 1782 }, { "clip_ratio": 0.02662925235927105, "epoch": 0.6238628411476557, "grad_norm": 0.5134441536184822, "kl": 1.453125, "learning_rate": 8.693713149385687e-06, "loss": 0.0091, "step": 1783 }, { "clip_ratio": 0.04059545323252678, "epoch": 0.6242127361791463, "grad_norm": 0.46563276178496643, "kl": 1.4609375, "learning_rate": 8.69165433993986e-06, "loss": 0.0072, "step": 1784 }, { "clip_ratio": 0.0062938653863966465, "clipped_completions_ratio": 0.0535714285714286, "epoch": 0.6245626312106368, "grad_norm": 0.920630924040457, "kl": 1.65625, "learning_rate": 8.689594153541904e-06, "loss": 0.0159, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 174.6607208251953, "mean_terminated_completion_length": 170.05661010742188, "min_completion_length": 76.0, "min_terminated_completion_length": 76.0, "num_tokens": 3040303.0, "reward": 2.9279818534851074, "reward_std": 0.09379876405000687, "rewards/check_originality_func/mean": 0.9464285969734192, "rewards/check_originality_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9815531969070435, "rewards/check_winston_local_func/std": 0.05266078561544418, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1785 }, { "clip_ratio": 0.012666655704379082, "epoch": 0.6249125262421273, "grad_norm": 0.725898643929234, "kl": 1.65625, "learning_rate": 8.687532590960247e-06, "loss": 0.0124, "step": 1786 }, { "clip_ratio": 0.030915742740035057, "epoch": 0.6252624212736179, "grad_norm": 0.541984925485377, "kl": 1.65625, "learning_rate": 8.685469652963837e-06, "loss": 0.0092, "step": 1787 }, { "clip_ratio": 0.04197109118103981, "epoch": 0.6256123163051085, "grad_norm": 0.480903756467315, "kl": 1.6640625, "learning_rate": 8.683405340322123e-06, "loss": 0.0075, "step": 1788 }, { "clip_ratio": 0.00592381227761507, "clipped_completions_ratio": 0.017857142857142905, "epoch": 0.625962211336599, "grad_norm": 1.1254465049407572, "kl": 1.6328125, "learning_rate": 8.68133965380508e-06, "loss": 0.0158, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 166.73214721679688, "mean_terminated_completion_length": 165.1090850830078, "min_completion_length": 60.0, "min_terminated_completion_length": 60.0, "num_tokens": 3060152.0, "reward": 2.964782953262329, "reward_std": 0.06800641119480133, "rewards/check_originality_func/mean": 0.9821428656578064, "rewards/check_originality_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9915681481361389, "rewards/check_winston_local_func/std": 0.0020670006051659584, "rewards/sentence_count_match_reward_logic/mean": 0.9910714030265808, "rewards/sentence_count_match_reward_logic/std": 0.04681408405303955, "step": 1789 }, { "clip_ratio": 0.021175066009163857, "epoch": 0.6263121063680895, "grad_norm": 0.7058463515946845, "kl": 1.6484375, "learning_rate": 8.679272594183187e-06, "loss": 0.0127, "step": 1790 }, { "clip_ratio": 0.03321269154548645, "epoch": 0.6266620013995802, "grad_norm": 0.5589273751789835, "kl": 1.65625, "learning_rate": 8.67720416222744e-06, "loss": 0.0109, "step": 1791 }, { "clip_ratio": 0.04099726676940918, "epoch": 0.6270118964310707, "grad_norm": 0.4430491579297104, "kl": 1.671875, "learning_rate": 8.675134358709341e-06, "loss": 0.0091, "step": 1792 }, { "clip_ratio": 0.006668230518698692, "clipped_completions_ratio": 0.0, "epoch": 0.6273617914625612, "grad_norm": 1.0018105715402739, "kl": 1.8515625, "learning_rate": 8.67306318440091e-06, "loss": 0.0204, "max_completion_length": 187.0, "max_terminated_completion_length": 187.0, "mean_completion_length": 139.19644165039062, "mean_terminated_completion_length": 139.19644165039062, "min_completion_length": 90.0, "min_terminated_completion_length": 90.0, "num_tokens": 3076659.0, "reward": 2.9542243480682373, "reward_std": 0.11143960803747177, "rewards/check_originality_func/mean": 0.9642857313156128, "rewards/check_originality_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9899381995201111, "rewards/check_winston_local_func/std": 0.02446400187909603, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1793 }, { "clip_ratio": 0.015960464254021645, "epoch": 0.6277116864940517, "grad_norm": 0.8171599966601842, "kl": 1.8671875, "learning_rate": 8.670990640074675e-06, "loss": 0.0168, "step": 1794 }, { "clip_ratio": 0.03447318077087402, "epoch": 0.6280615815255424, "grad_norm": 0.6500186974833716, "kl": 1.8828125, "learning_rate": 8.668916726503674e-06, "loss": 0.014, "step": 1795 }, { "clip_ratio": 0.045809146016836166, "epoch": 0.6284114765570329, "grad_norm": 0.4980826324762999, "kl": 1.859375, "learning_rate": 8.666841444461456e-06, "loss": 0.0131, "step": 1796 }, { "clip_ratio": 0.00675792433321476, "clipped_completions_ratio": 0.25, "epoch": 0.6287613715885234, "grad_norm": 1.0216320144152693, "kl": 1.421875, "learning_rate": 8.664764794722087e-06, "loss": 0.0056, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 184.94644165039062, "mean_terminated_completion_length": 161.26190185546875, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 3098864.0, "reward": 2.6120917797088623, "reward_std": 0.16058416664600372, "rewards/check_originality_func/mean": 0.6607142686843872, "rewards/check_originality_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.9920517802238464, "rewards/check_winston_local_func/std": 0.0023040841333568096, "rewards/sentence_count_match_reward_logic/mean": 0.95932537317276, "rewards/sentence_count_match_reward_logic/std": 0.06399624794721603, "step": 1797 }, { "clip_ratio": 0.016758929938077927, "epoch": 0.629111266620014, "grad_norm": 2.559686458305609, "kl": 1.5390625, "learning_rate": 8.662686778060131e-06, "loss": 0.0031, "step": 1798 }, { "clip_ratio": 0.025347979739308357, "epoch": 0.6294611616515046, "grad_norm": 0.6736094790007808, "kl": 1.421875, "learning_rate": 8.660607395250673e-06, "loss": 0.0011, "step": 1799 }, { "clip_ratio": 0.035434186458587646, "epoch": 0.6298110566829951, "grad_norm": 0.44576277095322414, "kl": 1.4296875, "learning_rate": 8.658526647069303e-06, "loss": 0.0004, "step": 1800 }, { "clip_ratio": 0.007083911448717117, "clipped_completions_ratio": 0.0, "epoch": 0.6301609517144856, "grad_norm": 0.8779131033101765, "kl": 1.453125, "learning_rate": 8.656444534292116e-06, "loss": 0.0115, "max_completion_length": 217.0, "max_terminated_completion_length": 217.0, "mean_completion_length": 165.875, "mean_terminated_completion_length": 165.875, "min_completion_length": 101.0, "min_terminated_completion_length": 101.0, "num_tokens": 3118777.0, "reward": 2.8464930057525635, "reward_std": 0.14025230705738068, "rewards/check_originality_func/mean": 0.8571428656578064, "rewards/check_originality_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9919009804725647, "rewards/check_winston_local_func/std": 0.0018344707787036896, "rewards/sentence_count_match_reward_logic/mean": 0.9974489808082581, "rewards/sentence_count_match_reward_logic/std": 0.019090088084340096, "step": 1801 }, { "clip_ratio": 0.014813998714089394, "epoch": 0.6305108467459762, "grad_norm": 0.7400763690863106, "kl": 1.453125, "learning_rate": 8.654361057695727e-06, "loss": 0.0078, "step": 1802 }, { "clip_ratio": 0.029314842075109482, "epoch": 0.6308607417774668, "grad_norm": 0.591207772048256, "kl": 1.4609375, "learning_rate": 8.652276218057248e-06, "loss": 0.0044, "step": 1803 }, { "clip_ratio": 0.0463126115500927, "epoch": 0.6312106368089573, "grad_norm": 0.46390506864155756, "kl": 1.46875, "learning_rate": 8.650190016154307e-06, "loss": 0.0022, "step": 1804 }, { "clip_ratio": 0.005389831028878689, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.6315605318404479, "grad_norm": 1.0385009763842945, "kl": 1.578125, "learning_rate": 8.648102452765036e-06, "loss": 0.01, "max_completion_length": 256.0, "max_terminated_completion_length": 210.0, "mean_completion_length": 174.58929443359375, "mean_terminated_completion_length": 142.02500915527344, "min_completion_length": 54.0, "min_terminated_completion_length": 54.0, "num_tokens": 3140426.0, "reward": 2.9176135063171387, "reward_std": 0.05315222218632698, "rewards/check_originality_func/mean": 0.9821428656578064, "rewards/check_originality_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9890418648719788, "rewards/check_winston_local_func/std": 0.005878149066120386, "rewards/sentence_count_match_reward_logic/mean": 0.9464285969734192, "rewards/sentence_count_match_reward_logic/std": 0.1324102282524109, "step": 1805 }, { "clip_ratio": 0.013556082732975483, "epoch": 0.6319104268719384, "grad_norm": 0.6682921865091291, "kl": 1.5859375, "learning_rate": 8.646013528668078e-06, "loss": 0.0067, "step": 1806 }, { "clip_ratio": 0.02743544802069664, "epoch": 0.632260321903429, "grad_norm": 0.5531104038261795, "kl": 1.5859375, "learning_rate": 8.64392324464258e-06, "loss": 0.0048, "step": 1807 }, { "clip_ratio": 0.03851019963622093, "epoch": 0.6326102169349195, "grad_norm": 0.4191823457437005, "kl": 1.59375, "learning_rate": 8.641831601468198e-06, "loss": 0.0032, "step": 1808 }, { "clip_ratio": 0.004837389569729567, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.6329601119664101, "grad_norm": 0.9360274970777286, "kl": 1.515625, "learning_rate": 8.639738599925098e-06, "loss": 0.0056, "max_completion_length": 256.0, "max_terminated_completion_length": 232.0, "mean_completion_length": 177.05357360839844, "mean_terminated_completion_length": 163.89584350585938, "min_completion_length": 86.0, "min_terminated_completion_length": 86.0, "num_tokens": 3161981.0, "reward": 2.708766222000122, "reward_std": 0.12881875038146973, "rewards/check_originality_func/mean": 0.7678571343421936, "rewards/check_originality_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.9919294714927673, "rewards/check_winston_local_func/std": 0.0021812887862324715, "rewards/sentence_count_match_reward_logic/mean": 0.9489795565605164, "rewards/sentence_count_match_reward_logic/std": 0.09509672969579697, "step": 1809 }, { "clip_ratio": 0.012504416517913342, "epoch": 0.6333100069979006, "grad_norm": 0.7369486190601704, "kl": 1.515625, "learning_rate": 8.637644240793947e-06, "loss": 0.0024, "step": 1810 }, { "clip_ratio": 0.028787337243556976, "epoch": 0.6336599020293912, "grad_norm": 0.5516164598953104, "kl": 1.515625, "learning_rate": 8.635548524855924e-06, "loss": -0.0006, "step": 1811 }, { "clip_ratio": 0.04028450697660446, "epoch": 0.6340097970608818, "grad_norm": 0.46311699867421974, "kl": 1.515625, "learning_rate": 8.633451452892707e-06, "loss": -0.0025, "step": 1812 }, { "clip_ratio": 0.00536248879507184, "clipped_completions_ratio": 0.0, "epoch": 0.6343596920923723, "grad_norm": 0.9994872510295952, "kl": 1.703125, "learning_rate": 8.631353025686487e-06, "loss": 0.0135, "max_completion_length": 226.0, "max_terminated_completion_length": 226.0, "mean_completion_length": 143.82144165039062, "mean_terminated_completion_length": 143.82144165039062, "min_completion_length": 70.0, "min_terminated_completion_length": 70.0, "num_tokens": 3179395.0, "reward": 2.881106376647949, "reward_std": 0.07411257177591324, "rewards/check_originality_func/mean": 0.8928571343421936, "rewards/check_originality_func/std": 0.3120938837528229, "rewards/check_winston_local_func/mean": 0.9908002018928528, "rewards/check_winston_local_func/std": 0.0035874145105481148, "rewards/sentence_count_match_reward_logic/mean": 0.9974489808082581, "rewards/sentence_count_match_reward_logic/std": 0.019090088084340096, "step": 1813 }, { "clip_ratio": 0.015013091266155243, "epoch": 0.6347095871238628, "grad_norm": 0.7024271095477915, "kl": 1.71875, "learning_rate": 8.629253244019957e-06, "loss": 0.0104, "step": 1814 }, { "clip_ratio": 0.03314068540930748, "epoch": 0.6350594821553533, "grad_norm": 0.6318578246120173, "kl": 1.7265625, "learning_rate": 8.627152108676316e-06, "loss": 0.0085, "step": 1815 }, { "clip_ratio": 0.043579403311014175, "epoch": 0.635409377186844, "grad_norm": 0.47059148278845686, "kl": 1.71875, "learning_rate": 8.625049620439266e-06, "loss": 0.0065, "step": 1816 }, { "clip_ratio": 0.006668065208941698, "clipped_completions_ratio": 0.1607142857142857, "epoch": 0.6357592722183345, "grad_norm": 0.9731198799807581, "kl": 1.5390625, "learning_rate": 8.622945780093017e-06, "loss": 0.0137, "max_completion_length": 256.0, "max_terminated_completion_length": 243.0, "mean_completion_length": 162.57144165039062, "mean_terminated_completion_length": 144.68084716796875, "min_completion_length": 95.0, "min_terminated_completion_length": 95.0, "num_tokens": 3199155.0, "reward": 2.5992844104766846, "reward_std": 0.21732692420482635, "rewards/check_originality_func/mean": 0.6785714030265808, "rewards/check_originality_func/std": 0.4712514281272888, "rewards/check_winston_local_func/mean": 0.9296414256095886, "rewards/check_winston_local_func/std": 0.12690898776054382, "rewards/sentence_count_match_reward_logic/mean": 0.9910714030265808, "rewards/sentence_count_match_reward_logic/std": 0.04681408405303955, "step": 1817 }, { "clip_ratio": 0.01544728223234415, "epoch": 0.636109167249825, "grad_norm": 0.7260286792648273, "kl": 1.5390625, "learning_rate": 8.62084058842228e-06, "loss": 0.0104, "step": 1818 }, { "clip_ratio": 0.029812579974532127, "epoch": 0.6364590622813157, "grad_norm": 0.501364951846367, "kl": 1.5546875, "learning_rate": 8.618734046212275e-06, "loss": 0.0075, "step": 1819 }, { "clip_ratio": 0.045158278197050095, "epoch": 0.6368089573128062, "grad_norm": 0.5455382223875771, "kl": 1.5625, "learning_rate": 8.616626154248717e-06, "loss": 0.0061, "step": 1820 }, { "clip_ratio": 0.0054420651867985725, "clipped_completions_ratio": 0.0, "epoch": 0.6371588523442967, "grad_norm": 0.9285994633242897, "kl": 1.6328125, "learning_rate": 8.614516913317835e-06, "loss": 0.0134, "max_completion_length": 251.0, "max_terminated_completion_length": 251.0, "mean_completion_length": 177.5178680419922, "mean_terminated_completion_length": 177.5178680419922, "min_completion_length": 68.0, "min_terminated_completion_length": 68.0, "num_tokens": 3219984.0, "reward": 2.925849437713623, "reward_std": 0.04166414588689804, "rewards/check_originality_func/mean": 1.0, "rewards/check_originality_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9391146898269653, "rewards/check_winston_local_func/std": 0.1351444274187088, "rewards/sentence_count_match_reward_logic/mean": 0.9867346882820129, "rewards/sentence_count_match_reward_logic/std": 0.04872070997953415, "step": 1821 }, { "clip_ratio": 0.015225979499518871, "epoch": 0.6375087473757872, "grad_norm": 0.6942199501047043, "kl": 1.6328125, "learning_rate": 8.61240632420635e-06, "loss": 0.009, "step": 1822 }, { "clip_ratio": 0.03187843784689903, "epoch": 0.6378586424072779, "grad_norm": 0.550433051365153, "kl": 1.6328125, "learning_rate": 8.6102943877015e-06, "loss": 0.006, "step": 1823 }, { "clip_ratio": 0.04644354060292244, "epoch": 0.6382085374387684, "grad_norm": 0.4947304691883009, "kl": 1.640625, "learning_rate": 8.608181104591008e-06, "loss": 0.0039, "step": 1824 }, { "clip_ratio": 0.004912090487778187, "clipped_completions_ratio": 0.3928571428571429, "epoch": 0.6385584324702589, "grad_norm": 0.9135327386029524, "kl": 1.4921875, "learning_rate": 8.606066475663116e-06, "loss": 0.0191, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 201.58929443359375, "mean_terminated_completion_length": 166.38235473632812, "min_completion_length": 91.0, "min_terminated_completion_length": 91.0, "num_tokens": 3244033.0, "reward": 2.7901856899261475, "reward_std": 0.14439818263053894, "rewards/check_originality_func/mean": 0.8035714030265808, "rewards/check_originality_func/std": 0.40089184045791626, "rewards/check_winston_local_func/mean": 0.9917162656784058, "rewards/check_winston_local_func/std": 0.0027040799614042044, "rewards/sentence_count_match_reward_logic/mean": 0.9948979616165161, "rewards/sentence_count_match_reward_logic/std": 0.026750901713967323, "step": 1825 }, { "clip_ratio": 0.013221783563494682, "epoch": 0.6389083275017495, "grad_norm": 0.7213129694584851, "kl": 1.4921875, "learning_rate": 8.603950501706555e-06, "loss": 0.0158, "step": 1826 }, { "clip_ratio": 0.027401205152273178, "epoch": 0.63925822253324, "grad_norm": 0.5227619926289462, "kl": 1.4921875, "learning_rate": 8.601833183510565e-06, "loss": 0.013, "step": 1827 }, { "clip_ratio": 0.041053157299757004, "epoch": 0.6396081175647306, "grad_norm": 0.4640594725943834, "kl": 1.5, "learning_rate": 8.59971452186489e-06, "loss": 0.0116, "step": 1828 }, { "clip_ratio": 0.007999955676496029, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.6399580125962211, "grad_norm": 1.0748148716780104, "kl": 1.5859375, "learning_rate": 8.597594517559766e-06, "loss": 0.0117, "max_completion_length": 256.0, "max_terminated_completion_length": 246.0, "mean_completion_length": 178.07144165039062, "mean_terminated_completion_length": 165.08334350585938, "min_completion_length": 100.0, "min_terminated_completion_length": 100.0, "num_tokens": 3265389.0, "reward": 2.806013822555542, "reward_std": 0.15606817603111267, "rewards/check_originality_func/mean": 0.8214285969734192, "rewards/check_originality_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.9911327362060547, "rewards/check_winston_local_func/std": 0.004361685831099749, "rewards/sentence_count_match_reward_logic/mean": 0.9934523701667786, "rewards/sentence_count_match_reward_logic/std": 0.028723444789648056, "step": 1829 }, { "clip_ratio": 0.017308183014392853, "epoch": 0.6403079076277117, "grad_norm": 0.8167563191006327, "kl": 1.5859375, "learning_rate": 8.595473171385936e-06, "loss": 0.0072, "step": 1830 }, { "clip_ratio": 0.0372648611664772, "epoch": 0.6406578026592022, "grad_norm": 0.6353068639364916, "kl": 1.5859375, "learning_rate": 8.593350484134642e-06, "loss": 0.0038, "step": 1831 }, { "clip_ratio": 0.05432455986738205, "epoch": 0.6410076976906928, "grad_norm": 0.48319977504387224, "kl": 1.5859375, "learning_rate": 8.591226456597626e-06, "loss": 0.0019, "step": 1832 }, { "clip_ratio": 0.005934897344559431, "clipped_completions_ratio": 0.25, "epoch": 0.6413575927221833, "grad_norm": 0.907138268115618, "kl": 1.390625, "learning_rate": 8.58910108956713e-06, "loss": 0.0119, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 182.32144165039062, "mean_terminated_completion_length": 157.76190185546875, "min_completion_length": 130.0, "min_terminated_completion_length": 130.0, "num_tokens": 3287055.0, "reward": 2.8515806198120117, "reward_std": 0.11841634660959244, "rewards/check_originality_func/mean": 0.875, "rewards/check_originality_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.9908661246299744, "rewards/check_winston_local_func/std": 0.0037629022262990475, "rewards/sentence_count_match_reward_logic/mean": 0.985714316368103, "rewards/sentence_count_match_reward_logic/std": 0.035309404134750366, "step": 1833 }, { "clip_ratio": 0.01284309197217226, "epoch": 0.6417074877536739, "grad_norm": 0.7298278287200656, "kl": 1.390625, "learning_rate": 8.586974383835897e-06, "loss": 0.0086, "step": 1834 }, { "clip_ratio": 0.02790398895740509, "epoch": 0.6420573827851644, "grad_norm": 0.6227042800598375, "kl": 1.40625, "learning_rate": 8.584846340197167e-06, "loss": 0.0057, "step": 1835 }, { "clip_ratio": 0.04181177541613579, "epoch": 0.642407277816655, "grad_norm": 0.43295363742414655, "kl": 1.40625, "learning_rate": 8.582716959444679e-06, "loss": 0.0034, "step": 1836 }, { "clip_ratio": 0.007715255953371525, "clipped_completions_ratio": 0.0, "epoch": 0.6427571728481456, "grad_norm": 1.3729440119022072, "kl": 2.0, "learning_rate": 8.580586242372675e-06, "loss": 0.0329, "max_completion_length": 197.0, "max_terminated_completion_length": 197.0, "mean_completion_length": 140.71429443359375, "mean_terminated_completion_length": 140.71429443359375, "min_completion_length": 67.0, "min_terminated_completion_length": 67.0, "num_tokens": 3304167.0, "reward": 2.934816837310791, "reward_std": 0.10121586173772812, "rewards/check_originality_func/mean": 0.9642857313156128, "rewards/check_originality_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9775462746620178, "rewards/check_winston_local_func/std": 0.05044688656926155, "rewards/sentence_count_match_reward_logic/mean": 0.9929847121238708, "rewards/sentence_count_match_reward_logic/std": 0.03817475214600563, "step": 1837 }, { "clip_ratio": 0.022550201043486595, "epoch": 0.6431070678796361, "grad_norm": 0.9396673099449434, "kl": 2.015625, "learning_rate": 8.578454189775889e-06, "loss": 0.027, "step": 1838 }, { "clip_ratio": 0.04328536242246628, "epoch": 0.6434569629111266, "grad_norm": 0.8165462719486712, "kl": 2.0, "learning_rate": 8.576320802449559e-06, "loss": 0.0236, "step": 1839 }, { "clip_ratio": 0.05739865452051163, "epoch": 0.6438068579426172, "grad_norm": 0.8894521672923492, "kl": 2.0, "learning_rate": 8.574186081189416e-06, "loss": 0.0225, "step": 1840 }, { "clip_ratio": 0.005556108430027962, "clipped_completions_ratio": 0.0, "epoch": 0.6441567529741078, "grad_norm": 0.8564807684283474, "kl": 1.5703125, "learning_rate": 8.572050026791693e-06, "loss": 0.0124, "max_completion_length": 225.0, "max_terminated_completion_length": 225.0, "mean_completion_length": 155.0357208251953, "mean_terminated_completion_length": 155.0357208251953, "min_completion_length": 109.0, "min_terminated_completion_length": 109.0, "num_tokens": 3322449.0, "reward": 2.949272394180298, "reward_std": 0.06479713320732117, "rewards/check_originality_func/mean": 0.9642857313156128, "rewards/check_originality_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.984986424446106, "rewards/check_winston_local_func/std": 0.020717358216643333, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1841 }, { "clip_ratio": 0.01255191769450903, "epoch": 0.6445066480055983, "grad_norm": 0.66242803578909, "kl": 1.578125, "learning_rate": 8.569912640053117e-06, "loss": 0.0092, "step": 1842 }, { "clip_ratio": 0.027515646070241928, "epoch": 0.6448565430370888, "grad_norm": 0.5161532663352977, "kl": 1.578125, "learning_rate": 8.567773921770914e-06, "loss": 0.007, "step": 1843 }, { "clip_ratio": 0.041859280318021774, "epoch": 0.6452064380685795, "grad_norm": 0.4114431190577448, "kl": 1.59375, "learning_rate": 8.565633872742803e-06, "loss": 0.0048, "step": 1844 }, { "clip_ratio": 0.007351220119744539, "clipped_completions_ratio": 0.0, "epoch": 0.64555633310007, "grad_norm": 1.0849629423383784, "kl": 1.71875, "learning_rate": 8.563492493767004e-06, "loss": 0.0166, "max_completion_length": 234.0, "max_terminated_completion_length": 234.0, "mean_completion_length": 150.8928680419922, "mean_terminated_completion_length": 150.8928680419922, "min_completion_length": 66.0, "min_terminated_completion_length": 66.0, "num_tokens": 3341083.0, "reward": 2.920941114425659, "reward_std": 0.10106454789638519, "rewards/check_originality_func/mean": 0.9464285969734192, "rewards/check_originality_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9887981414794922, "rewards/check_winston_local_func/std": 0.004164115991443396, "rewards/sentence_count_match_reward_logic/mean": 0.985714316368103, "rewards/sentence_count_match_reward_logic/std": 0.051974017173051834, "step": 1845 }, { "clip_ratio": 0.01747848652303219, "epoch": 0.6459062281315605, "grad_norm": 0.7285824344334942, "kl": 1.71875, "learning_rate": 8.561349785642232e-06, "loss": 0.0126, "step": 1846 }, { "clip_ratio": 0.031570568680763245, "epoch": 0.646256123163051, "grad_norm": 0.5883451160508503, "kl": 1.7265625, "learning_rate": 8.559205749167695e-06, "loss": 0.0101, "step": 1847 }, { "clip_ratio": 0.04601063206791878, "epoch": 0.6466060181945417, "grad_norm": 0.4872267034582624, "kl": 1.734375, "learning_rate": 8.557060385143102e-06, "loss": 0.0074, "step": 1848 }, { "clip_ratio": 0.007876910269260406, "clipped_completions_ratio": 0.0, "epoch": 0.6469559132260322, "grad_norm": 1.000158709347767, "kl": 1.65625, "learning_rate": 8.554913694368647e-06, "loss": 0.019, "max_completion_length": 253.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 172.17857360839844, "mean_terminated_completion_length": 172.17857360839844, "min_completion_length": 83.0, "min_terminated_completion_length": 83.0, "num_tokens": 3361325.0, "reward": 2.9925143718719482, "reward_std": 0.0013943978119641542, "rewards/check_originality_func/mean": 1.0, "rewards/check_originality_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9925143122673035, "rewards/check_winston_local_func/std": 0.0017490051686763763, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1849 }, { "clip_ratio": 0.019159670919179916, "epoch": 0.6473058082575227, "grad_norm": 0.7275065135765206, "kl": 1.65625, "learning_rate": 8.552765677645031e-06, "loss": 0.0159, "step": 1850 }, { "clip_ratio": 0.034321822226047516, "epoch": 0.6476557032890133, "grad_norm": 0.5613138262958249, "kl": 1.65625, "learning_rate": 8.550616335773441e-06, "loss": 0.0129, "step": 1851 }, { "clip_ratio": 0.05040881782770157, "epoch": 0.6480055983205039, "grad_norm": 0.4633779342437211, "kl": 1.6640625, "learning_rate": 8.548465669555564e-06, "loss": 0.0107, "step": 1852 }, { "clip_ratio": 0.0062144421972334385, "clipped_completions_ratio": 0.1607142857142857, "epoch": 0.6483554933519944, "grad_norm": 0.8184271321327669, "kl": 1.25, "learning_rate": 8.546313679793573e-06, "loss": 0.0135, "max_completion_length": 256.0, "max_terminated_completion_length": 244.0, "mean_completion_length": 202.75001525878906, "mean_terminated_completion_length": 192.55319213867188, "min_completion_length": 129.0, "min_terminated_completion_length": 129.0, "num_tokens": 3385607.0, "reward": 2.9535937309265137, "reward_std": 0.1033233106136322, "rewards/check_originality_func/mean": 0.9642857313156128, "rewards/check_originality_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9893079400062561, "rewards/check_winston_local_func/std": 0.008177761919796467, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1853 }, { "clip_ratio": 0.011139219626784325, "epoch": 0.6487053883834849, "grad_norm": 0.6853782753529808, "kl": 1.25, "learning_rate": 8.544160367290145e-06, "loss": 0.0109, "step": 1854 }, { "clip_ratio": 0.024765826761722565, "epoch": 0.6490552834149755, "grad_norm": 0.4728812124437454, "kl": 1.25, "learning_rate": 8.542005732848441e-06, "loss": 0.0081, "step": 1855 }, { "clip_ratio": 0.03817087039351463, "epoch": 0.6494051784464661, "grad_norm": 0.40636588024070674, "kl": 1.2578125, "learning_rate": 8.539849777272125e-06, "loss": 0.006, "step": 1856 }, { "clip_ratio": 0.007144119590520859, "clipped_completions_ratio": 0.2142857142857143, "epoch": 0.6497550734779566, "grad_norm": 0.9704280515166094, "kl": 1.5078125, "learning_rate": 8.537692501365342e-06, "loss": 0.0138, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 175.9107208251953, "mean_terminated_completion_length": 154.0681915283203, "min_completion_length": 94.0, "min_terminated_completion_length": 94.0, "num_tokens": 3406650.0, "reward": 2.820827007293701, "reward_std": 0.20870278775691986, "rewards/check_originality_func/mean": 0.8392857313156128, "rewards/check_originality_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.992255687713623, "rewards/check_winston_local_func/std": 0.0021863398142158985, "rewards/sentence_count_match_reward_logic/mean": 0.9892857670783997, "rewards/sentence_count_match_reward_logic/std": 0.045441556721925735, "step": 1857 }, { "clip_ratio": 0.014345230534672737, "epoch": 0.6501049685094472, "grad_norm": 0.7210866810328007, "kl": 1.5078125, "learning_rate": 8.535533905932739e-06, "loss": 0.0108, "step": 1858 }, { "clip_ratio": 0.03126004710793495, "epoch": 0.6504548635409377, "grad_norm": 0.5483735804710463, "kl": 1.5078125, "learning_rate": 8.53337399177945e-06, "loss": 0.0075, "step": 1859 }, { "clip_ratio": 0.04742981866002083, "epoch": 0.6508047585724283, "grad_norm": 0.464110691162454, "kl": 1.5078125, "learning_rate": 8.531212759711103e-06, "loss": 0.0059, "step": 1860 }, { "clip_ratio": 0.0071542104706168175, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.6511546536039188, "grad_norm": 0.8648722671810908, "kl": 1.421875, "learning_rate": 8.529050210533818e-06, "loss": 0.0086, "max_completion_length": 256.0, "max_terminated_completion_length": 212.0, "mean_completion_length": 187.05357360839844, "mean_terminated_completion_length": 175.5625, "min_completion_length": 145.0, "min_terminated_completion_length": 145.0, "num_tokens": 3428845.0, "reward": 2.8106019496917725, "reward_std": 0.02304992265999317, "rewards/check_originality_func/mean": 0.8571428656578064, "rewards/check_originality_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9909587502479553, "rewards/check_winston_local_func/std": 0.002181410090997815, "rewards/sentence_count_match_reward_logic/mean": 0.9625000357627869, "rewards/sentence_count_match_reward_logic/std": 0.07276987284421921, "step": 1861 }, { "clip_ratio": 0.014508129097521305, "epoch": 0.6515045486354094, "grad_norm": 0.7278495057940695, "kl": 1.421875, "learning_rate": 8.526886345054204e-06, "loss": 0.005, "step": 1862 }, { "clip_ratio": 0.031127650290727615, "epoch": 0.6518544436668999, "grad_norm": 0.6457881358672325, "kl": 1.4296875, "learning_rate": 8.524721164079363e-06, "loss": 0.0018, "step": 1863 }, { "clip_ratio": 0.044989828020334244, "epoch": 0.6522043386983905, "grad_norm": 0.4430989448132571, "kl": 1.4375, "learning_rate": 8.522554668416887e-06, "loss": -0.0007, "step": 1864 }, { "clip_ratio": 0.006552358157932758, "clipped_completions_ratio": 0.0, "epoch": 0.6525542337298811, "grad_norm": 0.9544715223910858, "kl": 1.65625, "learning_rate": 8.520386858874858e-06, "loss": 0.0207, "max_completion_length": 245.0, "max_terminated_completion_length": 245.0, "mean_completion_length": 174.5357208251953, "mean_terminated_completion_length": 174.5357208251953, "min_completion_length": 71.0, "min_terminated_completion_length": 71.0, "num_tokens": 3449659.0, "reward": 2.919947624206543, "reward_std": 0.07747673243284225, "rewards/check_originality_func/mean": 0.9285714030265808, "rewards/check_originality_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9913759231567383, "rewards/check_winston_local_func/std": 0.002290081698447466, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1865 }, { "clip_ratio": 0.017300255596637726, "epoch": 0.6529041287613716, "grad_norm": 0.7736761177063339, "kl": 1.65625, "learning_rate": 8.518217736261848e-06, "loss": 0.0172, "step": 1866 }, { "clip_ratio": 0.03066161274909973, "epoch": 0.6532540237928621, "grad_norm": 0.6079964284093721, "kl": 1.65625, "learning_rate": 8.516047301386923e-06, "loss": 0.0139, "step": 1867 }, { "clip_ratio": 0.048659488558769226, "epoch": 0.6536039188243526, "grad_norm": 0.6251821728626737, "kl": 1.6640625, "learning_rate": 8.51387555505963e-06, "loss": 0.0119, "step": 1868 }, { "clip_ratio": 0.008964973501861095, "clipped_completions_ratio": 0.0, "epoch": 0.6539538138558433, "grad_norm": 1.1337674377489932, "kl": 1.796875, "learning_rate": 8.51170249809001e-06, "loss": 0.0136, "max_completion_length": 211.0, "max_terminated_completion_length": 211.0, "mean_completion_length": 135.48214721679688, "mean_terminated_completion_length": 135.48214721679688, "min_completion_length": 89.0, "min_terminated_completion_length": 89.0, "num_tokens": 3466142.0, "reward": 2.9337174892425537, "reward_std": 0.15497440099716187, "rewards/check_originality_func/mean": 0.9464285969734192, "rewards/check_originality_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9872888326644897, "rewards/check_winston_local_func/std": 0.0071637388318777084, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1869 }, { "clip_ratio": 0.01801568828523159, "epoch": 0.6543037088873338, "grad_norm": 0.8080425661606351, "kl": 1.796875, "learning_rate": 8.509528131288598e-06, "loss": 0.0101, "step": 1870 }, { "clip_ratio": 0.0388273261487484, "epoch": 0.6546536039188243, "grad_norm": 0.7990606252333087, "kl": 1.8125, "learning_rate": 8.507352455466405e-06, "loss": 0.0068, "step": 1871 }, { "clip_ratio": 0.05381487309932709, "epoch": 0.655003498950315, "grad_norm": 0.5608871147163852, "kl": 1.8125, "learning_rate": 8.505175471434943e-06, "loss": 0.0051, "step": 1872 }, { "clip_ratio": 0.008079607039690018, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.6553533939818055, "grad_norm": 0.9842920640601849, "kl": 1.4765625, "learning_rate": 8.502997180006202e-06, "loss": 0.0087, "max_completion_length": 256.0, "max_terminated_completion_length": 211.0, "mean_completion_length": 172.8928680419922, "mean_terminated_completion_length": 159.0416717529297, "min_completion_length": 97.0, "min_terminated_completion_length": 97.0, "num_tokens": 3487952.0, "reward": 2.818967580795288, "reward_std": 0.083297960460186, "rewards/check_originality_func/mean": 0.9107142686843872, "rewards/check_originality_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9558722376823425, "rewards/check_winston_local_func/std": 0.09212151914834976, "rewards/sentence_count_match_reward_logic/mean": 0.9523810148239136, "rewards/sentence_count_match_reward_logic/std": 0.11769796907901764, "step": 1873 }, { "clip_ratio": 0.01531163603067398, "epoch": 0.655703289013296, "grad_norm": 0.7893596366633362, "kl": 1.4765625, "learning_rate": 8.500817581992669e-06, "loss": 0.0052, "step": 1874 }, { "clip_ratio": 0.030834486708045006, "epoch": 0.6560531840447865, "grad_norm": 0.5835644531213311, "kl": 1.5, "learning_rate": 8.498636678207311e-06, "loss": 0.0014, "step": 1875 }, { "clip_ratio": 0.04601472243666649, "epoch": 0.6564030790762772, "grad_norm": 0.48627048218558994, "kl": 1.515625, "learning_rate": 8.496454469463583e-06, "loss": -0.0001, "step": 1876 }, { "clip_ratio": 0.008624005131423473, "clipped_completions_ratio": 0.0, "epoch": 0.6567529741077677, "grad_norm": 1.1054937436402907, "kl": 2.03125, "learning_rate": 8.494270956575428e-06, "loss": 0.0164, "max_completion_length": 224.0, "max_terminated_completion_length": 224.0, "mean_completion_length": 133.94644165039062, "mean_terminated_completion_length": 133.94644165039062, "min_completion_length": 68.0, "min_terminated_completion_length": 68.0, "num_tokens": 3503965.0, "reward": 2.971626043319702, "reward_std": 0.021778948605060577, "rewards/check_originality_func/mean": 1.0, "rewards/check_originality_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9894831776618958, "rewards/check_winston_local_func/std": 0.0037046517245471478, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.06496752798557281, "step": 1877 }, { "clip_ratio": 0.017482224851846695, "epoch": 0.6571028691392582, "grad_norm": 0.866020952275755, "kl": 2.046875, "learning_rate": 8.492086140357279e-06, "loss": 0.0129, "step": 1878 }, { "clip_ratio": 0.029598891735076904, "epoch": 0.6574527641707488, "grad_norm": 0.5709960746663649, "kl": 2.0625, "learning_rate": 8.48990002162405e-06, "loss": 0.0103, "step": 1879 }, { "clip_ratio": 0.043548669666051865, "epoch": 0.6578026592022393, "grad_norm": 0.5503735530925696, "kl": 2.0625, "learning_rate": 8.487712601191143e-06, "loss": 0.0096, "step": 1880 }, { "clip_ratio": 0.005745591130107641, "clipped_completions_ratio": 0.0, "epoch": 0.6581525542337299, "grad_norm": 1.1936302248283521, "kl": 1.46875, "learning_rate": 8.485523879874444e-06, "loss": 0.0155, "max_completion_length": 244.0, "max_terminated_completion_length": 244.0, "mean_completion_length": 180.48214721679688, "mean_terminated_completion_length": 180.48214721679688, "min_completion_length": 129.0, "min_terminated_completion_length": 129.0, "num_tokens": 3525448.0, "reward": 2.9383747577667236, "reward_std": 0.06144602224230766, "rewards/check_originality_func/mean": 0.9821428656578064, "rewards/check_originality_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9812319874763489, "rewards/check_winston_local_func/std": 0.02877272106707096, "rewards/sentence_count_match_reward_logic/mean": 0.9749999642372131, "rewards/sentence_count_match_reward_logic/std": 0.06674237549304962, "step": 1881 }, { "clip_ratio": 0.010719822719693184, "epoch": 0.6585024492652204, "grad_norm": 1.4568755890602894, "kl": 1.546875, "learning_rate": 8.483333858490326e-06, "loss": 0.0127, "step": 1882 }, { "clip_ratio": 0.021827319636940956, "epoch": 0.658852344296711, "grad_norm": 0.8782550049067195, "kl": 1.546875, "learning_rate": 8.481142537855646e-06, "loss": 0.0109, "step": 1883 }, { "clip_ratio": 0.030705779790878296, "epoch": 0.6592022393282015, "grad_norm": 0.544268231389265, "kl": 1.515625, "learning_rate": 8.478949918787746e-06, "loss": 0.0088, "step": 1884 }, { "clip_ratio": 0.006947482470422983, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.6595521343596921, "grad_norm": 0.9936231257799318, "kl": 1.4140625, "learning_rate": 8.476756002104453e-06, "loss": 0.0091, "max_completion_length": 256.0, "max_terminated_completion_length": 206.0, "mean_completion_length": 191.94644165039062, "mean_terminated_completion_length": 181.27084350585938, "min_completion_length": 122.0, "min_terminated_completion_length": 122.0, "num_tokens": 3548325.0, "reward": 2.8729796409606934, "reward_std": 0.08450612425804138, "rewards/check_originality_func/mean": 0.8928571343421936, "rewards/check_originality_func/std": 0.3120938837528229, "rewards/check_winston_local_func/mean": 0.9911767244338989, "rewards/check_winston_local_func/std": 0.0028382495511323214, "rewards/sentence_count_match_reward_logic/mean": 0.9889455437660217, "rewards/sentence_count_match_reward_logic/std": 0.0359317883849144, "step": 1885 }, { "clip_ratio": 0.013424945995211601, "epoch": 0.6599020293911827, "grad_norm": 0.7984202628459683, "kl": 1.4140625, "learning_rate": 8.474560788624078e-06, "loss": 0.0055, "step": 1886 }, { "clip_ratio": 0.031233184039592743, "epoch": 0.6602519244226732, "grad_norm": 0.5493297041184647, "kl": 1.421875, "learning_rate": 8.47236427916541e-06, "loss": 0.0021, "step": 1887 }, { "clip_ratio": 0.04932066425681114, "epoch": 0.6606018194541637, "grad_norm": 0.47570029440811823, "kl": 1.4296875, "learning_rate": 8.470166474547731e-06, "loss": -0.0001, "step": 1888 }, { "clip_ratio": 0.006645479705184698, "clipped_completions_ratio": 0.0357142857142857, "epoch": 0.6609517144856543, "grad_norm": 0.8024026089872329, "kl": 1.359375, "learning_rate": 8.467967375590801e-06, "loss": 0.0184, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 198.42857360839844, "mean_terminated_completion_length": 196.29629516601562, "min_completion_length": 160.0, "min_terminated_completion_length": 160.0, "num_tokens": 3571557.0, "reward": 2.8818700313568115, "reward_std": 0.12688787281513214, "rewards/check_originality_func/mean": 0.9107142686843872, "rewards/check_originality_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9906073212623596, "rewards/check_winston_local_func/std": 0.00409326096996665, "rewards/sentence_count_match_reward_logic/mean": 0.9805484414100647, "rewards/sentence_count_match_reward_logic/std": 0.061697233468294144, "step": 1889 }, { "clip_ratio": 0.010505116544663906, "epoch": 0.6613016095171449, "grad_norm": 0.7046128056305148, "kl": 1.359375, "learning_rate": 8.46576698311486e-06, "loss": 0.0161, "step": 1890 }, { "clip_ratio": 0.026905277743935585, "epoch": 0.6616515045486354, "grad_norm": 0.5654379376793753, "kl": 1.359375, "learning_rate": 8.463565297940636e-06, "loss": 0.0125, "step": 1891 }, { "clip_ratio": 0.04291379079222679, "epoch": 0.6620013995801259, "grad_norm": 0.49454047375260096, "kl": 1.359375, "learning_rate": 8.461362320889338e-06, "loss": 0.01, "step": 1892 }, { "clip_ratio": 0.006633031647652388, "clipped_completions_ratio": 0.0, "epoch": 0.6623512946116166, "grad_norm": 1.0871135094170863, "kl": 1.8046875, "learning_rate": 8.459158052782652e-06, "loss": 0.0104, "max_completion_length": 230.0, "max_terminated_completion_length": 230.0, "mean_completion_length": 139.07144165039062, "mean_terminated_completion_length": 139.07144165039062, "min_completion_length": 98.0, "min_terminated_completion_length": 98.0, "num_tokens": 3588729.0, "reward": 2.9897539615631104, "reward_std": 0.00256260484457016, "rewards/check_originality_func/mean": 1.0, "rewards/check_originality_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9897537231445312, "rewards/check_winston_local_func/std": 0.005418961867690086, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1893 }, { "clip_ratio": 0.01835641823709011, "epoch": 0.6627011896431071, "grad_norm": 0.7791421358847059, "kl": 1.8125, "learning_rate": 8.456952494442753e-06, "loss": 0.0068, "step": 1894 }, { "clip_ratio": 0.03208121657371521, "epoch": 0.6630510846745976, "grad_norm": 0.6962275860987978, "kl": 1.8203125, "learning_rate": 8.454745646692291e-06, "loss": 0.0045, "step": 1895 }, { "clip_ratio": 0.042579714208841324, "epoch": 0.6634009797060881, "grad_norm": 0.6020339264989856, "kl": 1.8125, "learning_rate": 8.452537510354397e-06, "loss": 0.0034, "step": 1896 }, { "clip_ratio": 0.007440302520990372, "clipped_completions_ratio": 0.0, "epoch": 0.6637508747375788, "grad_norm": 1.1134218444793278, "kl": 1.8515625, "learning_rate": 8.45032808625269e-06, "loss": 0.0186, "max_completion_length": 207.0, "max_terminated_completion_length": 207.0, "mean_completion_length": 134.1428680419922, "mean_terminated_completion_length": 134.1428680419922, "min_completion_length": 59.0, "min_terminated_completion_length": 59.0, "num_tokens": 3605289.0, "reward": 2.916752576828003, "reward_std": 0.07699184119701385, "rewards/check_originality_func/mean": 0.9642857313156128, "rewards/check_originality_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9917523264884949, "rewards/check_winston_local_func/std": 0.003079406451433897, "rewards/sentence_count_match_reward_logic/mean": 0.9607142806053162, "rewards/sentence_count_match_reward_logic/std": 0.09081164002418518, "step": 1897 }, { "clip_ratio": 0.018083207309246063, "epoch": 0.6641007697690693, "grad_norm": 0.7594313050507956, "kl": 1.84375, "learning_rate": 8.44811737521126e-06, "loss": 0.0152, "step": 1898 }, { "clip_ratio": 0.03001904860138893, "epoch": 0.6644506648005598, "grad_norm": 0.5372049763204327, "kl": 1.8515625, "learning_rate": 8.445905378054686e-06, "loss": 0.013, "step": 1899 }, { "clip_ratio": 0.04349043220281601, "epoch": 0.6648005598320503, "grad_norm": 0.44629536079442345, "kl": 1.8671875, "learning_rate": 8.443692095608019e-06, "loss": 0.0121, "step": 1900 }, { "clip_ratio": 0.006450509652495384, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.665150454863541, "grad_norm": 1.0343040601614142, "kl": 1.53125, "learning_rate": 8.441477528696789e-06, "loss": 0.0185, "max_completion_length": 256.0, "max_terminated_completion_length": 237.0, "mean_completion_length": 168.80357360839844, "mean_terminated_completion_length": 154.27084350585938, "min_completion_length": 85.0, "min_terminated_completion_length": 85.0, "num_tokens": 3626174.0, "reward": 2.794551372528076, "reward_std": 0.030878979712724686, "rewards/check_originality_func/mean": 0.8571428656578064, "rewards/check_originality_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9688707590103149, "rewards/check_winston_local_func/std": 0.06954582035541534, "rewards/sentence_count_match_reward_logic/mean": 0.9685373902320862, "rewards/sentence_count_match_reward_logic/std": 0.07209031283855438, "step": 1901 }, { "clip_ratio": 0.013151134364306927, "epoch": 0.6655003498950315, "grad_norm": 0.7120776497955951, "kl": 1.5234375, "learning_rate": 8.439261678147014e-06, "loss": 0.0148, "step": 1902 }, { "clip_ratio": 0.028941955417394638, "epoch": 0.665850244926522, "grad_norm": 0.5350168282391929, "kl": 1.5234375, "learning_rate": 8.437044544785184e-06, "loss": 0.012, "step": 1903 }, { "clip_ratio": 0.04261527210474014, "epoch": 0.6662001399580126, "grad_norm": 0.4210983873711036, "kl": 1.5234375, "learning_rate": 8.43482612943827e-06, "loss": 0.0106, "step": 1904 }, { "clip_ratio": 0.006402843166142702, "clipped_completions_ratio": 0.125, "epoch": 0.6665500349895032, "grad_norm": 0.9570199096686173, "kl": 1.4453125, "learning_rate": 8.432606432933716e-06, "loss": 0.0146, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 172.9107208251953, "mean_terminated_completion_length": 161.0408172607422, "min_completion_length": 124.0, "min_terminated_completion_length": 124.0, "num_tokens": 3646889.0, "reward": 2.988060235977173, "reward_std": 0.002499479567632079, "rewards/check_originality_func/mean": 1.0, "rewards/check_originality_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.988059937953949, "rewards/check_winston_local_func/std": 0.00530283385887742, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1905 }, { "clip_ratio": 0.014546722173690796, "epoch": 0.6668999300209937, "grad_norm": 0.7231848611986167, "kl": 1.4453125, "learning_rate": 8.43038545609945e-06, "loss": 0.0114, "step": 1906 }, { "clip_ratio": 0.031338248401880264, "epoch": 0.6672498250524842, "grad_norm": 0.5487960135009952, "kl": 1.4453125, "learning_rate": 8.428163199763876e-06, "loss": 0.0083, "step": 1907 }, { "clip_ratio": 0.04443163052201271, "epoch": 0.6675997200839748, "grad_norm": 0.46464253061876093, "kl": 1.453125, "learning_rate": 8.425939664755874e-06, "loss": 0.0067, "step": 1908 }, { "clip_ratio": 0.006265887524932623, "clipped_completions_ratio": 0.0, "epoch": 0.6679496151154654, "grad_norm": 1.1518688233998289, "kl": 1.796875, "learning_rate": 8.4237148519048e-06, "loss": 0.0143, "max_completion_length": 233.0, "max_terminated_completion_length": 233.0, "mean_completion_length": 147.1428680419922, "mean_terminated_completion_length": 147.1428680419922, "min_completion_length": 75.0, "min_terminated_completion_length": 75.0, "num_tokens": 3664625.0, "reward": 2.900477409362793, "reward_std": 0.14196519553661346, "rewards/check_originality_func/mean": 0.9107142686843872, "rewards/check_originality_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9897629618644714, "rewards/check_winston_local_func/std": 0.004431195091456175, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1909 }, { "clip_ratio": 0.015489662066102028, "epoch": 0.6682995101469559, "grad_norm": 0.778237898344824, "kl": 1.796875, "learning_rate": 8.42148876204049e-06, "loss": 0.0103, "step": 1910 }, { "clip_ratio": 0.03478284180164337, "epoch": 0.6686494051784465, "grad_norm": 0.598460959753167, "kl": 1.8046875, "learning_rate": 8.419261395993255e-06, "loss": 0.0072, "step": 1911 }, { "clip_ratio": 0.05304986983537674, "epoch": 0.668999300209937, "grad_norm": 0.4636265961479712, "kl": 1.8046875, "learning_rate": 8.417032754593879e-06, "loss": 0.0053, "step": 1912 }, { "clip_ratio": 0.005703727249056101, "clipped_completions_ratio": 0.0, "epoch": 0.6693491952414276, "grad_norm": 0.9012163696418458, "kl": 1.3125, "learning_rate": 8.414802838673626e-06, "loss": 0.0073, "max_completion_length": 240.0, "max_terminated_completion_length": 240.0, "mean_completion_length": 172.07144165039062, "mean_terminated_completion_length": 172.07144165039062, "min_completion_length": 120.0, "min_terminated_completion_length": 120.0, "num_tokens": 3685397.0, "reward": 2.9917588233947754, "reward_std": 0.001816671108826995, "rewards/check_originality_func/mean": 1.0, "rewards/check_originality_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9917586445808411, "rewards/check_winston_local_func/std": 0.003841186175122857, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1913 }, { "clip_ratio": 0.011949235573410988, "epoch": 0.6696990902729181, "grad_norm": 0.6992162626195751, "kl": 1.3125, "learning_rate": 8.412571649064233e-06, "loss": 0.0037, "step": 1914 }, { "clip_ratio": 0.02919371984899044, "epoch": 0.6700489853044087, "grad_norm": 0.4945183273704302, "kl": 1.3203125, "learning_rate": 8.410339186597914e-06, "loss": 0.0005, "step": 1915 }, { "clip_ratio": 0.04326971620321274, "epoch": 0.6703988803358992, "grad_norm": 0.4097951859454409, "kl": 1.328125, "learning_rate": 8.408105452107353e-06, "loss": -0.0009, "step": 1916 }, { "clip_ratio": 0.008883016183972359, "clipped_completions_ratio": 0.017857142857142905, "epoch": 0.6707487753673897, "grad_norm": 1.2609611001295615, "kl": 1.6796875, "learning_rate": 8.405870446425715e-06, "loss": 0.0133, "max_completion_length": 256.0, "max_terminated_completion_length": 252.0, "mean_completion_length": 164.5178680419922, "mean_terminated_completion_length": 162.8545379638672, "min_completion_length": 95.0, "min_terminated_completion_length": 95.0, "num_tokens": 3705242.0, "reward": 2.9526941776275635, "reward_std": 0.06728103011846542, "rewards/check_originality_func/mean": 0.9642857313156128, "rewards/check_originality_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9909592866897583, "rewards/check_winston_local_func/std": 0.007765123154968023, "rewards/sentence_count_match_reward_logic/mean": 0.9974489808082581, "rewards/sentence_count_match_reward_logic/std": 0.019090088084340096, "step": 1917 }, { "clip_ratio": 0.01855776645243168, "epoch": 0.6710986703988804, "grad_norm": 0.8215619003253551, "kl": 1.6796875, "learning_rate": 8.403634170386634e-06, "loss": 0.0095, "step": 1918 }, { "clip_ratio": 0.03635615110397339, "epoch": 0.6714485654303709, "grad_norm": 0.6359420171655977, "kl": 1.6875, "learning_rate": 8.40139662482422e-06, "loss": 0.0054, "step": 1919 }, { "clip_ratio": 0.05613154545426369, "epoch": 0.6717984604618614, "grad_norm": 0.5619024357733998, "kl": 1.6796875, "learning_rate": 8.39915781057306e-06, "loss": 0.0036, "step": 1920 }, { "clip_ratio": 0.006476602051407099, "clipped_completions_ratio": 0.0, "epoch": 0.672148355493352, "grad_norm": 0.8759826787002964, "kl": 1.7421875, "learning_rate": 8.396917728468203e-06, "loss": 0.015, "max_completion_length": 208.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 144.0357208251953, "mean_terminated_completion_length": 144.0357208251953, "min_completion_length": 107.0, "min_terminated_completion_length": 107.0, "num_tokens": 3722748.0, "reward": 2.9021482467651367, "reward_std": 0.07628687471151352, "rewards/check_originality_func/mean": 0.9107142686843872, "rewards/check_originality_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9914336800575256, "rewards/check_winston_local_func/std": 0.004669444635510445, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1921 }, { "clip_ratio": 0.013650279492139816, "epoch": 0.6724982505248426, "grad_norm": 0.7106304527354571, "kl": 1.7421875, "learning_rate": 8.394676379345187e-06, "loss": 0.0122, "step": 1922 }, { "clip_ratio": 0.030457254499197006, "epoch": 0.6728481455563331, "grad_norm": 0.5211236440949127, "kl": 1.7421875, "learning_rate": 8.392433764040008e-06, "loss": 0.0092, "step": 1923 }, { "clip_ratio": 0.044990912079811096, "epoch": 0.6731980405878236, "grad_norm": 0.401619453432595, "kl": 1.75, "learning_rate": 8.390189883389143e-06, "loss": 0.0081, "step": 1924 }, { "clip_ratio": 0.0063643851317465305, "clipped_completions_ratio": 0.0, "epoch": 0.6735479356193143, "grad_norm": 0.8037739508507984, "kl": 1.2109375, "learning_rate": 8.387944738229536e-06, "loss": 0.0063, "max_completion_length": 244.0, "max_terminated_completion_length": 244.0, "mean_completion_length": 188.33929443359375, "mean_terminated_completion_length": 188.33929443359375, "min_completion_length": 132.0, "min_terminated_completion_length": 132.0, "num_tokens": 3745375.0, "reward": 2.903007745742798, "reward_std": 0.07889077812433243, "rewards/check_originality_func/mean": 0.9285714030265808, "rewards/check_originality_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9897419810295105, "rewards/check_winston_local_func/std": 0.005626677069813013, "rewards/sentence_count_match_reward_logic/mean": 0.9846938848495483, "rewards/sentence_count_match_reward_logic/std": 0.044584840536117554, "step": 1925 }, { "clip_ratio": 0.013816064223647118, "epoch": 0.6738978306508048, "grad_norm": 0.7339870617281671, "kl": 1.2109375, "learning_rate": 8.385698329398609e-06, "loss": 0.0028, "step": 1926 }, { "clip_ratio": 0.026869535446166992, "epoch": 0.6742477256822953, "grad_norm": 0.5435336937210192, "kl": 1.2109375, "learning_rate": 8.383450657734245e-06, "loss": -0.0005, "step": 1927 }, { "clip_ratio": 0.04078582674264908, "epoch": 0.6745976207137858, "grad_norm": 0.45733338578482186, "kl": 1.21875, "learning_rate": 8.38120172407481e-06, "loss": -0.0027, "step": 1928 }, { "clip_ratio": 0.005539380479604006, "clipped_completions_ratio": 0.0, "epoch": 0.6749475157452764, "grad_norm": 0.8660517824999532, "kl": 1.4453125, "learning_rate": 8.378951529259133e-06, "loss": 0.016, "max_completion_length": 255.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 184.83929443359375, "mean_terminated_completion_length": 184.83929443359375, "min_completion_length": 113.0, "min_terminated_completion_length": 113.0, "num_tokens": 3767494.0, "reward": 2.9888360500335693, "reward_std": 0.0020405040122568607, "rewards/check_originality_func/mean": 1.0, "rewards/check_originality_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9888361096382141, "rewards/check_winston_local_func/std": 0.004955681972205639, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1929 }, { "clip_ratio": 0.013337476179003716, "epoch": 0.675297410776767, "grad_norm": 0.6652839526669468, "kl": 1.453125, "learning_rate": 8.376700074126515e-06, "loss": 0.0126, "step": 1930 }, { "clip_ratio": 0.030182059854269028, "epoch": 0.6756473058082575, "grad_norm": 0.6916746161737035, "kl": 1.4609375, "learning_rate": 8.374447359516727e-06, "loss": 0.01, "step": 1931 }, { "clip_ratio": 0.04171055927872658, "epoch": 0.6759972008397481, "grad_norm": 0.5020644178340908, "kl": 1.4609375, "learning_rate": 8.37219338627001e-06, "loss": 0.0077, "step": 1932 }, { "clip_ratio": 0.005214073229581118, "clipped_completions_ratio": 0.0, "epoch": 0.6763470958712386, "grad_norm": 0.9439702383193868, "kl": 1.7421875, "learning_rate": 8.369938155227078e-06, "loss": 0.0142, "max_completion_length": 194.0, "max_terminated_completion_length": 194.0, "mean_completion_length": 148.42857360839844, "mean_terminated_completion_length": 148.42857360839844, "min_completion_length": 105.0, "min_terminated_completion_length": 105.0, "num_tokens": 3785550.0, "reward": 2.9708876609802246, "reward_std": 0.05304204672574997, "rewards/check_originality_func/mean": 0.9821428656578064, "rewards/check_originality_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9887447357177734, "rewards/check_winston_local_func/std": 0.006055563222616911, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1933 }, { "clip_ratio": 0.011436952278017998, "epoch": 0.6766969909027292, "grad_norm": 0.7868131479336373, "kl": 1.734375, "learning_rate": 8.367681667229107e-06, "loss": 0.0108, "step": 1934 }, { "clip_ratio": 0.029469717293977737, "epoch": 0.6770468859342197, "grad_norm": 0.5483329969273054, "kl": 1.75, "learning_rate": 8.365423923117747e-06, "loss": 0.0076, "step": 1935 }, { "clip_ratio": 0.0434761680662632, "epoch": 0.6773967809657103, "grad_norm": 0.44928248425276857, "kl": 1.765625, "learning_rate": 8.363164923735116e-06, "loss": 0.0059, "step": 1936 }, { "clip_ratio": 0.006328755524009466, "clipped_completions_ratio": 0.125, "epoch": 0.6777466759972008, "grad_norm": 1.0954159990712395, "kl": 1.65625, "learning_rate": 8.360904669923797e-06, "loss": 0.0084, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 154.80357360839844, "mean_terminated_completion_length": 140.34693908691406, "min_completion_length": 91.0, "min_terminated_completion_length": 91.0, "num_tokens": 3804067.0, "reward": 2.936886787414551, "reward_std": 0.0741080567240715, "rewards/check_originality_func/mean": 0.9464285969734192, "rewards/check_originality_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9924421310424805, "rewards/check_winston_local_func/std": 0.002319609047845006, "rewards/sentence_count_match_reward_logic/mean": 0.9980158805847168, "rewards/sentence_count_match_reward_logic/std": 0.014847845770418644, "step": 1937 }, { "clip_ratio": 0.017525024712085724, "epoch": 0.6780965710286914, "grad_norm": 0.7662158038958206, "kl": 1.6640625, "learning_rate": 8.358643162526846e-06, "loss": 0.0043, "step": 1938 }, { "clip_ratio": 0.038378357887268066, "epoch": 0.678446466060182, "grad_norm": 0.650950177421889, "kl": 1.6796875, "learning_rate": 8.356380402387783e-06, "loss": 0.0006, "step": 1939 }, { "clip_ratio": 0.0558912567794323, "epoch": 0.6787963610916725, "grad_norm": 0.5126676552448739, "kl": 1.671875, "learning_rate": 8.354116390350594e-06, "loss": -0.0011, "step": 1940 }, { "clip_ratio": 0.006287095136940479, "clipped_completions_ratio": 0.2142857142857143, "epoch": 0.679146256123163, "grad_norm": 0.8810787356676623, "kl": 1.34375, "learning_rate": 8.351851127259739e-06, "loss": 0.0167, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 195.12501525878906, "mean_terminated_completion_length": 178.52273559570312, "min_completion_length": 110.0, "min_terminated_completion_length": 110.0, "num_tokens": 3827042.0, "reward": 2.9223744869232178, "reward_std": 0.1507035791873932, "rewards/check_originality_func/mean": 0.9464285969734192, "rewards/check_originality_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9866601228713989, "rewards/check_winston_local_func/std": 0.012588650919497013, "rewards/sentence_count_match_reward_logic/mean": 0.9892857670783997, "rewards/sentence_count_match_reward_logic/std": 0.08017836511135101, "step": 1941 }, { "clip_ratio": 0.012093925848603249, "epoch": 0.6794961511546536, "grad_norm": 0.7261174261838367, "kl": 1.34375, "learning_rate": 8.349584613960136e-06, "loss": 0.0137, "step": 1942 }, { "clip_ratio": 0.026875749230384827, "epoch": 0.6798460461861442, "grad_norm": 0.5849912304276401, "kl": 1.359375, "learning_rate": 8.347316851297171e-06, "loss": 0.0102, "step": 1943 }, { "clip_ratio": 0.038715116679668427, "epoch": 0.6801959412176347, "grad_norm": 0.46164853951513096, "kl": 1.3671875, "learning_rate": 8.345047840116704e-06, "loss": 0.008, "step": 1944 }, { "clip_ratio": 0.006977904587984085, "clipped_completions_ratio": 0.0714285714285714, "epoch": 0.6805458362491252, "grad_norm": 0.9107624537271362, "kl": 1.40625, "learning_rate": 8.342777581265048e-06, "loss": 0.0128, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 184.1607208251953, "mean_terminated_completion_length": 178.63462829589844, "min_completion_length": 101.0, "min_terminated_completion_length": 101.0, "num_tokens": 3849075.0, "reward": 2.937105894088745, "reward_std": 0.15249833464622498, "rewards/check_originality_func/mean": 0.9464285969734192, "rewards/check_originality_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9906773567199707, "rewards/check_winston_local_func/std": 0.0028803967870771885, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1945 }, { "clip_ratio": 0.010929042473435402, "epoch": 0.6808957312806159, "grad_norm": 0.7809586838267938, "kl": 1.3984375, "learning_rate": 8.340506075588994e-06, "loss": 0.0093, "step": 1946 }, { "clip_ratio": 0.02599426545202732, "epoch": 0.6812456263121064, "grad_norm": 0.5707080022433015, "kl": 1.3984375, "learning_rate": 8.33823332393579e-06, "loss": 0.0056, "step": 1947 }, { "clip_ratio": 0.04444075748324394, "epoch": 0.6815955213435969, "grad_norm": 0.42860209566650764, "kl": 1.40625, "learning_rate": 8.335959327153148e-06, "loss": 0.0034, "step": 1948 }, { "clip_ratio": 0.00624271621927619, "clipped_completions_ratio": 0.0, "epoch": 0.6819454163750874, "grad_norm": 1.0591224059338, "kl": 1.6171875, "learning_rate": 8.333684086089251e-06, "loss": 0.0226, "max_completion_length": 231.0, "max_terminated_completion_length": 231.0, "mean_completion_length": 148.71429443359375, "mean_terminated_completion_length": 148.71429443359375, "min_completion_length": 76.0, "min_terminated_completion_length": 76.0, "num_tokens": 3866795.0, "reward": 2.9187045097351074, "reward_std": 0.08438695222139359, "rewards/check_originality_func/mean": 0.9285714030265808, "rewards/check_originality_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9923650026321411, "rewards/check_winston_local_func/std": 0.003173974109813571, "rewards/sentence_count_match_reward_logic/mean": 0.9977678656578064, "rewards/sentence_count_match_reward_logic/std": 0.016703827306628227, "step": 1949 }, { "clip_ratio": 0.015410822816193104, "epoch": 0.6822953114065781, "grad_norm": 0.7007294750885624, "kl": 1.6171875, "learning_rate": 8.33140760159274e-06, "loss": 0.0185, "step": 1950 }, { "clip_ratio": 0.029957661405205727, "epoch": 0.6826452064380686, "grad_norm": 0.523840751250495, "kl": 1.6171875, "learning_rate": 8.329129874512724e-06, "loss": 0.0166, "step": 1951 }, { "clip_ratio": 0.037863850593566895, "epoch": 0.6829951014695591, "grad_norm": 0.4808611256889, "kl": 1.6171875, "learning_rate": 8.326850905698774e-06, "loss": 0.0149, "step": 1952 }, { "clip_ratio": 0.006194881163537502, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.6833449965010497, "grad_norm": 0.9663141288275735, "kl": 1.3671875, "learning_rate": 8.32457069600092e-06, "loss": 0.013, "max_completion_length": 256.0, "max_terminated_completion_length": 244.0, "mean_completion_length": 183.07144165039062, "mean_terminated_completion_length": 170.9166717529297, "min_completion_length": 124.0, "min_terminated_completion_length": 124.0, "num_tokens": 3888791.0, "reward": 2.8754148483276367, "reward_std": 0.14349345862865448, "rewards/check_originality_func/mean": 0.9107142686843872, "rewards/check_originality_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9885099530220032, "rewards/check_winston_local_func/std": 0.006037506274878979, "rewards/sentence_count_match_reward_logic/mean": 0.976190447807312, "rewards/sentence_count_match_reward_logic/std": 0.058848995715379715, "step": 1953 }, { "clip_ratio": 0.012484588660299778, "epoch": 0.6836948915325403, "grad_norm": 0.702607957228893, "kl": 1.375, "learning_rate": 8.322289246269662e-06, "loss": 0.0096, "step": 1954 }, { "clip_ratio": 0.029133383184671402, "epoch": 0.6840447865640308, "grad_norm": 0.5148792464244795, "kl": 1.375, "learning_rate": 8.32000655735596e-06, "loss": 0.006, "step": 1955 }, { "clip_ratio": 0.04270954802632332, "epoch": 0.6843946815955213, "grad_norm": 0.4297990781484171, "kl": 1.390625, "learning_rate": 8.317722630111233e-06, "loss": 0.0043, "step": 1956 }, { "clip_ratio": 0.00669521139934659, "clipped_completions_ratio": 0.0, "epoch": 0.6847445766270119, "grad_norm": 0.8133873921732556, "kl": 1.453125, "learning_rate": 8.315437465387367e-06, "loss": 0.0152, "max_completion_length": 239.0, "max_terminated_completion_length": 239.0, "mean_completion_length": 190.9107208251953, "mean_terminated_completion_length": 190.9107208251953, "min_completion_length": 142.0, "min_terminated_completion_length": 142.0, "num_tokens": 3911114.0, "reward": 2.9927923679351807, "reward_std": 0.0012460140278562903, "rewards/check_originality_func/mean": 1.0, "rewards/check_originality_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9927921295166016, "rewards/check_winston_local_func/std": 0.0024281146470457315, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1957 }, { "clip_ratio": 0.012487474828958511, "epoch": 0.6850944716585025, "grad_norm": 0.7268690172362794, "kl": 1.4609375, "learning_rate": 8.313151064036702e-06, "loss": 0.0119, "step": 1958 }, { "clip_ratio": 0.028648771345615387, "epoch": 0.685444366689993, "grad_norm": 0.599859136055295, "kl": 1.46875, "learning_rate": 8.310863426912047e-06, "loss": 0.0088, "step": 1959 }, { "clip_ratio": 0.04641099274158478, "epoch": 0.6857942617214835, "grad_norm": 0.5130785486489206, "kl": 1.46875, "learning_rate": 8.30857455486667e-06, "loss": 0.0062, "step": 1960 }, { "clip_ratio": 0.0048786913976073265, "clipped_completions_ratio": 0.0714285714285714, "epoch": 0.6861441567529741, "grad_norm": 0.9580745231177937, "kl": 1.5234375, "learning_rate": 8.306284448754298e-06, "loss": 0.0147, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 172.75001525878906, "mean_terminated_completion_length": 166.34616088867188, "min_completion_length": 115.0, "min_terminated_completion_length": 115.0, "num_tokens": 3931772.0, "reward": 2.9562864303588867, "reward_std": 0.06743813306093216, "rewards/check_originality_func/mean": 0.9642857313156128, "rewards/check_originality_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9920006990432739, "rewards/check_winston_local_func/std": 0.0021522396709769964, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1961 }, { "clip_ratio": 0.013313083909451962, "epoch": 0.6864940517844647, "grad_norm": 0.6498063080534514, "kl": 1.53125, "learning_rate": 8.303993109429118e-06, "loss": 0.0113, "step": 1962 }, { "clip_ratio": 0.027950696647167206, "epoch": 0.6868439468159552, "grad_norm": 0.541402512901591, "kl": 1.5390625, "learning_rate": 8.301700537745777e-06, "loss": 0.0088, "step": 1963 }, { "clip_ratio": 0.04279223829507828, "epoch": 0.6871938418474458, "grad_norm": 0.4740830803134096, "kl": 1.5546875, "learning_rate": 8.299406734559385e-06, "loss": 0.0071, "step": 1964 }, { "clip_ratio": 0.006317594554275274, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.6875437368789363, "grad_norm": 1.0133706272435232, "kl": 1.5078125, "learning_rate": 8.297111700725508e-06, "loss": 0.0164, "max_completion_length": 256.0, "max_terminated_completion_length": 238.0, "mean_completion_length": 172.92857360839844, "mean_terminated_completion_length": 159.08334350585938, "min_completion_length": 109.0, "min_terminated_completion_length": 109.0, "num_tokens": 3953048.0, "reward": 2.937976360321045, "reward_std": 0.012840615585446358, "rewards/check_originality_func/mean": 1.0, "rewards/check_originality_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9915475845336914, "rewards/check_winston_local_func/std": 0.0028665917925536633, "rewards/sentence_count_match_reward_logic/mean": 0.9464285969734192, "rewards/sentence_count_match_reward_logic/std": 0.09591212868690491, "step": 1965 }, { "clip_ratio": 0.015181533992290497, "epoch": 0.6878936319104269, "grad_norm": 0.8821377346774255, "kl": 1.5078125, "learning_rate": 8.294815437100172e-06, "loss": 0.0125, "step": 1966 }, { "clip_ratio": 0.03783084452152252, "epoch": 0.6882435269419174, "grad_norm": 0.5639387979241924, "kl": 1.5078125, "learning_rate": 8.29251794453986e-06, "loss": 0.0079, "step": 1967 }, { "clip_ratio": 0.05413319915533066, "epoch": 0.688593421973408, "grad_norm": 0.4752756201037744, "kl": 1.5078125, "learning_rate": 8.290219223901517e-06, "loss": 0.0058, "step": 1968 }, { "clip_ratio": 0.005717038176953793, "clipped_completions_ratio": 0.1785714285714286, "epoch": 0.6889433170048985, "grad_norm": 0.8196227038035181, "kl": 1.2265625, "learning_rate": 8.287919276042541e-06, "loss": 0.0132, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 219.3928680419922, "mean_terminated_completion_length": 211.43478393554688, "min_completion_length": 100.0, "min_terminated_completion_length": 100.0, "num_tokens": 3979206.0, "reward": 2.960094928741455, "reward_std": 0.059637464582920074, "rewards/check_originality_func/mean": 0.9821428656578064, "rewards/check_originality_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9898565411567688, "rewards/check_winston_local_func/std": 0.007027376908808947, "rewards/sentence_count_match_reward_logic/mean": 0.988095223903656, "rewards/sentence_count_match_reward_logic/std": 0.03467709571123123, "step": 1969 }, { "clip_ratio": 0.011125275865197182, "epoch": 0.689293212036389, "grad_norm": 0.6645246146529118, "kl": 1.2265625, "learning_rate": 8.285618101820799e-06, "loss": 0.0097, "step": 1970 }, { "clip_ratio": 0.025053950026631355, "epoch": 0.6896431070678797, "grad_norm": 0.5405428217293363, "kl": 1.234375, "learning_rate": 8.283315702094597e-06, "loss": 0.0075, "step": 1971 }, { "clip_ratio": 0.035572584718465805, "epoch": 0.6899930020993702, "grad_norm": 0.4346923912192989, "kl": 1.2421875, "learning_rate": 8.281012077722712e-06, "loss": 0.0058, "step": 1972 }, { "clip_ratio": 0.007016269024461508, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.6903428971308607, "grad_norm": 1.1789580238574988, "kl": 1.65625, "learning_rate": 8.278707229564375e-06, "loss": 0.0155, "max_completion_length": 256.0, "max_terminated_completion_length": 211.0, "mean_completion_length": 172.30357360839844, "mean_terminated_completion_length": 158.3541717529297, "min_completion_length": 74.0, "min_terminated_completion_length": 74.0, "num_tokens": 4000167.0, "reward": 2.8771140575408936, "reward_std": 0.07912319153547287, "rewards/check_originality_func/mean": 0.9285714030265808, "rewards/check_originality_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9868078231811523, "rewards/check_winston_local_func/std": 0.009625340811908245, "rewards/sentence_count_match_reward_logic/mean": 0.9617347121238708, "rewards/sentence_count_match_reward_logic/std": 0.06123994663357735, "step": 1973 }, { "clip_ratio": 0.015885857865214348, "epoch": 0.6906927921623512, "grad_norm": 0.8567160442913183, "kl": 1.65625, "learning_rate": 8.276401158479272e-06, "loss": 0.0114, "step": 1974 }, { "clip_ratio": 0.03163477033376694, "epoch": 0.6910426871938419, "grad_norm": 0.5724532440590835, "kl": 1.65625, "learning_rate": 8.274093865327548e-06, "loss": 0.0077, "step": 1975 }, { "clip_ratio": 0.0471312589943409, "epoch": 0.6913925822253324, "grad_norm": 0.4658025242766989, "kl": 1.65625, "learning_rate": 8.271785350969799e-06, "loss": 0.0057, "step": 1976 }, { "clip_ratio": 0.006378529127687216, "clipped_completions_ratio": 0.1071428571428571, "epoch": 0.6917424772568229, "grad_norm": 0.8797544683116297, "kl": 1.390625, "learning_rate": 8.269475616267077e-06, "loss": 0.0162, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 194.6607208251953, "mean_terminated_completion_length": 187.3000030517578, "min_completion_length": 92.0, "min_terminated_completion_length": 92.0, "num_tokens": 4023204.0, "reward": 2.8457894325256348, "reward_std": 0.20179075002670288, "rewards/check_originality_func/mean": 0.8571428656578064, "rewards/check_originality_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.988646388053894, "rewards/check_winston_local_func/std": 0.006582553032785654, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1977 }, { "clip_ratio": 0.010802144184708595, "epoch": 0.6920923722883136, "grad_norm": 0.7608403849530241, "kl": 1.390625, "learning_rate": 8.267164662080897e-06, "loss": 0.0134, "step": 1978 }, { "clip_ratio": 0.02435326762497425, "epoch": 0.6924422673198041, "grad_norm": 0.5293125004067717, "kl": 1.3828125, "learning_rate": 8.264852489273217e-06, "loss": 0.0108, "step": 1979 }, { "clip_ratio": 0.03850339725613594, "epoch": 0.6927921623512946, "grad_norm": 0.4978302921362726, "kl": 1.390625, "learning_rate": 8.26253909870646e-06, "loss": 0.0089, "step": 1980 }, { "clip_ratio": 0.00806526280939579, "clipped_completions_ratio": 0.0, "epoch": 0.6931420573827851, "grad_norm": 1.043517433599681, "kl": 1.6328125, "learning_rate": 8.260224491243497e-06, "loss": 0.0144, "max_completion_length": 244.0, "max_terminated_completion_length": 244.0, "mean_completion_length": 167.55357360839844, "mean_terminated_completion_length": 167.55357360839844, "min_completion_length": 117.0, "min_terminated_completion_length": 117.0, "num_tokens": 4043051.0, "reward": 2.954171895980835, "reward_std": 0.07310697436332703, "rewards/check_originality_func/mean": 0.9642857313156128, "rewards/check_originality_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9921181797981262, "rewards/check_winston_local_func/std": 0.002131227171048522, "rewards/sentence_count_match_reward_logic/mean": 0.9977678656578064, "rewards/sentence_count_match_reward_logic/std": 0.016703827306628227, "step": 1981 }, { "clip_ratio": 0.01757936365902424, "epoch": 0.6934919524142757, "grad_norm": 0.8051947018784344, "kl": 1.6328125, "learning_rate": 8.257908667747651e-06, "loss": 0.01, "step": 1982 }, { "clip_ratio": 0.03566616401076317, "epoch": 0.6938418474457663, "grad_norm": 0.6433738332527957, "kl": 1.6328125, "learning_rate": 8.25559162908271e-06, "loss": 0.0066, "step": 1983 }, { "clip_ratio": 0.05536020174622536, "epoch": 0.6941917424772568, "grad_norm": 0.517241272112107, "kl": 1.640625, "learning_rate": 8.253273376112902e-06, "loss": 0.0045, "step": 1984 }, { "clip_ratio": 0.006855878978967667, "clipped_completions_ratio": 0.0, "epoch": 0.6945416375087474, "grad_norm": 1.1192565347666636, "kl": 1.6171875, "learning_rate": 8.250953909702915e-06, "loss": 0.0104, "max_completion_length": 236.0, "max_terminated_completion_length": 236.0, "mean_completion_length": 165.875, "mean_terminated_completion_length": 165.875, "min_completion_length": 111.0, "min_terminated_completion_length": 111.0, "num_tokens": 4063012.0, "reward": 2.97469425201416, "reward_std": 0.05130118504166603, "rewards/check_originality_func/mean": 0.9821428656578064, "rewards/check_originality_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9925513863563538, "rewards/check_winston_local_func/std": 0.0017971195047721267, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1985 }, { "clip_ratio": 0.017757799476385117, "epoch": 0.6948915325402379, "grad_norm": 0.7772544390416979, "kl": 1.6171875, "learning_rate": 8.248633230717888e-06, "loss": 0.0067, "step": 1986 }, { "clip_ratio": 0.03391910716891289, "epoch": 0.6952414275717285, "grad_norm": 0.5853105385512613, "kl": 1.625, "learning_rate": 8.246311340023412e-06, "loss": 0.0034, "step": 1987 }, { "clip_ratio": 0.05112072452902794, "epoch": 0.695591322603219, "grad_norm": 0.5192361141061372, "kl": 1.625, "learning_rate": 8.24398823848553e-06, "loss": 0.0011, "step": 1988 }, { "clip_ratio": 0.006847807206213474, "clipped_completions_ratio": 0.1964285714285714, "epoch": 0.6959412176347096, "grad_norm": 0.9617758606906658, "kl": 1.515625, "learning_rate": 8.241663926970738e-06, "loss": 0.0084, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 178.44644165039062, "mean_terminated_completion_length": 159.4888916015625, "min_completion_length": 95.0, "min_terminated_completion_length": 95.0, "num_tokens": 4084157.0, "reward": 2.8147804737091064, "reward_std": 0.17032024264335632, "rewards/check_originality_func/mean": 0.8392857313156128, "rewards/check_originality_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.9862089157104492, "rewards/check_winston_local_func/std": 0.015266292728483677, "rewards/sentence_count_match_reward_logic/mean": 0.9892857670783997, "rewards/sentence_count_match_reward_logic/std": 0.045441556721925735, "step": 1989 }, { "clip_ratio": 0.01317503023892641, "epoch": 0.6962911126662001, "grad_norm": 0.7705214429193041, "kl": 1.515625, "learning_rate": 8.23933840634598e-06, "loss": 0.005, "step": 1990 }, { "clip_ratio": 0.029583895578980446, "epoch": 0.6966410076976907, "grad_norm": 0.5298518541653803, "kl": 1.5234375, "learning_rate": 8.237011677478657e-06, "loss": 0.0016, "step": 1991 }, { "clip_ratio": 0.04474521800875664, "epoch": 0.6969909027291813, "grad_norm": 0.43187660311221265, "kl": 1.53125, "learning_rate": 8.234683741236612e-06, "loss": -0.0008, "step": 1992 }, { "clip_ratio": 0.006952984258532524, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.6973407977606718, "grad_norm": 0.9048803044934997, "kl": 1.4140625, "learning_rate": 8.232354598488148e-06, "loss": 0.0143, "max_completion_length": 256.0, "max_terminated_completion_length": 240.0, "mean_completion_length": 180.30357360839844, "mean_terminated_completion_length": 167.6875, "min_completion_length": 124.0, "min_terminated_completion_length": 124.0, "num_tokens": 4105590.0, "reward": 2.8364651203155518, "reward_std": 0.1576537787914276, "rewards/check_originality_func/mean": 0.8571428656578064, "rewards/check_originality_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.990685760974884, "rewards/check_winston_local_func/std": 0.005519109778106213, "rewards/sentence_count_match_reward_logic/mean": 0.9886363744735718, "rewards/sentence_count_match_reward_logic/std": 0.030337436124682426, "step": 1993 }, { "clip_ratio": 0.014076356776058674, "epoch": 0.6976906927921623, "grad_norm": 0.7725440596577596, "kl": 1.4140625, "learning_rate": 8.230024250102009e-06, "loss": 0.0113, "step": 1994 }, { "clip_ratio": 0.02678089588880539, "epoch": 0.6980405878236529, "grad_norm": 0.553688580157572, "kl": 1.4140625, "learning_rate": 8.227692696947395e-06, "loss": 0.0079, "step": 1995 }, { "clip_ratio": 0.04360263794660568, "epoch": 0.6983904828551435, "grad_norm": 0.4660659791210874, "kl": 1.4140625, "learning_rate": 8.225359939893954e-06, "loss": 0.0061, "step": 1996 }, { "clip_ratio": 0.007040882017463446, "clipped_completions_ratio": 0.0, "epoch": 0.698740377886634, "grad_norm": 0.974510667763935, "kl": 1.546875, "learning_rate": 8.223025979811781e-06, "loss": 0.0115, "max_completion_length": 242.0, "max_terminated_completion_length": 242.0, "mean_completion_length": 180.23214721679688, "mean_terminated_completion_length": 180.23214721679688, "min_completion_length": 92.0, "min_terminated_completion_length": 92.0, "num_tokens": 4126843.0, "reward": 2.9561212062835693, "reward_std": 0.06787911057472229, "rewards/check_originality_func/mean": 0.9642857313156128, "rewards/check_originality_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9918354153633118, "rewards/check_winston_local_func/std": 0.0031191103626042604, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 1997 }, { "clip_ratio": 0.014044515788555145, "epoch": 0.6990902729181245, "grad_norm": 0.6770238608321105, "kl": 1.5546875, "learning_rate": 8.220690817571422e-06, "loss": 0.0081, "step": 1998 }, { "clip_ratio": 0.029513046145439148, "epoch": 0.6994401679496152, "grad_norm": 0.5205143125689183, "kl": 1.5703125, "learning_rate": 8.218354454043868e-06, "loss": 0.0061, "step": 1999 }, { "clip_ratio": 0.0425686351954937, "epoch": 0.6997900629811057, "grad_norm": 0.46638529850118954, "kl": 1.5859375, "learning_rate": 8.216016890100564e-06, "loss": 0.0041, "step": 2000 }, { "clip_ratio": 0.009346192702651024, "clipped_completions_ratio": 0.0, "epoch": 0.7001399580125962, "grad_norm": 3.061494766466172, "kl": 1.6796875, "learning_rate": 8.213678126613398e-06, "loss": 0.0245, "max_completion_length": 212.0, "max_terminated_completion_length": 212.0, "mean_completion_length": 101.08928680419922, "mean_terminated_completion_length": 101.08928680419922, "min_completion_length": 9.0, "min_terminated_completion_length": 9.0, "num_tokens": 12941.0, "reward": 2.6267483234405518, "reward_std": 0.2983337342739105, "rewards/check_gptzero_func/mean": 0.6607142686843872, "rewards/check_gptzero_func/std": 0.477751761674881, "rewards/check_winston_local_func/mean": 0.9838911294937134, "rewards/check_winston_local_func/std": 0.01577530801296234, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.054907578974962234, "step": 2001 }, { "clip_ratio": 0.03575694188475609, "epoch": 0.7004898530440867, "grad_norm": 1.4489722632552837, "kl": 1.8203125, "learning_rate": 8.211338164454706e-06, "loss": 0.0164, "step": 2002 }, { "clip_ratio": 0.0464215986430645, "epoch": 0.7008397480755774, "grad_norm": 0.9452486910764574, "kl": 1.6953125, "learning_rate": 8.20899700449727e-06, "loss": 0.0123, "step": 2003 }, { "clip_ratio": 0.07032540440559387, "epoch": 0.7011896431070679, "grad_norm": 3.9842489792632194, "kl": 1.6484375, "learning_rate": 8.206654647614323e-06, "loss": 0.0191, "step": 2004 }, { "clip_ratio": 0.0065513052977621555, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7015395381385584, "grad_norm": 1.6848919121505581, "kl": 2.0, "learning_rate": 8.204311094679543e-06, "loss": 0.0289, "max_completion_length": 256.0, "max_terminated_completion_length": 123.0, "mean_completion_length": 74.83928680419922, "mean_terminated_completion_length": 44.645835876464844, "min_completion_length": 21.0, "min_terminated_completion_length": 21.0, "num_tokens": 23140.0, "reward": 2.7675349712371826, "reward_std": 0.0868251621723175, "rewards/check_gptzero_func/mean": 0.8035714030265808, "rewards/check_gptzero_func/std": 0.40089187026023865, "rewards/check_winston_local_func/mean": 0.983606219291687, "rewards/check_winston_local_func/std": 0.007673986256122589, "rewards/sentence_count_match_reward_logic/mean": 0.9803571701049805, "rewards/sentence_count_match_reward_logic/std": 0.055332522839307785, "step": 2005 }, { "clip_ratio": 0.021313440054655075, "epoch": 0.701889433170049, "grad_norm": 0.9187765399238488, "kl": 2.0, "learning_rate": 8.20196634656705e-06, "loss": 0.0242, "step": 2006 }, { "clip_ratio": 0.03894505649805069, "epoch": 0.7022393282015396, "grad_norm": 0.7927156952733643, "kl": 2.015625, "learning_rate": 8.199620404151413e-06, "loss": 0.0227, "step": 2007 }, { "clip_ratio": 0.04713978245854378, "epoch": 0.7025892232330301, "grad_norm": 0.8915494720838415, "kl": 2.0, "learning_rate": 8.19727326830765e-06, "loss": 0.021, "step": 2008 }, { "clip_ratio": 0.005889543332159519, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7029391182645206, "grad_norm": 1.7601802251960126, "kl": 1.4921875, "learning_rate": 8.194924939911217e-06, "loss": 0.0153, "max_completion_length": 256.0, "max_terminated_completion_length": 216.0, "mean_completion_length": 135.875, "mean_terminated_completion_length": 115.85417175292969, "min_completion_length": 28.0, "min_terminated_completion_length": 28.0, "num_tokens": 39901.0, "reward": 2.887225866317749, "reward_std": 0.08795926719903946, "rewards/check_gptzero_func/mean": 0.9642857313156128, "rewards/check_gptzero_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9863330125808716, "rewards/check_winston_local_func/std": 0.009825514629483223, "rewards/sentence_count_match_reward_logic/mean": 0.9366071820259094, "rewards/sentence_count_match_reward_logic/std": 0.10723426938056946, "step": 2009 }, { "clip_ratio": 0.0202956460416317, "epoch": 0.7032890132960112, "grad_norm": 1.2100427655554824, "kl": 1.4921875, "learning_rate": 8.19257541983802e-06, "loss": 0.0112, "step": 2010 }, { "clip_ratio": 0.03513222560286522, "epoch": 0.7036389083275018, "grad_norm": 0.987840790173611, "kl": 1.5234375, "learning_rate": 8.190224708964406e-06, "loss": 0.008, "step": 2011 }, { "clip_ratio": 0.050053540617227554, "epoch": 0.7039888033589923, "grad_norm": 1.0391732028971115, "kl": 1.4921875, "learning_rate": 8.18787280816717e-06, "loss": 0.0053, "step": 2012 }, { "clip_ratio": 0.010341576300561428, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7043386983904829, "grad_norm": 2.2275096440651594, "kl": 1.78125, "learning_rate": 8.185519718323547e-06, "loss": 0.0171, "max_completion_length": 256.0, "max_terminated_completion_length": 138.0, "mean_completion_length": 83.17857360839844, "mean_terminated_completion_length": 54.375, "min_completion_length": 13.0, "min_terminated_completion_length": 13.0, "num_tokens": 51911.0, "reward": 2.6878483295440674, "reward_std": 0.1897146999835968, "rewards/check_gptzero_func/mean": 0.8035714030265808, "rewards/check_gptzero_func/std": 0.40089184045791626, "rewards/check_winston_local_func/mean": 0.9435028433799744, "rewards/check_winston_local_func/std": 0.097801074385643, "rewards/sentence_count_match_reward_logic/mean": 0.9407738447189331, "rewards/sentence_count_match_reward_logic/std": 0.13211257755756378, "step": 2013 }, { "clip_ratio": 0.029213646426796913, "epoch": 0.7046885934219734, "grad_norm": 0.8971409190341109, "kl": 1.7734375, "learning_rate": 8.183165440311218e-06, "loss": 0.0134, "step": 2014 }, { "clip_ratio": 0.05232185870409012, "epoch": 0.705038488453464, "grad_norm": 0.5540398383911895, "kl": 1.7734375, "learning_rate": 8.180809975008305e-06, "loss": 0.0113, "step": 2015 }, { "clip_ratio": 0.06180186942219734, "epoch": 0.7053883834849545, "grad_norm": 0.6172407521990341, "kl": 1.78125, "learning_rate": 8.178453323293378e-06, "loss": 0.0108, "step": 2016 }, { "clip_ratio": 0.007611092645674944, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7057382785164451, "grad_norm": 4.114805632390107, "kl": 2.109375, "learning_rate": 8.176095486045443e-06, "loss": 0.0314, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 134.10714721679688, "mean_terminated_completion_length": 113.79167175292969, "min_completion_length": 10.0, "min_terminated_completion_length": 10.0, "num_tokens": 68493.0, "reward": 2.900608539581299, "reward_std": 0.0810503214597702, "rewards/check_gptzero_func/mean": 0.9285714030265808, "rewards/check_gptzero_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9881085157394409, "rewards/check_winston_local_func/std": 0.009711154736578465, "rewards/sentence_count_match_reward_logic/mean": 0.9839285612106323, "rewards/sentence_count_match_reward_logic/std": 0.04167749360203743, "step": 2017 }, { "clip_ratio": 0.030746059492230415, "epoch": 0.7060881735479356, "grad_norm": 1.50369658171109, "kl": 2.140625, "learning_rate": 8.173736464143951e-06, "loss": 0.0247, "step": 2018 }, { "clip_ratio": 0.04907907918095589, "epoch": 0.7064380685794261, "grad_norm": 1.9508860602564155, "kl": 2.375, "learning_rate": 8.171376258468796e-06, "loss": 0.0242, "step": 2019 }, { "clip_ratio": 0.05995853245258331, "epoch": 0.7067879636109167, "grad_norm": 1.0575939671158037, "kl": 2.203125, "learning_rate": 8.169014869900308e-06, "loss": 0.0212, "step": 2020 }, { "clip_ratio": 0.007321977522224188, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7071378586424073, "grad_norm": 1.963635345039133, "kl": 1.71875, "learning_rate": 8.16665229931927e-06, "loss": 0.0233, "max_completion_length": 256.0, "max_terminated_completion_length": 188.0, "mean_completion_length": 112.71429443359375, "mean_terminated_completion_length": 88.83333587646484, "min_completion_length": 28.0, "min_terminated_completion_length": 28.0, "num_tokens": 83469.0, "reward": 2.8188931941986084, "reward_std": 0.16854283213615417, "rewards/check_gptzero_func/mean": 0.875, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.9671074748039246, "rewards/check_winston_local_func/std": 0.05377361923456192, "rewards/sentence_count_match_reward_logic/mean": 0.9767857193946838, "rewards/sentence_count_match_reward_logic/std": 0.06321988254785538, "step": 2021 }, { "clip_ratio": 0.025107942521572113, "epoch": 0.7074877536738978, "grad_norm": 1.2047697415530068, "kl": 1.71875, "learning_rate": 8.164288547606893e-06, "loss": 0.0185, "step": 2022 }, { "clip_ratio": 0.05073237046599388, "epoch": 0.7078376487053883, "grad_norm": 0.6694533254478159, "kl": 1.71875, "learning_rate": 8.161923615644836e-06, "loss": 0.0137, "step": 2023 }, { "clip_ratio": 0.06581519544124603, "epoch": 0.708187543736879, "grad_norm": 0.6636735523257131, "kl": 1.7265625, "learning_rate": 8.159557504315197e-06, "loss": 0.0122, "step": 2024 }, { "clip_ratio": 0.004663478117436171, "clipped_completions_ratio": 0.2321428571428571, "epoch": 0.7085374387683695, "grad_norm": 5.1511458362797, "kl": 0.9140625, "learning_rate": 8.15719021450051e-06, "loss": -0.0424, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 135.5, "mean_terminated_completion_length": 99.06977081298828, "min_completion_length": 5.0, "min_terminated_completion_length": 5.0, "num_tokens": 101409.0, "reward": 2.7221360206604004, "reward_std": 0.05374438688158989, "rewards/check_gptzero_func/mean": 0.8571428656578064, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9457507729530334, "rewards/check_winston_local_func/std": 0.10601859539747238, "rewards/sentence_count_match_reward_logic/mean": 0.919242262840271, "rewards/sentence_count_match_reward_logic/std": 0.13636942207813263, "step": 2025 }, { "clip_ratio": 0.04628335312008858, "epoch": 0.70888733379986, "grad_norm": 1.5498517452088818, "kl": 1.046875, "learning_rate": 8.154821747083755e-06, "loss": -0.0515, "step": 2026 }, { "clip_ratio": 0.07727441936731339, "epoch": 0.7092372288313505, "grad_norm": 1.4476832532675386, "kl": 1.2421875, "learning_rate": 8.152452102948348e-06, "loss": -0.0531, "step": 2027 }, { "clip_ratio": 0.08558981865644455, "epoch": 0.7095871238628412, "grad_norm": 2.3784708106755237, "kl": 1.3046875, "learning_rate": 8.150081282978139e-06, "loss": -0.0547, "step": 2028 }, { "clip_ratio": 0.005684252362698317, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7099370188943317, "grad_norm": 2.092635491713864, "kl": 1.484375, "learning_rate": 8.147709288057428e-06, "loss": 0.0189, "max_completion_length": 256.0, "max_terminated_completion_length": 145.0, "mean_completion_length": 104.48214721679688, "mean_terminated_completion_length": 79.22917175292969, "min_completion_length": 19.0, "min_terminated_completion_length": 19.0, "num_tokens": 115500.0, "reward": 2.690077066421509, "reward_std": 0.15867558121681213, "rewards/check_gptzero_func/mean": 0.875, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.8648217916488647, "rewards/check_winston_local_func/std": 0.29303649067878723, "rewards/sentence_count_match_reward_logic/mean": 0.9502550959587097, "rewards/sentence_count_match_reward_logic/std": 0.12328199297189713, "step": 2029 }, { "clip_ratio": 0.01915116421878338, "epoch": 0.7102869139258222, "grad_norm": 0.9004311280102643, "kl": 1.515625, "learning_rate": 8.145336119070942e-06, "loss": 0.0157, "step": 2030 }, { "clip_ratio": 0.03521281108260155, "epoch": 0.7106368089573128, "grad_norm": 2.0489930706650026, "kl": 1.7734375, "learning_rate": 8.142961776903852e-06, "loss": 0.0142, "step": 2031 }, { "clip_ratio": 0.04445498436689377, "epoch": 0.7109867039888034, "grad_norm": 0.6263436209106327, "kl": 1.53125, "learning_rate": 8.140586262441767e-06, "loss": 0.011, "step": 2032 }, { "clip_ratio": 0.005045156925916672, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7113365990202939, "grad_norm": 1.4473615113193672, "kl": 1.1875, "learning_rate": 8.138209576570729e-06, "loss": 0.0064, "max_completion_length": 256.0, "max_terminated_completion_length": 236.0, "mean_completion_length": 112.76786041259766, "mean_terminated_completion_length": 88.89583587646484, "min_completion_length": 9.0, "min_terminated_completion_length": 9.0, "num_tokens": 130983.0, "reward": 2.732717990875244, "reward_std": 0.009563814848661423, "rewards/check_gptzero_func/mean": 0.8571428656578064, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9250256419181824, "rewards/check_winston_local_func/std": 0.1480398327112198, "rewards/sentence_count_match_reward_logic/mean": 0.9505494832992554, "rewards/sentence_count_match_reward_logic/std": 0.12397275120019913, "step": 2033 }, { "clip_ratio": 0.014684529043734074, "epoch": 0.7116864940517844, "grad_norm": 0.5668784687769909, "kl": 1.1796875, "learning_rate": 8.135831720177221e-06, "loss": 0.005, "step": 2034 }, { "clip_ratio": 0.019493691623210907, "epoch": 0.712036389083275, "grad_norm": 0.4745485211123735, "kl": 1.1796875, "learning_rate": 8.133452694148159e-06, "loss": 0.0039, "step": 2035 }, { "clip_ratio": 0.0241145808249712, "epoch": 0.7123862841147656, "grad_norm": 0.4612740489274654, "kl": 1.171875, "learning_rate": 8.131072499370897e-06, "loss": 0.0024, "step": 2036 }, { "clip_ratio": 0.006510803941637278, "clipped_completions_ratio": 0.0, "epoch": 0.7127361791462561, "grad_norm": 1.3299696117521522, "kl": 2.078125, "learning_rate": 8.128691136733228e-06, "loss": 0.0038, "max_completion_length": 146.0, "max_terminated_completion_length": 146.0, "mean_completion_length": 78.14286041259766, "mean_terminated_completion_length": 78.14286041259766, "min_completion_length": 31.0, "min_terminated_completion_length": 31.0, "num_tokens": 141439.0, "reward": 2.8437631130218506, "reward_std": 0.08205083757638931, "rewards/check_gptzero_func/mean": 0.8928571343421936, "rewards/check_gptzero_func/std": 0.31209391355514526, "rewards/check_winston_local_func/mean": 0.9821557998657227, "rewards/check_winston_local_func/std": 0.02937423437833786, "rewards/sentence_count_match_reward_logic/mean": 0.96875, "rewards/sentence_count_match_reward_logic/std": 0.08342797309160233, "step": 2037 }, { "clip_ratio": 0.020427156239748, "epoch": 0.7130860741777467, "grad_norm": 0.7729328585264367, "kl": 2.078125, "learning_rate": 8.126308607123375e-06, "loss": -0.0006, "step": 2038 }, { "clip_ratio": 0.0365627221763134, "epoch": 0.7134359692092372, "grad_norm": 0.5913451770379019, "kl": 2.09375, "learning_rate": 8.12392491143e-06, "loss": -0.0026, "step": 2039 }, { "clip_ratio": 0.04679306969046593, "epoch": 0.7137858642407278, "grad_norm": 0.4460196738577376, "kl": 2.109375, "learning_rate": 8.121540050542198e-06, "loss": -0.0041, "step": 2040 }, { "clip_ratio": 0.004970156587660313, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7141357592722183, "grad_norm": 2.042887632169635, "kl": 1.8359375, "learning_rate": 8.119154025349503e-06, "loss": 0.0171, "max_completion_length": 256.0, "max_terminated_completion_length": 190.0, "mean_completion_length": 94.4464340209961, "mean_terminated_completion_length": 67.52083587646484, "min_completion_length": 17.0, "min_terminated_completion_length": 17.0, "num_tokens": 154744.0, "reward": 2.7449707984924316, "reward_std": 0.0811430960893631, "rewards/check_gptzero_func/mean": 0.8035714030265808, "rewards/check_gptzero_func/std": 0.40089184045791626, "rewards/check_winston_local_func/mean": 0.9873175024986267, "rewards/check_winston_local_func/std": 0.005508686415851116, "rewards/sentence_count_match_reward_logic/mean": 0.954081654548645, "rewards/sentence_count_match_reward_logic/std": 0.1143089011311531, "step": 2041 }, { "clip_ratio": 0.026233332231640816, "epoch": 0.7144856543037089, "grad_norm": 0.8725866004024965, "kl": 1.859375, "learning_rate": 8.116766836741877e-06, "loss": 0.013, "step": 2042 }, { "clip_ratio": 0.0446140430867672, "epoch": 0.7148355493351994, "grad_norm": 1.0247672735298752, "kl": 1.8828125, "learning_rate": 8.114378485609718e-06, "loss": 0.0116, "step": 2043 }, { "clip_ratio": 0.0513809435069561, "epoch": 0.71518544436669, "grad_norm": 0.3873570663700892, "kl": 1.8515625, "learning_rate": 8.111988972843859e-06, "loss": 0.0098, "step": 2044 }, { "clip_ratio": 0.009132452309131622, "clipped_completions_ratio": 0.0, "epoch": 0.7155353393981806, "grad_norm": 1.2941343961769072, "kl": 1.4453125, "learning_rate": 8.109598299335566e-06, "loss": 0.0201, "max_completion_length": 248.0, "max_terminated_completion_length": 248.0, "mean_completion_length": 146.17857360839844, "mean_terminated_completion_length": 146.17857360839844, "min_completion_length": 33.0, "min_terminated_completion_length": 33.0, "num_tokens": 172442.0, "reward": 2.930277109146118, "reward_std": 0.11740201711654663, "rewards/check_gptzero_func/mean": 0.9642857313156128, "rewards/check_gptzero_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9883124232292175, "rewards/check_winston_local_func/std": 0.010489496402442455, "rewards/sentence_count_match_reward_logic/mean": 0.9776785969734192, "rewards/sentence_count_match_reward_logic/std": 0.06354643404483795, "step": 2045 }, { "clip_ratio": 0.01630699448287487, "epoch": 0.7158852344296711, "grad_norm": 0.8132200972690595, "kl": 1.4375, "learning_rate": 8.107206465976538e-06, "loss": 0.0161, "step": 2046 }, { "clip_ratio": 0.033023901283741, "epoch": 0.7162351294611616, "grad_norm": 0.5926152076528542, "kl": 1.4453125, "learning_rate": 8.104813473658908e-06, "loss": 0.0133, "step": 2047 }, { "clip_ratio": 0.047066692262887955, "epoch": 0.7165850244926522, "grad_norm": 0.4525619481432236, "kl": 1.4609375, "learning_rate": 8.102419323275234e-06, "loss": 0.0114, "step": 2048 }, { "clip_ratio": 0.006130851339548826, "clipped_completions_ratio": 0.0714285714285714, "epoch": 0.7169349195241428, "grad_norm": 2.267728463089112, "kl": 1.78125, "learning_rate": 8.100024015718517e-06, "loss": 0.0046, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 112.08929443359375, "mean_terminated_completion_length": 101.01923370361328, "min_completion_length": 17.0, "min_terminated_completion_length": 17.0, "num_tokens": 186191.0, "reward": 2.7707910537719727, "reward_std": 0.08385667204856873, "rewards/check_gptzero_func/mean": 0.7857142686843872, "rewards/check_gptzero_func/std": 0.4140393137931824, "rewards/check_winston_local_func/mean": 0.9850764870643616, "rewards/check_winston_local_func/std": 0.01567998342216015, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 2049 }, { "clip_ratio": 0.030387258157134056, "epoch": 0.7172848145556333, "grad_norm": 0.981386744338752, "kl": 1.796875, "learning_rate": 8.097627551882181e-06, "loss": -0.0009, "step": 2050 }, { "clip_ratio": 0.05927653983235359, "epoch": 0.7176347095871238, "grad_norm": 0.8087256744330924, "kl": 1.8125, "learning_rate": 8.095229932660087e-06, "loss": -0.0036, "step": 2051 }, { "clip_ratio": 0.0683625265955925, "epoch": 0.7179846046186145, "grad_norm": 0.7171737278795307, "kl": 1.828125, "learning_rate": 8.09283115894652e-06, "loss": -0.0057, "step": 2052 }, { "clip_ratio": 0.009151647798717022, "clipped_completions_ratio": 0.0, "epoch": 0.718334499650105, "grad_norm": 2.371670492874319, "kl": 2.171875, "learning_rate": 8.090431231636204e-06, "loss": 0.0162, "max_completion_length": 157.0, "max_terminated_completion_length": 157.0, "mean_completion_length": 75.14286041259766, "mean_terminated_completion_length": 75.14286041259766, "min_completion_length": 17.0, "min_terminated_completion_length": 17.0, "num_tokens": 195999.0, "reward": 2.7623000144958496, "reward_std": 0.2196541577577591, "rewards/check_gptzero_func/mean": 0.8214285969734192, "rewards/check_gptzero_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.9408712983131409, "rewards/check_winston_local_func/std": 0.10782074183225632, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 2053 }, { "clip_ratio": 0.02735133096575737, "epoch": 0.7186843946815955, "grad_norm": 1.2442363614438448, "kl": 2.171875, "learning_rate": 8.088030151624285e-06, "loss": 0.0132, "step": 2054 }, { "clip_ratio": 0.04857216030359268, "epoch": 0.719034289713086, "grad_norm": 1.0067913607209436, "kl": 2.171875, "learning_rate": 8.085627919806349e-06, "loss": 0.0108, "step": 2055 }, { "clip_ratio": 0.06109381467103958, "epoch": 0.7193841847445767, "grad_norm": 1.5219649551860444, "kl": 2.171875, "learning_rate": 8.083224537078401e-06, "loss": 0.0109, "step": 2056 }, { "clip_ratio": 0.004847576841711998, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7197340797760672, "grad_norm": 1.9776205124483244, "kl": 2.40625, "learning_rate": 8.08082000433688e-06, "loss": 0.013, "max_completion_length": 256.0, "max_terminated_completion_length": 166.0, "mean_completion_length": 102.10714721679688, "mean_terminated_completion_length": 76.45833587646484, "min_completion_length": 19.0, "min_terminated_completion_length": 19.0, "num_tokens": 209757.0, "reward": 2.760823965072632, "reward_std": 0.12233100831508636, "rewards/check_gptzero_func/mean": 0.8035714030265808, "rewards/check_gptzero_func/std": 0.40089187026023865, "rewards/check_winston_local_func/mean": 0.9878647923469543, "rewards/check_winston_local_func/std": 0.004780503921210766, "rewards/sentence_count_match_reward_logic/mean": 0.9693877100944519, "rewards/sentence_count_match_reward_logic/std": 0.07687923312187195, "step": 2057 }, { "clip_ratio": 0.023026591166853905, "epoch": 0.7200839748075577, "grad_norm": 1.1468784491103077, "kl": 2.421875, "learning_rate": 8.078414322478657e-06, "loss": 0.0081, "step": 2058 }, { "clip_ratio": 0.048202380537986755, "epoch": 0.7204338698390483, "grad_norm": 0.6234618896046608, "kl": 2.421875, "learning_rate": 8.076007492401026e-06, "loss": 0.0047, "step": 2059 }, { "clip_ratio": 0.061491116881370544, "epoch": 0.7207837648705389, "grad_norm": 1.0729604412559586, "kl": 2.4375, "learning_rate": 8.073599515001713e-06, "loss": 0.005, "step": 2060 }, { "clip_ratio": 0.0038074671756476164, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.7211336599020294, "grad_norm": 1.6693704162884724, "kl": 1.9609375, "learning_rate": 8.07119039117887e-06, "loss": 0.0174, "max_completion_length": 256.0, "max_terminated_completion_length": 158.0, "mean_completion_length": 123.01786041259766, "mean_terminated_completion_length": 69.82500457763672, "min_completion_length": 21.0, "min_terminated_completion_length": 21.0, "num_tokens": 227446.0, "reward": 2.7368128299713135, "reward_std": 0.05670728161931038, "rewards/check_gptzero_func/mean": 0.875, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.9885931015014648, "rewards/check_winston_local_func/std": 0.005761587992310524, "rewards/sentence_count_match_reward_logic/mean": 0.8732194900512695, "rewards/sentence_count_match_reward_logic/std": 0.24698154628276825, "step": 2061 }, { "clip_ratio": 0.011795546859502792, "epoch": 0.7214835549335199, "grad_norm": 0.6359395464774984, "kl": 1.953125, "learning_rate": 8.06878012183108e-06, "loss": 0.0141, "step": 2062 }, { "clip_ratio": 0.02617042139172554, "epoch": 0.7218334499650105, "grad_norm": 0.4552424258269145, "kl": 1.9453125, "learning_rate": 8.066368707857347e-06, "loss": 0.0123, "step": 2063 }, { "clip_ratio": 0.03245539590716362, "epoch": 0.722183344996501, "grad_norm": 0.3974498198146445, "kl": 1.953125, "learning_rate": 8.063956150157107e-06, "loss": 0.0111, "step": 2064 }, { "clip_ratio": 0.00755522632971406, "clipped_completions_ratio": 0.375, "epoch": 0.7225332400279916, "grad_norm": 2.4520853441699613, "kl": 1.578125, "learning_rate": 8.061542449630222e-06, "loss": 0.0193, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 119.64286041259766, "mean_terminated_completion_length": 37.82857131958008, "min_completion_length": 8.0, "min_terminated_completion_length": 8.0, "num_tokens": 243162.0, "reward": 2.892655611038208, "reward_std": 0.011736311949789524, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9682048559188843, "rewards/check_winston_local_func/std": 0.03334863856434822, "rewards/sentence_count_match_reward_logic/mean": 0.9244505167007446, "rewards/sentence_count_match_reward_logic/std": 0.13504545390605927, "step": 2065 }, { "clip_ratio": 0.025429757311940193, "epoch": 0.7228831350594822, "grad_norm": 0.8938768489686104, "kl": 1.5703125, "learning_rate": 8.05912760717698e-06, "loss": 0.0133, "step": 2066 }, { "clip_ratio": 0.040663737803697586, "epoch": 0.7232330300909727, "grad_norm": 0.5309411335970806, "kl": 1.578125, "learning_rate": 8.05671162369809e-06, "loss": 0.0107, "step": 2067 }, { "clip_ratio": 0.05185193195939064, "epoch": 0.7235829251224632, "grad_norm": 0.4930526583251036, "kl": 1.59375, "learning_rate": 8.054294500094697e-06, "loss": 0.0101, "step": 2068 }, { "clip_ratio": 0.007188393268734217, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7239328201539538, "grad_norm": 0.9911790700149168, "kl": 1.3671875, "learning_rate": 8.05187623726836e-06, "loss": 0.015, "max_completion_length": 256.0, "max_terminated_completion_length": 217.0, "mean_completion_length": 152.94644165039062, "mean_terminated_completion_length": 135.77084350585938, "min_completion_length": 68.0, "min_terminated_completion_length": 68.0, "num_tokens": 262815.0, "reward": 2.8620316982269287, "reward_std": 0.10152222961187363, "rewards/check_gptzero_func/mean": 0.9464285969734192, "rewards/check_gptzero_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9908155798912048, "rewards/check_winston_local_func/std": 0.0036079012788832188, "rewards/sentence_count_match_reward_logic/mean": 0.9247874021530151, "rewards/sentence_count_match_reward_logic/std": 0.15436996519565582, "step": 2069 }, { "clip_ratio": 0.013570892624557018, "epoch": 0.7242827151854444, "grad_norm": 0.7436091021584506, "kl": 1.3671875, "learning_rate": 8.049456836121072e-06, "loss": 0.0122, "step": 2070 }, { "clip_ratio": 0.02965323068201542, "epoch": 0.7246326102169349, "grad_norm": 0.5175246640768536, "kl": 1.3828125, "learning_rate": 8.047036297555247e-06, "loss": 0.0089, "step": 2071 }, { "clip_ratio": 0.042789947241544724, "epoch": 0.7249825052484254, "grad_norm": 0.44555204348851324, "kl": 1.4140625, "learning_rate": 8.044614622473717e-06, "loss": 0.0069, "step": 2072 }, { "clip_ratio": 0.0032059596851468086, "clipped_completions_ratio": 0.0, "epoch": 0.7253324002799161, "grad_norm": 1.9550839382365697, "kl": 1.5, "learning_rate": 8.042191811779752e-06, "loss": 0.0195, "max_completion_length": 162.0, "max_terminated_completion_length": 162.0, "mean_completion_length": 76.39286041259766, "mean_terminated_completion_length": 76.39286041259766, "min_completion_length": 11.0, "min_terminated_completion_length": 11.0, "num_tokens": 272741.0, "reward": 2.7446742057800293, "reward_std": 0.0654076337814331, "rewards/check_gptzero_func/mean": 0.8571428656578064, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9143170118331909, "rewards/check_winston_local_func/std": 0.15764008462429047, "rewards/sentence_count_match_reward_logic/mean": 0.9732142686843872, "rewards/sentence_count_match_reward_logic/std": 0.07802347838878632, "step": 2073 }, { "clip_ratio": 0.017856156453490257, "epoch": 0.7256822953114066, "grad_norm": 0.8461415786741163, "kl": 1.5, "learning_rate": 8.03976786637703e-06, "loss": 0.0152, "step": 2074 }, { "clip_ratio": 0.02943016029894352, "epoch": 0.7260321903428971, "grad_norm": 0.5425765423273934, "kl": 1.5078125, "learning_rate": 8.037342787169666e-06, "loss": 0.0132, "step": 2075 }, { "clip_ratio": 0.039724044501781464, "epoch": 0.7263820853743876, "grad_norm": 0.48300621057555654, "kl": 1.5234375, "learning_rate": 8.034916575062188e-06, "loss": 0.0122, "step": 2076 }, { "clip_ratio": 0.0025932055432349443, "clipped_completions_ratio": 0.0, "epoch": 0.7267319804058783, "grad_norm": 2.6473209612743482, "kl": 2.09375, "learning_rate": 8.03248923095955e-06, "loss": 0.0305, "max_completion_length": 209.0, "max_terminated_completion_length": 209.0, "mean_completion_length": 91.85714721679688, "mean_terminated_completion_length": 91.85714721679688, "min_completion_length": 16.0, "min_terminated_completion_length": 16.0, "num_tokens": 284645.0, "reward": 2.77886700630188, "reward_std": 0.07188621908426285, "rewards/check_gptzero_func/mean": 0.8392857313156128, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.9395812749862671, "rewards/check_winston_local_func/std": 0.09061562269926071, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 2077 }, { "clip_ratio": 0.016885092481970787, "epoch": 0.7270818754373688, "grad_norm": 0.8217250264026087, "kl": 2.109375, "learning_rate": 8.030060755767131e-06, "loss": 0.0265, "step": 2078 }, { "clip_ratio": 0.026506207883358, "epoch": 0.7274317704688593, "grad_norm": 1.2661646028498952, "kl": 2.1875, "learning_rate": 8.027631150390728e-06, "loss": 0.0263, "step": 2079 }, { "clip_ratio": 0.036531735211610794, "epoch": 0.72778166550035, "grad_norm": 1.2148074493831065, "kl": 2.28125, "learning_rate": 8.02520041573656e-06, "loss": 0.0258, "step": 2080 }, { "clip_ratio": 0.00449971342459321, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.7281315605318405, "grad_norm": 2.406601295137552, "kl": 1.3359375, "learning_rate": 8.022768552711268e-06, "loss": 0.0128, "max_completion_length": 256.0, "max_terminated_completion_length": 234.0, "mean_completion_length": 161.17857360839844, "mean_terminated_completion_length": 123.25, "min_completion_length": 17.0, "min_terminated_completion_length": 17.0, "num_tokens": 305023.0, "reward": 2.7769722938537598, "reward_std": 0.024440564215183258, "rewards/check_gptzero_func/mean": 0.8571428656578064, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9662577509880066, "rewards/check_winston_local_func/std": 0.05565343797206879, "rewards/sentence_count_match_reward_logic/mean": 0.9535714387893677, "rewards/sentence_count_match_reward_logic/std": 0.09026843309402466, "step": 2081 }, { "clip_ratio": 0.01674937829375267, "epoch": 0.728481455563331, "grad_norm": 0.7433242345975976, "kl": 1.3359375, "learning_rate": 8.020335562221914e-06, "loss": 0.0086, "step": 2082 }, { "clip_ratio": 0.02300531603395939, "epoch": 0.7288313505948215, "grad_norm": 0.5463931401598344, "kl": 1.34375, "learning_rate": 8.017901445175981e-06, "loss": 0.0074, "step": 2083 }, { "clip_ratio": 0.03114055097103119, "epoch": 0.7291812456263121, "grad_norm": 0.5514900573078845, "kl": 1.375, "learning_rate": 8.015466202481371e-06, "loss": 0.0054, "step": 2084 }, { "clip_ratio": 0.004985738545656204, "clipped_completions_ratio": 0.0, "epoch": 0.7295311406578027, "grad_norm": 1.6101007464993224, "kl": 2.484375, "learning_rate": 8.013029835046408e-06, "loss": 0.0291, "max_completion_length": 129.0, "max_terminated_completion_length": 129.0, "mean_completion_length": 53.017860412597656, "mean_terminated_completion_length": 53.017860412597656, "min_completion_length": 26.0, "min_terminated_completion_length": 26.0, "num_tokens": 312600.0, "reward": 2.723463296890259, "reward_std": 0.13172774016857147, "rewards/check_gptzero_func/mean": 0.75, "rewards/check_gptzero_func/std": 0.43693143129348755, "rewards/check_winston_local_func/mean": 0.9734629392623901, "rewards/check_winston_local_func/std": 0.025726398453116417, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 2085 }, { "clip_ratio": 0.02218664065003395, "epoch": 0.7298810356892932, "grad_norm": 0.9973002090477242, "kl": 2.484375, "learning_rate": 8.010592343779833e-06, "loss": 0.0226, "step": 2086 }, { "clip_ratio": 0.043985240161418915, "epoch": 0.7302309307207837, "grad_norm": 0.6415526470074814, "kl": 2.5, "learning_rate": 8.008153729590806e-06, "loss": 0.0196, "step": 2087 }, { "clip_ratio": 0.06112097576260567, "epoch": 0.7305808257522743, "grad_norm": 0.6180569490790983, "kl": 2.5, "learning_rate": 8.005713993388908e-06, "loss": 0.019, "step": 2088 }, { "clip_ratio": 0.004522028844803572, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7309307207837649, "grad_norm": 1.4183932995906996, "kl": 2.015625, "learning_rate": 8.00327313608414e-06, "loss": 0.0108, "max_completion_length": 256.0, "max_terminated_completion_length": 140.0, "mean_completion_length": 88.26786041259766, "mean_terminated_completion_length": 60.3125, "min_completion_length": 17.0, "min_terminated_completion_length": 17.0, "num_tokens": 324951.0, "reward": 2.868877649307251, "reward_std": 0.004888711031526327, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9371546506881714, "rewards/check_winston_local_func/std": 0.12672767043113708, "rewards/sentence_count_match_reward_logic/mean": 0.9317227005958557, "rewards/sentence_count_match_reward_logic/std": 0.10934722423553467, "step": 2089 }, { "clip_ratio": 0.020108547061681747, "epoch": 0.7312806158152554, "grad_norm": 0.6835717880806523, "kl": 2.03125, "learning_rate": 8.000831158586915e-06, "loss": 0.0078, "step": 2090 }, { "clip_ratio": 0.02836534008383751, "epoch": 0.731630510846746, "grad_norm": 0.5524152997651679, "kl": 2.03125, "learning_rate": 7.998388061808067e-06, "loss": 0.006, "step": 2091 }, { "clip_ratio": 0.033182043582201004, "epoch": 0.7319804058782365, "grad_norm": 0.507980350914621, "kl": 2.03125, "learning_rate": 7.995943846658852e-06, "loss": 0.005, "step": 2092 }, { "clip_ratio": 0.007148873060941696, "clipped_completions_ratio": 0.0, "epoch": 0.7323303009097271, "grad_norm": 1.075469559931454, "kl": 1.8046875, "learning_rate": 7.993498514050935e-06, "loss": 0.0207, "max_completion_length": 234.0, "max_terminated_completion_length": 234.0, "mean_completion_length": 122.96429443359375, "mean_terminated_completion_length": 122.96429443359375, "min_completion_length": 40.0, "min_terminated_completion_length": 40.0, "num_tokens": 340261.0, "reward": 2.9149601459503174, "reward_std": 0.0909556895494461, "rewards/check_gptzero_func/mean": 0.9464285969734192, "rewards/check_gptzero_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9893646240234375, "rewards/check_winston_local_func/std": 0.00598103366792202, "rewards/sentence_count_match_reward_logic/mean": 0.9791666269302368, "rewards/sentence_count_match_reward_logic/std": 0.05561865493655205, "step": 2093 }, { "clip_ratio": 0.015830567106604576, "epoch": 0.7326801959412176, "grad_norm": 0.8308062663871836, "kl": 1.8125, "learning_rate": 7.991052064896406e-06, "loss": 0.0183, "step": 2094 }, { "clip_ratio": 0.028004610911011696, "epoch": 0.7330300909727082, "grad_norm": 0.7749263659923253, "kl": 1.859375, "learning_rate": 7.988604500107764e-06, "loss": 0.0152, "step": 2095 }, { "clip_ratio": 0.03745907545089722, "epoch": 0.7333799860041987, "grad_norm": 0.6348869330359347, "kl": 1.8671875, "learning_rate": 7.986155820597927e-06, "loss": 0.0147, "step": 2096 }, { "clip_ratio": 0.00502993306145072, "clipped_completions_ratio": 0.2678571428571429, "epoch": 0.7337298810356893, "grad_norm": 1.0989199819421787, "kl": 1.421875, "learning_rate": 7.983706027280232e-06, "loss": 0.0186, "max_completion_length": 256.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 148.46429443359375, "mean_terminated_completion_length": 109.1219482421875, "min_completion_length": 33.0, "min_terminated_completion_length": 33.0, "num_tokens": 358935.0, "reward": 2.7966811656951904, "reward_std": 0.07444127649068832, "rewards/check_gptzero_func/mean": 0.875, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.988050103187561, "rewards/check_winston_local_func/std": 0.006535641849040985, "rewards/sentence_count_match_reward_logic/mean": 0.9336309432983398, "rewards/sentence_count_match_reward_logic/std": 0.11864091455936432, "step": 2097 }, { "clip_ratio": 0.01688501611351967, "epoch": 0.7340797760671799, "grad_norm": 0.5895048584293041, "kl": 1.4140625, "learning_rate": 7.981255121068428e-06, "loss": 0.0146, "step": 2098 }, { "clip_ratio": 0.02735188975930214, "epoch": 0.7344296710986704, "grad_norm": 0.45911413183546124, "kl": 1.4140625, "learning_rate": 7.97880310287668e-06, "loss": 0.0123, "step": 2099 }, { "clip_ratio": 0.040545474737882614, "epoch": 0.7347795661301609, "grad_norm": 0.4093732237814884, "kl": 1.421875, "learning_rate": 7.976349973619567e-06, "loss": 0.0116, "step": 2100 }, { "clip_ratio": 0.007175269536674023, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.7351294611616515, "grad_norm": 1.8539032115725422, "kl": 1.8359375, "learning_rate": 7.973895734212082e-06, "loss": 0.0167, "max_completion_length": 256.0, "max_terminated_completion_length": 176.0, "mean_completion_length": 140.625, "mean_terminated_completion_length": 94.4749984741211, "min_completion_length": 20.0, "min_terminated_completion_length": 20.0, "num_tokens": 376586.0, "reward": 2.6360883712768555, "reward_std": 0.1024676039814949, "rewards/check_gptzero_func/mean": 0.6785714030265808, "rewards/check_gptzero_func/std": 0.4712514281272888, "rewards/check_winston_local_func/mean": 0.9899841547012329, "rewards/check_winston_local_func/std": 0.004375204909592867, "rewards/sentence_count_match_reward_logic/mean": 0.9675324559211731, "rewards/sentence_count_match_reward_logic/std": 0.10601527988910675, "step": 2101 }, { "clip_ratio": 0.02508234791457653, "epoch": 0.7354793561931421, "grad_norm": 1.0789407964240565, "kl": 1.84375, "learning_rate": 7.971440385569635e-06, "loss": 0.0119, "step": 2102 }, { "clip_ratio": 0.041841648519039154, "epoch": 0.7358292512246326, "grad_norm": 0.8518686694872185, "kl": 1.8671875, "learning_rate": 7.968983928608047e-06, "loss": 0.0086, "step": 2103 }, { "clip_ratio": 0.05477995052933693, "epoch": 0.7361791462561231, "grad_norm": 1.2163978451039432, "kl": 1.8984375, "learning_rate": 7.966526364243553e-06, "loss": 0.0056, "step": 2104 }, { "clip_ratio": 0.009578688070178032, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7365290412876138, "grad_norm": 2.338236459154513, "kl": 2.15625, "learning_rate": 7.9640676933928e-06, "loss": 0.026, "max_completion_length": 256.0, "max_terminated_completion_length": 162.0, "mean_completion_length": 101.28572082519531, "mean_terminated_completion_length": 75.5, "min_completion_length": 16.0, "min_terminated_completion_length": 16.0, "num_tokens": 390298.0, "reward": 2.8711142539978027, "reward_std": 0.11116566509008408, "rewards/check_gptzero_func/mean": 0.9464285969734192, "rewards/check_gptzero_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.967268168926239, "rewards/check_winston_local_func/std": 0.07607001066207886, "rewards/sentence_count_match_reward_logic/mean": 0.9574176073074341, "rewards/sentence_count_match_reward_logic/std": 0.10969137400388718, "step": 2105 }, { "clip_ratio": 0.027528487145900726, "epoch": 0.7368789363191043, "grad_norm": 1.3973360831077553, "kl": 2.171875, "learning_rate": 7.961607916972853e-06, "loss": 0.0186, "step": 2106 }, { "clip_ratio": 0.054809700697660446, "epoch": 0.7372288313505948, "grad_norm": 0.870867652085194, "kl": 2.1875, "learning_rate": 7.959147035901181e-06, "loss": 0.0159, "step": 2107 }, { "clip_ratio": 0.07576524466276169, "epoch": 0.7375787263820853, "grad_norm": 0.7207867211153065, "kl": 2.21875, "learning_rate": 7.956685051095672e-06, "loss": 0.0135, "step": 2108 }, { "clip_ratio": 0.004746704362332821, "clipped_completions_ratio": 0.3571428571428571, "epoch": 0.737928621413576, "grad_norm": 0.7728937505552063, "kl": 1.109375, "learning_rate": 7.95422196347462e-06, "loss": 0.0177, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 204.4107208251953, "mean_terminated_completion_length": 175.75, "min_completion_length": 120.0, "min_terminated_completion_length": 120.0, "num_tokens": 415249.0, "reward": 2.7198915481567383, "reward_std": 0.2936488091945648, "rewards/check_gptzero_func/mean": 0.7678571343421936, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.9911925196647644, "rewards/check_winston_local_func/std": 0.0033093111123889685, "rewards/sentence_count_match_reward_logic/mean": 0.9608418345451355, "rewards/sentence_count_match_reward_logic/std": 0.06323075294494629, "step": 2109 }, { "clip_ratio": 0.011884665116667747, "epoch": 0.7382785164450665, "grad_norm": 0.6793980866515751, "kl": 1.109375, "learning_rate": 7.951757773956738e-06, "loss": 0.0149, "step": 2110 }, { "clip_ratio": 0.02270217426121235, "epoch": 0.738628411476557, "grad_norm": 0.6023135140306919, "kl": 1.1171875, "learning_rate": 7.949292483461141e-06, "loss": 0.0122, "step": 2111 }, { "clip_ratio": 0.034765951335430145, "epoch": 0.7389783065080476, "grad_norm": 0.4443323000938205, "kl": 1.1171875, "learning_rate": 7.946826092907362e-06, "loss": 0.0104, "step": 2112 }, { "clip_ratio": 0.006749753840267658, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.7393282015395382, "grad_norm": 1.1632575773246994, "kl": 1.7734375, "learning_rate": 7.94435860321534e-06, "loss": 0.0191, "max_completion_length": 256.0, "max_terminated_completion_length": 115.0, "mean_completion_length": 137.3928680419922, "mean_terminated_completion_length": 89.95000457763672, "min_completion_length": 38.0, "min_terminated_completion_length": 38.0, "num_tokens": 432335.0, "reward": 2.8424127101898193, "reward_std": 0.20011623203754425, "rewards/check_gptzero_func/mean": 0.8571428656578064, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9852699041366577, "rewards/check_winston_local_func/std": 0.025449717417359352, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 2113 }, { "clip_ratio": 0.012359258718788624, "epoch": 0.7396780965710287, "grad_norm": 1.0010781986902517, "kl": 1.78125, "learning_rate": 7.941890015305425e-06, "loss": 0.0142, "step": 2114 }, { "clip_ratio": 0.03441137820482254, "epoch": 0.7400279916025192, "grad_norm": 0.6068869118223514, "kl": 1.7890625, "learning_rate": 7.939420330098376e-06, "loss": 0.011, "step": 2115 }, { "clip_ratio": 0.04600631073117256, "epoch": 0.7403778866340098, "grad_norm": 0.682978962824675, "kl": 1.84375, "learning_rate": 7.936949548515364e-06, "loss": 0.0098, "step": 2116 }, { "clip_ratio": 0.0051321061328053474, "clipped_completions_ratio": 0.4285714285714286, "epoch": 0.7407277816655004, "grad_norm": 0.7586678317315161, "kl": 1.0625, "learning_rate": 7.934477671477968e-06, "loss": 0.0043, "max_completion_length": 256.0, "max_terminated_completion_length": 158.0, "mean_completion_length": 170.375, "mean_terminated_completion_length": 106.15625, "min_completion_length": 75.0, "min_terminated_completion_length": 75.0, "num_tokens": 454788.0, "reward": 2.6641814708709717, "reward_std": 0.08630891144275665, "rewards/check_gptzero_func/mean": 0.7678571343421936, "rewards/check_gptzero_func/std": 0.4260205328464508, "rewards/check_winston_local_func/mean": 0.9842706322669983, "rewards/check_winston_local_func/std": 0.012485026381909847, "rewards/sentence_count_match_reward_logic/mean": 0.9120535850524902, "rewards/sentence_count_match_reward_logic/std": 0.1354607492685318, "step": 2117 }, { "clip_ratio": 0.009007122367620468, "epoch": 0.7410776766969909, "grad_norm": 0.6430294896770793, "kl": 1.0625, "learning_rate": 7.932004699908173e-06, "loss": 0.0018, "step": 2118 }, { "clip_ratio": 0.019432242959737778, "epoch": 0.7414275717284815, "grad_norm": 0.5216269153981374, "kl": 1.0625, "learning_rate": 7.929530634728376e-06, "loss": -0.0003, "step": 2119 }, { "clip_ratio": 0.0342584103345871, "epoch": 0.741777466759972, "grad_norm": 0.3776092499751888, "kl": 1.0625, "learning_rate": 7.927055476861376e-06, "loss": -0.0021, "step": 2120 }, { "clip_ratio": 0.00423651747405529, "clipped_completions_ratio": 0.1607142857142857, "epoch": 0.7421273617914625, "grad_norm": 1.3600382589222888, "kl": 1.671875, "learning_rate": 7.924579227230387e-06, "loss": 0.0051, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 122.60714721679688, "mean_terminated_completion_length": 97.06382751464844, "min_completion_length": 32.0, "min_terminated_completion_length": 32.0, "num_tokens": 470686.0, "reward": 2.7420170307159424, "reward_std": 0.08263860642910004, "rewards/check_gptzero_func/mean": 0.8214285969734192, "rewards/check_gptzero_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.9858480095863342, "rewards/check_winston_local_func/std": 0.007546415086835623, "rewards/sentence_count_match_reward_logic/mean": 0.9347402453422546, "rewards/sentence_count_match_reward_logic/std": 0.11678264290094376, "step": 2121 }, { "clip_ratio": 0.018658645451068878, "epoch": 0.7424772568229531, "grad_norm": 0.6504029227079081, "kl": 1.6796875, "learning_rate": 7.922101886759024e-06, "loss": 0.0007, "step": 2122 }, { "clip_ratio": 0.035339757800102234, "epoch": 0.7428271518544437, "grad_norm": 0.6752010688171087, "kl": 1.6953125, "learning_rate": 7.919623456371315e-06, "loss": -0.001, "step": 2123 }, { "clip_ratio": 0.04377010092139244, "epoch": 0.7431770468859342, "grad_norm": 0.5041724679488201, "kl": 1.703125, "learning_rate": 7.917143936991688e-06, "loss": -0.0031, "step": 2124 }, { "clip_ratio": 0.004341076593846083, "clipped_completions_ratio": 0.0, "epoch": 0.7435269419174247, "grad_norm": 1.0303761355455257, "kl": 1.5859375, "learning_rate": 7.914663329544983e-06, "loss": 0.013, "max_completion_length": 199.0, "max_terminated_completion_length": 199.0, "mean_completion_length": 117.87500762939453, "mean_terminated_completion_length": 117.87500762939453, "min_completion_length": 35.0, "min_terminated_completion_length": 35.0, "num_tokens": 485247.0, "reward": 2.9463322162628174, "reward_std": 0.0840698778629303, "rewards/check_gptzero_func/mean": 0.9642857313156128, "rewards/check_gptzero_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9909750819206238, "rewards/check_winston_local_func/std": 0.003240973223000765, "rewards/sentence_count_match_reward_logic/mean": 0.9910714030265808, "rewards/sentence_count_match_reward_logic/std": 0.04681408405303955, "step": 2125 }, { "clip_ratio": 0.011548569425940514, "epoch": 0.7438768369489154, "grad_norm": 0.7903149202024161, "kl": 1.5859375, "learning_rate": 7.91218163495644e-06, "loss": 0.0091, "step": 2126 }, { "clip_ratio": 0.02599216066300869, "epoch": 0.7442267319804059, "grad_norm": 0.48544349133141595, "kl": 1.59375, "learning_rate": 7.909698854151708e-06, "loss": 0.0065, "step": 2127 }, { "clip_ratio": 0.041198015213012695, "epoch": 0.7445766270118964, "grad_norm": 0.4200879956964785, "kl": 1.59375, "learning_rate": 7.907214988056844e-06, "loss": 0.0047, "step": 2128 }, { "clip_ratio": 0.0050535742193460464, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.7449265220433869, "grad_norm": 2.090688688581155, "kl": 1.1953125, "learning_rate": 7.904730037598303e-06, "loss": 0.0185, "max_completion_length": 256.0, "max_terminated_completion_length": 152.0, "mean_completion_length": 127.21429443359375, "mean_terminated_completion_length": 75.70000457763672, "min_completion_length": 18.0, "min_terminated_completion_length": 18.0, "num_tokens": 502307.0, "reward": 2.583322286605835, "reward_std": 0.04553721100091934, "rewards/check_gptzero_func/mean": 0.7142857313156128, "rewards/check_gptzero_func/std": 0.4558422863483429, "rewards/check_winston_local_func/mean": 0.9617023468017578, "rewards/check_winston_local_func/std": 0.05842094495892525, "rewards/sentence_count_match_reward_logic/mean": 0.9073342084884644, "rewards/sentence_count_match_reward_logic/std": 0.13839025795459747, "step": 2129 }, { "clip_ratio": 0.023933883756399155, "epoch": 0.7452764170748776, "grad_norm": 0.9354231434402184, "kl": 1.2109375, "learning_rate": 7.902244003702951e-06, "loss": 0.0127, "step": 2130 }, { "clip_ratio": 0.04076399281620979, "epoch": 0.7456263121063681, "grad_norm": 0.8335015702323826, "kl": 1.234375, "learning_rate": 7.89975688729805e-06, "loss": 0.0107, "step": 2131 }, { "clip_ratio": 0.05715494230389595, "epoch": 0.7459762071378586, "grad_norm": 0.7055871246664294, "kl": 1.2734375, "learning_rate": 7.897268689311278e-06, "loss": 0.0082, "step": 2132 }, { "clip_ratio": 0.0036771204322576523, "clipped_completions_ratio": 0.4285714285714286, "epoch": 0.7463261021693492, "grad_norm": 1.0974291514494783, "kl": 1.4296875, "learning_rate": 7.894779410670704e-06, "loss": 0.0114, "max_completion_length": 256.0, "max_terminated_completion_length": 185.0, "mean_completion_length": 173.69644165039062, "mean_terminated_completion_length": 111.96875, "min_completion_length": 24.0, "min_terminated_completion_length": 24.0, "num_tokens": 526642.0, "reward": 2.8706412315368652, "reward_std": 0.027738725766539574, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9908334016799927, "rewards/check_winston_local_func/std": 0.0030924726743251085, "rewards/sentence_count_match_reward_logic/mean": 0.8798076510429382, "rewards/sentence_count_match_reward_logic/std": 0.15060894191265106, "step": 2133 }, { "clip_ratio": 0.01459113135933876, "epoch": 0.7466759972008398, "grad_norm": 0.8060177608413809, "kl": 1.4453125, "learning_rate": 7.892289052304807e-06, "loss": 0.009, "step": 2134 }, { "clip_ratio": 0.02842901274561882, "epoch": 0.7470258922323303, "grad_norm": 0.6149024195883342, "kl": 1.4453125, "learning_rate": 7.889797615142467e-06, "loss": 0.0071, "step": 2135 }, { "clip_ratio": 0.03915954381227493, "epoch": 0.7473757872638208, "grad_norm": 0.47201175428828457, "kl": 1.4453125, "learning_rate": 7.887305100112967e-06, "loss": 0.005, "step": 2136 }, { "clip_ratio": 0.004091015085577965, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.7477256822953114, "grad_norm": 0.767237322092294, "kl": 1.0234375, "learning_rate": 7.884811508145993e-06, "loss": 0.002, "max_completion_length": 256.0, "max_terminated_completion_length": 191.0, "mean_completion_length": 180.0357208251953, "mean_terminated_completion_length": 149.65000915527344, "min_completion_length": 118.0, "min_terminated_completion_length": 118.0, "num_tokens": 548204.0, "reward": 2.905489921569824, "reward_std": 0.12848952412605286, "rewards/check_gptzero_func/mean": 0.9285714030265808, "rewards/check_gptzero_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9894181489944458, "rewards/check_winston_local_func/std": 0.009901471436023712, "rewards/sentence_count_match_reward_logic/mean": 0.987500011920929, "rewards/sentence_count_match_reward_logic/std": 0.04977221041917801, "step": 2137 }, { "clip_ratio": 0.007841205224394798, "epoch": 0.748075577326802, "grad_norm": 0.5785585882211298, "kl": 1.0234375, "learning_rate": 7.882316840171626e-06, "loss": -0.0008, "step": 2138 }, { "clip_ratio": 0.017192082479596138, "epoch": 0.7484254723582925, "grad_norm": 0.5091794015196942, "kl": 1.03125, "learning_rate": 7.879821097120358e-06, "loss": -0.0028, "step": 2139 }, { "clip_ratio": 0.029772605746984482, "epoch": 0.7487753673897831, "grad_norm": 0.43696042165430726, "kl": 1.046875, "learning_rate": 7.877324279923078e-06, "loss": -0.0048, "step": 2140 }, { "clip_ratio": 0.0030755966436117887, "clipped_completions_ratio": 0.0, "epoch": 0.7491252624212736, "grad_norm": 2.1171620266904663, "kl": 2.28125, "learning_rate": 7.874826389511073e-06, "loss": 0.035, "max_completion_length": 223.0, "max_terminated_completion_length": 223.0, "mean_completion_length": 95.05357360839844, "mean_terminated_completion_length": 95.05357360839844, "min_completion_length": 15.0, "min_terminated_completion_length": 15.0, "num_tokens": 560383.0, "reward": 2.94256329536438, "reward_std": 0.04850076884031296, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9604204893112183, "rewards/check_winston_local_func/std": 0.08428158611059189, "rewards/sentence_count_match_reward_logic/mean": 0.9821428656578064, "rewards/sentence_count_match_reward_logic/std": 0.06903790682554245, "step": 2141 }, { "clip_ratio": 0.023503387346863747, "epoch": 0.7494751574527642, "grad_norm": 0.9357557292204373, "kl": 2.28125, "learning_rate": 7.872327426816031e-06, "loss": 0.0282, "step": 2142 }, { "clip_ratio": 0.03974669426679611, "epoch": 0.7498250524842547, "grad_norm": 0.6285371396950967, "kl": 2.296875, "learning_rate": 7.869827392770046e-06, "loss": 0.0246, "step": 2143 }, { "clip_ratio": 0.05496827885508537, "epoch": 0.7501749475157453, "grad_norm": 0.5420070174084245, "kl": 2.328125, "learning_rate": 7.867326288305603e-06, "loss": 0.0222, "step": 2144 }, { "clip_ratio": 0.005362511146813631, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7505248425472358, "grad_norm": 2.0365679730964383, "kl": 2.0625, "learning_rate": 7.864824114355591e-06, "loss": 0.0331, "max_completion_length": 256.0, "max_terminated_completion_length": 165.0, "mean_completion_length": 106.01786041259766, "mean_terminated_completion_length": 81.02083587646484, "min_completion_length": 8.0, "min_terminated_completion_length": 8.0, "num_tokens": 575312.0, "reward": 2.8769280910491943, "reward_std": 0.0818016529083252, "rewards/check_gptzero_func/mean": 0.9642857313156128, "rewards/check_gptzero_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9690333604812622, "rewards/check_winston_local_func/std": 0.04892928525805473, "rewards/sentence_count_match_reward_logic/mean": 0.9436090588569641, "rewards/sentence_count_match_reward_logic/std": 0.14010003209114075, "step": 2145 }, { "clip_ratio": 0.021088192239403725, "epoch": 0.7508747375787264, "grad_norm": 1.1372807604999822, "kl": 2.0625, "learning_rate": 7.8623208718533e-06, "loss": 0.029, "step": 2146 }, { "clip_ratio": 0.03508613631129265, "epoch": 0.7512246326102169, "grad_norm": 0.8879521722024437, "kl": 2.046875, "learning_rate": 7.859816561732414e-06, "loss": 0.0272, "step": 2147 }, { "clip_ratio": 0.041439689695835114, "epoch": 0.7515745276417075, "grad_norm": 0.8042401961250741, "kl": 2.046875, "learning_rate": 7.857311184927015e-06, "loss": 0.0263, "step": 2148 }, { "clip_ratio": 0.00435396796092391, "clipped_completions_ratio": 0.0, "epoch": 0.751924422673198, "grad_norm": 1.898558900572497, "kl": 2.015625, "learning_rate": 7.854804742371588e-06, "loss": 0.0269, "max_completion_length": 227.0, "max_terminated_completion_length": 227.0, "mean_completion_length": 117.85714721679688, "mean_terminated_completion_length": 117.85714721679688, "min_completion_length": 20.0, "min_terminated_completion_length": 20.0, "num_tokens": 590032.0, "reward": 2.919304847717285, "reward_std": 0.12573522329330444, "rewards/check_gptzero_func/mean": 0.9285714030265808, "rewards/check_gptzero_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9907334446907043, "rewards/check_winston_local_func/std": 0.0031904205679893494, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 2149 }, { "clip_ratio": 0.02374551258981228, "epoch": 0.7522743177046886, "grad_norm": 0.9713357745146899, "kl": 2.015625, "learning_rate": 7.85229723500101e-06, "loss": 0.0199, "step": 2150 }, { "clip_ratio": 0.04261280596256256, "epoch": 0.7526242127361792, "grad_norm": 0.7798956223702034, "kl": 2.015625, "learning_rate": 7.84978866375056e-06, "loss": 0.0158, "step": 2151 }, { "clip_ratio": 0.06095085293054581, "epoch": 0.7529741077676697, "grad_norm": 0.5981870313663291, "kl": 2.015625, "learning_rate": 7.847279029555908e-06, "loss": 0.0138, "step": 2152 }, { "clip_ratio": 0.002061167499050498, "clipped_completions_ratio": 0.0, "epoch": 0.7533240027991602, "grad_norm": 3.4456254785659515, "kl": 3.515625, "learning_rate": 7.844768333353127e-06, "loss": 0.0492, "max_completion_length": 236.0, "max_terminated_completion_length": 236.0, "mean_completion_length": 56.41071701049805, "mean_terminated_completion_length": 56.41071701049805, "min_completion_length": 9.0, "min_terminated_completion_length": 9.0, "num_tokens": 597999.0, "reward": 2.911877393722534, "reward_std": 0.10544932633638382, "rewards/check_gptzero_func/mean": 0.9642857313156128, "rewards/check_gptzero_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9535437822341919, "rewards/check_winston_local_func/std": 0.07109339535236359, "rewards/sentence_count_match_reward_logic/mean": 0.9940475821495056, "rewards/sentence_count_match_reward_logic/std": 0.031209392473101616, "step": 2153 }, { "clip_ratio": 0.031904108822345734, "epoch": 0.7536738978306508, "grad_norm": 1.6079520608401212, "kl": 3.5, "learning_rate": 7.842256576078682e-06, "loss": 0.0413, "step": 2154 }, { "clip_ratio": 0.07058995962142944, "epoch": 0.7540237928621414, "grad_norm": 0.8755930235573544, "kl": 3.5, "learning_rate": 7.839743758669433e-06, "loss": 0.0366, "step": 2155 }, { "clip_ratio": 0.08410871773958206, "epoch": 0.7543736878936319, "grad_norm": 0.7329387167276774, "kl": 3.515625, "learning_rate": 7.837229882062638e-06, "loss": 0.0328, "step": 2156 }, { "clip_ratio": 0.00428102957084775, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7547235829251224, "grad_norm": 1.2353325101059072, "kl": 2.265625, "learning_rate": 7.83471494719595e-06, "loss": 0.0286, "max_completion_length": 256.0, "max_terminated_completion_length": 209.0, "mean_completion_length": 125.85714721679688, "mean_terminated_completion_length": 104.16667175292969, "min_completion_length": 21.0, "min_terminated_completion_length": 21.0, "num_tokens": 614063.0, "reward": 2.9322009086608887, "reward_std": 0.08391706645488739, "rewards/check_gptzero_func/mean": 0.9642857313156128, "rewards/check_gptzero_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9866651296615601, "rewards/check_winston_local_func/std": 0.012455319054424763, "rewards/sentence_count_match_reward_logic/mean": 0.981249988079071, "rewards/sentence_count_match_reward_logic/std": 0.04725415259599686, "step": 2157 }, { "clip_ratio": 0.015659959986805916, "epoch": 0.7550734779566131, "grad_norm": 0.8488510319543022, "kl": 2.265625, "learning_rate": 7.832198955007416e-06, "loss": 0.0236, "step": 2158 }, { "clip_ratio": 0.03746044635772705, "epoch": 0.7554233729881036, "grad_norm": 0.7311206889887628, "kl": 2.265625, "learning_rate": 7.829681906435475e-06, "loss": 0.0198, "step": 2159 }, { "clip_ratio": 0.054901380091905594, "epoch": 0.7557732680195941, "grad_norm": 0.5334343294421661, "kl": 2.265625, "learning_rate": 7.827163802418967e-06, "loss": 0.0176, "step": 2160 }, { "clip_ratio": 0.005188790149986744, "clipped_completions_ratio": 0.4285714285714286, "epoch": 0.7561231630510846, "grad_norm": 1.0557189737661197, "kl": 1.21875, "learning_rate": 7.824644643897116e-06, "loss": 0.0164, "max_completion_length": 256.0, "max_terminated_completion_length": 189.0, "mean_completion_length": 173.60714721679688, "mean_terminated_completion_length": 111.8125, "min_completion_length": 15.0, "min_terminated_completion_length": 15.0, "num_tokens": 637409.0, "reward": 2.7044782638549805, "reward_std": 0.08917839080095291, "rewards/check_gptzero_func/mean": 0.8214285969734192, "rewards/check_gptzero_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.9877405762672424, "rewards/check_winston_local_func/std": 0.006042106077075005, "rewards/sentence_count_match_reward_logic/mean": 0.8953089118003845, "rewards/sentence_count_match_reward_logic/std": 0.1727817952632904, "step": 2161 }, { "clip_ratio": 0.01729576103389263, "epoch": 0.7564730580825753, "grad_norm": 0.7714732154599784, "kl": 1.21875, "learning_rate": 7.822124431809546e-06, "loss": 0.0132, "step": 2162 }, { "clip_ratio": 0.028371062129735947, "epoch": 0.7568229531140658, "grad_norm": 0.7858366568656762, "kl": 1.21875, "learning_rate": 7.819603167096272e-06, "loss": 0.0107, "step": 2163 }, { "clip_ratio": 0.040682803839445114, "epoch": 0.7571728481455563, "grad_norm": 0.4957429686792968, "kl": 1.2109375, "learning_rate": 7.817080850697705e-06, "loss": 0.0078, "step": 2164 }, { "clip_ratio": 0.004675957374274731, "clipped_completions_ratio": 0.0, "epoch": 0.7575227431770469, "grad_norm": 3.130326256989754, "kl": 2.609375, "learning_rate": 7.81455748355464e-06, "loss": 0.032, "max_completion_length": 181.0, "max_terminated_completion_length": 181.0, "mean_completion_length": 56.232147216796875, "mean_terminated_completion_length": 56.232147216796875, "min_completion_length": 9.0, "min_terminated_completion_length": 9.0, "num_tokens": 645278.0, "reward": 2.892038106918335, "reward_std": 0.18091502785682678, "rewards/check_gptzero_func/mean": 0.9107142686843872, "rewards/check_gptzero_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9813238978385925, "rewards/check_winston_local_func/std": 0.026181571185588837, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 2165 }, { "clip_ratio": 0.041663166135549545, "epoch": 0.7578726382085375, "grad_norm": 1.2672424590358216, "kl": 2.609375, "learning_rate": 7.812033066608273e-06, "loss": 0.028, "step": 2166 }, { "clip_ratio": 0.06460093706846237, "epoch": 0.758222533240028, "grad_norm": 0.6815257726511025, "kl": 2.625, "learning_rate": 7.809507600800186e-06, "loss": 0.0234, "step": 2167 }, { "clip_ratio": 0.07877930253744125, "epoch": 0.7585724282715185, "grad_norm": 0.8125372155632845, "kl": 2.65625, "learning_rate": 7.806981087072354e-06, "loss": 0.0225, "step": 2168 }, { "clip_ratio": 0.0023855618201196194, "clipped_completions_ratio": 0.0, "epoch": 0.7589223233030091, "grad_norm": 1.7478637967023822, "kl": 2.296875, "learning_rate": 7.804453526367142e-06, "loss": 0.0112, "max_completion_length": 255.0, "max_terminated_completion_length": 255.0, "mean_completion_length": 99.4464340209961, "mean_terminated_completion_length": 99.4464340209961, "min_completion_length": 18.0, "min_terminated_completion_length": 18.0, "num_tokens": 657775.0, "reward": 2.8579678535461426, "reward_std": 0.20496216416358948, "rewards/check_gptzero_func/mean": 0.875, "rewards/check_gptzero_func/std": 0.3337118923664093, "rewards/check_winston_local_func/mean": 0.9829674959182739, "rewards/check_winston_local_func/std": 0.016625503078103065, "rewards/sentence_count_match_reward_logic/mean": 1.0, "rewards/sentence_count_match_reward_logic/std": 0.0, "step": 2169 }, { "clip_ratio": 0.015560816042125225, "epoch": 0.7592722183344996, "grad_norm": 1.0284839129676147, "kl": 2.296875, "learning_rate": 7.801924919627308e-06, "loss": 0.006, "step": 2170 }, { "clip_ratio": 0.041367679834365845, "epoch": 0.7596221133659902, "grad_norm": 0.6325517077370594, "kl": 2.3125, "learning_rate": 7.799395267795997e-06, "loss": 0.0018, "step": 2171 }, { "clip_ratio": 0.062007199972867966, "epoch": 0.7599720083974808, "grad_norm": 0.6501456640965879, "kl": 2.3125, "learning_rate": 7.796864571816745e-06, "loss": -0.0003, "step": 2172 }, { "clip_ratio": 0.007333927322179079, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7603219034289713, "grad_norm": 2.105109828354224, "kl": 2.53125, "learning_rate": 7.794332832633479e-06, "loss": 0.0344, "max_completion_length": 256.0, "max_terminated_completion_length": 208.0, "mean_completion_length": 107.16072082519531, "mean_terminated_completion_length": 82.35417175292969, "min_completion_length": 14.0, "min_terminated_completion_length": 14.0, "num_tokens": 673072.0, "reward": 2.705554485321045, "reward_std": 0.2295072376728058, "rewards/check_gptzero_func/mean": 0.7857142686843872, "rewards/check_gptzero_func/std": 0.4140393137931824, "rewards/check_winston_local_func/mean": 0.9823400378227234, "rewards/check_winston_local_func/std": 0.04505303502082825, "rewards/sentence_count_match_reward_logic/mean": 0.9375, "rewards/sentence_count_match_reward_logic/std": 0.15447859466075897, "step": 2173 }, { "clip_ratio": 0.01761331781744957, "epoch": 0.7606717984604618, "grad_norm": 1.2567943496412572, "kl": 2.5625, "learning_rate": 7.791800051190513e-06, "loss": 0.0314, "step": 2174 }, { "clip_ratio": 0.04563029482960701, "epoch": 0.7610216934919524, "grad_norm": 0.9361591177284357, "kl": 2.546875, "learning_rate": 7.78926622843255e-06, "loss": 0.0277, "step": 2175 }, { "clip_ratio": 0.06349775195121765, "epoch": 0.761371588523443, "grad_norm": 1.6851810701201468, "kl": 2.5, "learning_rate": 7.786731365304682e-06, "loss": 0.0264, "step": 2176 }, { "clip_ratio": 0.0025309999473392963, "clipped_completions_ratio": 0.0, "epoch": 0.7617214835549335, "grad_norm": 1.1746397542092526, "kl": 2.078125, "learning_rate": 7.78419546275239e-06, "loss": 0.0245, "max_completion_length": 195.0, "max_terminated_completion_length": 195.0, "mean_completion_length": 102.58928680419922, "mean_terminated_completion_length": 102.58928680419922, "min_completion_length": 24.0, "min_terminated_completion_length": 24.0, "num_tokens": 686065.0, "reward": 2.878997564315796, "reward_std": 0.1334289163351059, "rewards/check_gptzero_func/mean": 0.9107142686843872, "rewards/check_gptzero_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9742355346679688, "rewards/check_winston_local_func/std": 0.042378153651952744, "rewards/sentence_count_match_reward_logic/mean": 0.9940475821495056, "rewards/sentence_count_match_reward_logic/std": 0.02524530701339245, "step": 2177 }, { "clip_ratio": 0.01287628524005413, "epoch": 0.762071378586424, "grad_norm": 0.7508027182939383, "kl": 2.078125, "learning_rate": 7.781658521721542e-06, "loss": 0.0214, "step": 2178 }, { "clip_ratio": 0.026785217225551605, "epoch": 0.7624212736179147, "grad_norm": 0.5833229940525236, "kl": 2.078125, "learning_rate": 7.77912054315839e-06, "loss": 0.0197, "step": 2179 }, { "clip_ratio": 0.04393242672085762, "epoch": 0.7627711686494052, "grad_norm": 0.6262888575456236, "kl": 2.078125, "learning_rate": 7.77658152800958e-06, "loss": 0.0186, "step": 2180 }, { "clip_ratio": 0.004884317517280579, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.7631210636808957, "grad_norm": 1.7196521964637381, "kl": 2.65625, "learning_rate": 7.774041477222136e-06, "loss": 0.015, "max_completion_length": 256.0, "max_terminated_completion_length": 243.0, "mean_completion_length": 130.0, "mean_terminated_completion_length": 79.5999984741211, "min_completion_length": 15.0, "min_terminated_completion_length": 15.0, "num_tokens": 702977.0, "reward": 2.9625463485717773, "reward_std": 0.011777471750974655, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9842297434806824, "rewards/check_winston_local_func/std": 0.011672640219330788, "rewards/sentence_count_match_reward_logic/mean": 0.9783163070678711, "rewards/sentence_count_match_reward_logic/std": 0.05602749437093735, "step": 2181 }, { "clip_ratio": 0.018516462296247482, "epoch": 0.7634709587123862, "grad_norm": 1.0686395540606022, "kl": 2.65625, "learning_rate": 7.771500391743478e-06, "loss": 0.0109, "step": 2182 }, { "clip_ratio": 0.04015287384390831, "epoch": 0.7638208537438769, "grad_norm": 0.7955583358583131, "kl": 2.671875, "learning_rate": 7.768958272521404e-06, "loss": 0.006, "step": 2183 }, { "clip_ratio": 0.05951203778386116, "epoch": 0.7641707487753674, "grad_norm": 0.7135370024564343, "kl": 2.6875, "learning_rate": 7.766415120504098e-06, "loss": 0.0036, "step": 2184 }, { "clip_ratio": 0.0050614019855856895, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.7645206438068579, "grad_norm": 1.2310222803050743, "kl": 2.078125, "learning_rate": 7.763870936640137e-06, "loss": 0.0154, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 161.75, "mean_terminated_completion_length": 124.05000305175781, "min_completion_length": 13.0, "min_terminated_completion_length": 13.0, "num_tokens": 722859.0, "reward": 2.9404871463775635, "reward_std": 0.08114520460367203, "rewards/check_gptzero_func/mean": 0.9821428656578064, "rewards/check_gptzero_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9791771769523621, "rewards/check_winston_local_func/std": 0.03336998447775841, "rewards/sentence_count_match_reward_logic/mean": 0.9791666269302368, "rewards/sentence_count_match_reward_logic/std": 0.045180149376392365, "step": 2185 }, { "clip_ratio": 0.01278642751276493, "epoch": 0.7648705388383485, "grad_norm": 0.6011782573594957, "kl": 2.09375, "learning_rate": 7.761325721878474e-06, "loss": 0.0129, "step": 2186 }, { "clip_ratio": 0.02196374163031578, "epoch": 0.7652204338698391, "grad_norm": 0.5178483891266393, "kl": 2.125, "learning_rate": 7.758779477168452e-06, "loss": 0.0108, "step": 2187 }, { "clip_ratio": 0.033459436148405075, "epoch": 0.7655703289013296, "grad_norm": 0.5083573108677727, "kl": 2.15625, "learning_rate": 7.756232203459794e-06, "loss": 0.0095, "step": 2188 }, { "clip_ratio": 0.003931226208806038, "clipped_completions_ratio": 0.4821428571428571, "epoch": 0.7659202239328201, "grad_norm": 1.3229498720044586, "kl": 1.3984375, "learning_rate": 7.75368390170261e-06, "loss": 0.0263, "max_completion_length": 256.0, "max_terminated_completion_length": 250.0, "mean_completion_length": 175.98214721679688, "mean_terminated_completion_length": 101.48275756835938, "min_completion_length": 19.0, "min_terminated_completion_length": 19.0, "num_tokens": 747578.0, "reward": 2.8222811222076416, "reward_std": 0.03336995467543602, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9849714040756226, "rewards/check_winston_local_func/std": 0.012996306642889977, "rewards/sentence_count_match_reward_logic/mean": 0.8373095393180847, "rewards/sentence_count_match_reward_logic/std": 0.19547922909259796, "step": 2189 }, { "clip_ratio": 0.015212619677186012, "epoch": 0.7662701189643107, "grad_norm": 0.8321854990036901, "kl": 1.3984375, "learning_rate": 7.751134572847394e-06, "loss": 0.022, "step": 2190 }, { "clip_ratio": 0.03588245436549187, "epoch": 0.7666200139958013, "grad_norm": 0.6930116945045719, "kl": 1.390625, "learning_rate": 7.74858421784502e-06, "loss": 0.0193, "step": 2191 }, { "clip_ratio": 0.045846838504076004, "epoch": 0.7669699090272918, "grad_norm": 0.631277385555764, "kl": 1.390625, "learning_rate": 7.746032837646742e-06, "loss": 0.0167, "step": 2192 }, { "clip_ratio": 0.005205393768846989, "clipped_completions_ratio": 0.0892857142857143, "epoch": 0.7673198040587824, "grad_norm": 0.8775476640986746, "kl": 1.5390625, "learning_rate": 7.74348043320421e-06, "loss": 0.0162, "max_completion_length": 256.0, "max_terminated_completion_length": 256.0, "mean_completion_length": 154.05357360839844, "mean_terminated_completion_length": 144.05882263183594, "min_completion_length": 93.0, "min_terminated_completion_length": 93.0, "num_tokens": 766181.0, "reward": 2.7776284217834473, "reward_std": 0.11399034410715103, "rewards/check_gptzero_func/mean": 0.8214285969734192, "rewards/check_gptzero_func/std": 0.3864591419696808, "rewards/check_winston_local_func/mean": 0.9908590316772461, "rewards/check_winston_local_func/std": 0.0029758168384432793, "rewards/sentence_count_match_reward_logic/mean": 0.9653409123420715, "rewards/sentence_count_match_reward_logic/std": 0.08950141072273254, "step": 2193 }, { "clip_ratio": 0.010021921247243881, "epoch": 0.7676696990902729, "grad_norm": 0.7968691584974072, "kl": 1.5390625, "learning_rate": 7.74092700546944e-06, "loss": 0.0125, "step": 2194 }, { "clip_ratio": 0.022671986371278763, "epoch": 0.7680195941217635, "grad_norm": 0.6233103649022973, "kl": 1.5390625, "learning_rate": 7.738372555394837e-06, "loss": 0.0083, "step": 2195 }, { "clip_ratio": 0.04263380914926529, "epoch": 0.768369489153254, "grad_norm": 0.49629804411621503, "kl": 1.5390625, "learning_rate": 7.735817083933189e-06, "loss": 0.0049, "step": 2196 }, { "clip_ratio": 0.005464816465973854, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.7687193841847446, "grad_norm": 1.7374231103561186, "kl": 1.8671875, "learning_rate": 7.733260592037661e-06, "loss": 0.0198, "max_completion_length": 256.0, "max_terminated_completion_length": 120.0, "mean_completion_length": 109.8214340209961, "mean_terminated_completion_length": 51.35000228881836, "min_completion_length": 16.0, "min_terminated_completion_length": 16.0, "num_tokens": 781787.0, "reward": 2.6795198917388916, "reward_std": 0.08260062336921692, "rewards/check_gptzero_func/mean": 0.7678571343421936, "rewards/check_gptzero_func/std": 0.4260205626487732, "rewards/check_winston_local_func/mean": 0.9639583826065063, "rewards/check_winston_local_func/std": 0.05745925381779671, "rewards/sentence_count_match_reward_logic/mean": 0.947704017162323, "rewards/sentence_count_match_reward_logic/std": 0.12957119941711426, "step": 2197 }, { "clip_ratio": 0.019519327208399773, "epoch": 0.7690692792162351, "grad_norm": 0.7823606733175158, "kl": 1.859375, "learning_rate": 7.7307030806618e-06, "loss": 0.016, "step": 2198 }, { "clip_ratio": 0.033113203942775726, "epoch": 0.7694191742477257, "grad_norm": 0.6816780842538469, "kl": 1.9296875, "learning_rate": 7.728144550759535e-06, "loss": 0.014, "step": 2199 }, { "clip_ratio": 0.04137331247329712, "epoch": 0.7697690692792163, "grad_norm": 0.9738607939023294, "kl": 2.046875, "learning_rate": 7.725585003285175e-06, "loss": 0.0133, "step": 2200 }, { "clip_ratio": 0.003570643486455083, "clipped_completions_ratio": 0.4285714285714286, "epoch": 0.7701189643107068, "grad_norm": 1.171387384201751, "kl": 1.4140625, "learning_rate": 7.723024439193401e-06, "loss": 0.0093, "max_completion_length": 256.0, "max_terminated_completion_length": 209.0, "mean_completion_length": 171.6607208251953, "mean_terminated_completion_length": 108.40625, "min_completion_length": 29.0, "min_terminated_completion_length": 29.0, "num_tokens": 803792.0, "reward": 2.8466193675994873, "reward_std": 0.014676041901111603, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9865541458129883, "rewards/check_winston_local_func/std": 0.01014085952192545, "rewards/sentence_count_match_reward_logic/mean": 0.8600649237632751, "rewards/sentence_count_match_reward_logic/std": 0.23203718662261963, "step": 2201 }, { "clip_ratio": 0.008794660679996014, "epoch": 0.7704688593421973, "grad_norm": 0.6612371875611066, "kl": 1.4140625, "learning_rate": 7.720462859439284e-06, "loss": 0.0071, "step": 2202 }, { "clip_ratio": 0.022566979750990868, "epoch": 0.7708187543736879, "grad_norm": 0.48373307890989875, "kl": 1.4140625, "learning_rate": 7.717900264978267e-06, "loss": 0.0045, "step": 2203 }, { "clip_ratio": 0.03567282110452652, "epoch": 0.7711686494051785, "grad_norm": 0.42034175169515536, "kl": 1.4140625, "learning_rate": 7.715336656766176e-06, "loss": 0.0025, "step": 2204 }, { "clip_ratio": 0.0028060146141797304, "clipped_completions_ratio": 0.017857142857142905, "epoch": 0.771518544436669, "grad_norm": 1.6302799733530842, "kl": 2.109375, "learning_rate": 7.71277203575921e-06, "loss": 0.0385, "max_completion_length": 256.0, "max_terminated_completion_length": 253.0, "mean_completion_length": 123.0714340209961, "mean_terminated_completion_length": 120.654541015625, "min_completion_length": 17.0, "min_terminated_completion_length": 17.0, "num_tokens": 819140.0, "reward": 2.923304319381714, "reward_std": 0.08271723240613937, "rewards/check_gptzero_func/mean": 0.9464285969734192, "rewards/check_gptzero_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9893757104873657, "rewards/check_winston_local_func/std": 0.0035578759852796793, "rewards/sentence_count_match_reward_logic/mean": 0.987500011920929, "rewards/sentence_count_match_reward_logic/std": 0.07150968909263611, "step": 2205 }, { "clip_ratio": 0.019669346511363983, "epoch": 0.7718684394681595, "grad_norm": 0.815491404330295, "kl": 2.109375, "learning_rate": 7.71020640291395e-06, "loss": 0.0347, "step": 2206 }, { "clip_ratio": 0.033664267510175705, "epoch": 0.77221833449965, "grad_norm": 0.5905891640342151, "kl": 2.09375, "learning_rate": 7.70763975918735e-06, "loss": 0.0312, "step": 2207 }, { "clip_ratio": 0.04902409017086029, "epoch": 0.7725682295311407, "grad_norm": 0.4503450033812369, "kl": 2.09375, "learning_rate": 7.705072105536748e-06, "loss": 0.0274, "step": 2208 }, { "clip_ratio": 0.004218716640025377, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7729181245626312, "grad_norm": 1.5751640086868128, "kl": 2.140625, "learning_rate": 7.702503442919853e-06, "loss": 0.0195, "max_completion_length": 256.0, "max_terminated_completion_length": 186.0, "mean_completion_length": 121.92857360839844, "mean_terminated_completion_length": 99.58333587646484, "min_completion_length": 20.0, "min_terminated_completion_length": 20.0, "num_tokens": 836184.0, "reward": 2.860346555709839, "reward_std": 0.0757782980799675, "rewards/check_gptzero_func/mean": 0.9642857313156128, "rewards/check_gptzero_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9898106455802917, "rewards/check_winston_local_func/std": 0.004782951902598143, "rewards/sentence_count_match_reward_logic/mean": 0.90625, "rewards/sentence_count_match_reward_logic/std": 0.232411190867424, "step": 2209 }, { "clip_ratio": 0.020553214475512505, "epoch": 0.7732680195941217, "grad_norm": 0.7419407153134354, "kl": 2.140625, "learning_rate": 7.69993377229475e-06, "loss": 0.0155, "step": 2210 }, { "clip_ratio": 0.02674606628715992, "epoch": 0.7736179146256124, "grad_norm": 0.6591205649757544, "kl": 2.140625, "learning_rate": 7.697363094619903e-06, "loss": 0.0131, "step": 2211 }, { "clip_ratio": 0.04394523799419403, "epoch": 0.7739678096571029, "grad_norm": 0.4951966165981054, "kl": 2.140625, "learning_rate": 7.69479141085415e-06, "loss": 0.0103, "step": 2212 }, { "clip_ratio": 0.005689231678843498, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7743177046885934, "grad_norm": 2.461692028912494, "kl": 1.421875, "learning_rate": 7.692218721956706e-06, "loss": 0.0115, "max_completion_length": 256.0, "max_terminated_completion_length": 140.0, "mean_completion_length": 107.64286041259766, "mean_terminated_completion_length": 82.91667175292969, "min_completion_length": 11.0, "min_terminated_completion_length": 11.0, "num_tokens": 851164.0, "reward": 2.8811380863189697, "reward_std": 0.03957914188504219, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9398111701011658, "rewards/check_winston_local_func/std": 0.14012262225151062, "rewards/sentence_count_match_reward_logic/mean": 0.9413265585899353, "rewards/sentence_count_match_reward_logic/std": 0.1461358368396759, "step": 2213 }, { "clip_ratio": 0.022386644035577774, "epoch": 0.7746675997200839, "grad_norm": 1.3681452542606134, "kl": 1.3984375, "learning_rate": 7.689645028887159e-06, "loss": 0.005, "step": 2214 }, { "clip_ratio": 0.04741518199443817, "epoch": 0.7750174947515746, "grad_norm": 0.8177982707847206, "kl": 1.390625, "learning_rate": 7.687070332605474e-06, "loss": 0.0014, "step": 2215 }, { "clip_ratio": 0.06220393255352974, "epoch": 0.7753673897830651, "grad_norm": 0.6308814728352462, "kl": 1.375, "learning_rate": 7.684494634071982e-06, "loss": -0.0013, "step": 2216 }, { "clip_ratio": 0.002824209863319993, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.7757172848145556, "grad_norm": 1.8191148050814527, "kl": 1.875, "learning_rate": 7.6819179342474e-06, "loss": 0.0081, "max_completion_length": 256.0, "max_terminated_completion_length": 198.0, "mean_completion_length": 120.92857360839844, "mean_terminated_completion_length": 66.9000015258789, "min_completion_length": 17.0, "min_terminated_completion_length": 17.0, "num_tokens": 867680.0, "reward": 2.8203203678131104, "reward_std": 0.06926866620779037, "rewards/check_gptzero_func/mean": 0.9642857313156128, "rewards/check_gptzero_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9661535620689392, "rewards/check_winston_local_func/std": 0.06902437657117844, "rewards/sentence_count_match_reward_logic/mean": 0.8898810148239136, "rewards/sentence_count_match_reward_logic/std": 0.170135959982872, "step": 2217 }, { "clip_ratio": 0.0179283507168293, "epoch": 0.7760671798460462, "grad_norm": 1.1100693407655917, "kl": 1.890625, "learning_rate": 7.679340234092812e-06, "loss": 0.0022, "step": 2218 }, { "clip_ratio": 0.04868884012103081, "epoch": 0.7764170748775368, "grad_norm": 0.8583717022115146, "kl": 1.9765625, "learning_rate": 7.676761534569675e-06, "loss": 0.0004, "step": 2219 }, { "clip_ratio": 0.060043226927518845, "epoch": 0.7767669699090273, "grad_norm": 0.6817045700003708, "kl": 1.9375, "learning_rate": 7.674181836639819e-06, "loss": -0.0023, "step": 2220 }, { "clip_ratio": 0.00390775129199028, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7771168649405178, "grad_norm": 1.2997886599907396, "kl": 2.21875, "learning_rate": 7.671601141265447e-06, "loss": 0.0135, "max_completion_length": 256.0, "max_terminated_completion_length": 143.0, "mean_completion_length": 114.4464340209961, "mean_terminated_completion_length": 90.85417175292969, "min_completion_length": 31.0, "min_terminated_completion_length": 31.0, "num_tokens": 882937.0, "reward": 2.91269850730896, "reward_std": 0.03976337984204292, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9858438372612, "rewards/check_winston_local_func/std": 0.008890511468052864, "rewards/sentence_count_match_reward_logic/mean": 0.9268544316291809, "rewards/sentence_count_match_reward_logic/std": 0.12472416460514069, "step": 2221 }, { "clip_ratio": 0.01042896043509245, "epoch": 0.7774667599720084, "grad_norm": 1.1033250410425097, "kl": 2.21875, "learning_rate": 7.669019449409133e-06, "loss": 0.0086, "step": 2222 }, { "clip_ratio": 0.035829510539770126, "epoch": 0.777816655003499, "grad_norm": 0.6921027196329903, "kl": 2.234375, "learning_rate": 7.666436762033828e-06, "loss": 0.0038, "step": 2223 }, { "clip_ratio": 0.05602490156888962, "epoch": 0.7781665500349895, "grad_norm": 0.6049643856824782, "kl": 2.25, "learning_rate": 7.663853080102845e-06, "loss": 0.0012, "step": 2224 }, { "clip_ratio": 0.0049926587380468845, "clipped_completions_ratio": 0.4285714285714286, "epoch": 0.7785164450664801, "grad_norm": 0.7629038194911174, "kl": 1.109375, "learning_rate": 7.661268404579876e-06, "loss": 0.0166, "max_completion_length": 256.0, "max_terminated_completion_length": 205.0, "mean_completion_length": 176.9107208251953, "mean_terminated_completion_length": 117.59375, "min_completion_length": 42.0, "min_terminated_completion_length": 42.0, "num_tokens": 908492.0, "reward": 2.6889524459838867, "reward_std": 0.22307230532169342, "rewards/check_gptzero_func/mean": 0.8571428656578064, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9918784499168396, "rewards/check_winston_local_func/std": 0.0020587677136063576, "rewards/sentence_count_match_reward_logic/mean": 0.8399309515953064, "rewards/sentence_count_match_reward_logic/std": 0.1941298246383667, "step": 2225 }, { "clip_ratio": 0.007930625230073929, "epoch": 0.7788663400979706, "grad_norm": 0.642914381990209, "kl": 1.109375, "learning_rate": 7.658682736428977e-06, "loss": 0.0143, "step": 2226 }, { "clip_ratio": 0.016943668946623802, "epoch": 0.7792162351294611, "grad_norm": 0.49683285279888906, "kl": 1.1015625, "learning_rate": 7.656096076614581e-06, "loss": 0.011, "step": 2227 }, { "clip_ratio": 0.026719285175204277, "epoch": 0.7795661301609517, "grad_norm": 0.40373245314038103, "kl": 1.1015625, "learning_rate": 7.653508426101488e-06, "loss": 0.0083, "step": 2228 }, { "clip_ratio": 0.004335204139351845, "clipped_completions_ratio": 0.1607142857142857, "epoch": 0.7799160251924423, "grad_norm": 1.0783678376220516, "kl": 1.84375, "learning_rate": 7.650919785854865e-06, "loss": 0.0218, "max_completion_length": 256.0, "max_terminated_completion_length": 251.0, "mean_completion_length": 170.32144165039062, "mean_terminated_completion_length": 153.91488647460938, "min_completion_length": 41.0, "min_terminated_completion_length": 41.0, "num_tokens": 930814.0, "reward": 2.8880128860473633, "reward_std": 0.08945376425981522, "rewards/check_gptzero_func/mean": 0.9464285969734192, "rewards/check_gptzero_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9893732666969299, "rewards/check_winston_local_func/std": 0.003974003251641989, "rewards/sentence_count_match_reward_logic/mean": 0.9522109031677246, "rewards/sentence_count_match_reward_logic/std": 0.11364161968231201, "step": 2229 }, { "clip_ratio": 0.011816642247140408, "epoch": 0.7802659202239328, "grad_norm": 0.8111315734296977, "kl": 1.84375, "learning_rate": 7.64833015684025e-06, "loss": 0.0178, "step": 2230 }, { "clip_ratio": 0.02567342482507229, "epoch": 0.7806158152554233, "grad_norm": 0.6104029265884525, "kl": 1.8515625, "learning_rate": 7.645739540023552e-06, "loss": 0.0141, "step": 2231 }, { "clip_ratio": 0.040108971297740936, "epoch": 0.780965710286914, "grad_norm": 0.5015752057698446, "kl": 1.8515625, "learning_rate": 7.643147936371047e-06, "loss": 0.0112, "step": 2232 }, { "clip_ratio": 0.004488673992455006, "clipped_completions_ratio": 0.1607142857142857, "epoch": 0.7813156053184045, "grad_norm": 1.668700328570692, "kl": 2.75, "learning_rate": 7.640555346849376e-06, "loss": 0.0196, "max_completion_length": 256.0, "max_terminated_completion_length": 250.0, "mean_completion_length": 129.92857360839844, "mean_terminated_completion_length": 105.7872314453125, "min_completion_length": 15.0, "min_terminated_completion_length": 15.0, "num_tokens": 948114.0, "reward": 2.869462251663208, "reward_std": 0.18086154758930206, "rewards/check_gptzero_func/mean": 0.9285714030265808, "rewards/check_gptzero_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9831527471542358, "rewards/check_winston_local_func/std": 0.01877225749194622, "rewards/sentence_count_match_reward_logic/mean": 0.9577381014823914, "rewards/sentence_count_match_reward_logic/std": 0.12317205965518951, "step": 2233 }, { "clip_ratio": 0.016932491213083267, "epoch": 0.781665500349895, "grad_norm": 1.1929253066306509, "kl": 2.75, "learning_rate": 7.63796177242555e-06, "loss": 0.015, "step": 2234 }, { "clip_ratio": 0.04603702574968338, "epoch": 0.7820153953813855, "grad_norm": 0.7743248518419463, "kl": 2.765625, "learning_rate": 7.635367214066953e-06, "loss": 0.011, "step": 2235 }, { "clip_ratio": 0.06284569203853607, "epoch": 0.7823652904128762, "grad_norm": 0.6849777414870005, "kl": 2.75, "learning_rate": 7.632771672741326e-06, "loss": 0.0079, "step": 2236 }, { "clip_ratio": 0.0028589472640305758, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7827151854443667, "grad_norm": 2.055566377237766, "kl": 2.140625, "learning_rate": 7.630175149416783e-06, "loss": 0.0004, "max_completion_length": 256.0, "max_terminated_completion_length": 160.0, "mean_completion_length": 70.3214340209961, "mean_terminated_completion_length": 39.375, "min_completion_length": 8.0, "min_terminated_completion_length": 8.0, "num_tokens": 958292.0, "reward": 2.6053309440612793, "reward_std": 0.145406112074852, "rewards/check_gptzero_func/mean": 0.6785714030265808, "rewards/check_gptzero_func/std": 0.4712514281272888, "rewards/check_winston_local_func/mean": 0.9458069205284119, "rewards/check_winston_local_func/std": 0.06499601900577545, "rewards/sentence_count_match_reward_logic/mean": 0.9809523820877075, "rewards/sentence_count_match_reward_logic/std": 0.051974013447761536, "step": 2237 }, { "clip_ratio": 0.025203539058566093, "epoch": 0.7830650804758572, "grad_norm": 3.42526742075901, "kl": 2.34375, "learning_rate": 7.627577645061801e-06, "loss": -0.0044, "step": 2238 }, { "clip_ratio": 0.037189316004514694, "epoch": 0.7834149755073478, "grad_norm": 1.100760958694366, "kl": 2.15625, "learning_rate": 7.624979160645225e-06, "loss": -0.0058, "step": 2239 }, { "clip_ratio": 0.04476959630846977, "epoch": 0.7837648705388384, "grad_norm": 0.6790792656860563, "kl": 2.171875, "learning_rate": 7.6223796971362685e-06, "loss": -0.0084, "step": 2240 }, { "clip_ratio": 0.0039336117915809155, "clipped_completions_ratio": 0.0, "epoch": 0.7841147655703289, "grad_norm": 1.3620377973721127, "kl": 1.8828125, "learning_rate": 7.619779255504502e-06, "loss": 0.0287, "max_completion_length": 219.0, "max_terminated_completion_length": 219.0, "mean_completion_length": 116.37500762939453, "mean_terminated_completion_length": 116.37500762939453, "min_completion_length": 28.0, "min_terminated_completion_length": 28.0, "num_tokens": 972921.0, "reward": 2.681797981262207, "reward_std": 0.0170002281665802, "rewards/check_gptzero_func/mean": 0.7142857313156128, "rewards/check_gptzero_func/std": 0.4558422863483429, "rewards/check_winston_local_func/mean": 0.9911231994628906, "rewards/check_winston_local_func/std": 0.0031449315138161182, "rewards/sentence_count_match_reward_logic/mean": 0.9763888716697693, "rewards/sentence_count_match_reward_logic/std": 0.06884285062551498, "step": 2241 }, { "clip_ratio": 0.01603773795068264, "epoch": 0.7844646606018194, "grad_norm": 0.9311187098036274, "kl": 1.890625, "learning_rate": 7.617177836719869e-06, "loss": 0.0236, "step": 2242 }, { "clip_ratio": 0.038673002272844315, "epoch": 0.78481455563331, "grad_norm": 0.6853436478703795, "kl": 1.8984375, "learning_rate": 7.614575441752672e-06, "loss": 0.0187, "step": 2243 }, { "clip_ratio": 0.05803689733147621, "epoch": 0.7851644506648006, "grad_norm": 0.5626099061626889, "kl": 1.9140625, "learning_rate": 7.611972071573579e-06, "loss": 0.0161, "step": 2244 }, { "clip_ratio": 0.00480610691010952, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7855143456962911, "grad_norm": 2.3431522029575746, "kl": 2.140625, "learning_rate": 7.6093677271536245e-06, "loss": 0.0093, "max_completion_length": 256.0, "max_terminated_completion_length": 131.0, "mean_completion_length": 81.28572082519531, "mean_terminated_completion_length": 52.16666793823242, "min_completion_length": 7.0, "min_terminated_completion_length": 7.0, "num_tokens": 985377.0, "reward": 2.87610125541687, "reward_std": 0.027388229966163635, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9304242730140686, "rewards/check_winston_local_func/std": 0.14590944349765778, "rewards/sentence_count_match_reward_logic/mean": 0.9456766843795776, "rewards/sentence_count_match_reward_logic/std": 0.12710720300674438, "step": 2245 }, { "clip_ratio": 0.022307926788926125, "epoch": 0.7858642407277817, "grad_norm": 1.816129612434784, "kl": 2.140625, "learning_rate": 7.6067624094642055e-06, "loss": 0.0022, "step": 2246 }, { "clip_ratio": 0.04953152313828468, "epoch": 0.7862141357592722, "grad_norm": 1.028640591733217, "kl": 2.15625, "learning_rate": 7.60415611947708e-06, "loss": -0.0022, "step": 2247 }, { "clip_ratio": 0.08314152806997299, "epoch": 0.7865640307907628, "grad_norm": 0.7687092435223117, "kl": 2.140625, "learning_rate": 7.601548858164366e-06, "loss": -0.0068, "step": 2248 }, { "clip_ratio": 0.004325367510318756, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7869139258222533, "grad_norm": 2.8913427844625974, "kl": 2.0, "learning_rate": 7.59894062649855e-06, "loss": 0.0265, "max_completion_length": 256.0, "max_terminated_completion_length": 227.0, "mean_completion_length": 129.875, "mean_terminated_completion_length": 108.85417175292969, "min_completion_length": 9.0, "min_terminated_completion_length": 9.0, "num_tokens": 1002906.0, "reward": 2.790915012359619, "reward_std": 0.11177027970552444, "rewards/check_gptzero_func/mean": 0.8571428656578064, "rewards/check_gptzero_func/std": 0.3530939221382141, "rewards/check_winston_local_func/mean": 0.9873433709144592, "rewards/check_winston_local_func/std": 0.014832573011517525, "rewards/sentence_count_match_reward_logic/mean": 0.9464285969734192, "rewards/sentence_count_match_reward_logic/std": 0.13298112154006958, "step": 2249 }, { "clip_ratio": 0.028552869334816933, "epoch": 0.7872638208537439, "grad_norm": 1.4777231247254283, "kl": 2.0, "learning_rate": 7.596331425452479e-06, "loss": 0.0211, "step": 2250 }, { "clip_ratio": 0.040760479867458344, "epoch": 0.7876137158852344, "grad_norm": 0.7510997445270541, "kl": 2.015625, "learning_rate": 7.593721255999358e-06, "loss": 0.0179, "step": 2251 }, { "clip_ratio": 0.057698313146829605, "epoch": 0.787963610916725, "grad_norm": 0.7799079832850356, "kl": 2.125, "learning_rate": 7.591110119112757e-06, "loss": 0.0158, "step": 2252 }, { "clip_ratio": 0.00577116571366787, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7883135059482156, "grad_norm": 0.9332785308075477, "kl": 1.5234375, "learning_rate": 7.588498015766604e-06, "loss": 0.0202, "max_completion_length": 256.0, "max_terminated_completion_length": 246.0, "mean_completion_length": 168.1607208251953, "mean_terminated_completion_length": 153.52084350585938, "min_completion_length": 91.0, "min_terminated_completion_length": 91.0, "num_tokens": 1024459.0, "reward": 2.9210941791534424, "reward_std": 0.07615404576063156, "rewards/check_gptzero_func/mean": 0.9821428656578064, "rewards/check_gptzero_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9902903437614441, "rewards/check_winston_local_func/std": 0.00483985198661685, "rewards/sentence_count_match_reward_logic/mean": 0.9486607313156128, "rewards/sentence_count_match_reward_logic/std": 0.10799512267112732, "step": 2253 }, { "clip_ratio": 0.00774614280089736, "epoch": 0.7886634009797061, "grad_norm": 1.2671744539792198, "kl": 1.578125, "learning_rate": 7.5858849469351895e-06, "loss": 0.0176, "step": 2254 }, { "clip_ratio": 0.01893477514386177, "epoch": 0.7890132960111966, "grad_norm": 0.7013222408555961, "kl": 1.5234375, "learning_rate": 7.583270913593163e-06, "loss": 0.0138, "step": 2255 }, { "clip_ratio": 0.0356498658657074, "epoch": 0.7893631910426872, "grad_norm": 0.5662166425632883, "kl": 1.5234375, "learning_rate": 7.580655916715537e-06, "loss": 0.0116, "step": 2256 }, { "clip_ratio": 0.004438542760908604, "clipped_completions_ratio": 0.0, "epoch": 0.7897130860741778, "grad_norm": 26.962812888481146, "kl": 3.265625, "learning_rate": 7.578039957277678e-06, "loss": 0.0268, "max_completion_length": 184.0, "max_terminated_completion_length": 184.0, "mean_completion_length": 78.67857360839844, "mean_terminated_completion_length": 78.67857360839844, "min_completion_length": 20.0, "min_terminated_completion_length": 20.0, "num_tokens": 1034953.0, "reward": 2.9689066410064697, "reward_std": 0.025769716128706932, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9822993874549866, "rewards/check_winston_local_func/std": 0.014806779101490974, "rewards/sentence_count_match_reward_logic/mean": 0.9866071343421936, "rewards/sentence_count_match_reward_logic/std": 0.056801944971084595, "step": 2257 }, { "clip_ratio": 0.006659829523414373, "epoch": 0.7900629811056683, "grad_norm": 1.679752936089574, "kl": 2.59375, "learning_rate": 7.575423036255312e-06, "loss": 0.036, "step": 2258 }, { "clip_ratio": 0.019001243636012077, "epoch": 0.7904128761371588, "grad_norm": 4.155999616771888, "kl": 2.59375, "learning_rate": 7.57280515462453e-06, "loss": 0.022, "step": 2259 }, { "clip_ratio": 0.03826297074556351, "epoch": 0.7907627711686495, "grad_norm": 1.7762861897387714, "kl": 2.671875, "learning_rate": 7.5701863133617735e-06, "loss": 0.0153, "step": 2260 }, { "clip_ratio": 0.004230176564306021, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.79111266620014, "grad_norm": 1.6114028139503787, "kl": 2.15625, "learning_rate": 7.567566513443849e-06, "loss": 0.0018, "max_completion_length": 256.0, "max_terminated_completion_length": 213.0, "mean_completion_length": 135.73214721679688, "mean_terminated_completion_length": 87.625, "min_completion_length": 12.0, "min_terminated_completion_length": 12.0, "num_tokens": 1053938.0, "reward": 2.876378297805786, "reward_std": 0.020343221724033356, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9896886944770813, "rewards/check_winston_local_func/std": 0.005400264635682106, "rewards/sentence_count_match_reward_logic/mean": 0.8866893649101257, "rewards/sentence_count_match_reward_logic/std": 0.16927899420261383, "step": 2261 }, { "clip_ratio": 0.010872861370444298, "epoch": 0.7914625612316305, "grad_norm": 1.167675900208445, "kl": 2.15625, "learning_rate": 7.564945755847917e-06, "loss": -0.0026, "step": 2262 }, { "clip_ratio": 0.038004860281944275, "epoch": 0.791812456263121, "grad_norm": 0.8506485635216153, "kl": 2.15625, "learning_rate": 7.5623240415514895e-06, "loss": -0.0072, "step": 2263 }, { "clip_ratio": 0.06254209578037262, "epoch": 0.7921623512946117, "grad_norm": 0.610871097856561, "kl": 2.1875, "learning_rate": 7.559701371532449e-06, "loss": -0.0096, "step": 2264 }, { "clip_ratio": 0.004618579987436533, "clipped_completions_ratio": 0.1607142857142857, "epoch": 0.7925122463261022, "grad_norm": 0.8718027163623452, "kl": 1.375, "learning_rate": 7.557077746769019e-06, "loss": 0.0196, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 180.19644165039062, "mean_terminated_completion_length": 165.68084716796875, "min_completion_length": 108.0, "min_terminated_completion_length": 108.0, "num_tokens": 1075925.0, "reward": 2.8820810317993164, "reward_std": 0.17568251490592957, "rewards/check_gptzero_func/mean": 0.9285714030265808, "rewards/check_gptzero_func/std": 0.25987008213996887, "rewards/check_winston_local_func/mean": 0.9921998381614685, "rewards/check_winston_local_func/std": 0.0027009437326341867, "rewards/sentence_count_match_reward_logic/mean": 0.9613094925880432, "rewards/sentence_count_match_reward_logic/std": 0.08984313905239105, "step": 2265 }, { "clip_ratio": 0.008412792347371578, "epoch": 0.7928621413575927, "grad_norm": 0.814687221765904, "kl": 1.375, "learning_rate": 7.554453168239794e-06, "loss": 0.0169, "step": 2266 }, { "clip_ratio": 0.017416782677173615, "epoch": 0.7932120363890833, "grad_norm": 0.6085035580536408, "kl": 1.375, "learning_rate": 7.551827636923712e-06, "loss": 0.0125, "step": 2267 }, { "clip_ratio": 0.034431472420692444, "epoch": 0.7935619314205739, "grad_norm": 0.4843468791388827, "kl": 1.3828125, "learning_rate": 7.549201153800073e-06, "loss": 0.0098, "step": 2268 }, { "clip_ratio": 0.007472625933587551, "clipped_completions_ratio": 0.0, "epoch": 0.7939118264520644, "grad_norm": 1.5023427800182911, "kl": 2.03125, "learning_rate": 7.546573719848529e-06, "loss": 0.0064, "max_completion_length": 223.0, "max_terminated_completion_length": 223.0, "mean_completion_length": 104.64286041259766, "mean_terminated_completion_length": 104.64286041259766, "min_completion_length": 13.0, "min_terminated_completion_length": 13.0, "num_tokens": 1089761.0, "reward": 2.953789472579956, "reward_std": 0.03515508770942688, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9609322547912598, "rewards/check_winston_local_func/std": 0.09374061226844788, "rewards/sentence_count_match_reward_logic/mean": 0.9928570985794067, "rewards/sentence_count_match_reward_logic/std": 0.03745126724243164, "step": 2269 }, { "clip_ratio": 0.02141587622463703, "epoch": 0.7942617214835549, "grad_norm": 1.001811192776467, "kl": 2.03125, "learning_rate": 7.543945336049092e-06, "loss": 0.0006, "step": 2270 }, { "clip_ratio": 0.04464544728398323, "epoch": 0.7946116165150455, "grad_norm": 0.7580278493947611, "kl": 2.046875, "learning_rate": 7.541316003382118e-06, "loss": -0.0031, "step": 2271 }, { "clip_ratio": 0.06634505093097687, "epoch": 0.794961511546536, "grad_norm": 0.6347015812246087, "kl": 2.0625, "learning_rate": 7.53868572282833e-06, "loss": -0.0061, "step": 2272 }, { "clip_ratio": 0.0030290980357676744, "clipped_completions_ratio": 0.2678571428571429, "epoch": 0.7953114065780266, "grad_norm": 2.536028390305701, "kl": 3.015625, "learning_rate": 7.536054495368792e-06, "loss": 0.0353, "max_completion_length": 256.0, "max_terminated_completion_length": 252.0, "mean_completion_length": 137.60714721679688, "mean_terminated_completion_length": 94.29267883300781, "min_completion_length": 14.0, "min_terminated_completion_length": 14.0, "num_tokens": 1107547.0, "reward": 2.846592903137207, "reward_std": 0.08255369961261749, "rewards/check_gptzero_func/mean": 0.9107142686843872, "rewards/check_gptzero_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9888004660606384, "rewards/check_winston_local_func/std": 0.006526331417262554, "rewards/sentence_count_match_reward_logic/mean": 0.9470778703689575, "rewards/sentence_count_match_reward_logic/std": 0.09237019717693329, "step": 2273 }, { "clip_ratio": 0.01709645614027977, "epoch": 0.7956613016095171, "grad_norm": 1.0945698397045078, "kl": 3.015625, "learning_rate": 7.5334223219849305e-06, "loss": 0.0306, "step": 2274 }, { "clip_ratio": 0.0300301406532526, "epoch": 0.7960111966410077, "grad_norm": 0.676742904956762, "kl": 3.03125, "learning_rate": 7.53078920365852e-06, "loss": 0.0274, "step": 2275 }, { "clip_ratio": 0.044024452567100525, "epoch": 0.7963610916724982, "grad_norm": 0.6345217280649087, "kl": 3.03125, "learning_rate": 7.528155141371688e-06, "loss": 0.0245, "step": 2276 }, { "clip_ratio": 0.005791619885712862, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7967109867039888, "grad_norm": 1.6046466500849137, "kl": 2.59375, "learning_rate": 7.5255201361069154e-06, "loss": 0.0178, "max_completion_length": 256.0, "max_terminated_completion_length": 181.0, "mean_completion_length": 118.53572082519531, "mean_terminated_completion_length": 95.625, "min_completion_length": 15.0, "min_terminated_completion_length": 15.0, "num_tokens": 1122297.0, "reward": 2.9882729053497314, "reward_std": 0.007664353121072054, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9902568459510803, "rewards/check_winston_local_func/std": 0.005367725156247616, "rewards/sentence_count_match_reward_logic/mean": 0.9980158805847168, "rewards/sentence_count_match_reward_logic/std": 0.014847845770418644, "step": 2277 }, { "clip_ratio": 0.01858406513929367, "epoch": 0.7970608817354794, "grad_norm": 0.9837396791363433, "kl": 2.609375, "learning_rate": 7.522884188847035e-06, "loss": 0.0142, "step": 2278 }, { "clip_ratio": 0.03760542720556259, "epoch": 0.7974107767669699, "grad_norm": 0.7499128882115769, "kl": 2.640625, "learning_rate": 7.52024730057523e-06, "loss": 0.0096, "step": 2279 }, { "clip_ratio": 0.05671399086713791, "epoch": 0.7977606717984604, "grad_norm": 0.8772030470015781, "kl": 2.6875, "learning_rate": 7.5176094722750344e-06, "loss": 0.0081, "step": 2280 }, { "clip_ratio": 0.005525720305740833, "clipped_completions_ratio": 0.0, "epoch": 0.798110566829951, "grad_norm": 1.9997383005772715, "kl": 2.40625, "learning_rate": 7.514970704930333e-06, "loss": 0.0094, "max_completion_length": 176.0, "max_terminated_completion_length": 176.0, "mean_completion_length": 78.76786041259766, "mean_terminated_completion_length": 78.76786041259766, "min_completion_length": 13.0, "min_terminated_completion_length": 13.0, "num_tokens": 1132884.0, "reward": 2.85269832611084, "reward_std": 0.11131193488836288, "rewards/check_gptzero_func/mean": 0.9464285969734192, "rewards/check_gptzero_func/std": 0.22720777988433838, "rewards/check_winston_local_func/mean": 0.9401982426643372, "rewards/check_winston_local_func/std": 0.10781288892030716, "rewards/sentence_count_match_reward_logic/mean": 0.9660714268684387, "rewards/sentence_count_match_reward_logic/std": 0.08426535129547119, "step": 2281 }, { "clip_ratio": 0.022006774321198463, "epoch": 0.7984604618614416, "grad_norm": 1.0168103242010813, "kl": 2.40625, "learning_rate": 7.5123309995253616e-06, "loss": 0.0048, "step": 2282 }, { "clip_ratio": 0.04012199118733406, "epoch": 0.7988103568929321, "grad_norm": 0.9701260887944859, "kl": 2.421875, "learning_rate": 7.509690357044705e-06, "loss": 0.0007, "step": 2283 }, { "clip_ratio": 0.05503549054265022, "epoch": 0.7991602519244226, "grad_norm": 0.7088001957571921, "kl": 2.4375, "learning_rate": 7.507048778473296e-06, "loss": -0.0034, "step": 2284 }, { "clip_ratio": 0.010537920519709587, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.7995101469559133, "grad_norm": 2.3096287821539927, "kl": 2.796875, "learning_rate": 7.504406264796422e-06, "loss": 0.0269, "max_completion_length": 256.0, "max_terminated_completion_length": 214.0, "mean_completion_length": 132.55357360839844, "mean_terminated_completion_length": 111.97917175292969, "min_completion_length": 16.0, "min_terminated_completion_length": 16.0, "num_tokens": 1148963.0, "reward": 2.9666764736175537, "reward_std": 0.0679018646478653, "rewards/check_gptzero_func/mean": 0.9821428656578064, "rewards/check_gptzero_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9893168210983276, "rewards/check_winston_local_func/std": 0.007293261121958494, "rewards/sentence_count_match_reward_logic/mean": 0.9952168464660645, "rewards/sentence_count_match_reward_logic/std": 0.02513670176267624, "step": 2285 }, { "clip_ratio": 0.018263444304466248, "epoch": 0.7998600419874038, "grad_norm": 1.28994391569871, "kl": 2.78125, "learning_rate": 7.501762816999714e-06, "loss": 0.0223, "step": 2286 }, { "clip_ratio": 0.03692783787846565, "epoch": 0.8002099370188943, "grad_norm": 0.779333299627939, "kl": 2.796875, "learning_rate": 7.499118436069151e-06, "loss": 0.0175, "step": 2287 }, { "clip_ratio": 0.05031320080161095, "epoch": 0.8005598320503848, "grad_norm": 0.6671679431217522, "kl": 2.8125, "learning_rate": 7.496473122991066e-06, "loss": 0.0143, "step": 2288 }, { "clip_ratio": 0.0042044175788760185, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.8009097270818755, "grad_norm": 1.1991197885043068, "kl": 2.09375, "learning_rate": 7.493826878752132e-06, "loss": 0.0129, "max_completion_length": 256.0, "max_terminated_completion_length": 232.0, "mean_completion_length": 128.6428680419922, "mean_terminated_completion_length": 107.41667175292969, "min_completion_length": 10.0, "min_terminated_completion_length": 10.0, "num_tokens": 1166071.0, "reward": 2.9031264781951904, "reward_std": 0.06064142659306526, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9532962441444397, "rewards/check_winston_local_func/std": 0.0985758900642395, "rewards/sentence_count_match_reward_logic/mean": 0.9498299360275269, "rewards/sentence_count_match_reward_logic/std": 0.10387548804283142, "step": 2289 }, { "clip_ratio": 0.010033024474978447, "epoch": 0.801259622113366, "grad_norm": 0.7950815547432494, "kl": 2.09375, "learning_rate": 7.491179704339376e-06, "loss": 0.0101, "step": 2290 }, { "clip_ratio": 0.02467227540910244, "epoch": 0.8016095171448565, "grad_norm": 0.6412131739840448, "kl": 2.125, "learning_rate": 7.488531600740167e-06, "loss": 0.0069, "step": 2291 }, { "clip_ratio": 0.03884665295481682, "epoch": 0.8019594121763471, "grad_norm": 0.55935661785838, "kl": 2.125, "learning_rate": 7.485882568942222e-06, "loss": 0.0051, "step": 2292 }, { "clip_ratio": 0.007328317034989595, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.8023093072078377, "grad_norm": 1.5943089925102394, "kl": 2.65625, "learning_rate": 7.483232609933608e-06, "loss": 0.0319, "max_completion_length": 256.0, "max_terminated_completion_length": 147.0, "mean_completion_length": 106.8214340209961, "mean_terminated_completion_length": 81.95833587646484, "min_completion_length": 22.0, "min_terminated_completion_length": 22.0, "num_tokens": 1179877.0, "reward": 2.94211483001709, "reward_std": 0.08617223799228668, "rewards/check_gptzero_func/mean": 0.9642857313156128, "rewards/check_gptzero_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9904103875160217, "rewards/check_winston_local_func/std": 0.004144808743149042, "rewards/sentence_count_match_reward_logic/mean": 0.9874188303947449, "rewards/sentence_count_match_reward_logic/std": 0.04155134782195091, "step": 2293 }, { "clip_ratio": 0.020540233701467514, "epoch": 0.8026592022393282, "grad_norm": 1.223357331307372, "kl": 2.65625, "learning_rate": 7.4805817247027335e-06, "loss": 0.0264, "step": 2294 }, { "clip_ratio": 0.04368789494037628, "epoch": 0.8030090972708187, "grad_norm": 0.9207051590415823, "kl": 2.65625, "learning_rate": 7.477929914238351e-06, "loss": 0.0209, "step": 2295 }, { "clip_ratio": 0.06673510372638702, "epoch": 0.8033589923023093, "grad_norm": 0.7263805885622512, "kl": 2.65625, "learning_rate": 7.475277179529562e-06, "loss": 0.0167, "step": 2296 }, { "clip_ratio": 0.005899460054934025, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.8037088873337999, "grad_norm": 1.7441872382225563, "kl": 2.578125, "learning_rate": 7.4726235215658116e-06, "loss": 0.0328, "max_completion_length": 256.0, "max_terminated_completion_length": 143.0, "mean_completion_length": 100.23214721679688, "mean_terminated_completion_length": 74.27083587646484, "min_completion_length": 18.0, "min_terminated_completion_length": 18.0, "num_tokens": 1194202.0, "reward": 2.9114460945129395, "reward_std": 0.04507482424378395, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.986842691898346, "rewards/check_winston_local_func/std": 0.011489838361740112, "rewards/sentence_count_match_reward_logic/mean": 0.9246031641960144, "rewards/sentence_count_match_reward_logic/std": 0.1572878062725067, "step": 2297 }, { "clip_ratio": 0.021787740290164948, "epoch": 0.8040587823652904, "grad_norm": 1.1895301328058197, "kl": 2.578125, "learning_rate": 7.469968941336892e-06, "loss": 0.0282, "step": 2298 }, { "clip_ratio": 0.0458194725215435, "epoch": 0.804408677396781, "grad_norm": 0.8161005359980229, "kl": 2.578125, "learning_rate": 7.467313439832933e-06, "loss": 0.0245, "step": 2299 }, { "clip_ratio": 0.06352008879184723, "epoch": 0.8047585724282715, "grad_norm": 0.691599272373511, "kl": 2.578125, "learning_rate": 7.464657018044411e-06, "loss": 0.0215, "step": 2300 }, { "clip_ratio": 0.007999519817531109, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.8051084674597621, "grad_norm": 2.3653075400483705, "kl": 2.890625, "learning_rate": 7.461999676962148e-06, "loss": 0.0197, "max_completion_length": 256.0, "max_terminated_completion_length": 203.0, "mean_completion_length": 108.12500762939453, "mean_terminated_completion_length": 83.47917175292969, "min_completion_length": 11.0, "min_terminated_completion_length": 11.0, "num_tokens": 1209761.0, "reward": 2.922501802444458, "reward_std": 0.015258186496794224, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9865683317184448, "rewards/check_winston_local_func/std": 0.015045755542814732, "rewards/sentence_count_match_reward_logic/mean": 0.935933530330658, "rewards/sentence_count_match_reward_logic/std": 0.15155553817749023, "step": 2301 }, { "clip_ratio": 0.02202501706779003, "epoch": 0.8054583624912526, "grad_norm": 1.1946750260920385, "kl": 2.875, "learning_rate": 7.4593414175773085e-06, "loss": 0.0146, "step": 2302 }, { "clip_ratio": 0.05428961291909218, "epoch": 0.8058082575227432, "grad_norm": 0.9606515526429644, "kl": 2.859375, "learning_rate": 7.456682240881395e-06, "loss": 0.0105, "step": 2303 }, { "clip_ratio": 0.06694845110177994, "epoch": 0.8061581525542337, "grad_norm": 0.8920063002644593, "kl": 2.875, "learning_rate": 7.4540221478662565e-06, "loss": 0.0079, "step": 2304 }, { "clip_ratio": 0.00581955024972558, "clipped_completions_ratio": 0.0, "epoch": 0.8065080475857243, "grad_norm": 2.1045893861561873, "kl": 3.28125, "learning_rate": 7.451361139524082e-06, "loss": 0.0365, "max_completion_length": 146.0, "max_terminated_completion_length": 146.0, "mean_completion_length": 80.92857360839844, "mean_terminated_completion_length": 80.92857360839844, "min_completion_length": 18.0, "min_terminated_completion_length": 18.0, "num_tokens": 1220309.0, "reward": 2.9576902389526367, "reward_std": 0.08340396732091904, "rewards/check_gptzero_func/mean": 0.9821428656578064, "rewards/check_gptzero_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9889401197433472, "rewards/check_winston_local_func/std": 0.01350527536123991, "rewards/sentence_count_match_reward_logic/mean": 0.9866071343421936, "rewards/sentence_count_match_reward_logic/std": 0.0741565078496933, "step": 2305 }, { "clip_ratio": 0.023330966010689735, "epoch": 0.8068579426172149, "grad_norm": 1.2675652237824047, "kl": 3.28125, "learning_rate": 7.448699216847404e-06, "loss": 0.0299, "step": 2306 }, { "clip_ratio": 0.054260581731796265, "epoch": 0.8072078376487054, "grad_norm": 0.9249963965519384, "kl": 3.28125, "learning_rate": 7.446036380829093e-06, "loss": 0.0247, "step": 2307 }, { "clip_ratio": 0.07750117778778076, "epoch": 0.8075577326801959, "grad_norm": 0.8003126026543609, "kl": 3.265625, "learning_rate": 7.443372632462363e-06, "loss": 0.0218, "step": 2308 }, { "clip_ratio": 0.004821067675948143, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.8079076277116864, "grad_norm": 1.977552294109295, "kl": 2.75, "learning_rate": 7.440707972740766e-06, "loss": 0.0177, "max_completion_length": 256.0, "max_terminated_completion_length": 152.0, "mean_completion_length": 101.16072082519531, "mean_terminated_completion_length": 75.35417175292969, "min_completion_length": 15.0, "min_terminated_completion_length": 15.0, "num_tokens": 1234806.0, "reward": 2.942542791366577, "reward_std": 0.015932627022266388, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.991352379322052, "rewards/check_winston_local_func/std": 0.003248183988034725, "rewards/sentence_count_match_reward_logic/mean": 0.9511904716491699, "rewards/sentence_count_match_reward_logic/std": 0.11405549198389053, "step": 2309 }, { "clip_ratio": 0.01893007382750511, "epoch": 0.8082575227431771, "grad_norm": 2.0969904695129986, "kl": 2.75, "learning_rate": 7.438042402658195e-06, "loss": 0.0149, "step": 2310 }, { "clip_ratio": 0.03445199131965637, "epoch": 0.8086074177746676, "grad_norm": 0.8196541635038874, "kl": 2.734375, "learning_rate": 7.435375923208879e-06, "loss": 0.0112, "step": 2311 }, { "clip_ratio": 0.05591686815023422, "epoch": 0.8089573128061581, "grad_norm": 0.8483138016513014, "kl": 2.734375, "learning_rate": 7.432708535387397e-06, "loss": 0.0088, "step": 2312 }, { "clip_ratio": 0.007947120815515518, "clipped_completions_ratio": 0.1964285714285714, "epoch": 0.8093072078376488, "grad_norm": 1.8168284704595503, "kl": 3.28125, "learning_rate": 7.430040240188653e-06, "loss": 0.0288, "max_completion_length": 256.0, "max_terminated_completion_length": 246.0, "mean_completion_length": 141.33929443359375, "mean_terminated_completion_length": 113.31111145019531, "min_completion_length": 11.0, "min_terminated_completion_length": 11.0, "num_tokens": 1252041.0, "reward": 2.8871843814849854, "reward_std": 0.0935930535197258, "rewards/check_gptzero_func/mean": 0.9107142686843872, "rewards/check_gptzero_func/std": 0.28773635625839233, "rewards/check_winston_local_func/mean": 0.9886579513549805, "rewards/check_winston_local_func/std": 0.007069536484777927, "rewards/sentence_count_match_reward_logic/mean": 0.9878117442131042, "rewards/sentence_count_match_reward_logic/std": 0.03946445882320404, "step": 2313 }, { "clip_ratio": 0.018977923318743706, "epoch": 0.8096571028691393, "grad_norm": 1.3034529880966077, "kl": 3.296875, "learning_rate": 7.427371038607899e-06, "loss": 0.0234, "step": 2314 }, { "clip_ratio": 0.04375763610005379, "epoch": 0.8100069979006298, "grad_norm": 1.2338245312572254, "kl": 3.296875, "learning_rate": 7.4247009316407215e-06, "loss": 0.0201, "step": 2315 }, { "clip_ratio": 0.06078098714351654, "epoch": 0.8103568929321203, "grad_norm": 0.7791597140934301, "kl": 3.328125, "learning_rate": 7.422029920283044e-06, "loss": 0.0153, "step": 2316 }, { "clip_ratio": 0.003423549933359027, "clipped_completions_ratio": 0.0, "epoch": 0.810706787963611, "grad_norm": 1.8073594526450647, "kl": 2.828125, "learning_rate": 7.419358005531129e-06, "loss": 0.0247, "max_completion_length": 250.0, "max_terminated_completion_length": 250.0, "mean_completion_length": 115.25000762939453, "mean_terminated_completion_length": 115.25000762939453, "min_completion_length": 20.0, "min_terminated_completion_length": 20.0, "num_tokens": 1266303.0, "reward": 2.976149797439575, "reward_std": 0.025699691846966743, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9873107075691223, "rewards/check_winston_local_func/std": 0.006743582431226969, "rewards/sentence_count_match_reward_logic/mean": 0.9888392686843872, "rewards/sentence_count_match_reward_logic/std": 0.04929494857788086, "step": 2317 }, { "clip_ratio": 0.021315734833478928, "epoch": 0.8110566829951015, "grad_norm": 0.9769116938603979, "kl": 2.828125, "learning_rate": 7.416685188381576e-06, "loss": 0.0193, "step": 2318 }, { "clip_ratio": 0.031299471855163574, "epoch": 0.811406578026592, "grad_norm": 0.7916252322673318, "kl": 2.859375, "learning_rate": 7.414011469831319e-06, "loss": 0.0153, "step": 2319 }, { "clip_ratio": 0.04779273644089699, "epoch": 0.8117564730580826, "grad_norm": 0.7822900087181885, "kl": 2.875, "learning_rate": 7.411336850877633e-06, "loss": 0.0126, "step": 2320 }, { "clip_ratio": 0.007504025474190712, "clipped_completions_ratio": 0.0, "epoch": 0.8121063680895731, "grad_norm": 2.0705763575033727, "kl": 3.8125, "learning_rate": 7.408661332518122e-06, "loss": 0.0101, "max_completion_length": 140.0, "max_terminated_completion_length": 140.0, "mean_completion_length": 57.625003814697266, "mean_terminated_completion_length": 57.625003814697266, "min_completion_length": 9.0, "min_terminated_completion_length": 9.0, "num_tokens": 1274434.0, "reward": 2.869215965270996, "reward_std": 0.06544184684753418, "rewards/check_gptzero_func/mean": 0.8928571343421936, "rewards/check_gptzero_func/std": 0.31209391355514526, "rewards/check_winston_local_func/mean": 0.982310950756073, "rewards/check_winston_local_func/std": 0.016558388248085976, "rewards/sentence_count_match_reward_logic/mean": 0.9940475821495056, "rewards/sentence_count_match_reward_logic/std": 0.031209392473101616, "step": 2321 }, { "clip_ratio": 0.020718421787023544, "epoch": 0.8124562631210637, "grad_norm": 1.468731674648162, "kl": 3.8125, "learning_rate": 7.40598491575073e-06, "loss": 0.0038, "step": 2322 }, { "clip_ratio": 0.051618583500385284, "epoch": 0.8128061581525542, "grad_norm": 0.9924328370989601, "kl": 3.828125, "learning_rate": 7.403307601573737e-06, "loss": -0.0005, "step": 2323 }, { "clip_ratio": 0.07404764741659164, "epoch": 0.8131560531840448, "grad_norm": 0.776651810639774, "kl": 3.84375, "learning_rate": 7.400629390985753e-06, "loss": -0.0034, "step": 2324 }, { "clip_ratio": 0.003301086602732539, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.8135059482155353, "grad_norm": 1.6751868632136269, "kl": 2.421875, "learning_rate": 7.397950284985728e-06, "loss": 0.0044, "max_completion_length": 256.0, "max_terminated_completion_length": 177.0, "mean_completion_length": 127.64286041259766, "mean_terminated_completion_length": 76.30000305175781, "min_completion_length": 19.0, "min_terminated_completion_length": 19.0, "num_tokens": 1292262.0, "reward": 2.887242555618286, "reward_std": 0.034130699932575226, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9882323145866394, "rewards/check_winston_local_func/std": 0.0062887100502848625, "rewards/sentence_count_match_reward_logic/mean": 0.8990101218223572, "rewards/sentence_count_match_reward_logic/std": 0.12120544910430908, "step": 2325 }, { "clip_ratio": 0.018394751474261284, "epoch": 0.8138558432470259, "grad_norm": 0.9153924090923641, "kl": 2.421875, "learning_rate": 7.395270284572939e-06, "loss": -0.0013, "step": 2326 }, { "clip_ratio": 0.038458459079265594, "epoch": 0.8142057382785165, "grad_norm": 0.6887738172505002, "kl": 2.4375, "learning_rate": 7.392589390747006e-06, "loss": -0.0051, "step": 2327 }, { "clip_ratio": 0.05432556942105293, "epoch": 0.814555633310007, "grad_norm": 0.6203214344798167, "kl": 2.46875, "learning_rate": 7.389907604507874e-06, "loss": -0.0081, "step": 2328 }, { "clip_ratio": 0.0030691155698150396, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.8149055283414975, "grad_norm": 2.5924801611160064, "kl": 2.703125, "learning_rate": 7.387224926855824e-06, "loss": 0.0207, "max_completion_length": 256.0, "max_terminated_completion_length": 222.0, "mean_completion_length": 127.78572082519531, "mean_terminated_completion_length": 106.41667175292969, "min_completion_length": 12.0, "min_terminated_completion_length": 12.0, "num_tokens": 1309210.0, "reward": 2.8639473915100098, "reward_std": 0.03563574701547623, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.962374210357666, "rewards/check_winston_local_func/std": 0.0771990567445755, "rewards/sentence_count_match_reward_logic/mean": 0.901573121547699, "rewards/sentence_count_match_reward_logic/std": 0.15546992421150208, "step": 2329 }, { "clip_ratio": 0.017847320064902306, "epoch": 0.8152554233729881, "grad_norm": 0.8682762173820818, "kl": 2.71875, "learning_rate": 7.384541358791471e-06, "loss": 0.0161, "step": 2330 }, { "clip_ratio": 0.027578651905059814, "epoch": 0.8156053184044787, "grad_norm": 0.7362552923092293, "kl": 2.765625, "learning_rate": 7.3818569013157585e-06, "loss": 0.0139, "step": 2331 }, { "clip_ratio": 0.04175647720694542, "epoch": 0.8159552134359692, "grad_norm": 0.7567814863120528, "kl": 2.8125, "learning_rate": 7.379171555429965e-06, "loss": 0.0119, "step": 2332 }, { "clip_ratio": 0.0060015516355633736, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.8163051084674597, "grad_norm": 2.026633281622972, "kl": 2.84375, "learning_rate": 7.376485322135697e-06, "loss": 0.0369, "max_completion_length": 256.0, "max_terminated_completion_length": 120.0, "mean_completion_length": 109.64286041259766, "mean_terminated_completion_length": 51.10000228881836, "min_completion_length": 19.0, "min_terminated_completion_length": 19.0, "num_tokens": 1325294.0, "reward": 2.8892641067504883, "reward_std": 0.0825885757803917, "rewards/check_gptzero_func/mean": 0.9821428656578064, "rewards/check_gptzero_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9884235262870789, "rewards/check_winston_local_func/std": 0.006481961812824011, "rewards/sentence_count_match_reward_logic/mean": 0.9186974763870239, "rewards/sentence_count_match_reward_logic/std": 0.1243722140789032, "step": 2333 }, { "clip_ratio": 0.018008146435022354, "epoch": 0.8166550034989503, "grad_norm": 1.0977499210821706, "kl": 2.859375, "learning_rate": 7.373798202434896e-06, "loss": 0.0319, "step": 2334 }, { "clip_ratio": 0.03109223023056984, "epoch": 0.8170048985304409, "grad_norm": 0.8821789707122142, "kl": 2.859375, "learning_rate": 7.371110197329834e-06, "loss": 0.0298, "step": 2335 }, { "clip_ratio": 0.05997472256422043, "epoch": 0.8173547935619314, "grad_norm": 0.746426490077747, "kl": 2.875, "learning_rate": 7.3684213078231084e-06, "loss": 0.0264, "step": 2336 }, { "clip_ratio": 0.00600241357460618, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.8177046885934219, "grad_norm": 2.3067549258738524, "kl": 3.609375, "learning_rate": 7.365731534917651e-06, "loss": 0.0278, "max_completion_length": 256.0, "max_terminated_completion_length": 118.0, "mean_completion_length": 82.41072082519531, "mean_terminated_completion_length": 53.47916793823242, "min_completion_length": 19.0, "min_terminated_completion_length": 19.0, "num_tokens": 1337581.0, "reward": 2.953256368637085, "reward_std": 0.006692060735076666, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9904586672782898, "rewards/check_winston_local_func/std": 0.003336767666041851, "rewards/sentence_count_match_reward_logic/mean": 0.9627976417541504, "rewards/sentence_count_match_reward_logic/std": 0.09255034476518631, "step": 2337 }, { "clip_ratio": 0.022066812962293625, "epoch": 0.8180545836249126, "grad_norm": 1.359036274363853, "kl": 3.609375, "learning_rate": 7.36304087961672e-06, "loss": 0.0226, "step": 2338 }, { "clip_ratio": 0.047996509820222855, "epoch": 0.8184044786564031, "grad_norm": 0.793933355320914, "kl": 3.671875, "learning_rate": 7.360349342923905e-06, "loss": 0.0193, "step": 2339 }, { "clip_ratio": 0.0700114294886589, "epoch": 0.8187543736878936, "grad_norm": 0.9122721979348962, "kl": 3.71875, "learning_rate": 7.357656925843125e-06, "loss": 0.0171, "step": 2340 }, { "clip_ratio": 0.008857617154717445, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.8191042687193841, "grad_norm": 2.787307674665899, "kl": 3.578125, "learning_rate": 7.354963629378626e-06, "loss": 0.0282, "max_completion_length": 256.0, "max_terminated_completion_length": 116.0, "mean_completion_length": 79.41072082519531, "mean_terminated_completion_length": 49.97916793823242, "min_completion_length": 15.0, "min_terminated_completion_length": 15.0, "num_tokens": 1349356.0, "reward": 2.9504504203796387, "reward_std": 0.022242004051804543, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9848660230636597, "rewards/check_winston_local_func/std": 0.01185772567987442, "rewards/sentence_count_match_reward_logic/mean": 0.965584397315979, "rewards/sentence_count_match_reward_logic/std": 0.08113978058099747, "step": 2341 }, { "clip_ratio": 0.02605828270316124, "epoch": 0.8194541637508748, "grad_norm": 1.7251760950378947, "kl": 3.578125, "learning_rate": 7.352269454534981e-06, "loss": 0.0216, "step": 2342 }, { "clip_ratio": 0.04962492361664772, "epoch": 0.8198040587823653, "grad_norm": 1.1398413967000278, "kl": 3.609375, "learning_rate": 7.3495744023170936e-06, "loss": 0.0169, "step": 2343 }, { "clip_ratio": 0.0769866406917572, "epoch": 0.8201539538138558, "grad_norm": 0.7027389243437882, "kl": 3.640625, "learning_rate": 7.346878473730189e-06, "loss": 0.0143, "step": 2344 }, { "clip_ratio": 0.0036789351142942905, "clipped_completions_ratio": 0.2857142857142857, "epoch": 0.8205038488453464, "grad_norm": 1.3401432267139974, "kl": 1.890625, "learning_rate": 7.344181669779826e-06, "loss": 0.0196, "max_completion_length": 256.0, "max_terminated_completion_length": 249.0, "mean_completion_length": 160.85714721679688, "mean_terminated_completion_length": 122.80000305175781, "min_completion_length": 20.0, "min_terminated_completion_length": 20.0, "num_tokens": 1369212.0, "reward": 2.9444916248321533, "reward_std": 0.057329073548316956, "rewards/check_gptzero_func/mean": 0.9821428656578064, "rewards/check_gptzero_func/std": 0.13363061845302582, "rewards/check_winston_local_func/mean": 0.9909199476242065, "rewards/check_winston_local_func/std": 0.00420338986441493, "rewards/sentence_count_match_reward_logic/mean": 0.9714285731315613, "rewards/sentence_count_match_reward_logic/std": 0.06799541413784027, "step": 2345 }, { "clip_ratio": 0.012914168648421764, "epoch": 0.820853743876837, "grad_norm": 0.969069952323308, "kl": 1.890625, "learning_rate": 7.341483991471887e-06, "loss": 0.015, "step": 2346 }, { "clip_ratio": 0.03380994126200676, "epoch": 0.8212036389083275, "grad_norm": 0.6884671945062988, "kl": 1.8828125, "learning_rate": 7.33878543981258e-06, "loss": 0.0119, "step": 2347 }, { "clip_ratio": 0.05216168984770775, "epoch": 0.821553533939818, "grad_norm": 0.6604322565548523, "kl": 1.8984375, "learning_rate": 7.336086015808439e-06, "loss": 0.0086, "step": 2348 }, { "clip_ratio": 0.003860751399770379, "clipped_completions_ratio": 0.0, "epoch": 0.8219034289713086, "grad_norm": 2.368447155045532, "kl": 2.015625, "learning_rate": 7.333385720466323e-06, "loss": 0.0416, "max_completion_length": 233.0, "max_terminated_completion_length": 233.0, "mean_completion_length": 134.6428680419922, "mean_terminated_completion_length": 134.6428680419922, "min_completion_length": 13.0, "min_terminated_completion_length": 13.0, "num_tokens": 1385896.0, "reward": 2.816983938217163, "reward_std": 0.08936531841754913, "rewards/check_gptzero_func/mean": 0.8392857313156128, "rewards/check_gptzero_func/std": 0.3705909848213196, "rewards/check_winston_local_func/mean": 0.9891775846481323, "rewards/check_winston_local_func/std": 0.014210877008736134, "rewards/sentence_count_match_reward_logic/mean": 0.9885204434394836, "rewards/sentence_count_match_reward_logic/std": 0.06915442645549774, "step": 2349 }, { "clip_ratio": 0.01784575544297695, "epoch": 0.8222533240027992, "grad_norm": 1.0610844286755925, "kl": 2.046875, "learning_rate": 7.330684554793417e-06, "loss": 0.0377, "step": 2350 }, { "clip_ratio": 0.0331849679350853, "epoch": 0.8226032190342897, "grad_norm": 1.093095424365017, "kl": 2.125, "learning_rate": 7.3279825197972306e-06, "loss": 0.0348, "step": 2351 }, { "clip_ratio": 0.051013097167015076, "epoch": 0.8229531140657803, "grad_norm": 1.1170544775874915, "kl": 2.140625, "learning_rate": 7.325279616485599e-06, "loss": 0.0323, "step": 2352 }, { "clip_ratio": 0.004842695780098438, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.8233030090972708, "grad_norm": 2.6752557192474007, "kl": 2.953125, "learning_rate": 7.322575845866675e-06, "loss": 0.0313, "max_completion_length": 256.0, "max_terminated_completion_length": 115.0, "mean_completion_length": 72.625, "mean_terminated_completion_length": 42.0625, "min_completion_length": 9.0, "min_terminated_completion_length": 9.0, "num_tokens": 1396419.0, "reward": 2.9125382900238037, "reward_std": 0.042430032044649124, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9675979614257812, "rewards/check_winston_local_func/std": 0.04741491749882698, "rewards/sentence_count_match_reward_logic/mean": 0.9449405074119568, "rewards/sentence_count_match_reward_logic/std": 0.11379227042198181, "step": 2353 }, { "clip_ratio": 0.022050799801945686, "epoch": 0.8236529041287614, "grad_norm": 1.3391973463725964, "kl": 2.953125, "learning_rate": 7.3198712089489435e-06, "loss": 0.0253, "step": 2354 }, { "clip_ratio": 0.05476130172610283, "epoch": 0.8240027991602519, "grad_norm": 0.900610847656959, "kl": 2.984375, "learning_rate": 7.317165706741207e-06, "loss": 0.021, "step": 2355 }, { "clip_ratio": 0.07274115085601807, "epoch": 0.8243526941917425, "grad_norm": 0.8515592465841234, "kl": 2.984375, "learning_rate": 7.314459340252593e-06, "loss": 0.0193, "step": 2356 }, { "clip_ratio": 0.005730907898396254, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.824702589223233, "grad_norm": 2.043616444332999, "kl": 2.40625, "learning_rate": 7.311752110492549e-06, "loss": 0.0539, "max_completion_length": 256.0, "max_terminated_completion_length": 219.0, "mean_completion_length": 124.16072082519531, "mean_terminated_completion_length": 102.1875, "min_completion_length": 10.0, "min_terminated_completion_length": 10.0, "num_tokens": 1412652.0, "reward": 2.841758966445923, "reward_std": 0.09318067133426666, "rewards/check_gptzero_func/mean": 0.8928571343421936, "rewards/check_gptzero_func/std": 0.3120938837528229, "rewards/check_winston_local_func/mean": 0.9856361746788025, "rewards/check_winston_local_func/std": 0.011945956386625767, "rewards/sentence_count_match_reward_logic/mean": 0.9632652997970581, "rewards/sentence_count_match_reward_logic/std": 0.07646441459655762, "step": 2357 }, { "clip_ratio": 0.022702736780047417, "epoch": 0.8250524842547236, "grad_norm": 1.2506096064976702, "kl": 2.421875, "learning_rate": 7.309044018470848e-06, "loss": 0.0478, "step": 2358 }, { "clip_ratio": 0.03578862547874451, "epoch": 0.8254023792862142, "grad_norm": 0.8710747144070948, "kl": 2.421875, "learning_rate": 7.306335065197581e-06, "loss": 0.0441, "step": 2359 }, { "clip_ratio": 0.049864839762449265, "epoch": 0.8257522743177047, "grad_norm": 0.7572817864783881, "kl": 2.4375, "learning_rate": 7.303625251683162e-06, "loss": 0.0415, "step": 2360 }, { "clip_ratio": 0.0013625199208036065, "clipped_completions_ratio": 0.4285714285714286, "epoch": 0.8261021693491952, "grad_norm": 2.043305480996517, "kl": 2.40625, "learning_rate": 7.300914578938327e-06, "loss": 0.0198, "max_completion_length": 256.0, "max_terminated_completion_length": 217.0, "mean_completion_length": 146.5357208251953, "mean_terminated_completion_length": 64.4375, "min_completion_length": 12.0, "min_terminated_completion_length": 12.0, "num_tokens": 1432394.0, "reward": 2.860705852508545, "reward_std": 0.023467188701033592, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9491033554077148, "rewards/check_winston_local_func/std": 0.10148520767688751, "rewards/sentence_count_match_reward_logic/mean": 0.9116023182868958, "rewards/sentence_count_match_reward_logic/std": 0.13594667613506317, "step": 2361 }, { "clip_ratio": 0.016518136486411095, "epoch": 0.8264520643806857, "grad_norm": 0.8085341774278341, "kl": 2.40625, "learning_rate": 7.2982030479741315e-06, "loss": 0.0155, "step": 2362 }, { "clip_ratio": 0.02824287861585617, "epoch": 0.8268019594121764, "grad_norm": 0.535557327434175, "kl": 2.40625, "learning_rate": 7.29549065980195e-06, "loss": 0.0135, "step": 2363 }, { "clip_ratio": 0.03702089190483093, "epoch": 0.8271518544436669, "grad_norm": 0.4946163052685544, "kl": 2.40625, "learning_rate": 7.2927774154334765e-06, "loss": 0.0116, "step": 2364 }, { "clip_ratio": 0.005579653661698103, "clipped_completions_ratio": 0.0, "epoch": 0.8275017494751574, "grad_norm": 1.2008395702101005, "kl": 2.6875, "learning_rate": 7.290063315880727e-06, "loss": 0.0255, "max_completion_length": 220.0, "max_terminated_completion_length": 220.0, "mean_completion_length": 113.66072082519531, "mean_terminated_completion_length": 113.66072082519531, "min_completion_length": 23.0, "min_terminated_completion_length": 23.0, "num_tokens": 1446839.0, "reward": 2.9304964542388916, "reward_std": 0.13408511877059937, "rewards/check_gptzero_func/mean": 0.9642857313156128, "rewards/check_gptzero_func/std": 0.187256321310997, "rewards/check_winston_local_func/mean": 0.9776899218559265, "rewards/check_winston_local_func/std": 0.04009845107793808, "rewards/sentence_count_match_reward_logic/mean": 0.9885204434394836, "rewards/sentence_count_match_reward_logic/std": 0.06915442645549774, "step": 2365 }, { "clip_ratio": 0.015407656319439411, "epoch": 0.827851644506648, "grad_norm": 0.8337727004913046, "kl": 2.671875, "learning_rate": 7.287348362156035e-06, "loss": 0.0217, "step": 2366 }, { "clip_ratio": 0.03156581148505211, "epoch": 0.8282015395381386, "grad_norm": 0.6452788196145758, "kl": 2.671875, "learning_rate": 7.2846325552720536e-06, "loss": 0.0185, "step": 2367 }, { "clip_ratio": 0.04907846823334694, "epoch": 0.8285514345696291, "grad_norm": 0.5432576036240845, "kl": 2.671875, "learning_rate": 7.281915896241749e-06, "loss": 0.0163, "step": 2368 }, { "clip_ratio": 0.004784522578120232, "clipped_completions_ratio": 0.2678571428571429, "epoch": 0.8289013296011196, "grad_norm": 1.4837671966808068, "kl": 2.28125, "learning_rate": 7.279198386078414e-06, "loss": 0.0298, "max_completion_length": 256.0, "max_terminated_completion_length": 254.0, "mean_completion_length": 128.2857208251953, "mean_terminated_completion_length": 81.56097412109375, "min_completion_length": 15.0, "min_terminated_completion_length": 15.0, "num_tokens": 1463399.0, "reward": 2.958326816558838, "reward_std": 0.032293498516082764, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9889541268348694, "rewards/check_winston_local_func/std": 0.005794030614197254, "rewards/sentence_count_match_reward_logic/mean": 0.9693723320960999, "rewards/sentence_count_match_reward_logic/std": 0.0671018660068512, "step": 2369 }, { "clip_ratio": 0.018211543560028076, "epoch": 0.8292512246326103, "grad_norm": 0.8401592892865086, "kl": 2.28125, "learning_rate": 7.276480025795649e-06, "loss": 0.0264, "step": 2370 }, { "clip_ratio": 0.03310099244117737, "epoch": 0.8296011196641008, "grad_norm": 0.6407833799720319, "kl": 2.296875, "learning_rate": 7.273760816407384e-06, "loss": 0.0238, "step": 2371 }, { "clip_ratio": 0.044816721230745316, "epoch": 0.8299510146955913, "grad_norm": 0.5312840505730202, "kl": 2.3125, "learning_rate": 7.271040758927852e-06, "loss": 0.0208, "step": 2372 }, { "clip_ratio": 0.008452807553112507, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.8303009097270819, "grad_norm": 0.9796358205855256, "kl": 1.8125, "learning_rate": 7.268319854371611e-06, "loss": 0.0106, "max_completion_length": 256.0, "max_terminated_completion_length": 245.0, "mean_completion_length": 151.6607208251953, "mean_terminated_completion_length": 134.27084350585938, "min_completion_length": 12.0, "min_terminated_completion_length": 12.0, "num_tokens": 1482636.0, "reward": 2.914167881011963, "reward_std": 0.0037112832069396973, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9691125750541687, "rewards/check_winston_local_func/std": 0.0586063414812088, "rewards/sentence_count_match_reward_logic/mean": 0.9450549483299255, "rewards/sentence_count_match_reward_logic/std": 0.13580535352230072, "step": 2373 }, { "clip_ratio": 0.019376475363969803, "epoch": 0.8306508047585724, "grad_norm": 0.8250431561062126, "kl": 1.8046875, "learning_rate": 7.265598103753533e-06, "loss": 0.0082, "step": 2374 }, { "clip_ratio": 0.030562879517674446, "epoch": 0.831000699790063, "grad_norm": 0.6417767948443652, "kl": 1.78125, "learning_rate": 7.262875508088805e-06, "loss": 0.0045, "step": 2375 }, { "clip_ratio": 0.04522400349378586, "epoch": 0.8313505948215535, "grad_norm": 0.5593206224657356, "kl": 1.765625, "learning_rate": 7.260152068392927e-06, "loss": 0.0016, "step": 2376 }, { "clip_ratio": 0.0074844700284302235, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.8317004898530441, "grad_norm": 1.7722968413954214, "kl": 2.1875, "learning_rate": 7.25742778568172e-06, "loss": 0.0214, "max_completion_length": 256.0, "max_terminated_completion_length": 211.0, "mean_completion_length": 117.28572082519531, "mean_terminated_completion_length": 94.16667175292969, "min_completion_length": 19.0, "min_terminated_completion_length": 19.0, "num_tokens": 1498324.0, "reward": 2.952713966369629, "reward_std": 0.01761510968208313, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9887714385986328, "rewards/check_winston_local_func/std": 0.005630460102111101, "rewards/sentence_count_match_reward_logic/mean": 0.9639423489570618, "rewards/sentence_count_match_reward_logic/std": 0.0837898775935173, "step": 2377 }, { "clip_ratio": 0.019865509122610092, "epoch": 0.8320503848845346, "grad_norm": 1.2383644768804338, "kl": 2.1875, "learning_rate": 7.254702660971313e-06, "loss": 0.0164, "step": 2378 }, { "clip_ratio": 0.04012542963027954, "epoch": 0.8324002799160252, "grad_norm": 0.7757756482147964, "kl": 2.203125, "learning_rate": 7.251976695278155e-06, "loss": 0.012, "step": 2379 }, { "clip_ratio": 0.05488245561718941, "epoch": 0.8327501749475158, "grad_norm": 0.6677427011981636, "kl": 2.203125, "learning_rate": 7.2492498896190015e-06, "loss": 0.0084, "step": 2380 }, { "clip_ratio": 0.00646232208237052, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.8331000699790063, "grad_norm": 2.4150560235460143, "kl": 3.28125, "learning_rate": 7.246522245010926e-06, "loss": 0.0125, "max_completion_length": 256.0, "max_terminated_completion_length": 239.0, "mean_completion_length": 116.01786041259766, "mean_terminated_completion_length": 92.6875, "min_completion_length": 15.0, "min_terminated_completion_length": 15.0, "num_tokens": 1513693.0, "reward": 2.940453290939331, "reward_std": 0.0060663470067083836, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9901981353759766, "rewards/check_winston_local_func/std": 0.0041389260441064835, "rewards/sentence_count_match_reward_logic/mean": 0.9502550959587097, "rewards/sentence_count_match_reward_logic/std": 0.12328199297189713, "step": 2381 }, { "clip_ratio": 0.020362932235002518, "epoch": 0.8334499650104968, "grad_norm": 1.590148888023329, "kl": 3.28125, "learning_rate": 7.243793762471316e-06, "loss": 0.0055, "step": 2382 }, { "clip_ratio": 0.04778437688946724, "epoch": 0.8337998600419874, "grad_norm": 0.8887151042991392, "kl": 3.28125, "learning_rate": 7.24106444301787e-06, "loss": 0.0008, "step": 2383 }, { "clip_ratio": 0.06936793774366379, "epoch": 0.834149755073478, "grad_norm": 0.7645827994653175, "kl": 3.28125, "learning_rate": 7.238334287668595e-06, "loss": -0.0026, "step": 2384 }, { "clip_ratio": 0.0038715102709829807, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.8344996501049685, "grad_norm": 1.0537169049688289, "kl": 1.9296875, "learning_rate": 7.235603297441817e-06, "loss": 0.0263, "max_completion_length": 256.0, "max_terminated_completion_length": 242.0, "mean_completion_length": 153.0357208251953, "mean_terminated_completion_length": 135.875, "min_completion_length": 28.0, "min_terminated_completion_length": 28.0, "num_tokens": 1532919.0, "reward": 2.9583423137664795, "reward_std": 0.008338485844433308, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9911864399909973, "rewards/check_winston_local_func/std": 0.0033770352602005005, "rewards/sentence_count_match_reward_logic/mean": 0.9671555757522583, "rewards/sentence_count_match_reward_logic/std": 0.07658171653747559, "step": 2385 }, { "clip_ratio": 0.008308122865855694, "epoch": 0.834849545136459, "grad_norm": 0.7438246119751981, "kl": 1.9296875, "learning_rate": 7.2328714733561655e-06, "loss": 0.0235, "step": 2386 }, { "clip_ratio": 0.019493645057082176, "epoch": 0.8351994401679497, "grad_norm": 0.6461321250200389, "kl": 1.921875, "learning_rate": 7.230138816430587e-06, "loss": 0.0207, "step": 2387 }, { "clip_ratio": 0.03147531673312187, "epoch": 0.8355493351994402, "grad_norm": 0.5299829274201268, "kl": 1.9296875, "learning_rate": 7.227405327684339e-06, "loss": 0.0175, "step": 2388 }, { "clip_ratio": 0.003242942038923502, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.8358992302309307, "grad_norm": 2.9162284370857083, "kl": 2.84375, "learning_rate": 7.224671008136983e-06, "loss": 0.03, "max_completion_length": 256.0, "max_terminated_completion_length": 213.0, "mean_completion_length": 117.41072082519531, "mean_terminated_completion_length": 94.3125, "min_completion_length": 10.0, "min_terminated_completion_length": 10.0, "num_tokens": 1548822.0, "reward": 2.9369149208068848, "reward_std": 0.009207399562001228, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9833435416221619, "rewards/check_winston_local_func/std": 0.012297362089157104, "rewards/sentence_count_match_reward_logic/mean": 0.9535714387893677, "rewards/sentence_count_match_reward_logic/std": 0.11506318300962448, "step": 2389 }, { "clip_ratio": 0.020959550514817238, "epoch": 0.8362491252624212, "grad_norm": 1.0317567166820525, "kl": 2.84375, "learning_rate": 7.221935858808398e-06, "loss": 0.0253, "step": 2390 }, { "clip_ratio": 0.028445236384868622, "epoch": 0.8365990202939119, "grad_norm": 0.676489945823779, "kl": 2.828125, "learning_rate": 7.219199880718763e-06, "loss": 0.0223, "step": 2391 }, { "clip_ratio": 0.04578724876046181, "epoch": 0.8369489153254024, "grad_norm": 0.5704324949852297, "kl": 2.828125, "learning_rate": 7.216463074888579e-06, "loss": 0.0188, "step": 2392 }, { "clip_ratio": 0.003042953321710229, "clipped_completions_ratio": 0.1607142857142857, "epoch": 0.8372988103568929, "grad_norm": 1.8032177129090365, "kl": 2.5625, "learning_rate": 7.213725442338645e-06, "loss": 0.0356, "max_completion_length": 256.0, "max_terminated_completion_length": 252.0, "mean_completion_length": 119.3214340209961, "mean_terminated_completion_length": 93.14893341064453, "min_completion_length": 21.0, "min_terminated_completion_length": 21.0, "num_tokens": 1564288.0, "reward": 2.9531970024108887, "reward_std": 0.018455028533935547, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.975518524646759, "rewards/check_winston_local_func/std": 0.04692678526043892, "rewards/sentence_count_match_reward_logic/mean": 0.9776785969734192, "rewards/sentence_count_match_reward_logic/std": 0.05616325885057449, "step": 2393 }, { "clip_ratio": 0.01142770517617464, "epoch": 0.8376487053883834, "grad_norm": 1.0625986425514613, "kl": 2.5625, "learning_rate": 7.210986984090073e-06, "loss": 0.0311, "step": 2394 }, { "clip_ratio": 0.03509416803717613, "epoch": 0.8379986004198741, "grad_norm": 0.7992599197610161, "kl": 2.625, "learning_rate": 7.2082477011642795e-06, "loss": 0.0276, "step": 2395 }, { "clip_ratio": 0.05761134997010231, "epoch": 0.8383484954513646, "grad_norm": 0.7228991452485116, "kl": 2.640625, "learning_rate": 7.205507594582994e-06, "loss": 0.0253, "step": 2396 }, { "clip_ratio": 0.004731528460979462, "clipped_completions_ratio": 0.1428571428571429, "epoch": 0.8386983904828551, "grad_norm": 2.037613887528295, "kl": 3.046875, "learning_rate": 7.20276666536825e-06, "loss": 0.0362, "max_completion_length": 256.0, "max_terminated_completion_length": 168.0, "mean_completion_length": 99.46428680419922, "mean_terminated_completion_length": 73.375, "min_completion_length": 17.0, "min_terminated_completion_length": 17.0, "num_tokens": 1577162.0, "reward": 2.9759469032287598, "reward_std": 0.007454239763319492, "rewards/check_gptzero_func/mean": 1.0, "rewards/check_gptzero_func/std": 0.0, "rewards/check_winston_local_func/mean": 0.9898356199264526, "rewards/check_winston_local_func/std": 0.003587164916098118, "rewards/sentence_count_match_reward_logic/mean": 0.9861111044883728, "rewards/sentence_count_match_reward_logic/std": 0.0370790958404541, "step": 2397 }, { "clip_ratio": 0.017875779420137405, "epoch": 0.8390482855143457, "grad_norm": 1.5856442828356647, "kl": 3.046875, "learning_rate": 7.2000249145423874e-06, "loss": 0.0311, "step": 2398 }, { "clip_ratio": 0.05487055703997612, "epoch": 0.8393981805458363, "grad_norm": 0.9970027445385021, "kl": 3.046875, "learning_rate": 7.197282343128053e-06, "loss": 0.0258, "step": 2399 }, { "clip_ratio": 0.07321655005216599, "epoch": 0.8397480755773268, "grad_norm": 0.7574321907866123, "kl": 3.0625, "learning_rate": 7.194538952148201e-06, "loss": 0.0231, "step": 2400 } ], "logging_steps": 1, "max_steps": 5716, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }