{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9974554707379135, "eval_steps": 100, "global_step": 884, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 239.5625, "completions/mean_terminated_length": 134.61538696289062, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.0033927056827820186, "frac_reward_zero_std": 0.0, "grad_norm": 2.6258533000946045, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0193, "num_tokens": 279396.0, "reward": 4.808608055114746, "reward_std": 3.558130979537964, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.7557047009468079, "rewards/format_reward/mean": 0.1458333283662796, "rewards/format_reward/std": 0.3538617491722107, "rewards/judge_reward/mean": 1.8260416984558105, "rewards/judge_reward/std": 1.7488212585449219, "rewards/ngrams_iou_reward/mean": 0.03003612719476223, "rewards/ngrams_iou_reward/std": 0.07492002099752426, "rewards/schema_keywords_iou_reward/mean": 0.20461322367191315, "rewards/schema_keywords_iou_reward/std": 0.25545889139175415, "rewards/syntax_reward/mean": 0.1666666716337204, "rewards/syntax_reward/std": 0.37365230917930603, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 241.703125, "completions/mean_terminated_length": 146.1999969482422, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.006785411365564037, "frac_reward_zero_std": 0.0, "grad_norm": 1.8498804569244385, "kl": 0.0, "learning_rate": 1.1235955056179774e-08, "loss": -0.0183, "num_tokens": 531381.0, "reward": 4.623265743255615, "reward_std": 3.7126784324645996, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.8981601595878601, "rewards/format_reward/mean": 0.1145833358168602, "rewards/format_reward/std": 0.3193511664867401, "rewards/judge_reward/mean": 1.6114583015441895, "rewards/judge_reward/std": 1.7774839401245117, "rewards/ngrams_iou_reward/mean": 0.0392448715865612, "rewards/ngrams_iou_reward/std": 0.11049783229827881, "rewards/schema_keywords_iou_reward/mean": 0.19964562356472015, "rewards/schema_keywords_iou_reward/std": 0.24097704887390137, "rewards/syntax_reward/mean": 0.15625, "rewards/syntax_reward/std": 0.36404144763946533, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 248.16146850585938, "completions/mean_terminated_length": 180.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.010178117048346057, "frac_reward_zero_std": 0.0, "grad_norm": 3.0134685039520264, "kl": 0.00020360946655273438, "learning_rate": 2.2471910112359548e-08, "loss": -0.0076, "num_tokens": 790840.0, "reward": 4.787107467651367, "reward_std": 3.477874279022217, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.6683279871940613, "rewards/format_reward/mean": 0.1354166716337204, "rewards/format_reward/std": 0.3430626094341278, "rewards/judge_reward/mean": 1.910416603088379, "rewards/judge_reward/std": 1.803166389465332, "rewards/ngrams_iou_reward/mean": 0.031192995607852936, "rewards/ngrams_iou_reward/std": 0.09558312594890594, "rewards/schema_keywords_iou_reward/mean": 0.17466449737548828, "rewards/schema_keywords_iou_reward/std": 0.21793729066848755, "rewards/syntax_reward/mean": 0.15625, "rewards/syntax_reward/std": 0.36404144763946533, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 242.45834350585938, "completions/mean_terminated_length": 142.95652770996094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.013570822731128074, "frac_reward_zero_std": 0.0, "grad_norm": 2.030182123184204, "kl": 0.00024056434631347656, "learning_rate": 3.370786516853932e-08, "loss": -0.0091, "num_tokens": 1069394.0, "reward": 4.959733009338379, "reward_std": 3.6748435497283936, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.7280828952789307, "rewards/format_reward/mean": 0.1510416716337204, "rewards/format_reward/std": 0.35902565717697144, "rewards/judge_reward/mean": 1.9333332777023315, "rewards/judge_reward/std": 1.8475208282470703, "rewards/ngrams_iou_reward/mean": 0.03470459207892418, "rewards/ngrams_iou_reward/std": 0.10400296747684479, "rewards/schema_keywords_iou_reward/mean": 0.19377803802490234, "rewards/schema_keywords_iou_reward/std": 0.25607648491859436, "rewards/syntax_reward/mean": 0.1510416716337204, "rewards/syntax_reward/std": 0.35902565717697144, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 243.2135467529297, "completions/mean_terminated_length": 149.26087951660156, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.016963528413910092, "frac_reward_zero_std": 0.0, "grad_norm": 3.7111551761627197, "kl": 0.00019121170043945312, "learning_rate": 4.4943820224719096e-08, "loss": -0.0064, "num_tokens": 1319353.0, "reward": 5.287733554840088, "reward_std": 3.690511465072632, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.7557047009468079, "rewards/format_reward/mean": 0.1458333283662796, "rewards/format_reward/std": 0.3538617491722107, "rewards/judge_reward/mean": 2.0687501430511475, "rewards/judge_reward/std": 1.7685626745224, "rewards/ngrams_iou_reward/mean": 0.025088896974921227, "rewards/ngrams_iou_reward/std": 0.057423003017902374, "rewards/schema_keywords_iou_reward/mean": 0.20847749710083008, "rewards/schema_keywords_iou_reward/std": 0.22526420652866364, "rewards/syntax_reward/mean": 0.1614583283662796, "rewards/syntax_reward/std": 0.3689151406288147, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 240.921875, "completions/mean_terminated_length": 130.13043212890625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.020356234096692113, "frac_reward_zero_std": 0.0, "grad_norm": 3.0095958709716797, "kl": 0.00021982192993164062, "learning_rate": 5.617977528089887e-08, "loss": -0.0079, "num_tokens": 1587316.0, "reward": 5.221112251281738, "reward_std": 3.7031478881835938, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.8767278790473938, "rewards/format_reward/mean": 0.1510416716337204, "rewards/format_reward/std": 0.35902562737464905, "rewards/judge_reward/mean": 1.933333396911621, "rewards/judge_reward/std": 1.8039321899414062, "rewards/ngrams_iou_reward/mean": 0.0293264240026474, "rewards/ngrams_iou_reward/std": 0.08379238843917847, "rewards/schema_keywords_iou_reward/mean": 0.20532716810703278, "rewards/schema_keywords_iou_reward/std": 0.24210377037525177, "rewards/syntax_reward/mean": 0.125, "rewards/syntax_reward/std": 0.33158352971076965, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 245.59375, "completions/mean_terminated_length": 172.75, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.02374893977947413, "frac_reward_zero_std": 0.0, "grad_norm": 3.3725264072418213, "kl": 0.00020360946655273438, "learning_rate": 6.741573033707864e-08, "loss": -0.0163, "num_tokens": 1860274.0, "reward": 4.919992446899414, "reward_std": 3.605278968811035, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.5637529492378235, "rewards/format_reward/mean": 0.0989583358168602, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 2.0843751430511475, "rewards/judge_reward/std": 1.758558988571167, "rewards/ngrams_iou_reward/mean": 0.028877362608909607, "rewards/ngrams_iou_reward/std": 0.06718809902667999, "rewards/schema_keywords_iou_reward/mean": 0.1911150962114334, "rewards/schema_keywords_iou_reward/std": 0.22258354723453522, "rewards/syntax_reward/mean": 0.1041666641831398, "rewards/syntax_reward/std": 0.30627527832984924, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 236.52084350585938, "completions/mean_terminated_length": 146.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.02714164546225615, "frac_reward_zero_std": 0.0, "grad_norm": 5.23004150390625, "kl": 0.000244140625, "learning_rate": 7.865168539325842e-08, "loss": -0.0538, "num_tokens": 2127818.0, "reward": 4.486685752868652, "reward_std": 3.3138177394866943, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.8072093725204468, "rewards/format_reward/mean": 0.1354166716337204, "rewards/format_reward/std": 0.3430626094341278, "rewards/judge_reward/mean": 1.6124998331069946, "rewards/judge_reward/std": 1.7803611755371094, "rewards/ngrams_iou_reward/mean": 0.039094310253858566, "rewards/ngrams_iou_reward/std": 0.09878856688737869, "rewards/schema_keywords_iou_reward/mean": 0.20696599781513214, "rewards/schema_keywords_iou_reward/std": 0.26156535744667053, "rewards/syntax_reward/mean": 0.1770833283662796, "rewards/syntax_reward/std": 0.3827372193336487, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 242.64584350585938, "completions/mean_terminated_length": 157.38462829589844, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.030534351145038167, "frac_reward_zero_std": 0.0, "grad_norm": 1.8836013078689575, "kl": 0.00022649765014648438, "learning_rate": 8.988764044943819e-08, "loss": -0.0071, "num_tokens": 2413002.0, "reward": 4.668792724609375, "reward_std": 3.804647445678711, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.8072093725204468, "rewards/format_reward/mean": 0.1458333283662796, "rewards/format_reward/std": 0.3538617491722107, "rewards/judge_reward/mean": 1.712499976158142, "rewards/judge_reward/std": 1.7534557580947876, "rewards/ngrams_iou_reward/mean": 0.049696553498506546, "rewards/ngrams_iou_reward/std": 0.1393377184867859, "rewards/schema_keywords_iou_reward/mean": 0.20972098410129547, "rewards/schema_keywords_iou_reward/std": 0.2634650468826294, "rewards/syntax_reward/mean": 0.1354166716337204, "rewards/syntax_reward/std": 0.3430626094341278, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 249.046875, "completions/mean_terminated_length": 177.47059631347656, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.033927056827820185, "frac_reward_zero_std": 0.0, "grad_norm": 5.232537269592285, "kl": 0.0002219676971435547, "learning_rate": 1.0112359550561797e-07, "loss": 0.0068, "num_tokens": 2642133.0, "reward": 4.092692852020264, "reward_std": 3.6329073905944824, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.4295985996723175, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.33158352971076965, "rewards/judge_reward/mean": 1.728124976158142, "rewards/judge_reward/std": 1.763159990310669, "rewards/ngrams_iou_reward/mean": 0.02655528299510479, "rewards/ngrams_iou_reward/std": 0.06723800301551819, "rewards/schema_keywords_iou_reward/mean": 0.1880124807357788, "rewards/schema_keywords_iou_reward/std": 0.2192741334438324, "rewards/syntax_reward/mean": 0.109375, "rewards/syntax_reward/std": 0.3129251003265381, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 238.31771850585938, "completions/mean_terminated_length": 146.48387145996094, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.037319762510602206, "frac_reward_zero_std": 0.0, "grad_norm": 3.9386889934539795, "kl": 0.00024390220642089844, "learning_rate": 1.1235955056179774e-07, "loss": -0.0146, "num_tokens": 2921242.0, "reward": 4.794195175170898, "reward_std": 3.8790931701660156, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.8767278790473938, "rewards/format_reward/mean": 0.1614583283662796, "rewards/format_reward/std": 0.3689151406288147, "rewards/judge_reward/mean": 1.673958420753479, "rewards/judge_reward/std": 1.7350555658340454, "rewards/ngrams_iou_reward/mean": 0.05927819013595581, "rewards/ngrams_iou_reward/std": 0.15375958383083344, "rewards/schema_keywords_iou_reward/mean": 0.2255418300628662, "rewards/schema_keywords_iou_reward/std": 0.2794684171676636, "rewards/syntax_reward/mean": 0.15625, "rewards/syntax_reward/std": 0.36404144763946533, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 243.984375, "completions/mean_terminated_length": 146.14285278320312, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.04071246819338423, "frac_reward_zero_std": 0.0, "grad_norm": 2.0821144580841064, "kl": 0.0003132820129394531, "learning_rate": 1.2359550561797752e-07, "loss": -0.0069, "num_tokens": 3193681.0, "reward": 4.270644187927246, "reward_std": 2.9964776039123535, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.7557047009468079, "rewards/format_reward/mean": 0.1197916641831398, "rewards/format_reward/std": 0.32556667923927307, "rewards/judge_reward/mean": 1.5833333730697632, "rewards/judge_reward/std": 1.7504324913024902, "rewards/ngrams_iou_reward/mean": 0.027024010196328163, "rewards/ngrams_iou_reward/std": 0.0768943727016449, "rewards/schema_keywords_iou_reward/mean": 0.20716120302677155, "rewards/schema_keywords_iou_reward/std": 0.23764140903949738, "rewards/syntax_reward/mean": 0.140625, "rewards/syntax_reward/std": 0.3485431373119354, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 251.67709350585938, "completions/mean_terminated_length": 180.5454559326172, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.04410517387616624, "frac_reward_zero_std": 0.0, "grad_norm": 2.7832579612731934, "kl": 0.00026416778564453125, "learning_rate": 1.3483146067415728e-07, "loss": 0.0076, "num_tokens": 3457361.0, "reward": 4.037741661071777, "reward_std": 3.239654541015625, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.5637529492378235, "rewards/format_reward/mean": 0.0885416641831398, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.6651042699813843, "rewards/judge_reward/std": 1.7355477809906006, "rewards/ngrams_iou_reward/mean": 0.023800158873200417, "rewards/ngrams_iou_reward/std": 0.06359826773405075, "rewards/schema_keywords_iou_reward/mean": 0.1785241961479187, "rewards/schema_keywords_iou_reward/std": 0.22443684935569763, "rewards/syntax_reward/mean": 0.0885416641831398, "rewards/syntax_reward/std": 0.2848237454891205, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 244.03646850585938, "completions/mean_terminated_length": 160.2916717529297, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.04749787955894826, "frac_reward_zero_std": 0.0, "grad_norm": 4.811630725860596, "kl": 0.000423431396484375, "learning_rate": 1.4606741573033706e-07, "loss": -0.0115, "num_tokens": 3722976.0, "reward": 4.029069900512695, "reward_std": 3.4708640575408936, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.6010462045669556, "rewards/format_reward/mean": 0.0885416641831398, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.6020832061767578, "rewards/judge_reward/std": 1.7514379024505615, "rewards/ngrams_iou_reward/mean": 0.02806633710861206, "rewards/ngrams_iou_reward/std": 0.07760611921548843, "rewards/schema_keywords_iou_reward/mean": 0.19267039000988007, "rewards/schema_keywords_iou_reward/std": 0.23620323836803436, "rewards/syntax_reward/mean": 0.140625, "rewards/syntax_reward/std": 0.3485431373119354, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 248.0885467529297, "completions/mean_terminated_length": 154.73333740234375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.05089058524173028, "frac_reward_zero_std": 0.0, "grad_norm": 2.6045093536376953, "kl": 0.0004992485046386719, "learning_rate": 1.5730337078651685e-07, "loss": -0.0198, "num_tokens": 3997649.0, "reward": 5.2566962242126465, "reward_std": 3.3614022731781006, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.8544712066650391, "rewards/format_reward/mean": 0.1770833283662796, "rewards/format_reward/std": 0.3827372193336487, "rewards/judge_reward/mean": 1.9052082300186157, "rewards/judge_reward/std": 1.766818642616272, "rewards/ngrams_iou_reward/mean": 0.041785791516304016, "rewards/ngrams_iou_reward/std": 0.09388843923807144, "rewards/schema_keywords_iou_reward/mean": 0.248244047164917, "rewards/schema_keywords_iou_reward/std": 0.27186155319213867, "rewards/syntax_reward/mean": 0.1822916716337204, "rewards/syntax_reward/std": 0.38709405064582825, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 237.859375, "completions/mean_terminated_length": 159.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.0542832909245123, "frac_reward_zero_std": 0.0, "grad_norm": 5.922573566436768, "kl": 0.0004954338073730469, "learning_rate": 1.6853932584269663e-07, "loss": -0.0343, "num_tokens": 4264808.0, "reward": 4.945152282714844, "reward_std": 3.415262222290039, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.958053469657898, "rewards/format_reward/mean": 0.1666666716337204, "rewards/format_reward/std": 0.37365230917930603, "rewards/judge_reward/mean": 1.6416667699813843, "rewards/judge_reward/std": 1.7594422101974487, "rewards/ngrams_iou_reward/mean": 0.03926548734307289, "rewards/ngrams_iou_reward/std": 0.09769777208566666, "rewards/schema_keywords_iou_reward/mean": 0.24755370616912842, "rewards/schema_keywords_iou_reward/std": 0.2645433247089386, "rewards/syntax_reward/mean": 0.1770833283662796, "rewards/syntax_reward/std": 0.3827372193336487, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 243.5260467529297, "completions/mean_terminated_length": 160.1999969482422, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.05767599660729432, "frac_reward_zero_std": 0.0, "grad_norm": 3.0790977478027344, "kl": 0.0007104873657226562, "learning_rate": 1.7977528089887638e-07, "loss": -0.0136, "num_tokens": 4512841.0, "reward": 4.909486770629883, "reward_std": 3.4259650707244873, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.7557047009468079, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.33158352971076965, "rewards/judge_reward/mean": 1.8854166269302368, "rewards/judge_reward/std": 1.7438585758209229, "rewards/ngrams_iou_reward/mean": 0.03178558871150017, "rewards/ngrams_iou_reward/std": 0.0757630318403244, "rewards/schema_keywords_iou_reward/mean": 0.23707592487335205, "rewards/schema_keywords_iou_reward/std": 0.2642539143562317, "rewards/syntax_reward/mean": 0.1354166716337204, "rewards/syntax_reward/std": 0.3430626094341278, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 242.5104217529297, "completions/mean_terminated_length": 156.38462829589844, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.061068702290076333, "frac_reward_zero_std": 0.0, "grad_norm": 4.350247859954834, "kl": 0.0007047653198242188, "learning_rate": 1.9101123595505617e-07, "loss": -0.0258, "num_tokens": 4774611.0, "reward": 6.0376996994018555, "reward_std": 3.8785853385925293, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.9767000675201416, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.3913327753543854, "rewards/judge_reward/mean": 2.1625001430511475, "rewards/judge_reward/std": 1.864366888999939, "rewards/ngrams_iou_reward/mean": 0.03311467170715332, "rewards/ngrams_iou_reward/std": 0.0798945426940918, "rewards/schema_keywords_iou_reward/mean": 0.21604229509830475, "rewards/schema_keywords_iou_reward/std": 0.24712729454040527, "rewards/syntax_reward/mean": 0.1979166716337204, "rewards/syntax_reward/std": 0.39947062730789185, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 247.69271850585938, "completions/mean_terminated_length": 156.3125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.06446140797285835, "frac_reward_zero_std": 0.0, "grad_norm": 6.6695427894592285, "kl": 0.0008945465087890625, "learning_rate": 2.0224719101123595e-07, "loss": -0.0152, "num_tokens": 5027968.0, "reward": 5.189974784851074, "reward_std": 3.5487802028656006, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.8981601595878601, "rewards/format_reward/mean": 0.1458333283662796, "rewards/format_reward/std": 0.3538617789745331, "rewards/judge_reward/mean": 1.8531250953674316, "rewards/judge_reward/std": 1.8091942071914673, "rewards/ngrams_iou_reward/mean": 0.04204200208187103, "rewards/ngrams_iou_reward/std": 0.09972143173217773, "rewards/schema_keywords_iou_reward/mean": 0.23334890604019165, "rewards/schema_keywords_iou_reward/std": 0.2563181519508362, "rewards/syntax_reward/mean": 0.171875, "rewards/syntax_reward/std": 0.37825807929039, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 237.4635467529297, "completions/mean_terminated_length": 141.19354248046875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.06785411365564037, "frac_reward_zero_std": 0.0, "grad_norm": 6.660654544830322, "kl": 0.0009889602661132812, "learning_rate": 2.134831460674157e-07, "loss": 0.004, "num_tokens": 5303613.0, "reward": 4.527315139770508, "reward_std": 3.4544482231140137, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.6010462045669556, "rewards/format_reward/mean": 0.0989583358168602, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.850000023841858, "rewards/judge_reward/std": 1.753142237663269, "rewards/ngrams_iou_reward/mean": 0.04058518633246422, "rewards/ngrams_iou_reward/std": 0.11791524291038513, "rewards/schema_keywords_iou_reward/mean": 0.20860464870929718, "rewards/schema_keywords_iou_reward/std": 0.24490278959274292, "rewards/syntax_reward/mean": 0.1041666641831398, "rewards/syntax_reward/std": 0.30627524852752686, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 243.89584350585938, "completions/mean_terminated_length": 163.0399932861328, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.07124681933842239, "frac_reward_zero_std": 0.0, "grad_norm": 4.49359130859375, "kl": 0.0011539459228515625, "learning_rate": 2.2471910112359549e-07, "loss": -0.0119, "num_tokens": 5566399.0, "reward": 4.754743576049805, "reward_std": 3.48162841796875, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.6990199685096741, "rewards/format_reward/mean": 0.1510416716337204, "rewards/format_reward/std": 0.35902565717697144, "rewards/judge_reward/mean": 1.8093751668930054, "rewards/judge_reward/std": 1.732146143913269, "rewards/ngrams_iou_reward/mean": 0.03396293893456459, "rewards/ngrams_iou_reward/std": 0.06266939640045166, "rewards/schema_keywords_iou_reward/mean": 0.24265557527542114, "rewards/schema_keywords_iou_reward/std": 0.240478515625, "rewards/syntax_reward/mean": 0.1927083283662796, "rewards/syntax_reward/std": 0.39545711874961853, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 238.7760467529297, "completions/mean_terminated_length": 145.7666778564453, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.07463952502120441, "frac_reward_zero_std": 0.0, "grad_norm": 3.5102009773254395, "kl": 0.0012989044189453125, "learning_rate": 2.3595505617977527e-07, "loss": -0.025, "num_tokens": 5812134.0, "reward": 5.555011749267578, "reward_std": 3.473635673522949, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.7820382118225098, "rewards/format_reward/mean": 0.2083333283662796, "rewards/format_reward/std": 0.40717819333076477, "rewards/judge_reward/mean": 2.049999952316284, "rewards/judge_reward/std": 1.7363377809524536, "rewards/ngrams_iou_reward/mean": 0.06368248909711838, "rewards/ngrams_iou_reward/std": 0.12638692557811737, "rewards/schema_keywords_iou_reward/mean": 0.313203901052475, "rewards/schema_keywords_iou_reward/std": 0.28440243005752563, "rewards/syntax_reward/mean": 0.2135416716337204, "rewards/syntax_reward/std": 0.4108782112598419, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 242.40625, "completions/mean_terminated_length": 179.23529052734375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.07803223070398643, "frac_reward_zero_std": 0.0, "grad_norm": 6.334768295288086, "kl": 0.00156402587890625, "learning_rate": 2.4719101123595505e-07, "loss": 0.026, "num_tokens": 6090984.0, "reward": 5.209197998046875, "reward_std": 3.2332773208618164, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.8072093725204468, "rewards/format_reward/mean": 0.171875, "rewards/format_reward/std": 0.37825807929039, "rewards/judge_reward/mean": 1.896875023841858, "rewards/judge_reward/std": 1.6750683784484863, "rewards/ngrams_iou_reward/mean": 0.03921046108007431, "rewards/ngrams_iou_reward/std": 0.06599873304367065, "rewards/schema_keywords_iou_reward/mean": 0.2876957654953003, "rewards/schema_keywords_iou_reward/std": 0.25443679094314575, "rewards/syntax_reward/mean": 0.2135416716337204, "rewards/syntax_reward/std": 0.4108782112598419, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 241.7291717529297, "completions/mean_terminated_length": 177.7142791748047, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.08142493638676845, "frac_reward_zero_std": 0.0, "grad_norm": 4.571689605712891, "kl": 0.002239227294921875, "learning_rate": 2.5842696629213486e-07, "loss": 0.0108, "num_tokens": 6364718.0, "reward": 4.828306198120117, "reward_std": 3.3450229167938232, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.958053469657898, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.3913327753543854, "rewards/judge_reward/mean": 1.5385417938232422, "rewards/judge_reward/std": 1.680243730545044, "rewards/ngrams_iou_reward/mean": 0.05813319981098175, "rewards/ngrams_iou_reward/std": 0.13391250371932983, "rewards/schema_keywords_iou_reward/mean": 0.2816310226917267, "rewards/schema_keywords_iou_reward/std": 0.2676549553871155, "rewards/syntax_reward/mean": 0.1927083283662796, "rewards/syntax_reward/std": 0.39545711874961853, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 241.41146850585938, "completions/mean_terminated_length": 159.41378784179688, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.08481764206955046, "frac_reward_zero_std": 0.0, "grad_norm": 12.376133918762207, "kl": 0.002124786376953125, "learning_rate": 2.6966292134831456e-07, "loss": 0.001, "num_tokens": 6615981.0, "reward": 5.529187202453613, "reward_std": 3.2030856609344482, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 1.1209568977355957, "rewards/format_reward/mean": 0.1979166716337204, "rewards/format_reward/std": 0.39947062730789185, "rewards/judge_reward/mean": 1.589583396911621, "rewards/judge_reward/std": 1.7192156314849854, "rewards/ngrams_iou_reward/mean": 0.06355898827314377, "rewards/ngrams_iou_reward/std": 0.1365286260843277, "rewards/schema_keywords_iou_reward/mean": 0.3229195773601532, "rewards/schema_keywords_iou_reward/std": 0.28018543124198914, "rewards/syntax_reward/mean": 0.265625, "rewards/syntax_reward/std": 0.44282010197639465, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 249.75, "completions/mean_terminated_length": 192.84210205078125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.08821034775233248, "frac_reward_zero_std": 0.0, "grad_norm": 3.6654391288757324, "kl": 0.00182342529296875, "learning_rate": 2.8089887640449437e-07, "loss": 0.0004, "num_tokens": 6871749.0, "reward": 5.727860927581787, "reward_std": 3.4160869121551514, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.9767000675201416, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.3913327753543854, "rewards/judge_reward/mean": 1.9041666984558105, "rewards/judge_reward/std": 1.7440348863601685, "rewards/ngrams_iou_reward/mean": 0.055372532457113266, "rewards/ngrams_iou_reward/std": 0.11257503181695938, "rewards/schema_keywords_iou_reward/mean": 0.31727948784828186, "rewards/schema_keywords_iou_reward/std": 0.2826869487762451, "rewards/syntax_reward/mean": 0.28125, "rewards/syntax_reward/std": 0.4507846534252167, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.671875, "completions/mean_terminated_length": 175.61538696289062, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0916030534351145, "frac_reward_zero_std": 0.0, "grad_norm": 1.3822600841522217, "kl": 0.002025604248046875, "learning_rate": 2.921348314606741e-07, "loss": 0.0154, "num_tokens": 7132446.0, "reward": 5.298697471618652, "reward_std": 3.1114673614501953, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.8767278790473938, "rewards/format_reward/mean": 0.15625, "rewards/format_reward/std": 0.36404144763946533, "rewards/judge_reward/mean": 1.869270920753479, "rewards/judge_reward/std": 1.6879751682281494, "rewards/ngrams_iou_reward/mean": 0.052138570696115494, "rewards/ngrams_iou_reward/std": 0.12940584123134613, "rewards/schema_keywords_iou_reward/mean": 0.2736416161060333, "rewards/schema_keywords_iou_reward/std": 0.25442636013031006, "rewards/syntax_reward/mean": 0.234375, "rewards/syntax_reward/std": 0.4247150123119354, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 239.25521850585938, "completions/mean_terminated_length": 136.92593383789062, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.09499575911789652, "frac_reward_zero_std": 0.0, "grad_norm": 1.9841241836547852, "kl": 0.002349853515625, "learning_rate": 3.0337078651685393e-07, "loss": 0.015, "num_tokens": 7398415.0, "reward": 5.023322582244873, "reward_std": 3.614788055419922, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.8981601595878601, "rewards/format_reward/mean": 0.1770833283662796, "rewards/format_reward/std": 0.3827372193336487, "rewards/judge_reward/mean": 1.7239583730697632, "rewards/judge_reward/std": 1.7244313955307007, "rewards/ngrams_iou_reward/mean": 0.045199137181043625, "rewards/ngrams_iou_reward/std": 0.08408678323030472, "rewards/schema_keywords_iou_reward/mean": 0.30103951692581177, "rewards/schema_keywords_iou_reward/std": 0.24807311594486237, "rewards/syntax_reward/mean": 0.1614583283662796, "rewards/syntax_reward/std": 0.3689151406288147, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 241.5416717529297, "completions/mean_terminated_length": 169.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.09838846480067855, "frac_reward_zero_std": 0.0, "grad_norm": 2.857107400894165, "kl": 0.002117156982421875, "learning_rate": 3.146067415730337e-07, "loss": -0.0363, "num_tokens": 7691463.0, "reward": 5.37518310546875, "reward_std": 3.6316792964935303, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.958053469657898, "rewards/format_reward/mean": 0.234375, "rewards/format_reward/std": 0.4247150123119354, "rewards/judge_reward/mean": 1.7708333730697632, "rewards/judge_reward/std": 1.718023657798767, "rewards/ngrams_iou_reward/mean": 0.03884166479110718, "rewards/ngrams_iou_reward/std": 0.07367770373821259, "rewards/schema_keywords_iou_reward/mean": 0.299882709980011, "rewards/schema_keywords_iou_reward/std": 0.2542040944099426, "rewards/syntax_reward/mean": 0.2291666716337204, "rewards/syntax_reward/std": 0.421395480632782, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 240.546875, "completions/mean_terminated_length": 157.10000610351562, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.10178117048346055, "frac_reward_zero_std": 0.0, "grad_norm": 3.469900131225586, "kl": 0.002712249755859375, "learning_rate": 3.258426966292135e-07, "loss": 0.0055, "num_tokens": 7931922.0, "reward": 5.311439037322998, "reward_std": 3.2756271362304688, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.8981601595878601, "rewards/format_reward/mean": 0.2135416716337204, "rewards/format_reward/std": 0.4108782112598419, "rewards/judge_reward/mean": 1.7885417938232422, "rewards/judge_reward/std": 1.6772184371948242, "rewards/ngrams_iou_reward/mean": 0.06504596024751663, "rewards/ngrams_iou_reward/std": 0.13478770852088928, "rewards/schema_keywords_iou_reward/mean": 0.3359760344028473, "rewards/schema_keywords_iou_reward/std": 0.28025805950164795, "rewards/syntax_reward/mean": 0.2291666716337204, "rewards/syntax_reward/std": 0.421395480632782, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 242.78125, "completions/mean_terminated_length": 176.6875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.10517387616624257, "frac_reward_zero_std": 0.0, "grad_norm": 1.8416393995285034, "kl": 0.002857208251953125, "learning_rate": 3.3707865168539325e-07, "loss": 0.011, "num_tokens": 8198784.0, "reward": 6.647122859954834, "reward_std": 3.061551094055176, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.9947506189346313, "rewards/format_reward/mean": 0.2395833283662796, "rewards/format_reward/std": 0.4279450476169586, "rewards/judge_reward/mean": 2.3187499046325684, "rewards/judge_reward/std": 1.6981858015060425, "rewards/ngrams_iou_reward/mean": 0.055537402629852295, "rewards/ngrams_iou_reward/std": 0.11917069554328918, "rewards/schema_keywords_iou_reward/mean": 0.3186679184436798, "rewards/schema_keywords_iou_reward/std": 0.2797251045703888, "rewards/syntax_reward/mean": 0.2708333432674408, "rewards/syntax_reward/std": 0.44555196166038513, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 246.92709350585938, "completions/mean_terminated_length": 176.8181915283203, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.1085665818490246, "frac_reward_zero_std": 0.0, "grad_norm": 3.963526964187622, "kl": 0.00357818603515625, "learning_rate": 3.48314606741573e-07, "loss": 0.0109, "num_tokens": 8482876.0, "reward": 5.221710205078125, "reward_std": 3.5367801189422607, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.9188257455825806, "rewards/format_reward/mean": 0.171875, "rewards/format_reward/std": 0.37825807929039, "rewards/judge_reward/mean": 1.7427083253860474, "rewards/judge_reward/std": 1.69008469581604, "rewards/ngrams_iou_reward/mean": 0.054343920201063156, "rewards/ngrams_iou_reward/std": 0.0924958661198616, "rewards/schema_keywords_iou_reward/mean": 0.332990825176239, "rewards/schema_keywords_iou_reward/std": 0.26766982674598694, "rewards/syntax_reward/mean": 0.2395833283662796, "rewards/syntax_reward/std": 0.4279450476169586, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.4375, "completions/mean_terminated_length": 182.42105102539062, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.11195928753180662, "frac_reward_zero_std": 0.0, "grad_norm": 1.118076205253601, "kl": 0.0025730133056640625, "learning_rate": 3.5955056179775277e-07, "loss": 0.0115, "num_tokens": 8757034.0, "reward": 5.874207973480225, "reward_std": 2.910546064376831, "rewards/accuracy_reward/mean": 0.40625, "rewards/accuracy_reward/std": 1.0291879177093506, "rewards/format_reward/mean": 0.2395833283662796, "rewards/format_reward/std": 0.427945077419281, "rewards/judge_reward/mean": 1.9052084684371948, "rewards/judge_reward/std": 1.7290780544281006, "rewards/ngrams_iou_reward/mean": 0.05059130862355232, "rewards/ngrams_iou_reward/std": 0.09847215563058853, "rewards/schema_keywords_iou_reward/mean": 0.33611658215522766, "rewards/schema_keywords_iou_reward/std": 0.2709704637527466, "rewards/syntax_reward/mean": 0.21875, "rewards/syntax_reward/std": 0.41447943449020386, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 246.109375, "completions/mean_terminated_length": 192.70001220703125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.11535199321458864, "frac_reward_zero_std": 0.0, "grad_norm": 2.2683603763580322, "kl": 0.00551605224609375, "learning_rate": 3.707865168539326e-07, "loss": -0.0007, "num_tokens": 9018907.0, "reward": 5.399112701416016, "reward_std": 3.2441365718841553, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.9387753009796143, "rewards/format_reward/mean": 0.1979166716337204, "rewards/format_reward/std": 0.39947062730789185, "rewards/judge_reward/mean": 1.7999998331069946, "rewards/judge_reward/std": 1.7787576913833618, "rewards/ngrams_iou_reward/mean": 0.04596255347132683, "rewards/ngrams_iou_reward/std": 0.09404848515987396, "rewards/schema_keywords_iou_reward/mean": 0.3208580017089844, "rewards/schema_keywords_iou_reward/std": 0.2563953995704651, "rewards/syntax_reward/mean": 0.25, "rewards/syntax_reward/std": 0.4341447353363037, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 246.27084350585938, "completions/mean_terminated_length": 178.1666717529297, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.11874469889737066, "frac_reward_zero_std": 0.0, "grad_norm": 1.1904125213623047, "kl": 0.003383636474609375, "learning_rate": 3.8202247191011233e-07, "loss": -0.0042, "num_tokens": 9282179.0, "reward": 6.032573223114014, "reward_std": 3.424551010131836, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 1.1482117176055908, "rewards/format_reward/mean": 0.1875, "rewards/format_reward/std": 0.3913327753543854, "rewards/judge_reward/mean": 1.7916666269302368, "rewards/judge_reward/std": 1.7555099725723267, "rewards/ngrams_iou_reward/mean": 0.07395479083061218, "rewards/ngrams_iou_reward/std": 0.1451846808195114, "rewards/schema_keywords_iou_reward/mean": 0.3284096419811249, "rewards/schema_keywords_iou_reward/std": 0.28021594882011414, "rewards/syntax_reward/mean": 0.265625, "rewards/syntax_reward/std": 0.44282010197639465, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 239.0625, "completions/mean_terminated_length": 163.08570861816406, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.12213740458015267, "frac_reward_zero_std": 0.0, "grad_norm": 1.4173794984817505, "kl": 0.0029296875, "learning_rate": 3.9325842696629214e-07, "loss": -0.0062, "num_tokens": 9569327.0, "reward": 5.773731231689453, "reward_std": 3.0757431983947754, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.9947506189346313, "rewards/format_reward/mean": 0.2083333283662796, "rewards/format_reward/std": 0.40717819333076477, "rewards/judge_reward/mean": 1.9124999046325684, "rewards/judge_reward/std": 1.7570351362228394, "rewards/ngrams_iou_reward/mean": 0.05832124873995781, "rewards/ngrams_iou_reward/std": 0.11015293747186661, "rewards/schema_keywords_iou_reward/mean": 0.3279096484184265, "rewards/schema_keywords_iou_reward/std": 0.2577066719532013, "rewards/syntax_reward/mean": 0.2291666716337204, "rewards/syntax_reward/std": 0.421395480632782, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 245.23959350585938, "completions/mean_terminated_length": 195.23529052734375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.1255301102629347, "frac_reward_zero_std": 0.0, "grad_norm": 3.2677268981933594, "kl": 0.0035247802734375, "learning_rate": 4.044943820224719e-07, "loss": 0.0054, "num_tokens": 9824703.0, "reward": 5.711038112640381, "reward_std": 3.2138729095458984, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.9188257455825806, "rewards/format_reward/mean": 0.2760416567325592, "rewards/format_reward/std": 0.4482063949108124, "rewards/judge_reward/mean": 1.9208332300186157, "rewards/judge_reward/std": 1.624415636062622, "rewards/ngrams_iou_reward/mean": 0.04888283088803291, "rewards/ngrams_iou_reward/std": 0.07998286932706833, "rewards/schema_keywords_iou_reward/mean": 0.3465304374694824, "rewards/schema_keywords_iou_reward/std": 0.2603627145290375, "rewards/syntax_reward/mean": 0.2604166567325592, "rewards/syntax_reward/std": 0.44000932574272156, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 246.15625, "completions/mean_terminated_length": 202.0, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.1289228159457167, "frac_reward_zero_std": 0.0, "grad_norm": 0.9190229773521423, "kl": 0.002704620361328125, "learning_rate": 4.157303370786517e-07, "loss": 0.0056, "num_tokens": 10087815.0, "reward": 6.656896114349365, "reward_std": 3.1069064140319824, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 1.092124342918396, "rewards/format_reward/mean": 0.2552083432674408, "rewards/format_reward/std": 0.4371180534362793, "rewards/judge_reward/mean": 2.116666793823242, "rewards/judge_reward/std": 1.6922779083251953, "rewards/ngrams_iou_reward/mean": 0.05168645456433296, "rewards/ngrams_iou_reward/std": 0.08464475721120834, "rewards/schema_keywords_iou_reward/mean": 0.36666765809059143, "rewards/schema_keywords_iou_reward/std": 0.26463082432746887, "rewards/syntax_reward/mean": 0.34375, "rewards/syntax_reward/std": 0.47620058059692383, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 242.9479217529297, "completions/mean_terminated_length": 190.05262756347656, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.13231552162849872, "frac_reward_zero_std": 0.0, "grad_norm": 1.2053065299987793, "kl": 0.0064544677734375, "learning_rate": 4.269662921348314e-07, "loss": 0.0071, "num_tokens": 10343273.0, "reward": 6.045838832855225, "reward_std": 3.1320319175720215, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.9947506189346313, "rewards/format_reward/mean": 0.2395833283662796, "rewards/format_reward/std": 0.427945077419281, "rewards/judge_reward/mean": 1.980208396911621, "rewards/judge_reward/std": 1.6676232814788818, "rewards/ngrams_iou_reward/mean": 0.06941912323236465, "rewards/ngrams_iou_reward/std": 0.13063479959964752, "rewards/schema_keywords_iou_reward/mean": 0.34933626651763916, "rewards/schema_keywords_iou_reward/std": 0.2680214047431946, "rewards/syntax_reward/mean": 0.3020833432674408, "rewards/syntax_reward/std": 0.4603615701198578, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 247.34896850585938, "completions/mean_terminated_length": 207.14706420898438, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.13570822731128074, "frac_reward_zero_std": 0.0, "grad_norm": 8.078385353088379, "kl": 0.007137298583984375, "learning_rate": 4.382022471910112e-07, "loss": 0.0157, "num_tokens": 10593408.0, "reward": 5.469001770019531, "reward_std": 3.101335048675537, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.9188257455825806, "rewards/format_reward/mean": 0.3020833432674408, "rewards/format_reward/std": 0.4603615701198578, "rewards/judge_reward/mean": 1.769791603088379, "rewards/judge_reward/std": 1.6269084215164185, "rewards/ngrams_iou_reward/mean": 0.05278665944933891, "rewards/ngrams_iou_reward/std": 0.08211027830839157, "rewards/schema_keywords_iou_reward/mean": 0.3557983636856079, "rewards/schema_keywords_iou_reward/std": 0.24982258677482605, "rewards/syntax_reward/mean": 0.28125, "rewards/syntax_reward/std": 0.4507846534252167, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 245.0, "completions/mean_terminated_length": 193.88235473632812, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.13910093299406276, "frac_reward_zero_std": 0.0, "grad_norm": 1.5889798402786255, "kl": 0.003387451171875, "learning_rate": 4.4943820224719097e-07, "loss": 0.001, "num_tokens": 10877688.0, "reward": 5.745233058929443, "reward_std": 3.2097721099853516, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.9947506189346313, "rewards/format_reward/mean": 0.2708333432674408, "rewards/format_reward/std": 0.44555196166038513, "rewards/judge_reward/mean": 1.8447917699813843, "rewards/judge_reward/std": 1.6935583353042603, "rewards/ngrams_iou_reward/mean": 0.05210016667842865, "rewards/ngrams_iou_reward/std": 0.08457709848880768, "rewards/schema_keywords_iou_reward/mean": 0.35250774025917053, "rewards/schema_keywords_iou_reward/std": 0.26016515493392944, "rewards/syntax_reward/mean": 0.2552083432674408, "rewards/syntax_reward/std": 0.4371180534362793, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 243.4166717529297, "completions/mean_terminated_length": 184.94117736816406, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.14249363867684478, "frac_reward_zero_std": 0.0, "grad_norm": 1.759983777999878, "kl": 0.00400543212890625, "learning_rate": 4.606741573033708e-07, "loss": 0.0026, "num_tokens": 11128610.0, "reward": 5.2731242179870605, "reward_std": 3.5066850185394287, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.9387753009796143, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4341447353363037, "rewards/judge_reward/mean": 1.683333396911621, "rewards/judge_reward/std": 1.622399926185608, "rewards/ngrams_iou_reward/mean": 0.07192561775445938, "rewards/ngrams_iou_reward/std": 0.12848150730133057, "rewards/schema_keywords_iou_reward/mean": 0.3397398889064789, "rewards/schema_keywords_iou_reward/std": 0.2539899945259094, "rewards/syntax_reward/mean": 0.2604166567325592, "rewards/syntax_reward/std": 0.44000932574272156, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 240.0104217529297, "completions/mean_terminated_length": 181.1219482421875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.1458863443596268, "frac_reward_zero_std": 0.0, "grad_norm": 3.036172866821289, "kl": 0.005718231201171875, "learning_rate": 4.7191011235955054e-07, "loss": 0.0084, "num_tokens": 11369416.0, "reward": 6.54745626449585, "reward_std": 3.2204959392547607, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 1.1863713264465332, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.49030742049217224, "rewards/judge_reward/mean": 1.7718749046325684, "rewards/judge_reward/std": 1.5912853479385376, "rewards/ngrams_iou_reward/mean": 0.06471949070692062, "rewards/ngrams_iou_reward/std": 0.11716829240322113, "rewards/schema_keywords_iou_reward/mean": 0.40252816677093506, "rewards/schema_keywords_iou_reward/std": 0.25076907873153687, "rewards/syntax_reward/mean": 0.40625, "rewards/syntax_reward/std": 0.49241629242897034, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.22396850585938, "completions/mean_terminated_length": 185.0749969482422, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.14927905004240882, "frac_reward_zero_std": 0.0, "grad_norm": 3.0175812244415283, "kl": 0.006572723388671875, "learning_rate": 4.831460674157303e-07, "loss": 0.0178, "num_tokens": 11619941.0, "reward": 6.397426605224609, "reward_std": 3.0068740844726562, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.9767000675201416, "rewards/format_reward/mean": 0.296875, "rewards/format_reward/std": 0.4580754339694977, "rewards/judge_reward/mean": 2.1499998569488525, "rewards/judge_reward/std": 1.7350106239318848, "rewards/ngrams_iou_reward/mean": 0.06351717561483383, "rewards/ngrams_iou_reward/std": 0.13041840493679047, "rewards/schema_keywords_iou_reward/mean": 0.36203455924987793, "rewards/schema_keywords_iou_reward/std": 0.2686781585216522, "rewards/syntax_reward/mean": 0.296875, "rewards/syntax_reward/std": 0.4580754339694977, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 238.50521850585938, "completions/mean_terminated_length": 190.13726806640625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.15267175572519084, "frac_reward_zero_std": 0.0, "grad_norm": 3.31221604347229, "kl": 0.005519866943359375, "learning_rate": 4.943820224719101e-07, "loss": 0.0299, "num_tokens": 11881536.0, "reward": 6.286830425262451, "reward_std": 2.971569061279297, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.9767000675201416, "rewards/format_reward/mean": 0.3385416567325592, "rewards/format_reward/std": 0.47445085644721985, "rewards/judge_reward/mean": 2.054166555404663, "rewards/judge_reward/std": 1.6060041189193726, "rewards/ngrams_iou_reward/mean": 0.06353550404310226, "rewards/ngrams_iou_reward/std": 0.11130090802907944, "rewards/schema_keywords_iou_reward/mean": 0.3805861473083496, "rewards/schema_keywords_iou_reward/std": 0.26022714376449585, "rewards/syntax_reward/mean": 0.3177083432674408, "rewards/syntax_reward/std": 0.46680256724357605, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 242.48959350585938, "completions/mean_terminated_length": 194.23809814453125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.15606446140797287, "frac_reward_zero_std": 0.0, "grad_norm": 1.1358730792999268, "kl": 0.006565093994140625, "learning_rate": 5.056179775280899e-07, "loss": 0.0343, "num_tokens": 12135730.0, "reward": 6.419012069702148, "reward_std": 3.3085379600524902, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 1.1863713264465332, "rewards/format_reward/mean": 0.3072916567325592, "rewards/format_reward/std": 0.46257755160331726, "rewards/judge_reward/mean": 1.8218750953674316, "rewards/judge_reward/std": 1.706084132194519, "rewards/ngrams_iou_reward/mean": 0.07226443290710449, "rewards/ngrams_iou_reward/std": 0.14280134439468384, "rewards/schema_keywords_iou_reward/mean": 0.36445602774620056, "rewards/schema_keywords_iou_reward/std": 0.2623903751373291, "rewards/syntax_reward/mean": 0.296875, "rewards/syntax_reward/std": 0.4580754339694977, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 244.4010467529297, "completions/mean_terminated_length": 201.68292236328125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.1594571670907549, "frac_reward_zero_std": 0.0, "grad_norm": 7.436099529266357, "kl": 0.004276275634765625, "learning_rate": 5.168539325842697e-07, "loss": 0.0063, "num_tokens": 12380835.0, "reward": 5.4635725021362305, "reward_std": 3.063023328781128, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.9767000675201416, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.49030742049217224, "rewards/judge_reward/mean": 1.6104167699813843, "rewards/judge_reward/std": 1.5368069410324097, "rewards/ngrams_iou_reward/mean": 0.048240140080451965, "rewards/ngrams_iou_reward/std": 0.07052183896303177, "rewards/schema_keywords_iou_reward/mean": 0.39762380719184875, "rewards/schema_keywords_iou_reward/std": 0.23257045447826385, "rewards/syntax_reward/mean": 0.3229166567325592, "rewards/syntax_reward/std": 0.46881362795829773, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.88021850585938, "completions/mean_terminated_length": 194.09999084472656, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.1628498727735369, "frac_reward_zero_std": 0.0, "grad_norm": 5.772529125213623, "kl": 0.01033782958984375, "learning_rate": 5.280898876404494e-07, "loss": 0.0201, "num_tokens": 12653152.0, "reward": 6.722418785095215, "reward_std": 2.9238739013671875, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 1.1984119415283203, "rewards/format_reward/mean": 0.3802083432674408, "rewards/format_reward/std": 0.48670700192451477, "rewards/judge_reward/mean": 1.8583332300186157, "rewards/judge_reward/std": 1.5936830043792725, "rewards/ngrams_iou_reward/mean": 0.09372449666261673, "rewards/ngrams_iou_reward/std": 0.15846951305866241, "rewards/schema_keywords_iou_reward/mean": 0.42244377732276917, "rewards/schema_keywords_iou_reward/std": 0.26681920886039734, "rewards/syntax_reward/mean": 0.328125, "rewards/syntax_reward/std": 0.4707581400871277, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 243.6875, "completions/mean_terminated_length": 198.34146118164062, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.1662425784563189, "frac_reward_zero_std": 0.0, "grad_norm": 1.2101649045944214, "kl": 0.00421905517578125, "learning_rate": 5.393258426966291e-07, "loss": 0.0253, "num_tokens": 12906448.0, "reward": 6.334313869476318, "reward_std": 2.8165836334228516, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.958053469657898, "rewards/format_reward/mean": 0.328125, "rewards/format_reward/std": 0.4707581400871277, "rewards/judge_reward/mean": 2.102083206176758, "rewards/judge_reward/std": 1.65507173538208, "rewards/ngrams_iou_reward/mean": 0.07371022552251816, "rewards/ngrams_iou_reward/std": 0.12796875834465027, "rewards/schema_keywords_iou_reward/mean": 0.3793535530567169, "rewards/schema_keywords_iou_reward/std": 0.27037933468818665, "rewards/syntax_reward/mean": 0.3177083432674408, "rewards/syntax_reward/std": 0.46680256724357605, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 240.0625, "completions/mean_terminated_length": 190.89361572265625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.16963528413910092, "frac_reward_zero_std": 0.0, "grad_norm": 2.1949303150177, "kl": 0.0043182373046875, "learning_rate": 5.50561797752809e-07, "loss": 0.0405, "num_tokens": 13175308.0, "reward": 6.061517715454102, "reward_std": 2.9098024368286133, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.9188257455825806, "rewards/format_reward/mean": 0.3385416567325592, "rewards/format_reward/std": 0.47445085644721985, "rewards/judge_reward/mean": 2.0479166507720947, "rewards/judge_reward/std": 1.5430288314819336, "rewards/ngrams_iou_reward/mean": 0.057056549936532974, "rewards/ngrams_iou_reward/std": 0.10168220847845078, "rewards/schema_keywords_iou_reward/mean": 0.3773781359195709, "rewards/schema_keywords_iou_reward/std": 0.24019277095794678, "rewards/syntax_reward/mean": 0.2552083432674408, "rewards/syntax_reward/std": 0.4371180534362793, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 244.52084350585938, "completions/mean_terminated_length": 205.9091033935547, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.17302798982188294, "frac_reward_zero_std": 0.0, "grad_norm": 1.6404805183410645, "kl": 0.004669189453125, "learning_rate": 5.617977528089887e-07, "loss": 0.0119, "num_tokens": 13430732.0, "reward": 6.653632164001465, "reward_std": 3.0733814239501953, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 1.1612823009490967, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.49241629242897034, "rewards/judge_reward/mean": 1.8822917938232422, "rewards/judge_reward/std": 1.6289587020874023, "rewards/ngrams_iou_reward/mean": 0.06167406961321831, "rewards/ngrams_iou_reward/std": 0.09573862701654434, "rewards/schema_keywords_iou_reward/mean": 0.4107078015804291, "rewards/schema_keywords_iou_reward/std": 0.24281205236911774, "rewards/syntax_reward/mean": 0.3697916567325592, "rewards/syntax_reward/std": 0.4840102791786194, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 241.4010467529297, "completions/mean_terminated_length": 192.2954559326172, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.17642069550466496, "frac_reward_zero_std": 0.0, "grad_norm": 4.148083686828613, "kl": 0.00705718994140625, "learning_rate": 5.730337078651685e-07, "loss": 0.0228, "num_tokens": 13695919.0, "reward": 5.67096471786499, "reward_std": 3.2021145820617676, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.8767278790473938, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.46472418308258057, "rewards/judge_reward/mean": 1.8937498331069946, "rewards/judge_reward/std": 1.6414661407470703, "rewards/ngrams_iou_reward/mean": 0.054073382169008255, "rewards/ngrams_iou_reward/std": 0.08225143700838089, "rewards/schema_keywords_iou_reward/mean": 0.3606410026550293, "rewards/schema_keywords_iou_reward/std": 0.24278171360492706, "rewards/syntax_reward/mean": 0.3125, "rewards/syntax_reward/std": 0.46472418308258057, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.06771850585938, "completions/mean_terminated_length": 187.73809814453125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.179813401187447, "frac_reward_zero_std": 0.0, "grad_norm": 1.0312576293945312, "kl": 0.00506591796875, "learning_rate": 5.842696629213483e-07, "loss": 0.0263, "num_tokens": 13949432.0, "reward": 5.69898796081543, "reward_std": 2.8869054317474365, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.8981601595878601, "rewards/format_reward/mean": 0.390625, "rewards/format_reward/std": 0.4891659915447235, "rewards/judge_reward/mean": 1.8645833730697632, "rewards/judge_reward/std": 1.6538059711456299, "rewards/ngrams_iou_reward/mean": 0.05424784496426582, "rewards/ngrams_iou_reward/std": 0.09535835683345795, "rewards/schema_keywords_iou_reward/mean": 0.36869844794273376, "rewards/schema_keywords_iou_reward/std": 0.2514267563819885, "rewards/syntax_reward/mean": 0.265625, "rewards/syntax_reward/std": 0.44282010197639465, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 237.8854217529297, "completions/mean_terminated_length": 196.03448486328125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.183206106870229, "frac_reward_zero_std": 0.0, "grad_norm": 3.93890380859375, "kl": 0.00887298583984375, "learning_rate": 5.955056179775281e-07, "loss": 0.0161, "num_tokens": 14221318.0, "reward": 6.607994079589844, "reward_std": 2.88435697555542, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 1.1612823009490967, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.49429556727409363, "rewards/judge_reward/mean": 1.8802083730697632, "rewards/judge_reward/std": 1.642250895500183, "rewards/ngrams_iou_reward/mean": 0.05397835373878479, "rewards/ngrams_iou_reward/std": 0.07850240170955658, "rewards/schema_keywords_iou_reward/mean": 0.4029732644557953, "rewards/schema_keywords_iou_reward/std": 0.25147679448127747, "rewards/syntax_reward/mean": 0.3333333432674408, "rewards/syntax_reward/std": 0.4726369380950928, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 234.9375, "completions/mean_terminated_length": 188.60000610351562, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.18659881255301103, "frac_reward_zero_std": 0.0, "grad_norm": 1.1291247606277466, "kl": 0.00577545166015625, "learning_rate": 6.067415730337079e-07, "loss": 0.0396, "num_tokens": 14475196.0, "reward": 6.966236114501953, "reward_std": 3.3799891471862793, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 1.2641865015029907, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4973753094673157, "rewards/judge_reward/mean": 1.8125, "rewards/judge_reward/std": 1.6579334735870361, "rewards/ngrams_iou_reward/mean": 0.07592972368001938, "rewards/ngrams_iou_reward/std": 0.13737481832504272, "rewards/schema_keywords_iou_reward/mean": 0.4059309661388397, "rewards/schema_keywords_iou_reward/std": 0.2606310546398163, "rewards/syntax_reward/mean": 0.359375, "rewards/syntax_reward/std": 0.48107168078422546, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 238.609375, "completions/mean_terminated_length": 187.8571319580078, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.18999151823579305, "frac_reward_zero_std": 0.0, "grad_norm": 1.341596245765686, "kl": 0.00611114501953125, "learning_rate": 6.179775280898875e-07, "loss": 0.026, "num_tokens": 14735899.0, "reward": 6.351171016693115, "reward_std": 3.1339831352233887, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 1.1612823009490967, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.48796600103378296, "rewards/judge_reward/mean": 1.7572917938232422, "rewards/judge_reward/std": 1.6165534257888794, "rewards/ngrams_iou_reward/mean": 0.06059088930487633, "rewards/ngrams_iou_reward/std": 0.07717961817979813, "rewards/schema_keywords_iou_reward/mean": 0.40620484948158264, "rewards/schema_keywords_iou_reward/std": 0.258095920085907, "rewards/syntax_reward/mean": 0.34375, "rewards/syntax_reward/std": 0.47620058059692383, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 235.16146850585938, "completions/mean_terminated_length": 193.484375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.19338422391857507, "frac_reward_zero_std": 0.0, "grad_norm": 0.9409461617469788, "kl": 0.00701141357421875, "learning_rate": 6.292134831460674e-07, "loss": 0.0126, "num_tokens": 14996144.0, "reward": 6.174826622009277, "reward_std": 2.9702224731445312, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 1.1739983558654785, "rewards/format_reward/mean": 0.4322916567325592, "rewards/format_reward/std": 0.49668949842453003, "rewards/judge_reward/mean": 1.5854166746139526, "rewards/judge_reward/std": 1.5637903213500977, "rewards/ngrams_iou_reward/mean": 0.07604581862688065, "rewards/ngrams_iou_reward/std": 0.14899684488773346, "rewards/schema_keywords_iou_reward/mean": 0.44357189536094666, "rewards/schema_keywords_iou_reward/std": 0.2583380937576294, "rewards/syntax_reward/mean": 0.3645833432674408, "rewards/syntax_reward/std": 0.48257145285606384, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 228.6510467529297, "completions/mean_terminated_length": 186.90789794921875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1967769296013571, "frac_reward_zero_std": 0.0, "grad_norm": 0.9385547041893005, "kl": 0.00792694091796875, "learning_rate": 6.404494382022471e-07, "loss": 0.0587, "num_tokens": 15239521.0, "reward": 7.077910900115967, "reward_std": 3.114408016204834, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 1.1739983558654785, "rewards/format_reward/mean": 0.5416666865348816, "rewards/format_reward/std": 0.4995634853839874, "rewards/judge_reward/mean": 1.939583420753479, "rewards/judge_reward/std": 1.6279289722442627, "rewards/ngrams_iou_reward/mean": 0.09108512848615646, "rewards/ngrams_iou_reward/std": 0.15852928161621094, "rewards/schema_keywords_iou_reward/mean": 0.46182548999786377, "rewards/schema_keywords_iou_reward/std": 0.2660059928894043, "rewards/syntax_reward/mean": 0.4166666567325592, "rewards/syntax_reward/std": 0.49429556727409363, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 228.6666717529297, "completions/mean_terminated_length": 177.671630859375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.2001696352841391, "frac_reward_zero_std": 0.0, "grad_norm": 0.9338399767875671, "kl": 0.00687408447265625, "learning_rate": 6.51685393258427e-07, "loss": 0.0236, "num_tokens": 15489183.0, "reward": 6.862953186035156, "reward_std": 3.0722103118896484, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 1.2434382438659668, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5008718371391296, "rewards/judge_reward/mean": 1.6979166269302368, "rewards/judge_reward/std": 1.6456815004348755, "rewards/ngrams_iou_reward/mean": 0.07885382324457169, "rewards/ngrams_iou_reward/std": 0.10649603605270386, "rewards/schema_keywords_iou_reward/mean": 0.4715994894504547, "rewards/schema_keywords_iou_reward/std": 0.26455098390579224, "rewards/syntax_reward/mean": 0.4270833432674408, "rewards/syntax_reward/std": 0.49594777822494507, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 223.71875, "completions/mean_terminated_length": 174.44737243652344, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.2035623409669211, "frac_reward_zero_std": 0.0, "grad_norm": 1.054356575012207, "kl": 0.0070648193359375, "learning_rate": 6.629213483146066e-07, "loss": 0.0258, "num_tokens": 15742767.0, "reward": 6.802951812744141, "reward_std": 3.132028341293335, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 1.2326345443725586, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5003271102905273, "rewards/judge_reward/mean": 1.7114583253860474, "rewards/judge_reward/std": 1.5388423204421997, "rewards/ngrams_iou_reward/mean": 0.07826735824346542, "rewards/ngrams_iou_reward/std": 0.14481067657470703, "rewards/schema_keywords_iou_reward/mean": 0.468434602022171, "rewards/schema_keywords_iou_reward/std": 0.2487783282995224, "rewards/syntax_reward/mean": 0.3802083432674408, "rewards/syntax_reward/std": 0.48670700192451477, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 227.27084350585938, "completions/mean_terminated_length": 186.17721557617188, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.20695504664970313, "frac_reward_zero_std": 0.0, "grad_norm": 1.25531804561615, "kl": 0.00731658935546875, "learning_rate": 6.741573033707865e-07, "loss": 0.0121, "num_tokens": 15986239.0, "reward": 7.024882793426514, "reward_std": 2.8579254150390625, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 1.2434382438659668, "rewards/format_reward/mean": 0.578125, "rewards/format_reward/std": 0.4951499104499817, "rewards/judge_reward/mean": 1.712499976158142, "rewards/judge_reward/std": 1.5464364290237427, "rewards/ngrams_iou_reward/mean": 0.0955837145447731, "rewards/ngrams_iou_reward/std": 0.12588554620742798, "rewards/schema_keywords_iou_reward/mean": 0.4834655821323395, "rewards/schema_keywords_iou_reward/std": 0.25198790431022644, "rewards/syntax_reward/mean": 0.4739583432674408, "rewards/syntax_reward/std": 0.5006267428398132, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.9947967529297, "completions/mean_terminated_length": 179.1857147216797, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.21034775233248515, "frac_reward_zero_std": 0.0, "grad_norm": 1.1218388080596924, "kl": 0.0073089599609375, "learning_rate": 6.853932584269663e-07, "loss": 0.0456, "num_tokens": 16226088.0, "reward": 6.575740814208984, "reward_std": 2.6254196166992188, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 1.1482117176055908, "rewards/format_reward/mean": 0.5260416865348816, "rewards/format_reward/std": 0.5006267428398132, "rewards/judge_reward/mean": 1.7687500715255737, "rewards/judge_reward/std": 1.5960233211517334, "rewards/ngrams_iou_reward/mean": 0.08206448704004288, "rewards/ngrams_iou_reward/std": 0.1331986039876938, "rewards/schema_keywords_iou_reward/mean": 0.4665921628475189, "rewards/schema_keywords_iou_reward/std": 0.25899553298950195, "rewards/syntax_reward/mean": 0.3697916567325592, "rewards/syntax_reward/std": 0.4840102791786194, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.6979217529297, "completions/mean_terminated_length": 179.9729766845703, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.21374045801526717, "frac_reward_zero_std": 0.0, "grad_norm": 1.0954909324645996, "kl": 0.0098114013671875, "learning_rate": 6.96629213483146e-07, "loss": 0.0508, "num_tokens": 16507640.0, "reward": 6.559129238128662, "reward_std": 2.875121593475342, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 1.1347742080688477, "rewards/format_reward/mean": 0.5677083134651184, "rewards/format_reward/std": 0.49668949842453003, "rewards/judge_reward/mean": 1.7333332300186157, "rewards/judge_reward/std": 1.5225127935409546, "rewards/ngrams_iou_reward/mean": 0.08012760430574417, "rewards/ngrams_iou_reward/std": 0.11167032271623611, "rewards/schema_keywords_iou_reward/mean": 0.4706675112247467, "rewards/schema_keywords_iou_reward/std": 0.2278205305337906, "rewards/syntax_reward/mean": 0.4270833432674408, "rewards/syntax_reward/std": 0.49594777822494507, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.69271850585938, "completions/mean_terminated_length": 173.6666717529297, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.2171331636980492, "frac_reward_zero_std": 0.0, "grad_norm": 1.799676537513733, "kl": 0.0154876708984375, "learning_rate": 7.078651685393258e-07, "loss": 0.0427, "num_tokens": 16747893.0, "reward": 6.6538848876953125, "reward_std": 2.8341891765594482, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 1.1984119415283203, "rewards/format_reward/mean": 0.5989583134651184, "rewards/format_reward/std": 0.49139073491096497, "rewards/judge_reward/mean": 1.6166666746139526, "rewards/judge_reward/std": 1.5205857753753662, "rewards/ngrams_iou_reward/mean": 0.09568631649017334, "rewards/ngrams_iou_reward/std": 0.1571492999792099, "rewards/schema_keywords_iou_reward/mean": 0.5071560144424438, "rewards/schema_keywords_iou_reward/std": 0.2519457936286926, "rewards/syntax_reward/mean": 0.4375, "rewards/syntax_reward/std": 0.4973753094673157, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.6979217529297, "completions/mean_terminated_length": 175.8850555419922, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.2205258693808312, "frac_reward_zero_std": 0.0, "grad_norm": 1.300784945487976, "kl": 0.0124969482421875, "learning_rate": 7.191011235955055e-07, "loss": 0.0481, "num_tokens": 17022491.0, "reward": 7.391824245452881, "reward_std": 2.670814275741577, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 1.311354160308838, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.4707581400871277, "rewards/judge_reward/mean": 1.634374976158142, "rewards/judge_reward/std": 1.5906354188919067, "rewards/ngrams_iou_reward/mean": 0.1148911714553833, "rewards/ngrams_iou_reward/std": 0.14783146977424622, "rewards/schema_keywords_iou_reward/mean": 0.5133906006813049, "rewards/schema_keywords_iou_reward/std": 0.2384675145149231, "rewards/syntax_reward/mean": 0.5260416865348816, "rewards/syntax_reward/std": 0.500626802444458, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 226.55209350585938, "completions/mean_terminated_length": 180.61334228515625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.22391857506361323, "frac_reward_zero_std": 0.0, "grad_norm": 1.07028067111969, "kl": 0.011474609375, "learning_rate": 7.303370786516854e-07, "loss": 0.0269, "num_tokens": 17251311.0, "reward": 7.8707194328308105, "reward_std": 2.7380871772766113, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 1.3446191549301147, "rewards/format_reward/mean": 0.703125, "rewards/format_reward/std": 0.4580754339694977, "rewards/judge_reward/mean": 1.7822917699813843, "rewards/judge_reward/std": 1.5670356750488281, "rewards/ngrams_iou_reward/mean": 0.10730018466711044, "rewards/ngrams_iou_reward/std": 0.14987686276435852, "rewards/schema_keywords_iou_reward/mean": 0.5217523574829102, "rewards/schema_keywords_iou_reward/std": 0.24282923340797424, "rewards/syntax_reward/mean": 0.4895833432674408, "rewards/syntax_reward/std": 0.5011983513832092, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 209.39584350585938, "completions/mean_terminated_length": 165.6161651611328, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.22731128074639526, "frac_reward_zero_std": 0.0, "grad_norm": 1.199790120124817, "kl": 0.0181732177734375, "learning_rate": 7.415730337078651e-07, "loss": 0.0209, "num_tokens": 17516035.0, "reward": 7.037328243255615, "reward_std": 2.4470150470733643, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 1.2641865015029907, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4507846534252167, "rewards/judge_reward/mean": 1.6020833253860474, "rewards/judge_reward/std": 1.43654465675354, "rewards/ngrams_iou_reward/mean": 0.08526784181594849, "rewards/ngrams_iou_reward/std": 0.1305283159017563, "rewards/schema_keywords_iou_reward/mean": 0.5135179162025452, "rewards/schema_keywords_iou_reward/std": 0.21872013807296753, "rewards/syntax_reward/mean": 0.453125, "rewards/syntax_reward/std": 0.4990993142127991, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 201.75, "completions/mean_terminated_length": 163.0, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.23070398642917728, "frac_reward_zero_std": 0.0, "grad_norm": 1.4833307266235352, "kl": 0.0171661376953125, "learning_rate": 7.528089887640449e-07, "loss": 0.0255, "num_tokens": 17757019.0, "reward": 8.500919342041016, "reward_std": 2.337280750274658, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.8697916865348816, "rewards/format_reward/std": 0.3374122977256775, "rewards/judge_reward/mean": 1.1281250715255737, "rewards/judge_reward/std": 1.4029290676116943, "rewards/ngrams_iou_reward/mean": 0.15214239060878754, "rewards/ngrams_iou_reward/std": 0.19182537496089935, "rewards/schema_keywords_iou_reward/mean": 0.6550260186195374, "rewards/schema_keywords_iou_reward/std": 0.22187058627605438, "rewards/syntax_reward/mean": 0.6770833134651184, "rewards/syntax_reward/std": 0.46881362795829773, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 195.83334350585938, "completions/mean_terminated_length": 157.2649688720703, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.2340966921119593, "frac_reward_zero_std": 0.0, "grad_norm": 1.1580990552902222, "kl": 0.021484375, "learning_rate": 7.640449438202247e-07, "loss": 0.0125, "num_tokens": 18002405.0, "reward": 7.703563690185547, "reward_std": 2.357654571533203, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 1.2434382438659668, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.41447943449020386, "rewards/judge_reward/mean": 1.8781250715255737, "rewards/judge_reward/std": 1.5851207971572876, "rewards/ngrams_iou_reward/mean": 0.1056792363524437, "rewards/ngrams_iou_reward/std": 0.13449141383171082, "rewards/schema_keywords_iou_reward/mean": 0.5655925273895264, "rewards/schema_keywords_iou_reward/std": 0.24429352581501007, "rewards/syntax_reward/mean": 0.5260416865348816, "rewards/syntax_reward/std": 0.5006267428398132, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 206.03646850585938, "completions/mean_terminated_length": 166.34579467773438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.23748939779474132, "frac_reward_zero_std": 0.0, "grad_norm": 1.2185876369476318, "kl": 0.0214996337890625, "learning_rate": 7.752808988764044e-07, "loss": 0.0375, "num_tokens": 18240606.0, "reward": 8.162500381469727, "reward_std": 2.911954402923584, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 1.3366559743881226, "rewards/format_reward/mean": 0.8020833134651184, "rewards/format_reward/std": 0.39947062730789185, "rewards/judge_reward/mean": 1.8385416269302368, "rewards/judge_reward/std": 1.6044355630874634, "rewards/ngrams_iou_reward/mean": 0.11595511436462402, "rewards/ngrams_iou_reward/std": 0.1726832538843155, "rewards/schema_keywords_iou_reward/mean": 0.5517531037330627, "rewards/schema_keywords_iou_reward/std": 0.23376953601837158, "rewards/syntax_reward/mean": 0.578125, "rewards/syntax_reward/std": 0.4951499104499817, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 198.09375, "completions/mean_terminated_length": 152.0934600830078, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.2408821034775233, "frac_reward_zero_std": 0.0, "grad_norm": 1.5465986728668213, "kl": 0.019378662109375, "learning_rate": 7.865168539325843e-07, "loss": 0.05, "num_tokens": 18493962.0, "reward": 7.03378438949585, "reward_std": 2.442488431930542, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 1.2932630777359009, "rewards/format_reward/mean": 0.8541666865348816, "rewards/format_reward/std": 0.3538617789745331, "rewards/judge_reward/mean": 1.3697916269302368, "rewards/judge_reward/std": 1.285979151725769, "rewards/ngrams_iou_reward/mean": 0.096718929708004, "rewards/ngrams_iou_reward/std": 0.1067277044057846, "rewards/schema_keywords_iou_reward/mean": 0.5464398860931396, "rewards/schema_keywords_iou_reward/std": 0.20174740254878998, "rewards/syntax_reward/mean": 0.59375, "rewards/syntax_reward/std": 0.49241629242897034, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 186.50521850585938, "completions/mean_terminated_length": 142.92373657226562, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.24427480916030533, "frac_reward_zero_std": 0.0, "grad_norm": 1.08103346824646, "kl": 0.02197265625, "learning_rate": 7.977528089887639e-07, "loss": 0.0315, "num_tokens": 18750685.0, "reward": 8.080093383789062, "reward_std": 2.5458316802978516, "rewards/accuracy_reward/mean": 1.15625, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.8697916865348816, "rewards/format_reward/std": 0.3374123275279999, "rewards/judge_reward/mean": 1.181249976158142, "rewards/judge_reward/std": 1.421032190322876, "rewards/ngrams_iou_reward/mean": 0.09380451589822769, "rewards/ngrams_iou_reward/std": 0.14420104026794434, "rewards/schema_keywords_iou_reward/mean": 0.5664973855018616, "rewards/schema_keywords_iou_reward/std": 0.22791646420955658, "rewards/syntax_reward/mean": 0.71875, "rewards/syntax_reward/std": 0.4507846534252167, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 184.640625, "completions/mean_terminated_length": 137.88792419433594, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.24766751484308736, "frac_reward_zero_std": 0.0, "grad_norm": 1.698535680770874, "kl": 0.02374267578125, "learning_rate": 8.089887640449438e-07, "loss": 0.0136, "num_tokens": 18987520.0, "reward": 8.666259765625, "reward_std": 2.068162441253662, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237156867981, "rewards/judge_reward/mean": 0.9885417819023132, "rewards/judge_reward/std": 1.3312489986419678, "rewards/ngrams_iou_reward/mean": 0.14204256236553192, "rewards/ngrams_iou_reward/std": 0.196555495262146, "rewards/schema_keywords_iou_reward/mean": 0.6148409843444824, "rewards/schema_keywords_iou_reward/std": 0.22072069346904755, "rewards/syntax_reward/mean": 0.7083333134651184, "rewards/syntax_reward/std": 0.4557180106639862, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.25, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 201.484375, "completions/mean_terminated_length": 155.35577392578125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.2510602205258694, "frac_reward_zero_std": 0.0, "grad_norm": 1.1826355457305908, "kl": 0.027923583984375, "learning_rate": 8.202247191011235e-07, "loss": 0.0108, "num_tokens": 19222195.0, "reward": 7.8895182609558105, "reward_std": 2.2317402362823486, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 1.2434382438659668, "rewards/format_reward/mean": 0.8697916865348816, "rewards/format_reward/std": 0.3374122977256775, "rewards/judge_reward/mean": 1.881250023841858, "rewards/judge_reward/std": 1.4899532794952393, "rewards/ngrams_iou_reward/mean": 0.14016573131084442, "rewards/ngrams_iou_reward/std": 0.19328567385673523, "rewards/schema_keywords_iou_reward/mean": 0.5858108401298523, "rewards/schema_keywords_iou_reward/std": 0.21793349087238312, "rewards/syntax_reward/mean": 0.5625, "rewards/syntax_reward/std": 0.4973753094673157, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 190.11459350585938, "completions/mean_terminated_length": 145.03509521484375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.2544529262086514, "frac_reward_zero_std": 0.0, "grad_norm": 1.5029252767562866, "kl": 0.027496337890625, "learning_rate": 8.314606741573034e-07, "loss": 0.0491, "num_tokens": 19460327.0, "reward": 8.05005168914795, "reward_std": 2.2245540618896484, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 1.3200279474258423, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33158352971076965, "rewards/judge_reward/mean": 1.796875, "rewards/judge_reward/std": 1.5262417793273926, "rewards/ngrams_iou_reward/mean": 0.06893058866262436, "rewards/ngrams_iou_reward/std": 0.0875948891043663, "rewards/schema_keywords_iou_reward/mean": 0.5748706459999084, "rewards/schema_keywords_iou_reward/std": 0.2066272646188736, "rewards/syntax_reward/mean": 0.59375, "rewards/syntax_reward/std": 0.49241629242897034, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 195.53125, "completions/mean_terminated_length": 150.4545440673828, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.2578456318914334, "frac_reward_zero_std": 0.0, "grad_norm": 1.0876084566116333, "kl": 0.02508544921875, "learning_rate": 8.426966292134831e-07, "loss": 0.0479, "num_tokens": 19698527.0, "reward": 8.146247863769531, "reward_std": 2.5267062187194824, "rewards/accuracy_reward/mean": 1.109375, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3129251003265381, "rewards/judge_reward/mean": 1.3354167938232422, "rewards/judge_reward/std": 1.427037239074707, "rewards/ngrams_iou_reward/mean": 0.08705323189496994, "rewards/ngrams_iou_reward/std": 0.1295478641986847, "rewards/schema_keywords_iou_reward/mean": 0.5862776637077332, "rewards/schema_keywords_iou_reward/std": 0.19978445768356323, "rewards/syntax_reward/mean": 0.5833333134651184, "rewards/syntax_reward/std": 0.49429556727409363, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 200.0260467529297, "completions/mean_terminated_length": 139.18478393554688, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.26123833757421544, "frac_reward_zero_std": 0.0, "grad_norm": 1.1834217309951782, "kl": 0.028076171875, "learning_rate": 8.539325842696628e-07, "loss": 0.0308, "num_tokens": 19940056.0, "reward": 8.536707878112793, "reward_std": 2.8610615730285645, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36404144763946533, "rewards/judge_reward/mean": 1.1552082300186157, "rewards/judge_reward/std": 1.493537187576294, "rewards/ngrams_iou_reward/mean": 0.15851372480392456, "rewards/ngrams_iou_reward/std": 0.23457807302474976, "rewards/schema_keywords_iou_reward/mean": 0.5834022164344788, "rewards/schema_keywords_iou_reward/std": 0.24035170674324036, "rewards/syntax_reward/mean": 0.609375, "rewards/syntax_reward/std": 0.4891659915447235, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 208.63021850585938, "completions/mean_terminated_length": 147.7261962890625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.26463104325699743, "frac_reward_zero_std": 0.0, "grad_norm": 1.4016437530517578, "kl": 0.032073974609375, "learning_rate": 8.651685393258427e-07, "loss": 0.0225, "num_tokens": 20210501.0, "reward": 8.162825584411621, "reward_std": 2.276712417602539, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 1.3941725492477417, "rewards/format_reward/mean": 0.8854166865348816, "rewards/format_reward/std": 0.3193511366844177, "rewards/judge_reward/mean": 1.5708333253860474, "rewards/judge_reward/std": 1.5396958589553833, "rewards/ngrams_iou_reward/mean": 0.0930049791932106, "rewards/ngrams_iou_reward/std": 0.1277054101228714, "rewards/schema_keywords_iou_reward/mean": 0.5739865303039551, "rewards/schema_keywords_iou_reward/std": 0.20893342792987823, "rewards/syntax_reward/mean": 0.65625, "rewards/syntax_reward/std": 0.47620058059692383, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 201.9322967529297, "completions/mean_terminated_length": 135.2906951904297, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.2680237489397795, "frac_reward_zero_std": 0.0, "grad_norm": 1.1162060499191284, "kl": 0.027618408203125, "learning_rate": 8.764044943820224e-07, "loss": -0.0239, "num_tokens": 20470114.0, "reward": 7.561850547790527, "reward_std": 2.43158221244812, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 1.2641865015029907, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33158352971076965, "rewards/judge_reward/mean": 1.7041667699813843, "rewards/judge_reward/std": 1.4641199111938477, "rewards/ngrams_iou_reward/mean": 0.06094999611377716, "rewards/ngrams_iou_reward/std": 0.060578301548957825, "rewards/schema_keywords_iou_reward/mean": 0.5352745652198792, "rewards/schema_keywords_iou_reward/std": 0.18333764374256134, "rewards/syntax_reward/mean": 0.6197916865348816, "rewards/syntax_reward/std": 0.48670700192451477, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 213.046875, "completions/mean_terminated_length": 150.26922607421875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.2714164546225615, "frac_reward_zero_std": 0.0, "grad_norm": 1.2552645206451416, "kl": 0.028564453125, "learning_rate": 8.876404494382022e-07, "loss": 0.0251, "num_tokens": 20739001.0, "reward": 7.802571773529053, "reward_std": 1.9018763303756714, "rewards/accuracy_reward/mean": 0.890625, "rewards/accuracy_reward/std": 1.3742263317108154, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.4854165315628052, "rewards/judge_reward/std": 1.4079043865203857, "rewards/ngrams_iou_reward/mean": 0.12762482464313507, "rewards/ngrams_iou_reward/std": 0.18244674801826477, "rewards/schema_keywords_iou_reward/mean": 0.5791130065917969, "rewards/schema_keywords_iou_reward/std": 0.22025275230407715, "rewards/syntax_reward/mean": 0.5364583134651184, "rewards/syntax_reward/std": 0.49997273087501526, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 222.40625, "completions/mean_terminated_length": 158.27273559570312, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.2748091603053435, "frac_reward_zero_std": 0.0, "grad_norm": 1.2064387798309326, "kl": 0.026885986328125, "learning_rate": 8.988764044943819e-07, "loss": -0.0043, "num_tokens": 20995237.0, "reward": 8.262694358825684, "reward_std": 2.4073500633239746, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 1.3941725492477417, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.6385416984558105, "rewards/judge_reward/std": 1.5282244682312012, "rewards/ngrams_iou_reward/mean": 0.09662985056638718, "rewards/ngrams_iou_reward/std": 0.07779260724782944, "rewards/schema_keywords_iou_reward/mean": 0.5921056270599365, "rewards/schema_keywords_iou_reward/std": 0.1941176950931549, "rewards/syntax_reward/mean": 0.5625, "rewards/syntax_reward/std": 0.4973753094673157, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 216.0572967529297, "completions/mean_terminated_length": 149.48611450195312, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.2782018659881255, "frac_reward_zero_std": 0.0, "grad_norm": 1.4865624904632568, "kl": 0.026214599609375, "learning_rate": 9.101123595505618e-07, "loss": -0.0041, "num_tokens": 21226836.0, "reward": 8.111324310302734, "reward_std": 2.287806272506714, "rewards/accuracy_reward/mean": 0.859375, "rewards/accuracy_reward/std": 1.3598642349243164, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.6416667699813843, "rewards/judge_reward/std": 1.5518720149993896, "rewards/ngrams_iou_reward/mean": 0.10001951456069946, "rewards/ngrams_iou_reward/std": 0.10800912976264954, "rewards/schema_keywords_iou_reward/mean": 0.5821375846862793, "rewards/schema_keywords_iou_reward/std": 0.20061540603637695, "rewards/syntax_reward/mean": 0.65625, "rewards/syntax_reward/std": 0.47620058059692383, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 222.00521850585938, "completions/mean_terminated_length": 152.39683532714844, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.28159457167090757, "frac_reward_zero_std": 0.0, "grad_norm": 1.2273881435394287, "kl": 0.02362060546875, "learning_rate": 9.213483146067416e-07, "loss": 0.037, "num_tokens": 21485047.0, "reward": 7.703174114227295, "reward_std": 1.9468019008636475, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 1.2932630777359009, "rewards/format_reward/mean": 0.8802083134651184, "rewards/format_reward/std": 0.32556670904159546, "rewards/judge_reward/mean": 1.7166666984558105, "rewards/judge_reward/std": 1.4895563125610352, "rewards/ngrams_iou_reward/mean": 0.09034530073404312, "rewards/ngrams_iou_reward/std": 0.1215687170624733, "rewards/schema_keywords_iou_reward/mean": 0.5544948577880859, "rewards/schema_keywords_iou_reward/std": 0.23445703089237213, "rewards/syntax_reward/mean": 0.5416666865348816, "rewards/syntax_reward/std": 0.4995634853839874, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 216.046875, "completions/mean_terminated_length": 130.2458953857422, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.28498727735368956, "frac_reward_zero_std": 0.0, "grad_norm": 1.276920199394226, "kl": 0.02978515625, "learning_rate": 9.325842696629212e-07, "loss": 0.0287, "num_tokens": 21746278.0, "reward": 8.542695045471191, "reward_std": 2.121609687805176, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 1.3941725492477417, "rewards/format_reward/mean": 0.8541666865348816, "rewards/format_reward/std": 0.3538617789745331, "rewards/judge_reward/mean": 1.7864583730697632, "rewards/judge_reward/std": 1.6660525798797607, "rewards/ngrams_iou_reward/mean": 0.11243131011724472, "rewards/ngrams_iou_reward/std": 0.13828878104686737, "rewards/schema_keywords_iou_reward/mean": 0.5708882808685303, "rewards/schema_keywords_iou_reward/std": 0.220788836479187, "rewards/syntax_reward/mean": 0.6197916865348816, "rewards/syntax_reward/std": 0.48670700192451477, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 219.9635467529297, "completions/mean_terminated_length": 142.57376098632812, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.2883799830364716, "frac_reward_zero_std": 0.03125, "grad_norm": 1.0227446556091309, "kl": 0.023681640625, "learning_rate": 9.438202247191011e-07, "loss": 0.0017, "num_tokens": 21995673.0, "reward": 8.991811752319336, "reward_std": 1.897948145866394, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.8802083134651184, "rewards/format_reward/std": 0.32556667923927307, "rewards/judge_reward/mean": 1.7218750715255737, "rewards/judge_reward/std": 1.6262056827545166, "rewards/ngrams_iou_reward/mean": 0.1462937742471695, "rewards/ngrams_iou_reward/std": 0.214718297123909, "rewards/schema_keywords_iou_reward/mean": 0.6205170750617981, "rewards/schema_keywords_iou_reward/std": 0.23377738893032074, "rewards/syntax_reward/mean": 0.6197916865348816, "rewards/syntax_reward/std": 0.48670700192451477, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 214.4166717529297, "completions/mean_terminated_length": 153.64102172851562, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.2917726887192536, "frac_reward_zero_std": 0.0, "grad_norm": 1.0589011907577515, "kl": 0.026947021484375, "learning_rate": 9.55056179775281e-07, "loss": 0.0295, "num_tokens": 22253909.0, "reward": 8.258440971374512, "reward_std": 2.41654109954834, "rewards/accuracy_reward/mean": 1.109375, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.8854166865348816, "rewards/format_reward/std": 0.3193511664867401, "rewards/judge_reward/mean": 1.3760417699813843, "rewards/judge_reward/std": 1.513931393623352, "rewards/ngrams_iou_reward/mean": 0.09897658973932266, "rewards/ngrams_iou_reward/std": 0.14175963401794434, "rewards/schema_keywords_iou_reward/mean": 0.5792551040649414, "rewards/schema_keywords_iou_reward/std": 0.2235826849937439, "rewards/syntax_reward/mean": 0.6145833134651184, "rewards/syntax_reward/std": 0.48796597123146057, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 206.8229217529297, "completions/mean_terminated_length": 130.10667419433594, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.2951653944020356, "frac_reward_zero_std": 0.03125, "grad_norm": 1.3481906652450562, "kl": 0.0245361328125, "learning_rate": 9.662921348314607e-07, "loss": -0.0275, "num_tokens": 22517023.0, "reward": 8.475128173828125, "reward_std": 1.9701045751571655, "rewards/accuracy_reward/mean": 0.96875, "rewards/accuracy_reward/std": 1.4064408540725708, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.6869791746139526, "rewards/judge_reward/std": 1.502663254737854, "rewards/ngrams_iou_reward/mean": 0.11691104620695114, "rewards/ngrams_iou_reward/std": 0.16753508150577545, "rewards/schema_keywords_iou_reward/mean": 0.6144661903381348, "rewards/schema_keywords_iou_reward/std": 0.20729143917560577, "rewards/syntax_reward/mean": 0.5520833134651184, "rewards/syntax_reward/std": 0.4985799789428711, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 202.5, "completions/mean_terminated_length": 136.55813598632812, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.29855810008481765, "frac_reward_zero_std": 0.03125, "grad_norm": 1.3504608869552612, "kl": 0.028839111328125, "learning_rate": 9.775280898876404e-07, "loss": -0.0069, "num_tokens": 22773913.0, "reward": 9.077900886535645, "reward_std": 1.5416126251220703, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33158352971076965, "rewards/judge_reward/mean": 1.212499976158142, "rewards/judge_reward/std": 1.4475289583206177, "rewards/ngrams_iou_reward/mean": 0.1629806011915207, "rewards/ngrams_iou_reward/std": 0.22604894638061523, "rewards/schema_keywords_iou_reward/mean": 0.6305447220802307, "rewards/schema_keywords_iou_reward/std": 0.21640901267528534, "rewards/syntax_reward/mean": 0.71875, "rewards/syntax_reward/std": 0.4507846534252167, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 202.64584350585938, "completions/mean_terminated_length": 138.25286865234375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.30195080576759964, "frac_reward_zero_std": 0.0, "grad_norm": 1.775941014289856, "kl": 0.028778076171875, "learning_rate": 9.887640449438202e-07, "loss": -0.0202, "num_tokens": 23036243.0, "reward": 8.59593677520752, "reward_std": 1.8617372512817383, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 1.4179108142852783, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.658333420753479, "rewards/judge_reward/std": 1.5922367572784424, "rewards/ngrams_iou_reward/mean": 0.16039443016052246, "rewards/ngrams_iou_reward/std": 0.21164590120315552, "rewards/schema_keywords_iou_reward/mean": 0.6188754439353943, "rewards/schema_keywords_iou_reward/std": 0.22027184069156647, "rewards/syntax_reward/mean": 0.578125, "rewards/syntax_reward/std": 0.4951499104499817, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.42709350585938, "completions/mean_terminated_length": 152.73529052734375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.3053435114503817, "frac_reward_zero_std": 0.0, "grad_norm": 0.9712012410163879, "kl": 0.022918701171875, "learning_rate": 1e-06, "loss": 0.035, "num_tokens": 23279529.0, "reward": 7.670914649963379, "reward_std": 2.120391845703125, "rewards/accuracy_reward/mean": 0.890625, "rewards/accuracy_reward/std": 1.3742263317108154, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33158352971076965, "rewards/judge_reward/mean": 1.4677082300186157, "rewards/judge_reward/std": 1.4344302415847778, "rewards/ngrams_iou_reward/mean": 0.09229392558336258, "rewards/ngrams_iou_reward/std": 0.10205727070569992, "rewards/schema_keywords_iou_reward/mean": 0.5650789737701416, "rewards/schema_keywords_iou_reward/std": 0.20432521402835846, "rewards/syntax_reward/mean": 0.53125, "rewards/syntax_reward/std": 0.5003271102905273, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.75, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 202.4947967529297, "completions/mean_terminated_length": 139.26136779785156, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.3087362171331637, "frac_reward_zero_std": 0.0625, "grad_norm": 1.1257421970367432, "kl": 0.025054931640625, "learning_rate": 9.99996105846605e-07, "loss": 0.0478, "num_tokens": 23549480.0, "reward": 8.929415702819824, "reward_std": 1.770254373550415, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.8802083134651184, "rewards/format_reward/std": 0.32556670904159546, "rewards/judge_reward/mean": 1.0770833492279053, "rewards/judge_reward/std": 1.3798458576202393, "rewards/ngrams_iou_reward/mean": 0.18879376351833344, "rewards/ngrams_iou_reward/std": 0.25918880105018616, "rewards/schema_keywords_iou_reward/mean": 0.6333299875259399, "rewards/schema_keywords_iou_reward/std": 0.23912173509597778, "rewards/syntax_reward/mean": 0.6666666865348816, "rewards/syntax_reward/std": 0.4726369380950928, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 204.0572967529297, "completions/mean_terminated_length": 137.27381896972656, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.31212892281594573, "frac_reward_zero_std": 0.0, "grad_norm": 1.1410378217697144, "kl": 0.028106689453125, "learning_rate": 9.99984423447078e-07, "loss": 0.026, "num_tokens": 23788291.0, "reward": 8.718164443969727, "reward_std": 1.952917456626892, "rewards/accuracy_reward/mean": 0.953125, "rewards/accuracy_reward/std": 1.4004077911376953, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.7447916269302368, "rewards/judge_reward/std": 1.5033901929855347, "rewards/ngrams_iou_reward/mean": 0.0912134125828743, "rewards/ngrams_iou_reward/std": 0.12452221661806107, "rewards/schema_keywords_iou_reward/mean": 0.6529912352561951, "rewards/schema_keywords_iou_reward/std": 0.1457684338092804, "rewards/syntax_reward/mean": 0.6979166865348816, "rewards/syntax_reward/std": 0.4603615701198578, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 206.7916717529297, "completions/mean_terminated_length": 143.52381896972656, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.3155216284987277, "frac_reward_zero_std": 0.0, "grad_norm": 1.3988828659057617, "kl": 0.03070068359375, "learning_rate": 9.999649529833914e-07, "loss": 0.0328, "num_tokens": 24050265.0, "reward": 8.584341049194336, "reward_std": 2.244311809539795, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 1.3446191549301147, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.928125023841858, "rewards/judge_reward/std": 1.5502880811691284, "rewards/ngrams_iou_reward/mean": 0.14422744512557983, "rewards/ngrams_iou_reward/std": 0.18203014135360718, "rewards/schema_keywords_iou_reward/mean": 0.6255298256874084, "rewards/schema_keywords_iou_reward/std": 0.1996641606092453, "rewards/syntax_reward/mean": 0.5729166865348816, "rewards/syntax_reward/std": 0.49594783782958984, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 145.1327362060547, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.3189143341815098, "frac_reward_zero_std": 0.0, "grad_norm": 1.2586008310317993, "kl": 0.027984619140625, "learning_rate": 9.999376947588285e-07, "loss": 0.0135, "num_tokens": 24298767.0, "reward": 9.023799896240234, "reward_std": 2.230459690093994, "rewards/accuracy_reward/mean": 1.125, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.6354166269302368, "rewards/judge_reward/std": 1.5938348770141602, "rewards/ngrams_iou_reward/mean": 0.10731035470962524, "rewards/ngrams_iou_reward/std": 0.16284945607185364, "rewards/schema_keywords_iou_reward/mean": 0.6091969013214111, "rewards/schema_keywords_iou_reward/std": 0.18734461069107056, "rewards/syntax_reward/mean": 0.7291666865348816, "rewards/syntax_reward/std": 0.44555196166038513, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 198.2604217529297, "completions/mean_terminated_length": 148.36892700195312, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.32230703986429177, "frac_reward_zero_std": 0.0, "grad_norm": 1.26724112033844, "kl": 0.030517578125, "learning_rate": 9.999026491979807e-07, "loss": 0.0192, "num_tokens": 24551501.0, "reward": 8.836078643798828, "reward_std": 1.7512801885604858, "rewards/accuracy_reward/mean": 0.921875, "rewards/accuracy_reward/std": 1.3877325057983398, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.8197916746139526, "rewards/judge_reward/std": 1.4578990936279297, "rewards/ngrams_iou_reward/mean": 0.12883242964744568, "rewards/ngrams_iou_reward/std": 0.17854827642440796, "rewards/schema_keywords_iou_reward/mean": 0.6405789256095886, "rewards/schema_keywords_iou_reward/std": 0.19198061525821686, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 188.80209350585938, "completions/mean_terminated_length": 151.10568237304688, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.3256997455470738, "frac_reward_zero_std": 0.0, "grad_norm": 1.1158562898635864, "kl": 0.029693603515625, "learning_rate": 9.99859816846739e-07, "loss": 0.0483, "num_tokens": 24796383.0, "reward": 8.803162574768066, "reward_std": 2.2742507457733154, "rewards/accuracy_reward/mean": 1.046875, "rewards/accuracy_reward/std": 1.433660626411438, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.6645832061767578, "rewards/judge_reward/std": 1.5501378774642944, "rewards/ngrams_iou_reward/mean": 0.11494836211204529, "rewards/ngrams_iou_reward/std": 0.1420796513557434, "rewards/schema_keywords_iou_reward/mean": 0.6194632649421692, "rewards/schema_keywords_iou_reward/std": 0.17779454588890076, "rewards/syntax_reward/mean": 0.6666666865348816, "rewards/syntax_reward/std": 0.4726369380950928, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 183.58334350585938, "completions/mean_terminated_length": 144.76800537109375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.3290924512298558, "frac_reward_zero_std": 0.03125, "grad_norm": 1.2957168817520142, "kl": 0.033203125, "learning_rate": 9.99809198372286e-07, "loss": 0.0556, "num_tokens": 25044121.0, "reward": 9.432010650634766, "reward_std": 2.0976369380950928, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.0541666746139526, "rewards/judge_reward/std": 1.3766881227493286, "rewards/ngrams_iou_reward/mean": 0.1662929207086563, "rewards/ngrams_iou_reward/std": 0.18636976182460785, "rewards/schema_keywords_iou_reward/mean": 0.6625922322273254, "rewards/schema_keywords_iou_reward/std": 0.20525327324867249, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 195.28125, "completions/mean_terminated_length": 153.73684692382812, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.3324851569126378, "frac_reward_zero_std": 0.03125, "grad_norm": 1.5475517511367798, "kl": 0.029876708984375, "learning_rate": 9.99750794563087e-07, "loss": 0.0301, "num_tokens": 25259503.0, "reward": 9.657742500305176, "reward_std": 1.543560266494751, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.2083333730697632, "rewards/judge_reward/std": 1.5199660062789917, "rewards/ngrams_iou_reward/mean": 0.18556584417819977, "rewards/ngrams_iou_reward/std": 0.21706312894821167, "rewards/schema_keywords_iou_reward/mean": 0.6753013134002686, "rewards/schema_keywords_iou_reward/std": 0.1868273913860321, "rewards/syntax_reward/mean": 0.7760416865348816, "rewards/syntax_reward/std": 0.41798436641693115, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.125, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 176.4479217529297, "completions/mean_terminated_length": 140.2878875732422, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.33587786259541985, "frac_reward_zero_std": 0.0, "grad_norm": 2.0475363731384277, "kl": 0.03668212890625, "learning_rate": 9.996846063288745e-07, "loss": 0.0077, "num_tokens": 25520415.0, "reward": 9.416318893432617, "reward_std": 1.727085828781128, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.3197916746139526, "rewards/judge_reward/std": 1.6048433780670166, "rewards/ngrams_iou_reward/mean": 0.15615005791187286, "rewards/ngrams_iou_reward/std": 0.21659539639949799, "rewards/schema_keywords_iou_reward/mean": 0.6362102031707764, "rewards/schema_keywords_iou_reward/std": 0.21927696466445923, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 191.84375, "completions/mean_terminated_length": 147.94737243652344, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.33927056827820185, "frac_reward_zero_std": 0.0, "grad_norm": 1.2524328231811523, "kl": 0.031494140625, "learning_rate": 9.996106347006379e-07, "loss": 0.0068, "num_tokens": 25785375.0, "reward": 8.603668212890625, "reward_std": 2.0441062450408936, "rewards/accuracy_reward/mean": 1.203125, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.8697916865348816, "rewards/format_reward/std": 0.3374122977256775, "rewards/judge_reward/mean": 1.3541666269302368, "rewards/judge_reward/std": 1.4833043813705444, "rewards/ngrams_iou_reward/mean": 0.11567280441522598, "rewards/ngrams_iou_reward/std": 0.1444907933473587, "rewards/schema_keywords_iou_reward/mean": 0.5921606421470642, "rewards/schema_keywords_iou_reward/std": 0.19117134809494019, "rewards/syntax_reward/mean": 0.7083333134651184, "rewards/syntax_reward/std": 0.4557179808616638, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 199.80209350585938, "completions/mean_terminated_length": 151.2427215576172, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.3426632739609839, "frac_reward_zero_std": 0.0, "grad_norm": 1.3043783903121948, "kl": 0.03082275390625, "learning_rate": 9.99528880830604e-07, "loss": -0.0099, "num_tokens": 26041285.0, "reward": 8.613112449645996, "reward_std": 2.063668727874756, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 1.3941725492477417, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.7177082300186157, "rewards/judge_reward/std": 1.5535465478897095, "rewards/ngrams_iou_reward/mean": 0.10391423851251602, "rewards/ngrams_iou_reward/std": 0.11147184669971466, "rewards/schema_keywords_iou_reward/mean": 0.6258634924888611, "rewards/schema_keywords_iou_reward/std": 0.16946756839752197, "rewards/syntax_reward/mean": 0.7395833134651184, "rewards/syntax_reward/std": 0.44000932574272156, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 203.56771850585938, "completions/mean_terminated_length": 155.3300018310547, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.3460559796437659, "frac_reward_zero_std": 0.0, "grad_norm": 1.7664121389389038, "kl": 0.031341552734375, "learning_rate": 9.994393459922216e-07, "loss": -0.0029, "num_tokens": 26272010.0, "reward": 8.858150482177734, "reward_std": 1.6212437152862549, "rewards/accuracy_reward/mean": 1.25, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.381250023841858, "rewards/judge_reward/std": 1.4836146831512451, "rewards/ngrams_iou_reward/mean": 0.11831009387969971, "rewards/ngrams_iou_reward/std": 0.16120223701000214, "rewards/schema_keywords_iou_reward/mean": 0.6335899233818054, "rewards/schema_keywords_iou_reward/std": 0.18448884785175323, "rewards/syntax_reward/mean": 0.6666666865348816, "rewards/syntax_reward/std": 0.4726369380950928, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 202.16146850585938, "completions/mean_terminated_length": 141.1444549560547, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.34944868532654794, "frac_reward_zero_std": 0.0, "grad_norm": 1.7952020168304443, "kl": 0.03363037109375, "learning_rate": 9.993420315801405e-07, "loss": 0.0044, "num_tokens": 26509869.0, "reward": 8.78827953338623, "reward_std": 1.7812350988388062, "rewards/accuracy_reward/mean": 1.25, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.3447917699813843, "rewards/judge_reward/std": 1.4969685077667236, "rewards/ngrams_iou_reward/mean": 0.10413641482591629, "rewards/ngrams_iou_reward/std": 0.11220812052488327, "rewards/schema_keywords_iou_reward/mean": 0.6195592284202576, "rewards/schema_keywords_iou_reward/std": 0.1970679759979248, "rewards/syntax_reward/mean": 0.7135416865348816, "rewards/syntax_reward/std": 0.45328810811042786, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 215.70834350585938, "completions/mean_terminated_length": 143.88406372070312, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.35284139100932993, "frac_reward_zero_std": 0.0, "grad_norm": 1.591999888420105, "kl": 0.03399658203125, "learning_rate": 9.992369391101894e-07, "loss": 0.0198, "num_tokens": 26779669.0, "reward": 8.405960083007812, "reward_std": 1.9206275939941406, "rewards/accuracy_reward/mean": 1.015625, "rewards/accuracy_reward/std": 1.423352599143982, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.4968751668930054, "rewards/judge_reward/std": 1.5064226388931274, "rewards/ngrams_iou_reward/mean": 0.13421334326267242, "rewards/ngrams_iou_reward/std": 0.19022606313228607, "rewards/schema_keywords_iou_reward/mean": 0.6009133458137512, "rewards/schema_keywords_iou_reward/std": 0.21455951035022736, "rewards/syntax_reward/mean": 0.7291666865348816, "rewards/syntax_reward/std": 0.44555196166038513, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 215.9791717529297, "completions/mean_terminated_length": 141.3134307861328, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.356234096692112, "frac_reward_zero_std": 0.0, "grad_norm": 1.5926222801208496, "kl": 0.04345703125, "learning_rate": 9.991240702193531e-07, "loss": 0.0206, "num_tokens": 27028857.0, "reward": 9.503074645996094, "reward_std": 2.013258457183838, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.4500001668930054, "rewards/judge_reward/std": 1.633292317390442, "rewards/ngrams_iou_reward/mean": 0.16259709000587463, "rewards/ngrams_iou_reward/std": 0.20717167854309082, "rewards/schema_keywords_iou_reward/mean": 0.6644356846809387, "rewards/schema_keywords_iou_reward/std": 0.198873832821846, "rewards/syntax_reward/mean": 0.734375, "rewards/syntax_reward/std": 0.44282010197639465, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.72396850585938, "completions/mean_terminated_length": 140.5192413330078, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.359626802374894, "frac_reward_zero_std": 0.0, "grad_norm": 1.5283708572387695, "kl": 0.030975341796875, "learning_rate": 9.990034266657467e-07, "loss": -0.0118, "num_tokens": 27300898.0, "reward": 8.739726066589355, "reward_std": 1.8685444593429565, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.5374999046325684, "rewards/judge_reward/std": 1.5391062498092651, "rewards/ngrams_iou_reward/mean": 0.13817442953586578, "rewards/ngrams_iou_reward/std": 0.19757206737995148, "rewards/schema_keywords_iou_reward/mean": 0.6098842620849609, "rewards/schema_keywords_iou_reward/std": 0.21345488727092743, "rewards/syntax_reward/mean": 0.7239583134651184, "rewards/syntax_reward/std": 0.4482063353061676, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.4947967529297, "completions/mean_terminated_length": 151.0465087890625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.363019508057676, "frac_reward_zero_std": 0.0, "grad_norm": 1.3675175905227661, "kl": 0.028472900390625, "learning_rate": 9.98875010328588e-07, "loss": 0.0179, "num_tokens": 27556971.0, "reward": 8.73992919921875, "reward_std": 2.1390185356140137, "rewards/accuracy_reward/mean": 1.140625, "rewards/accuracy_reward/std": 1.4601210355758667, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.527083396911621, "rewards/judge_reward/std": 1.551387071609497, "rewards/ngrams_iou_reward/mean": 0.1161474660038948, "rewards/ngrams_iou_reward/std": 0.1588987410068512, "rewards/schema_keywords_iou_reward/mean": 0.6216978430747986, "rewards/schema_keywords_iou_reward/std": 0.21331806480884552, "rewards/syntax_reward/mean": 0.625, "rewards/syntax_reward/std": 0.4853885769844055, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 230.859375, "completions/mean_terminated_length": 148.73333740234375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.366412213740458, "frac_reward_zero_std": 0.0, "grad_norm": 1.1999385356903076, "kl": 0.026153564453125, "learning_rate": 9.987388232081693e-07, "loss": 0.0217, "num_tokens": 27812340.0, "reward": 9.564772605895996, "reward_std": 2.1631131172180176, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.5031250715255737, "rewards/judge_reward/std": 1.6721903085708618, "rewards/ngrams_iou_reward/mean": 0.1771235316991806, "rewards/ngrams_iou_reward/std": 0.21547582745552063, "rewards/schema_keywords_iou_reward/mean": 0.6730648875236511, "rewards/schema_keywords_iou_reward/std": 0.2235236018896103, "rewards/syntax_reward/mean": 0.6614583134651184, "rewards/syntax_reward/std": 0.47445085644721985, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 233.19271850585938, "completions/mean_terminated_length": 149.1951141357422, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.36980491942324, "frac_reward_zero_std": 0.0, "grad_norm": 1.0344549417495728, "kl": 0.027679443359375, "learning_rate": 9.985948674258242e-07, "loss": 0.0166, "num_tokens": 28070875.0, "reward": 8.88571834564209, "reward_std": 2.168470859527588, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.8697916865348816, "rewards/format_reward/std": 0.3374122977256775, "rewards/judge_reward/mean": 1.4583333730697632, "rewards/judge_reward/std": 1.5987995862960815, "rewards/ngrams_iou_reward/mean": 0.1359727382659912, "rewards/ngrams_iou_reward/std": 0.1569548100233078, "rewards/schema_keywords_iou_reward/mean": 0.6351618766784668, "rewards/schema_keywords_iou_reward/std": 0.2122126668691635, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 225.91146850585938, "completions/mean_terminated_length": 149.01852416992188, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.37319762510602206, "frac_reward_zero_std": 0.0, "grad_norm": 0.9545531272888184, "kl": 0.02691650390625, "learning_rate": 9.984431452238966e-07, "loss": 0.0168, "num_tokens": 28334408.0, "reward": 8.519354820251465, "reward_std": 2.0655031204223633, "rewards/accuracy_reward/mean": 1.078125, "rewards/accuracy_reward/std": 1.4432151317596436, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.4822916984558105, "rewards/judge_reward/std": 1.5241507291793823, "rewards/ngrams_iou_reward/mean": 0.12170889973640442, "rewards/ngrams_iou_reward/std": 0.1417846977710724, "rewards/schema_keywords_iou_reward/mean": 0.6205617785453796, "rewards/schema_keywords_iou_reward/std": 0.20419307053089142, "rewards/syntax_reward/mean": 0.6614583134651184, "rewards/syntax_reward/std": 0.47445085644721985, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 232.0885467529297, "completions/mean_terminated_length": 149.23255920410156, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.37659033078880405, "frac_reward_zero_std": 0.0, "grad_norm": 0.9369682669639587, "kl": 0.02691650390625, "learning_rate": 9.982836589657042e-07, "loss": 0.0074, "num_tokens": 28581787.0, "reward": 9.075246810913086, "reward_std": 2.080397129058838, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.3770833015441895, "rewards/judge_reward/std": 1.4851077795028687, "rewards/ngrams_iou_reward/mean": 0.1039457693696022, "rewards/ngrams_iou_reward/std": 0.06860649585723877, "rewards/schema_keywords_iou_reward/mean": 0.6442177295684814, "rewards/schema_keywords_iou_reward/std": 0.19168974459171295, "rewards/syntax_reward/mean": 0.7239583134651184, "rewards/syntax_reward/std": 0.4482063353061676, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 225.70834350585938, "completions/mean_terminated_length": 139.67999267578125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.3799830364715861, "frac_reward_zero_std": 0.0, "grad_norm": 1.0761561393737793, "kl": 0.02752685546875, "learning_rate": 9.981164111355034e-07, "loss": 0.0075, "num_tokens": 28815917.0, "reward": 8.943937301635742, "reward_std": 2.2974190711975098, "rewards/accuracy_reward/mean": 1.25, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.396875023841858, "rewards/judge_reward/std": 1.5223950147628784, "rewards/ngrams_iou_reward/mean": 0.13540996611118317, "rewards/ngrams_iou_reward/std": 0.1877364069223404, "rewards/schema_keywords_iou_reward/mean": 0.6710266470909119, "rewards/schema_keywords_iou_reward/std": 0.18709926307201385, "rewards/syntax_reward/mean": 0.6822916865348816, "rewards/syntax_reward/std": 0.46680256724357605, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 215.390625, "completions/mean_terminated_length": 141.33824157714844, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.3833757421543681, "frac_reward_zero_std": 0.0, "grad_norm": 1.3484869003295898, "kl": 0.025360107421875, "learning_rate": 9.979414043384484e-07, "loss": 0.0041, "num_tokens": 29053874.0, "reward": 9.399392127990723, "reward_std": 1.6029624938964844, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.506250023841858, "rewards/judge_reward/std": 1.604170799255371, "rewards/ngrams_iou_reward/mean": 0.14179575443267822, "rewards/ngrams_iou_reward/std": 0.17528758943080902, "rewards/schema_keywords_iou_reward/mean": 0.6825960278511047, "rewards/schema_keywords_iou_reward/std": 0.17877595126628876, "rewards/syntax_reward/mean": 0.71875, "rewards/syntax_reward/std": 0.4507846534252167, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 206.578125, "completions/mean_terminated_length": 140.28048706054688, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.38676844783715014, "frac_reward_zero_std": 0.0, "grad_norm": 1.1849943399429321, "kl": 0.029388427734375, "learning_rate": 9.97758641300553e-07, "loss": 0.0098, "num_tokens": 29328527.0, "reward": 8.987627983093262, "reward_std": 1.6627382040023804, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.3041666746139526, "rewards/judge_reward/std": 1.436394214630127, "rewards/ngrams_iou_reward/mean": 0.17125193774700165, "rewards/ngrams_iou_reward/std": 0.21042759716510773, "rewards/schema_keywords_iou_reward/mean": 0.6767921447753906, "rewards/schema_keywords_iou_reward/std": 0.2020070105791092, "rewards/syntax_reward/mean": 0.7291666865348816, "rewards/syntax_reward/std": 0.44555196166038513, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 202.859375, "completions/mean_terminated_length": 147.4574432373047, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.39016115351993214, "frac_reward_zero_std": 0.0, "grad_norm": 0.9749577045440674, "kl": 0.032012939453125, "learning_rate": 9.97568124868646e-07, "loss": -0.0136, "num_tokens": 29590430.0, "reward": 8.68874740600586, "reward_std": 1.7074183225631714, "rewards/accuracy_reward/mean": 1.125, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.4520832300186157, "rewards/judge_reward/std": 1.4914048910140991, "rewards/ngrams_iou_reward/mean": 0.1264796406030655, "rewards/ngrams_iou_reward/std": 0.16040536761283875, "rewards/schema_keywords_iou_reward/mean": 0.6424763202667236, "rewards/schema_keywords_iou_reward/std": 0.18062256276607513, "rewards/syntax_reward/mean": 0.6822916865348816, "rewards/syntax_reward/std": 0.46680256724357605, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 213.53646850585938, "completions/mean_terminated_length": 148.7236785888672, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.3935538592027142, "frac_reward_zero_std": 0.0, "grad_norm": 1.026920199394226, "kl": 0.03045654296875, "learning_rate": 9.973698580103284e-07, "loss": 0.0053, "num_tokens": 29856495.0, "reward": 8.815619468688965, "reward_std": 2.135012149810791, "rewards/accuracy_reward/mean": 1.046875, "rewards/accuracy_reward/std": 1.433660626411438, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.649999976158142, "rewards/judge_reward/std": 1.5802444219589233, "rewards/ngrams_iou_reward/mean": 0.13234633207321167, "rewards/ngrams_iou_reward/std": 0.13129393756389618, "rewards/schema_keywords_iou_reward/mean": 0.6436898112297058, "rewards/schema_keywords_iou_reward/std": 0.18491695821285248, "rewards/syntax_reward/mean": 0.703125, "rewards/syntax_reward/std": 0.4580754339694977, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 221.6666717529297, "completions/mean_terminated_length": 154.58460998535156, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.3969465648854962, "frac_reward_zero_std": 0.0, "grad_norm": 1.117220163345337, "kl": 0.032806396484375, "learning_rate": 9.971638438139264e-07, "loss": 0.0073, "num_tokens": 30102791.0, "reward": 9.108315467834473, "reward_std": 1.706704020500183, "rewards/accuracy_reward/mean": 1.265625, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.4479166269302368, "rewards/judge_reward/std": 1.5174363851547241, "rewards/ngrams_iou_reward/mean": 0.10693174600601196, "rewards/ngrams_iou_reward/std": 0.11004293709993362, "rewards/schema_keywords_iou_reward/mean": 0.6524251103401184, "rewards/schema_keywords_iou_reward/std": 0.1898295283317566, "rewards/syntax_reward/mean": 0.7239583134651184, "rewards/syntax_reward/std": 0.4482063353061676, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 224.50521850585938, "completions/mean_terminated_length": 146.05453491210938, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.4003392705682782, "frac_reward_zero_std": 0.0, "grad_norm": 0.7699753046035767, "kl": 0.033935546875, "learning_rate": 9.96950085488444e-07, "loss": 0.0369, "num_tokens": 30348816.0, "reward": 9.297576904296875, "reward_std": 2.2477285861968994, "rewards/accuracy_reward/mean": 1.328125, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.4479166269302368, "rewards/judge_reward/std": 1.5711387395858765, "rewards/ngrams_iou_reward/mean": 0.1243731677532196, "rewards/ngrams_iou_reward/std": 0.1604512482881546, "rewards/schema_keywords_iou_reward/mean": 0.621119499206543, "rewards/schema_keywords_iou_reward/std": 0.20725414156913757, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 233.05209350585938, "completions/mean_terminated_length": 166.08163452148438, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.4037319762510602, "frac_reward_zero_std": 0.0, "grad_norm": 1.1451362371444702, "kl": 0.029449462890625, "learning_rate": 9.96728586363511e-07, "loss": 0.0059, "num_tokens": 30592480.0, "reward": 8.687209129333496, "reward_std": 1.739615797996521, "rewards/accuracy_reward/mean": 1.03125, "rewards/accuracy_reward/std": 1.4286017417907715, "rewards/format_reward/mean": 0.8854166865348816, "rewards/format_reward/std": 0.3193511664867401, "rewards/judge_reward/mean": 1.6416667699813843, "rewards/judge_reward/std": 1.5959153175354004, "rewards/ngrams_iou_reward/mean": 0.1182062104344368, "rewards/ngrams_iou_reward/std": 0.12350776791572571, "rewards/schema_keywords_iou_reward/mean": 0.6294189095497131, "rewards/schema_keywords_iou_reward/std": 0.19279614090919495, "rewards/syntax_reward/mean": 0.6770833134651184, "rewards/syntax_reward/std": 0.46881362795829773, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 219.9010467529297, "completions/mean_terminated_length": 154.0735321044922, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.4071246819338422, "frac_reward_zero_std": 0.0, "grad_norm": 0.8644776344299316, "kl": 0.034027099609375, "learning_rate": 9.964993498893348e-07, "loss": 0.0059, "num_tokens": 30837417.0, "reward": 8.918764114379883, "reward_std": 1.8430927991867065, "rewards/accuracy_reward/mean": 1.25, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.370833396911621, "rewards/judge_reward/std": 1.5481058359146118, "rewards/ngrams_iou_reward/mean": 0.13339103758335114, "rewards/ngrams_iou_reward/std": 0.1483432501554489, "rewards/schema_keywords_iou_reward/mean": 0.6322475075721741, "rewards/schema_keywords_iou_reward/std": 0.19139984250068665, "rewards/syntax_reward/mean": 0.7395833134651184, "rewards/syntax_reward/std": 0.44000932574272156, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 225.4322967529297, "completions/mean_terminated_length": 156.52542114257812, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.41051738761662426, "frac_reward_zero_std": 0.0, "grad_norm": 0.8672334551811218, "kl": 0.034393310546875, "learning_rate": 9.962623796366428e-07, "loss": 0.0045, "num_tokens": 31106570.0, "reward": 9.592226028442383, "reward_std": 1.8446944952011108, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.394791603088379, "rewards/judge_reward/std": 1.570464849472046, "rewards/ngrams_iou_reward/mean": 0.13030122220516205, "rewards/ngrams_iou_reward/std": 0.13681358098983765, "rewards/schema_keywords_iou_reward/mean": 0.6983821988105774, "rewards/schema_keywords_iou_reward/std": 0.18164846301078796, "rewards/syntax_reward/mean": 0.7760416865348816, "rewards/syntax_reward/std": 0.41798436641693115, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 223.53646850585938, "completions/mean_terminated_length": 157.06350708007812, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.41391009329940626, "frac_reward_zero_std": 0.0, "grad_norm": 1.3120713233947754, "kl": 0.03863525390625, "learning_rate": 9.960176792966288e-07, "loss": -0.0041, "num_tokens": 31366401.0, "reward": 8.937124252319336, "reward_std": 1.7642138004302979, "rewards/accuracy_reward/mean": 1.109375, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.29938673973083496, "rewards/judge_reward/mean": 1.6437500715255737, "rewards/judge_reward/std": 1.553567886352539, "rewards/ngrams_iou_reward/mean": 0.12527552247047424, "rewards/ngrams_iou_reward/std": 0.14191488921642303, "rewards/schema_keywords_iou_reward/mean": 0.6389312148094177, "rewards/schema_keywords_iou_reward/std": 0.20471309125423431, "rewards/syntax_reward/mean": 0.65625, "rewards/syntax_reward/std": 0.47620058059692383, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.4947967529297, "completions/mean_terminated_length": 153.88890075683594, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.4173027989821883, "frac_reward_zero_std": 0.0, "grad_norm": 1.1662062406539917, "kl": 0.036224365234375, "learning_rate": 9.95765252680896e-07, "loss": 0.0188, "num_tokens": 31627718.0, "reward": 8.708475112915039, "reward_std": 2.13387393951416, "rewards/accuracy_reward/mean": 0.953125, "rewards/accuracy_reward/std": 1.4004077911376953, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.7708333730697632, "rewards/judge_reward/std": 1.6599953174591064, "rewards/ngrams_iou_reward/mean": 0.13697449862957, "rewards/ngrams_iou_reward/std": 0.15745113790035248, "rewards/schema_keywords_iou_reward/mean": 0.6339996457099915, "rewards/schema_keywords_iou_reward/std": 0.20645475387573242, "rewards/syntax_reward/mean": 0.6354166865348816, "rewards/syntax_reward/std": 0.48257145285606384, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 220.64584350585938, "completions/mean_terminated_length": 160.3943634033203, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.4206955046649703, "frac_reward_zero_std": 0.0, "grad_norm": 1.138081669807434, "kl": 0.0330810546875, "learning_rate": 9.95505103721396e-07, "loss": 0.0343, "num_tokens": 31882020.0, "reward": 9.038778305053711, "reward_std": 1.7085990905761719, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.271875023841858, "rewards/judge_reward/std": 1.4769575595855713, "rewards/ngrams_iou_reward/mean": 0.1452450156211853, "rewards/ngrams_iou_reward/std": 0.18729373812675476, "rewards/schema_keywords_iou_reward/mean": 0.6414492130279541, "rewards/schema_keywords_iou_reward/std": 0.19655923545360565, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.0416717529297, "completions/mean_terminated_length": 161.42465209960938, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.42408821034775235, "frac_reward_zero_std": 0.0, "grad_norm": 1.0036427974700928, "kl": 0.0364990234375, "learning_rate": 9.952372364703686e-07, "loss": 0.0167, "num_tokens": 32163812.0, "reward": 8.357007026672363, "reward_std": 1.887357473373413, "rewards/accuracy_reward/mean": 1.125, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.3229166269302368, "rewards/judge_reward/std": 1.521054983139038, "rewards/ngrams_iou_reward/mean": 0.1306840032339096, "rewards/ngrams_iou_reward/std": 0.18012607097625732, "rewards/schema_keywords_iou_reward/mean": 0.6429886221885681, "rewards/schema_keywords_iou_reward/std": 0.20801670849323273, "rewards/syntax_reward/mean": 0.6666666865348816, "rewards/syntax_reward/std": 0.4726369380950928, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 214.23959350585938, "completions/mean_terminated_length": 153.2051239013672, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.42748091603053434, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9853479266166687, "kl": 0.0361328125, "learning_rate": 9.949616551002785e-07, "loss": 0.0019, "num_tokens": 32416338.0, "reward": 8.492802619934082, "reward_std": 1.9030383825302124, "rewards/accuracy_reward/mean": 0.921875, "rewards/accuracy_reward/std": 1.3877325057983398, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.6302083730697632, "rewards/judge_reward/std": 1.4791125059127808, "rewards/ngrams_iou_reward/mean": 0.1396193504333496, "rewards/ngrams_iou_reward/std": 0.18210646510124207, "rewards/schema_keywords_iou_reward/mean": 0.6396409869194031, "rewards/schema_keywords_iou_reward/std": 0.17855805158615112, "rewards/syntax_reward/mean": 0.7291666865348816, "rewards/syntax_reward/std": 0.44555196166038513, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 223.08334350585938, "completions/mean_terminated_length": 160.242431640625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.4308736217133164, "frac_reward_zero_std": 0.0, "grad_norm": 1.1119533777236938, "kl": 0.0364990234375, "learning_rate": 9.946783639037502e-07, "loss": -0.0199, "num_tokens": 32688124.0, "reward": 7.995141983032227, "reward_std": 2.0468311309814453, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 1.3523539304733276, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.564583420753479, "rewards/judge_reward/std": 1.455372929573059, "rewards/ngrams_iou_reward/mean": 0.14050129055976868, "rewards/ngrams_iou_reward/std": 0.177481546998024, "rewards/schema_keywords_iou_reward/mean": 0.6213073134422302, "rewards/schema_keywords_iou_reward/std": 0.20509465038776398, "rewards/syntax_reward/mean": 0.6770833134651184, "rewards/syntax_reward/std": 0.46881362795829773, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 211.265625, "completions/mean_terminated_length": 156.12789916992188, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.4342663273960984, "frac_reward_zero_std": 0.0, "grad_norm": 0.9184350371360779, "kl": 0.0338134765625, "learning_rate": 9.943873672935013e-07, "loss": 0.0062, "num_tokens": 32936707.0, "reward": 8.48525619506836, "reward_std": 2.4251716136932373, "rewards/accuracy_reward/mean": 1.125, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.3427084684371948, "rewards/judge_reward/std": 1.4398401975631714, "rewards/ngrams_iou_reward/mean": 0.14757853746414185, "rewards/ngrams_iou_reward/std": 0.15034909546375275, "rewards/schema_keywords_iou_reward/mean": 0.6314274668693542, "rewards/schema_keywords_iou_reward/std": 0.1947576403617859, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 203.5260467529297, "completions/mean_terminated_length": 142.79776000976562, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.43765903307888043, "frac_reward_zero_std": 0.0, "grad_norm": 1.0294996500015259, "kl": 0.038787841796875, "learning_rate": 9.940886698022732e-07, "loss": -0.0146, "num_tokens": 33177996.0, "reward": 9.340060234069824, "reward_std": 1.7197034358978271, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.8854166865348816, "rewards/format_reward/std": 0.3193511664867401, "rewards/judge_reward/mean": 1.5135416984558105, "rewards/judge_reward/std": 1.6405285596847534, "rewards/ngrams_iou_reward/mean": 0.181673064827919, "rewards/ngrams_iou_reward/std": 0.228786438703537, "rewards/schema_keywords_iou_reward/mean": 0.6625531315803528, "rewards/schema_keywords_iou_reward/std": 0.22379924356937408, "rewards/syntax_reward/mean": 0.7395833134651184, "rewards/syntax_reward/std": 0.44000929594039917, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 214.08334350585938, "completions/mean_terminated_length": 156.64198303222656, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.4410517387616624, "frac_reward_zero_std": 0.0, "grad_norm": 0.9597503542900085, "kl": 0.03826904296875, "learning_rate": 9.937822760827619e-07, "loss": 0.0325, "num_tokens": 33428134.0, "reward": 9.418458938598633, "reward_std": 1.6494773626327515, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.4614583253860474, "rewards/judge_reward/std": 1.5569381713867188, "rewards/ngrams_iou_reward/mean": 0.15580053627490997, "rewards/ngrams_iou_reward/std": 0.17550411820411682, "rewards/schema_keywords_iou_reward/mean": 0.6939077377319336, "rewards/schema_keywords_iou_reward/std": 0.18212437629699707, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.3590256869792938, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.5260467529297, "completions/mean_terminated_length": 165.32876586914062, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.4444444444444444, "frac_reward_zero_std": 0.0, "grad_norm": 1.0778627395629883, "kl": 0.03692626953125, "learning_rate": 9.934681909075434e-07, "loss": 0.0237, "num_tokens": 33669075.0, "reward": 9.002737045288086, "reward_std": 2.0724892616271973, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3485431373119354, "rewards/judge_reward/mean": 1.5697916746139526, "rewards/judge_reward/std": 1.657009243965149, "rewards/ngrams_iou_reward/mean": 0.13462816178798676, "rewards/ngrams_iou_reward/std": 0.1242605447769165, "rewards/schema_keywords_iou_reward/mean": 0.6712333559989929, "rewards/schema_keywords_iou_reward/std": 0.18104752898216248, "rewards/syntax_reward/mean": 0.6354166865348816, "rewards/syntax_reward/std": 0.48257145285606384, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 221.03125, "completions/mean_terminated_length": 172.0749969482422, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.44783715012722647, "frac_reward_zero_std": 0.0, "grad_norm": 1.1547516584396362, "kl": 0.0386962890625, "learning_rate": 9.931464191690013e-07, "loss": 0.0184, "num_tokens": 33930081.0, "reward": 9.485767364501953, "reward_std": 1.6920170783996582, "rewards/accuracy_reward/mean": 1.109375, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.8645833730697632, "rewards/judge_reward/std": 1.6964377164840698, "rewards/ngrams_iou_reward/mean": 0.13738635182380676, "rewards/ngrams_iou_reward/std": 0.1334318369626999, "rewards/schema_keywords_iou_reward/mean": 0.6869214177131653, "rewards/schema_keywords_iou_reward/std": 0.20294521749019623, "rewards/syntax_reward/mean": 0.6927083134651184, "rewards/syntax_reward/std": 0.4625774919986725, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 214.19271850585938, "completions/mean_terminated_length": 159.28915405273438, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.45122985581000846, "frac_reward_zero_std": 0.0, "grad_norm": 0.9439511895179749, "kl": 0.03436279296875, "learning_rate": 9.928169658792497e-07, "loss": 0.0042, "num_tokens": 34207096.0, "reward": 9.668974876403809, "reward_std": 1.8748548030853271, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.2572916746139526, "rewards/judge_reward/std": 1.5274962186813354, "rewards/ngrams_iou_reward/mean": 0.1696424037218094, "rewards/ngrams_iou_reward/std": 0.19666443765163422, "rewards/schema_keywords_iou_reward/mean": 0.6670398116111755, "rewards/schema_keywords_iou_reward/std": 0.222762331366539, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.75, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 215.38021850585938, "completions/mean_terminated_length": 167.375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.4546225614927905, "frac_reward_zero_std": 0.03125, "grad_norm": 1.2277485132217407, "kl": 0.03790283203125, "learning_rate": 9.924798361700554e-07, "loss": 0.0158, "num_tokens": 34439861.0, "reward": 9.694846153259277, "reward_std": 1.8336138725280762, "rewards/accuracy_reward/mean": 1.25, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.740625023841858, "rewards/judge_reward/std": 1.705554723739624, "rewards/ngrams_iou_reward/mean": 0.1638404130935669, "rewards/ngrams_iou_reward/std": 0.20139159262180328, "rewards/schema_keywords_iou_reward/mean": 0.6955888271331787, "rewards/schema_keywords_iou_reward/std": 0.19162894785404205, "rewards/syntax_reward/mean": 0.6927083134651184, "rewards/syntax_reward/std": 0.4625774919986725, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 219.125, "completions/mean_terminated_length": 165.23077392578125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.4580152671755725, "frac_reward_zero_std": 0.0, "grad_norm": 0.9133231043815613, "kl": 0.0389404296875, "learning_rate": 9.92135035292757e-07, "loss": 0.0359, "num_tokens": 34693469.0, "reward": 9.01561164855957, "reward_std": 1.686680555343628, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 1.3941725492477417, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3485431373119354, "rewards/judge_reward/mean": 1.935416579246521, "rewards/judge_reward/std": 1.6199009418487549, "rewards/ngrams_iou_reward/mean": 0.14519019424915314, "rewards/ngrams_iou_reward/std": 0.1441095471382141, "rewards/schema_keywords_iou_reward/mean": 0.6662535667419434, "rewards/schema_keywords_iou_reward/std": 0.1982613503932953, "rewards/syntax_reward/mean": 0.6614583134651184, "rewards/syntax_reward/std": 0.47445085644721985, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 209.9947967529297, "completions/mean_terminated_length": 154.4712677001953, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.46140797285835455, "frac_reward_zero_std": 0.0, "grad_norm": 0.9920307993888855, "kl": 0.03521728515625, "learning_rate": 9.917825686181848e-07, "loss": -0.0035, "num_tokens": 34931494.0, "reward": 9.852813720703125, "reward_std": 1.880568504333496, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.3406249284744263, "rewards/judge_reward/std": 1.6645410060882568, "rewards/ngrams_iou_reward/mean": 0.16863436996936798, "rewards/ngrams_iou_reward/std": 0.2019926905632019, "rewards/schema_keywords_iou_reward/mean": 0.6591787338256836, "rewards/schema_keywords_iou_reward/std": 0.20960189402103424, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 211.89584350585938, "completions/mean_terminated_length": 151.456787109375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.46480067854113655, "frac_reward_zero_std": 0.0, "grad_norm": 1.324090600013733, "kl": 0.04022216796875, "learning_rate": 9.914224416365763e-07, "loss": 0.0402, "num_tokens": 35180474.0, "reward": 9.137462615966797, "reward_std": 1.7776144742965698, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.8802083134651184, "rewards/format_reward/std": 0.32556667923927307, "rewards/judge_reward/mean": 1.350000023841858, "rewards/judge_reward/std": 1.573737382888794, "rewards/ngrams_iou_reward/mean": 0.11828819662332535, "rewards/ngrams_iou_reward/std": 0.1159617081284523, "rewards/schema_keywords_iou_reward/mean": 0.6316747665405273, "rewards/schema_keywords_iou_reward/std": 0.21233566105365753, "rewards/syntax_reward/mean": 0.7760416865348816, "rewards/syntax_reward/std": 0.41798436641693115, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.75, "completions/max_length": 256.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 212.609375, "completions/mean_terminated_length": 161.3295440673828, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.4681933842239186, "frac_reward_zero_std": 0.0, "grad_norm": 0.9504607319831848, "kl": 0.04132080078125, "learning_rate": 9.910546599574902e-07, "loss": 0.0005, "num_tokens": 35452739.0, "reward": 8.684114456176758, "reward_std": 1.8147348165512085, "rewards/accuracy_reward/mean": 1.125, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.8541666865348816, "rewards/format_reward/std": 0.3538617491722107, "rewards/judge_reward/mean": 1.4666666984558105, "rewards/judge_reward/std": 1.4933476448059082, "rewards/ngrams_iou_reward/mean": 0.1439206302165985, "rewards/ngrams_iou_reward/std": 0.1620098501443863, "rewards/schema_keywords_iou_reward/mean": 0.6329022645950317, "rewards/schema_keywords_iou_reward/std": 0.21750685572624207, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 203.0572967529297, "completions/mean_terminated_length": 153.32322692871094, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.4715860899067006, "frac_reward_zero_std": 0.0, "grad_norm": 1.0229545831680298, "kl": 0.0404052734375, "learning_rate": 9.906792293097193e-07, "loss": 0.0206, "num_tokens": 35705434.0, "reward": 8.350381851196289, "reward_std": 1.8633331060409546, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 1.3200279474258423, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36404144763946533, "rewards/judge_reward/mean": 1.8666666746139526, "rewards/judge_reward/std": 1.5100823640823364, "rewards/ngrams_iou_reward/mean": 0.1355983167886734, "rewards/ngrams_iou_reward/std": 0.18061839044094086, "rewards/schema_keywords_iou_reward/mean": 0.6064493656158447, "rewards/schema_keywords_iou_reward/std": 0.21988485753536224, "rewards/syntax_reward/mean": 0.6875, "rewards/syntax_reward/std": 0.46472418308258057, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.22396850585938, "completions/mean_terminated_length": 170.6702117919922, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.47497879558948264, "frac_reward_zero_std": 0.0, "grad_norm": 1.2659803628921509, "kl": 0.04461669921875, "learning_rate": 9.90296155541202e-07, "loss": 0.0163, "num_tokens": 35978375.0, "reward": 8.380086898803711, "reward_std": 2.257727861404419, "rewards/accuracy_reward/mean": 1.03125, "rewards/accuracy_reward/std": 1.4286017417907715, "rewards/format_reward/mean": 0.8489583134651184, "rewards/format_reward/std": 0.35902565717697144, "rewards/judge_reward/mean": 1.515625, "rewards/judge_reward/std": 1.4894272089004517, "rewards/ngrams_iou_reward/mean": 0.12148982286453247, "rewards/ngrams_iou_reward/std": 0.1146501749753952, "rewards/schema_keywords_iou_reward/mean": 0.6179704666137695, "rewards/schema_keywords_iou_reward/std": 0.21819579601287842, "rewards/syntax_reward/mean": 0.6666666865348816, "rewards/syntax_reward/std": 0.4726369380950928, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 200.46875, "completions/mean_terminated_length": 154.4571533203125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.47837150127226463, "frac_reward_zero_std": 0.0, "grad_norm": 0.9275773763656616, "kl": 0.04095458984375, "learning_rate": 9.899054446189302e-07, "loss": 0.0329, "num_tokens": 36219827.0, "reward": 9.423248291015625, "reward_std": 1.9732422828674316, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.2885416746139526, "rewards/judge_reward/std": 1.5313389301300049, "rewards/ngrams_iou_reward/mean": 0.14970558881759644, "rewards/ngrams_iou_reward/std": 0.17358623445034027, "rewards/schema_keywords_iou_reward/mean": 0.659999668598175, "rewards/schema_keywords_iou_reward/std": 0.21371114253997803, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 193.71875, "completions/mean_terminated_length": 152.91378784179688, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.4817642069550466, "frac_reward_zero_std": 0.0, "grad_norm": 0.995588481426239, "kl": 0.041748046875, "learning_rate": 9.895071026288573e-07, "loss": 0.025, "num_tokens": 36471449.0, "reward": 9.592098236083984, "reward_std": 1.737514853477478, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.551041603088379, "rewards/judge_reward/std": 1.6145603656768799, "rewards/ngrams_iou_reward/mean": 0.14934654533863068, "rewards/ngrams_iou_reward/std": 0.19054833054542542, "rewards/schema_keywords_iou_reward/mean": 0.6531679034233093, "rewards/schema_keywords_iou_reward/std": 0.18982219696044922, "rewards/syntax_reward/mean": 0.7135416865348816, "rewards/syntax_reward/std": 0.45328810811042786, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 195.859375, "completions/mean_terminated_length": 162.1219482421875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.4851569126378287, "frac_reward_zero_std": 0.0, "grad_norm": 0.9743634462356567, "kl": 0.04498291015625, "learning_rate": 9.89101135775802e-07, "loss": 0.0324, "num_tokens": 36709406.0, "reward": 9.711942672729492, "reward_std": 1.7895591259002686, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.4708333015441895, "rewards/judge_reward/std": 1.6076984405517578, "rewards/ngrams_iou_reward/mean": 0.13330072164535522, "rewards/ngrams_iou_reward/std": 0.13991059362888336, "rewards/schema_keywords_iou_reward/mean": 0.673431932926178, "rewards/schema_keywords_iou_reward/std": 0.17854741215705872, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 189.94271850585938, "completions/mean_terminated_length": 162.05184936523438, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.48854961832061067, "frac_reward_zero_std": 0.0, "grad_norm": 0.9263571500778198, "kl": 0.044677734375, "learning_rate": 9.886875503833537e-07, "loss": 0.0532, "num_tokens": 36960147.0, "reward": 9.668159484863281, "reward_std": 2.123537540435791, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.1614583730697632, "rewards/judge_reward/std": 1.5633141994476318, "rewards/ngrams_iou_reward/mean": 0.14315150678157806, "rewards/ngrams_iou_reward/std": 0.13584141433238983, "rewards/schema_keywords_iou_reward/mean": 0.6708411574363708, "rewards/schema_keywords_iou_reward/std": 0.19129659235477448, "rewards/syntax_reward/mean": 0.7604166865348816, "rewards/syntax_reward/std": 0.427945077419281, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 186.86459350585938, "completions/mean_terminated_length": 154.6717529296875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.4919423240033927, "frac_reward_zero_std": 0.0, "grad_norm": 1.408432960510254, "kl": 0.048583984375, "learning_rate": 9.882663528937716e-07, "loss": 0.0209, "num_tokens": 37235125.0, "reward": 9.196722030639648, "reward_std": 2.2291862964630127, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33158352971076965, "rewards/judge_reward/mean": 1.426041603088379, "rewards/judge_reward/std": 1.54009211063385, "rewards/ngrams_iou_reward/mean": 0.17036020755767822, "rewards/ngrams_iou_reward/std": 0.21171598136425018, "rewards/schema_keywords_iou_reward/mean": 0.6586522459983826, "rewards/schema_keywords_iou_reward/std": 0.2334972470998764, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 190.28125, "completions/mean_terminated_length": 166.51063537597656, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.4953350296861747, "frac_reward_zero_std": 0.0, "grad_norm": 0.938805103302002, "kl": 0.0516357421875, "learning_rate": 9.878375498678867e-07, "loss": 0.0151, "num_tokens": 37482799.0, "reward": 9.123594284057617, "reward_std": 1.9953174591064453, "rewards/accuracy_reward/mean": 1.203125, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33158352971076965, "rewards/judge_reward/mean": 1.5749999284744263, "rewards/judge_reward/std": 1.6140092611312866, "rewards/ngrams_iou_reward/mean": 0.14102543890476227, "rewards/ngrams_iou_reward/std": 0.1834087371826172, "rewards/schema_keywords_iou_reward/mean": 0.6502764821052551, "rewards/schema_keywords_iou_reward/std": 0.21707434952259064, "rewards/syntax_reward/mean": 0.6979166865348816, "rewards/syntax_reward/std": 0.4603615999221802, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.5, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 184.1354217529297, "completions/mean_terminated_length": 160.18055725097656, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.49872773536895676, "frac_reward_zero_std": 0.0, "grad_norm": 0.8267194032669067, "kl": 0.0408935546875, "learning_rate": 9.87401147984998e-07, "loss": -0.0032, "num_tokens": 37731477.0, "reward": 9.478845596313477, "reward_std": 1.678941249847412, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.8645833134651184, "rewards/format_reward/std": 0.3430626094341278, "rewards/judge_reward/mean": 1.3625000715255737, "rewards/judge_reward/std": 1.556694507598877, "rewards/ngrams_iou_reward/mean": 0.22250527143478394, "rewards/ngrams_iou_reward/std": 0.22988253831863403, "rewards/schema_keywords_iou_reward/mean": 0.6980063319206238, "rewards/schema_keywords_iou_reward/std": 0.22529344260692596, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 181.56771850585938, "completions/mean_terminated_length": 149.35073852539062, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.5021204410517388, "frac_reward_zero_std": 0.0, "grad_norm": 1.1349817514419556, "kl": 0.0450439453125, "learning_rate": 9.869571540427689e-07, "loss": -0.0057, "num_tokens": 37990216.0, "reward": 9.323561668395996, "reward_std": 1.840775489807129, "rewards/accuracy_reward/mean": 1.328125, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3485431373119354, "rewards/judge_reward/mean": 1.509374976158142, "rewards/judge_reward/std": 1.6288996934890747, "rewards/ngrams_iou_reward/mean": 0.1198708713054657, "rewards/ngrams_iou_reward/std": 0.12652790546417236, "rewards/schema_keywords_iou_reward/mean": 0.6432731747627258, "rewards/schema_keywords_iou_reward/std": 0.19302207231521606, "rewards/syntax_reward/mean": 0.6979166865348816, "rewards/syntax_reward/std": 0.4603615701198578, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 194.28646850585938, "completions/mean_terminated_length": 167.57461547851562, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.5055131467345207, "frac_reward_zero_std": 0.0, "grad_norm": 0.8266201615333557, "kl": 0.04180908203125, "learning_rate": 9.865055749571213e-07, "loss": 0.0198, "num_tokens": 38233025.0, "reward": 8.96904468536377, "reward_std": 1.8814020156860352, "rewards/accuracy_reward/mean": 1.140625, "rewards/accuracy_reward/std": 1.4601210355758667, "rewards/format_reward/mean": 0.8854166865348816, "rewards/format_reward/std": 0.3193511664867401, "rewards/judge_reward/mean": 1.5812500715255737, "rewards/judge_reward/std": 1.580795168876648, "rewards/ngrams_iou_reward/mean": 0.11578521877527237, "rewards/ngrams_iou_reward/std": 0.09230979532003403, "rewards/schema_keywords_iou_reward/mean": 0.6543000340461731, "rewards/schema_keywords_iou_reward/std": 0.1684461385011673, "rewards/syntax_reward/mean": 0.7291666865348816, "rewards/syntax_reward/std": 0.44555196166038513, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 183.47396850585938, "completions/mean_terminated_length": 141.8606414794922, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.5089058524173028, "frac_reward_zero_std": 0.03125, "grad_norm": 0.94831782579422, "kl": 0.04547119140625, "learning_rate": 9.860464177621284e-07, "loss": 0.0055, "num_tokens": 38463018.0, "reward": 9.830144882202148, "reward_std": 1.772131323814392, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3485431373119354, "rewards/judge_reward/mean": 1.386458396911621, "rewards/judge_reward/std": 1.5975189208984375, "rewards/ngrams_iou_reward/mean": 0.2345970869064331, "rewards/ngrams_iou_reward/std": 0.29753199219703674, "rewards/schema_keywords_iou_reward/mean": 0.713254988193512, "rewards/schema_keywords_iou_reward/std": 0.21537713706493378, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 208.59896850585938, "completions/mean_terminated_length": 159.18084716796875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.5122985581000848, "frac_reward_zero_std": 0.0, "grad_norm": 0.8836914300918579, "kl": 0.0367431640625, "learning_rate": 9.855796896099044e-07, "loss": 0.0117, "num_tokens": 38743507.0, "reward": 8.495279312133789, "reward_std": 1.5838353633880615, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.8854166865348816, "rewards/format_reward/std": 0.3193511664867401, "rewards/judge_reward/mean": 1.4895833730697632, "rewards/judge_reward/std": 1.510138750076294, "rewards/ngrams_iou_reward/mean": 0.14869146049022675, "rewards/ngrams_iou_reward/std": 0.16201940178871155, "rewards/schema_keywords_iou_reward/mean": 0.6434618830680847, "rewards/schema_keywords_iou_reward/std": 0.17753711342811584, "rewards/syntax_reward/mean": 0.5572916865348816, "rewards/syntax_reward/std": 0.49800539016723633, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 205.2447967529297, "completions/mean_terminated_length": 159.5148468017578, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.5156912637828668, "frac_reward_zero_std": 0.0, "grad_norm": 0.9752041697502136, "kl": 0.03778076171875, "learning_rate": 9.85105397770493e-07, "loss": 0.0296, "num_tokens": 38981964.0, "reward": 8.743927001953125, "reward_std": 1.7833144664764404, "rewards/accuracy_reward/mean": 1.265625, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.2520833015441895, "rewards/judge_reward/std": 1.4630513191223145, "rewards/ngrams_iou_reward/mean": 0.13209421932697296, "rewards/ngrams_iou_reward/std": 0.17448994517326355, "rewards/schema_keywords_iou_reward/mean": 0.6493321061134338, "rewards/schema_keywords_iou_reward/std": 0.19293005764484406, "rewards/syntax_reward/mean": 0.7395833134651184, "rewards/syntax_reward/std": 0.44000932574272156, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 214.0, "completions/mean_terminated_length": 147.0270233154297, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.5190839694656488, "frac_reward_zero_std": 0.0, "grad_norm": 1.1291546821594238, "kl": 0.04046630859375, "learning_rate": 9.846235496317553e-07, "loss": -0.0154, "num_tokens": 39225156.0, "reward": 9.55750560760498, "reward_std": 1.4625365734100342, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.8802083134651184, "rewards/format_reward/std": 0.32556670904159546, "rewards/judge_reward/mean": 1.1572917699813843, "rewards/judge_reward/std": 1.415267825126648, "rewards/ngrams_iou_reward/mean": 0.14090867340564728, "rewards/ngrams_iou_reward/std": 0.16784802079200745, "rewards/schema_keywords_iou_reward/mean": 0.6593047976493835, "rewards/schema_keywords_iou_reward/std": 0.1883929818868637, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 222.5729217529297, "completions/mean_terminated_length": 165.6056365966797, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.5224766751484309, "frac_reward_zero_std": 0.0, "grad_norm": 0.9084067344665527, "kl": 0.0355224609375, "learning_rate": 9.841341526992535e-07, "loss": 0.009, "num_tokens": 39491270.0, "reward": 9.065409660339355, "reward_std": 1.517130732536316, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.8541666865348816, "rewards/format_reward/std": 0.3538617491722107, "rewards/judge_reward/mean": 1.4270833730697632, "rewards/judge_reward/std": 1.5676358938217163, "rewards/ngrams_iou_reward/mean": 0.1427093893289566, "rewards/ngrams_iou_reward/std": 0.1494072675704956, "rewards/schema_keywords_iou_reward/mean": 0.6466581225395203, "rewards/schema_keywords_iou_reward/std": 0.1934434175491333, "rewards/syntax_reward/mean": 0.6302083134651184, "rewards/syntax_reward/std": 0.484010249376297, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 222.13021850585938, "completions/mean_terminated_length": 155.95384216308594, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.5258693808312129, "frac_reward_zero_std": 0.0, "grad_norm": 0.8717555403709412, "kl": 0.03973388671875, "learning_rate": 9.836372145961345e-07, "loss": 0.017, "num_tokens": 39747081.0, "reward": 10.051290512084961, "reward_std": 1.5589256286621094, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.240625023841858, "rewards/judge_reward/std": 1.6215226650238037, "rewards/ngrams_iou_reward/mean": 0.20666801929473877, "rewards/ngrams_iou_reward/std": 0.23385530710220337, "rewards/schema_keywords_iou_reward/mean": 0.7175383567810059, "rewards/schema_keywords_iou_reward/std": 0.1884652078151703, "rewards/syntax_reward/mean": 0.7760416865348816, "rewards/syntax_reward/std": 0.41798436641693115, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 234.34896850585938, "completions/mean_terminated_length": 165.63043212890625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.5292620865139949, "frac_reward_zero_std": 0.0, "grad_norm": 0.9385306239128113, "kl": 0.034454345703125, "learning_rate": 9.831327430630108e-07, "loss": 0.0258, "num_tokens": 40015678.0, "reward": 8.823782920837402, "reward_std": 1.6569874286651611, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 1.3200279474258423, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237156867981, "rewards/judge_reward/mean": 2.066666603088379, "rewards/judge_reward/std": 1.5322479009628296, "rewards/ngrams_iou_reward/mean": 0.1220088079571724, "rewards/ngrams_iou_reward/std": 0.17158475518226624, "rewards/schema_keywords_iou_reward/mean": 0.6361481547355652, "rewards/schema_keywords_iou_reward/std": 0.17243394255638123, "rewards/syntax_reward/mean": 0.6770833134651184, "rewards/syntax_reward/std": 0.46881362795829773, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 234.66146850585938, "completions/mean_terminated_length": 156.0731658935547, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.5326547921967769, "frac_reward_zero_std": 0.0, "grad_norm": 0.8337653875350952, "kl": 0.0367431640625, "learning_rate": 9.82620745957841e-07, "loss": 0.0193, "num_tokens": 40289993.0, "reward": 8.570481300354004, "reward_std": 1.8324722051620483, "rewards/accuracy_reward/mean": 1.03125, "rewards/accuracy_reward/std": 1.4286017417907715, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.5031250715255737, "rewards/judge_reward/std": 1.4934377670288086, "rewards/ngrams_iou_reward/mean": 0.19644074141979218, "rewards/ngrams_iou_reward/std": 0.23566332459449768, "rewards/schema_keywords_iou_reward/mean": 0.6542488932609558, "rewards/schema_keywords_iou_reward/std": 0.21429261565208435, "rewards/syntax_reward/mean": 0.6979166865348816, "rewards/syntax_reward/std": 0.4603615999221802, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.95834350585938, "completions/mean_terminated_length": 172.3478240966797, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.536047497879559, "frac_reward_zero_std": 0.0, "grad_norm": 0.8882437944412231, "kl": 0.040283203125, "learning_rate": 9.821012312558059e-07, "loss": 0.0008, "num_tokens": 40577451.0, "reward": 8.912957191467285, "reward_std": 1.651838779449463, "rewards/accuracy_reward/mean": 1.203125, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.3718749284744263, "rewards/judge_reward/std": 1.4826891422271729, "rewards/ngrams_iou_reward/mean": 0.19326679408550262, "rewards/ngrams_iou_reward/std": 0.22967229783535004, "rewards/schema_keywords_iou_reward/mean": 0.6738560795783997, "rewards/schema_keywords_iou_reward/std": 0.20378094911575317, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 235.3385467529297, "completions/mean_terminated_length": 163.7441864013672, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.539440203562341, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9191831350326538, "kl": 0.03375244140625, "learning_rate": 9.81574207049185e-07, "loss": 0.0063, "num_tokens": 40829546.0, "reward": 9.601631164550781, "reward_std": 1.691826581954956, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.4354166984558105, "rewards/judge_reward/std": 1.6375168561935425, "rewards/ngrams_iou_reward/mean": 0.15436701476573944, "rewards/ngrams_iou_reward/std": 0.16683408617973328, "rewards/schema_keywords_iou_reward/mean": 0.6753886342048645, "rewards/schema_keywords_iou_reward/std": 0.18845593929290771, "rewards/syntax_reward/mean": 0.71875, "rewards/syntax_reward/std": 0.4507846534252167, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.8072967529297, "completions/mean_terminated_length": 159.2291717529297, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.542832909245123, "frac_reward_zero_std": 0.0, "grad_norm": 0.9325241446495056, "kl": 0.04156494140625, "learning_rate": 9.810396815472314e-07, "loss": 0.0192, "num_tokens": 41106115.0, "reward": 9.528142929077148, "reward_std": 1.5772823095321655, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.1458333730697632, "rewards/judge_reward/std": 1.4199708700180054, "rewards/ngrams_iou_reward/mean": 0.19056852161884308, "rewards/ngrams_iou_reward/std": 0.23943592607975006, "rewards/schema_keywords_iou_reward/mean": 0.6917407512664795, "rewards/schema_keywords_iou_reward/std": 0.18046216666698456, "rewards/syntax_reward/mean": 0.7604166865348816, "rewards/syntax_reward/std": 0.427945077419281, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 230.3697967529297, "completions/mean_terminated_length": 157.5800018310547, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.546225614927905, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8347919583320618, "kl": 0.0396728515625, "learning_rate": 9.804976630760418e-07, "loss": 0.0158, "num_tokens": 41358630.0, "reward": 9.09316635131836, "reward_std": 1.491328239440918, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.7062500715255737, "rewards/judge_reward/std": 1.6655304431915283, "rewards/ngrams_iou_reward/mean": 0.1459234207868576, "rewards/ngrams_iou_reward/std": 0.18034973740577698, "rewards/schema_keywords_iou_reward/mean": 0.6649511456489563, "rewards/schema_keywords_iou_reward/std": 0.21056176722049713, "rewards/syntax_reward/mean": 0.6822916865348816, "rewards/syntax_reward/std": 0.46680256724357605, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 230.16146850585938, "completions/mean_terminated_length": 162.39622497558594, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.549618320610687, "frac_reward_zero_std": 0.03125, "grad_norm": 1.1155422925949097, "kl": 0.04119873046875, "learning_rate": 9.799481600784286e-07, "loss": 0.0141, "num_tokens": 41619301.0, "reward": 9.505349159240723, "reward_std": 1.528769850730896, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.5770832300186157, "rewards/judge_reward/std": 1.6772876977920532, "rewards/ngrams_iou_reward/mean": 0.17448435723781586, "rewards/ngrams_iou_reward/std": 0.17393061518669128, "rewards/schema_keywords_iou_reward/mean": 0.6923225522041321, "rewards/schema_keywords_iou_reward/std": 0.1880953162908554, "rewards/syntax_reward/mean": 0.7135416865348816, "rewards/syntax_reward/std": 0.45328807830810547, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.3697967529297, "completions/mean_terminated_length": 155.109375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.553011026293469, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9816601872444153, "kl": 0.04364013671875, "learning_rate": 9.793911811137874e-07, "loss": 0.0172, "num_tokens": 41890350.0, "reward": 9.620302200317383, "reward_std": 1.4054040908813477, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.4838541746139526, "rewards/judge_reward/std": 1.6249686479568481, "rewards/ngrams_iou_reward/mean": 0.15102000534534454, "rewards/ngrams_iou_reward/std": 0.19097524881362915, "rewards/schema_keywords_iou_reward/mean": 0.6838648915290833, "rewards/schema_keywords_iou_reward/std": 0.19233457744121552, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 225.328125, "completions/mean_terminated_length": 162.52381896972656, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.556403731976251, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8162392377853394, "kl": 0.0430908203125, "learning_rate": 9.788267348579648e-07, "loss": -0.0022, "num_tokens": 42173715.0, "reward": 9.041387557983398, "reward_std": 1.8173636198043823, "rewards/accuracy_reward/mean": 1.125, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.5614582300186157, "rewards/judge_reward/std": 1.4871082305908203, "rewards/ngrams_iou_reward/mean": 0.20377860963344574, "rewards/ngrams_iou_reward/std": 0.24386966228485107, "rewards/schema_keywords_iou_reward/mean": 0.6886507868766785, "rewards/schema_keywords_iou_reward/std": 0.18132564425468445, "rewards/syntax_reward/mean": 0.734375, "rewards/syntax_reward/std": 0.44282010197639465, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 222.9479217529297, "completions/mean_terminated_length": 150.23333740234375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.5597964376590331, "frac_reward_zero_std": 0.0, "grad_norm": 0.8533975481987, "kl": 0.04547119140625, "learning_rate": 9.782548301031217e-07, "loss": 0.0108, "num_tokens": 42449609.0, "reward": 9.365046501159668, "reward_std": 1.946040153503418, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.484375, "rewards/judge_reward/std": 1.606845736503601, "rewards/ngrams_iou_reward/mean": 0.1894654631614685, "rewards/ngrams_iou_reward/std": 0.24014224112033844, "rewards/schema_keywords_iou_reward/mean": 0.6651632189750671, "rewards/schema_keywords_iou_reward/std": 0.21213604509830475, "rewards/syntax_reward/mean": 0.6979166865348816, "rewards/syntax_reward/std": 0.4603615999221802, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 225.0625, "completions/mean_terminated_length": 158.62294006347656, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.5631891433418151, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9802799820899963, "kl": 0.04541015625, "learning_rate": 9.776754757575973e-07, "loss": 0.0102, "num_tokens": 42706559.0, "reward": 9.184962272644043, "reward_std": 1.5200157165527344, "rewards/accuracy_reward/mean": 1.15625, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.5833333730697632, "rewards/judge_reward/std": 1.600828766822815, "rewards/ngrams_iou_reward/mean": 0.20209760963916779, "rewards/ngrams_iou_reward/std": 0.24158604443073273, "rewards/schema_keywords_iou_reward/mean": 0.6911972165107727, "rewards/schema_keywords_iou_reward/std": 0.18583935499191284, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 223.63021850585938, "completions/mean_terminated_length": 154.11474609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.5665818490245971, "frac_reward_zero_std": 0.0, "grad_norm": 0.8780226707458496, "kl": 0.04290771484375, "learning_rate": 9.770886808457708e-07, "loss": 0.0147, "num_tokens": 42968172.0, "reward": 9.022832870483398, "reward_std": 1.7191963195800781, "rewards/accuracy_reward/mean": 1.15625, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.5385416746139526, "rewards/judge_reward/std": 1.5769855976104736, "rewards/ngrams_iou_reward/mean": 0.15107759833335876, "rewards/ngrams_iou_reward/std": 0.15567606687545776, "rewards/schema_keywords_iou_reward/mean": 0.6488388180732727, "rewards/schema_keywords_iou_reward/std": 0.19715291261672974, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 217.09896850585938, "completions/mean_terminated_length": 153.6849365234375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.5699745547073791, "frac_reward_zero_std": 0.0, "grad_norm": 0.950370192527771, "kl": 0.04248046875, "learning_rate": 9.764944545079196e-07, "loss": 0.0184, "num_tokens": 43220053.0, "reward": 9.649789810180664, "reward_std": 1.969362497329712, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.3729166984558105, "rewards/judge_reward/std": 1.5934734344482422, "rewards/ngrams_iou_reward/mean": 0.1467982530593872, "rewards/ngrams_iou_reward/std": 0.14982759952545166, "rewards/schema_keywords_iou_reward/mean": 0.6894490718841553, "rewards/schema_keywords_iou_reward/std": 0.17344899475574493, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 220.1979217529297, "completions/mean_terminated_length": 141.43333435058594, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.5733672603901612, "frac_reward_zero_std": 0.0, "grad_norm": 0.9035467505455017, "kl": 0.0400390625, "learning_rate": 9.758928060000777e-07, "loss": -0.0269, "num_tokens": 43459167.0, "reward": 9.161108016967773, "reward_std": 1.8219879865646362, "rewards/accuracy_reward/mean": 1.203125, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.511458396911621, "rewards/judge_reward/std": 1.5936633348464966, "rewards/ngrams_iou_reward/mean": 0.23992998898029327, "rewards/ngrams_iou_reward/std": 0.2739070951938629, "rewards/schema_keywords_iou_reward/mean": 0.6638856530189514, "rewards/schema_keywords_iou_reward/std": 0.22112810611724854, "rewards/syntax_reward/mean": 0.703125, "rewards/syntax_reward/std": 0.4580754339694977, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 221.53125, "completions/mean_terminated_length": 137.82144165039062, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.5767599660729432, "frac_reward_zero_std": 0.0, "grad_norm": 1.0086594820022583, "kl": 0.0445556640625, "learning_rate": 9.752837446938914e-07, "loss": 0.0161, "num_tokens": 43697553.0, "reward": 8.809263229370117, "reward_std": 1.593707799911499, "rewards/accuracy_reward/mean": 1.046875, "rewards/accuracy_reward/std": 1.433660626411438, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.6135417222976685, "rewards/judge_reward/std": 1.5532516241073608, "rewards/ngrams_iou_reward/mean": 0.14013299345970154, "rewards/ngrams_iou_reward/std": 0.18053628504276276, "rewards/schema_keywords_iou_reward/mean": 0.6347549557685852, "rewards/schema_keywords_iou_reward/std": 0.21621228754520416, "rewards/syntax_reward/mean": 0.7552083134651184, "rewards/syntax_reward/std": 0.4310877025127411, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 218.9947967529297, "completions/mean_terminated_length": 151.51470947265625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5801526717557252, "frac_reward_zero_std": 0.0, "grad_norm": 0.7848917245864868, "kl": 0.043212890625, "learning_rate": 9.746672800764734e-07, "loss": 0.005, "num_tokens": 43944344.0, "reward": 9.717406272888184, "reward_std": 1.9883232116699219, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.3614583015441895, "rewards/judge_reward/std": 1.6019536256790161, "rewards/ngrams_iou_reward/mean": 0.18901585042476654, "rewards/ngrams_iou_reward/std": 0.23155009746551514, "rewards/schema_keywords_iou_reward/mean": 0.7013063430786133, "rewards/schema_keywords_iou_reward/std": 0.1887906789779663, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 220.1979217529297, "completions/mean_terminated_length": 154.91175842285156, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.5835453774385072, "frac_reward_zero_std": 0.0, "grad_norm": 0.9988920092582703, "kl": 0.0460205078125, "learning_rate": 9.740434217502547e-07, "loss": 0.0063, "num_tokens": 44203372.0, "reward": 10.295194625854492, "reward_std": 1.4010815620422363, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.497916579246521, "rewards/judge_reward/std": 1.6958204507827759, "rewards/ngrams_iou_reward/mean": 0.1970231533050537, "rewards/ngrams_iou_reward/std": 0.22031834721565247, "rewards/schema_keywords_iou_reward/mean": 0.7169206738471985, "rewards/schema_keywords_iou_reward/std": 0.1827225536108017, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.5260467529297, "completions/mean_terminated_length": 156.93441772460938, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.5869380831212893, "frac_reward_zero_std": 0.0, "grad_norm": 1.0110023021697998, "kl": 0.0428466796875, "learning_rate": 9.734121794328356e-07, "loss": 0.0048, "num_tokens": 44466417.0, "reward": 9.294122695922852, "reward_std": 2.011638879776001, "rewards/accuracy_reward/mean": 1.203125, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.6177083253860474, "rewards/judge_reward/std": 1.5650298595428467, "rewards/ngrams_iou_reward/mean": 0.15065836906433105, "rewards/ngrams_iou_reward/std": 0.19188480079174042, "rewards/schema_keywords_iou_reward/mean": 0.6684630513191223, "rewards/schema_keywords_iou_reward/std": 0.1990535408258438, "rewards/syntax_reward/mean": 0.6822916865348816, "rewards/syntax_reward/std": 0.46680256724357605, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 222.9947967529297, "completions/mean_terminated_length": 155.41270446777344, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.5903307888040712, "frac_reward_zero_std": 0.0, "grad_norm": 0.9665901064872742, "kl": 0.0478515625, "learning_rate": 9.727735629568335e-07, "loss": 0.0233, "num_tokens": 44724482.0, "reward": 9.8590726852417, "reward_std": 1.611588954925537, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.3208333253860474, "rewards/judge_reward/std": 1.528775691986084, "rewards/ngrams_iou_reward/mean": 0.17747490108013153, "rewards/ngrams_iou_reward/std": 0.18753521144390106, "rewards/schema_keywords_iou_reward/mean": 0.6909715533256531, "rewards/schema_keywords_iou_reward/std": 0.17229442298412323, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 216.1875, "completions/mean_terminated_length": 154.0800018310547, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.5937234944868532, "frac_reward_zero_std": 0.0, "grad_norm": 0.8960906267166138, "kl": 0.04486083984375, "learning_rate": 9.721275822697305e-07, "loss": 0.0046, "num_tokens": 44965430.0, "reward": 8.72877311706543, "reward_std": 1.5515341758728027, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.4541667699813843, "rewards/judge_reward/std": 1.4592485427856445, "rewards/ngrams_iou_reward/mean": 0.18191082775592804, "rewards/ngrams_iou_reward/std": 0.1889706552028656, "rewards/schema_keywords_iou_reward/mean": 0.6801955103874207, "rewards/schema_keywords_iou_reward/std": 0.19775135815143585, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 215.59375, "completions/mean_terminated_length": 155.2467498779297, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.5971162001696353, "frac_reward_zero_std": 0.0, "grad_norm": 1.010963797569275, "kl": 0.0435791015625, "learning_rate": 9.714742474337186e-07, "loss": -0.0214, "num_tokens": 45200450.0, "reward": 9.89825439453125, "reward_std": 1.5529747009277344, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.5229167938232422, "rewards/judge_reward/std": 1.6862534284591675, "rewards/ngrams_iou_reward/mean": 0.13596999645233154, "rewards/ngrams_iou_reward/std": 0.15000437200069427, "rewards/schema_keywords_iou_reward/mean": 0.6852008700370789, "rewards/schema_keywords_iou_reward/std": 0.16412866115570068, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 219.453125, "completions/mean_terminated_length": 154.30435180664062, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.6005089058524173, "frac_reward_zero_std": 0.0, "grad_norm": 0.8376379609107971, "kl": 0.0447998046875, "learning_rate": 9.708135686255414e-07, "loss": 0.0062, "num_tokens": 45445913.0, "reward": 10.154644966125488, "reward_std": 1.4197019338607788, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.3895832300186157, "rewards/judge_reward/std": 1.7325632572174072, "rewards/ngrams_iou_reward/mean": 0.1453104466199875, "rewards/ngrams_iou_reward/std": 0.12420797348022461, "rewards/schema_keywords_iou_reward/mean": 0.6780845522880554, "rewards/schema_keywords_iou_reward/std": 0.197658970952034, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 223.984375, "completions/mean_terminated_length": 159.953125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.6039016115351993, "frac_reward_zero_std": 0.0, "grad_norm": 0.9583072066307068, "kl": 0.0496826171875, "learning_rate": 9.701455561363377e-07, "loss": 0.0045, "num_tokens": 45715550.0, "reward": 8.963400840759277, "reward_std": 1.5471811294555664, "rewards/accuracy_reward/mean": 1.265625, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.3760417699813843, "rewards/judge_reward/std": 1.5410776138305664, "rewards/ngrams_iou_reward/mean": 0.11744773387908936, "rewards/ngrams_iou_reward/std": 0.12414219230413437, "rewards/schema_keywords_iou_reward/mean": 0.6772028803825378, "rewards/schema_keywords_iou_reward/std": 0.16277843713760376, "rewards/syntax_reward/mean": 0.6822916865348816, "rewards/syntax_reward/std": 0.46680256724357605, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 211.640625, "completions/mean_terminated_length": 152.13414001464844, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.6072943172179813, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7865731716156006, "kl": 0.04571533203125, "learning_rate": 9.6947022037148e-07, "loss": -0.0098, "num_tokens": 45961013.0, "reward": 10.098594665527344, "reward_std": 1.280775785446167, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.660416603088379, "rewards/judge_reward/std": 1.6850732564926147, "rewards/ngrams_iou_reward/mean": 0.21921688318252563, "rewards/ngrams_iou_reward/std": 0.2473539113998413, "rewards/schema_keywords_iou_reward/mean": 0.7356272339820862, "rewards/schema_keywords_iou_reward/std": 0.16179797053337097, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 216.1666717529297, "completions/mean_terminated_length": 149.7777862548828, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.6106870229007634, "frac_reward_zero_std": 0.0, "grad_norm": 0.9869630336761475, "kl": 0.04376220703125, "learning_rate": 9.687875718504125e-07, "loss": 0.0187, "num_tokens": 46225489.0, "reward": 10.02597427368164, "reward_std": 1.6495871543884277, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.6140624284744263, "rewards/judge_reward/std": 1.659956455230713, "rewards/ngrams_iou_reward/mean": 0.19893091917037964, "rewards/ngrams_iou_reward/std": 0.2002708911895752, "rewards/schema_keywords_iou_reward/mean": 0.7239181399345398, "rewards/schema_keywords_iou_reward/std": 0.16980794072151184, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 231.21875, "completions/mean_terminated_length": 166.22642517089844, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.6140797285835454, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8371279239654541, "kl": 0.0458984375, "learning_rate": 9.680976212064874e-07, "loss": 0.0113, "num_tokens": 46470487.0, "reward": 9.552457809448242, "reward_std": 1.71470046043396, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.5885416269302368, "rewards/judge_reward/std": 1.6769685745239258, "rewards/ngrams_iou_reward/mean": 0.1502368301153183, "rewards/ngrams_iou_reward/std": 0.17208139598369598, "rewards/schema_keywords_iou_reward/mean": 0.6678457856178284, "rewards/schema_keywords_iou_reward/std": 0.18975886702537537, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 213.61459350585938, "completions/mean_terminated_length": 147.4933319091797, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.6174724342663274, "frac_reward_zero_std": 0.0, "grad_norm": 1.0934499502182007, "kl": 0.04449462890625, "learning_rate": 9.67400379186799e-07, "loss": 0.0338, "num_tokens": 46718027.0, "reward": 10.18504524230957, "reward_std": 1.4079917669296265, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.3854166269302368, "rewards/judge_reward/std": 1.6572847366333008, "rewards/ngrams_iou_reward/mean": 0.157456636428833, "rewards/ngrams_iou_reward/std": 0.14633502066135406, "rewards/schema_keywords_iou_reward/mean": 0.6577960848808289, "rewards/schema_keywords_iou_reward/std": 0.19027630984783173, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709408044815063, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 214.4479217529297, "completions/mean_terminated_length": 148.18919372558594, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.6208651399491094, "frac_reward_zero_std": 0.0, "grad_norm": 1.1053534746170044, "kl": 0.04766845703125, "learning_rate": 9.666958566520174e-07, "loss": 0.0317, "num_tokens": 46953619.0, "reward": 9.414836883544922, "reward_std": 1.4285997152328491, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.7583332061767578, "rewards/judge_reward/std": 1.6214314699172974, "rewards/ngrams_iou_reward/mean": 0.17895393073558807, "rewards/ngrams_iou_reward/std": 0.19936884939670563, "rewards/schema_keywords_iou_reward/mean": 0.6931743621826172, "rewards/schema_keywords_iou_reward/std": 0.1822802871465683, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 220.78125, "completions/mean_terminated_length": 158.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.6242578456318915, "frac_reward_zero_std": 0.0, "grad_norm": 0.864193856716156, "kl": 0.0517578125, "learning_rate": 9.659840645762174e-07, "loss": 0.0023, "num_tokens": 47193133.0, "reward": 9.560611724853516, "reward_std": 1.432328224182129, "rewards/accuracy_reward/mean": 1.203125, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.7395833730697632, "rewards/judge_reward/std": 1.6182518005371094, "rewards/ngrams_iou_reward/mean": 0.12145182490348816, "rewards/ngrams_iou_reward/std": 0.09099024534225464, "rewards/schema_keywords_iou_reward/mean": 0.6735331416130066, "rewards/schema_keywords_iou_reward/std": 0.1712750643491745, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 226.02084350585938, "completions/mean_terminated_length": 151.34544372558594, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.6276505513146734, "frac_reward_zero_std": 0.0, "grad_norm": 0.9062066674232483, "kl": 0.0472412109375, "learning_rate": 9.652650140467092e-07, "loss": -0.0048, "num_tokens": 47420819.0, "reward": 9.670104026794434, "reward_std": 1.3071321249008179, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.1354166269302368, "rewards/judge_reward/std": 1.4759488105773926, "rewards/ngrams_iou_reward/mean": 0.19689927995204926, "rewards/ngrams_iou_reward/std": 0.2377931922674179, "rewards/schema_keywords_iou_reward/mean": 0.6971621513366699, "rewards/schema_keywords_iou_reward/std": 0.20181477069854736, "rewards/syntax_reward/mean": 0.734375, "rewards/syntax_reward/std": 0.44282010197639465, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.7135467529297, "completions/mean_terminated_length": 163.10667419433594, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.6310432569974554, "frac_reward_zero_std": 0.0, "grad_norm": 0.886989414691925, "kl": 0.045654296875, "learning_rate": 9.645387162638652e-07, "loss": -0.0049, "num_tokens": 47689486.0, "reward": 9.53148078918457, "reward_std": 1.8219903707504272, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.587499976158142, "rewards/judge_reward/std": 1.6363186836242676, "rewards/ngrams_iou_reward/mean": 0.1298934817314148, "rewards/ngrams_iou_reward/std": 0.0813756212592125, "rewards/schema_keywords_iou_reward/mean": 0.6640859842300415, "rewards/schema_keywords_iou_reward/std": 0.18111194670200348, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 226.17709350585938, "completions/mean_terminated_length": 165.11111450195312, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.6344359626802375, "frac_reward_zero_std": 0.0, "grad_norm": 1.0040825605392456, "kl": 0.04632568359375, "learning_rate": 9.638051825409452e-07, "loss": 0.0121, "num_tokens": 47932274.0, "reward": 9.425912857055664, "reward_std": 1.5945096015930176, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.519791603088379, "rewards/judge_reward/std": 1.6808198690414429, "rewards/ngrams_iou_reward/mean": 0.13448356091976166, "rewards/ngrams_iou_reward/std": 0.16491077840328217, "rewards/schema_keywords_iou_reward/mean": 0.6789290904998779, "rewards/schema_keywords_iou_reward/std": 0.20082181692123413, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.2135467529297, "completions/mean_terminated_length": 157.71212768554688, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.6378286683630195, "frac_reward_zero_std": 0.03125, "grad_norm": 1.0632766485214233, "kl": 0.05126953125, "learning_rate": 9.630644243039206e-07, "loss": 0.0213, "num_tokens": 48201697.0, "reward": 9.756589889526367, "reward_std": 1.324461817741394, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.321874976158142, "rewards/judge_reward/std": 1.6087262630462646, "rewards/ngrams_iou_reward/mean": 0.2178829163312912, "rewards/ngrams_iou_reward/std": 0.24895311892032623, "rewards/schema_keywords_iou_reward/mean": 0.6814143061637878, "rewards/schema_keywords_iou_reward/std": 0.19882218539714813, "rewards/syntax_reward/mean": 0.7135416865348816, "rewards/syntax_reward/std": 0.45328810811042786, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 222.5416717529297, "completions/mean_terminated_length": 160.11940002441406, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.6412213740458015, "frac_reward_zero_std": 0.0, "grad_norm": 1.0276120901107788, "kl": 0.0460205078125, "learning_rate": 9.623164530912961e-07, "loss": 0.0113, "num_tokens": 48441423.0, "reward": 9.222418785095215, "reward_std": 1.6961240768432617, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.2270833253860474, "rewards/judge_reward/std": 1.4670181274414062, "rewards/ngrams_iou_reward/mean": 0.19519877433776855, "rewards/ngrams_iou_reward/std": 0.23077768087387085, "rewards/schema_keywords_iou_reward/mean": 0.6928446888923645, "rewards/schema_keywords_iou_reward/std": 0.20113569498062134, "rewards/syntax_reward/mean": 0.7239583134651184, "rewards/syntax_reward/std": 0.4482063353061676, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 227.171875, "completions/mean_terminated_length": 160.5689697265625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.6446140797285835, "frac_reward_zero_std": 0.0, "grad_norm": 0.892192006111145, "kl": 0.0445556640625, "learning_rate": 9.615612805539303e-07, "loss": 0.0197, "num_tokens": 48705096.0, "reward": 8.789876937866211, "reward_std": 2.1712615489959717, "rewards/accuracy_reward/mean": 1.234375, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.8802083134651184, "rewards/format_reward/std": 0.32556670904159546, "rewards/judge_reward/mean": 1.40625, "rewards/judge_reward/std": 1.6006419658660889, "rewards/ngrams_iou_reward/mean": 0.12840989232063293, "rewards/ngrams_iou_reward/std": 0.14359694719314575, "rewards/schema_keywords_iou_reward/mean": 0.5885499715805054, "rewards/schema_keywords_iou_reward/std": 0.21411021053791046, "rewards/syntax_reward/mean": 0.6770833134651184, "rewards/syntax_reward/std": 0.46881362795829773, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 229.9322967529297, "completions/mean_terminated_length": 155.89999389648438, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.6480067854113656, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8792359828948975, "kl": 0.04681396484375, "learning_rate": 9.607989184548542e-07, "loss": -0.0083, "num_tokens": 48963749.0, "reward": 9.648670196533203, "reward_std": 1.5161432027816772, "rewards/accuracy_reward/mean": 1.515625, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.2572916746139526, "rewards/judge_reward/std": 1.5185585021972656, "rewards/ngrams_iou_reward/mean": 0.18424177169799805, "rewards/ngrams_iou_reward/std": 0.20403426885604858, "rewards/schema_keywords_iou_reward/mean": 0.6790108680725098, "rewards/schema_keywords_iou_reward/std": 0.1975216120481491, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 231.72396850585938, "completions/mean_terminated_length": 171.25454711914062, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.6513994910941476, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8624633550643921, "kl": 0.0447998046875, "learning_rate": 9.600293786690872e-07, "loss": 0.0141, "num_tokens": 49225590.0, "reward": 9.175068855285645, "reward_std": 1.8677079677581787, "rewards/accuracy_reward/mean": 1.140625, "rewards/accuracy_reward/std": 1.4601210355758667, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3129251003265381, "rewards/judge_reward/mean": 1.689583420753479, "rewards/judge_reward/std": 1.5880099534988403, "rewards/ngrams_iou_reward/mean": 0.11515603214502335, "rewards/ngrams_iou_reward/std": 0.129145547747612, "rewards/schema_keywords_iou_reward/mean": 0.6390790343284607, "rewards/schema_keywords_iou_reward/std": 0.18941044807434082, "rewards/syntax_reward/mean": 0.7291666865348816, "rewards/syntax_reward/std": 0.44555196166038513, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 227.1041717529297, "completions/mean_terminated_length": 135.3913116455078, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.6547921967769296, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9349203705787659, "kl": 0.04168701171875, "learning_rate": 9.592526731834536e-07, "loss": -0.0054, "num_tokens": 49473344.0, "reward": 10.535481452941895, "reward_std": 1.2603647708892822, "rewards/accuracy_reward/mean": 1.765625, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.2802083492279053, "rewards/judge_reward/std": 1.6626553535461426, "rewards/ngrams_iou_reward/mean": 0.2145359069108963, "rewards/ngrams_iou_reward/std": 0.28770479559898376, "rewards/schema_keywords_iou_reward/mean": 0.7188615798950195, "rewards/schema_keywords_iou_reward/std": 0.1998816877603531, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 235.80209350585938, "completions/mean_terminated_length": 163.6666717529297, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.6581849024597116, "frac_reward_zero_std": 0.0, "grad_norm": 0.9476885795593262, "kl": 0.048095703125, "learning_rate": 9.584688140963944e-07, "loss": 0.0044, "num_tokens": 49720938.0, "reward": 9.6749267578125, "reward_std": 1.639914870262146, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.53125, "rewards/judge_reward/std": 1.7118810415267944, "rewards/ngrams_iou_reward/mean": 0.13843990862369537, "rewards/ngrams_iou_reward/std": 0.14176568388938904, "rewards/schema_keywords_iou_reward/mean": 0.6875276565551758, "rewards/schema_keywords_iou_reward/std": 0.1992679089307785, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 233.3541717529297, "completions/mean_terminated_length": 161.478271484375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.6615776081424937, "frac_reward_zero_std": 0.0, "grad_norm": 0.9136601686477661, "kl": 0.0443115234375, "learning_rate": 9.576778136177797e-07, "loss": 0.015, "num_tokens": 49961654.0, "reward": 9.44224739074707, "reward_std": 1.843418836593628, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3129251003265381, "rewards/judge_reward/mean": 1.5302082300186157, "rewards/judge_reward/std": 1.6681568622589111, "rewards/ngrams_iou_reward/mean": 0.15310554206371307, "rewards/ngrams_iou_reward/std": 0.1596335768699646, "rewards/schema_keywords_iou_reward/mean": 0.6662247180938721, "rewards/schema_keywords_iou_reward/std": 0.20170746743679047, "rewards/syntax_reward/mean": 0.734375, "rewards/syntax_reward/std": 0.44282010197639465, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 236.71875, "completions/mean_terminated_length": 167.85714721679688, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.6649703138252756, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8252257108688354, "kl": 0.04510498046875, "learning_rate": 9.568796840687184e-07, "loss": 0.0089, "num_tokens": 50198552.0, "reward": 9.413244247436523, "reward_std": 1.471382975578308, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.4791666269302368, "rewards/judge_reward/std": 1.5910691022872925, "rewards/ngrams_iou_reward/mean": 0.1645411103963852, "rewards/ngrams_iou_reward/std": 0.20305702090263367, "rewards/schema_keywords_iou_reward/mean": 0.6705770492553711, "rewards/schema_keywords_iou_reward/std": 0.17766840755939484, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 228.69271850585938, "completions/mean_terminated_length": 157.07546997070312, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.6683630195080577, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7919399738311768, "kl": 0.046630859375, "learning_rate": 9.56074437881366e-07, "loss": -0.0092, "num_tokens": 50476131.0, "reward": 10.06544017791748, "reward_std": 1.5932567119598389, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.3562499284744263, "rewards/judge_reward/std": 1.62034273147583, "rewards/ngrams_iou_reward/mean": 0.14902286231517792, "rewards/ngrams_iou_reward/std": 0.13379442691802979, "rewards/schema_keywords_iou_reward/mean": 0.6935003399848938, "rewards/schema_keywords_iou_reward/std": 0.17885662615299225, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617491722107, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.1354217529297, "completions/mean_terminated_length": 160.48147583007812, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.6717557251908397, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8727719783782959, "kl": 0.04498291015625, "learning_rate": 9.552620875987312e-07, "loss": 0.0119, "num_tokens": 50737745.0, "reward": 9.08240032196045, "reward_std": 1.7396548986434937, "rewards/accuracy_reward/mean": 1.15625, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.29938673973083496, "rewards/judge_reward/mean": 1.5750001668930054, "rewards/judge_reward/std": 1.6676157712936401, "rewards/ngrams_iou_reward/mean": 0.1911880373954773, "rewards/ngrams_iou_reward/std": 0.2128426432609558, "rewards/schema_keywords_iou_reward/mean": 0.6735034584999084, "rewards/schema_keywords_iou_reward/std": 0.18097101151943207, "rewards/syntax_reward/mean": 0.6979166865348816, "rewards/syntax_reward/std": 0.4603615701198578, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 228.50521850585938, "completions/mean_terminated_length": 163.38597106933594, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.6751484308736218, "frac_reward_zero_std": 0.0, "grad_norm": 0.8521469831466675, "kl": 0.04742431640625, "learning_rate": 9.544426458744803e-07, "loss": 0.0347, "num_tokens": 50992584.0, "reward": 9.82272720336914, "reward_std": 1.5493383407592773, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.372916579246521, "rewards/judge_reward/std": 1.569104790687561, "rewards/ngrams_iou_reward/mean": 0.1885981559753418, "rewards/ngrams_iou_reward/std": 0.23351098597049713, "rewards/schema_keywords_iou_reward/mean": 0.7007949352264404, "rewards/schema_keywords_iou_reward/std": 0.1776665598154068, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 229.328125, "completions/mean_terminated_length": 169.20338439941406, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.6785411365564037, "frac_reward_zero_std": 0.0, "grad_norm": 1.0581880807876587, "kl": 0.049560546875, "learning_rate": 9.536161254727406e-07, "loss": 0.026, "num_tokens": 51256053.0, "reward": 10.442399024963379, "reward_std": 1.4502770900726318, "rewards/accuracy_reward/mean": 1.96875, "rewards/accuracy_reward/std": 1.4286017417907715, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 0.9677083492279053, "rewards/judge_reward/std": 1.4773681163787842, "rewards/ngrams_iou_reward/mean": 0.17199723422527313, "rewards/ngrams_iou_reward/std": 0.18681509792804718, "rewards/schema_keywords_iou_reward/mean": 0.6891517639160156, "rewards/schema_keywords_iou_reward/std": 0.1857611984014511, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.8125, "completions/mean_terminated_length": 156.4375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.6819338422391857, "frac_reward_zero_std": 0.03125, "grad_norm": 1.1020606756210327, "kl": 0.05364990234375, "learning_rate": 9.52782539267901e-07, "loss": -0.0213, "num_tokens": 51523179.0, "reward": 8.810518264770508, "reward_std": 1.8238303661346436, "rewards/accuracy_reward/mean": 1.15625, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.451041579246521, "rewards/judge_reward/std": 1.5054869651794434, "rewards/ngrams_iou_reward/mean": 0.15596142411231995, "rewards/ngrams_iou_reward/std": 0.17582853138446808, "rewards/schema_keywords_iou_reward/mean": 0.6483070850372314, "rewards/schema_keywords_iou_reward/std": 0.19846118986606598, "rewards/syntax_reward/mean": 0.7083333134651184, "rewards/syntax_reward/std": 0.4557179808616638, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 215.4791717529297, "completions/mean_terminated_length": 161.1219482421875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.6853265479219678, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9300256967544556, "kl": 0.05035400390625, "learning_rate": 9.519419002444118e-07, "loss": -0.0147, "num_tokens": 51775799.0, "reward": 9.372350692749023, "reward_std": 1.5737347602844238, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.25, "rewards/judge_reward/std": 1.4403315782546997, "rewards/ngrams_iou_reward/mean": 0.19375234842300415, "rewards/ngrams_iou_reward/std": 0.2113201916217804, "rewards/schema_keywords_iou_reward/mean": 0.6838056445121765, "rewards/schema_keywords_iou_reward/std": 0.19975942373275757, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 225.2447967529297, "completions/mean_terminated_length": 165.15383911132812, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.6887192536047498, "frac_reward_zero_std": 0.0, "grad_norm": 0.9754816889762878, "kl": 0.05230712890625, "learning_rate": 9.510942214965818e-07, "loss": 0.0135, "num_tokens": 52015852.0, "reward": 9.584778785705566, "reward_std": 1.6323741674423218, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.162500023841858, "rewards/judge_reward/std": 1.5316041707992554, "rewards/ngrams_iou_reward/mean": 0.17968201637268066, "rewards/ngrams_iou_reward/std": 0.19785451889038086, "rewards/schema_keywords_iou_reward/mean": 0.6686376929283142, "rewards/schema_keywords_iou_reward/std": 0.20725536346435547, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 213.39584350585938, "completions/mean_terminated_length": 151.12820434570312, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.6921119592875318, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9148620367050171, "kl": 0.04803466796875, "learning_rate": 9.502395162283759e-07, "loss": 0.018, "num_tokens": 52262756.0, "reward": 9.69027042388916, "reward_std": 1.3156485557556152, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.2427083253860474, "rewards/judge_reward/std": 1.5179377794265747, "rewards/ngrams_iou_reward/mean": 0.19768284261226654, "rewards/ngrams_iou_reward/std": 0.23353593051433563, "rewards/schema_keywords_iou_reward/mean": 0.6946702003479004, "rewards/schema_keywords_iou_reward/std": 0.1927591860294342, "rewards/syntax_reward/mean": 0.7760416865348816, "rewards/syntax_reward/std": 0.41798436641693115, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 209.4479217529297, "completions/mean_terminated_length": 159.89247131347656, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.6955046649703138, "frac_reward_zero_std": 0.0, "grad_norm": 1.1386655569076538, "kl": 0.0517578125, "learning_rate": 9.493777977532071e-07, "loss": -0.0073, "num_tokens": 52520044.0, "reward": 9.422779083251953, "reward_std": 1.7278878688812256, "rewards/accuracy_reward/mean": 1.171875, "rewards/accuracy_reward/std": 1.4674979448318481, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.6552082300186157, "rewards/judge_reward/std": 1.6072068214416504, "rewards/ngrams_iou_reward/mean": 0.14025825262069702, "rewards/ngrams_iou_reward/std": 0.13327406346797943, "rewards/schema_keywords_iou_reward/mean": 0.680436372756958, "rewards/schema_keywords_iou_reward/std": 0.1887361854314804, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 204.72396850585938, "completions/mean_terminated_length": 160.41748046875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.6988973706530959, "frac_reward_zero_std": 0.0, "grad_norm": 1.1270235776901245, "kl": 0.05078125, "learning_rate": 9.485090794937317e-07, "loss": -0.0151, "num_tokens": 52760087.0, "reward": 10.156307220458984, "reward_std": 1.1762446165084839, "rewards/accuracy_reward/mean": 1.828125, "rewards/accuracy_reward/std": 1.4674979448318481, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 0.9708333015441895, "rewards/judge_reward/std": 1.4298182725906372, "rewards/ngrams_iou_reward/mean": 0.21095730364322662, "rewards/ngrams_iou_reward/std": 0.23984608054161072, "rewards/schema_keywords_iou_reward/mean": 0.7172242999076843, "rewards/schema_keywords_iou_reward/std": 0.18517133593559265, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 215.27084350585938, "completions/mean_terminated_length": 170.06593322753906, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.7022900763358778, "frac_reward_zero_std": 0.0, "grad_norm": 1.004183053970337, "kl": 0.05029296875, "learning_rate": 9.476333749816381e-07, "loss": 0.0041, "num_tokens": 53012937.0, "reward": 9.426023483276367, "reward_std": 1.5773614645004272, "rewards/accuracy_reward/mean": 1.390625, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.3635417222976685, "rewards/judge_reward/std": 1.5122965574264526, "rewards/ngrams_iou_reward/mean": 0.1827247589826584, "rewards/ngrams_iou_reward/std": 0.24543587863445282, "rewards/schema_keywords_iou_reward/mean": 0.6620475649833679, "rewards/schema_keywords_iou_reward/std": 0.2014477550983429, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.75, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 208.42709350585938, "completions/mean_terminated_length": 152.2045440673828, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.7056827820186599, "frac_reward_zero_std": 0.0625, "grad_norm": 1.0786247253417969, "kl": 0.049072265625, "learning_rate": 9.46750697857437e-07, "loss": 0.011, "num_tokens": 53258653.0, "reward": 9.539789199829102, "reward_std": 1.318655252456665, "rewards/accuracy_reward/mean": 1.15625, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.6979166269302368, "rewards/judge_reward/std": 1.6322649717330933, "rewards/ngrams_iou_reward/mean": 0.19923709332942963, "rewards/ngrams_iou_reward/std": 0.20004284381866455, "rewards/schema_keywords_iou_reward/mean": 0.6895095705986023, "rewards/schema_keywords_iou_reward/std": 0.16448991000652313, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 212.81771850585938, "completions/mean_terminated_length": 154.89024353027344, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.7090754877014419, "frac_reward_zero_std": 0.0625, "grad_norm": 1.2131134271621704, "kl": 0.05047607421875, "learning_rate": 9.45861061870249e-07, "loss": -0.0014, "num_tokens": 53519162.0, "reward": 9.77023696899414, "reward_std": 1.5294575691223145, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.3937498331069946, "rewards/judge_reward/std": 1.5956624746322632, "rewards/ngrams_iou_reward/mean": 0.20732462406158447, "rewards/ngrams_iou_reward/std": 0.2623549699783325, "rewards/schema_keywords_iou_reward/mean": 0.6712453365325928, "rewards/schema_keywords_iou_reward/std": 0.19937801361083984, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.9635467529297, "completions/mean_terminated_length": 164.9605255126953, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.712468193384224, "frac_reward_zero_std": 0.0, "grad_norm": 1.041566014289856, "kl": 0.04693603515625, "learning_rate": 9.4496448087759e-07, "loss": 0.0019, "num_tokens": 53754211.0, "reward": 9.601938247680664, "reward_std": 1.785266399383545, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.478124976158142, "rewards/judge_reward/std": 1.5738506317138672, "rewards/ngrams_iou_reward/mean": 0.16039346158504486, "rewards/ngrams_iou_reward/std": 0.17422474920749664, "rewards/schema_keywords_iou_reward/mean": 0.6936270594596863, "rewards/schema_keywords_iou_reward/std": 0.18016734719276428, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.70834350585938, "completions/mean_terminated_length": 182.42425537109375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.7158608990670059, "frac_reward_zero_std": 0.0, "grad_norm": 0.7883560061454773, "kl": 0.04840087890625, "learning_rate": 9.44060968845156e-07, "loss": 0.0196, "num_tokens": 54026579.0, "reward": 8.629451751708984, "reward_std": 1.511556625366211, "rewards/accuracy_reward/mean": 0.953125, "rewards/accuracy_reward/std": 1.4004077911376953, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.6489582061767578, "rewards/judge_reward/std": 1.4883487224578857, "rewards/ngrams_iou_reward/mean": 0.1278751641511917, "rewards/ngrams_iou_reward/std": 0.15796436369419098, "rewards/schema_keywords_iou_reward/mean": 0.6567846536636353, "rewards/schema_keywords_iou_reward/std": 0.15550975501537323, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 224.47396850585938, "completions/mean_terminated_length": 155.11666870117188, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.719253604749788, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7789198160171509, "kl": 0.05157470703125, "learning_rate": 9.431505398466043e-07, "loss": -0.0076, "num_tokens": 54254952.0, "reward": 10.2938232421875, "reward_std": 1.135864019393921, "rewards/accuracy_reward/mean": 1.90625, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 0.9791666865348816, "rewards/judge_reward/std": 1.5354734659194946, "rewards/ngrams_iou_reward/mean": 0.15069043636322021, "rewards/ngrams_iou_reward/std": 0.15003475546836853, "rewards/schema_keywords_iou_reward/mean": 0.6900070309638977, "rewards/schema_keywords_iou_reward/std": 0.1818820983171463, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 228.1197967529297, "completions/mean_terminated_length": 166.78334045410156, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.72264631043257, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9233181476593018, "kl": 0.052734375, "learning_rate": 9.42233208063336e-07, "loss": 0.0164, "num_tokens": 54494129.0, "reward": 9.83409595489502, "reward_std": 1.778517484664917, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.359375, "rewards/judge_reward/std": 1.6903221607208252, "rewards/ngrams_iou_reward/mean": 0.1945229023694992, "rewards/ngrams_iou_reward/std": 0.23083098232746124, "rewards/schema_keywords_iou_reward/mean": 0.6968636512756348, "rewards/schema_keywords_iou_reward/std": 0.18764737248420715, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 233.640625, "completions/mean_terminated_length": 171.8235321044922, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.726039016115352, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9100298881530762, "kl": 0.04803466796875, "learning_rate": 9.413089877842735e-07, "loss": 0.0239, "num_tokens": 54736994.0, "reward": 9.309782028198242, "reward_std": 1.718564748764038, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.196874976158142, "rewards/judge_reward/std": 1.475733995437622, "rewards/ngrams_iou_reward/mean": 0.1258571743965149, "rewards/ngrams_iou_reward/std": 0.10410267114639282, "rewards/schema_keywords_iou_reward/mean": 0.6287158727645874, "rewards/schema_keywords_iou_reward/std": 0.1666823923587799, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 237.5572967529297, "completions/mean_terminated_length": 169.63414001464844, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.729431721798134, "frac_reward_zero_std": 0.0, "grad_norm": 0.8615916967391968, "kl": 0.04376220703125, "learning_rate": 9.40377893405639e-07, "loss": 0.0094, "num_tokens": 54971947.0, "reward": 9.40669059753418, "reward_std": 1.457389235496521, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33158352971076965, "rewards/judge_reward/mean": 1.261458396911621, "rewards/judge_reward/std": 1.5164625644683838, "rewards/ngrams_iou_reward/mean": 0.23413778841495514, "rewards/ngrams_iou_reward/std": 0.2796717882156372, "rewards/schema_keywords_iou_reward/mean": 0.706926167011261, "rewards/schema_keywords_iou_reward/std": 0.21412049233913422, "rewards/syntax_reward/mean": 0.7552083134651184, "rewards/syntax_reward/std": 0.4310877025127411, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 233.7760467529297, "completions/mean_terminated_length": 167.1041717529297, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.732824427480916, "frac_reward_zero_std": 0.0, "grad_norm": 0.8954695463180542, "kl": 0.04693603515625, "learning_rate": 9.394399394307302e-07, "loss": -0.0095, "num_tokens": 55223448.0, "reward": 9.454858779907227, "reward_std": 1.593413233757019, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.234375, "rewards/judge_reward/std": 1.4961960315704346, "rewards/ngrams_iou_reward/mean": 0.19288067519664764, "rewards/ngrams_iou_reward/std": 0.23422971367835999, "rewards/schema_keywords_iou_reward/mean": 0.6942699551582336, "rewards/schema_keywords_iou_reward/std": 0.204264298081398, "rewards/syntax_reward/mean": 0.7604166865348816, "rewards/syntax_reward/std": 0.427945077419281, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 238.67709350585938, "completions/mean_terminated_length": 176.8095245361328, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.7362171331636981, "frac_reward_zero_std": 0.0, "grad_norm": 0.8655994534492493, "kl": 0.05621337890625, "learning_rate": 9.384951404696933e-07, "loss": 0.0305, "num_tokens": 55463440.0, "reward": 9.693413734436035, "reward_std": 1.890845775604248, "rewards/accuracy_reward/mean": 1.328125, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.8802083134651184, "rewards/format_reward/std": 0.32556670904159546, "rewards/judge_reward/mean": 1.649999976158142, "rewards/judge_reward/std": 1.7529032230377197, "rewards/ngrams_iou_reward/mean": 0.1366145759820938, "rewards/ngrams_iou_reward/std": 0.1363389790058136, "rewards/schema_keywords_iou_reward/mean": 0.6942987442016602, "rewards/schema_keywords_iou_reward/std": 0.16505616903305054, "rewards/syntax_reward/mean": 0.6979166865348816, "rewards/syntax_reward/std": 0.4603615701198578, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 240.1354217529297, "completions/mean_terminated_length": 177.89744567871094, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.73960983884648, "frac_reward_zero_std": 0.0, "grad_norm": 0.9017961025238037, "kl": 0.05230712890625, "learning_rate": 9.375435112392969e-07, "loss": -0.0016, "num_tokens": 55712118.0, "reward": 9.806293487548828, "reward_std": 1.3079618215560913, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.5677083730697632, "rewards/judge_reward/std": 1.6573487520217896, "rewards/ngrams_iou_reward/mean": 0.18252550065517426, "rewards/ngrams_iou_reward/std": 0.20004642009735107, "rewards/schema_keywords_iou_reward/mean": 0.7279334664344788, "rewards/schema_keywords_iou_reward/std": 0.1514531970024109, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 240.39584350585938, "completions/mean_terminated_length": 172.7777862548828, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.7430025445292621, "frac_reward_zero_std": 0.0, "grad_norm": 0.9243572354316711, "kl": 0.04541015625, "learning_rate": 9.365850665627016e-07, "loss": -0.0152, "num_tokens": 55961350.0, "reward": 9.709531784057617, "reward_std": 1.6900229454040527, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.5072917938232422, "rewards/judge_reward/std": 1.5974944829940796, "rewards/ngrams_iou_reward/mean": 0.17312593758106232, "rewards/ngrams_iou_reward/std": 0.2061343491077423, "rewards/schema_keywords_iou_reward/mean": 0.7041136622428894, "rewards/schema_keywords_iou_reward/std": 0.17841532826423645, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 234.72396850585938, "completions/mean_terminated_length": 169.08509826660156, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.7463952502120441, "frac_reward_zero_std": 0.0, "grad_norm": 0.8589549660682678, "kl": 0.04559326171875, "learning_rate": 9.356198213692297e-07, "loss": 0.0084, "num_tokens": 56213795.0, "reward": 9.41270923614502, "reward_std": 1.6882447004318237, "rewards/accuracy_reward/mean": 1.078125, "rewards/accuracy_reward/std": 1.4432151317596436, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.8145833015441895, "rewards/judge_reward/std": 1.6951409578323364, "rewards/ngrams_iou_reward/mean": 0.14272744953632355, "rewards/ngrams_iou_reward/std": 0.15957039594650269, "rewards/schema_keywords_iou_reward/mean": 0.672064483165741, "rewards/schema_keywords_iou_reward/std": 0.18817104399204254, "rewards/syntax_reward/mean": 0.7760416865348816, "rewards/syntax_reward/std": 0.41798436641693115, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 233.66146850585938, "completions/mean_terminated_length": 162.76087951660156, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.7497879558948262, "frac_reward_zero_std": 0.0, "grad_norm": 0.8112618327140808, "kl": 0.04827880859375, "learning_rate": 9.346477906941331e-07, "loss": -0.0172, "num_tokens": 56462052.0, "reward": 9.517953872680664, "reward_std": 1.5904887914657593, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.2677083015441895, "rewards/judge_reward/std": 1.5502064228057861, "rewards/ngrams_iou_reward/mean": 0.19491137564182281, "rewards/ngrams_iou_reward/std": 0.2573948800563812, "rewards/schema_keywords_iou_reward/mean": 0.6782500147819519, "rewards/schema_keywords_iou_reward/std": 0.20112790167331696, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 234.9635467529297, "completions/mean_terminated_length": 166.24444580078125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.7531806615776081, "frac_reward_zero_std": 0.0, "grad_norm": 0.9250658750534058, "kl": 0.04718017578125, "learning_rate": 9.336689896783572e-07, "loss": 0.0232, "num_tokens": 56707661.0, "reward": 10.0738525390625, "reward_std": 1.49832284450531, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.6354166269302368, "rewards/judge_reward/std": 1.7111876010894775, "rewards/ngrams_iou_reward/mean": 0.1929132342338562, "rewards/ngrams_iou_reward/std": 0.24122998118400574, "rewards/schema_keywords_iou_reward/mean": 0.7142727971076965, "rewards/schema_keywords_iou_reward/std": 0.19547893106937408, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.671875, "completions/mean_terminated_length": 166.17308044433594, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.7565733672603902, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8997496366500854, "kl": 0.04766845703125, "learning_rate": 9.326834335683079e-07, "loss": -0.0006, "num_tokens": 56968028.0, "reward": 10.167144775390625, "reward_std": 1.2432916164398193, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319953322410583, "rewards/judge_reward/mean": 1.3770833015441895, "rewards/judge_reward/std": 1.670907735824585, "rewards/ngrams_iou_reward/mean": 0.14353416860103607, "rewards/ngrams_iou_reward/std": 0.10306365042924881, "rewards/schema_keywords_iou_reward/mean": 0.7329850196838379, "rewards/schema_keywords_iou_reward/std": 0.1436399221420288, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 241.23959350585938, "completions/mean_terminated_length": 172.64706420898438, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.7599660729431722, "frac_reward_zero_std": 0.0, "grad_norm": 0.8437324166297913, "kl": 0.05072021484375, "learning_rate": 9.316911377156116e-07, "loss": 0.0093, "num_tokens": 57216126.0, "reward": 9.446168899536133, "reward_std": 1.7300927639007568, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.3760417699813843, "rewards/judge_reward/std": 1.5331754684448242, "rewards/ngrams_iou_reward/mean": 0.141245499253273, "rewards/ngrams_iou_reward/std": 0.14584583044052124, "rewards/schema_keywords_iou_reward/mean": 0.6622146964073181, "rewards/schema_keywords_iou_reward/std": 0.18277786672115326, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709405064582825, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.9166717529297, "completions/mean_terminated_length": 168.3636474609375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.7633587786259542, "frac_reward_zero_std": 0.03125, "grad_norm": 1.009837031364441, "kl": 0.05059814453125, "learning_rate": 9.306921175768773e-07, "loss": -0.0001, "num_tokens": 57465320.0, "reward": 9.297211647033691, "reward_std": 1.61690354347229, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.1875, "rewards/judge_reward/std": 1.3932710886001587, "rewards/ngrams_iou_reward/mean": 0.17881639301776886, "rewards/ngrams_iou_reward/std": 0.22112593054771423, "rewards/schema_keywords_iou_reward/mean": 0.6861028075218201, "rewards/schema_keywords_iou_reward/std": 0.19607673585414886, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 238.921875, "completions/mean_terminated_length": 174.02500915527344, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.7667514843087362, "frac_reward_zero_std": 0.0, "grad_norm": 1.047918438911438, "kl": 0.04742431640625, "learning_rate": 9.29686388713456e-07, "loss": 0.016, "num_tokens": 57711305.0, "reward": 9.240447998046875, "reward_std": 1.5146160125732422, "rewards/accuracy_reward/mean": 1.078125, "rewards/accuracy_reward/std": 1.4432151317596436, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.7468749284744263, "rewards/judge_reward/std": 1.6136493682861328, "rewards/ngrams_iou_reward/mean": 0.17367738485336304, "rewards/ngrams_iou_reward/std": 0.19729462265968323, "rewards/schema_keywords_iou_reward/mean": 0.6980203986167908, "rewards/schema_keywords_iou_reward/std": 0.19640305638313293, "rewards/syntax_reward/mean": 0.703125, "rewards/syntax_reward/std": 0.4580754339694977, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 234.6666717529297, "completions/mean_terminated_length": 175.686279296875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.7701441899915182, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7995023727416992, "kl": 0.05328369140625, "learning_rate": 9.286739667911972e-07, "loss": 0.0025, "num_tokens": 57981115.0, "reward": 9.452404022216797, "reward_std": 1.5523900985717773, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.2739583253860474, "rewards/judge_reward/std": 1.487592339515686, "rewards/ngrams_iou_reward/mean": 0.18374498188495636, "rewards/ngrams_iou_reward/std": 0.21710112690925598, "rewards/schema_keywords_iou_reward/mean": 0.7103257179260254, "rewards/schema_keywords_iou_reward/std": 0.16778673231601715, "rewards/syntax_reward/mean": 0.7291666865348816, "rewards/syntax_reward/std": 0.44555196166038513, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 235.5260467529297, "completions/mean_terminated_length": 166.6591033935547, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.7735368956743003, "frac_reward_zero_std": 0.0, "grad_norm": 1.1677501201629639, "kl": 0.04742431640625, "learning_rate": 9.276548675802058e-07, "loss": 0.0312, "num_tokens": 58237800.0, "reward": 9.442485809326172, "reward_std": 1.107158899307251, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.3583332300186157, "rewards/judge_reward/std": 1.5283476114273071, "rewards/ngrams_iou_reward/mean": 0.1264105886220932, "rewards/ngrams_iou_reward/std": 0.0742591992020607, "rewards/schema_keywords_iou_reward/mean": 0.6671168208122253, "rewards/schema_keywords_iou_reward/std": 0.1650373935699463, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 234.7447967529297, "completions/mean_terminated_length": 181.79998779296875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.7769296013570822, "frac_reward_zero_std": 0.0, "grad_norm": 0.9225706458091736, "kl": 0.05218505859375, "learning_rate": 9.266291069545971e-07, "loss": 0.0299, "num_tokens": 58515611.0, "reward": 9.933282852172852, "reward_std": 1.5644052028656006, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.3604167699813843, "rewards/judge_reward/std": 1.589525818824768, "rewards/ngrams_iou_reward/mean": 0.1279701441526413, "rewards/ngrams_iou_reward/std": 0.10413546860218048, "rewards/schema_keywords_iou_reward/mean": 0.68864506483078, "rewards/schema_keywords_iou_reward/std": 0.1631442755460739, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 228.8854217529297, "completions/mean_terminated_length": 163.0357208251953, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.7803223070398643, "frac_reward_zero_std": 0.0, "grad_norm": 1.002619981765747, "kl": 0.05194091796875, "learning_rate": 9.255967008922473e-07, "loss": 0.0125, "num_tokens": 58770535.0, "reward": 10.79200267791748, "reward_std": 1.3603293895721436, "rewards/accuracy_reward/mean": 1.953125, "rewards/accuracy_reward/std": 1.433660626411438, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 1.125, "rewards/judge_reward/std": 1.6686201095581055, "rewards/ngrams_iou_reward/mean": 0.13935096561908722, "rewards/ngrams_iou_reward/std": 0.15893204510211945, "rewards/schema_keywords_iou_reward/mean": 0.6891099810600281, "rewards/schema_keywords_iou_reward/std": 0.17852462828159332, "rewards/syntax_reward/mean": 0.90625, "rewards/syntax_reward/std": 0.2922426164150238, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 224.72396850585938, "completions/mean_terminated_length": 167.69117736816406, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7837150127226463, "frac_reward_zero_std": 0.0, "grad_norm": 1.0622981786727905, "kl": 0.05108642578125, "learning_rate": 9.245576654745471e-07, "loss": 0.0337, "num_tokens": 59030552.0, "reward": 9.951770782470703, "reward_std": 1.3835065364837646, "rewards/accuracy_reward/mean": 1.90625, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 0.8343750834465027, "rewards/judge_reward/std": 1.3319534063339233, "rewards/ngrams_iou_reward/mean": 0.11590848118066788, "rewards/ngrams_iou_reward/std": 0.09449588507413864, "rewards/schema_keywords_iou_reward/mean": 0.6723195910453796, "rewards/schema_keywords_iou_reward/std": 0.1697620153427124, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 217.953125, "completions/mean_terminated_length": 164.6875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7871077184054284, "frac_reward_zero_std": 0.0, "grad_norm": 1.0301610231399536, "kl": 0.0611572265625, "learning_rate": 9.235120168861495e-07, "loss": 0.0169, "num_tokens": 59298725.0, "reward": 9.544816017150879, "reward_std": 1.7374404668807983, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.470833420753479, "rewards/judge_reward/std": 1.6129006147384644, "rewards/ngrams_iou_reward/mean": 0.1276400089263916, "rewards/ngrams_iou_reward/std": 0.11094975471496582, "rewards/schema_keywords_iou_reward/mean": 0.6682167053222656, "rewards/schema_keywords_iou_reward/std": 0.16806726157665253, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 212.6354217529297, "completions/mean_terminated_length": 159.18605041503906, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.7905004240882103, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8706866502761841, "kl": 0.0556640625, "learning_rate": 9.224597714147184e-07, "loss": 0.0183, "num_tokens": 59561797.0, "reward": 9.93582820892334, "reward_std": 1.1928070783615112, "rewards/accuracy_reward/mean": 1.8125, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 0.8979166150093079, "rewards/judge_reward/std": 1.302111029624939, "rewards/ngrams_iou_reward/mean": 0.1875443011522293, "rewards/ngrams_iou_reward/std": 0.1753094345331192, "rewards/schema_keywords_iou_reward/mean": 0.707658052444458, "rewards/schema_keywords_iou_reward/std": 0.1747918277978897, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 199.95834350585938, "completions/mean_terminated_length": 164.8135528564453, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.7938931297709924, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9527564644813538, "kl": 0.049560546875, "learning_rate": 9.214009454506752e-07, "loss": 0.0287, "num_tokens": 59888693.0, "reward": 9.23566722869873, "reward_std": 1.637404441833496, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.390625, "rewards/judge_reward/std": 1.5573220252990723, "rewards/ngrams_iou_reward/mean": 0.11637046188116074, "rewards/ngrams_iou_reward/std": 0.11711014062166214, "rewards/schema_keywords_iou_reward/mean": 0.6557552218437195, "rewards/schema_keywords_iou_reward/std": 0.16413599252700806, "rewards/syntax_reward/mean": 0.71875, "rewards/syntax_reward/std": 0.4507846534252167, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 204.609375, "completions/mean_terminated_length": 173.77500915527344, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.7972858354537744, "frac_reward_zero_std": 0.03125, "grad_norm": 0.773409366607666, "kl": 0.05255126953125, "learning_rate": 9.203355554869427e-07, "loss": -0.0126, "num_tokens": 60136142.0, "reward": 9.938444137573242, "reward_std": 1.2829805612564087, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.5390625, "rewards/judge_reward/std": 1.6363940238952637, "rewards/ngrams_iou_reward/mean": 0.17391717433929443, "rewards/ngrams_iou_reward/std": 0.17684334516525269, "rewards/schema_keywords_iou_reward/mean": 0.7072350382804871, "rewards/schema_keywords_iou_reward/std": 0.17032545804977417, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 193.8229217529297, "completions/mean_terminated_length": 164.1692352294922, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.8006785411365565, "frac_reward_zero_std": 0.0, "grad_norm": 0.8472452163696289, "kl": 0.06121826171875, "learning_rate": 9.192636181186887e-07, "loss": 0.0269, "num_tokens": 60396616.0, "reward": 9.47775936126709, "reward_std": 1.5896947383880615, "rewards/accuracy_reward/mean": 1.515625, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.2260416746139526, "rewards/judge_reward/std": 1.4790505170822144, "rewards/ngrams_iou_reward/mean": 0.13001565635204315, "rewards/ngrams_iou_reward/std": 0.0868435725569725, "rewards/schema_keywords_iou_reward/mean": 0.6717011332511902, "rewards/schema_keywords_iou_reward/std": 0.15016691386699677, "rewards/syntax_reward/mean": 0.7239583134651184, "rewards/syntax_reward/std": 0.4482063353061676, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.25, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 189.64584350585938, "completions/mean_terminated_length": 162.3235321044922, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.8040712468193384, "frac_reward_zero_std": 0.0, "grad_norm": 0.848886251449585, "kl": 0.05694580078125, "learning_rate": 9.181851500430672e-07, "loss": 0.0076, "num_tokens": 60654818.0, "reward": 9.52171802520752, "reward_std": 1.4165849685668945, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.3322917222976685, "rewards/judge_reward/std": 1.5850740671157837, "rewards/ngrams_iou_reward/mean": 0.17434930801391602, "rewards/ngrams_iou_reward/std": 0.233322411775589, "rewards/schema_keywords_iou_reward/mean": 0.6359099745750427, "rewards/schema_keywords_iou_reward/std": 0.22066178917884827, "rewards/syntax_reward/mean": 0.7760416865348816, "rewards/syntax_reward/std": 0.41798436641693115, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 183.328125, "completions/mean_terminated_length": 156.33570861816406, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.8074639525021204, "frac_reward_zero_std": 0.0, "grad_norm": 0.9794799089431763, "kl": 0.0606689453125, "learning_rate": 9.171001680589587e-07, "loss": 0.0424, "num_tokens": 60896849.0, "reward": 10.138105392456055, "reward_std": 1.5799659490585327, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.131250023841858, "rewards/judge_reward/std": 1.5406064987182617, "rewards/ngrams_iou_reward/mean": 0.22520039975643158, "rewards/ngrams_iou_reward/std": 0.22811830043792725, "rewards/schema_keywords_iou_reward/mean": 0.7285287976264954, "rewards/schema_keywords_iou_reward/std": 0.1699439436197281, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709408044815063, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 185.94271850585938, "completions/mean_terminated_length": 168.08497619628906, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.8108566581849025, "frac_reward_zero_std": 0.0, "grad_norm": 0.9357527494430542, "kl": 0.05877685546875, "learning_rate": 9.160086890667085e-07, "loss": 0.0085, "num_tokens": 61160064.0, "reward": 9.342142105102539, "reward_std": 1.3418997526168823, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 1.433333396911621, "rewards/judge_reward/std": 1.5420572757720947, "rewards/ngrams_iou_reward/mean": 0.13369546830654144, "rewards/ngrams_iou_reward/std": 0.12226149439811707, "rewards/schema_keywords_iou_reward/mean": 0.6751126646995544, "rewards/schema_keywords_iou_reward/std": 0.15130813419818878, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 184.796875, "completions/mean_terminated_length": 165.4635772705078, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.8142493638676844, "frac_reward_zero_std": 0.0, "grad_norm": 0.9205381870269775, "kl": 0.06573486328125, "learning_rate": 9.149107300678629e-07, "loss": 0.0293, "num_tokens": 61424025.0, "reward": 8.653528213500977, "reward_std": 1.4754600524902344, "rewards/accuracy_reward/mean": 0.953125, "rewards/accuracy_reward/std": 1.4004077911376953, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.734375, "rewards/judge_reward/std": 1.538771390914917, "rewards/ngrams_iou_reward/mean": 0.13073617219924927, "rewards/ngrams_iou_reward/std": 0.10117742419242859, "rewards/schema_keywords_iou_reward/mean": 0.64779132604599, "rewards/schema_keywords_iou_reward/std": 0.17059879004955292, "rewards/syntax_reward/mean": 0.59375, "rewards/syntax_reward/std": 0.49241629242897034, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 185.33334350585938, "completions/mean_terminated_length": 163.06849670410156, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.8176420695504665, "frac_reward_zero_std": 0.0, "grad_norm": 0.8902419805526733, "kl": 0.05889892578125, "learning_rate": 9.13806308164905e-07, "loss": 0.0475, "num_tokens": 61657261.0, "reward": 9.46353816986084, "reward_std": 1.6959823369979858, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.426041603088379, "rewards/judge_reward/std": 1.5561896562576294, "rewards/ngrams_iou_reward/mean": 0.1544554978609085, "rewards/ngrams_iou_reward/std": 0.14025063812732697, "rewards/schema_keywords_iou_reward/mean": 0.7017905116081238, "rewards/schema_keywords_iou_reward/std": 0.15161919593811035, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -4.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 175.4322967529297, "completions/mean_terminated_length": 159.31875610351562, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.8210347752332485, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9881713390350342, "kl": 0.06231689453125, "learning_rate": 9.126954405609882e-07, "loss": 0.0176, "num_tokens": 61935828.0, "reward": 9.04319953918457, "reward_std": 1.3423528671264648, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.453125, "rewards/judge_reward/std": 1.4814350605010986, "rewards/ngrams_iou_reward/mean": 0.21579010784626007, "rewards/ngrams_iou_reward/std": 0.2525768578052521, "rewards/schema_keywords_iou_reward/mean": 0.707617461681366, "rewards/schema_keywords_iou_reward/std": 0.16777949035167694, "rewards/syntax_reward/mean": 0.6822916865348816, "rewards/syntax_reward/std": 0.46680256724357605, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 180.8072967529297, "completions/mean_terminated_length": 159.7533416748047, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.8244274809160306, "frac_reward_zero_std": 0.0, "grad_norm": 1.0407425165176392, "kl": 0.06158447265625, "learning_rate": 9.115781445596675e-07, "loss": 0.0094, "num_tokens": 62202539.0, "reward": 9.319235801696777, "reward_std": 2.111276626586914, "rewards/accuracy_reward/mean": 1.203125, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.5604166984558105, "rewards/judge_reward/std": 1.6654467582702637, "rewards/ngrams_iou_reward/mean": 0.15098385512828827, "rewards/ngrams_iou_reward/std": 0.16367216408252716, "rewards/schema_keywords_iou_reward/mean": 0.6620010137557983, "rewards/schema_keywords_iou_reward/std": 0.17016343772411346, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 177.8541717529297, "completions/mean_terminated_length": 161.6352081298828, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.8278201865988125, "frac_reward_zero_std": 0.0, "grad_norm": 0.8890461325645447, "kl": 0.05938720703125, "learning_rate": 9.104544375646312e-07, "loss": 0.0292, "num_tokens": 62447707.0, "reward": 9.73849105834961, "reward_std": 1.4784343242645264, "rewards/accuracy_reward/mean": 1.703125, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 0.9760416150093079, "rewards/judge_reward/std": 1.447745680809021, "rewards/ngrams_iou_reward/mean": 0.1731419563293457, "rewards/ngrams_iou_reward/std": 0.21113194525241852, "rewards/schema_keywords_iou_reward/mean": 0.6913898587226868, "rewards/schema_keywords_iou_reward/std": 0.16554225981235504, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 187.7916717529297, "completions/mean_terminated_length": 161.10145568847656, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.8312128922815946, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9793894290924072, "kl": 0.0604248046875, "learning_rate": 9.09324337079429e-07, "loss": 0.0062, "num_tokens": 62705673.0, "reward": 10.396021842956543, "reward_std": 1.4881633520126343, "rewards/accuracy_reward/mean": 1.859375, "rewards/accuracy_reward/std": 1.4601210355758667, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.076041579246521, "rewards/judge_reward/std": 1.5287295579910278, "rewards/ngrams_iou_reward/mean": 0.17573495209217072, "rewards/ngrams_iou_reward/std": 0.1822175532579422, "rewards/schema_keywords_iou_reward/mean": 0.7244521975517273, "rewards/schema_keywords_iou_reward/std": 0.17445968091487885, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 195.28646850585938, "completions/mean_terminated_length": 172.13668823242188, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.8346055979643766, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8301304578781128, "kl": 0.05560302734375, "learning_rate": 9.081878607071995e-07, "loss": 0.0321, "num_tokens": 62980396.0, "reward": 9.05383586883545, "reward_std": 1.5942435264587402, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.2916666269302368, "rewards/judge_reward/std": 1.421721339225769, "rewards/ngrams_iou_reward/mean": 0.16485214233398438, "rewards/ngrams_iou_reward/std": 0.21112243831157684, "rewards/schema_keywords_iou_reward/mean": 0.6806502342224121, "rewards/schema_keywords_iou_reward/std": 0.18983706831932068, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 190.671875, "completions/mean_terminated_length": 167.04254150390625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.8379983036471587, "frac_reward_zero_std": 0.0, "grad_norm": 0.9950450658798218, "kl": 0.06121826171875, "learning_rate": 9.070450261503959e-07, "loss": -0.0201, "num_tokens": 63232117.0, "reward": 9.852965354919434, "reward_std": 1.5008835792541504, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.475000023841858, "rewards/judge_reward/std": 1.6509555578231812, "rewards/ngrams_iou_reward/mean": 0.19926603138446808, "rewards/ngrams_iou_reward/std": 0.2428976446390152, "rewards/schema_keywords_iou_reward/mean": 0.7036988735198975, "rewards/schema_keywords_iou_reward/std": 0.20065103471279144, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 206.28646850585938, "completions/mean_terminated_length": 168.43118286132812, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.8413910093299406, "frac_reward_zero_std": 0.0, "grad_norm": 1.1216374635696411, "kl": 0.05804443359375, "learning_rate": 9.058958512105104e-07, "loss": 0.0164, "num_tokens": 63462638.0, "reward": 9.276056289672852, "reward_std": 1.3540754318237305, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.3052083253860474, "rewards/judge_reward/std": 1.4920289516448975, "rewards/ngrams_iou_reward/mean": 0.17720408737659454, "rewards/ngrams_iou_reward/std": 0.1882086545228958, "rewards/schema_keywords_iou_reward/mean": 0.6603105068206787, "rewards/schema_keywords_iou_reward/std": 0.18959078192710876, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 204.2760467529297, "completions/mean_terminated_length": 173.24166870117188, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.8447837150127226, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9352987408638, "kl": 0.05206298828125, "learning_rate": 9.04740353787797e-07, "loss": 0.0122, "num_tokens": 63727669.0, "reward": 9.482316970825195, "reward_std": 1.3730342388153076, "rewards/accuracy_reward/mean": 1.390625, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.3249999284744263, "rewards/judge_reward/std": 1.4686247110366821, "rewards/ngrams_iou_reward/mean": 0.1957971602678299, "rewards/ngrams_iou_reward/std": 0.2259039431810379, "rewards/schema_keywords_iou_reward/mean": 0.6990192532539368, "rewards/schema_keywords_iou_reward/std": 0.1722036451101303, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709405064582825, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 205.1666717529297, "completions/mean_terminated_length": 171.86207580566406, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.8481764206955047, "frac_reward_zero_std": 0.0, "grad_norm": 1.0213799476623535, "kl": 0.0712890625, "learning_rate": 9.035785518809926e-07, "loss": 0.016, "num_tokens": 63971169.0, "reward": 10.084535598754883, "reward_std": 1.631601333618164, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.478124976158142, "rewards/judge_reward/std": 1.6783002614974976, "rewards/ngrams_iou_reward/mean": 0.16547521948814392, "rewards/ngrams_iou_reward/std": 0.12885642051696777, "rewards/schema_keywords_iou_reward/mean": 0.69197678565979, "rewards/schema_keywords_iou_reward/std": 0.1718154102563858, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 201.703125, "completions/mean_terminated_length": 162.08108520507812, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.8515691263782866, "frac_reward_zero_std": 0.03125, "grad_norm": 1.009796380996704, "kl": 0.05535888671875, "learning_rate": 9.024104635870367e-07, "loss": 0.0268, "num_tokens": 64222374.0, "reward": 9.520469665527344, "reward_std": 1.8720601797103882, "rewards/accuracy_reward/mean": 1.265625, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.5572916269302368, "rewards/judge_reward/std": 1.6285570859909058, "rewards/ngrams_iou_reward/mean": 0.22302883863449097, "rewards/ngrams_iou_reward/std": 0.2854088842868805, "rewards/schema_keywords_iou_reward/mean": 0.7036895751953125, "rewards/schema_keywords_iou_reward/std": 0.21348872780799866, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 205.5416717529297, "completions/mean_terminated_length": 174.58824157714844, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.8549618320610687, "frac_reward_zero_std": 0.0, "grad_norm": 0.8948268890380859, "kl": 0.06158447265625, "learning_rate": 9.01236107100789e-07, "loss": -0.0059, "num_tokens": 64462370.0, "reward": 10.292858123779297, "reward_std": 1.0320947170257568, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.6020832061767578, "rewards/judge_reward/std": 1.7307795286178589, "rewards/ngrams_iou_reward/mean": 0.1514393836259842, "rewards/ngrams_iou_reward/std": 0.1538669317960739, "rewards/schema_keywords_iou_reward/mean": 0.7132928371429443, "rewards/schema_keywords_iou_reward/std": 0.131831556558609, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 203.44271850585938, "completions/mean_terminated_length": 167.48245239257812, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.8583545377438507, "frac_reward_zero_std": 0.0, "grad_norm": 0.9167002439498901, "kl": 0.0560302734375, "learning_rate": 9.000555007147467e-07, "loss": 0.0196, "num_tokens": 64704777.0, "reward": 9.632539749145508, "reward_std": 1.2544357776641846, "rewards/accuracy_reward/mean": 1.234375, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.618749976158142, "rewards/judge_reward/std": 1.7032344341278076, "rewards/ngrams_iou_reward/mean": 0.22722335159778595, "rewards/ngrams_iou_reward/std": 0.2439805120229721, "rewards/schema_keywords_iou_reward/mean": 0.7146906852722168, "rewards/schema_keywords_iou_reward/std": 0.19603250920772552, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 206.45834350585938, "completions/mean_terminated_length": 168.7339324951172, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.8617472434266328, "frac_reward_zero_std": 0.0, "grad_norm": 0.8138647079467773, "kl": 0.05169677734375, "learning_rate": 8.988686628187596e-07, "loss": 0.0103, "num_tokens": 64947127.0, "reward": 9.929362297058105, "reward_std": 1.3103013038635254, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319954812526703, "rewards/judge_reward/mean": 1.2927082777023315, "rewards/judge_reward/std": 1.6024682521820068, "rewards/ngrams_iou_reward/mean": 0.19667065143585205, "rewards/ngrams_iou_reward/std": 0.21260519325733185, "rewards/schema_keywords_iou_reward/mean": 0.6837318539619446, "rewards/schema_keywords_iou_reward/std": 0.1972072273492813, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 191.546875, "completions/mean_terminated_length": 153.72726440429688, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.8651399491094147, "frac_reward_zero_std": 0.0, "grad_norm": 1.0196067094802856, "kl": 0.05889892578125, "learning_rate": 8.976756118997427e-07, "loss": 0.0166, "num_tokens": 65199058.0, "reward": 9.603528022766113, "reward_std": 1.8142449855804443, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.212499976158142, "rewards/judge_reward/std": 1.5378811359405518, "rewards/ngrams_iou_reward/mean": 0.25798988342285156, "rewards/ngrams_iou_reward/std": 0.25571560859680176, "rewards/schema_keywords_iou_reward/mean": 0.7278294563293457, "rewards/schema_keywords_iou_reward/std": 0.20735761523246765, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 212.58334350585938, "completions/mean_terminated_length": 178.8148193359375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.8685326547921968, "frac_reward_zero_std": 0.0, "grad_norm": 0.9792131781578064, "kl": 0.05340576171875, "learning_rate": 8.964763665413892e-07, "loss": 0.0072, "num_tokens": 65454626.0, "reward": 9.43816089630127, "reward_std": 1.3744261264801025, "rewards/accuracy_reward/mean": 1.21875, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.6177083253860474, "rewards/judge_reward/std": 1.6234855651855469, "rewards/ngrams_iou_reward/mean": 0.15819959342479706, "rewards/ngrams_iou_reward/std": 0.1675780564546585, "rewards/schema_keywords_iou_reward/mean": 0.7112105488777161, "rewards/schema_keywords_iou_reward/std": 0.15922218561172485, "rewards/syntax_reward/mean": 0.7395833134651184, "rewards/syntax_reward/std": 0.44000929594039917, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 207.38021850585938, "completions/mean_terminated_length": 170.3577880859375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.8719253604749788, "frac_reward_zero_std": 0.0, "grad_norm": 0.9361222386360168, "kl": 0.0518798828125, "learning_rate": 8.952709454238807e-07, "loss": -0.0034, "num_tokens": 65706069.0, "reward": 9.701543807983398, "reward_std": 1.1133135557174683, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.5791665315628052, "rewards/judge_reward/std": 1.6486018896102905, "rewards/ngrams_iou_reward/mean": 0.2292574644088745, "rewards/ngrams_iou_reward/std": 0.2709929645061493, "rewards/schema_keywords_iou_reward/mean": 0.7254107594490051, "rewards/schema_keywords_iou_reward/std": 0.18027403950691223, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.41087818145751953, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 215.00521850585938, "completions/mean_terminated_length": 175.6836700439453, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.8753180661577609, "frac_reward_zero_std": 0.0, "grad_norm": 0.8104617595672607, "kl": 0.0577392578125, "learning_rate": 8.940593673235961e-07, "loss": -0.0096, "num_tokens": 65965012.0, "reward": 9.05160140991211, "reward_std": 1.7867244482040405, "rewards/accuracy_reward/mean": 1.21875, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.5114582777023315, "rewards/judge_reward/std": 1.5065211057662964, "rewards/ngrams_iou_reward/mean": 0.14722640812397003, "rewards/ngrams_iou_reward/std": 0.14722679555416107, "rewards/schema_keywords_iou_reward/mean": 0.6627087593078613, "rewards/schema_keywords_iou_reward/std": 0.17923659086227417, "rewards/syntax_reward/mean": 0.6614583134651184, "rewards/syntax_reward/std": 0.47445085644721985, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 213.08334350585938, "completions/mean_terminated_length": 167.39784240722656, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.8787107718405428, "frac_reward_zero_std": 0.0, "grad_norm": 0.9819193482398987, "kl": 0.0623779296875, "learning_rate": 8.928416511128194e-07, "loss": 0.0129, "num_tokens": 66226562.0, "reward": 9.20286750793457, "reward_std": 1.6647415161132812, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.105208396911621, "rewards/judge_reward/std": 1.4449673891067505, "rewards/ngrams_iou_reward/mean": 0.17379708588123322, "rewards/ngrams_iou_reward/std": 0.19014029204845428, "rewards/schema_keywords_iou_reward/mean": 0.6832361817359924, "rewards/schema_keywords_iou_reward/std": 0.1979796290397644, "rewards/syntax_reward/mean": 0.7135416865348816, "rewards/syntax_reward/std": 0.45328807830810547, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 216.421875, "completions/mean_terminated_length": 174.2903289794922, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.8821034775233249, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8467128276824951, "kl": 0.0494384765625, "learning_rate": 8.916178157594452e-07, "loss": 0.0138, "num_tokens": 66485225.0, "reward": 9.73531723022461, "reward_std": 1.6345460414886475, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.392708420753479, "rewards/judge_reward/std": 1.6525875329971313, "rewards/ngrams_iou_reward/mean": 0.1357610672712326, "rewards/ngrams_iou_reward/std": 0.15817923843860626, "rewards/schema_keywords_iou_reward/mean": 0.6943476796150208, "rewards/schema_keywords_iou_reward/std": 0.15842823684215546, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 220.2447967529297, "completions/mean_terminated_length": 173.28915405273438, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.8854961832061069, "frac_reward_zero_std": 0.0625, "grad_norm": 0.811911404132843, "kl": 0.053955078125, "learning_rate": 8.90387880326684e-07, "loss": 0.0067, "num_tokens": 66723814.0, "reward": 10.564519882202148, "reward_std": 1.364287257194519, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.453125, "rewards/judge_reward/std": 1.6986260414123535, "rewards/ngrams_iou_reward/mean": 0.3052424192428589, "rewards/ngrams_iou_reward/std": 0.3334820866584778, "rewards/schema_keywords_iou_reward/mean": 0.7436509132385254, "rewards/schema_keywords_iou_reward/std": 0.19388622045516968, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 221.70834350585938, "completions/mean_terminated_length": 173.6999969482422, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.8888888888888888, "frac_reward_zero_std": 0.0625, "grad_norm": 1.1215749979019165, "kl": 0.04791259765625, "learning_rate": 8.891518639727649e-07, "loss": 0.0054, "num_tokens": 66969884.0, "reward": 9.40645980834961, "reward_std": 1.7534291744232178, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.384374976158142, "rewards/judge_reward/std": 1.5339664220809937, "rewards/ngrams_iou_reward/mean": 0.21505539119243622, "rewards/ngrams_iou_reward/std": 0.23523959517478943, "rewards/schema_keywords_iou_reward/mean": 0.7039041519165039, "rewards/schema_keywords_iou_reward/std": 0.1677040010690689, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.98959350585938, "completions/mean_terminated_length": 185.38235473632812, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.8922815945716709, "frac_reward_zero_std": 0.0, "grad_norm": 0.8312149047851562, "kl": 0.04974365234375, "learning_rate": 8.879097859506371e-07, "loss": 0.0307, "num_tokens": 67206846.0, "reward": 9.796958923339844, "reward_std": 1.5127133131027222, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.4739583730697632, "rewards/judge_reward/std": 1.6351341009140015, "rewards/ngrams_iou_reward/mean": 0.18522201478481293, "rewards/ngrams_iou_reward/std": 0.22773078083992004, "rewards/schema_keywords_iou_reward/mean": 0.7106941342353821, "rewards/schema_keywords_iou_reward/std": 0.15111088752746582, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.703125, "completions/mean_terminated_length": 171.05479431152344, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.8956743002544529, "frac_reward_zero_std": 0.0, "grad_norm": 0.8369292616844177, "kl": 0.05523681640625, "learning_rate": 8.866616656076696e-07, "loss": -0.0004, "num_tokens": 67461663.0, "reward": 9.624764442443848, "reward_std": 1.8248395919799805, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.3739582300186157, "rewards/judge_reward/std": 1.5682547092437744, "rewards/ngrams_iou_reward/mean": 0.1539621353149414, "rewards/ngrams_iou_reward/std": 0.17726151645183563, "rewards/schema_keywords_iou_reward/mean": 0.686427116394043, "rewards/schema_keywords_iou_reward/std": 0.17903082072734833, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.4071781635284424, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.5625, "completions/mean_terminated_length": 182.21621704101562, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.899067005937235, "frac_reward_zero_std": 0.15625, "grad_norm": 0.8031268119812012, "kl": 0.0516357421875, "learning_rate": 8.854075223853508e-07, "loss": 0.0026, "num_tokens": 67714893.0, "reward": 9.094316482543945, "reward_std": 1.4486031532287598, "rewards/accuracy_reward/mean": 1.234375, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.8697916865348816, "rewards/format_reward/std": 0.3374123275279999, "rewards/judge_reward/mean": 1.4583333730697632, "rewards/judge_reward/std": 1.5272510051727295, "rewards/ngrams_iou_reward/mean": 0.1635679453611374, "rewards/ngrams_iou_reward/std": 0.1975681632757187, "rewards/schema_keywords_iou_reward/mean": 0.6547054052352905, "rewards/schema_keywords_iou_reward/std": 0.18877412378787994, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.52084350585938, "completions/mean_terminated_length": 181.09588623046875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.9024597116200169, "frac_reward_zero_std": 0.03125, "grad_norm": 0.733848512172699, "kl": 0.0469970703125, "learning_rate": 8.841473758189852e-07, "loss": -0.0009, "num_tokens": 67968865.0, "reward": 9.841522216796875, "reward_std": 1.9221417903900146, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3129251003265381, "rewards/judge_reward/mean": 1.3802083730697632, "rewards/judge_reward/std": 1.642442226409912, "rewards/ngrams_iou_reward/mean": 0.18354551494121552, "rewards/ngrams_iou_reward/std": 0.22465436160564423, "rewards/schema_keywords_iou_reward/mean": 0.7204761505126953, "rewards/schema_keywords_iou_reward/std": 0.1796908974647522, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 223.3229217529297, "completions/mean_terminated_length": 175.56410217285156, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.905852417302799, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8119459748268127, "kl": 0.0523681640625, "learning_rate": 8.82881245537389e-07, "loss": 0.0425, "num_tokens": 68237217.0, "reward": 8.885824203491211, "reward_std": 2.043515205383301, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3129251003265381, "rewards/judge_reward/mean": 1.3104166984558105, "rewards/judge_reward/std": 1.5911717414855957, "rewards/ngrams_iou_reward/mean": 0.13375739753246307, "rewards/ngrams_iou_reward/std": 0.15665364265441895, "rewards/schema_keywords_iou_reward/mean": 0.6364404559135437, "rewards/schema_keywords_iou_reward/std": 0.21441616117954254, "rewards/syntax_reward/mean": 0.7135416865348816, "rewards/syntax_reward/std": 0.45328807830810547, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 223.703125, "completions/mean_terminated_length": 175.467529296875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.909245122985581, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7232015132904053, "kl": 0.0498046875, "learning_rate": 8.816091512625842e-07, "loss": 0.0044, "num_tokens": 68492580.0, "reward": 9.424811363220215, "reward_std": 1.682809829711914, "rewards/accuracy_reward/mean": 1.25, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.511979103088379, "rewards/judge_reward/std": 1.5763341188430786, "rewards/ngrams_iou_reward/mean": 0.19483107328414917, "rewards/ngrams_iou_reward/std": 0.25173455476760864, "rewards/schema_keywords_iou_reward/mean": 0.6956039071083069, "rewards/schema_keywords_iou_reward/std": 0.19708281755447388, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617491722107, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 230.40625, "completions/mean_terminated_length": 179.21875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.9126378286683631, "frac_reward_zero_std": 0.0, "grad_norm": 1.0307655334472656, "kl": 0.05096435546875, "learning_rate": 8.803311128094917e-07, "loss": 0.0019, "num_tokens": 68739798.0, "reward": 10.254298210144043, "reward_std": 1.4998252391815186, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.8802083134651184, "rewards/format_reward/std": 0.32556670904159546, "rewards/judge_reward/mean": 1.3333333730697632, "rewards/judge_reward/std": 1.6902966499328613, "rewards/ngrams_iou_reward/mean": 0.18255199491977692, "rewards/ngrams_iou_reward/std": 0.1818639487028122, "rewards/schema_keywords_iou_reward/mean": 0.7279953360557556, "rewards/schema_keywords_iou_reward/std": 0.17152133584022522, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 227.671875, "completions/mean_terminated_length": 173.59091186523438, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.916030534351145, "frac_reward_zero_std": 0.03125, "grad_norm": 0.820233941078186, "kl": 0.05084228515625, "learning_rate": 8.790471500856227e-07, "loss": -0.009, "num_tokens": 68990025.0, "reward": 9.773740768432617, "reward_std": 1.7385529279708862, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.6114583015441895, "rewards/judge_reward/std": 1.6389479637145996, "rewards/ngrams_iou_reward/mean": 0.16781707108020782, "rewards/ngrams_iou_reward/std": 0.21112866699695587, "rewards/schema_keywords_iou_reward/mean": 0.7215480804443359, "rewards/schema_keywords_iou_reward/std": 0.1731346994638443, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.5, "completions/mean_terminated_length": 180.0596923828125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.9194232400339271, "frac_reward_zero_std": 0.0, "grad_norm": 0.7659660577774048, "kl": 0.04840087890625, "learning_rate": 8.777572830907684e-07, "loss": 0.0081, "num_tokens": 69218145.0, "reward": 9.693902015686035, "reward_std": 1.591637372970581, "rewards/accuracy_reward/mean": 1.328125, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.5395832061767578, "rewards/judge_reward/std": 1.6053887605667114, "rewards/ngrams_iou_reward/mean": 0.18756842613220215, "rewards/ngrams_iou_reward/std": 0.20757701992988586, "rewards/schema_keywords_iou_reward/mean": 0.6823752522468567, "rewards/schema_keywords_iou_reward/std": 0.18209461867809296, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617491722107, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 226.1197967529297, "completions/mean_terminated_length": 186.8795166015625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.9228159457167091, "frac_reward_zero_std": 0.0, "grad_norm": 0.7898980379104614, "kl": 0.05718994140625, "learning_rate": 8.764615319166885e-07, "loss": -0.0056, "num_tokens": 69471320.0, "reward": 9.503124237060547, "reward_std": 1.6991095542907715, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3129251003265381, "rewards/judge_reward/mean": 1.4562500715255737, "rewards/judge_reward/std": 1.6457345485687256, "rewards/ngrams_iou_reward/mean": 0.1629471629858017, "rewards/ngrams_iou_reward/std": 0.1676943451166153, "rewards/schema_keywords_iou_reward/mean": 0.6828843951225281, "rewards/schema_keywords_iou_reward/std": 0.19858255982398987, "rewards/syntax_reward/mean": 0.7291666865348816, "rewards/syntax_reward/std": 0.44555196166038513, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 228.38021850585938, "completions/mean_terminated_length": 183.3561553955078, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.926208651399491, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8290581703186035, "kl": 0.0557861328125, "learning_rate": 8.751599167467984e-07, "loss": 0.0186, "num_tokens": 69736041.0, "reward": 9.05041790008545, "reward_std": 1.3112035989761353, "rewards/accuracy_reward/mean": 1.265625, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3485431373119354, "rewards/judge_reward/mean": 1.4166666269302368, "rewards/judge_reward/std": 1.598865270614624, "rewards/ngrams_iou_reward/mean": 0.1932845264673233, "rewards/ngrams_iou_reward/std": 0.23070308566093445, "rewards/schema_keywords_iou_reward/mean": 0.6904658675193787, "rewards/schema_keywords_iou_reward/std": 0.21832579374313354, "rewards/syntax_reward/mean": 0.6770833134651184, "rewards/syntax_reward/std": 0.46881362795829773, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.8229217529297, "completions/mean_terminated_length": 176.75408935546875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.9296013570822731, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7897836565971375, "kl": 0.05157470703125, "learning_rate": 8.738524578558546e-07, "loss": -0.006, "num_tokens": 70014113.0, "reward": 9.115970611572266, "reward_std": 1.6625447273254395, "rewards/accuracy_reward/mean": 1.015625, "rewards/accuracy_reward/std": 1.423352599143982, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.7572916746139526, "rewards/judge_reward/std": 1.59372079372406, "rewards/ngrams_iou_reward/mean": 0.2101029008626938, "rewards/ngrams_iou_reward/std": 0.2446986585855484, "rewards/schema_keywords_iou_reward/mean": 0.6933674812316895, "rewards/schema_keywords_iou_reward/std": 0.1925392895936966, "rewards/syntax_reward/mean": 0.734375, "rewards/syntax_reward/std": 0.44282010197639465, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 227.171875, "completions/mean_terminated_length": 168.1428680419922, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.9329940627650551, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8495862483978271, "kl": 0.0550537109375, "learning_rate": 8.725391756096389e-07, "loss": 0.0354, "num_tokens": 70255622.0, "reward": 9.515190124511719, "reward_std": 1.5486485958099365, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.2458332777023315, "rewards/judge_reward/std": 1.5048902034759521, "rewards/ngrams_iou_reward/mean": 0.27296391129493713, "rewards/ngrams_iou_reward/std": 0.2712905704975128, "rewards/schema_keywords_iou_reward/mean": 0.7141000628471375, "rewards/schema_keywords_iou_reward/std": 0.20519667863845825, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 231.3541717529297, "completions/mean_terminated_length": 178.4262237548828, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.9363867684478372, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8611177802085876, "kl": 0.04815673828125, "learning_rate": 8.712200904646416e-07, "loss": 0.0028, "num_tokens": 70525432.0, "reward": 9.956605911254883, "reward_std": 1.4421980381011963, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.1687499284744263, "rewards/judge_reward/std": 1.585755467414856, "rewards/ngrams_iou_reward/mean": 0.19130586087703705, "rewards/ngrams_iou_reward/std": 0.25367096066474915, "rewards/schema_keywords_iou_reward/mean": 0.703840970993042, "rewards/schema_keywords_iou_reward/std": 0.20482349395751953, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 229.609375, "completions/mean_terminated_length": 178.04615783691406, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.9397794741306191, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9364460110664368, "kl": 0.0501708984375, "learning_rate": 8.698952229677421e-07, "loss": 0.0147, "num_tokens": 70779853.0, "reward": 9.330093383789062, "reward_std": 1.7681525945663452, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.4427083730697632, "rewards/judge_reward/std": 1.6006702184677124, "rewards/ngrams_iou_reward/mean": 0.12988723814487457, "rewards/ngrams_iou_reward/std": 0.1280377358198166, "rewards/schema_keywords_iou_reward/mean": 0.6585392355918884, "rewards/schema_keywords_iou_reward/std": 0.1968698650598526, "rewards/syntax_reward/mean": 0.7083333134651184, "rewards/syntax_reward/std": 0.4557180106639862, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 230.86459350585938, "completions/mean_terminated_length": 174.20338439941406, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.9431721798134012, "frac_reward_zero_std": 0.03125, "grad_norm": 1.0352232456207275, "kl": 0.05255126953125, "learning_rate": 8.685645937558894e-07, "loss": 0.0076, "num_tokens": 71028209.0, "reward": 9.142389297485352, "reward_std": 1.5586214065551758, "rewards/accuracy_reward/mean": 1.265625, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.4666666984558105, "rewards/judge_reward/std": 1.6223998069763184, "rewards/ngrams_iou_reward/mean": 0.1834983229637146, "rewards/ngrams_iou_reward/std": 0.20338940620422363, "rewards/schema_keywords_iou_reward/mean": 0.6557660698890686, "rewards/schema_keywords_iou_reward/std": 0.19225351512432098, "rewards/syntax_reward/mean": 0.6666666865348816, "rewards/syntax_reward/std": 0.4726369380950928, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 237.09896850585938, "completions/mean_terminated_length": 178.7872314453125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.9465648854961832, "frac_reward_zero_std": 0.03125, "grad_norm": 0.851335346698761, "kl": 0.05621337890625, "learning_rate": 8.672282235557808e-07, "loss": 0.0045, "num_tokens": 71280714.0, "reward": 9.367599487304688, "reward_std": 1.6808289289474487, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.2135416269302368, "rewards/judge_reward/std": 1.4797848463058472, "rewards/ngrams_iou_reward/mean": 0.18218278884887695, "rewards/ngrams_iou_reward/std": 0.20218423008918762, "rewards/schema_keywords_iou_reward/mean": 0.695833683013916, "rewards/schema_keywords_iou_reward/std": 0.19009168446063995, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 233.64584350585938, "completions/mean_terminated_length": 184.4666748046875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.9499575911789653, "frac_reward_zero_std": 0.0, "grad_norm": 0.8227637410163879, "kl": 0.053955078125, "learning_rate": 8.658861331835383e-07, "loss": 0.0145, "num_tokens": 71542042.0, "reward": 9.196304321289062, "reward_std": 1.622477412223816, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3129251003265381, "rewards/judge_reward/mean": 1.370833396911621, "rewards/judge_reward/std": 1.5590248107910156, "rewards/ngrams_iou_reward/mean": 0.2110474705696106, "rewards/ngrams_iou_reward/std": 0.2557852566242218, "rewards/schema_keywords_iou_reward/mean": 0.6758812069892883, "rewards/schema_keywords_iou_reward/std": 0.20656326413154602, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.0260467529297, "completions/mean_terminated_length": 179.94827270507812, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.9533502968617472, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7250376343727112, "kl": 0.05218505859375, "learning_rate": 8.645383435443851e-07, "loss": 0.013, "num_tokens": 71784621.0, "reward": 9.539558410644531, "reward_std": 1.4685863256454468, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.1281250715255737, "rewards/judge_reward/std": 1.4449915885925293, "rewards/ngrams_iou_reward/mean": 0.2421942949295044, "rewards/ngrams_iou_reward/std": 0.28425469994544983, "rewards/schema_keywords_iou_reward/mean": 0.7442388534545898, "rewards/schema_keywords_iou_reward/std": 0.18042252957820892, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 228.2604217529297, "completions/mean_terminated_length": 172.78125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.9567430025445293, "frac_reward_zero_std": 0.0, "grad_norm": 0.8836221694946289, "kl": 0.05548095703125, "learning_rate": 8.631848756323197e-07, "loss": 0.0097, "num_tokens": 72069131.0, "reward": 9.38337516784668, "reward_std": 1.5167067050933838, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.3583332300186157, "rewards/judge_reward/std": 1.534501552581787, "rewards/ngrams_iou_reward/mean": 0.18674886226654053, "rewards/ngrams_iou_reward/std": 0.2110777050256729, "rewards/schema_keywords_iou_reward/mean": 0.677875280380249, "rewards/schema_keywords_iou_reward/std": 0.1773262321949005, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 223.30209350585938, "completions/mean_terminated_length": 168.80555725097656, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.9601357082273113, "frac_reward_zero_std": 0.125, "grad_norm": 0.8766878247261047, "kl": 0.05462646484375, "learning_rate": 8.618257505297886e-07, "loss": -0.0069, "num_tokens": 72298245.0, "reward": 10.00924301147461, "reward_std": 1.113537311553955, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.1947916746139526, "rewards/judge_reward/std": 1.583876132965088, "rewards/ngrams_iou_reward/mean": 0.28187909722328186, "rewards/ngrams_iou_reward/std": 0.3185163140296936, "rewards/schema_keywords_iou_reward/mean": 0.7336137890815735, "rewards/schema_keywords_iou_reward/std": 0.19628819823265076, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 231.9947967529297, "completions/mean_terminated_length": 169.03773498535156, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.9635284139100933, "frac_reward_zero_std": 0.0, "grad_norm": 0.9762909412384033, "kl": 0.05413818359375, "learning_rate": 8.604609894073583e-07, "loss": 0.0066, "num_tokens": 72540662.0, "reward": 10.029546737670898, "reward_std": 1.1749351024627686, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.6708332300186157, "rewards/judge_reward/std": 1.7111839056015015, "rewards/ngrams_iou_reward/mean": 0.19197697937488556, "rewards/ngrams_iou_reward/std": 0.19387342035770416, "rewards/schema_keywords_iou_reward/mean": 0.6886112689971924, "rewards/schema_keywords_iou_reward/std": 0.18129286170005798, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 229.30209350585938, "completions/mean_terminated_length": 171.9672088623047, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.9669211195928753, "frac_reward_zero_std": 0.03125, "grad_norm": 0.875368058681488, "kl": 0.05853271484375, "learning_rate": 8.590906135233853e-07, "loss": -0.008, "num_tokens": 72779568.0, "reward": 10.341565132141113, "reward_std": 1.3416099548339844, "rewards/accuracy_reward/mean": 1.703125, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.2937499284744263, "rewards/judge_reward/std": 1.6062582731246948, "rewards/ngrams_iou_reward/mean": 0.18949396908283234, "rewards/ngrams_iou_reward/std": 0.20363827049732208, "rewards/schema_keywords_iou_reward/mean": 0.7312377095222473, "rewards/schema_keywords_iou_reward/std": 0.15025869011878967, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 230.73959350585938, "completions/mean_terminated_length": 176.49179077148438, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.9703138252756573, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8594796657562256, "kl": 0.05377197265625, "learning_rate": 8.577146442236856e-07, "loss": 0.0041, "num_tokens": 73038928.0, "reward": 10.337403297424316, "reward_std": 1.0963250398635864, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.5541666746139526, "rewards/judge_reward/std": 1.7354881763458252, "rewards/ngrams_iou_reward/mean": 0.218450129032135, "rewards/ngrams_iou_reward/std": 0.2431487739086151, "rewards/schema_keywords_iou_reward/mean": 0.7554113268852234, "rewards/schema_keywords_iou_reward/std": 0.13967585563659668, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 231.20834350585938, "completions/mean_terminated_length": 180.4444580078125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.9737065309584394, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9915276169776917, "kl": 0.0533447265625, "learning_rate": 8.563331029412011e-07, "loss": 0.0073, "num_tokens": 73318214.0, "reward": 9.62673568725586, "reward_std": 1.2473604679107666, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.5864583253860474, "rewards/judge_reward/std": 1.6794488430023193, "rewards/ngrams_iou_reward/mean": 0.1633450984954834, "rewards/ngrams_iou_reward/std": 0.1631261110305786, "rewards/schema_keywords_iou_reward/mean": 0.6863066554069519, "rewards/schema_keywords_iou_reward/std": 0.17168110609054565, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 232.1979217529297, "completions/mean_terminated_length": 169.77359008789062, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.9770992366412213, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9591848850250244, "kl": 0.053466796875, "learning_rate": 8.549460111956663e-07, "loss": 0.0067, "num_tokens": 73567444.0, "reward": 10.257317543029785, "reward_std": 1.4225744009017944, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.1416667699813843, "rewards/judge_reward/std": 1.560753345489502, "rewards/ngrams_iou_reward/mean": 0.24372689425945282, "rewards/ngrams_iou_reward/std": 0.28134170174598694, "rewards/schema_keywords_iou_reward/mean": 0.7094240188598633, "rewards/schema_keywords_iou_reward/std": 0.19639897346496582, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 235.515625, "completions/mean_terminated_length": 168.60000610351562, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.9804919423240034, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7556264400482178, "kl": 0.0587158203125, "learning_rate": 8.535533905932737e-07, "loss": 0.0, "num_tokens": 73818007.0, "reward": 9.74096965789795, "reward_std": 1.6147255897521973, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.448958396911621, "rewards/judge_reward/std": 1.5876152515411377, "rewards/ngrams_iou_reward/mean": 0.24776691198349, "rewards/ngrams_iou_reward/std": 0.3024246394634247, "rewards/schema_keywords_iou_reward/mean": 0.7254939079284668, "rewards/schema_keywords_iou_reward/std": 0.18374955654144287, "rewards/syntax_reward/mean": 0.7083333134651184, "rewards/syntax_reward/std": 0.4557180106639862, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 239.03125, "completions/mean_terminated_length": 178.42857360839844, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.9838846480067854, "frac_reward_zero_std": 0.0, "grad_norm": 0.8465245366096497, "kl": 0.05853271484375, "learning_rate": 8.521552628263361e-07, "loss": -0.0054, "num_tokens": 74075119.0, "reward": 9.251218795776367, "reward_std": 1.5469712018966675, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.730208396911621, "rewards/judge_reward/std": 1.60650634765625, "rewards/ngrams_iou_reward/mean": 0.14780385792255402, "rewards/ngrams_iou_reward/std": 0.14084024727344513, "rewards/schema_keywords_iou_reward/mean": 0.6586229205131531, "rewards/schema_keywords_iou_reward/std": 0.1755410134792328, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 236.5104217529297, "completions/mean_terminated_length": 174.6521759033203, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.9872773536895675, "frac_reward_zero_std": 0.0625, "grad_norm": 1.0497316122055054, "kl": 0.04937744140625, "learning_rate": 8.507516496729493e-07, "loss": 0.0019, "num_tokens": 74336919.0, "reward": 9.36305046081543, "reward_std": 1.732349157333374, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.2531250715255737, "rewards/judge_reward/std": 1.5116616487503052, "rewards/ngrams_iou_reward/mean": 0.17518652975559235, "rewards/ngrams_iou_reward/std": 0.21626558899879456, "rewards/schema_keywords_iou_reward/mean": 0.6711959838867188, "rewards/schema_keywords_iou_reward/std": 0.1832546442747116, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 232.4479217529297, "completions/mean_terminated_length": 169.03846740722656, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.9906700593723494, "frac_reward_zero_std": 0.03125, "grad_norm": 1.0613864660263062, "kl": 0.05389404296875, "learning_rate": 8.493425729966533e-07, "loss": -0.0034, "num_tokens": 74596073.0, "reward": 10.112739562988281, "reward_std": 0.9491310119628906, "rewards/accuracy_reward/mean": 1.8125, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 0.9677083492279053, "rewards/judge_reward/std": 1.4581055641174316, "rewards/ngrams_iou_reward/mean": 0.22099702060222626, "rewards/ngrams_iou_reward/std": 0.2545998692512512, "rewards/schema_keywords_iou_reward/mean": 0.7271583676338196, "rewards/schema_keywords_iou_reward/std": 0.166539266705513, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.796875, "completions/mean_terminated_length": 172.5689697265625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.9940627650551315, "frac_reward_zero_std": 0.0, "grad_norm": 0.9868425726890564, "kl": 0.05621337890625, "learning_rate": 8.479280547460906e-07, "loss": 0.0156, "num_tokens": 74880644.0, "reward": 9.400659561157227, "reward_std": 1.2054561376571655, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.4833332300186157, "rewards/judge_reward/std": 1.6626899242401123, "rewards/ngrams_iou_reward/mean": 0.1555403470993042, "rewards/ngrams_iou_reward/std": 0.13460871577262878, "rewards/schema_keywords_iou_reward/mean": 0.6951176524162292, "rewards/schema_keywords_iou_reward/std": 0.17818307876586914, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.578125, "completions/mean_terminated_length": 197.94117736816406, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.9974554707379135, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8838077783584595, "kl": 0.05535888671875, "learning_rate": 8.465081169546658e-07, "loss": 0.0036, "num_tokens": 75173975.0, "reward": 9.163924217224121, "reward_std": 1.9042174816131592, "rewards/accuracy_reward/mean": 1.140625, "rewards/accuracy_reward/std": 1.4601210355758667, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.5760416984558105, "rewards/judge_reward/std": 1.512685775756836, "rewards/ngrams_iou_reward/mean": 0.18964223563671112, "rewards/ngrams_iou_reward/std": 0.2317001074552536, "rewards/schema_keywords_iou_reward/mean": 0.6867812275886536, "rewards/schema_keywords_iou_reward/std": 0.17344029247760773, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 233.46875, "completions/mean_terminated_length": 178.75001525878906, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 1.003392705682782, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9890689849853516, "kl": 0.05377197265625, "learning_rate": 8.450827817402011e-07, "loss": 0.0164, "num_tokens": 75412241.0, "reward": 9.768585205078125, "reward_std": 1.375563621520996, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.0281249284744263, "rewards/judge_reward/std": 1.4534451961517334, "rewards/ngrams_iou_reward/mean": 0.16987638175487518, "rewards/ngrams_iou_reward/std": 0.18500454723834991, "rewards/schema_keywords_iou_reward/mean": 0.6935000419616699, "rewards/schema_keywords_iou_reward/std": 0.18115457892417908, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 235.98959350585938, "completions/mean_terminated_length": 188.59649658203125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 1.006785411365564, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7872971892356873, "kl": 0.05328369140625, "learning_rate": 8.436520713045922e-07, "loss": -0.0091, "num_tokens": 75680571.0, "reward": 9.739313125610352, "reward_std": 1.474705457687378, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.691666603088379, "rewards/judge_reward/std": 1.6816636323928833, "rewards/ngrams_iou_reward/mean": 0.1337403804063797, "rewards/ngrams_iou_reward/std": 0.1671513020992279, "rewards/schema_keywords_iou_reward/mean": 0.6649473905563354, "rewards/schema_keywords_iou_reward/std": 0.1613309234380722, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 234.14584350585938, "completions/mean_terminated_length": 176.83018493652344, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 1.0101781170483461, "frac_reward_zero_std": 0.0, "grad_norm": 0.9714939594268799, "kl": 0.05816650390625, "learning_rate": 8.422160079334627e-07, "loss": 0.0243, "num_tokens": 75931117.0, "reward": 9.173254013061523, "reward_std": 2.024620532989502, "rewards/accuracy_reward/mean": 1.390625, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.240625023841858, "rewards/judge_reward/std": 1.488882303237915, "rewards/ngrams_iou_reward/mean": 0.20157963037490845, "rewards/ngrams_iou_reward/std": 0.2722023129463196, "rewards/schema_keywords_iou_reward/mean": 0.6414656043052673, "rewards/schema_keywords_iou_reward/std": 0.22392451763153076, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 225.875, "completions/mean_terminated_length": 167.015380859375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 1.013570822731128, "frac_reward_zero_std": 0.0, "grad_norm": 0.9318587779998779, "kl": 0.05474853515625, "learning_rate": 8.407746139958168e-07, "loss": 0.0292, "num_tokens": 76183141.0, "reward": 9.917665481567383, "reward_std": 1.5890493392944336, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.5343750715255737, "rewards/judge_reward/std": 1.7199978828430176, "rewards/ngrams_iou_reward/mean": 0.23246295750141144, "rewards/ngrams_iou_reward/std": 0.26892638206481934, "rewards/schema_keywords_iou_reward/mean": 0.6945762634277344, "rewards/schema_keywords_iou_reward/std": 0.20988033711910248, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 231.484375, "completions/mean_terminated_length": 183.58460998535156, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 1.01696352841391, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8710213899612427, "kl": 0.06121826171875, "learning_rate": 8.393279119436911e-07, "loss": 0.0017, "num_tokens": 76456606.0, "reward": 9.34317684173584, "reward_std": 1.6255906820297241, "rewards/accuracy_reward/mean": 1.328125, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.8854166865348816, "rewards/format_reward/std": 0.3193511664867401, "rewards/judge_reward/mean": 1.4677082300186157, "rewards/judge_reward/std": 1.5698063373565674, "rewards/ngrams_iou_reward/mean": 0.1469111293554306, "rewards/ngrams_iou_reward/std": 0.15898248553276062, "rewards/schema_keywords_iou_reward/mean": 0.6827230453491211, "rewards/schema_keywords_iou_reward/std": 0.1917751282453537, "rewards/syntax_reward/mean": 0.7083333134651184, "rewards/syntax_reward/std": 0.4557179808616638, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 234.2447967529297, "completions/mean_terminated_length": 183.98275756835938, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 1.020356234096692, "frac_reward_zero_std": 0.09375, "grad_norm": 0.855452299118042, "kl": 0.05975341796875, "learning_rate": 8.378759243118043e-07, "loss": 0.0069, "num_tokens": 76733145.0, "reward": 9.163647651672363, "reward_std": 1.4855623245239258, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.4208332300186157, "rewards/judge_reward/std": 1.5728000402450562, "rewards/ngrams_iou_reward/mean": 0.1642395257949829, "rewards/ngrams_iou_reward/std": 0.14112579822540283, "rewards/schema_keywords_iou_reward/mean": 0.6733654141426086, "rewards/schema_keywords_iou_reward/std": 0.19789697229862213, "rewards/syntax_reward/mean": 0.7135416865348816, "rewards/syntax_reward/std": 0.45328807830810547, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.5260467529297, "completions/mean_terminated_length": 165.23214721679688, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 1.0237489397794741, "frac_reward_zero_std": 0.0, "grad_norm": 1.087676763534546, "kl": 0.05596923828125, "learning_rate": 8.364186737172068e-07, "loss": 0.0071, "num_tokens": 76991444.0, "reward": 10.329177856445312, "reward_std": 1.5826090574264526, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.386458396911621, "rewards/judge_reward/std": 1.6483607292175293, "rewards/ngrams_iou_reward/mean": 0.22123245894908905, "rewards/ngrams_iou_reward/std": 0.2351592779159546, "rewards/schema_keywords_iou_reward/mean": 0.7256529927253723, "rewards/schema_keywords_iou_reward/std": 0.17982374131679535, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 226.2604217529297, "completions/mean_terminated_length": 170.776123046875, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 1.0271416454622562, "frac_reward_zero_std": 0.09375, "grad_norm": 0.9318032264709473, "kl": 0.05401611328125, "learning_rate": 8.349561828589275e-07, "loss": 0.0039, "num_tokens": 77229388.0, "reward": 9.44169807434082, "reward_std": 1.3578739166259766, "rewards/accuracy_reward/mean": 1.390625, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.3458333015441895, "rewards/judge_reward/std": 1.5647903680801392, "rewards/ngrams_iou_reward/mean": 0.19352352619171143, "rewards/ngrams_iou_reward/std": 0.23152439296245575, "rewards/schema_keywords_iou_reward/mean": 0.6919237971305847, "rewards/schema_keywords_iou_reward/std": 0.18036840856075287, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 227.89584350585938, "completions/mean_terminated_length": 180.0, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 1.0305343511450382, "frac_reward_zero_std": 0.0, "grad_norm": 1.019416093826294, "kl": 0.05865478515625, "learning_rate": 8.334884745176219e-07, "loss": 0.0258, "num_tokens": 77484062.0, "reward": 9.994758605957031, "reward_std": 1.3084287643432617, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.5593749284744263, "rewards/judge_reward/std": 1.657543659210205, "rewards/ngrams_iou_reward/mean": 0.20613479614257812, "rewards/ngrams_iou_reward/std": 0.23100760579109192, "rewards/schema_keywords_iou_reward/mean": 0.7063314318656921, "rewards/schema_keywords_iou_reward/std": 0.16661351919174194, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 224.265625, "completions/mean_terminated_length": 163.68182373046875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 1.0339270568278203, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9235818386077881, "kl": 0.05230712890625, "learning_rate": 8.320155715552155e-07, "loss": 0.0129, "num_tokens": 77731793.0, "reward": 10.512555122375488, "reward_std": 1.7303435802459717, "rewards/accuracy_reward/mean": 1.703125, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.3125, "rewards/judge_reward/std": 1.7202976942062378, "rewards/ngrams_iou_reward/mean": 0.2649877369403839, "rewards/ngrams_iou_reward/std": 0.3194259703159332, "rewards/schema_keywords_iou_reward/mean": 0.7371500134468079, "rewards/schema_keywords_iou_reward/std": 0.2052575647830963, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374122977256775, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 226.50521850585938, "completions/mean_terminated_length": 177.34722900390625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 1.0373197625106023, "frac_reward_zero_std": 0.0, "grad_norm": 1.0110901594161987, "kl": 0.05767822265625, "learning_rate": 8.305374969145487e-07, "loss": 0.0126, "num_tokens": 77996160.0, "reward": 9.625341415405273, "reward_std": 1.593178153038025, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.495833396911621, "rewards/judge_reward/std": 1.557815432548523, "rewards/ngrams_iou_reward/mean": 0.20381300151348114, "rewards/ngrams_iou_reward/std": 0.23438461124897003, "rewards/schema_keywords_iou_reward/mean": 0.6954858899116516, "rewards/schema_keywords_iou_reward/std": 0.1903950572013855, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151108264923, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 225.7447967529297, "completions/mean_terminated_length": 183.3874969482422, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 1.0407124681933841, "frac_reward_zero_std": 0.0, "grad_norm": 0.7696357369422913, "kl": 0.0601806640625, "learning_rate": 8.290542736190188e-07, "loss": 0.0063, "num_tokens": 78262823.0, "reward": 9.47662353515625, "reward_std": 1.9026954174041748, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.2833333015441895, "rewards/judge_reward/std": 1.5274566411972046, "rewards/ngrams_iou_reward/mean": 0.16284343600273132, "rewards/ngrams_iou_reward/std": 0.21375161409378052, "rewards/schema_keywords_iou_reward/mean": 0.6898207664489746, "rewards/schema_keywords_iou_reward/std": 0.18085913360118866, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 222.25521850585938, "completions/mean_terminated_length": 164.74647521972656, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 1.0441051738761662, "frac_reward_zero_std": 0.03125, "grad_norm": 1.1117867231369019, "kl": 0.0528564453125, "learning_rate": 8.275659247722221e-07, "loss": 0.0029, "num_tokens": 78507462.0, "reward": 9.924399375915527, "reward_std": 1.3577125072479248, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710795402526855, "rewards/judge_reward/mean": 1.363541603088379, "rewards/judge_reward/std": 1.617484211921692, "rewards/ngrams_iou_reward/mean": 0.21879346668720245, "rewards/ngrams_iou_reward/std": 0.2779501676559448, "rewards/schema_keywords_iou_reward/mean": 0.692063570022583, "rewards/schema_keywords_iou_reward/std": 0.20377576351165771, "rewards/syntax_reward/mean": 0.7760416865348816, "rewards/syntax_reward/std": 0.41798436641693115, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 227.40625, "completions/mean_terminated_length": 172.8181915283203, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.0474978795589482, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9880484342575073, "kl": 0.055908203125, "learning_rate": 8.260724735575932e-07, "loss": 0.0189, "num_tokens": 78753150.0, "reward": 9.433656692504883, "reward_std": 1.5886752605438232, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710795402526855, "rewards/judge_reward/mean": 1.28125, "rewards/judge_reward/std": 1.483473539352417, "rewards/ngrams_iou_reward/mean": 0.24704675376415253, "rewards/ngrams_iou_reward/std": 0.2665778696537018, "rewards/schema_keywords_iou_reward/mean": 0.7022343277931213, "rewards/schema_keywords_iou_reward/std": 0.20395904779434204, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 225.5885467529297, "completions/mean_terminated_length": 183.91357421875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.0508905852417303, "frac_reward_zero_std": 0.03125, "grad_norm": 1.2011899948120117, "kl": 0.05743408203125, "learning_rate": 8.24573943238045e-07, "loss": 0.0098, "num_tokens": 79001945.0, "reward": 10.228862762451172, "reward_std": 1.4377658367156982, "rewards/accuracy_reward/mean": 1.796875, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.1583333015441895, "rewards/judge_reward/std": 1.579559564590454, "rewards/ngrams_iou_reward/mean": 0.14399170875549316, "rewards/ngrams_iou_reward/std": 0.13412149250507355, "rewards/schema_keywords_iou_reward/mean": 0.7057035565376282, "rewards/schema_keywords_iou_reward/std": 0.18667787313461304, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 216.9791717529297, "completions/mean_terminated_length": 173.6703338623047, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 1.0542832909245123, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8111373782157898, "kl": 0.0552978515625, "learning_rate": 8.230703571556048e-07, "loss": -0.0008, "num_tokens": 79270363.0, "reward": 10.329174995422363, "reward_std": 1.2216477394104004, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.2343748807907104, "rewards/judge_reward/std": 1.586020588874817, "rewards/ngrams_iou_reward/mean": 0.2229411005973816, "rewards/ngrams_iou_reward/std": 0.2150125652551651, "rewards/schema_keywords_iou_reward/mean": 0.7103994488716125, "rewards/schema_keywords_iou_reward/std": 0.16547754406929016, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 221.69271850585938, "completions/mean_terminated_length": 175.6707305908203, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.0576759966072944, "frac_reward_zero_std": 0.0, "grad_norm": 0.821366012096405, "kl": 0.05609130859375, "learning_rate": 8.215617387310522e-07, "loss": 0.0214, "num_tokens": 79533632.0, "reward": 9.72574234008789, "reward_std": 1.3606181144714355, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.1979166269302368, "rewards/judge_reward/std": 1.4997366666793823, "rewards/ngrams_iou_reward/mean": 0.12412545830011368, "rewards/ngrams_iou_reward/std": 0.10270538181066513, "rewards/schema_keywords_iou_reward/mean": 0.6693246960639954, "rewards/schema_keywords_iou_reward/std": 0.17370016872882843, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 211.33334350585938, "completions/mean_terminated_length": 171.089111328125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 1.0610687022900764, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8926442861557007, "kl": 0.0557861328125, "learning_rate": 8.200481114635536e-07, "loss": -0.005, "num_tokens": 79789896.0, "reward": 10.230069160461426, "reward_std": 1.529884696006775, "rewards/accuracy_reward/mean": 1.515625, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.527083396911621, "rewards/judge_reward/std": 1.7125638723373413, "rewards/ngrams_iou_reward/mean": 0.16486847400665283, "rewards/ngrams_iou_reward/std": 0.1916797012090683, "rewards/schema_keywords_iou_reward/mean": 0.714158296585083, "rewards/schema_keywords_iou_reward/std": 0.1827513724565506, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 213.453125, "completions/mean_terminated_length": 170.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 1.0610687022900764, "frac_reward_zero_std": 0.0, "grad_norm": 0.8718066215515137, "kl": 0.0595703125, "learning_rate": 8.185294989302957e-07, "loss": 0.021, "num_tokens": 80046087.0, "reward": 10.37564468383789, "reward_std": 1.6183526515960693, "rewards/accuracy_reward/mean": 1.765625, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.2187498807907104, "rewards/judge_reward/std": 1.655725359916687, "rewards/ngrams_iou_reward/mean": 0.16581566631793976, "rewards/ngrams_iou_reward/std": 0.1735735535621643, "rewards/schema_keywords_iou_reward/mean": 0.7046200633049011, "rewards/schema_keywords_iou_reward/std": 0.17989203333854675, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.75, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 215.390625, "completions/mean_terminated_length": 167.39773559570312, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 1.0644614079728583, "frac_reward_zero_std": 0.0625, "grad_norm": 0.793599545955658, "kl": 0.05426025390625, "learning_rate": 8.170059247861193e-07, "loss": 0.0114, "num_tokens": 80293242.0, "reward": 10.167749404907227, "reward_std": 1.0045160055160522, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.1729166507720947, "rewards/judge_reward/std": 1.5813360214233398, "rewards/ngrams_iou_reward/mean": 0.20295989513397217, "rewards/ngrams_iou_reward/std": 0.23718927800655365, "rewards/schema_keywords_iou_reward/mean": 0.7283303141593933, "rewards/schema_keywords_iou_reward/std": 0.1696314662694931, "rewards/syntax_reward/mean": 0.7760416865348816, "rewards/syntax_reward/std": 0.41798436641693115, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 210.09896850585938, "completions/mean_terminated_length": 164.1979217529297, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 1.0678541136556403, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8773514032363892, "kl": 0.05804443359375, "learning_rate": 8.154774127631501e-07, "loss": 0.0027, "num_tokens": 80563633.0, "reward": 9.226722717285156, "reward_std": 1.7824853658676147, "rewards/accuracy_reward/mean": 1.109375, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.6489582061767578, "rewards/judge_reward/std": 1.5556344985961914, "rewards/ngrams_iou_reward/mean": 0.20896808803081512, "rewards/ngrams_iou_reward/std": 0.23159094154834747, "rewards/schema_keywords_iou_reward/mean": 0.7198372483253479, "rewards/schema_keywords_iou_reward/std": 0.17523716390132904, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 219.171875, "completions/mean_terminated_length": 174.72413635253906, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 1.0712468193384224, "frac_reward_zero_std": 0.0, "grad_norm": 0.9402597546577454, "kl": 0.0592041015625, "learning_rate": 8.139439866704292e-07, "loss": 0.006, "num_tokens": 80821672.0, "reward": 9.65654468536377, "reward_std": 1.8326537609100342, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.3953126668930054, "rewards/judge_reward/std": 1.617647647857666, "rewards/ngrams_iou_reward/mean": 0.17372508347034454, "rewards/ngrams_iou_reward/std": 0.18790669739246368, "rewards/schema_keywords_iou_reward/mean": 0.6713606715202332, "rewards/schema_keywords_iou_reward/std": 0.17839671671390533, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947065711021423, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 214.28125, "completions/mean_terminated_length": 172.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.0746395250212044, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9217004179954529, "kl": 0.055419921875, "learning_rate": 8.124056703935423e-07, "loss": 0.0165, "num_tokens": 81062704.0, "reward": 10.25395393371582, "reward_std": 1.3493568897247314, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.3354166746139526, "rewards/judge_reward/std": 1.6160178184509277, "rewards/ngrams_iou_reward/mean": 0.2295682430267334, "rewards/ngrams_iou_reward/std": 0.2161162942647934, "rewards/schema_keywords_iou_reward/mean": 0.7389690279960632, "rewards/schema_keywords_iou_reward/std": 0.16753704845905304, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 210.28646850585938, "completions/mean_terminated_length": 165.5154571533203, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 1.0780322307039865, "frac_reward_zero_std": 0.03125, "grad_norm": 0.770844042301178, "kl": 0.054931640625, "learning_rate": 8.108624878942476e-07, "loss": 0.0107, "num_tokens": 81335387.0, "reward": 9.699958801269531, "reward_std": 1.1958950757980347, "rewards/accuracy_reward/mean": 1.15625, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.7947916984558105, "rewards/judge_reward/std": 1.606066346168518, "rewards/ngrams_iou_reward/mean": 0.148253932595253, "rewards/ngrams_iou_reward/std": 0.10889882594347, "rewards/schema_keywords_iou_reward/mean": 0.6860787272453308, "rewards/schema_keywords_iou_reward/std": 0.1462683528661728, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 205.77084350585938, "completions/mean_terminated_length": 159.55999755859375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 1.0814249363867685, "frac_reward_zero_std": 0.0, "grad_norm": 0.9143744111061096, "kl": 0.057373046875, "learning_rate": 8.093144632101026e-07, "loss": 0.0006, "num_tokens": 81602217.0, "reward": 9.919059753417969, "reward_std": 1.4239743947982788, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.3229166269302368, "rewards/judge_reward/std": 1.6218068599700928, "rewards/ngrams_iou_reward/mean": 0.19991308450698853, "rewards/ngrams_iou_reward/std": 0.22680449485778809, "rewards/schema_keywords_iou_reward/mean": 0.703521728515625, "rewards/schema_keywords_iou_reward/std": 0.1951061189174652, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 204.9375, "completions/mean_terminated_length": 155.9591827392578, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.0848176420695506, "frac_reward_zero_std": 0.0, "grad_norm": 0.8646680116653442, "kl": 0.057861328125, "learning_rate": 8.077616204540896e-07, "loss": -0.0088, "num_tokens": 81846477.0, "reward": 9.505252838134766, "reward_std": 1.272391438484192, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.53125, "rewards/judge_reward/std": 1.6823886632919312, "rewards/ngrams_iou_reward/mean": 0.1605631560087204, "rewards/ngrams_iou_reward/std": 0.18240217864513397, "rewards/schema_keywords_iou_reward/mean": 0.6519807577133179, "rewards/schema_keywords_iou_reward/std": 0.19256624579429626, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 166.93333435058594, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 1.0882103477523324, "frac_reward_zero_std": 0.0, "grad_norm": 1.2786229848861694, "kl": 0.0535888671875, "learning_rate": 8.062039838142401e-07, "loss": 0.0228, "num_tokens": 82095429.0, "reward": 9.760969161987305, "reward_std": 1.0230294466018677, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.5770832300186157, "rewards/judge_reward/std": 1.5926190614700317, "rewards/ngrams_iou_reward/mean": 0.193797305226326, "rewards/ngrams_iou_reward/std": 0.22574199736118317, "rewards/schema_keywords_iou_reward/mean": 0.6890456676483154, "rewards/schema_keywords_iou_reward/std": 0.17974433302879333, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 211.61459350585938, "completions/mean_terminated_length": 164.36558532714844, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 1.0916030534351144, "frac_reward_zero_std": 0.03125, "grad_norm": 0.885250449180603, "kl": 0.05859375, "learning_rate": 8.046415775532584e-07, "loss": 0.0055, "num_tokens": 82350739.0, "reward": 10.00889778137207, "reward_std": 1.1683967113494873, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319954812526703, "rewards/judge_reward/mean": 1.2468749284744263, "rewards/judge_reward/std": 1.5107609033584595, "rewards/ngrams_iou_reward/mean": 0.1839783936738968, "rewards/ngrams_iou_reward/std": 0.19261394441127777, "rewards/schema_keywords_iou_reward/mean": 0.7165853381156921, "rewards/schema_keywords_iou_reward/std": 0.14628154039382935, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 219.23959350585938, "completions/mean_terminated_length": 161.89334106445312, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 1.0949957591178965, "frac_reward_zero_std": 0.0, "grad_norm": 0.8388928174972534, "kl": 0.05426025390625, "learning_rate": 8.030744260081426e-07, "loss": 0.0381, "num_tokens": 82612865.0, "reward": 10.068007469177246, "reward_std": 1.5954036712646484, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.1994792222976685, "rewards/judge_reward/std": 1.5143465995788574, "rewards/ngrams_iou_reward/mean": 0.1650736927986145, "rewards/ngrams_iou_reward/std": 0.16880550980567932, "rewards/schema_keywords_iou_reward/mean": 0.7279332280158997, "rewards/schema_keywords_iou_reward/std": 0.1476493626832962, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.41087818145751953, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 211.453125, "completions/mean_terminated_length": 151.6951141357422, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.0983884648006785, "frac_reward_zero_std": 0.03125, "grad_norm": 1.333794355392456, "kl": 0.0555419921875, "learning_rate": 8.015025535898073e-07, "loss": 0.0164, "num_tokens": 82900136.0, "reward": 9.904359817504883, "reward_std": 1.637561321258545, "rewards/accuracy_reward/mean": 1.796875, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 0.921875, "rewards/judge_reward/std": 1.3618360757827759, "rewards/ngrams_iou_reward/mean": 0.13693547248840332, "rewards/ngrams_iou_reward/std": 0.13010233640670776, "rewards/schema_keywords_iou_reward/mean": 0.6632572412490845, "rewards/schema_keywords_iou_reward/std": 0.17831331491470337, "rewards/syntax_reward/mean": 0.90625, "rewards/syntax_reward/std": 0.2922426164150238, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 215.7135467529297, "completions/mean_terminated_length": 156.8333282470703, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 1.1017811704834606, "frac_reward_zero_std": 0.03125, "grad_norm": 1.2762739658355713, "kl": 0.05682373046875, "learning_rate": 7.999259847827013e-07, "loss": -0.0017, "num_tokens": 83135827.0, "reward": 9.502435684204102, "reward_std": 1.1811659336090088, "rewards/accuracy_reward/mean": 1.125, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.7333332300186157, "rewards/judge_reward/std": 1.632693886756897, "rewards/ngrams_iou_reward/mean": 0.18301887810230255, "rewards/ngrams_iou_reward/std": 0.21729274094104767, "rewards/schema_keywords_iou_reward/mean": 0.7121232151985168, "rewards/schema_keywords_iou_reward/std": 0.1777794361114502, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 216.9322967529297, "completions/mean_terminated_length": 151.81944274902344, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.1051738761662426, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7264842987060547, "kl": 0.0482177734375, "learning_rate": 7.98344744144428e-07, "loss": 0.0065, "num_tokens": 83397726.0, "reward": 10.046810150146484, "reward_std": 1.2917124032974243, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.5947915315628052, "rewards/judge_reward/std": 1.6279540061950684, "rewards/ngrams_iou_reward/mean": 0.16928718984127045, "rewards/ngrams_iou_reward/std": 0.19902940094470978, "rewards/schema_keywords_iou_reward/mean": 0.6983553767204285, "rewards/schema_keywords_iou_reward/std": 0.18024751543998718, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 218.6354217529297, "completions/mean_terminated_length": 159.05406188964844, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 1.1085665818490247, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9302680492401123, "kl": 0.05322265625, "learning_rate": 7.967588563053616e-07, "loss": -0.0078, "num_tokens": 83676386.0, "reward": 9.43391227722168, "reward_std": 1.4098962545394897, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.446874976158142, "rewards/judge_reward/std": 1.5745656490325928, "rewards/ngrams_iou_reward/mean": 0.18636316061019897, "rewards/ngrams_iou_reward/std": 0.2137748748064041, "rewards/schema_keywords_iou_reward/mean": 0.6923399567604065, "rewards/schema_keywords_iou_reward/std": 0.17874819040298462, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 225.921875, "completions/mean_terminated_length": 164.33334350585938, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.1119592875318065, "frac_reward_zero_std": 0.03125, "grad_norm": 0.884101390838623, "kl": 0.052490234375, "learning_rate": 7.951683459682641e-07, "loss": 0.0029, "num_tokens": 83947565.0, "reward": 9.438532829284668, "reward_std": 1.5356905460357666, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.4010416269302368, "rewards/judge_reward/std": 1.6410311460494995, "rewards/ngrams_iou_reward/mean": 0.18760903179645538, "rewards/ngrams_iou_reward/std": 0.20442089438438416, "rewards/schema_keywords_iou_reward/mean": 0.7040480971336365, "rewards/schema_keywords_iou_reward/std": 0.17402642965316772, "rewards/syntax_reward/mean": 0.734375, "rewards/syntax_reward/std": 0.44282010197639465, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 225.55209350585938, "completions/mean_terminated_length": 158.56668090820312, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 1.1153519932145886, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8021243214607239, "kl": 0.05401611328125, "learning_rate": 7.935732379079008e-07, "loss": -0.0264, "num_tokens": 84205491.0, "reward": 9.23995590209961, "reward_std": 1.291415810585022, "rewards/accuracy_reward/mean": 1.125, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.6322916746139526, "rewards/judge_reward/std": 1.5893620252609253, "rewards/ngrams_iou_reward/mean": 0.13320063054561615, "rewards/ngrams_iou_reward/std": 0.09893643856048584, "rewards/schema_keywords_iou_reward/mean": 0.6859223246574402, "rewards/schema_keywords_iou_reward/std": 0.1546471118927002, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709408044815063, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 228.4010467529297, "completions/mean_terminated_length": 161.375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 1.1187446988973706, "frac_reward_zero_std": 0.0, "grad_norm": 0.7575882077217102, "kl": 0.049072265625, "learning_rate": 7.919735569706532e-07, "loss": 0.0037, "num_tokens": 84465332.0, "reward": 9.733318328857422, "reward_std": 1.1591399908065796, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 0.8979166150093079, "rewards/judge_reward/std": 1.3462361097335815, "rewards/ngrams_iou_reward/mean": 0.21737205982208252, "rewards/ngrams_iou_reward/std": 0.24926157295703888, "rewards/schema_keywords_iou_reward/mean": 0.6836550831794739, "rewards/schema_keywords_iou_reward/std": 0.2114126831293106, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 234.27084350585938, "completions/mean_terminated_length": 163.2888946533203, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 1.1221374045801527, "frac_reward_zero_std": 0.0, "grad_norm": 0.8008944392204285, "kl": 0.048095703125, "learning_rate": 7.903693280741331e-07, "loss": 0.0008, "num_tokens": 84751560.0, "reward": 9.853906631469727, "reward_std": 1.0658493041992188, "rewards/accuracy_reward/mean": 1.234375, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319954812526703, "rewards/judge_reward/mean": 1.712499976158142, "rewards/judge_reward/std": 1.6407920122146606, "rewards/ngrams_iou_reward/mean": 0.2193083018064499, "rewards/ngrams_iou_reward/std": 0.25759750604629517, "rewards/schema_keywords_iou_reward/mean": 0.746055543422699, "rewards/schema_keywords_iou_reward/std": 0.1433304101228714, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 228.0729217529297, "completions/mean_terminated_length": 156.70370483398438, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 1.1255301102629347, "frac_reward_zero_std": 0.03125, "grad_norm": 0.87140291929245, "kl": 0.04833984375, "learning_rate": 7.887605762067944e-07, "loss": -0.0048, "num_tokens": 85003640.0, "reward": 9.995172500610352, "reward_std": 1.5002796649932861, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319953322410583, "rewards/judge_reward/mean": 1.271875023841858, "rewards/judge_reward/std": 1.566548466682434, "rewards/ngrams_iou_reward/mean": 0.1558881253004074, "rewards/ngrams_iou_reward/std": 0.15712286531925201, "rewards/schema_keywords_iou_reward/mean": 0.7017833590507507, "rewards/schema_keywords_iou_reward/std": 0.17470888793468475, "rewards/syntax_reward/mean": 0.8802083134651184, "rewards/syntax_reward/std": 0.32556670904159546, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 235.796875, "completions/mean_terminated_length": 163.64285278320312, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.1289228159457168, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8065359592437744, "kl": 0.0543212890625, "learning_rate": 7.871473264275429e-07, "loss": 0.0121, "num_tokens": 85264763.0, "reward": 10.442981719970703, "reward_std": 0.9980249404907227, "rewards/accuracy_reward/mean": 1.78125, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.1760417222976685, "rewards/judge_reward/std": 1.6453166007995605, "rewards/ngrams_iou_reward/mean": 0.16151106357574463, "rewards/ngrams_iou_reward/std": 0.12870529294013977, "rewards/schema_keywords_iou_reward/mean": 0.7366778254508972, "rewards/schema_keywords_iou_reward/std": 0.1437462866306305, "rewards/syntax_reward/mean": 0.890625, "rewards/syntax_reward/std": 0.3129251003265381, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 238.4375, "completions/mean_terminated_length": 156.8235321044922, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 1.1323155216284988, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8813567757606506, "kl": 0.04913330078125, "learning_rate": 7.855296038653473e-07, "loss": 0.0255, "num_tokens": 85519355.0, "reward": 10.23059368133545, "reward_std": 0.9956215620040894, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.3520833253860474, "rewards/judge_reward/std": 1.6467628479003906, "rewards/ngrams_iou_reward/mean": 0.19666431844234467, "rewards/ngrams_iou_reward/std": 0.2202742099761963, "rewards/schema_keywords_iou_reward/mean": 0.7047626376152039, "rewards/schema_keywords_iou_reward/std": 0.17465931177139282, "rewards/syntax_reward/mean": 0.9166666865348816, "rewards/syntax_reward/std": 0.27710798382759094, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 241.9947967529297, "completions/mean_terminated_length": 176.91175842285156, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 1.1357082273112806, "frac_reward_zero_std": 0.0, "grad_norm": 0.7638717889785767, "kl": 0.05096435546875, "learning_rate": 7.839074337188469e-07, "loss": 0.0191, "num_tokens": 85768462.0, "reward": 9.23202133178711, "reward_std": 1.6851227283477783, "rewards/accuracy_reward/mean": 1.078125, "rewards/accuracy_reward/std": 1.4432151317596436, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 1.7197917699813843, "rewards/judge_reward/std": 1.6440353393554688, "rewards/ngrams_iou_reward/mean": 0.19200532138347626, "rewards/ngrams_iou_reward/std": 0.2295907437801361, "rewards/schema_keywords_iou_reward/mean": 0.6993906497955322, "rewards/schema_keywords_iou_reward/std": 0.16595223546028137, "rewards/syntax_reward/mean": 0.71875, "rewards/syntax_reward/std": 0.4507846534252167, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.40625, "completions/mean_terminated_length": 172.40000915527344, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 1.1391009329940627, "frac_reward_zero_std": 0.0, "grad_norm": 1.0738258361816406, "kl": 0.05694580078125, "learning_rate": 7.822808412559589e-07, "loss": 0.0046, "num_tokens": 86051092.0, "reward": 10.005931854248047, "reward_std": 1.4825156927108765, "rewards/accuracy_reward/mean": 1.265625, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967556834220886, "rewards/judge_reward/mean": 1.774999976158142, "rewards/judge_reward/std": 1.712228775024414, "rewards/ngrams_iou_reward/mean": 0.14520862698554993, "rewards/ngrams_iou_reward/std": 0.13834476470947266, "rewards/schema_keywords_iou_reward/mean": 0.6961398124694824, "rewards/schema_keywords_iou_reward/std": 0.15333148837089539, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 242.00521850585938, "completions/mean_terminated_length": 160.0357208251953, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 1.1424936386768447, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8110923767089844, "kl": 0.04949951171875, "learning_rate": 7.80649851813486e-07, "loss": -0.0026, "num_tokens": 86301743.0, "reward": 9.706282615661621, "reward_std": 1.3412556648254395, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967556834220886, "rewards/judge_reward/mean": 1.1843749284744263, "rewards/judge_reward/std": 1.538737416267395, "rewards/ngrams_iou_reward/mean": 0.2283570021390915, "rewards/ngrams_iou_reward/std": 0.2618401050567627, "rewards/schema_keywords_iou_reward/mean": 0.687300980091095, "rewards/schema_keywords_iou_reward/std": 0.2046896368265152, "rewards/syntax_reward/mean": 0.7604166865348816, "rewards/syntax_reward/std": 0.427945077419281, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.828125, "completions/mean_terminated_length": 148.06976318359375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 1.1458863443596268, "frac_reward_zero_std": 0.0, "grad_norm": 0.9051237106323242, "kl": 0.05224609375, "learning_rate": 7.7901449079672e-07, "loss": 0.0229, "num_tokens": 86540978.0, "reward": 10.315507888793945, "reward_std": 1.1112574338912964, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9947916865348816, "rewards/format_reward/std": 0.07216878235340118, "rewards/judge_reward/mean": 1.2385417222976685, "rewards/judge_reward/std": 1.595403790473938, "rewards/ngrams_iou_reward/mean": 0.18527577817440033, "rewards/ngrams_iou_reward/std": 0.21290861070156097, "rewards/schema_keywords_iou_reward/mean": 0.7260644435882568, "rewards/schema_keywords_iou_reward/std": 0.1522604525089264, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374123275279999, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 235.91146850585938, "completions/mean_terminated_length": 151.7567596435547, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 1.1492790500424088, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9735745787620544, "kl": 0.04705810546875, "learning_rate": 7.77374783679048e-07, "loss": 0.012, "num_tokens": 86790483.0, "reward": 9.770817756652832, "reward_std": 1.3144465684890747, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319954812526703, "rewards/judge_reward/mean": 1.4239583015441895, "rewards/judge_reward/std": 1.6371501684188843, "rewards/ngrams_iou_reward/mean": 0.1874728947877884, "rewards/ngrams_iou_reward/std": 0.21136443316936493, "rewards/schema_keywords_iou_reward/mean": 0.71980220079422, "rewards/schema_keywords_iou_reward/std": 0.15332835912704468, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.387094110250473, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 231.328125, "completions/mean_terminated_length": 157.3125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 1.1526717557251909, "frac_reward_zero_std": 0.0, "grad_norm": 0.8556755185127258, "kl": 0.0498046875, "learning_rate": 7.757307560015537e-07, "loss": -0.0038, "num_tokens": 87050700.0, "reward": 10.061688423156738, "reward_std": 1.1140140295028687, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.9270833730697632, "rewards/judge_reward/std": 1.7655503749847412, "rewards/ngrams_iou_reward/mean": 0.17110872268676758, "rewards/ngrams_iou_reward/std": 0.16582122445106506, "rewards/schema_keywords_iou_reward/mean": 0.7239127159118652, "rewards/schema_keywords_iou_reward/std": 0.14425204694271088, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.7447967529297, "completions/mean_terminated_length": 153.8727264404297, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 1.156064461407973, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8991941809654236, "kl": 0.0511474609375, "learning_rate": 7.740824333726213e-07, "loss": 0.0217, "num_tokens": 87301871.0, "reward": 10.295595169067383, "reward_std": 0.9266669750213623, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.316666603088379, "rewards/judge_reward/std": 1.598472237586975, "rewards/ngrams_iou_reward/mean": 0.23196625709533691, "rewards/ngrams_iou_reward/std": 0.2874225080013275, "rewards/schema_keywords_iou_reward/mean": 0.7271701693534851, "rewards/schema_keywords_iou_reward/std": 0.1880195438861847, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 230.0104217529297, "completions/mean_terminated_length": 165.27272033691406, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 1.159457167090755, "frac_reward_zero_std": 0.0, "grad_norm": 0.7783213257789612, "kl": 0.0599365234375, "learning_rate": 7.724298414675352e-07, "loss": 0.0057, "num_tokens": 87544213.0, "reward": 9.389909744262695, "reward_std": 1.2869757413864136, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.355208396911621, "rewards/judge_reward/std": 1.561806559562683, "rewards/ngrams_iou_reward/mean": 0.14494900405406952, "rewards/ngrams_iou_reward/std": 0.15502025187015533, "rewards/schema_keywords_iou_reward/mean": 0.6960015296936035, "rewards/schema_keywords_iou_reward/std": 0.1495337337255478, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 220.30209350585938, "completions/mean_terminated_length": 141.7666778564453, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.162849872773537, "frac_reward_zero_std": 0.0, "grad_norm": 0.9449948668479919, "kl": 0.0555419921875, "learning_rate": 7.707730060280811e-07, "loss": 0.0238, "num_tokens": 87812771.0, "reward": 9.839994430541992, "reward_std": 1.6492574214935303, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.2260416746139526, "rewards/judge_reward/std": 1.4797582626342773, "rewards/ngrams_iou_reward/mean": 0.1939619779586792, "rewards/ngrams_iou_reward/std": 0.22779399156570435, "rewards/schema_keywords_iou_reward/mean": 0.6887401938438416, "rewards/schema_keywords_iou_reward/std": 0.18651330471038818, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617491722107, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 229.09896850585938, "completions/mean_terminated_length": 148.39584350585938, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 1.1662425784563188, "frac_reward_zero_std": 0.0, "grad_norm": 0.924759030342102, "kl": 0.05523681640625, "learning_rate": 7.691119528621444e-07, "loss": -0.0211, "num_tokens": 88063266.0, "reward": 9.039977073669434, "reward_std": 1.3483972549438477, "rewards/accuracy_reward/mean": 0.921875, "rewards/accuracy_reward/std": 1.3877325057983398, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.8156250715255737, "rewards/judge_reward/std": 1.5392816066741943, "rewards/ngrams_iou_reward/mean": 0.20850931107997894, "rewards/ngrams_iou_reward/std": 0.23633307218551636, "rewards/schema_keywords_iou_reward/mean": 0.7002173066139221, "rewards/schema_keywords_iou_reward/std": 0.1827690452337265, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 224.28125, "completions/mean_terminated_length": 160.84375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 1.169635284139101, "frac_reward_zero_std": 0.0, "grad_norm": 0.8695075511932373, "kl": 0.058349609375, "learning_rate": 7.67446707843308e-07, "loss": 0.0091, "num_tokens": 88329096.0, "reward": 9.812032699584961, "reward_std": 1.5780925750732422, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.3718749284744263, "rewards/judge_reward/std": 1.6164859533309937, "rewards/ngrams_iou_reward/mean": 0.18232859671115875, "rewards/ngrams_iou_reward/std": 0.21407344937324524, "rewards/schema_keywords_iou_reward/mean": 0.698453426361084, "rewards/schema_keywords_iou_reward/std": 0.18480484187602997, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709405064582825, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 223.1979217529297, "completions/mean_terminated_length": 166.02857971191406, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 1.173027989821883, "frac_reward_zero_std": 0.03125, "grad_norm": 0.808914840221405, "kl": 0.0634765625, "learning_rate": 7.657772969104507e-07, "loss": 0.0106, "num_tokens": 88580426.0, "reward": 10.488302230834961, "reward_std": 0.9424228668212891, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.472916603088379, "rewards/judge_reward/std": 1.7465884685516357, "rewards/ngrams_iou_reward/mean": 0.1392589658498764, "rewards/ngrams_iou_reward/std": 0.12595510482788086, "rewards/schema_keywords_iou_reward/mean": 0.6948757171630859, "rewards/schema_keywords_iou_reward/std": 0.17613591253757477, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 231.0104217529297, "completions/mean_terminated_length": 171.82456970214844, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.176420695504665, "frac_reward_zero_std": 0.0, "grad_norm": 1.0404045581817627, "kl": 0.05389404296875, "learning_rate": 7.641037460673412e-07, "loss": -0.0082, "num_tokens": 88843618.0, "reward": 9.959892272949219, "reward_std": 0.9903817176818848, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.3958333730697632, "rewards/judge_reward/std": 1.6305919885635376, "rewards/ngrams_iou_reward/mean": 0.17750728130340576, "rewards/ngrams_iou_reward/std": 0.17815732955932617, "rewards/schema_keywords_iou_reward/mean": 0.714676558971405, "rewards/schema_keywords_iou_reward/std": 0.1833844780921936, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617491722107, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 225.6875, "completions/mean_terminated_length": 162.1290283203125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 1.179813401187447, "frac_reward_zero_std": 0.0, "grad_norm": 0.8217852711677551, "kl": 0.05902099609375, "learning_rate": 7.62426081382234e-07, "loss": -0.0067, "num_tokens": 89094178.0, "reward": 9.371110916137695, "reward_std": 1.5568301677703857, "rewards/accuracy_reward/mean": 1.25, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.5697916746139526, "rewards/judge_reward/std": 1.6303802728652954, "rewards/ngrams_iou_reward/mean": 0.14543671905994415, "rewards/ngrams_iou_reward/std": 0.16915807127952576, "rewards/schema_keywords_iou_reward/mean": 0.6798398494720459, "rewards/schema_keywords_iou_reward/std": 0.1762518435716629, "rewards/syntax_reward/mean": 0.7135416865348816, "rewards/syntax_reward/std": 0.45328810811042786, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 219.7291717529297, "completions/mean_terminated_length": 163.14666748046875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.183206106870229, "frac_reward_zero_std": 0.0, "grad_norm": 0.9839764833450317, "kl": 0.060302734375, "learning_rate": 7.607443289874642e-07, "loss": 0.0215, "num_tokens": 89362578.0, "reward": 9.839712142944336, "reward_std": 1.366499662399292, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.4114583730697632, "rewards/judge_reward/std": 1.6418843269348145, "rewards/ngrams_iou_reward/mean": 0.1396258920431137, "rewards/ngrams_iou_reward/std": 0.13197587430477142, "rewards/schema_keywords_iou_reward/mean": 0.689668595790863, "rewards/schema_keywords_iou_reward/std": 0.18939326703548431, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 168.3076934814453, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 1.1865988125530111, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8064976930618286, "kl": 0.05963134765625, "learning_rate": 7.590585150790387e-07, "loss": -0.0298, "num_tokens": 89613660.0, "reward": 10.4305419921875, "reward_std": 1.2539172172546387, "rewards/accuracy_reward/mean": 1.703125, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.3270834684371948, "rewards/judge_reward/std": 1.669842004776001, "rewards/ngrams_iou_reward/mean": 0.17452837526798248, "rewards/ngrams_iou_reward/std": 0.1783057600259781, "rewards/schema_keywords_iou_reward/mean": 0.700805127620697, "rewards/schema_keywords_iou_reward/std": 0.16629153490066528, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 226.27084350585938, "completions/mean_terminated_length": 170.80596923828125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 1.189991518235793, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8760790824890137, "kl": 0.05645751953125, "learning_rate": 7.573686659162293e-07, "loss": 0.0096, "num_tokens": 89871994.0, "reward": 9.282788276672363, "reward_std": 1.9929172992706299, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.3927083015441895, "rewards/judge_reward/std": 1.656385064125061, "rewards/ngrams_iou_reward/mean": 0.12468159198760986, "rewards/ngrams_iou_reward/std": 0.10948912054300308, "rewards/schema_keywords_iou_reward/mean": 0.6591477990150452, "rewards/schema_keywords_iou_reward/std": 0.19719377160072327, "rewards/syntax_reward/mean": 0.7239583134651184, "rewards/syntax_reward/std": 0.4482063353061676, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 225.5729217529297, "completions/mean_terminated_length": 171.33334350585938, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 1.193384223918575, "frac_reward_zero_std": 0.03125, "grad_norm": 0.79683518409729, "kl": 0.05474853515625, "learning_rate": 7.556748078211634e-07, "loss": 0.0134, "num_tokens": 90130398.0, "reward": 9.591808319091797, "reward_std": 1.295658826828003, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.464583396911621, "rewards/judge_reward/std": 1.615499496459961, "rewards/ngrams_iou_reward/mean": 0.15862850844860077, "rewards/ngrams_iou_reward/std": 0.21922926604747772, "rewards/schema_keywords_iou_reward/mean": 0.6863040924072266, "rewards/schema_keywords_iou_reward/std": 0.19846126437187195, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 226.5729217529297, "completions/mean_terminated_length": 167.71875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.196776929601357, "frac_reward_zero_std": 0.0, "grad_norm": 0.8629254102706909, "kl": 0.05322265625, "learning_rate": 7.539769671784139e-07, "loss": 0.0136, "num_tokens": 90373376.0, "reward": 10.61042308807373, "reward_std": 1.3201625347137451, "rewards/accuracy_reward/mean": 1.859375, "rewards/accuracy_reward/std": 1.4601210355758667, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319954812526703, "rewards/judge_reward/mean": 1.0989583730697632, "rewards/judge_reward/std": 1.6242916584014893, "rewards/ngrams_iou_reward/mean": 0.193964421749115, "rewards/ngrams_iou_reward/std": 0.21583624184131622, "rewards/schema_keywords_iou_reward/mean": 0.7289579510688782, "rewards/schema_keywords_iou_reward/std": 0.1784152239561081, "rewards/syntax_reward/mean": 0.9322916865348816, "rewards/syntax_reward/std": 0.2519015669822693, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.5572967529297, "completions/mean_terminated_length": 177.9857177734375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.2001696352841391, "frac_reward_zero_std": 0.0, "grad_norm": 0.938429057598114, "kl": 0.0570068359375, "learning_rate": 7.522751704345887e-07, "loss": 0.0119, "num_tokens": 90622825.0, "reward": 10.003329277038574, "reward_std": 1.2958276271820068, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 1.386458396911621, "rewards/judge_reward/std": 1.697310447692871, "rewards/ngrams_iou_reward/mean": 0.1738482564687729, "rewards/ngrams_iou_reward/std": 0.1729818433523178, "rewards/schema_keywords_iou_reward/mean": 0.7336464524269104, "rewards/schema_keywords_iou_reward/std": 0.16105026006698608, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.6822967529297, "completions/mean_terminated_length": 164.89395141601562, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.2035623409669212, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8690049052238464, "kl": 0.05364990234375, "learning_rate": 7.505694440979178e-07, "loss": 0.0029, "num_tokens": 90876594.0, "reward": 9.975821495056152, "reward_std": 1.1190942525863647, "rewards/accuracy_reward/mean": 1.765625, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.0114582777023315, "rewards/judge_reward/std": 1.455113410949707, "rewards/ngrams_iou_reward/mean": 0.1452503353357315, "rewards/ngrams_iou_reward/std": 0.18749140202999115, "rewards/schema_keywords_iou_reward/mean": 0.6930704712867737, "rewards/schema_keywords_iou_reward/std": 0.1769508272409439, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 225.20834350585938, "completions/mean_terminated_length": 167.7611846923828, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 1.2069550466497032, "frac_reward_zero_std": 0.0, "grad_norm": 0.8506135940551758, "kl": 0.06207275390625, "learning_rate": 7.488598147378416e-07, "loss": -0.004, "num_tokens": 91119670.0, "reward": 9.92415714263916, "reward_std": 1.4746170043945312, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 1.5677083730697632, "rewards/judge_reward/std": 1.623646855354309, "rewards/ngrams_iou_reward/mean": 0.1992148905992508, "rewards/ngrams_iou_reward/std": 0.21347543597221375, "rewards/schema_keywords_iou_reward/mean": 0.693692147731781, "rewards/schema_keywords_iou_reward/std": 0.17196452617645264, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 221.7135467529297, "completions/mean_terminated_length": 172.67088317871094, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 1.2103477523324853, "frac_reward_zero_std": 0.0, "grad_norm": 0.972655713558197, "kl": 0.0579833984375, "learning_rate": 7.471463089845955e-07, "loss": 0.0057, "num_tokens": 91358313.0, "reward": 9.097395896911621, "reward_std": 1.4739127159118652, "rewards/accuracy_reward/mean": 1.203125, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.5229166746139526, "rewards/judge_reward/std": 1.6476846933364868, "rewards/ngrams_iou_reward/mean": 0.16603301465511322, "rewards/ngrams_iou_reward/std": 0.1992856115102768, "rewards/schema_keywords_iou_reward/mean": 0.6615699529647827, "rewards/schema_keywords_iou_reward/std": 0.2153676301240921, "rewards/syntax_reward/mean": 0.7135416865348816, "rewards/syntax_reward/std": 0.45328807830810547, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.5104217529297, "completions/mean_terminated_length": 178.4383544921875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 1.213740458015267, "frac_reward_zero_std": 0.0, "grad_norm": 0.9872158765792847, "kl": 0.06072998046875, "learning_rate": 7.454289535287967e-07, "loss": 0.0198, "num_tokens": 91639829.0, "reward": 9.291292190551758, "reward_std": 1.6596711874008179, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 1.4179108142852783, "rewards/format_reward/mean": 0.8802083134651184, "rewards/format_reward/std": 0.32556670904159546, "rewards/judge_reward/mean": 1.9343749284744263, "rewards/judge_reward/std": 1.672088623046875, "rewards/ngrams_iou_reward/mean": 0.13817058503627777, "rewards/ngrams_iou_reward/std": 0.15005280077457428, "rewards/schema_keywords_iou_reward/mean": 0.6489542126655579, "rewards/schema_keywords_iou_reward/std": 0.17848515510559082, "rewards/syntax_reward/mean": 0.7552083134651184, "rewards/syntax_reward/std": 0.4310877323150635, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 224.875, "completions/mean_terminated_length": 178.3896026611328, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 1.2171331636980491, "frac_reward_zero_std": 0.0, "grad_norm": 0.856636106967926, "kl": 0.05548095703125, "learning_rate": 7.437077751210278e-07, "loss": 0.0221, "num_tokens": 91881269.0, "reward": 9.566969871520996, "reward_std": 1.1806364059448242, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.5239582061767578, "rewards/judge_reward/std": 1.675274133682251, "rewards/ngrams_iou_reward/mean": 0.16794945299625397, "rewards/ngrams_iou_reward/std": 0.2023196816444397, "rewards/schema_keywords_iou_reward/mean": 0.7208951115608215, "rewards/schema_keywords_iou_reward/std": 0.1561347246170044, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 222.96875, "completions/mean_terminated_length": 174.6923065185547, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 1.2205258693808312, "frac_reward_zero_std": 0.0, "grad_norm": 0.8602960109710693, "kl": 0.0614013671875, "learning_rate": 7.419828005714194e-07, "loss": 0.0123, "num_tokens": 92156495.0, "reward": 9.689400672912598, "reward_std": 1.6283587217330933, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.5104166269302368, "rewards/judge_reward/std": 1.5905137062072754, "rewards/ngrams_iou_reward/mean": 0.2191077619791031, "rewards/ngrams_iou_reward/std": 0.21918907761573792, "rewards/schema_keywords_iou_reward/mean": 0.6994585990905762, "rewards/schema_keywords_iou_reward/std": 0.1742536723613739, "rewards/syntax_reward/mean": 0.734375, "rewards/syntax_reward/std": 0.44282010197639465, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 221.05209350585938, "completions/mean_terminated_length": 171.06329345703125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.2239185750636132, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8066856265068054, "kl": 0.06329345703125, "learning_rate": 7.402540567492336e-07, "loss": 0.0152, "num_tokens": 92384259.0, "reward": 9.97671127319336, "reward_std": 1.5404270887374878, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.3489583730697632, "rewards/judge_reward/std": 1.626900315284729, "rewards/ngrams_iou_reward/mean": 0.1798357367515564, "rewards/ngrams_iou_reward/std": 0.20060889422893524, "rewards/schema_keywords_iou_reward/mean": 0.6979167461395264, "rewards/schema_keywords_iou_reward/std": 0.1920759528875351, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 222.70834350585938, "completions/mean_terminated_length": 171.89474487304688, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.2273112807463953, "frac_reward_zero_std": 0.03125, "grad_norm": 0.832939624786377, "kl": 0.070068359375, "learning_rate": 7.385215705824448e-07, "loss": -0.0054, "num_tokens": 92651539.0, "reward": 9.323663711547852, "reward_std": 1.9028223752975464, "rewards/accuracy_reward/mean": 1.203125, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.5906249284744263, "rewards/judge_reward/std": 1.6013461351394653, "rewards/ngrams_iou_reward/mean": 0.14617417752742767, "rewards/ngrams_iou_reward/std": 0.19119669497013092, "rewards/schema_keywords_iou_reward/mean": 0.6785309314727783, "rewards/schema_keywords_iou_reward/std": 0.1921960860490799, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 213.3697967529297, "completions/mean_terminated_length": 161.91954040527344, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 1.2307039864291773, "frac_reward_zero_std": 0.09375, "grad_norm": 0.8465540409088135, "kl": 0.0640869140625, "learning_rate": 7.367853690573207e-07, "loss": 0.0146, "num_tokens": 92894754.0, "reward": 10.32972240447998, "reward_std": 1.2362217903137207, "rewards/accuracy_reward/mean": 1.890625, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 0.9312500357627869, "rewards/judge_reward/std": 1.4512901306152344, "rewards/ngrams_iou_reward/mean": 0.2584484815597534, "rewards/ngrams_iou_reward/std": 0.26925867795944214, "rewards/schema_keywords_iou_reward/mean": 0.7712733149528503, "rewards/schema_keywords_iou_reward/std": 0.19570395350456238, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 218.234375, "completions/mean_terminated_length": 165.3625030517578, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 1.2340966921119594, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8414109349250793, "kl": 0.0576171875, "learning_rate": 7.350454792180016e-07, "loss": 0.0195, "num_tokens": 93144441.0, "reward": 9.980506896972656, "reward_std": 1.2588326930999756, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.613541603088379, "rewards/judge_reward/std": 1.6901389360427856, "rewards/ngrams_iou_reward/mean": 0.1872016042470932, "rewards/ngrams_iou_reward/std": 0.19544517993927002, "rewards/schema_keywords_iou_reward/mean": 0.7276793122291565, "rewards/schema_keywords_iou_reward/std": 0.1590542197227478, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709405064582825, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.921875, "completions/mean_terminated_length": 179.5, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.2374893977947412, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9248372316360474, "kl": 0.061767578125, "learning_rate": 7.333019281660788e-07, "loss": -0.0032, "num_tokens": 93386268.0, "reward": 10.149675369262695, "reward_std": 1.5773227214813232, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.4458332061767578, "rewards/judge_reward/std": 1.6726888418197632, "rewards/ngrams_iou_reward/mean": 0.18978162109851837, "rewards/ngrams_iou_reward/std": 0.23262757062911987, "rewards/schema_keywords_iou_reward/mean": 0.693225622177124, "rewards/schema_keywords_iou_reward/std": 0.18507599830627441, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 223.23959350585938, "completions/mean_terminated_length": 176.37974548339844, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 1.2408821034775233, "frac_reward_zero_std": 0.0, "grad_norm": 0.8538742661476135, "kl": 0.06494140625, "learning_rate": 7.315547430601738e-07, "loss": 0.0314, "num_tokens": 93644452.0, "reward": 10.038044929504395, "reward_std": 1.6599658727645874, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.2697917222976685, "rewards/judge_reward/std": 1.544971227645874, "rewards/ngrams_iou_reward/mean": 0.2103716880083084, "rewards/ngrams_iou_reward/std": 0.20838379859924316, "rewards/schema_keywords_iou_reward/mean": 0.7255894541740417, "rewards/schema_keywords_iou_reward/std": 0.19584578275680542, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 214.296875, "completions/mean_terminated_length": 168.96739196777344, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 1.2442748091603053, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8455172777175903, "kl": 0.06280517578125, "learning_rate": 7.298039511155137e-07, "loss": 0.0044, "num_tokens": 93906511.0, "reward": 9.481849670410156, "reward_std": 1.4258334636688232, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.2947916984558105, "rewards/judge_reward/std": 1.5506707429885864, "rewards/ngrams_iou_reward/mean": 0.1649455577135086, "rewards/ngrams_iou_reward/std": 0.2172859013080597, "rewards/schema_keywords_iou_reward/mean": 0.6804451942443848, "rewards/schema_keywords_iou_reward/std": 0.2027798891067505, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 217.25, "completions/mean_terminated_length": 168.47059631347656, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 1.2476675148430874, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8161606788635254, "kl": 0.06402587890625, "learning_rate": 7.280495796035079e-07, "loss": 0.014, "num_tokens": 94149607.0, "reward": 10.011594772338867, "reward_std": 1.2936160564422607, "rewards/accuracy_reward/mean": 1.734375, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.0385416746139526, "rewards/judge_reward/std": 1.4949737787246704, "rewards/ngrams_iou_reward/mean": 0.21148037910461426, "rewards/ngrams_iou_reward/std": 0.25155988335609436, "rewards/schema_keywords_iou_reward/mean": 0.7230296730995178, "rewards/schema_keywords_iou_reward/std": 0.20020607113838196, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617491722107, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 212.40625, "completions/mean_terminated_length": 164.02198791503906, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.2510602205258694, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8793842196464539, "kl": 0.0587158203125, "learning_rate": 7.262916558513236e-07, "loss": 0.0139, "num_tokens": 94386379.0, "reward": 9.89924430847168, "reward_std": 1.2888245582580566, "rewards/accuracy_reward/mean": 1.328125, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.6218751668930054, "rewards/judge_reward/std": 1.6891834735870361, "rewards/ngrams_iou_reward/mean": 0.22045284509658813, "rewards/ngrams_iou_reward/std": 0.27540162205696106, "rewards/schema_keywords_iou_reward/mean": 0.6954581141471863, "rewards/schema_keywords_iou_reward/std": 0.1817678064107895, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 213.0729217529297, "completions/mean_terminated_length": 163.3932647705078, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 1.2544529262086515, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8578415513038635, "kl": 0.0633544921875, "learning_rate": 7.245302072414601e-07, "loss": 0.0368, "num_tokens": 94628919.0, "reward": 9.543521881103516, "reward_std": 1.6895002126693726, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.1770833730697632, "rewards/judge_reward/std": 1.4918612241744995, "rewards/ngrams_iou_reward/mean": 0.12282705307006836, "rewards/ngrams_iou_reward/std": 0.10495413094758987, "rewards/schema_keywords_iou_reward/mean": 0.6967352032661438, "rewards/schema_keywords_iou_reward/std": 0.18275195360183716, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 212.20834350585938, "completions/mean_terminated_length": 171.07070922851562, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 1.2578456318914335, "frac_reward_zero_std": 0.0625, "grad_norm": 1.028933048248291, "kl": 0.068359375, "learning_rate": 7.227652612113213e-07, "loss": 0.013, "num_tokens": 94870321.0, "reward": 10.109478950500488, "reward_std": 1.2054556608200073, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.2864583730697632, "rewards/judge_reward/std": 1.5952231884002686, "rewards/ngrams_iou_reward/mean": 0.16994114220142365, "rewards/ngrams_iou_reward/std": 0.22404888272285461, "rewards/schema_keywords_iou_reward/mean": 0.7312037348747253, "rewards/schema_keywords_iou_reward/std": 0.15906789898872375, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 210.27084350585938, "completions/mean_terminated_length": 166.4081573486328, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 1.2612383375742153, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9175559878349304, "kl": 0.0693359375, "learning_rate": 7.209968452527896e-07, "loss": 0.0187, "num_tokens": 95113817.0, "reward": 10.55419635772705, "reward_std": 1.446678638458252, "rewards/accuracy_reward/mean": 1.890625, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.056249976158142, "rewards/judge_reward/std": 1.5258204936981201, "rewards/ngrams_iou_reward/mean": 0.27483415603637695, "rewards/ngrams_iou_reward/std": 0.3350065350532532, "rewards/schema_keywords_iou_reward/mean": 0.7189447283744812, "rewards/schema_keywords_iou_reward/std": 0.19123820960521698, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 208.3385467529297, "completions/mean_terminated_length": 172.80908203125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 1.2646310432569974, "frac_reward_zero_std": 0.0, "grad_norm": 0.9285529255867004, "kl": 0.06298828125, "learning_rate": 7.19224986911797e-07, "loss": 0.0141, "num_tokens": 95384206.0, "reward": 9.848979949951172, "reward_std": 1.2113797664642334, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.3583332300186157, "rewards/judge_reward/std": 1.6007633209228516, "rewards/ngrams_iou_reward/mean": 0.13218431174755096, "rewards/ngrams_iou_reward/std": 0.0945950523018837, "rewards/schema_keywords_iou_reward/mean": 0.698045551776886, "rewards/schema_keywords_iou_reward/std": 0.15451885759830475, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.75, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 198.203125, "completions/mean_terminated_length": 163.52500915527344, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 1.2680237489397794, "frac_reward_zero_std": 0.0, "grad_norm": 1.0221006870269775, "kl": 0.070556640625, "learning_rate": 7.174497137878965e-07, "loss": -0.0296, "num_tokens": 95643103.0, "reward": 9.430953025817871, "reward_std": 1.4599326848983765, "rewards/accuracy_reward/mean": 1.234375, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319953322410583, "rewards/judge_reward/mean": 1.5541666746139526, "rewards/judge_reward/std": 1.562111258506775, "rewards/ngrams_iou_reward/mean": 0.11019162088632584, "rewards/ngrams_iou_reward/std": 0.07159334421157837, "rewards/schema_keywords_iou_reward/mean": 0.6759693622589111, "rewards/schema_keywords_iou_reward/std": 0.1354278177022934, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617789745331, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -3.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 190.16146850585938, "completions/mean_terminated_length": 161.66416931152344, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 1.2714164546225615, "frac_reward_zero_std": 0.09375, "grad_norm": 0.8431328535079956, "kl": 0.0635986328125, "learning_rate": 7.156710535338312e-07, "loss": -0.0103, "num_tokens": 95907596.0, "reward": 9.677416801452637, "reward_std": 1.114107370376587, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319954812526703, "rewards/judge_reward/mean": 1.25, "rewards/judge_reward/std": 1.4550867080688477, "rewards/ngrams_iou_reward/mean": 0.22569973766803741, "rewards/ngrams_iou_reward/std": 0.27935636043548584, "rewards/schema_keywords_iou_reward/mean": 0.7017157673835754, "rewards/schema_keywords_iou_reward/std": 0.19105756282806396, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.387094110250473, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 213.953125, "completions/mean_terminated_length": 176.06930541992188, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 1.2748091603053435, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8833341002464294, "kl": 0.05767822265625, "learning_rate": 7.138890338551048e-07, "loss": 0.0082, "num_tokens": 96162209.0, "reward": 9.824974060058594, "reward_std": 1.5009077787399292, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 1.410416603088379, "rewards/judge_reward/std": 1.641955018043518, "rewards/ngrams_iou_reward/mean": 0.13164237141609192, "rewards/ngrams_iou_reward/std": 0.08624421805143356, "rewards/schema_keywords_iou_reward/mean": 0.6797893047332764, "rewards/schema_keywords_iou_reward/std": 0.15899863839149475, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 200.8697967529297, "completions/mean_terminated_length": 156.14151000976562, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.2782018659881256, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7795740962028503, "kl": 0.057373046875, "learning_rate": 7.121036825095491e-07, "loss": 0.0152, "num_tokens": 96390892.0, "reward": 9.504457473754883, "reward_std": 1.04148268699646, "rewards/accuracy_reward/mean": 1.390625, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319953322410583, "rewards/judge_reward/mean": 1.3135417699813843, "rewards/judge_reward/std": 1.4644920825958252, "rewards/ngrams_iou_reward/mean": 0.155113086104393, "rewards/ngrams_iou_reward/std": 0.14767098426818848, "rewards/schema_keywords_iou_reward/mean": 0.6962180733680725, "rewards/schema_keywords_iou_reward/std": 0.14858275651931763, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 211.71875, "completions/mean_terminated_length": 170.1212158203125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 1.2815945716709076, "frac_reward_zero_std": 0.0, "grad_norm": 0.8826022744178772, "kl": 0.06640625, "learning_rate": 7.103150273068921e-07, "loss": 0.0127, "num_tokens": 96647128.0, "reward": 9.05374526977539, "reward_std": 1.3958775997161865, "rewards/accuracy_reward/mean": 1.15625, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.5166667699813843, "rewards/judge_reward/std": 1.5381138324737549, "rewards/ngrams_iou_reward/mean": 0.14126574993133545, "rewards/ngrams_iou_reward/std": 0.1747959554195404, "rewards/schema_keywords_iou_reward/mean": 0.6708126068115234, "rewards/schema_keywords_iou_reward/std": 0.19044806063175201, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 200.69271850585938, "completions/mean_terminated_length": 159.46363830566406, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 1.2849872773536894, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9743348360061646, "kl": 0.07049560546875, "learning_rate": 7.085230961083248e-07, "loss": -0.0015, "num_tokens": 96905411.0, "reward": 10.249019622802734, "reward_std": 1.0998995304107666, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319954812526703, "rewards/judge_reward/mean": 1.4145832061767578, "rewards/judge_reward/std": 1.7165030241012573, "rewards/ngrams_iou_reward/mean": 0.18535344302654266, "rewards/ngrams_iou_reward/std": 0.19091232120990753, "rewards/schema_keywords_iou_reward/mean": 0.7397066950798035, "rewards/schema_keywords_iou_reward/std": 0.1452585756778717, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 200.28125, "completions/mean_terminated_length": 147.93939208984375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 1.2883799830364717, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7871307134628296, "kl": 0.053955078125, "learning_rate": 7.067279168260671e-07, "loss": 0.023, "num_tokens": 97151027.0, "reward": 10.53372859954834, "reward_std": 1.0109862089157104, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.4583333730697632, "rewards/judge_reward/std": 1.706265926361084, "rewards/ngrams_iou_reward/mean": 0.22200627624988556, "rewards/ngrams_iou_reward/std": 0.26203975081443787, "rewards/schema_keywords_iou_reward/mean": 0.7335969805717468, "rewards/schema_keywords_iou_reward/std": 0.171200692653656, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 202.1822967529297, "completions/mean_terminated_length": 161.2018280029297, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 1.2917726887192535, "frac_reward_zero_std": 0.0625, "grad_norm": 1.0365562438964844, "kl": 0.0692138671875, "learning_rate": 7.049295174229328e-07, "loss": 0.0068, "num_tokens": 97406914.0, "reward": 10.05256175994873, "reward_std": 1.125281572341919, "rewards/accuracy_reward/mean": 1.828125, "rewards/accuracy_reward/std": 1.4674979448318481, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 0.8958333134651184, "rewards/judge_reward/std": 1.369519829750061, "rewards/ngrams_iou_reward/mean": 0.1702326089143753, "rewards/ngrams_iou_reward/std": 0.19159600138664246, "rewards/schema_keywords_iou_reward/mean": 0.7260794043540955, "rewards/schema_keywords_iou_reward/std": 0.13731442391872406, "rewards/syntax_reward/mean": 0.90625, "rewards/syntax_reward/std": 0.2922426164150238, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 206.4479217529297, "completions/mean_terminated_length": 161.80197143554688, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 1.2951653944020356, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7572941780090332, "kl": 0.06292724609375, "learning_rate": 7.031279259118946e-07, "loss": -0.0048, "num_tokens": 97669956.0, "reward": 10.2578763961792, "reward_std": 1.1234962940216064, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.3072916269302368, "rewards/judge_reward/std": 1.641461730003357, "rewards/ngrams_iou_reward/mean": 0.17217333614826202, "rewards/ngrams_iou_reward/std": 0.17405284941196442, "rewards/schema_keywords_iou_reward/mean": 0.7367434501647949, "rewards/schema_keywords_iou_reward/std": 0.1518159806728363, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 214.859375, "completions/mean_terminated_length": 159.6707305908203, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 1.2985581000848176, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9097636342048645, "kl": 0.0562744140625, "learning_rate": 7.01323170355647e-07, "loss": 0.0053, "num_tokens": 97929219.0, "reward": 10.384749412536621, "reward_std": 1.0222771167755127, "rewards/accuracy_reward/mean": 2.0, "rewards/accuracy_reward/std": 1.4179108142852783, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319954812526703, "rewards/judge_reward/mean": 0.7760416865348816, "rewards/judge_reward/std": 1.259455919265747, "rewards/ngrams_iou_reward/mean": 0.22684484720230103, "rewards/ngrams_iou_reward/std": 0.2566297650337219, "rewards/schema_keywords_iou_reward/mean": 0.7308209538459778, "rewards/schema_keywords_iou_reward/std": 0.17221049964427948, "rewards/syntax_reward/mean": 0.8958333134651184, "rewards/syntax_reward/std": 0.30627527832984924, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 212.9166717529297, "completions/mean_terminated_length": 155.1219482421875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 1.3019508057675997, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8908746242523193, "kl": 0.05706787109375, "learning_rate": 6.995152788661705e-07, "loss": -0.0155, "num_tokens": 98193521.0, "reward": 10.009319305419922, "reward_std": 1.5086649656295776, "rewards/accuracy_reward/mean": 1.515625, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.384374976158142, "rewards/judge_reward/std": 1.6281442642211914, "rewards/ngrams_iou_reward/mean": 0.24188821017742157, "rewards/ngrams_iou_reward/std": 0.2927873730659485, "rewards/schema_keywords_iou_reward/mean": 0.7226381301879883, "rewards/schema_keywords_iou_reward/std": 0.18892894685268402, "rewards/syntax_reward/mean": 0.7552083134651184, "rewards/syntax_reward/std": 0.4310877025127411, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 216.9635467529297, "completions/mean_terminated_length": 168.8488311767578, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 1.3053435114503817, "frac_reward_zero_std": 0.0, "grad_norm": 1.0694968700408936, "kl": 0.06304931640625, "learning_rate": 6.977042796042917e-07, "loss": 0.0102, "num_tokens": 98436334.0, "reward": 9.119303703308105, "reward_std": 1.4275901317596436, "rewards/accuracy_reward/mean": 1.203125, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.5322917699813843, "rewards/judge_reward/std": 1.536501407623291, "rewards/ngrams_iou_reward/mean": 0.13735122978687286, "rewards/ngrams_iou_reward/std": 0.13642294704914093, "rewards/schema_keywords_iou_reward/mean": 0.6517429351806641, "rewards/schema_keywords_iou_reward/std": 0.18597501516342163, "rewards/syntax_reward/mean": 0.71875, "rewards/syntax_reward/std": 0.4507846534252167, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 204.359375, "completions/mean_terminated_length": 150.52127075195312, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 1.3087362171331636, "frac_reward_zero_std": 0.0, "grad_norm": 0.9895726442337036, "kl": 0.0614013671875, "learning_rate": 6.958902007792465e-07, "loss": 0.024, "num_tokens": 98706643.0, "reward": 10.047900199890137, "reward_std": 1.7444933652877808, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.3583332300186157, "rewards/judge_reward/std": 1.578233242034912, "rewards/ngrams_iou_reward/mean": 0.2378503531217575, "rewards/ngrams_iou_reward/std": 0.28613147139549255, "rewards/schema_keywords_iou_reward/mean": 0.7287985682487488, "rewards/schema_keywords_iou_reward/std": 0.19034883379936218, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.875, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 209.203125, "completions/mean_terminated_length": 158.3369598388672, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 1.3121289228159458, "frac_reward_zero_std": 0.0, "grad_norm": 0.7864013314247131, "kl": 0.0625, "learning_rate": 6.940730706482398e-07, "loss": 0.0255, "num_tokens": 98946442.0, "reward": 9.988862037658691, "reward_std": 1.1510069370269775, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.6968750953674316, "rewards/judge_reward/std": 1.681805968284607, "rewards/ngrams_iou_reward/mean": 0.1337459236383438, "rewards/ngrams_iou_reward/std": 0.13112467527389526, "rewards/schema_keywords_iou_reward/mean": 0.7374069094657898, "rewards/schema_keywords_iou_reward/std": 0.13559922575950623, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 211.2916717529297, "completions/mean_terminated_length": 165.64210510253906, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 1.3155216284987277, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8033066391944885, "kl": 0.060791015625, "learning_rate": 6.922529175160054e-07, "loss": -0.0093, "num_tokens": 99209280.0, "reward": 10.191368103027344, "reward_std": 1.1803436279296875, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12434382736682892, "rewards/judge_reward/mean": 1.7458332777023315, "rewards/judge_reward/std": 1.732831358909607, "rewards/ngrams_iou_reward/mean": 0.2049909234046936, "rewards/ngrams_iou_reward/std": 0.24490872025489807, "rewards/schema_keywords_iou_reward/mean": 0.7238764762878418, "rewards/schema_keywords_iou_reward/std": 0.15597347915172577, "rewards/syntax_reward/mean": 0.7552083134651184, "rewards/syntax_reward/std": 0.4310877025127411, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.625, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 213.359375, "completions/mean_terminated_length": 158.5357208251953, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 1.3189143341815097, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9940117597579956, "kl": 0.061767578125, "learning_rate": 6.904297697343654e-07, "loss": -0.0134, "num_tokens": 99462123.0, "reward": 10.165464401245117, "reward_std": 1.2473762035369873, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.459375023841858, "rewards/judge_reward/std": 1.6938495635986328, "rewards/ngrams_iou_reward/mean": 0.14102180302143097, "rewards/ngrams_iou_reward/std": 0.15196117758750916, "rewards/schema_keywords_iou_reward/mean": 0.6942341923713684, "rewards/schema_keywords_iou_reward/std": 0.16942529380321503, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 214.6041717529297, "completions/mean_terminated_length": 166.69662475585938, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.3223070398642918, "frac_reward_zero_std": 0.0, "grad_norm": 0.9441159963607788, "kl": 0.06475830078125, "learning_rate": 6.886036557017881e-07, "loss": 0.013, "num_tokens": 99717995.0, "reward": 10.298194885253906, "reward_std": 1.2322391271591187, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.394791603088379, "rewards/judge_reward/std": 1.6132222414016724, "rewards/ngrams_iou_reward/mean": 0.21851129829883575, "rewards/ngrams_iou_reward/std": 0.2572447955608368, "rewards/schema_keywords_iou_reward/mean": 0.7432246804237366, "rewards/schema_keywords_iou_reward/std": 0.1677035540342331, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.625, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 215.265625, "completions/mean_terminated_length": 162.89285278320312, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.3256997455470738, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7555763721466064, "kl": 0.0604248046875, "learning_rate": 6.867746038629462e-07, "loss": 0.0163, "num_tokens": 99967958.0, "reward": 10.350996017456055, "reward_std": 1.3743455410003662, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 1.2208333015441895, "rewards/judge_reward/std": 1.5893570184707642, "rewards/ngrams_iou_reward/mean": 0.19338655471801758, "rewards/ngrams_iou_reward/std": 0.21826551854610443, "rewards/schema_keywords_iou_reward/mean": 0.7367755770683289, "rewards/schema_keywords_iou_reward/std": 0.14305450022220612, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -2.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 207.20834350585938, "completions/mean_terminated_length": 159.42266845703125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 1.3290924512298559, "frac_reward_zero_std": 0.0, "grad_norm": 0.8762460947036743, "kl": 0.06793212890625, "learning_rate": 6.849426427082734e-07, "loss": 0.0115, "num_tokens": 100220232.0, "reward": 10.747462272644043, "reward_std": 1.3112726211547852, "rewards/accuracy_reward/mean": 2.0, "rewards/accuracy_reward/std": 1.4179108142852783, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 0.9739583134651184, "rewards/judge_reward/std": 1.5179033279418945, "rewards/ngrams_iou_reward/mean": 0.25360119342803955, "rewards/ngrams_iou_reward/std": 0.2603984773159027, "rewards/schema_keywords_iou_reward/mean": 0.7386525273323059, "rewards/schema_keywords_iou_reward/std": 0.18001607060432434, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617491722107, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 228.59375, "completions/mean_terminated_length": 169.73770141601562, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.3324851569126377, "frac_reward_zero_std": 0.0, "grad_norm": 0.8806194067001343, "kl": 0.05487060546875, "learning_rate": 6.831078007735209e-07, "loss": -0.0043, "num_tokens": 100442010.0, "reward": 10.655435562133789, "reward_std": 1.3704522848129272, "rewards/accuracy_reward/mean": 1.890625, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.0500000715255737, "rewards/judge_reward/std": 1.558761715888977, "rewards/ngrams_iou_reward/mean": 0.2506045699119568, "rewards/ngrams_iou_reward/std": 0.26759931445121765, "rewards/schema_keywords_iou_reward/mean": 0.7527477145195007, "rewards/schema_keywords_iou_reward/std": 0.1738397777080536, "rewards/syntax_reward/mean": 0.921875, "rewards/syntax_reward/std": 0.2690697908401489, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 212.91146850585938, "completions/mean_terminated_length": 157.51190185546875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 1.33587786259542, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8381478190422058, "kl": 0.060302734375, "learning_rate": 6.812701066393123e-07, "loss": 0.0131, "num_tokens": 100709923.0, "reward": 10.320173263549805, "reward_std": 1.1546356678009033, "rewards/accuracy_reward/mean": 1.734375, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.1822916269302368, "rewards/judge_reward/std": 1.6000977754592896, "rewards/ngrams_iou_reward/mean": 0.22687335312366486, "rewards/ngrams_iou_reward/std": 0.27889031171798706, "rewards/schema_keywords_iou_reward/mean": 0.6922579407691956, "rewards/schema_keywords_iou_reward/std": 0.21219411492347717, "rewards/syntax_reward/mean": 0.8854166865348816, "rewards/syntax_reward/std": 0.3193511366844177, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 219.1197967529297, "completions/mean_terminated_length": 160.31082153320312, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 1.3392705682782018, "frac_reward_zero_std": 0.0, "grad_norm": 1.1010990142822266, "kl": 0.06695556640625, "learning_rate": 6.794295889306989e-07, "loss": 0.0254, "num_tokens": 100980120.0, "reward": 9.725345611572266, "reward_std": 1.8566539287567139, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.5010417699813843, "rewards/judge_reward/std": 1.741756558418274, "rewards/ngrams_iou_reward/mean": 0.1666383147239685, "rewards/ngrams_iou_reward/std": 0.19309164583683014, "rewards/schema_keywords_iou_reward/mean": 0.6451658010482788, "rewards/schema_keywords_iou_reward/std": 0.2035459578037262, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 221.2916717529297, "completions/mean_terminated_length": 167.14666748046875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 1.3426632739609838, "frac_reward_zero_std": 0.03125, "grad_norm": 1.0906296968460083, "kl": 0.0570068359375, "learning_rate": 6.775862763167142e-07, "loss": -0.0068, "num_tokens": 101240156.0, "reward": 9.682973861694336, "reward_std": 1.4992496967315674, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.754166603088379, "rewards/judge_reward/std": 1.6987779140472412, "rewards/ngrams_iou_reward/mean": 0.1662127822637558, "rewards/ngrams_iou_reward/std": 0.19654415547847748, "rewards/schema_keywords_iou_reward/mean": 0.685509204864502, "rewards/schema_keywords_iou_reward/std": 0.17344780266284943, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 221.61459350585938, "completions/mean_terminated_length": 167.9733428955078, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 1.3460559796437659, "frac_reward_zero_std": 0.0, "grad_norm": 0.8640257120132446, "kl": 0.05657958984375, "learning_rate": 6.757401975099262e-07, "loss": 0.0304, "num_tokens": 101474346.0, "reward": 9.960250854492188, "reward_std": 1.2265441417694092, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.4322916269302368, "rewards/judge_reward/std": 1.5833885669708252, "rewards/ngrams_iou_reward/mean": 0.18742792308330536, "rewards/ngrams_iou_reward/std": 0.24157433211803436, "rewards/schema_keywords_iou_reward/mean": 0.7103220820426941, "rewards/schema_keywords_iou_reward/std": 0.1739943027496338, "rewards/syntax_reward/mean": 0.7395833134651184, "rewards/syntax_reward/std": 0.44000932574272156, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 222.40625, "completions/mean_terminated_length": 166.4166717529297, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.349448685326548, "frac_reward_zero_std": 0.0, "grad_norm": 0.8992330431938171, "kl": 0.0576171875, "learning_rate": 6.738913812659912e-07, "loss": -0.0058, "num_tokens": 101716092.0, "reward": 9.925162315368652, "reward_std": 1.288896083831787, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.0854166746139526, "rewards/judge_reward/std": 1.5252827405929565, "rewards/ngrams_iou_reward/mean": 0.1497933268547058, "rewards/ngrams_iou_reward/std": 0.14653752744197845, "rewards/schema_keywords_iou_reward/mean": 0.6930767893791199, "rewards/schema_keywords_iou_reward/std": 0.18731153011322021, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.5625, "completions/mean_terminated_length": 171.64178466796875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.35284139100933, "frac_reward_zero_std": 0.0, "grad_norm": 0.9205455780029297, "kl": 0.05877685546875, "learning_rate": 6.720398563832055e-07, "loss": -0.0037, "num_tokens": 101987976.0, "reward": 9.33505916595459, "reward_std": 1.355860710144043, "rewards/accuracy_reward/mean": 1.390625, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.2833333015441895, "rewards/judge_reward/std": 1.5871648788452148, "rewards/ngrams_iou_reward/mean": 0.17444287240505219, "rewards/ngrams_iou_reward/std": 0.21254105865955353, "rewards/schema_keywords_iou_reward/mean": 0.6616575717926025, "rewards/schema_keywords_iou_reward/std": 0.19897998869419098, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709408044815063, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 223.4322967529297, "completions/mean_terminated_length": 161.25758361816406, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 1.356234096692112, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9690723419189453, "kl": 0.05352783203125, "learning_rate": 6.701856517020565e-07, "loss": 0.0084, "num_tokens": 102238595.0, "reward": 10.86899185180664, "reward_std": 1.024258017539978, "rewards/accuracy_reward/mean": 1.890625, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.1760417222976685, "rewards/judge_reward/std": 1.6402173042297363, "rewards/ngrams_iou_reward/mean": 0.23237276077270508, "rewards/ngrams_iou_reward/std": 0.27579426765441895, "rewards/schema_keywords_iou_reward/mean": 0.7480771541595459, "rewards/schema_keywords_iou_reward/std": 0.16071616113185883, "rewards/syntax_reward/mean": 0.90625, "rewards/syntax_reward/std": 0.2922426164150238, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 220.4375, "completions/mean_terminated_length": 161.1666717529297, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 1.359626802374894, "frac_reward_zero_std": 0.0, "grad_norm": 0.8590338826179504, "kl": 0.05926513671875, "learning_rate": 6.683287961047741e-07, "loss": 0.0119, "num_tokens": 102509813.0, "reward": 9.579488754272461, "reward_std": 1.6785444021224976, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.4718750715255737, "rewards/judge_reward/std": 1.5773396492004395, "rewards/ngrams_iou_reward/mean": 0.19183290004730225, "rewards/ngrams_iou_reward/std": 0.23242641985416412, "rewards/schema_keywords_iou_reward/mean": 0.6939046382904053, "rewards/schema_keywords_iou_reward/std": 0.16894687712192535, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 229.19271850585938, "completions/mean_terminated_length": 168.76271057128906, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.363019508057676, "frac_reward_zero_std": 0.0, "grad_norm": 0.7872699499130249, "kl": 0.05999755859375, "learning_rate": 6.664693185148806e-07, "loss": 0.0001, "num_tokens": 102765252.0, "reward": 9.786752700805664, "reward_std": 1.5239222049713135, "rewards/accuracy_reward/mean": 1.515625, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.337499976158142, "rewards/judge_reward/std": 1.605830192565918, "rewards/ngrams_iou_reward/mean": 0.15481512248516083, "rewards/ngrams_iou_reward/std": 0.1893416941165924, "rewards/schema_keywords_iou_reward/mean": 0.6861039996147156, "rewards/schema_keywords_iou_reward/std": 0.19316862523555756, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.6979217529297, "completions/mean_terminated_length": 163.77047729492188, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 1.366412213740458, "frac_reward_zero_std": 0.0, "grad_norm": 0.841779887676239, "kl": 0.05450439453125, "learning_rate": 6.646072478967396e-07, "loss": -0.0127, "num_tokens": 103019822.0, "reward": 10.833751678466797, "reward_std": 1.0372166633605957, "rewards/accuracy_reward/mean": 1.890625, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.2010416984558105, "rewards/judge_reward/std": 1.702784538269043, "rewards/ngrams_iou_reward/mean": 0.25206682085990906, "rewards/ngrams_iou_reward/std": 0.2795403003692627, "rewards/schema_keywords_iou_reward/mean": 0.7681426405906677, "rewards/schema_keywords_iou_reward/std": 0.15921173989772797, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 232.09896850585938, "completions/mean_terminated_length": 167.75, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 1.36980491942324, "frac_reward_zero_std": 0.03125, "grad_norm": 0.749713659286499, "kl": 0.05023193359375, "learning_rate": 6.627426132551058e-07, "loss": 0.0083, "num_tokens": 103278147.0, "reward": 10.178123474121094, "reward_std": 0.9639959335327148, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.2979165315628052, "rewards/judge_reward/std": 1.616795301437378, "rewards/ngrams_iou_reward/mean": 0.1830851286649704, "rewards/ngrams_iou_reward/std": 0.19341520965099335, "rewards/schema_keywords_iou_reward/mean": 0.7064964175224304, "rewards/schema_keywords_iou_reward/std": 0.1659216582775116, "rewards/syntax_reward/mean": 0.9114583134651184, "rewards/syntax_reward/std": 0.2848237454891205, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 234.71875, "completions/mean_terminated_length": 178.90567016601562, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 1.373197625106022, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7931334972381592, "kl": 0.052978515625, "learning_rate": 6.608754436346724e-07, "loss": -0.0121, "num_tokens": 103543371.0, "reward": 10.117579460144043, "reward_std": 1.3176732063293457, "rewards/accuracy_reward/mean": 1.703125, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.1708333492279053, "rewards/judge_reward/std": 1.6103016138076782, "rewards/ngrams_iou_reward/mean": 0.18208162486553192, "rewards/ngrams_iou_reward/std": 0.19704513251781464, "rewards/schema_keywords_iou_reward/mean": 0.6979976296424866, "rewards/schema_keywords_iou_reward/std": 0.18945720791816711, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.1041717529297, "completions/mean_terminated_length": 176.07272338867188, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 1.376590330788804, "frac_reward_zero_std": 0.0, "grad_norm": 0.8933154940605164, "kl": 0.05364990234375, "learning_rate": 6.590057681196191e-07, "loss": 0.0317, "num_tokens": 103790945.0, "reward": 10.214266777038574, "reward_std": 1.52122962474823, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.3229166269302368, "rewards/judge_reward/std": 1.6268351078033447, "rewards/ngrams_iou_reward/mean": 0.1391204446554184, "rewards/ngrams_iou_reward/std": 0.09454017132520676, "rewards/schema_keywords_iou_reward/mean": 0.7105622887611389, "rewards/schema_keywords_iou_reward/std": 0.16608460247516632, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 169.27273559570312, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.3799830364715862, "frac_reward_zero_std": 0.0, "grad_norm": 0.9245941638946533, "kl": 0.048828125, "learning_rate": 6.571336158331589e-07, "loss": 0.0118, "num_tokens": 104027075.0, "reward": 9.988401412963867, "reward_std": 1.8111460208892822, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.4270833730697632, "rewards/judge_reward/std": 1.6860672235488892, "rewards/ngrams_iou_reward/mean": 0.19135038554668427, "rewards/ngrams_iou_reward/std": 0.2295452505350113, "rewards/schema_keywords_iou_reward/mean": 0.7137168049812317, "rewards/schema_keywords_iou_reward/std": 0.18420825898647308, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 237.41146850585938, "completions/mean_terminated_length": 176.6888885498047, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 1.3833757421543682, "frac_reward_zero_std": 0.0, "grad_norm": 0.8223982453346252, "kl": 0.053955078125, "learning_rate": 6.552590159370844e-07, "loss": 0.003, "num_tokens": 104269260.0, "reward": 10.051709175109863, "reward_std": 1.7297489643096924, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.294791579246521, "rewards/judge_reward/std": 1.6394270658493042, "rewards/ngrams_iou_reward/mean": 0.17956602573394775, "rewards/ngrams_iou_reward/std": 0.2176062911748886, "rewards/schema_keywords_iou_reward/mean": 0.6992263793945312, "rewards/schema_keywords_iou_reward/std": 0.19975371658802032, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 234.05209350585938, "completions/mean_terminated_length": 164.3913116455078, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 1.38676844783715, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8361489772796631, "kl": 0.05126953125, "learning_rate": 6.53381997631314e-07, "loss": 0.0042, "num_tokens": 104549188.0, "reward": 9.423700332641602, "reward_std": 1.6638438701629639, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.8802083134651184, "rewards/format_reward/std": 0.32556667923927307, "rewards/judge_reward/mean": 1.2635416984558105, "rewards/judge_reward/std": 1.5043128728866577, "rewards/ngrams_iou_reward/mean": 0.1979636698961258, "rewards/ngrams_iou_reward/std": 0.23846404254436493, "rewards/schema_keywords_iou_reward/mean": 0.709068775177002, "rewards/schema_keywords_iou_reward/std": 0.19386811554431915, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 239.09375, "completions/mean_terminated_length": 178.71429443359375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 1.390161153519932, "frac_reward_zero_std": 0.0, "grad_norm": 0.7360630035400391, "kl": 0.0509033203125, "learning_rate": 6.515025901534363e-07, "loss": -0.0071, "num_tokens": 104818048.0, "reward": 9.612404823303223, "reward_std": 1.5381253957748413, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.3489583730697632, "rewards/judge_reward/std": 1.5846444368362427, "rewards/ngrams_iou_reward/mean": 0.14482560753822327, "rewards/ngrams_iou_reward/std": 0.1328619122505188, "rewards/schema_keywords_iou_reward/mean": 0.6811200976371765, "rewards/schema_keywords_iou_reward/std": 0.17826072871685028, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 236.66146850585938, "completions/mean_terminated_length": 163.1750030517578, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 1.3935538592027141, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9630967378616333, "kl": 0.0521240234375, "learning_rate": 6.496208227782556e-07, "loss": -0.0092, "num_tokens": 105088553.0, "reward": 9.845484733581543, "reward_std": 1.668228030204773, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.509374976158142, "rewards/judge_reward/std": 1.6838983297348022, "rewards/ngrams_iou_reward/mean": 0.16515402495861053, "rewards/ngrams_iou_reward/std": 0.15552394092082977, "rewards/schema_keywords_iou_reward/mean": 0.687621533870697, "rewards/schema_keywords_iou_reward/std": 0.19969230890274048, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.3590256869792938, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 232.90625, "completions/mean_terminated_length": 161.65957641601562, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 1.3969465648854962, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9731674194335938, "kl": 0.0509033203125, "learning_rate": 6.477367248173351e-07, "loss": -0.0157, "num_tokens": 105337007.0, "reward": 10.00657844543457, "reward_std": 1.2700542211532593, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.472916603088379, "rewards/judge_reward/std": 1.6131967306137085, "rewards/ngrams_iou_reward/mean": 0.16047035157680511, "rewards/ngrams_iou_reward/std": 0.16946567595005035, "rewards/schema_keywords_iou_reward/mean": 0.7179822325706482, "rewards/schema_keywords_iou_reward/std": 0.15812090039253235, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 166.94737243652344, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 1.4003392705682782, "frac_reward_zero_std": 0.03125, "grad_norm": 0.834109902381897, "kl": 0.0450439453125, "learning_rate": 6.458503256185403e-07, "loss": 0.0084, "num_tokens": 105585695.0, "reward": 10.431401252746582, "reward_std": 1.1626262664794922, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 1.4432151317596436, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.021875023841858, "rewards/judge_reward/std": 1.52540123462677, "rewards/ngrams_iou_reward/mean": 0.16949152946472168, "rewards/ngrams_iou_reward/std": 0.20177945494651794, "rewards/schema_keywords_iou_reward/mean": 0.6869089007377625, "rewards/schema_keywords_iou_reward/std": 0.1811518669128418, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.3590256869792938, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 242.21875, "completions/mean_terminated_length": 189.85000610351562, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 1.4037319762510603, "frac_reward_zero_std": 0.0, "grad_norm": 0.7941436171531677, "kl": 0.04620361328125, "learning_rate": 6.439616545655833e-07, "loss": 0.012, "num_tokens": 105831119.0, "reward": 9.435258865356445, "reward_std": 1.3923060894012451, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.464583396911621, "rewards/judge_reward/std": 1.658989667892456, "rewards/ngrams_iou_reward/mean": 0.15889698266983032, "rewards/ngrams_iou_reward/std": 0.16609562933444977, "rewards/schema_keywords_iou_reward/mean": 0.6701105237007141, "rewards/schema_keywords_iou_reward/std": 0.1670958697795868, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709405064582825, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.9322967529297, "completions/mean_terminated_length": 176.55882263183594, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 1.4071246819338423, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8147391080856323, "kl": 0.04901123046875, "learning_rate": 6.420707410775625e-07, "loss": 0.0146, "num_tokens": 106080286.0, "reward": 9.513673782348633, "reward_std": 1.4508888721466064, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.3604167699813843, "rewards/judge_reward/std": 1.562953233718872, "rewards/ngrams_iou_reward/mean": 0.18519876897335052, "rewards/ngrams_iou_reward/std": 0.18727852404117584, "rewards/schema_keywords_iou_reward/mean": 0.696182906627655, "rewards/schema_keywords_iou_reward/std": 0.1308232545852661, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 232.78125, "completions/mean_terminated_length": 168.58824157714844, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.4105173876166242, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7467554807662964, "kl": 0.0496826171875, "learning_rate": 6.401776146085071e-07, "loss": -0.0098, "num_tokens": 106350850.0, "reward": 10.188121795654297, "reward_std": 1.1817048788070679, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.4322916269302368, "rewards/judge_reward/std": 1.656211256980896, "rewards/ngrams_iou_reward/mean": 0.16556619107723236, "rewards/ngrams_iou_reward/std": 0.16939204931259155, "rewards/schema_keywords_iou_reward/mean": 0.7308884263038635, "rewards/schema_keywords_iou_reward/std": 0.15993539988994598, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 241.55209350585938, "completions/mean_terminated_length": 183.0, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 1.4139100932994062, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8027727007865906, "kl": 0.0545654296875, "learning_rate": 6.382823046469166e-07, "loss": -0.0022, "num_tokens": 106614140.0, "reward": 9.856372833251953, "reward_std": 1.4573006629943848, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.3052083253860474, "rewards/judge_reward/std": 1.5873762369155884, "rewards/ngrams_iou_reward/mean": 0.1610046774148941, "rewards/ngrams_iou_reward/std": 0.16044776141643524, "rewards/schema_keywords_iou_reward/mean": 0.6891185641288757, "rewards/schema_keywords_iou_reward/std": 0.1891510933637619, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 239.9479217529297, "completions/mean_terminated_length": 170.38888549804688, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 1.4173027989821882, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7784231305122375, "kl": 0.05255126953125, "learning_rate": 6.363848407153017e-07, "loss": 0.0062, "num_tokens": 106878808.0, "reward": 9.96084213256836, "reward_std": 1.491498351097107, "rewards/accuracy_reward/mean": 1.390625, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.584375023841858, "rewards/judge_reward/std": 1.7324635982513428, "rewards/ngrams_iou_reward/mean": 0.21874655783176422, "rewards/ngrams_iou_reward/std": 0.24229279160499573, "rewards/schema_keywords_iou_reward/mean": 0.7035534381866455, "rewards/schema_keywords_iou_reward/std": 0.17651775479316711, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.578125, "completions/mean_terminated_length": 178.07894897460938, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 1.4206955046649703, "frac_reward_zero_std": 0.0, "grad_norm": 0.7706515789031982, "kl": 0.0576171875, "learning_rate": 6.344852523697246e-07, "loss": 0.0114, "num_tokens": 107136937.0, "reward": 9.491106986999512, "reward_std": 1.9195992946624756, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.8854166865348816, "rewards/format_reward/std": 0.3193511664867401, "rewards/judge_reward/mean": 1.2791666984558105, "rewards/judge_reward/std": 1.5091980695724487, "rewards/ngrams_iou_reward/mean": 0.19113533198833466, "rewards/ngrams_iou_reward/std": 0.23521527647972107, "rewards/schema_keywords_iou_reward/mean": 0.6687209606170654, "rewards/schema_keywords_iou_reward/std": 0.2098887711763382, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 243.4479217529297, "completions/mean_terminated_length": 182.96969604492188, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 1.4240882103477523, "frac_reward_zero_std": 0.0, "grad_norm": 0.8653765320777893, "kl": 0.0528564453125, "learning_rate": 6.325835691993394e-07, "loss": 0.006, "num_tokens": 107423223.0, "reward": 9.309514999389648, "reward_std": 1.7752692699432373, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.2624999284744263, "rewards/judge_reward/std": 1.5703569650650024, "rewards/ngrams_iou_reward/mean": 0.17140965163707733, "rewards/ngrams_iou_reward/std": 0.2292817085981369, "rewards/schema_keywords_iou_reward/mean": 0.6651880145072937, "rewards/schema_keywords_iou_reward/std": 0.2203405201435089, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 239.7916717529297, "completions/mean_terminated_length": 178.1999969482422, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 1.4274809160305344, "frac_reward_zero_std": 0.0, "grad_norm": 1.796061635017395, "kl": 0.08001708984375, "learning_rate": 6.306798208259297e-07, "loss": 0.0154, "num_tokens": 107680655.0, "reward": 9.137235641479492, "reward_std": 2.043635368347168, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.6645833253860474, "rewards/judge_reward/std": 1.6205472946166992, "rewards/ngrams_iou_reward/mean": 0.18695761263370514, "rewards/ngrams_iou_reward/std": 0.2325410097837448, "rewards/schema_keywords_iou_reward/mean": 0.6679863333702087, "rewards/schema_keywords_iou_reward/std": 0.20508822798728943, "rewards/syntax_reward/mean": 0.734375, "rewards/syntax_reward/std": 0.44282010197639465, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 240.25521850585938, "completions/mean_terminated_length": 185.6976776123047, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 1.4308736217133164, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8571077585220337, "kl": 0.05169677734375, "learning_rate": 6.287740369034485e-07, "loss": 0.0046, "num_tokens": 107955738.0, "reward": 9.376387596130371, "reward_std": 1.6199860572814941, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.433854103088379, "rewards/judge_reward/std": 1.5774754285812378, "rewards/ngrams_iou_reward/mean": 0.1835811734199524, "rewards/ngrams_iou_reward/std": 0.204595148563385, "rewards/schema_keywords_iou_reward/mean": 0.6844727396965027, "rewards/schema_keywords_iou_reward/std": 0.18591062724590302, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 242.1875, "completions/mean_terminated_length": 188.0, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.4342663273960983, "frac_reward_zero_std": 0.0, "grad_norm": 0.8126377463340759, "kl": 0.047607421875, "learning_rate": 6.26866247117555e-07, "loss": -0.0018, "num_tokens": 108210258.0, "reward": 9.423921585083008, "reward_std": 2.118187665939331, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.2572916746139526, "rewards/judge_reward/std": 1.5103998184204102, "rewards/ngrams_iou_reward/mean": 0.18459491431713104, "rewards/ngrams_iou_reward/std": 0.17950837314128876, "rewards/schema_keywords_iou_reward/mean": 0.6674507260322571, "rewards/schema_keywords_iou_reward/std": 0.18272940814495087, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 243.14584350585938, "completions/mean_terminated_length": 157.27999877929688, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.4376590330788805, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7845024466514587, "kl": 0.05126953125, "learning_rate": 6.249564811851543e-07, "loss": 0.0164, "num_tokens": 108459154.0, "reward": 10.078593254089355, "reward_std": 1.5432504415512085, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.1645833253860474, "rewards/judge_reward/std": 1.572005033493042, "rewards/ngrams_iou_reward/mean": 0.2647489607334137, "rewards/ngrams_iou_reward/std": 0.30357715487480164, "rewards/schema_keywords_iou_reward/mean": 0.7242596745491028, "rewards/schema_keywords_iou_reward/std": 0.19415882229804993, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947065711021423, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 239.59375, "completions/mean_terminated_length": 170.8648681640625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 1.4410517387616624, "frac_reward_zero_std": 0.0, "grad_norm": 0.9394514560699463, "kl": 0.05853271484375, "learning_rate": 6.230447688539315e-07, "loss": -0.0202, "num_tokens": 108714190.0, "reward": 10.097436904907227, "reward_std": 1.296720027923584, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.328125, "rewards/judge_reward/std": 1.635867714881897, "rewards/ngrams_iou_reward/mean": 0.21572232246398926, "rewards/ngrams_iou_reward/std": 0.24132773280143738, "rewards/schema_keywords_iou_reward/mean": 0.7410895228385925, "rewards/schema_keywords_iou_reward/std": 0.1669415384531021, "rewards/syntax_reward/mean": 0.9114583134651184, "rewards/syntax_reward/std": 0.2848237454891205, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 246.734375, "completions/mean_terminated_length": 187.57693481445312, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 1.4444444444444444, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7327577471733093, "kl": 0.0477294921875, "learning_rate": 6.211311399018916e-07, "loss": -0.002, "num_tokens": 108959971.0, "reward": 9.95765495300293, "reward_std": 1.2839674949645996, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.2177082300186157, "rewards/judge_reward/std": 1.6172819137573242, "rewards/ngrams_iou_reward/mean": 0.17872445285320282, "rewards/ngrams_iou_reward/std": 0.16752536594867706, "rewards/schema_keywords_iou_reward/mean": 0.718512773513794, "rewards/schema_keywords_iou_reward/std": 0.15821734070777893, "rewards/syntax_reward/mean": 0.7604166865348816, "rewards/syntax_reward/std": 0.4279450476169586, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 241.15625, "completions/mean_terminated_length": 169.63636779785156, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 1.4478371501272265, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7366200685501099, "kl": 0.055419921875, "learning_rate": 6.192156241368929e-07, "loss": -0.0074, "num_tokens": 109224841.0, "reward": 9.691737174987793, "reward_std": 1.499206781387329, "rewards/accuracy_reward/mean": 1.25, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.698958396911621, "rewards/judge_reward/std": 1.6768203973770142, "rewards/ngrams_iou_reward/mean": 0.16827483475208282, "rewards/ngrams_iou_reward/std": 0.16280613839626312, "rewards/schema_keywords_iou_reward/mean": 0.7088780403137207, "rewards/schema_keywords_iou_reward/std": 0.19367855787277222, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 241.31771850585938, "completions/mean_terminated_length": 175.45713806152344, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.4512298558100085, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7987509965896606, "kl": 0.05181884765625, "learning_rate": 6.172982513961844e-07, "loss": 0.002, "num_tokens": 109507064.0, "reward": 10.137243270874023, "reward_std": 1.5054371356964111, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.1531249284744263, "rewards/judge_reward/std": 1.5153285264968872, "rewards/ngrams_iou_reward/mean": 0.23077178001403809, "rewards/ngrams_iou_reward/std": 0.2807687819004059, "rewards/schema_keywords_iou_reward/mean": 0.7148041725158691, "rewards/schema_keywords_iou_reward/std": 0.19175542891025543, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 248.9010467529297, "completions/mean_terminated_length": 187.85000610351562, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 1.4546225614927906, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7854547500610352, "kl": 0.05194091796875, "learning_rate": 6.153790515459403e-07, "loss": 0.0006, "num_tokens": 109746265.0, "reward": 9.9907865524292, "reward_std": 1.4115025997161865, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.597916603088379, "rewards/judge_reward/std": 1.737301230430603, "rewards/ngrams_iou_reward/mean": 0.20273037254810333, "rewards/ngrams_iou_reward/std": 0.23421481251716614, "rewards/schema_keywords_iou_reward/mean": 0.7172229886054993, "rewards/schema_keywords_iou_reward/std": 0.17133867740631104, "rewards/syntax_reward/mean": 0.7395833134651184, "rewards/syntax_reward/std": 0.44000932574272156, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 241.0104217529297, "completions/mean_terminated_length": 176.05555725097656, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 1.4580152671755724, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7201678156852722, "kl": 0.05194091796875, "learning_rate": 6.13458054480795e-07, "loss": 0.0056, "num_tokens": 110004075.0, "reward": 10.010892868041992, "reward_std": 1.1479365825653076, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.6531249284744263, "rewards/judge_reward/std": 1.7190921306610107, "rewards/ngrams_iou_reward/mean": 0.1797318458557129, "rewards/ngrams_iou_reward/std": 0.18106399476528168, "rewards/schema_keywords_iou_reward/mean": 0.7124105095863342, "rewards/schema_keywords_iou_reward/std": 0.16020435094833374, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 239.14584350585938, "completions/mean_terminated_length": 170.84210205078125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 1.4614079728583547, "frac_reward_zero_std": 0.0625, "grad_norm": 0.810545027256012, "kl": 0.05023193359375, "learning_rate": 6.115352901233778e-07, "loss": -0.004, "num_tokens": 110247697.0, "reward": 10.581023216247559, "reward_std": 1.2457244396209717, "rewards/accuracy_reward/mean": 1.90625, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.071874976158142, "rewards/judge_reward/std": 1.6226282119750977, "rewards/ngrams_iou_reward/mean": 0.2116107940673828, "rewards/ngrams_iou_reward/std": 0.22209131717681885, "rewards/schema_keywords_iou_reward/mean": 0.6839949488639832, "rewards/schema_keywords_iou_reward/std": 0.19124829769134521, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374122977256775, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 243.953125, "completions/mean_terminated_length": 159.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 1.4648006785411365, "frac_reward_zero_std": 0.0, "grad_norm": 0.7983020544052124, "kl": 0.0589599609375, "learning_rate": 6.096107884238458e-07, "loss": 0.0064, "num_tokens": 110502832.0, "reward": 9.811166763305664, "reward_std": 1.6461642980575562, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.2572916746139526, "rewards/judge_reward/std": 1.5655481815338135, "rewards/ngrams_iou_reward/mean": 0.14091704785823822, "rewards/ngrams_iou_reward/std": 0.14758890867233276, "rewards/schema_keywords_iou_reward/mean": 0.6608734130859375, "rewards/schema_keywords_iou_reward/std": 0.20503267645835876, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 243.9635467529297, "completions/mean_terminated_length": 178.9666748046875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 1.4681933842239185, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7226220369338989, "kl": 0.0496826171875, "learning_rate": 6.076845793594181e-07, "loss": 0.0003, "num_tokens": 110781117.0, "reward": 9.421930313110352, "reward_std": 1.3731364011764526, "rewards/accuracy_reward/mean": 1.328125, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.417708396911621, "rewards/judge_reward/std": 1.58034348487854, "rewards/ngrams_iou_reward/mean": 0.19283242523670197, "rewards/ngrams_iou_reward/std": 0.2030593454837799, "rewards/schema_keywords_iou_reward/mean": 0.6853463053703308, "rewards/schema_keywords_iou_reward/std": 0.18384549021720886, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 246.5885467529297, "completions/mean_terminated_length": 173.8636474609375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 1.4715860899067006, "frac_reward_zero_std": 0.0, "grad_norm": 0.8849555253982544, "kl": 0.0484619140625, "learning_rate": 6.057566929339095e-07, "loss": 0.0193, "num_tokens": 111042170.0, "reward": 8.874171257019043, "reward_std": 1.9925470352172852, "rewards/accuracy_reward/mean": 0.9375, "rewards/accuracy_reward/std": 1.3941725492477417, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3129251003265381, "rewards/judge_reward/mean": 1.787500023841858, "rewards/judge_reward/std": 1.5747517347335815, "rewards/ngrams_iou_reward/mean": 0.1776304692029953, "rewards/ngrams_iou_reward/std": 0.2179342359304428, "rewards/schema_keywords_iou_reward/mean": 0.6423740983009338, "rewards/schema_keywords_iou_reward/std": 0.19195640087127686, "rewards/syntax_reward/mean": 0.7760416865348816, "rewards/syntax_reward/std": 0.41798436641693115, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 240.91146850585938, "completions/mean_terminated_length": 175.5277862548828, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 1.4749787955894826, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8449696898460388, "kl": 0.05072021484375, "learning_rate": 6.038271591772614e-07, "loss": 0.0084, "num_tokens": 111320235.0, "reward": 9.449987411499023, "reward_std": 1.6475242376327515, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.665624976158142, "rewards/judge_reward/std": 1.5889887809753418, "rewards/ngrams_iou_reward/mean": 0.16226954758167267, "rewards/ngrams_iou_reward/std": 0.14333246648311615, "rewards/schema_keywords_iou_reward/mean": 0.6908421516418457, "rewards/schema_keywords_iou_reward/std": 0.15616770088672638, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 245.7447967529297, "completions/mean_terminated_length": 180.2692413330078, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 1.4783715012722647, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7683651447296143, "kl": 0.04754638671875, "learning_rate": 6.01896008145076e-07, "loss": -0.0016, "num_tokens": 111570380.0, "reward": 10.198015213012695, "reward_std": 1.837196707725525, "rewards/accuracy_reward/mean": 1.8125, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.040624976158142, "rewards/judge_reward/std": 1.536915898323059, "rewards/ngrams_iou_reward/mean": 0.1818772405385971, "rewards/ngrams_iou_reward/std": 0.21343863010406494, "rewards/schema_keywords_iou_reward/mean": 0.6953046321868896, "rewards/schema_keywords_iou_reward/std": 0.1960587501525879, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 239.2916717529297, "completions/mean_terminated_length": 175.8000030517578, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.4817642069550465, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7896808385848999, "kl": 0.05377197265625, "learning_rate": 5.999632699181465e-07, "loss": -0.0041, "num_tokens": 111830752.0, "reward": 10.52552604675293, "reward_std": 1.145244836807251, "rewards/accuracy_reward/mean": 1.890625, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.0677083730697632, "rewards/judge_reward/std": 1.5581566095352173, "rewards/ngrams_iou_reward/mean": 0.1991051584482193, "rewards/ngrams_iou_reward/std": 0.23487690091133118, "rewards/schema_keywords_iou_reward/mean": 0.6910036206245422, "rewards/schema_keywords_iou_reward/std": 0.16687943041324615, "rewards/syntax_reward/mean": 0.8802083134651184, "rewards/syntax_reward/std": 0.32556670904159546, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 239.7291717529297, "completions/mean_terminated_length": 177.90000915527344, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 1.4851569126378288, "frac_reward_zero_std": 0.0, "grad_norm": 0.950864851474762, "kl": 0.054931640625, "learning_rate": 5.980289746019891e-07, "loss": 0.0187, "num_tokens": 112077132.0, "reward": 10.533148765563965, "reward_std": 1.3813178539276123, "rewards/accuracy_reward/mean": 1.703125, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.3697916269302368, "rewards/judge_reward/std": 1.7133060693740845, "rewards/ngrams_iou_reward/mean": 0.1831190586090088, "rewards/ngrams_iou_reward/std": 0.19755882024765015, "rewards/schema_keywords_iou_reward/mean": 0.7041957974433899, "rewards/schema_keywords_iou_reward/std": 0.17473524808883667, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374123275279999, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 242.58334350585938, "completions/mean_terminated_length": 177.93939208984375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 1.4885496183206106, "frac_reward_zero_std": 0.0, "grad_norm": 0.9360278248786926, "kl": 0.05157470703125, "learning_rate": 5.960931523263749e-07, "loss": 0.0278, "num_tokens": 112337980.0, "reward": 10.142951011657715, "reward_std": 1.437922477722168, "rewards/accuracy_reward/mean": 1.78125, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.078125, "rewards/judge_reward/std": 1.5373672246932983, "rewards/ngrams_iou_reward/mean": 0.17827056348323822, "rewards/ngrams_iou_reward/std": 0.18725566565990448, "rewards/schema_keywords_iou_reward/mean": 0.7094714045524597, "rewards/schema_keywords_iou_reward/std": 0.18741202354431152, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.5260467529297, "completions/mean_terminated_length": 178.92156982421875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 1.4919423240033927, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7967984676361084, "kl": 0.05657958984375, "learning_rate": 5.941558332448588e-07, "loss": 0.0121, "num_tokens": 112622301.0, "reward": 9.86624813079834, "reward_std": 1.6964377164840698, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.3645833730697632, "rewards/judge_reward/std": 1.6166657209396362, "rewards/ngrams_iou_reward/mean": 0.21446019411087036, "rewards/ngrams_iou_reward/std": 0.2518269419670105, "rewards/schema_keywords_iou_reward/mean": 0.7142875790596008, "rewards/schema_keywords_iou_reward/std": 0.2183172106742859, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 236.7447967529297, "completions/mean_terminated_length": 177.34042358398438, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 1.4953350296861747, "frac_reward_zero_std": 0.0, "grad_norm": 0.7065747976303101, "kl": 0.05377197265625, "learning_rate": 5.922170475343124e-07, "loss": 0.0172, "num_tokens": 112878896.0, "reward": 10.057242393493652, "reward_std": 1.375718593597412, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.5135416984558105, "rewards/judge_reward/std": 1.6741408109664917, "rewards/ngrams_iou_reward/mean": 0.20495669543743134, "rewards/ngrams_iou_reward/std": 0.24406015872955322, "rewards/schema_keywords_iou_reward/mean": 0.715826690196991, "rewards/schema_keywords_iou_reward/std": 0.1823674589395523, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 237.50521850585938, "completions/mean_terminated_length": 171.45237731933594, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 1.4987277353689568, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7917941808700562, "kl": 0.053466796875, "learning_rate": 5.902768253944511e-07, "loss": 0.005, "num_tokens": 113137821.0, "reward": 10.034317016601562, "reward_std": 1.2782554626464844, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.328125, "rewards/judge_reward/std": 1.597000002861023, "rewards/ngrams_iou_reward/mean": 0.25505462288856506, "rewards/ngrams_iou_reward/std": 0.23863284289836884, "rewards/schema_keywords_iou_reward/mean": 0.7271790504455566, "rewards/schema_keywords_iou_reward/std": 0.1945391446352005, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.41087818145751953, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 236.64584350585938, "completions/mean_terminated_length": 176.9361572265625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 1.5021204410517388, "frac_reward_zero_std": 0.0, "grad_norm": 0.7792260050773621, "kl": 0.0621337890625, "learning_rate": 5.883351970473654e-07, "loss": 0.015, "num_tokens": 113407135.0, "reward": 9.745528221130371, "reward_std": 1.49678373336792, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.274999976158142, "rewards/judge_reward/std": 1.5686054229736328, "rewards/ngrams_iou_reward/mean": 0.16801577806472778, "rewards/ngrams_iou_reward/std": 0.1858803778886795, "rewards/schema_keywords_iou_reward/mean": 0.6733447909355164, "rewards/schema_keywords_iou_reward/std": 0.18110023438930511, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 239.3229217529297, "completions/mean_terminated_length": 184.84445190429688, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 1.5055131467345206, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7699872851371765, "kl": 0.052734375, "learning_rate": 5.863921927370498e-07, "loss": 0.0036, "num_tokens": 113658591.0, "reward": 9.448888778686523, "reward_std": 1.3005698919296265, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.2927082777023315, "rewards/judge_reward/std": 1.5261589288711548, "rewards/ngrams_iou_reward/mean": 0.13646294176578522, "rewards/ngrams_iou_reward/std": 0.1278943419456482, "rewards/schema_keywords_iou_reward/mean": 0.6801331639289856, "rewards/schema_keywords_iou_reward/std": 0.14800572395324707, "rewards/syntax_reward/mean": 0.7760416865348816, "rewards/syntax_reward/std": 0.41798436641693115, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 234.609375, "completions/mean_terminated_length": 166.71739196777344, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 1.508905852417303, "frac_reward_zero_std": 0.03125, "grad_norm": 0.794632077217102, "kl": 0.05328369140625, "learning_rate": 5.844478427289316e-07, "loss": -0.0027, "num_tokens": 113898402.0, "reward": 10.708064079284668, "reward_std": 0.8044885396957397, "rewards/accuracy_reward/mean": 1.84375, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.1416666507720947, "rewards/judge_reward/std": 1.6011557579040527, "rewards/ngrams_iou_reward/mean": 0.2911047637462616, "rewards/ngrams_iou_reward/std": 0.31897997856140137, "rewards/schema_keywords_iou_reward/mean": 0.7534179091453552, "rewards/schema_keywords_iou_reward/std": 0.1812889128923416, "rewards/syntax_reward/mean": 0.890625, "rewards/syntax_reward/std": 0.3129251003265381, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 232.3697967529297, "completions/mean_terminated_length": 177.77586364746094, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.5122985581000847, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7722175717353821, "kl": 0.05706787109375, "learning_rate": 5.825021773093996e-07, "loss": -0.0019, "num_tokens": 114183455.0, "reward": 8.885305404663086, "reward_std": 1.5161843299865723, "rewards/accuracy_reward/mean": 1.25, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.3520832061767578, "rewards/judge_reward/std": 1.4606157541275024, "rewards/ngrams_iou_reward/mean": 0.1580856293439865, "rewards/ngrams_iou_reward/std": 0.15707558393478394, "rewards/schema_keywords_iou_reward/mean": 0.6584689021110535, "rewards/schema_keywords_iou_reward/std": 0.178176149725914, "rewards/syntax_reward/mean": 0.703125, "rewards/syntax_reward/std": 0.4580754339694977, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 232.6354217529297, "completions/mean_terminated_length": 171.35848999023438, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 1.5156912637828668, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7576325535774231, "kl": 0.06329345703125, "learning_rate": 5.805552267853322e-07, "loss": 0.0079, "num_tokens": 114427171.0, "reward": 9.19698429107666, "reward_std": 1.3819421529769897, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.3562499284744263, "rewards/judge_reward/std": 1.5203205347061157, "rewards/ngrams_iou_reward/mean": 0.1628998965024948, "rewards/ngrams_iou_reward/std": 0.1971205323934555, "rewards/schema_keywords_iou_reward/mean": 0.6757504343986511, "rewards/schema_keywords_iou_reward/std": 0.17329347133636475, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 236.1875, "completions/mean_terminated_length": 178.36734008789062, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 1.5190839694656488, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8391982913017273, "kl": 0.05419921875, "learning_rate": 5.786070214836254e-07, "loss": 0.0043, "num_tokens": 114674623.0, "reward": 9.894515037536621, "reward_std": 1.3103506565093994, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.294791579246521, "rewards/judge_reward/std": 1.5054433345794678, "rewards/ngrams_iou_reward/mean": 0.18620723485946655, "rewards/ngrams_iou_reward/std": 0.2158885896205902, "rewards/schema_keywords_iou_reward/mean": 0.707265317440033, "rewards/schema_keywords_iou_reward/std": 0.1752166897058487, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374122977256775, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 237.375, "completions/mean_terminated_length": 188.52830505371094, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 1.5224766751484309, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7772090435028076, "kl": 0.05657958984375, "learning_rate": 5.766575917507202e-07, "loss": 0.0122, "num_tokens": 114943579.0, "reward": 9.8274507522583, "reward_std": 1.5362190008163452, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.4375, "rewards/judge_reward/std": 1.6545194387435913, "rewards/ngrams_iou_reward/mean": 0.17800748348236084, "rewards/ngrams_iou_reward/std": 0.17149899899959564, "rewards/schema_keywords_iou_reward/mean": 0.6963179707527161, "rewards/schema_keywords_iou_reward/std": 0.15541939437389374, "rewards/syntax_reward/mean": 0.7604166865348816, "rewards/syntax_reward/std": 0.427945077419281, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 231.640625, "completions/mean_terminated_length": 172.48214721679688, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.525869380831213, "frac_reward_zero_std": 0.0, "grad_norm": 0.7963797450065613, "kl": 0.06170654296875, "learning_rate": 5.747069679521305e-07, "loss": 0.0206, "num_tokens": 115201216.0, "reward": 10.45644474029541, "reward_std": 1.2279472351074219, "rewards/accuracy_reward/mean": 1.984375, "rewards/accuracy_reward/std": 1.423352599143982, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 0.8635416030883789, "rewards/judge_reward/std": 1.4147775173187256, "rewards/ngrams_iou_reward/mean": 0.24177932739257812, "rewards/ngrams_iou_reward/std": 0.259743869304657, "rewards/schema_keywords_iou_reward/mean": 0.737581193447113, "rewards/schema_keywords_iou_reward/std": 0.17499732971191406, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 238.59375, "completions/mean_terminated_length": 180.0454559326172, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 1.5292620865139948, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7223302721977234, "kl": 0.05438232421875, "learning_rate": 5.727551804719693e-07, "loss": 0.0168, "num_tokens": 115470628.0, "reward": 9.077165603637695, "reward_std": 1.4652189016342163, "rewards/accuracy_reward/mean": 1.015625, "rewards/accuracy_reward/std": 1.423352599143982, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.743749976158142, "rewards/judge_reward/std": 1.592969536781311, "rewards/ngrams_iou_reward/mean": 0.13476817309856415, "rewards/ngrams_iou_reward/std": 0.18432123959064484, "rewards/schema_keywords_iou_reward/mean": 0.6528133749961853, "rewards/schema_keywords_iou_reward/std": 0.18720661103725433, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 232.33334350585938, "completions/mean_terminated_length": 176.28070068359375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 1.532654792196777, "frac_reward_zero_std": 0.0, "grad_norm": 0.8215197920799255, "kl": 0.0546875, "learning_rate": 5.708022597124758e-07, "loss": -0.0027, "num_tokens": 115744496.0, "reward": 9.155815124511719, "reward_std": 1.9481886625289917, "rewards/accuracy_reward/mean": 1.171875, "rewards/accuracy_reward/std": 1.4674979448318481, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.5500000715255737, "rewards/judge_reward/std": 1.5967243909835815, "rewards/ngrams_iou_reward/mean": 0.2181798219680786, "rewards/ngrams_iou_reward/std": 0.24259303510189056, "rewards/schema_keywords_iou_reward/mean": 0.6813850402832031, "rewards/schema_keywords_iou_reward/std": 0.21108174324035645, "rewards/syntax_reward/mean": 0.7291666865348816, "rewards/syntax_reward/std": 0.44555196166038513, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 229.390625, "completions/mean_terminated_length": 164.7678680419922, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 1.5360474978795589, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7990840673446655, "kl": 0.05413818359375, "learning_rate": 5.688482360935423e-07, "loss": 0.0068, "num_tokens": 116030693.0, "reward": 9.929269790649414, "reward_std": 1.3104698657989502, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.2781250476837158, "rewards/judge_reward/std": 1.5806223154067993, "rewards/ngrams_iou_reward/mean": 0.22448980808258057, "rewards/ngrams_iou_reward/std": 0.23775170743465424, "rewards/schema_keywords_iou_reward/mean": 0.7266542911529541, "rewards/schema_keywords_iou_reward/std": 0.1742003858089447, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.359375, "completions/mean_terminated_length": 181.59014892578125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 1.5394402035623411, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6954178810119629, "kl": 0.05841064453125, "learning_rate": 5.668931400522395e-07, "loss": -0.0003, "num_tokens": 116282216.0, "reward": 10.248246192932129, "reward_std": 1.1435809135437012, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.3604167699813843, "rewards/judge_reward/std": 1.6597784757614136, "rewards/ngrams_iou_reward/mean": 0.18298639357089996, "rewards/ngrams_iou_reward/std": 0.1868448555469513, "rewards/schema_keywords_iou_reward/mean": 0.7142176628112793, "rewards/schema_keywords_iou_reward/std": 0.15376685559749603, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 231.9322967529297, "completions/mean_terminated_length": 171.9818115234375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.542832909245123, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8279592394828796, "kl": 0.06500244140625, "learning_rate": 5.64937002042343e-07, "loss": -0.0, "num_tokens": 116558809.0, "reward": 10.030065536499023, "reward_std": 1.417937994003296, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.048958420753479, "rewards/judge_reward/std": 1.454330563545227, "rewards/ngrams_iou_reward/mean": 0.226801797747612, "rewards/ngrams_iou_reward/std": 0.27491116523742676, "rewards/schema_keywords_iou_reward/mean": 0.7157642245292664, "rewards/schema_keywords_iou_reward/std": 0.18486060202121735, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 235.7760467529297, "completions/mean_terminated_length": 179.8627471923828, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 1.546225614927905, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7641986608505249, "kl": 0.05572509765625, "learning_rate": 5.629798525338588e-07, "loss": 0.0227, "num_tokens": 116812362.0, "reward": 9.756113052368164, "reward_std": 1.2874263525009155, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.4812499284744263, "rewards/judge_reward/std": 1.7174382209777832, "rewards/ngrams_iou_reward/mean": 0.15748633444309235, "rewards/ngrams_iou_reward/std": 0.17537914216518402, "rewards/schema_keywords_iou_reward/mean": 0.6934173703193665, "rewards/schema_keywords_iou_reward/std": 0.19658306241035461, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 231.9166717529297, "completions/mean_terminated_length": 173.42857360839844, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 1.549618320610687, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7993530035018921, "kl": 0.05328369140625, "learning_rate": 5.610217220125483e-07, "loss": -0.025, "num_tokens": 117073370.0, "reward": 9.938850402832031, "reward_std": 1.254902720451355, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.3312500715255737, "rewards/judge_reward/std": 1.6305532455444336, "rewards/ngrams_iou_reward/mean": 0.20449884235858917, "rewards/ngrams_iou_reward/std": 0.21595913171768188, "rewards/schema_keywords_iou_reward/mean": 0.6916418075561523, "rewards/schema_keywords_iou_reward/std": 0.2067168802022934, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 234.98959350585938, "completions/mean_terminated_length": 176.90196228027344, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 1.5530110262934689, "frac_reward_zero_std": 0.0, "grad_norm": 0.7958040237426758, "kl": 0.05218505859375, "learning_rate": 5.59062640979454e-07, "loss": -0.0131, "num_tokens": 117346842.0, "reward": 9.785524368286133, "reward_std": 1.6147069931030273, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.53125, "rewards/judge_reward/std": 1.6803959608078003, "rewards/ngrams_iou_reward/mean": 0.14782454073429108, "rewards/ngrams_iou_reward/std": 0.14921970665454865, "rewards/schema_keywords_iou_reward/mean": 0.7001991271972656, "rewards/schema_keywords_iou_reward/std": 0.1699308454990387, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 233.53125, "completions/mean_terminated_length": 167.9591827392578, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 1.5564037319762511, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7492514848709106, "kl": 0.05755615234375, "learning_rate": 5.571026399504243e-07, "loss": 0.0123, "num_tokens": 117631782.0, "reward": 9.348177909851074, "reward_std": 1.6297886371612549, "rewards/accuracy_reward/mean": 1.234375, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.5062499046325684, "rewards/judge_reward/std": 1.5071078538894653, "rewards/ngrams_iou_reward/mean": 0.21166975796222687, "rewards/ngrams_iou_reward/std": 0.23051723837852478, "rewards/schema_keywords_iou_reward/mean": 0.7073403000831604, "rewards/schema_keywords_iou_reward/std": 0.1704857498407364, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709408044815063, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 228.984375, "completions/mean_terminated_length": 172.3386993408203, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 1.559796437659033, "frac_reward_zero_std": 0.03125, "grad_norm": 0.885811984539032, "kl": 0.05413818359375, "learning_rate": 5.551417494556375e-07, "loss": 0.0192, "num_tokens": 117908835.0, "reward": 9.94956111907959, "reward_std": 1.7084825038909912, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.2135416269302368, "rewards/judge_reward/std": 1.5523074865341187, "rewards/ngrams_iou_reward/mean": 0.22138237953186035, "rewards/ngrams_iou_reward/std": 0.27671927213668823, "rewards/schema_keywords_iou_reward/mean": 0.686511218547821, "rewards/schema_keywords_iou_reward/std": 0.19698430597782135, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 237.2760467529297, "completions/mean_terminated_length": 168.3170623779297, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 1.5631891433418152, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7824541330337524, "kl": 0.0528564453125, "learning_rate": 5.531800000391275e-07, "loss": 0.0104, "num_tokens": 118168130.0, "reward": 9.252311706542969, "reward_std": 1.495772361755371, "rewards/accuracy_reward/mean": 1.171875, "rewards/accuracy_reward/std": 1.4674979448318481, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3129251003265381, "rewards/judge_reward/mean": 1.6104167699813843, "rewards/judge_reward/std": 1.6550400257110596, "rewards/ngrams_iou_reward/mean": 0.21911007165908813, "rewards/ngrams_iou_reward/std": 0.2512621581554413, "rewards/schema_keywords_iou_reward/mean": 0.6977839469909668, "rewards/schema_keywords_iou_reward/std": 0.1796724945306778, "rewards/syntax_reward/mean": 0.7083333134651184, "rewards/syntax_reward/std": 0.4557179808616638, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 235.421875, "completions/mean_terminated_length": 170.10870361328125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 1.566581849024597, "frac_reward_zero_std": 0.0, "grad_norm": 0.8134412169456482, "kl": 0.0501708984375, "learning_rate": 5.512174222583066e-07, "loss": -0.0017, "num_tokens": 118432007.0, "reward": 9.585755348205566, "reward_std": 1.4887924194335938, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.3447917699813843, "rewards/judge_reward/std": 1.5701647996902466, "rewards/ngrams_iou_reward/mean": 0.19165687263011932, "rewards/ngrams_iou_reward/std": 0.21135663986206055, "rewards/schema_keywords_iou_reward/mean": 0.68888920545578, "rewards/schema_keywords_iou_reward/std": 0.18848295509815216, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 232.9635467529297, "completions/mean_terminated_length": 170.9423065185547, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 1.5699745547073791, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7847983837127686, "kl": 0.0518798828125, "learning_rate": 5.492540466834907e-07, "loss": 0.0101, "num_tokens": 118686934.0, "reward": 10.22634506225586, "reward_std": 1.2161917686462402, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.2374999523162842, "rewards/judge_reward/std": 1.5788688659667969, "rewards/ngrams_iou_reward/mean": 0.1691131442785263, "rewards/ngrams_iou_reward/std": 0.15276522934436798, "rewards/schema_keywords_iou_reward/mean": 0.7124392986297607, "rewards/schema_keywords_iou_reward/std": 0.16396118700504303, "rewards/syntax_reward/mean": 0.890625, "rewards/syntax_reward/std": 0.3129251003265381, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 235.9166717529297, "completions/mean_terminated_length": 164.1904754638672, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 1.5733672603901612, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8369552493095398, "kl": 0.0513916015625, "learning_rate": 5.472899038974225e-07, "loss": 0.0157, "num_tokens": 118929066.0, "reward": 9.900093078613281, "reward_std": 1.2153242826461792, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.1385416984558105, "rewards/judge_reward/std": 1.5063472986221313, "rewards/ngrams_iou_reward/mean": 0.2840511202812195, "rewards/ngrams_iou_reward/std": 0.3148219585418701, "rewards/schema_keywords_iou_reward/mean": 0.7035412788391113, "rewards/schema_keywords_iou_reward/std": 0.20392543077468872, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709408044815063, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 237.95834350585938, "completions/mean_terminated_length": 173.52381896972656, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 1.5767599660729432, "frac_reward_zero_std": 0.0, "grad_norm": 0.8054487705230713, "kl": 0.050048828125, "learning_rate": 5.45325024494795e-07, "loss": 0.0004, "num_tokens": 119170606.0, "reward": 9.57183837890625, "reward_std": 1.394579291343689, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.3666666746139526, "rewards/judge_reward/std": 1.5157577991485596, "rewards/ngrams_iou_reward/mean": 0.1627359837293625, "rewards/ngrams_iou_reward/std": 0.18018868565559387, "rewards/schema_keywords_iou_reward/mean": 0.6757686734199524, "rewards/schema_keywords_iou_reward/std": 0.1778784543275833, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 241.890625, "completions/mean_terminated_length": 173.9091033935547, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 1.5801526717557253, "frac_reward_zero_std": 0.0, "grad_norm": 0.8750158548355103, "kl": 0.0501708984375, "learning_rate": 5.433594390817755e-07, "loss": 0.0261, "num_tokens": 119421793.0, "reward": 10.043193817138672, "reward_std": 1.5201563835144043, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.2729166746139526, "rewards/judge_reward/std": 1.5501716136932373, "rewards/ngrams_iou_reward/mean": 0.20134766399860382, "rewards/ngrams_iou_reward/std": 0.2276075929403305, "rewards/schema_keywords_iou_reward/mean": 0.7178876996040344, "rewards/schema_keywords_iou_reward/std": 0.16876259446144104, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 236.59896850585938, "completions/mean_terminated_length": 178.39584350585938, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 1.583545377438507, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7619965076446533, "kl": 0.05242919921875, "learning_rate": 5.413931782755282e-07, "loss": -0.002, "num_tokens": 119683970.0, "reward": 10.616806030273438, "reward_std": 0.982151985168457, "rewards/accuracy_reward/mean": 1.796875, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.1927082538604736, "rewards/judge_reward/std": 1.5876977443695068, "rewards/ngrams_iou_reward/mean": 0.23575490713119507, "rewards/ngrams_iou_reward/std": 0.25259628891944885, "rewards/schema_keywords_iou_reward/mean": 0.7508421540260315, "rewards/schema_keywords_iou_reward/std": 0.15336264669895172, "rewards/syntax_reward/mean": 0.890625, "rewards/syntax_reward/std": 0.3129251003265381, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 230.6510467529297, "completions/mean_terminated_length": 165.87037658691406, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 1.5869380831212894, "frac_reward_zero_std": 0.0, "grad_norm": 0.7514991164207458, "kl": 0.056640625, "learning_rate": 5.394262727037381e-07, "loss": -0.0012, "num_tokens": 119948191.0, "reward": 9.863323211669922, "reward_std": 1.4490129947662354, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.480208396911621, "rewards/judge_reward/std": 1.6705718040466309, "rewards/ngrams_iou_reward/mean": 0.1639636754989624, "rewards/ngrams_iou_reward/std": 0.19722631573677063, "rewards/schema_keywords_iou_reward/mean": 0.6972751021385193, "rewards/schema_keywords_iou_reward/std": 0.18838760256767273, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 240.21875, "completions/mean_terminated_length": 169.42857360839844, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.5903307888040712, "frac_reward_zero_std": 0.0, "grad_norm": 0.7825528979301453, "kl": 0.04962158203125, "learning_rate": 5.374587530041334e-07, "loss": -0.003, "num_tokens": 120209563.0, "reward": 10.276716232299805, "reward_std": 1.360630750656128, "rewards/accuracy_reward/mean": 1.78125, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 1.1041666269302368, "rewards/judge_reward/std": 1.4784961938858032, "rewards/ngrams_iou_reward/mean": 0.19245362281799316, "rewards/ngrams_iou_reward/std": 0.18387366831302643, "rewards/schema_keywords_iou_reward/mean": 0.7092621922492981, "rewards/schema_keywords_iou_reward/std": 0.1631712019443512, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 238.21875, "completions/mean_terminated_length": 168.4615478515625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 1.5937234944868532, "frac_reward_zero_std": 0.0, "grad_norm": 0.8791813850402832, "kl": 0.0577392578125, "learning_rate": 5.354906498240079e-07, "loss": -0.013, "num_tokens": 120454741.0, "reward": 9.399682998657227, "reward_std": 1.4631707668304443, "rewards/accuracy_reward/mean": 1.203125, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.5343750715255737, "rewards/judge_reward/std": 1.5318145751953125, "rewards/ngrams_iou_reward/mean": 0.24229323863983154, "rewards/ngrams_iou_reward/std": 0.26008254289627075, "rewards/schema_keywords_iou_reward/mean": 0.72405606508255, "rewards/schema_keywords_iou_reward/std": 0.1884971410036087, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 240.40625, "completions/mean_terminated_length": 162.4375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 1.5971162001696353, "frac_reward_zero_std": 0.0, "grad_norm": 0.8217745423316956, "kl": 0.04949951171875, "learning_rate": 5.335219938197445e-07, "loss": 0.0089, "num_tokens": 120694525.0, "reward": 10.402630805969238, "reward_std": 1.0616471767425537, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.4187499284744263, "rewards/judge_reward/std": 1.7343051433563232, "rewards/ngrams_iou_reward/mean": 0.15434251725673676, "rewards/ngrams_iou_reward/std": 0.1668763905763626, "rewards/schema_keywords_iou_reward/mean": 0.7076621055603027, "rewards/schema_keywords_iou_reward/std": 0.1506708562374115, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374123275279999, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 240.5572967529297, "completions/mean_terminated_length": 173.63888549804688, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 1.6005089058524173, "frac_reward_zero_std": 0.0, "grad_norm": 0.8122285008430481, "kl": 0.0531005859375, "learning_rate": 5.315528156563367e-07, "loss": 0.0155, "num_tokens": 120944040.0, "reward": 10.422542572021484, "reward_std": 1.168042540550232, "rewards/accuracy_reward/mean": 1.703125, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.323958396911621, "rewards/judge_reward/std": 1.7484320402145386, "rewards/ngrams_iou_reward/mean": 0.18345873057842255, "rewards/ngrams_iou_reward/std": 0.1661568135023117, "rewards/schema_keywords_iou_reward/mean": 0.7109593749046326, "rewards/schema_keywords_iou_reward/std": 0.1620490550994873, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 231.91146850585938, "completions/mean_terminated_length": 176.2586212158203, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 1.6039016115351994, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8312649726867676, "kl": 0.064453125, "learning_rate": 5.295831460069124e-07, "loss": 0.0063, "num_tokens": 121215199.0, "reward": 9.4342041015625, "reward_std": 1.212465763092041, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.332291603088379, "rewards/judge_reward/std": 1.525833010673523, "rewards/ngrams_iou_reward/mean": 0.14634473621845245, "rewards/ngrams_iou_reward/std": 0.16089649498462677, "rewards/schema_keywords_iou_reward/mean": 0.6909832954406738, "rewards/schema_keywords_iou_reward/std": 0.17597796022891998, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.44271850585938, "completions/mean_terminated_length": 177.05999755859375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 1.6072943172179812, "frac_reward_zero_std": 0.0, "grad_norm": 0.8919988870620728, "kl": 0.060302734375, "learning_rate": 5.27613015552254e-07, "loss": 0.0034, "num_tokens": 121465232.0, "reward": 10.698417663574219, "reward_std": 0.9695321917533875, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.542708396911621, "rewards/judge_reward/std": 1.7425529956817627, "rewards/ngrams_iou_reward/mean": 0.25727489590644836, "rewards/ngrams_iou_reward/std": 0.2742845118045807, "rewards/schema_keywords_iou_reward/mean": 0.7619760036468506, "rewards/schema_keywords_iou_reward/std": 0.1546964794397354, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 226.4010467529297, "completions/mean_terminated_length": 158.01724243164062, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 1.6106870229007635, "frac_reward_zero_std": 0.0, "grad_norm": 0.8549332618713379, "kl": 0.05206298828125, "learning_rate": 5.256424549803227e-07, "loss": 0.0009, "num_tokens": 121731673.0, "reward": 10.637577056884766, "reward_std": 0.9440735578536987, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9895833134651184, "rewards/format_reward/std": 0.1017945408821106, "rewards/judge_reward/mean": 1.4854167699813843, "rewards/judge_reward/std": 1.6698733568191528, "rewards/ngrams_iou_reward/mean": 0.227607861161232, "rewards/ngrams_iou_reward/std": 0.24463143944740295, "rewards/schema_keywords_iou_reward/mean": 0.7360100150108337, "rewards/schema_keywords_iou_reward/std": 0.16069014370441437, "rewards/syntax_reward/mean": 0.9322916865348816, "rewards/syntax_reward/std": 0.2519015669822693, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 240.84375, "completions/mean_terminated_length": 179.42105102539062, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 1.6140797285835453, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7413836121559143, "kl": 0.05328369140625, "learning_rate": 5.236714949857791e-07, "loss": 0.0047, "num_tokens": 121978519.0, "reward": 9.94184684753418, "reward_std": 1.3996036052703857, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.521875023841858, "rewards/judge_reward/std": 1.7284623384475708, "rewards/ngrams_iou_reward/mean": 0.17471836507320404, "rewards/ngrams_iou_reward/std": 0.19275440275669098, "rewards/schema_keywords_iou_reward/mean": 0.6973360180854797, "rewards/schema_keywords_iou_reward/std": 0.1566951870918274, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 237.4791717529297, "completions/mean_terminated_length": 167.10000610351562, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 1.6174724342663274, "frac_reward_zero_std": 0.0, "grad_norm": 0.7764521241188049, "kl": 0.05255126953125, "learning_rate": 5.21700166269505e-07, "loss": 0.0123, "num_tokens": 122230641.0, "reward": 10.5496826171875, "reward_std": 1.05826997756958, "rewards/accuracy_reward/mean": 1.78125, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12434382736682892, "rewards/judge_reward/mean": 1.2416666746139526, "rewards/judge_reward/std": 1.642094612121582, "rewards/ngrams_iou_reward/mean": 0.17383001744747162, "rewards/ngrams_iou_reward/std": 0.15020357072353363, "rewards/schema_keywords_iou_reward/mean": 0.6841850280761719, "rewards/schema_keywords_iou_reward/std": 0.17755389213562012, "rewards/syntax_reward/mean": 0.8802083134651184, "rewards/syntax_reward/std": 0.32556670904159546, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 232.0260467529297, "completions/mean_terminated_length": 167.48077392578125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 1.6208651399491094, "frac_reward_zero_std": 0.03125, "grad_norm": 0.793605625629425, "kl": 0.06365966796875, "learning_rate": 5.197284995381264e-07, "loss": -0.0112, "num_tokens": 122469608.0, "reward": 9.654582977294922, "reward_std": 1.5184286832809448, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.524999976158142, "rewards/judge_reward/std": 1.6007195711135864, "rewards/ngrams_iou_reward/mean": 0.2055300623178482, "rewards/ngrams_iou_reward/std": 0.22393827140331268, "rewards/schema_keywords_iou_reward/mean": 0.7063444256782532, "rewards/schema_keywords_iou_reward/std": 0.1776086986064911, "rewards/syntax_reward/mean": 0.890625, "rewards/syntax_reward/std": 0.3129251003265381, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 234.28646850585938, "completions/mean_terminated_length": 172.6199951171875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 1.6242578456318915, "frac_reward_zero_std": 0.0, "grad_norm": 0.8134180307388306, "kl": 0.05810546875, "learning_rate": 5.17756525503534e-07, "loss": 0.0108, "num_tokens": 122711715.0, "reward": 10.157035827636719, "reward_std": 1.2475507259368896, "rewards/accuracy_reward/mean": 1.390625, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.6374999284744263, "rewards/judge_reward/std": 1.6560375690460205, "rewards/ngrams_iou_reward/mean": 0.14618130028247833, "rewards/ngrams_iou_reward/std": 0.11225701123476028, "rewards/schema_keywords_iou_reward/mean": 0.7098124623298645, "rewards/schema_keywords_iou_reward/std": 0.12204762548208237, "rewards/syntax_reward/mean": 0.8802083134651184, "rewards/syntax_reward/std": 0.32556667923927307, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 234.6354217529297, "completions/mean_terminated_length": 168.72340393066406, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 1.6276505513146735, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7907595634460449, "kl": 0.0550537109375, "learning_rate": 5.157842748824053e-07, "loss": 0.0074, "num_tokens": 122941055.0, "reward": 10.3005952835083, "reward_std": 1.1703009605407715, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.283333420753479, "rewards/judge_reward/std": 1.6462342739105225, "rewards/ngrams_iou_reward/mean": 0.23213498294353485, "rewards/ngrams_iou_reward/std": 0.25347504019737244, "rewards/schema_keywords_iou_reward/mean": 0.7205435633659363, "rewards/schema_keywords_iou_reward/std": 0.17047010362148285, "rewards/syntax_reward/mean": 0.7760416865348816, "rewards/syntax_reward/std": 0.41798436641693115, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 237.15625, "completions/mean_terminated_length": 175.60000610351562, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 1.6310432569974553, "frac_reward_zero_std": 0.0, "grad_norm": 0.8220237493515015, "kl": 0.05914306640625, "learning_rate": 5.138117783957261e-07, "loss": 0.015, "num_tokens": 123213071.0, "reward": 9.32929801940918, "reward_std": 1.792645812034607, "rewards/accuracy_reward/mean": 1.328125, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.4302083253860474, "rewards/judge_reward/std": 1.6081349849700928, "rewards/ngrams_iou_reward/mean": 0.1385653167963028, "rewards/ngrams_iou_reward/std": 0.08061164617538452, "rewards/schema_keywords_iou_reward/mean": 0.674065351486206, "rewards/schema_keywords_iou_reward/std": 0.17518319189548492, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 233.67709350585938, "completions/mean_terminated_length": 175.132080078125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 1.6344359626802376, "frac_reward_zero_std": 0.0, "grad_norm": 0.9354327321052551, "kl": 0.05938720703125, "learning_rate": 5.118390667683119e-07, "loss": -0.0245, "num_tokens": 123457299.0, "reward": 10.01407527923584, "reward_std": 1.3033559322357178, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.4385417699813843, "rewards/judge_reward/std": 1.6932183504104614, "rewards/ngrams_iou_reward/mean": 0.14365848898887634, "rewards/ngrams_iou_reward/std": 0.14937865734100342, "rewards/schema_keywords_iou_reward/mean": 0.706874668598175, "rewards/schema_keywords_iou_reward/std": 0.15604569017887115, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 231.88021850585938, "completions/mean_terminated_length": 166.9423065185547, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.6378286683630194, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7796295881271362, "kl": 0.0595703125, "learning_rate": 5.098661707283297e-07, "loss": -0.0042, "num_tokens": 123728578.0, "reward": 9.739723205566406, "reward_std": 1.1870090961456299, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.4375, "rewards/judge_reward/std": 1.630292296409607, "rewards/ngrams_iou_reward/mean": 0.235450878739357, "rewards/ngrams_iou_reward/std": 0.26103368401527405, "rewards/schema_keywords_iou_reward/mean": 0.7021889686584473, "rewards/schema_keywords_iou_reward/std": 0.17985379695892334, "rewards/syntax_reward/mean": 0.6875, "rewards/syntax_reward/std": 0.46472418308258057, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 236.140625, "completions/mean_terminated_length": 176.5625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 1.6412213740458015, "frac_reward_zero_std": 0.0, "grad_norm": 0.8506251573562622, "kl": 0.05670166015625, "learning_rate": 5.078931210068185e-07, "loss": 0.0112, "num_tokens": 123970915.0, "reward": 9.54675006866455, "reward_std": 1.5989965200424194, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.2104166746139526, "rewards/judge_reward/std": 1.4673749208450317, "rewards/ngrams_iou_reward/mean": 0.21967709064483643, "rewards/ngrams_iou_reward/std": 0.25541412830352783, "rewards/schema_keywords_iou_reward/mean": 0.7031142115592957, "rewards/schema_keywords_iou_reward/std": 0.19691739976406097, "rewards/syntax_reward/mean": 0.7552083134651184, "rewards/syntax_reward/std": 0.4310877025127411, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 235.2760467529297, "completions/mean_terminated_length": 171.34042358398438, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.6446140797285835, "frac_reward_zero_std": 0.0, "grad_norm": 0.7923882007598877, "kl": 0.05853271484375, "learning_rate": 5.059199483372114e-07, "loss": 0.018, "num_tokens": 124236144.0, "reward": 9.64492416381836, "reward_std": 1.700652837753296, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33158352971076965, "rewards/judge_reward/mean": 1.1802083253860474, "rewards/judge_reward/std": 1.5060867071151733, "rewards/ngrams_iou_reward/mean": 0.1712794452905655, "rewards/ngrams_iou_reward/std": 0.19095496833324432, "rewards/schema_keywords_iou_reward/mean": 0.6392694115638733, "rewards/schema_keywords_iou_reward/std": 0.1992378681898117, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709408044815063, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 235.0, "completions/mean_terminated_length": 172.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 1.6480067854113656, "frac_reward_zero_std": 0.0, "grad_norm": 0.7851933240890503, "kl": 0.05755615234375, "learning_rate": 5.039466834548567e-07, "loss": 0.0158, "num_tokens": 124495770.0, "reward": 10.18069839477539, "reward_std": 0.9787673950195312, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.1166666746139526, "rewards/judge_reward/std": 1.4885717630386353, "rewards/ngrams_iou_reward/mean": 0.20537878572940826, "rewards/ngrams_iou_reward/std": 0.20882344245910645, "rewards/schema_keywords_iou_reward/mean": 0.7315691113471985, "rewards/schema_keywords_iou_reward/std": 0.15648913383483887, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 235.703125, "completions/mean_terminated_length": 173.08509826660156, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 1.6513994910941476, "frac_reward_zero_std": 0.0, "grad_norm": 0.7639694213867188, "kl": 0.05712890625, "learning_rate": 5.019733570965387e-07, "loss": 0.0017, "num_tokens": 124758375.0, "reward": 9.765227317810059, "reward_std": 1.2945455312728882, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.6781249046325684, "rewards/judge_reward/std": 1.7100688219070435, "rewards/ngrams_iou_reward/mean": 0.1400153487920761, "rewards/ngrams_iou_reward/std": 0.15062859654426575, "rewards/schema_keywords_iou_reward/mean": 0.6752117276191711, "rewards/schema_keywords_iou_reward/std": 0.17352531850337982, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 235.1197967529297, "completions/mean_terminated_length": 175.8199920654297, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 1.6547921967769295, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8694849610328674, "kl": 0.059814453125, "learning_rate": 5e-07, "loss": -0.0214, "num_tokens": 125007668.0, "reward": 10.797298431396484, "reward_std": 1.076805830001831, "rewards/accuracy_reward/mean": 1.890625, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.2020833492279053, "rewards/judge_reward/std": 1.6616384983062744, "rewards/ngrams_iou_reward/mean": 0.22414664924144745, "rewards/ngrams_iou_reward/std": 0.27510541677474976, "rewards/schema_keywords_iou_reward/mean": 0.721067488193512, "rewards/schema_keywords_iou_reward/std": 0.21209512650966644, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.5572967529297, "completions/mean_terminated_length": 170.6727294921875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 1.6581849024597117, "frac_reward_zero_std": 0.0, "grad_norm": 0.8306647539138794, "kl": 0.05657958984375, "learning_rate": 4.980266429034612e-07, "loss": 0.0144, "num_tokens": 125254447.0, "reward": 10.073958396911621, "reward_std": 1.3408551216125488, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.5697916746139526, "rewards/judge_reward/std": 1.7701786756515503, "rewards/ngrams_iou_reward/mean": 0.16331028938293457, "rewards/ngrams_iou_reward/std": 0.1699792742729187, "rewards/schema_keywords_iou_reward/mean": 0.7137724757194519, "rewards/schema_keywords_iou_reward/std": 0.17190244793891907, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 236.015625, "completions/mean_terminated_length": 174.3616943359375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 1.6615776081424936, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7867283821105957, "kl": 0.0572509765625, "learning_rate": 4.960533165451435e-07, "loss": 0.0013, "num_tokens": 125495674.0, "reward": 9.658001899719238, "reward_std": 1.5718005895614624, "rewards/accuracy_reward/mean": 1.25, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.6843751668930054, "rewards/judge_reward/std": 1.728288173675537, "rewards/ngrams_iou_reward/mean": 0.15771985054016113, "rewards/ngrams_iou_reward/std": 0.15422523021697998, "rewards/schema_keywords_iou_reward/mean": 0.6836150288581848, "rewards/schema_keywords_iou_reward/std": 0.18469463288784027, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 241.390625, "completions/mean_terminated_length": 185.875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 1.6649703138252756, "frac_reward_zero_std": 0.0, "grad_norm": 1.1122207641601562, "kl": 0.05615234375, "learning_rate": 4.940800516627885e-07, "loss": -0.0176, "num_tokens": 125733469.0, "reward": 9.81690502166748, "reward_std": 1.0689330101013184, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.5572916269302368, "rewards/judge_reward/std": 1.6821978092193604, "rewards/ngrams_iou_reward/mean": 0.16263875365257263, "rewards/ngrams_iou_reward/std": 0.17680779099464417, "rewards/schema_keywords_iou_reward/mean": 0.680306613445282, "rewards/schema_keywords_iou_reward/std": 0.15889202058315277, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709405064582825, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 230.0260467529297, "completions/mean_terminated_length": 180.43939208984375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 1.6683630195080577, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8493671417236328, "kl": 0.06121826171875, "learning_rate": 4.921068789931816e-07, "loss": 0.0214, "num_tokens": 126011304.0, "reward": 10.570619583129883, "reward_std": 1.207798719406128, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.384374976158142, "rewards/judge_reward/std": 1.7028357982635498, "rewards/ngrams_iou_reward/mean": 0.15435783565044403, "rewards/ngrams_iou_reward/std": 0.12327472120523453, "rewards/schema_keywords_iou_reward/mean": 0.7204289436340332, "rewards/schema_keywords_iou_reward/std": 0.15574705600738525, "rewards/syntax_reward/mean": 0.890625, "rewards/syntax_reward/std": 0.3129251003265381, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 233.203125, "completions/mean_terminated_length": 166.6734619140625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 1.6717557251908397, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7077060341835022, "kl": 0.05615234375, "learning_rate": 4.901338292716703e-07, "loss": -0.0028, "num_tokens": 126273699.0, "reward": 9.612001419067383, "reward_std": 1.2469643354415894, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.558333396911621, "rewards/judge_reward/std": 1.655178427696228, "rewards/ngrams_iou_reward/mean": 0.22138379514217377, "rewards/ngrams_iou_reward/std": 0.23354759812355042, "rewards/schema_keywords_iou_reward/mean": 0.701033890247345, "rewards/schema_keywords_iou_reward/std": 0.18977965414524078, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 238.94271850585938, "completions/mean_terminated_length": 172.02565002441406, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 1.6751484308736218, "frac_reward_zero_std": 0.0, "grad_norm": 0.8318282961845398, "kl": 0.05645751953125, "learning_rate": 4.881609332316881e-07, "loss": 0.0181, "num_tokens": 126530542.0, "reward": 10.2625150680542, "reward_std": 1.578536033630371, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.3093750476837158, "rewards/judge_reward/std": 1.5989248752593994, "rewards/ngrams_iou_reward/mean": 0.20816759765148163, "rewards/ngrams_iou_reward/std": 0.233770951628685, "rewards/schema_keywords_iou_reward/mean": 0.7220552563667297, "rewards/schema_keywords_iou_reward/std": 0.17210054397583008, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.6979217529297, "completions/mean_terminated_length": 174.14035034179688, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.6785411365564036, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8184838891029358, "kl": 0.05596923828125, "learning_rate": 4.86188221604274e-07, "loss": 0.0148, "num_tokens": 126794466.0, "reward": 10.94813346862793, "reward_std": 1.3493832349777222, "rewards/accuracy_reward/mean": 2.171875, "rewards/accuracy_reward/std": 1.3446191549301147, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 0.8333333134651184, "rewards/judge_reward/std": 1.480531096458435, "rewards/ngrams_iou_reward/mean": 0.18445120751857758, "rewards/ngrams_iou_reward/std": 0.19006936252117157, "rewards/schema_keywords_iou_reward/mean": 0.7115979790687561, "rewards/schema_keywords_iou_reward/std": 0.1577197015285492, "rewards/syntax_reward/mean": 0.8958333134651184, "rewards/syntax_reward/std": 0.30627527832984924, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 231.83334350585938, "completions/mean_terminated_length": 173.1428680419922, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.6819338422391859, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8394675254821777, "kl": 0.0621337890625, "learning_rate": 4.842157251175947e-07, "loss": 0.0111, "num_tokens": 127063324.0, "reward": 9.548568725585938, "reward_std": 1.8403606414794922, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300665616989136, "rewards/judge_reward/mean": 1.0854166746139526, "rewards/judge_reward/std": 1.4614757299423218, "rewards/ngrams_iou_reward/mean": 0.1759636253118515, "rewards/ngrams_iou_reward/std": 0.19716835021972656, "rewards/schema_keywords_iou_reward/mean": 0.670521080493927, "rewards/schema_keywords_iou_reward/std": 0.20285938680171967, "rewards/syntax_reward/mean": 0.7604166865348816, "rewards/syntax_reward/std": 0.4279450476169586, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 228.66146850585938, "completions/mean_terminated_length": 167.03390502929688, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 1.6853265479219677, "frac_reward_zero_std": 0.0, "grad_norm": 0.8188462853431702, "kl": 0.06231689453125, "learning_rate": 4.82243474496466e-07, "loss": -0.0004, "num_tokens": 127318475.0, "reward": 9.970623016357422, "reward_std": 1.3782174587249756, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.28125, "rewards/judge_reward/std": 1.6015903949737549, "rewards/ngrams_iou_reward/mean": 0.21664293110370636, "rewards/ngrams_iou_reward/std": 0.21060557663440704, "rewards/schema_keywords_iou_reward/mean": 0.7227299809455872, "rewards/schema_keywords_iou_reward/std": 0.17350181937217712, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 241.64584350585938, "completions/mean_terminated_length": 172.48484802246094, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 1.68871925360475, "frac_reward_zero_std": 0.0, "grad_norm": 0.8512357473373413, "kl": 0.05584716796875, "learning_rate": 4.802715004618737e-07, "loss": 0.0092, "num_tokens": 127561677.0, "reward": 9.955495834350586, "reward_std": 1.540293574333191, "rewards/accuracy_reward/mean": 1.875, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 0.8395833373069763, "rewards/judge_reward/std": 1.3600491285324097, "rewards/ngrams_iou_reward/mean": 0.18942688405513763, "rewards/ngrams_iou_reward/std": 0.21103106439113617, "rewards/schema_keywords_iou_reward/mean": 0.6806510090827942, "rewards/schema_keywords_iou_reward/std": 0.21203066408634186, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 237.7760467529297, "completions/mean_terminated_length": 170.6585235595703, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 1.6921119592875318, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7337601184844971, "kl": 0.0518798828125, "learning_rate": 4.78299833730495e-07, "loss": 0.0141, "num_tokens": 127813262.0, "reward": 10.212045669555664, "reward_std": 1.0275311470031738, "rewards/accuracy_reward/mean": 1.734375, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.140625, "rewards/judge_reward/std": 1.5550005435943604, "rewards/ngrams_iou_reward/mean": 0.22073711454868317, "rewards/ngrams_iou_reward/std": 0.2387687712907791, "rewards/schema_keywords_iou_reward/mean": 0.7361001372337341, "rewards/schema_keywords_iou_reward/std": 0.16102685034275055, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709405064582825, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 239.0260467529297, "completions/mean_terminated_length": 180.2093048095703, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 1.6955046649703138, "frac_reward_zero_std": 0.0, "grad_norm": 0.7828051447868347, "kl": 0.05633544921875, "learning_rate": 4.7632850501422106e-07, "loss": 0.0058, "num_tokens": 128076229.0, "reward": 9.816642761230469, "reward_std": 1.4067180156707764, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.3218750953674316, "rewards/judge_reward/std": 1.604816198348999, "rewards/ngrams_iou_reward/mean": 0.1534169763326645, "rewards/ngrams_iou_reward/std": 0.14540016651153564, "rewards/schema_keywords_iou_reward/mean": 0.7121837735176086, "rewards/schema_keywords_iou_reward/std": 0.17305178940296173, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374123275279999, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 237.5729217529297, "completions/mean_terminated_length": 169.70730590820312, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 1.6988973706530959, "frac_reward_zero_std": 0.0, "grad_norm": 0.7883481383323669, "kl": 0.0523681640625, "learning_rate": 4.743575450196772e-07, "loss": 0.0063, "num_tokens": 128322579.0, "reward": 10.137350082397461, "reward_std": 1.4041990041732788, "rewards/accuracy_reward/mean": 1.875, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 0.890625, "rewards/judge_reward/std": 1.3650519847869873, "rewards/ngrams_iou_reward/mean": 0.21396426856517792, "rewards/ngrams_iou_reward/std": 0.2443198710680008, "rewards/schema_keywords_iou_reward/mean": 0.7150521278381348, "rewards/schema_keywords_iou_reward/std": 0.18307343125343323, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 239.1666717529297, "completions/mean_terminated_length": 182.5454559326172, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 1.7022900763358777, "frac_reward_zero_std": 0.0, "grad_norm": 0.8127928972244263, "kl": 0.05303955078125, "learning_rate": 4.7238698444774593e-07, "loss": 0.0241, "num_tokens": 128580017.0, "reward": 9.775516510009766, "reward_std": 1.8370099067687988, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.2625000476837158, "rewards/judge_reward/std": 1.530920386314392, "rewards/ngrams_iou_reward/mean": 0.19897471368312836, "rewards/ngrams_iou_reward/std": 0.2434396892786026, "rewards/schema_keywords_iou_reward/mean": 0.6869580745697021, "rewards/schema_keywords_iou_reward/std": 0.190422922372818, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709408044815063, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 235.65625, "completions/mean_terminated_length": 163.0, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 1.70568278201866, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7824251055717468, "kl": 0.051513671875, "learning_rate": 4.704168539930877e-07, "loss": 0.0176, "num_tokens": 128830961.0, "reward": 10.07884407043457, "reward_std": 1.3418571949005127, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967556834220886, "rewards/judge_reward/mean": 1.5572916269302368, "rewards/judge_reward/std": 1.665432095527649, "rewards/ngrams_iou_reward/mean": 0.23771290481090546, "rewards/ngrams_iou_reward/std": 0.27657195925712585, "rewards/schema_keywords_iou_reward/mean": 0.7057144045829773, "rewards/schema_keywords_iou_reward/std": 0.18514132499694824, "rewards/syntax_reward/mean": 0.921875, "rewards/syntax_reward/std": 0.2690697908401489, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 233.890625, "completions/mean_terminated_length": 154.92857360839844, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.7090754877014418, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9031952619552612, "kl": 0.04852294921875, "learning_rate": 4.684471843436633e-07, "loss": -0.0058, "num_tokens": 129095516.0, "reward": 10.111079216003418, "reward_std": 1.4079439640045166, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.1572916507720947, "rewards/judge_reward/std": 1.5261932611465454, "rewards/ngrams_iou_reward/mean": 0.22162286937236786, "rewards/ngrams_iou_reward/std": 0.26269444823265076, "rewards/schema_keywords_iou_reward/mean": 0.6894555687904358, "rewards/schema_keywords_iou_reward/std": 0.188664510846138, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 240.859375, "completions/mean_terminated_length": 165.15625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 1.712468193384224, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7873842716217041, "kl": 0.0513916015625, "learning_rate": 4.664780061802557e-07, "loss": -0.0064, "num_tokens": 129334577.0, "reward": 10.089933395385742, "reward_std": 1.2275562286376953, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.4520832300186157, "rewards/judge_reward/std": 1.710391879081726, "rewards/ngrams_iou_reward/mean": 0.17931103706359863, "rewards/ngrams_iou_reward/std": 0.19553866982460022, "rewards/schema_keywords_iou_reward/mean": 0.7304131984710693, "rewards/schema_keywords_iou_reward/std": 0.1627102643251419, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 234.03646850585938, "completions/mean_terminated_length": 173.31373596191406, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 1.715860899067006, "frac_reward_zero_std": 0.03125, "grad_norm": 2.2527873516082764, "kl": 0.050537109375, "learning_rate": 4.6450935017599195e-07, "loss": -0.0048, "num_tokens": 129607584.0, "reward": 8.728326797485352, "reward_std": 1.4451122283935547, "rewards/accuracy_reward/mean": 0.90625, "rewards/accuracy_reward/std": 1.3810847997665405, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319954812526703, "rewards/judge_reward/mean": 1.7333332300186157, "rewards/judge_reward/std": 1.4779826402664185, "rewards/ngrams_iou_reward/mean": 0.135198712348938, "rewards/ngrams_iou_reward/std": 0.1768587827682495, "rewards/schema_keywords_iou_reward/mean": 0.6681268811225891, "rewards/schema_keywords_iou_reward/std": 0.15182089805603027, "rewards/syntax_reward/mean": 0.7604166865348816, "rewards/syntax_reward/std": 0.427945077419281, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 240.1510467529297, "completions/mean_terminated_length": 173.7567596435547, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 1.719253604749788, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8227330446243286, "kl": 0.05279541015625, "learning_rate": 4.625412469958665e-07, "loss": 0.013, "num_tokens": 129838967.0, "reward": 10.319351196289062, "reward_std": 1.3987617492675781, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 1.4432151317596436, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 0.9395833015441895, "rewards/judge_reward/std": 1.466304063796997, "rewards/ngrams_iou_reward/mean": 0.1668042093515396, "rewards/ngrams_iou_reward/std": 0.17185215651988983, "rewards/schema_keywords_iou_reward/mean": 0.7004637718200684, "rewards/schema_keywords_iou_reward/std": 0.17196843028068542, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 240.55209350585938, "completions/mean_terminated_length": 157.1333465576172, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 1.72264631043257, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8765036463737488, "kl": 0.060302734375, "learning_rate": 4.6057372729626176e-07, "loss": -0.0062, "num_tokens": 130080531.0, "reward": 10.395023345947266, "reward_std": 1.5741863250732422, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.3802083730697632, "rewards/judge_reward/std": 1.7310904264450073, "rewards/ngrams_iou_reward/mean": 0.21188396215438843, "rewards/ngrams_iou_reward/std": 0.23922018706798553, "rewards/schema_keywords_iou_reward/mean": 0.7195977568626404, "rewards/schema_keywords_iou_reward/std": 0.180233895778656, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 240.5885467529297, "completions/mean_terminated_length": 163.53125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 1.726039016115352, "frac_reward_zero_std": 0.0, "grad_norm": 0.7822827100753784, "kl": 0.048583984375, "learning_rate": 4.586068217244718e-07, "loss": -0.0147, "num_tokens": 130324730.0, "reward": 9.68255615234375, "reward_std": 1.4864578247070312, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.1770833730697632, "rewards/judge_reward/std": 1.5163664817810059, "rewards/ngrams_iou_reward/mean": 0.149819016456604, "rewards/ngrams_iou_reward/std": 0.11998151242733002, "rewards/schema_keywords_iou_reward/mean": 0.6316947340965271, "rewards/schema_keywords_iou_reward/std": 0.18305604159832, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 244.23959350585938, "completions/mean_terminated_length": 175.35714721679688, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 1.729431721798134, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7707465291023254, "kl": 0.05523681640625, "learning_rate": 4.5664056091822465e-07, "loss": 0.0026, "num_tokens": 130560966.0, "reward": 9.827753067016602, "reward_std": 1.4919066429138184, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.8697916865348816, "rewards/format_reward/std": 0.3374122977256775, "rewards/judge_reward/mean": 1.243749976158142, "rewards/judge_reward/std": 1.5676581859588623, "rewards/ngrams_iou_reward/mean": 0.24742339551448822, "rewards/ngrams_iou_reward/std": 0.28848564624786377, "rewards/schema_keywords_iou_reward/mean": 0.7126212120056152, "rewards/schema_keywords_iou_reward/std": 0.20607835054397583, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 236.59375, "completions/mean_terminated_length": 160.4615478515625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 1.732824427480916, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8311086297035217, "kl": 0.05133056640625, "learning_rate": 4.5467497550520505e-07, "loss": 0.0141, "num_tokens": 130813008.0, "reward": 10.08024787902832, "reward_std": 1.0589572191238403, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.2145832777023315, "rewards/judge_reward/std": 1.572471261024475, "rewards/ngrams_iou_reward/mean": 0.23952703177928925, "rewards/ngrams_iou_reward/std": 0.27794504165649414, "rewards/schema_keywords_iou_reward/mean": 0.7344700694084167, "rewards/schema_keywords_iou_reward/std": 0.17845210433006287, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 245.31771850585938, "completions/mean_terminated_length": 177.11538696289062, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 1.7362171331636982, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7719337344169617, "kl": 0.0513916015625, "learning_rate": 4.5271009610257756e-07, "loss": 0.0063, "num_tokens": 131054275.0, "reward": 10.470499038696289, "reward_std": 1.2431831359863281, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.4791666269302368, "rewards/judge_reward/std": 1.7828688621520996, "rewards/ngrams_iou_reward/mean": 0.16453148424625397, "rewards/ngrams_iou_reward/std": 0.1614513099193573, "rewards/schema_keywords_iou_reward/mean": 0.7278425097465515, "rewards/schema_keywords_iou_reward/std": 0.14254911243915558, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 236.984375, "completions/mean_terminated_length": 169.07142639160156, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 1.73960983884648, "frac_reward_zero_std": 0.0, "grad_norm": 0.803892970085144, "kl": 0.05206298828125, "learning_rate": 4.507459533165093e-07, "loss": -0.0096, "num_tokens": 131302348.0, "reward": 10.063722610473633, "reward_std": 1.1659764051437378, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.4322916269302368, "rewards/judge_reward/std": 1.6517796516418457, "rewards/ngrams_iou_reward/mean": 0.19942283630371094, "rewards/ngrams_iou_reward/std": 0.20458941161632538, "rewards/schema_keywords_iou_reward/mean": 0.7497158050537109, "rewards/schema_keywords_iou_reward/std": 0.14692586660385132, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.06771850585938, "completions/mean_terminated_length": 173.97958374023438, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 1.743002544529262, "frac_reward_zero_std": 0.0, "grad_norm": 0.8062900304794312, "kl": 0.0504150390625, "learning_rate": 4.4878257774169345e-07, "loss": 0.0024, "num_tokens": 131550557.0, "reward": 10.021085739135742, "reward_std": 1.742682933807373, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 1.4229167699813843, "rewards/judge_reward/std": 1.6295361518859863, "rewards/ngrams_iou_reward/mean": 0.21204252541065216, "rewards/ngrams_iou_reward/std": 0.2620553970336914, "rewards/schema_keywords_iou_reward/mean": 0.7132088541984558, "rewards/schema_keywords_iou_reward/std": 0.18502387404441833, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 235.50521850585938, "completions/mean_terminated_length": 170.45652770996094, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 1.7463952502120441, "frac_reward_zero_std": 0.0, "grad_norm": 0.8239884376525879, "kl": 0.05377197265625, "learning_rate": 4.4681999996087266e-07, "loss": 0.0096, "num_tokens": 131803152.0, "reward": 9.786706924438477, "reward_std": 1.6163578033447266, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.6260417699813843, "rewards/judge_reward/std": 1.717281699180603, "rewards/ngrams_iou_reward/mean": 0.1565200388431549, "rewards/ngrams_iou_reward/std": 0.1793343871831894, "rewards/schema_keywords_iou_reward/mean": 0.6906032562255859, "rewards/schema_keywords_iou_reward/std": 0.17919224500656128, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.4071781635284424, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 234.875, "completions/mean_terminated_length": 161.67442321777344, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 1.7497879558948262, "frac_reward_zero_std": 0.0, "grad_norm": 0.7857974767684937, "kl": 0.04833984375, "learning_rate": 4.4485825054436243e-07, "loss": -0.0043, "num_tokens": 132051642.0, "reward": 9.691580772399902, "reward_std": 1.1743910312652588, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.3781250715255737, "rewards/judge_reward/std": 1.5329335927963257, "rewards/ngrams_iou_reward/mean": 0.21094931662082672, "rewards/ngrams_iou_reward/std": 0.25394973158836365, "rewards/schema_keywords_iou_reward/mean": 0.6931303143501282, "rewards/schema_keywords_iou_reward/std": 0.19199304282665253, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.09375, "completions/mean_terminated_length": 169.13636779785156, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.7531806615776082, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8785597085952759, "kl": 0.05474853515625, "learning_rate": 4.4289736004957576e-07, "loss": -0.0028, "num_tokens": 132297468.0, "reward": 10.475859642028809, "reward_std": 1.1426814794540405, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.5031248331069946, "rewards/judge_reward/std": 1.7362143993377686, "rewards/ngrams_iou_reward/mean": 0.23745660483837128, "rewards/ngrams_iou_reward/std": 0.28248804807662964, "rewards/schema_keywords_iou_reward/mean": 0.7321522831916809, "rewards/schema_keywords_iou_reward/std": 0.19481249153614044, "rewards/syntax_reward/mean": 0.8854166865348816, "rewards/syntax_reward/std": 0.3193511664867401, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 233.3854217529297, "completions/mean_terminated_length": 174.07546997070312, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 1.75657336726039, "frac_reward_zero_std": 0.03125, "grad_norm": 0.864059567451477, "kl": 0.05670166015625, "learning_rate": 4.4093735902054603e-07, "loss": 0.0258, "num_tokens": 132558164.0, "reward": 10.389656066894531, "reward_std": 1.3049941062927246, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.4520832300186157, "rewards/judge_reward/std": 1.7147941589355469, "rewards/ngrams_iou_reward/mean": 0.14816580712795258, "rewards/ngrams_iou_reward/std": 0.11317811161279678, "rewards/schema_keywords_iou_reward/mean": 0.7383651733398438, "rewards/schema_keywords_iou_reward/std": 0.1451757550239563, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 232.890625, "completions/mean_terminated_length": 163.5625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 1.7599660729431723, "frac_reward_zero_std": 0.0, "grad_norm": 0.942065417766571, "kl": 0.05499267578125, "learning_rate": 4.389782779874518e-07, "loss": -0.0047, "num_tokens": 132804659.0, "reward": 9.987112998962402, "reward_std": 1.4597673416137695, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.1843749284744263, "rewards/judge_reward/std": 1.5217671394348145, "rewards/ngrams_iou_reward/mean": 0.17140676081180573, "rewards/ngrams_iou_reward/std": 0.1855248361825943, "rewards/schema_keywords_iou_reward/mean": 0.6813305020332336, "rewards/schema_keywords_iou_reward/std": 0.18148034811019897, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 237.9322967529297, "completions/mean_terminated_length": 180.5869598388672, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 1.7633587786259541, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7888087034225464, "kl": 0.0501708984375, "learning_rate": 4.3702014746614127e-07, "loss": 0.0167, "num_tokens": 133054240.0, "reward": 9.834753036499023, "reward_std": 1.398454189300537, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.2114583253860474, "rewards/judge_reward/std": 1.4943081140518188, "rewards/ngrams_iou_reward/mean": 0.21710877120494843, "rewards/ngrams_iou_reward/std": 0.2668655216693878, "rewards/schema_keywords_iou_reward/mean": 0.6895191669464111, "rewards/schema_keywords_iou_reward/std": 0.2135353535413742, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 241.9010467529297, "completions/mean_terminated_length": 178.6571502685547, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 1.7667514843087362, "frac_reward_zero_std": 0.03125, "grad_norm": 0.822318434715271, "kl": 0.0491943359375, "learning_rate": 4.350629979576569e-07, "loss": 0.0036, "num_tokens": 133300797.0, "reward": 9.475744247436523, "reward_std": 1.2123793363571167, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.6963540315628052, "rewards/judge_reward/std": 1.672987461090088, "rewards/ngrams_iou_reward/mean": 0.19386857748031616, "rewards/ngrams_iou_reward/std": 0.21532444655895233, "rewards/schema_keywords_iou_reward/mean": 0.7068741917610168, "rewards/schema_keywords_iou_reward/std": 0.19086898863315582, "rewards/syntax_reward/mean": 0.71875, "rewards/syntax_reward/std": 0.4507846534252167, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.125, "completions/mean_terminated_length": 183.99998474121094, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 1.7701441899915182, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8858991861343384, "kl": 0.05322265625, "learning_rate": 4.331068599477605e-07, "loss": 0.0203, "num_tokens": 133570311.0, "reward": 9.551031112670898, "reward_std": 1.3334341049194336, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.4510416984558105, "rewards/judge_reward/std": 1.585437297821045, "rewards/ngrams_iou_reward/mean": 0.18341119587421417, "rewards/ngrams_iou_reward/std": 0.20227739214897156, "rewards/schema_keywords_iou_reward/mean": 0.6999104022979736, "rewards/schema_keywords_iou_reward/std": 0.1769954264163971, "rewards/syntax_reward/mean": 0.7552083134651184, "rewards/syntax_reward/std": 0.4310877025127411, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 238.97396850585938, "completions/mean_terminated_length": 181.7045440673828, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 1.7735368956743003, "frac_reward_zero_std": 0.0, "grad_norm": 0.8027633428573608, "kl": 0.05517578125, "learning_rate": 4.3115176390645774e-07, "loss": 0.0104, "num_tokens": 133827658.0, "reward": 9.674078941345215, "reward_std": 1.2857956886291504, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.350000023841858, "rewards/judge_reward/std": 1.5686722993850708, "rewards/ngrams_iou_reward/mean": 0.13504017889499664, "rewards/ngrams_iou_reward/std": 0.08796059340238571, "rewards/schema_keywords_iou_reward/mean": 0.6619547009468079, "rewards/schema_keywords_iou_reward/std": 0.18893355131149292, "rewards/syntax_reward/mean": 0.8802083134651184, "rewards/syntax_reward/std": 0.32556670904159546, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 225.09896850585938, "completions/mean_terminated_length": 170.01449584960938, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 1.7769296013570823, "frac_reward_zero_std": 0.0, "grad_norm": 0.8719081282615662, "kl": 0.0531005859375, "learning_rate": 4.291977402875243e-07, "loss": 0.0068, "num_tokens": 134103617.0, "reward": 10.166086196899414, "reward_std": 1.8151969909667969, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.2989583015441895, "rewards/judge_reward/std": 1.6506381034851074, "rewards/ngrams_iou_reward/mean": 0.1499747931957245, "rewards/ngrams_iou_reward/std": 0.14633993804454803, "rewards/schema_keywords_iou_reward/mean": 0.6942350268363953, "rewards/schema_keywords_iou_reward/std": 0.17321646213531494, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 232.50521850585938, "completions/mean_terminated_length": 172.4629669189453, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 1.7803223070398642, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9283246994018555, "kl": 0.05389404296875, "learning_rate": 4.2724481952803093e-07, "loss": 0.0217, "num_tokens": 134359236.0, "reward": 10.991645812988281, "reward_std": 0.8088976740837097, "rewards/accuracy_reward/mean": 1.953125, "rewards/accuracy_reward/std": 1.433660626411438, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967556834220886, "rewards/judge_reward/mean": 1.1645833253860474, "rewards/judge_reward/std": 1.6679284572601318, "rewards/ngrams_iou_reward/mean": 0.1579105406999588, "rewards/ngrams_iou_reward/std": 0.1834704726934433, "rewards/schema_keywords_iou_reward/mean": 0.7076938152313232, "rewards/schema_keywords_iou_reward/std": 0.1462407261133194, "rewards/syntax_reward/mean": 0.9635416865348816, "rewards/syntax_reward/std": 0.18791763484477997, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 227.83334350585938, "completions/mean_terminated_length": 165.86666870117188, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.7837150127226464, "frac_reward_zero_std": 0.0, "grad_norm": 0.8403437733650208, "kl": 0.04931640625, "learning_rate": 4.252930320478695e-07, "loss": 0.0049, "num_tokens": 134619850.0, "reward": 10.284928321838379, "reward_std": 1.3295485973358154, "rewards/accuracy_reward/mean": 2.09375, "rewards/accuracy_reward/std": 1.3810847997665405, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 0.7020833492279053, "rewards/judge_reward/std": 1.2726670503616333, "rewards/ngrams_iou_reward/mean": 0.12874414026737213, "rewards/ngrams_iou_reward/std": 0.10776354372501373, "rewards/schema_keywords_iou_reward/mean": 0.679100513458252, "rewards/schema_keywords_iou_reward/std": 0.16453471779823303, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 227.046875, "completions/mean_terminated_length": 174.25, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.7871077184054283, "frac_reward_zero_std": 0.0, "grad_norm": 0.7681349515914917, "kl": 0.05487060546875, "learning_rate": 4.233424082492797e-07, "loss": 0.0036, "num_tokens": 134889769.0, "reward": 10.09354019165039, "reward_std": 1.1325361728668213, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.419791579246521, "rewards/judge_reward/std": 1.6470260620117188, "rewards/ngrams_iou_reward/mean": 0.1398942470550537, "rewards/ngrams_iou_reward/std": 0.13503716886043549, "rewards/schema_keywords_iou_reward/mean": 0.6661458015441895, "rewards/schema_keywords_iou_reward/std": 0.1690286248922348, "rewards/syntax_reward/mean": 0.890625, "rewards/syntax_reward/std": 0.3129251003265381, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.609375, "completions/mean_terminated_length": 164.98387145996094, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 1.7905004240882103, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7615322470664978, "kl": 0.052490234375, "learning_rate": 4.213929785163746e-07, "loss": 0.0121, "num_tokens": 135155524.0, "reward": 10.004633903503418, "reward_std": 1.3346771001815796, "rewards/accuracy_reward/mean": 1.796875, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 0.9593749642372131, "rewards/judge_reward/std": 1.383150577545166, "rewards/ngrams_iou_reward/mean": 0.22029118239879608, "rewards/ngrams_iou_reward/std": 0.22445252537727356, "rewards/schema_keywords_iou_reward/mean": 0.7041335105895996, "rewards/schema_keywords_iou_reward/std": 0.2023656964302063, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 222.5260467529297, "completions/mean_terminated_length": 170.30667114257812, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.7938931297709924, "frac_reward_zero_std": 0.0, "grad_norm": 0.824266791343689, "kl": 0.0576171875, "learning_rate": 4.194447732146678e-07, "loss": 0.0065, "num_tokens": 135486753.0, "reward": 9.56921672821045, "reward_std": 1.6046810150146484, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.3572916984558105, "rewards/judge_reward/std": 1.6021088361740112, "rewards/ngrams_iou_reward/mean": 0.12642008066177368, "rewards/ngrams_iou_reward/std": 0.14187347888946533, "rewards/schema_keywords_iou_reward/mean": 0.6552954316139221, "rewards/schema_keywords_iou_reward/std": 0.19102062284946442, "rewards/syntax_reward/mean": 0.7552083134651184, "rewards/syntax_reward/std": 0.4310877025127411, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 226.140625, "completions/mean_terminated_length": 169.13636779785156, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.7972858354537744, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7077962160110474, "kl": 0.05419921875, "learning_rate": 4.1749782269060043e-07, "loss": -0.0036, "num_tokens": 135738336.0, "reward": 10.466493606567383, "reward_std": 1.171679973602295, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.3406249284744263, "rewards/judge_reward/std": 1.6272598505020142, "rewards/ngrams_iou_reward/mean": 0.1997978836297989, "rewards/ngrams_iou_reward/std": 0.21172712743282318, "rewards/schema_keywords_iou_reward/mean": 0.7312781810760498, "rewards/schema_keywords_iou_reward/std": 0.1486920267343521, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 227.7604217529297, "completions/mean_terminated_length": 167.11474609375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 1.8006785411365565, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7952877283096313, "kl": 0.04541015625, "learning_rate": 4.155521572710684e-07, "loss": -0.008, "num_tokens": 136005326.0, "reward": 9.739367485046387, "reward_std": 1.2050334215164185, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.1062500476837158, "rewards/judge_reward/std": 1.4400269985198975, "rewards/ngrams_iou_reward/mean": 0.14942379295825958, "rewards/ngrams_iou_reward/std": 0.11393453925848007, "rewards/schema_keywords_iou_reward/mean": 0.6847348213195801, "rewards/schema_keywords_iou_reward/std": 0.15007708966732025, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 231.7135467529297, "completions/mean_terminated_length": 174.19297790527344, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 1.8040712468193383, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7377777695655823, "kl": 0.05743408203125, "learning_rate": 4.1360780726295026e-07, "loss": 0.0052, "num_tokens": 136271605.0, "reward": 9.940135955810547, "reward_std": 1.3636691570281982, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.1479166746139526, "rewards/judge_reward/std": 1.5580211877822876, "rewards/ngrams_iou_reward/mean": 0.20679044723510742, "rewards/ngrams_iou_reward/std": 0.26032522320747375, "rewards/schema_keywords_iou_reward/mean": 0.6666781306266785, "rewards/schema_keywords_iou_reward/std": 0.20987531542778015, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 223.45834350585938, "completions/mean_terminated_length": 164.11764526367188, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 1.8074639525021206, "frac_reward_zero_std": 0.0, "grad_norm": 0.8981190323829651, "kl": 0.05450439453125, "learning_rate": 4.116648029526347e-07, "loss": -0.0019, "num_tokens": 136521341.0, "reward": 10.538156509399414, "reward_std": 0.9435046911239624, "rewards/accuracy_reward/mean": 1.734375, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.2760417461395264, "rewards/judge_reward/std": 1.658817172050476, "rewards/ngrams_iou_reward/mean": 0.24119164049625397, "rewards/ngrams_iou_reward/std": 0.25012966990470886, "rewards/schema_keywords_iou_reward/mean": 0.7448808550834656, "rewards/schema_keywords_iou_reward/std": 0.15669426321983337, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 228.1822967529297, "completions/mean_terminated_length": 175.0757598876953, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 1.8108566581849024, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7620242238044739, "kl": 0.05194091796875, "learning_rate": 4.09723174605549e-07, "loss": 0.0116, "num_tokens": 136792666.0, "reward": 9.374702453613281, "reward_std": 1.194413661956787, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.2041666507720947, "rewards/judge_reward/std": 1.4540729522705078, "rewards/ngrams_iou_reward/mean": 0.15268032252788544, "rewards/ngrams_iou_reward/std": 0.13078513741493225, "rewards/schema_keywords_iou_reward/mean": 0.678271472454071, "rewards/schema_keywords_iou_reward/std": 0.15776920318603516, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.6197967529297, "completions/mean_terminated_length": 174.671630859375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.8142493638676844, "frac_reward_zero_std": 0.0, "grad_norm": 0.8178566098213196, "kl": 0.05413818359375, "learning_rate": 4.077829524656877e-07, "loss": 0.0088, "num_tokens": 137064849.0, "reward": 9.212514877319336, "reward_std": 1.258040189743042, "rewards/accuracy_reward/mean": 1.171875, "rewards/accuracy_reward/std": 1.4674979448318481, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.620833396911621, "rewards/judge_reward/std": 1.598160982131958, "rewards/ngrams_iou_reward/mean": 0.16761986911296844, "rewards/ngrams_iou_reward/std": 0.16303902864456177, "rewards/schema_keywords_iou_reward/mean": 0.6678115725517273, "rewards/schema_keywords_iou_reward/std": 0.17971597611904144, "rewards/syntax_reward/mean": 0.6666666865348816, "rewards/syntax_reward/std": 0.4726369380950928, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.1875, "completions/mean_terminated_length": 172.8727264404297, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 1.8176420695504665, "frac_reward_zero_std": 0.0, "grad_norm": 0.8180758357048035, "kl": 0.0543212890625, "learning_rate": 4.05844166755141e-07, "loss": 0.0144, "num_tokens": 137307081.0, "reward": 9.98645305633545, "reward_std": 1.5343073606491089, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.3468750715255737, "rewards/judge_reward/std": 1.6504768133163452, "rewards/ngrams_iou_reward/mean": 0.18265236914157867, "rewards/ngrams_iou_reward/std": 0.17778973281383514, "rewards/schema_keywords_iou_reward/mean": 0.7037997841835022, "rewards/schema_keywords_iou_reward/std": 0.17201204597949982, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 222.8072967529297, "completions/mean_terminated_length": 167.48611450195312, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 1.8210347752332485, "frac_reward_zero_std": 0.0, "grad_norm": 0.9465208053588867, "kl": 0.062744140625, "learning_rate": 4.0390684767362526e-07, "loss": -0.001, "num_tokens": 137594744.0, "reward": 9.326761245727539, "reward_std": 1.3569536209106445, "rewards/accuracy_reward/mean": 1.15625, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.6124998331069946, "rewards/judge_reward/std": 1.561530351638794, "rewards/ngrams_iou_reward/mean": 0.2150150090456009, "rewards/ngrams_iou_reward/std": 0.23278966546058655, "rewards/schema_keywords_iou_reward/mean": 0.7096622586250305, "rewards/schema_keywords_iou_reward/std": 0.16544464230537415, "rewards/syntax_reward/mean": 0.7395833134651184, "rewards/syntax_reward/std": 0.44000932574272156, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 221.0885467529297, "completions/mean_terminated_length": 171.1519012451172, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.8244274809160306, "frac_reward_zero_std": 0.0625, "grad_norm": 0.932377278804779, "kl": 0.0582275390625, "learning_rate": 4.01971025398011e-07, "loss": -0.0183, "num_tokens": 137869189.0, "reward": 9.764864921569824, "reward_std": 2.100283622741699, "rewards/accuracy_reward/mean": 1.25, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.6765624284744263, "rewards/judge_reward/std": 1.7480049133300781, "rewards/ngrams_iou_reward/mean": 0.17811845242977142, "rewards/ngrams_iou_reward/std": 0.23168006539344788, "rewards/schema_keywords_iou_reward/mean": 0.6763295531272888, "rewards/schema_keywords_iou_reward/std": 0.1915998011827469, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374123275279999, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 233.4166717529297, "completions/mean_terminated_length": 175.70370483398438, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 1.8278201865988124, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7826368808746338, "kl": 0.04632568359375, "learning_rate": 4.000367300818537e-07, "loss": 0.0166, "num_tokens": 138125025.0, "reward": 10.182326316833496, "reward_std": 1.3367652893066406, "rewards/accuracy_reward/mean": 1.90625, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 0.8687500357627869, "rewards/judge_reward/std": 1.3682496547698975, "rewards/ngrams_iou_reward/mean": 0.19020338356494904, "rewards/ngrams_iou_reward/std": 0.22320975363254547, "rewards/schema_keywords_iou_reward/mean": 0.7181639671325684, "rewards/schema_keywords_iou_reward/std": 0.1597214788198471, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374122977256775, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 227.5416717529297, "completions/mean_terminated_length": 174.44775390625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 1.8312128922815947, "frac_reward_zero_std": 0.0, "grad_norm": 0.8546758890151978, "kl": 0.06060791015625, "learning_rate": 3.98103991854924e-07, "loss": 0.0145, "num_tokens": 138390623.0, "reward": 10.878565788269043, "reward_std": 1.1466004848480225, "rewards/accuracy_reward/mean": 1.953125, "rewards/accuracy_reward/std": 1.433660626411438, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.0927082300186157, "rewards/judge_reward/std": 1.6286534070968628, "rewards/ngrams_iou_reward/mean": 0.21111561357975006, "rewards/ngrams_iou_reward/std": 0.22408831119537354, "rewards/schema_keywords_iou_reward/mean": 0.7528660297393799, "rewards/schema_keywords_iou_reward/std": 0.1565592736005783, "rewards/syntax_reward/mean": 0.8958333134651184, "rewards/syntax_reward/std": 0.30627527832984924, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 235.11459350585938, "completions/mean_terminated_length": 183.0908966064453, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 1.8346055979643765, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8972079753875732, "kl": 0.0518798828125, "learning_rate": 3.9617284082273836e-07, "loss": 0.0057, "num_tokens": 138672993.0, "reward": 9.12024974822998, "reward_std": 1.7919249534606934, "rewards/accuracy_reward/mean": 1.328125, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.2406249046325684, "rewards/judge_reward/std": 1.4761686325073242, "rewards/ngrams_iou_reward/mean": 0.16217021644115448, "rewards/ngrams_iou_reward/std": 0.19976937770843506, "rewards/schema_keywords_iou_reward/mean": 0.6747450828552246, "rewards/schema_keywords_iou_reward/std": 0.2094312608242035, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617491722107, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 230.0729217529297, "completions/mean_terminated_length": 165.49090576171875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.8379983036471588, "frac_reward_zero_std": 0.0, "grad_norm": 0.9298669695854187, "kl": 0.055419921875, "learning_rate": 3.942433070660905e-07, "loss": 0.0435, "num_tokens": 138932279.0, "reward": 10.142328262329102, "reward_std": 1.5381178855895996, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.3510416746139526, "rewards/judge_reward/std": 1.6531418561935425, "rewards/ngrams_iou_reward/mean": 0.20650358498096466, "rewards/ngrams_iou_reward/std": 0.24277973175048828, "rewards/schema_keywords_iou_reward/mean": 0.7129072546958923, "rewards/schema_keywords_iou_reward/std": 0.20611009001731873, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374122977256775, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 241.78646850585938, "completions/mean_terminated_length": 170.71875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 1.8413910093299406, "frac_reward_zero_std": 0.0, "grad_norm": 0.8098766207695007, "kl": 0.04925537109375, "learning_rate": 3.9231542064058187e-07, "loss": -0.0049, "num_tokens": 139169616.0, "reward": 9.731024742126465, "reward_std": 1.2084534168243408, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.4119791984558105, "rewards/judge_reward/std": 1.602391004562378, "rewards/ngrams_iou_reward/mean": 0.1950516253709793, "rewards/ngrams_iou_reward/std": 0.20239992439746857, "rewards/schema_keywords_iou_reward/mean": 0.6599307656288147, "rewards/schema_keywords_iou_reward/std": 0.19780491292476654, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709408044815063, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.671875, "completions/mean_terminated_length": 169.36923217773438, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 1.8447837150127226, "frac_reward_zero_std": 0.09375, "grad_norm": 0.8741549253463745, "kl": 0.0491943359375, "learning_rate": 3.903892115761544e-07, "loss": 0.0232, "num_tokens": 139438947.0, "reward": 10.023553848266602, "reward_std": 1.341085433959961, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.2067707777023315, "rewards/judge_reward/std": 1.5525038242340088, "rewards/ngrams_iou_reward/mean": 0.21564064919948578, "rewards/ngrams_iou_reward/std": 0.25007742643356323, "rewards/schema_keywords_iou_reward/mean": 0.7172872424125671, "rewards/schema_keywords_iou_reward/std": 0.17337098717689514, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 234.28646850585938, "completions/mean_terminated_length": 180.1999969482422, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 1.8481764206955047, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7406323552131653, "kl": 0.0496826171875, "learning_rate": 3.884647098766224e-07, "loss": 0.0028, "num_tokens": 139688038.0, "reward": 10.417183876037598, "reward_std": 1.1560261249542236, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.3197916746139526, "rewards/judge_reward/std": 1.7061710357666016, "rewards/ngrams_iou_reward/mean": 0.18520782887935638, "rewards/ngrams_iou_reward/std": 0.16263261437416077, "rewards/schema_keywords_iou_reward/mean": 0.712183952331543, "rewards/schema_keywords_iou_reward/std": 0.1741466522216797, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 231.61459350585938, "completions/mean_terminated_length": 170.8727264404297, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 1.8515691263782865, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9538265466690063, "kl": 0.055419921875, "learning_rate": 3.865419455192048e-07, "loss": 0.018, "num_tokens": 139944986.0, "reward": 10.05881118774414, "reward_std": 1.2082347869873047, "rewards/accuracy_reward/mean": 1.265625, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.779166579246521, "rewards/judge_reward/std": 1.6929738521575928, "rewards/ngrams_iou_reward/mean": 0.23547379672527313, "rewards/ngrams_iou_reward/std": 0.3014358580112457, "rewards/schema_keywords_iou_reward/mean": 0.723336935043335, "rewards/schema_keywords_iou_reward/std": 0.20047974586486816, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 237.98959350585938, "completions/mean_terminated_length": 179.1555633544922, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 1.8549618320610688, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7615506649017334, "kl": 0.056640625, "learning_rate": 3.846209484540597e-07, "loss": 0.0108, "num_tokens": 140191212.0, "reward": 10.281084060668945, "reward_std": 1.3246712684631348, "rewards/accuracy_reward/mean": 1.390625, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.7437500953674316, "rewards/judge_reward/std": 1.8038244247436523, "rewards/ngrams_iou_reward/mean": 0.163461834192276, "rewards/ngrams_iou_reward/std": 0.1639045923948288, "rewards/schema_keywords_iou_reward/mean": 0.6978303790092468, "rewards/schema_keywords_iou_reward/std": 0.15540438890457153, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 234.17709350585938, "completions/mean_terminated_length": 170.4897918701172, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 1.8583545377438506, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8417345881462097, "kl": 0.05462646484375, "learning_rate": 3.8270174860381564e-07, "loss": -0.0054, "num_tokens": 140439520.0, "reward": 9.95991039276123, "reward_std": 1.3183506727218628, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.544791579246521, "rewards/judge_reward/std": 1.7180513143539429, "rewards/ngrams_iou_reward/mean": 0.2590309679508209, "rewards/ngrams_iou_reward/std": 0.2503933012485504, "rewards/schema_keywords_iou_reward/mean": 0.7362952828407288, "rewards/schema_keywords_iou_reward/std": 0.180110901594162, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 232.1666717529297, "completions/mean_terminated_length": 168.0, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 1.861747243426633, "frac_reward_zero_std": 0.0, "grad_norm": 0.8856585621833801, "kl": 0.05133056640625, "learning_rate": 3.8078437586310713e-07, "loss": 0.029, "num_tokens": 140686806.0, "reward": 10.050323486328125, "reward_std": 1.5560741424560547, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.3229166269302368, "rewards/judge_reward/std": 1.6613550186157227, "rewards/ngrams_iou_reward/mean": 0.20506691932678223, "rewards/ngrams_iou_reward/std": 0.22409012913703918, "rewards/schema_keywords_iou_reward/mean": 0.6837987899780273, "rewards/schema_keywords_iou_reward/std": 0.21352118253707886, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 227.359375, "completions/mean_terminated_length": 159.5263214111328, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 1.8651399491094147, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8299749493598938, "kl": 0.05035400390625, "learning_rate": 3.788688600981085e-07, "loss": 0.0145, "num_tokens": 140945613.0, "reward": 10.021961212158203, "reward_std": 1.5300703048706055, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.3979166746139526, "rewards/judge_reward/std": 1.6835501194000244, "rewards/ngrams_iou_reward/mean": 0.281209796667099, "rewards/ngrams_iou_reward/std": 0.2789056599140167, "rewards/schema_keywords_iou_reward/mean": 0.7313759326934814, "rewards/schema_keywords_iou_reward/std": 0.20118367671966553, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.828125, "completions/mean_terminated_length": 178.1343231201172, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 1.8685326547921968, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7252238988876343, "kl": 0.0491943359375, "learning_rate": 3.7695523114606836e-07, "loss": -0.0154, "num_tokens": 141204300.0, "reward": 9.77562141418457, "reward_std": 1.0077269077301025, "rewards/accuracy_reward/mean": 1.265625, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.642708420753479, "rewards/judge_reward/std": 1.6061559915542603, "rewards/ngrams_iou_reward/mean": 0.1750805377960205, "rewards/ngrams_iou_reward/std": 0.18393027782440186, "rewards/schema_keywords_iou_reward/mean": 0.7317907214164734, "rewards/schema_keywords_iou_reward/std": 0.1493416577577591, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 223.84896850585938, "completions/mean_terminated_length": 165.22059631347656, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.8719253604749788, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9137868285179138, "kl": 0.05120849609375, "learning_rate": 3.7504351881484586e-07, "loss": -0.0182, "num_tokens": 141458905.0, "reward": 10.043073654174805, "reward_std": 1.3139076232910156, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9895833134651184, "rewards/format_reward/std": 0.1017945408821106, "rewards/judge_reward/mean": 1.5947917699813843, "rewards/judge_reward/std": 1.6660997867584229, "rewards/ngrams_iou_reward/mean": 0.26667627692222595, "rewards/ngrams_iou_reward/std": 0.29871776700019836, "rewards/schema_keywords_iou_reward/mean": 0.7482711672782898, "rewards/schema_keywords_iou_reward/std": 0.15490849316120148, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709405064582825, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 233.6822967529297, "completions/mean_terminated_length": 180.82456970214844, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 1.8753180661577609, "frac_reward_zero_std": 0.0, "grad_norm": 0.840215265750885, "kl": 0.05047607421875, "learning_rate": 3.7313375288244497e-07, "loss": -0.002, "num_tokens": 141721434.0, "reward": 9.449020385742188, "reward_std": 1.2442959547042847, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.4604166746139526, "rewards/judge_reward/std": 1.4848963022232056, "rewards/ngrams_iou_reward/mean": 0.17733411490917206, "rewards/ngrams_iou_reward/std": 0.19355663657188416, "rewards/schema_keywords_iou_reward/mean": 0.7102273106575012, "rewards/schema_keywords_iou_reward/std": 0.14383453130722046, "rewards/syntax_reward/mean": 0.7604166865348816, "rewards/syntax_reward/std": 0.427945077419281, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 231.00521850585938, "completions/mean_terminated_length": 170.30357360839844, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.878710771840543, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7992311716079712, "kl": 0.0487060546875, "learning_rate": 3.7122596309655174e-07, "loss": -0.0016, "num_tokens": 141986425.0, "reward": 9.67129135131836, "reward_std": 1.4428143501281738, "rewards/accuracy_reward/mean": 1.703125, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 0.9666666984558105, "rewards/judge_reward/std": 1.3625251054763794, "rewards/ngrams_iou_reward/mean": 0.1961841583251953, "rewards/ngrams_iou_reward/std": 0.20239119231700897, "rewards/schema_keywords_iou_reward/mean": 0.7032322883605957, "rewards/schema_keywords_iou_reward/std": 0.17941002547740936, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 234.296875, "completions/mean_terminated_length": 170.9591827392578, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 1.8821034775233247, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8355216383934021, "kl": 0.0513916015625, "learning_rate": 3.6932017917407045e-07, "loss": 0.0246, "num_tokens": 142248520.0, "reward": 10.195194244384766, "reward_std": 1.2273318767547607, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.553125023841858, "rewards/judge_reward/std": 1.7625809907913208, "rewards/ngrams_iou_reward/mean": 0.15087203681468964, "rewards/ngrams_iou_reward/std": 0.16495971381664276, "rewards/schema_keywords_iou_reward/mean": 0.7089040279388428, "rewards/schema_keywords_iou_reward/std": 0.14301927387714386, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 231.3072967529297, "completions/mean_terminated_length": 168.20370483398438, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 1.885496183206107, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8526034951210022, "kl": 0.04840087890625, "learning_rate": 3.674164308006606e-07, "loss": 0.017, "num_tokens": 142489233.0, "reward": 10.472196578979492, "reward_std": 1.7260253429412842, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.314583420753479, "rewards/judge_reward/std": 1.6692463159561157, "rewards/ngrams_iou_reward/mean": 0.32133418321609497, "rewards/ngrams_iou_reward/std": 0.3311360478401184, "rewards/schema_keywords_iou_reward/mean": 0.7560704350471497, "rewards/schema_keywords_iou_reward/std": 0.20487898588180542, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 230.9166717529297, "completions/mean_terminated_length": 170.0, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 1.8888888888888888, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7919936776161194, "kl": 0.04876708984375, "learning_rate": 3.655147476302753e-07, "loss": 0.0061, "num_tokens": 142737071.0, "reward": 9.78797435760498, "reward_std": 1.427082896232605, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.4583333730697632, "rewards/judge_reward/std": 1.5794270038604736, "rewards/ngrams_iou_reward/mean": 0.23407883942127228, "rewards/ngrams_iou_reward/std": 0.26056763529777527, "rewards/schema_keywords_iou_reward/mean": 0.7153533101081848, "rewards/schema_keywords_iou_reward/std": 0.17569240927696228, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 230.47396850585938, "completions/mean_terminated_length": 174.31668090820312, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 1.8922815945716709, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7237861752510071, "kl": 0.0504150390625, "learning_rate": 3.6361515928469845e-07, "loss": 0.0082, "num_tokens": 142973934.0, "reward": 10.019027709960938, "reward_std": 1.7372907400131226, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.2791666984558105, "rewards/judge_reward/std": 1.6191210746765137, "rewards/ngrams_iou_reward/mean": 0.1908135861158371, "rewards/ngrams_iou_reward/std": 0.22628700733184814, "rewards/schema_keywords_iou_reward/mean": 0.707380473613739, "rewards/schema_keywords_iou_reward/std": 0.1813303530216217, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947065711021423, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 225.70834350585938, "completions/mean_terminated_length": 162.19354248046875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 1.895674300254453, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8257517218589783, "kl": 0.051513671875, "learning_rate": 3.617176953530835e-07, "loss": 0.0071, "num_tokens": 143229136.0, "reward": 10.168570518493652, "reward_std": 1.3629831075668335, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.3520833253860474, "rewards/judge_reward/std": 1.6073768138885498, "rewards/ngrams_iou_reward/mean": 0.18418724834918976, "rewards/ngrams_iou_reward/std": 0.21088282763957977, "rewards/schema_keywords_iou_reward/mean": 0.7020909190177917, "rewards/schema_keywords_iou_reward/std": 0.16184166073799133, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.89584350585938, "completions/mean_terminated_length": 172.78431701660156, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 1.899067005937235, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7322782874107361, "kl": 0.05059814453125, "learning_rate": 3.5982238539149283e-07, "loss": -0.0108, "num_tokens": 143483582.0, "reward": 9.75570297241211, "reward_std": 1.4636056423187256, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.4072917699813843, "rewards/judge_reward/std": 1.6132140159606934, "rewards/ngrams_iou_reward/mean": 0.15999968349933624, "rewards/ngrams_iou_reward/std": 0.17361797392368317, "rewards/schema_keywords_iou_reward/mean": 0.6717450022697449, "rewards/schema_keywords_iou_reward/std": 0.1953389197587967, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 232.984375, "completions/mean_terminated_length": 174.1666717529297, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 1.902459711620017, "frac_reward_zero_std": 0.0, "grad_norm": 0.9619553089141846, "kl": 0.04736328125, "learning_rate": 3.579292589224374e-07, "loss": 0.0314, "num_tokens": 143738603.0, "reward": 10.410528182983398, "reward_std": 1.468672752380371, "rewards/accuracy_reward/mean": 1.734375, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.2395833730697632, "rewards/judge_reward/std": 1.6245806217193604, "rewards/ngrams_iou_reward/mean": 0.18547511100769043, "rewards/ngrams_iou_reward/std": 0.21651552617549896, "rewards/schema_keywords_iou_reward/mean": 0.7354698181152344, "rewards/schema_keywords_iou_reward/std": 0.15238229930400848, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 225.63021850585938, "completions/mean_terminated_length": 168.9701385498047, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 1.9058524173027989, "frac_reward_zero_std": 0.0, "grad_norm": 0.8182759284973145, "kl": 0.056884765625, "learning_rate": 3.560383454344168e-07, "loss": -0.0004, "num_tokens": 144007398.0, "reward": 9.528377532958984, "reward_std": 1.3971208333969116, "rewards/accuracy_reward/mean": 1.515625, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.209375023841858, "rewards/judge_reward/std": 1.577231764793396, "rewards/ngrams_iou_reward/mean": 0.15753749012947083, "rewards/ngrams_iou_reward/std": 0.17215818166732788, "rewards/schema_keywords_iou_reward/mean": 0.6760479807853699, "rewards/schema_keywords_iou_reward/std": 0.18540474772453308, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.0572967529297, "completions/mean_terminated_length": 179.34666442871094, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.9092451229855811, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8301640748977661, "kl": 0.059814453125, "learning_rate": 3.541496743814596e-07, "loss": 0.0085, "num_tokens": 144263213.0, "reward": 9.730182647705078, "reward_std": 1.374137282371521, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.571874976158142, "rewards/judge_reward/std": 1.5718200206756592, "rewards/ngrams_iou_reward/mean": 0.204095721244812, "rewards/ngrams_iou_reward/std": 0.24646615982055664, "rewards/schema_keywords_iou_reward/mean": 0.7208787798881531, "rewards/schema_keywords_iou_reward/std": 0.15990476310253143, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617789745331, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.515625, "completions/mean_terminated_length": 163.09524536132812, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 1.912637828668363, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7726386785507202, "kl": 0.04986572265625, "learning_rate": 3.5226327518266506e-07, "loss": 0.0227, "num_tokens": 144509492.0, "reward": 10.513873100280762, "reward_std": 1.2423865795135498, "rewards/accuracy_reward/mean": 1.890625, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.0812500715255737, "rewards/judge_reward/std": 1.5586483478546143, "rewards/ngrams_iou_reward/mean": 0.1958845853805542, "rewards/ngrams_iou_reward/std": 0.18852783739566803, "rewards/schema_keywords_iou_reward/mean": 0.7388207316398621, "rewards/schema_keywords_iou_reward/std": 0.15516315400600433, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 229.65625, "completions/mean_terminated_length": 156.8235321044922, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 1.916030534351145, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7938498854637146, "kl": 0.04949951171875, "learning_rate": 3.5037917722174446e-07, "loss": 0.0069, "num_tokens": 144760100.0, "reward": 9.939351081848145, "reward_std": 1.3596566915512085, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.5479167699813843, "rewards/judge_reward/std": 1.6222909688949585, "rewards/ngrams_iou_reward/mean": 0.1817501336336136, "rewards/ngrams_iou_reward/std": 0.2190289944410324, "rewards/schema_keywords_iou_reward/mean": 0.7346835732460022, "rewards/schema_keywords_iou_reward/std": 0.16287477314472198, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 224.71875, "completions/mean_terminated_length": 160.6666717529297, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.919423240033927, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7938176393508911, "kl": 0.050048828125, "learning_rate": 3.484974098465636e-07, "loss": -0.0043, "num_tokens": 144987302.0, "reward": 9.923343658447266, "reward_std": 1.1531586647033691, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.3760417699813843, "rewards/judge_reward/std": 1.528387188911438, "rewards/ngrams_iou_reward/mean": 0.1927504688501358, "rewards/ngrams_iou_reward/std": 0.20245316624641418, "rewards/schema_keywords_iou_reward/mean": 0.6972594261169434, "rewards/schema_keywords_iou_reward/std": 0.1734960973262787, "rewards/syntax_reward/mean": 0.890625, "rewards/syntax_reward/std": 0.3129251003265381, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 227.31771850585938, "completions/mean_terminated_length": 172.56060791015625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 1.922815945716709, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7268356084823608, "kl": 0.05230712890625, "learning_rate": 3.46618002368686e-07, "loss": 0.0077, "num_tokens": 145240707.0, "reward": 10.723246574401855, "reward_std": 0.997339129447937, "rewards/accuracy_reward/mean": 1.703125, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.4322916269302368, "rewards/judge_reward/std": 1.7568411827087402, "rewards/ngrams_iou_reward/mean": 0.176305890083313, "rewards/ngrams_iou_reward/std": 0.17773495614528656, "rewards/schema_keywords_iou_reward/mean": 0.7344393730163574, "rewards/schema_keywords_iou_reward/std": 0.15327642858028412, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374123275279999, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 221.0885467529297, "completions/mean_terminated_length": 168.94805908203125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 1.9262086513994912, "frac_reward_zero_std": 0.0625, "grad_norm": 0.748548150062561, "kl": 0.0531005859375, "learning_rate": 3.4474098406291557e-07, "loss": 0.0117, "num_tokens": 145504028.0, "reward": 10.020956039428711, "reward_std": 1.0536459684371948, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.504166603088379, "rewards/judge_reward/std": 1.7025952339172363, "rewards/ngrams_iou_reward/mean": 0.20603172481060028, "rewards/ngrams_iou_reward/std": 0.2260293960571289, "rewards/schema_keywords_iou_reward/mean": 0.7128405570983887, "rewards/schema_keywords_iou_reward/std": 0.19291500747203827, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 221.171875, "completions/mean_terminated_length": 164.3972625732422, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.929601357082273, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8352175354957581, "kl": 0.05279541015625, "learning_rate": 3.4286638416684115e-07, "loss": 0.0194, "num_tokens": 145780247.0, "reward": 9.636938095092773, "reward_std": 1.5399806499481201, "rewards/accuracy_reward/mean": 1.265625, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.580208420753479, "rewards/judge_reward/std": 1.6623404026031494, "rewards/ngrams_iou_reward/mean": 0.22218012809753418, "rewards/ngrams_iou_reward/std": 0.2666931748390198, "rewards/schema_keywords_iou_reward/mean": 0.7074652314186096, "rewards/schema_keywords_iou_reward/std": 0.21444827318191528, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947065711021423, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 216.84375, "completions/mean_terminated_length": 155.760009765625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 1.9329940627650553, "frac_reward_zero_std": 0.09375, "grad_norm": 0.806816577911377, "kl": 0.05706787109375, "learning_rate": 3.409942318803809e-07, "loss": 0.017, "num_tokens": 146019773.0, "reward": 10.101053237915039, "reward_std": 1.2059497833251953, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9947916865348816, "rewards/format_reward/std": 0.07216878235340118, "rewards/judge_reward/mean": 1.209375023841858, "rewards/judge_reward/std": 1.4990801811218262, "rewards/ngrams_iou_reward/mean": 0.28839626908302307, "rewards/ngrams_iou_reward/std": 0.2883675694465637, "rewards/schema_keywords_iou_reward/mean": 0.7376556396484375, "rewards/schema_keywords_iou_reward/std": 0.19015194475650787, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 222.88021850585938, "completions/mean_terminated_length": 168.89041137695312, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 1.936386768447837, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7637850642204285, "kl": 0.053955078125, "learning_rate": 3.391245563653276e-07, "loss": 0.0011, "num_tokens": 146287956.0, "reward": 10.151586532592773, "reward_std": 1.1136412620544434, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.1875001192092896, "rewards/judge_reward/std": 1.6146742105484009, "rewards/ngrams_iou_reward/mean": 0.20132970809936523, "rewards/ngrams_iou_reward/std": 0.2519593834877014, "rewards/schema_keywords_iou_reward/mean": 0.7262971997261047, "rewards/schema_keywords_iou_reward/std": 0.17343394458293915, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 228.20834350585938, "completions/mean_terminated_length": 176.3582000732422, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 1.9397794741306191, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7840045094490051, "kl": 0.05291748046875, "learning_rate": 3.372573867448941e-07, "loss": -0.0048, "num_tokens": 146542108.0, "reward": 9.989836692810059, "reward_std": 1.6288177967071533, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.3875001668930054, "rewards/judge_reward/std": 1.6741489171981812, "rewards/ngrams_iou_reward/mean": 0.13441963493824005, "rewards/ngrams_iou_reward/std": 0.1265997737646103, "rewards/schema_keywords_iou_reward/mean": 0.6741663813591003, "rewards/schema_keywords_iou_reward/std": 0.17555469274520874, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 219.4479217529297, "completions/mean_terminated_length": 174.39535522460938, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 1.9431721798134012, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8204793930053711, "kl": 0.0517578125, "learning_rate": 3.353927521032604e-07, "loss": 0.0075, "num_tokens": 146788272.0, "reward": 9.39116096496582, "reward_std": 1.3837316036224365, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.419791579246521, "rewards/judge_reward/std": 1.63810133934021, "rewards/ngrams_iou_reward/mean": 0.18430180847644806, "rewards/ngrams_iou_reward/std": 0.2029140740633011, "rewards/schema_keywords_iou_reward/mean": 0.6693582534790039, "rewards/schema_keywords_iou_reward/std": 0.18966881930828094, "rewards/syntax_reward/mean": 0.6979166865348816, "rewards/syntax_reward/std": 0.4603615701198578, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.453125, "completions/mean_terminated_length": 178.1428680419922, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 1.9465648854961832, "frac_reward_zero_std": 0.0, "grad_norm": 0.8177334666252136, "kl": 0.05450439453125, "learning_rate": 3.335306814851195e-07, "loss": 0.0312, "num_tokens": 147039501.0, "reward": 9.87584114074707, "reward_std": 1.4121456146240234, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.265625, "rewards/judge_reward/std": 1.527844786643982, "rewards/ngrams_iou_reward/mean": 0.18389201164245605, "rewards/ngrams_iou_reward/std": 0.196946918964386, "rewards/schema_keywords_iou_reward/mean": 0.7023653984069824, "rewards/schema_keywords_iou_reward/std": 0.17563283443450928, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 222.78646850585938, "completions/mean_terminated_length": 174.24359130859375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 1.9499575911789653, "frac_reward_zero_std": 0.0, "grad_norm": 0.786650538444519, "kl": 0.0498046875, "learning_rate": 3.3167120389522597e-07, "loss": 0.0057, "num_tokens": 147298744.0, "reward": 9.592445373535156, "reward_std": 1.4237513542175293, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.2604166269302368, "rewards/judge_reward/std": 1.5379648208618164, "rewards/ngrams_iou_reward/mean": 0.21201105415821075, "rewards/ngrams_iou_reward/std": 0.23400285840034485, "rewards/schema_keywords_iou_reward/mean": 0.6929343342781067, "rewards/schema_keywords_iou_reward/std": 0.19974996149539948, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.640625, "completions/mean_terminated_length": 162.0500030517578, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 1.953350296861747, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8857300281524658, "kl": 0.04974365234375, "learning_rate": 3.2981434829794353e-07, "loss": -0.0146, "num_tokens": 147540097.0, "reward": 9.700419425964355, "reward_std": 1.1214358806610107, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.1197916269302368, "rewards/judge_reward/std": 1.4868441820144653, "rewards/ngrams_iou_reward/mean": 0.2452196627855301, "rewards/ngrams_iou_reward/std": 0.2709151804447174, "rewards/schema_keywords_iou_reward/mean": 0.7520745396614075, "rewards/schema_keywords_iou_reward/std": 0.16406890749931335, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 221.6875, "completions/mean_terminated_length": 164.5, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 1.9567430025445294, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8073752522468567, "kl": 0.0501708984375, "learning_rate": 3.279601436167946e-07, "loss": 0.0004, "num_tokens": 147823345.0, "reward": 9.243552207946777, "reward_std": 1.596841812133789, "rewards/accuracy_reward/mean": 1.328125, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3129251003265381, "rewards/judge_reward/mean": 1.379166603088379, "rewards/judge_reward/std": 1.5537770986557007, "rewards/ngrams_iou_reward/mean": 0.18462122976779938, "rewards/ngrams_iou_reward/std": 0.230301633477211, "rewards/schema_keywords_iou_reward/mean": 0.659972071647644, "rewards/schema_keywords_iou_reward/std": 0.19815801084041595, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 218.78125, "completions/mean_terminated_length": 156.75, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 1.9601357082273112, "frac_reward_zero_std": 0.03125, "grad_norm": 0.852505624294281, "kl": 0.04815673828125, "learning_rate": 3.261086187340088e-07, "loss": 0.0358, "num_tokens": 148051591.0, "reward": 10.46658992767334, "reward_std": 0.9923979043960571, "rewards/accuracy_reward/mean": 1.8125, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.0947917699813843, "rewards/judge_reward/std": 1.588563084602356, "rewards/ngrams_iou_reward/mean": 0.29275190830230713, "rewards/ngrams_iou_reward/std": 0.30211973190307617, "rewards/schema_keywords_iou_reward/mean": 0.7550875544548035, "rewards/schema_keywords_iou_reward/std": 0.18296141922473907, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 225.4010467529297, "completions/mean_terminated_length": 162.7460479736328, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 1.9635284139100933, "frac_reward_zero_std": 0.03125, "grad_norm": 0.767459511756897, "kl": 0.05010986328125, "learning_rate": 3.2425980249007377e-07, "loss": 0.0209, "num_tokens": 148292742.0, "reward": 10.002623558044434, "reward_std": 1.0107516050338745, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.3562499284744263, "rewards/judge_reward/std": 1.5982197523117065, "rewards/ngrams_iou_reward/mean": 0.20485258102416992, "rewards/ngrams_iou_reward/std": 0.19554123282432556, "rewards/schema_keywords_iou_reward/mean": 0.7154778838157654, "rewards/schema_keywords_iou_reward/std": 0.17376752197742462, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 224.00521850585938, "completions/mean_terminated_length": 172.9864959716797, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 1.9669211195928753, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7727705836296082, "kl": 0.048583984375, "learning_rate": 3.2241372368328585e-07, "loss": 0.001, "num_tokens": 148530631.0, "reward": 10.493895530700684, "reward_std": 0.8599008321762085, "rewards/accuracy_reward/mean": 1.84375, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.1437500715255737, "rewards/judge_reward/std": 1.5683258771896362, "rewards/ngrams_iou_reward/mean": 0.18600676953792572, "rewards/ngrams_iou_reward/std": 0.20369195938110352, "rewards/schema_keywords_iou_reward/mean": 0.7078881859779358, "rewards/schema_keywords_iou_reward/std": 0.1724989116191864, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 224.7604217529297, "completions/mean_terminated_length": 169.0724639892578, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 1.9703138252756573, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7621785402297974, "kl": 0.0517578125, "learning_rate": 3.20570411069301e-07, "loss": 0.0072, "num_tokens": 148788843.0, "reward": 10.32653522491455, "reward_std": 1.4455546140670776, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.3572916984558105, "rewards/judge_reward/std": 1.65597403049469, "rewards/ngrams_iou_reward/mean": 0.21734987199306488, "rewards/ngrams_iou_reward/std": 0.2264506220817566, "rewards/schema_keywords_iou_reward/mean": 0.7383511662483215, "rewards/schema_keywords_iou_reward/std": 0.17185302078723907, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.44271850585938, "completions/mean_terminated_length": 174.92857360839844, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.9737065309584394, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8376007676124573, "kl": 0.050537109375, "learning_rate": 3.187298933606878e-07, "loss": 0.0284, "num_tokens": 149067214.0, "reward": 9.827670097351074, "reward_std": 1.4569566249847412, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.2916666269302368, "rewards/judge_reward/std": 1.6161272525787354, "rewards/ngrams_iou_reward/mean": 0.18308359384536743, "rewards/ngrams_iou_reward/std": 0.20989936590194702, "rewards/schema_keywords_iou_reward/mean": 0.6966698169708252, "rewards/schema_keywords_iou_reward/std": 0.17444363236427307, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.6822967529297, "completions/mean_terminated_length": 170.10000610351562, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 1.9770992366412212, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7708368897438049, "kl": 0.05316162109375, "learning_rate": 3.168921992264792e-07, "loss": 0.0113, "num_tokens": 149315001.0, "reward": 10.17692756652832, "reward_std": 1.4781650304794312, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.176041603088379, "rewards/judge_reward/std": 1.5981824398040771, "rewards/ngrams_iou_reward/mean": 0.24503588676452637, "rewards/ngrams_iou_reward/std": 0.2923140823841095, "rewards/schema_keywords_iou_reward/mean": 0.7100164294242859, "rewards/schema_keywords_iou_reward/std": 0.2159806191921234, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 226.53646850585938, "completions/mean_terminated_length": 167.609375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 1.9804919423240035, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7565421462059021, "kl": 0.0545654296875, "learning_rate": 3.150573572917267e-07, "loss": 0.0191, "num_tokens": 149563840.0, "reward": 10.184414863586426, "reward_std": 1.250970482826233, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.543229103088379, "rewards/judge_reward/std": 1.6536033153533936, "rewards/ngrams_iou_reward/mean": 0.28573504090309143, "rewards/ngrams_iou_reward/std": 0.32279881834983826, "rewards/schema_keywords_iou_reward/mean": 0.7549285292625427, "rewards/schema_keywords_iou_reward/std": 0.17711907625198364, "rewards/syntax_reward/mean": 0.7395833134651184, "rewards/syntax_reward/std": 0.44000932574272156, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 226.9479217529297, "completions/mean_terminated_length": 180.6216278076172, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.9838846480067853, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8732813000679016, "kl": 0.0548095703125, "learning_rate": 3.1322539613705394e-07, "loss": 0.0158, "num_tokens": 149818632.0, "reward": 9.80420970916748, "reward_std": 1.3003119230270386, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.553125023841858, "rewards/judge_reward/std": 1.6050935983657837, "rewards/ngrams_iou_reward/mean": 0.15050451457500458, "rewards/ngrams_iou_reward/std": 0.13283051550388336, "rewards/schema_keywords_iou_reward/mean": 0.6724544167518616, "rewards/schema_keywords_iou_reward/std": 0.16558487713336945, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 229.05209350585938, "completions/mean_terminated_length": 163.60714721679688, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 1.9872773536895676, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7397467494010925, "kl": 0.0467529296875, "learning_rate": 3.1139634429821195e-07, "loss": 0.0043, "num_tokens": 150079000.0, "reward": 9.606849670410156, "reward_std": 1.0499866008758545, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.006250023841858, "rewards/judge_reward/std": 1.3810847997665405, "rewards/ngrams_iou_reward/mean": 0.19521093368530273, "rewards/ngrams_iou_reward/std": 0.23228825628757477, "rewards/schema_keywords_iou_reward/mean": 0.690804660320282, "rewards/schema_keywords_iou_reward/std": 0.17500977218151093, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709408044815063, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 224.6510467529297, "completions/mean_terminated_length": 167.48529052734375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.9906700593723494, "frac_reward_zero_std": 0.03125, "grad_norm": 0.823133111000061, "kl": 0.0477294921875, "learning_rate": 3.095702302656347e-07, "loss": -0.0004, "num_tokens": 150336657.0, "reward": 10.445825576782227, "reward_std": 0.8903040885925293, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 1.4432151317596436, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 0.9510416984558105, "rewards/judge_reward/std": 1.5044432878494263, "rewards/ngrams_iou_reward/mean": 0.23151887953281403, "rewards/ngrams_iou_reward/std": 0.2639347016811371, "rewards/schema_keywords_iou_reward/mean": 0.7288894653320312, "rewards/schema_keywords_iou_reward/std": 0.1829707771539688, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 218.109375, "completions/mean_terminated_length": 165.0625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 1.9940627650551315, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8047539591789246, "kl": 0.0538330078125, "learning_rate": 3.0774708248399467e-07, "loss": 0.0191, "num_tokens": 150618792.0, "reward": 9.533975601196289, "reward_std": 1.2862927913665771, "rewards/accuracy_reward/mean": 1.21875, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.6364582777023315, "rewards/judge_reward/std": 1.660670280456543, "rewards/ngrams_iou_reward/mean": 0.15754525363445282, "rewards/ngrams_iou_reward/std": 0.1490796059370041, "rewards/schema_keywords_iou_reward/mean": 0.6972630620002747, "rewards/schema_keywords_iou_reward/std": 0.1761595457792282, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 220.7916717529297, "completions/mean_terminated_length": 172.543212890625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.9974554707379135, "frac_reward_zero_std": 0.0, "grad_norm": 0.995823085308075, "kl": 0.053466796875, "learning_rate": 3.059269293517603e-07, "loss": 0.0008, "num_tokens": 150908324.0, "reward": 9.485957145690918, "reward_std": 1.5814647674560547, "rewards/accuracy_reward/mean": 1.328125, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.439583420753479, "rewards/judge_reward/std": 1.5432662963867188, "rewards/ngrams_iou_reward/mean": 0.20977431535720825, "rewards/ngrams_iou_reward/std": 0.2503415048122406, "rewards/schema_keywords_iou_reward/mean": 0.6834742426872253, "rewards/schema_keywords_iou_reward/std": 0.20140723884105682, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 224.78125, "completions/mean_terminated_length": 154.4067840576172, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 2.003392705682782, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7851107120513916, "kl": 0.05023193359375, "learning_rate": 3.041097992207534e-07, "loss": -0.0059, "num_tokens": 151144922.0, "reward": 10.377545356750488, "reward_std": 1.4791052341461182, "rewards/accuracy_reward/mean": 1.96875, "rewards/accuracy_reward/std": 1.4286017417907715, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 0.8697916865348816, "rewards/judge_reward/std": 1.4009435176849365, "rewards/ngrams_iou_reward/mean": 0.19418780505657196, "rewards/ngrams_iou_reward/std": 0.2197825163602829, "rewards/schema_keywords_iou_reward/mean": 0.7093982696533203, "rewards/schema_keywords_iou_reward/std": 0.19158609211444855, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 223.6979217529297, "completions/mean_terminated_length": 162.03030395507812, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 2.006785411365564, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8352448344230652, "kl": 0.05133056640625, "learning_rate": 3.0229572039570826e-07, "loss": -0.0074, "num_tokens": 151410892.0, "reward": 9.983519554138184, "reward_std": 1.311126708984375, "rewards/accuracy_reward/mean": 1.390625, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.6552082300186157, "rewards/judge_reward/std": 1.714268445968628, "rewards/ngrams_iou_reward/mean": 0.13576972484588623, "rewards/ngrams_iou_reward/std": 0.17042233049869537, "rewards/schema_keywords_iou_reward/mean": 0.6519156694412231, "rewards/schema_keywords_iou_reward/std": 0.18092520534992218, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 225.046875, "completions/mean_terminated_length": 165.9545440673828, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 2.010178117048346, "frac_reward_zero_std": 0.0, "grad_norm": 0.8080920577049255, "kl": 0.0587158203125, "learning_rate": 3.004847211338295e-07, "loss": 0.014, "num_tokens": 151659691.0, "reward": 9.626522064208984, "reward_std": 1.5266926288604736, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.1447917222976685, "rewards/judge_reward/std": 1.469150185585022, "rewards/ngrams_iou_reward/mean": 0.1986345499753952, "rewards/ngrams_iou_reward/std": 0.267845094203949, "rewards/schema_keywords_iou_reward/mean": 0.6695540547370911, "rewards/schema_keywords_iou_reward/std": 0.20505259931087494, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.8072967529297, "completions/mean_terminated_length": 162.6724090576172, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 2.013570822731128, "frac_reward_zero_std": 0.0625, "grad_norm": 0.762998640537262, "kl": 0.04888916015625, "learning_rate": 2.986768296443529e-07, "loss": 0.0108, "num_tokens": 151912086.0, "reward": 10.109918594360352, "reward_std": 1.5936174392700195, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.4968749284744263, "rewards/judge_reward/std": 1.698533535003662, "rewards/ngrams_iou_reward/mean": 0.24452757835388184, "rewards/ngrams_iou_reward/std": 0.2763651907444, "rewards/schema_keywords_iou_reward/mean": 0.7205983996391296, "rewards/schema_keywords_iou_reward/std": 0.18947428464889526, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 223.3854217529297, "completions/mean_terminated_length": 169.0277862548828, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 2.01696352841391, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7560403943061829, "kl": 0.05389404296875, "learning_rate": 2.9687207408810555e-07, "loss": 0.0115, "num_tokens": 152183996.0, "reward": 9.723423957824707, "reward_std": 1.3566746711730957, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.5749999284744263, "rewards/judge_reward/std": 1.5996726751327515, "rewards/ngrams_iou_reward/mean": 0.15150095522403717, "rewards/ngrams_iou_reward/std": 0.16460755467414856, "rewards/schema_keywords_iou_reward/mean": 0.7031721472740173, "rewards/schema_keywords_iou_reward/std": 0.1586088240146637, "rewards/syntax_reward/mean": 0.7604166865348816, "rewards/syntax_reward/std": 0.427945077419281, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 227.6666717529297, "completions/mean_terminated_length": 165.33334350585938, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.0203562340966923, "frac_reward_zero_std": 0.0, "grad_norm": 0.777320384979248, "kl": 0.05780029296875, "learning_rate": 2.9507048257706725e-07, "loss": -0.0251, "num_tokens": 152459272.0, "reward": 9.619264602661133, "reward_std": 1.4630591869354248, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710795402526855, "rewards/judge_reward/mean": 1.347916603088379, "rewards/judge_reward/std": 1.594557523727417, "rewards/ngrams_iou_reward/mean": 0.17432688176631927, "rewards/ngrams_iou_reward/std": 0.15647083520889282, "rewards/schema_keywords_iou_reward/mean": 0.6866034865379333, "rewards/schema_keywords_iou_reward/std": 0.18411782383918762, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 224.015625, "completions/mean_terminated_length": 156.9516143798828, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 2.023748939779474, "frac_reward_zero_std": 0.0, "grad_norm": 0.8732931017875671, "kl": 0.05548095703125, "learning_rate": 2.93272083173933e-07, "loss": 0.0034, "num_tokens": 152716513.0, "reward": 10.807960510253906, "reward_std": 1.0956910848617554, "rewards/accuracy_reward/mean": 1.9375, "rewards/accuracy_reward/std": 1.4385310411453247, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.1125000715255737, "rewards/judge_reward/std": 1.5999507904052734, "rewards/ngrams_iou_reward/mean": 0.23967695236206055, "rewards/ngrams_iou_reward/std": 0.24876156449317932, "rewards/schema_keywords_iou_reward/mean": 0.7495326399803162, "rewards/schema_keywords_iou_reward/std": 0.1668316125869751, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709405064582825, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 231.7760467529297, "completions/mean_terminated_length": 162.97999572753906, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 2.027141645462256, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8890158534049988, "kl": 0.05157470703125, "learning_rate": 2.9147690389167513e-07, "loss": 0.0272, "num_tokens": 152955516.0, "reward": 9.847485542297363, "reward_std": 1.6155366897583008, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.3885416984558105, "rewards/judge_reward/std": 1.6169501543045044, "rewards/ngrams_iou_reward/mean": 0.19987130165100098, "rewards/ngrams_iou_reward/std": 0.2313680499792099, "rewards/schema_keywords_iou_reward/mean": 0.6882384419441223, "rewards/schema_keywords_iou_reward/std": 0.18384860455989838, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.3854217529297, "completions/mean_terminated_length": 181.3207550048828, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 2.030534351145038, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7881672978401184, "kl": 0.04974365234375, "learning_rate": 2.8968497269310797e-07, "loss": 0.0031, "num_tokens": 153211628.0, "reward": 10.284018516540527, "reward_std": 1.5710735321044922, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 1.5812498331069946, "rewards/judge_reward/std": 1.692627191543579, "rewards/ngrams_iou_reward/mean": 0.21628837287425995, "rewards/ngrams_iou_reward/std": 0.22988903522491455, "rewards/schema_keywords_iou_reward/mean": 0.7177292704582214, "rewards/schema_keywords_iou_reward/std": 0.16968317329883575, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 231.515625, "completions/mean_terminated_length": 161.97999572753906, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 2.03392705682782, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7756643295288086, "kl": 0.05291748046875, "learning_rate": 2.8789631749045093e-07, "loss": 0.0095, "num_tokens": 153460751.0, "reward": 10.57677936553955, "reward_std": 1.3556761741638184, "rewards/accuracy_reward/mean": 1.84375, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.0968750715255737, "rewards/judge_reward/std": 1.6143306493759155, "rewards/ngrams_iou_reward/mean": 0.27904045581817627, "rewards/ngrams_iou_reward/std": 0.31963416934013367, "rewards/schema_keywords_iou_reward/mean": 0.7550298571586609, "rewards/schema_keywords_iou_reward/std": 0.19219639897346497, "rewards/syntax_reward/mean": 0.8854166865348816, "rewards/syntax_reward/std": 0.3193511664867401, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.06771850585938, "completions/mean_terminated_length": 175.0192413330078, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 2.0373197625106023, "frac_reward_zero_std": 0.0, "grad_norm": 0.8073059916496277, "kl": 0.047607421875, "learning_rate": 2.8611096614489517e-07, "loss": 0.0008, "num_tokens": 153726570.0, "reward": 10.130285263061523, "reward_std": 1.1065433025360107, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.433333396911621, "rewards/judge_reward/std": 1.6183934211730957, "rewards/ngrams_iou_reward/mean": 0.22730374336242676, "rewards/ngrams_iou_reward/std": 0.2585611641407013, "rewards/schema_keywords_iou_reward/mean": 0.7238146662712097, "rewards/schema_keywords_iou_reward/std": 0.16643761098384857, "rewards/syntax_reward/mean": 0.8958333134651184, "rewards/syntax_reward/std": 0.30627527832984924, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 233.36459350585938, "completions/mean_terminated_length": 167.30612182617188, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 2.040712468193384, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8235946297645569, "kl": 0.05426025390625, "learning_rate": 2.8432894646616886e-07, "loss": 0.0161, "num_tokens": 153994696.0, "reward": 9.910828590393066, "reward_std": 1.5639724731445312, "rewards/accuracy_reward/mean": 1.765625, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 0.9729166626930237, "rewards/judge_reward/std": 1.4236946105957031, "rewards/ngrams_iou_reward/mean": 0.1777835339307785, "rewards/ngrams_iou_reward/std": 0.2201441526412964, "rewards/schema_keywords_iou_reward/mean": 0.6934605240821838, "rewards/schema_keywords_iou_reward/std": 0.20528697967529297, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374123275279999, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 227.75521850585938, "completions/mean_terminated_length": 157.39999389648438, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.0441051738761664, "frac_reward_zero_std": 0.03125, "grad_norm": 0.815027117729187, "kl": 0.0499267578125, "learning_rate": 2.8255028621210354e-07, "loss": 0.0074, "num_tokens": 154240391.0, "reward": 10.229255676269531, "reward_std": 1.2524855136871338, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.321874976158142, "rewards/judge_reward/std": 1.5938167572021484, "rewards/ngrams_iou_reward/mean": 0.22999732196331024, "rewards/ngrams_iou_reward/std": 0.2798272371292114, "rewards/schema_keywords_iou_reward/mean": 0.7252998352050781, "rewards/schema_keywords_iou_reward/std": 0.1650983691215515, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 233.48959350585938, "completions/mean_terminated_length": 165.95834350585938, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 2.0474978795589482, "frac_reward_zero_std": 0.125, "grad_norm": 0.903454601764679, "kl": 0.05035400390625, "learning_rate": 2.8077501308820304e-07, "loss": -0.0025, "num_tokens": 154487247.0, "reward": 9.52840805053711, "reward_std": 1.3174763917922974, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.2395833730697632, "rewards/judge_reward/std": 1.4881365299224854, "rewards/ngrams_iou_reward/mean": 0.27128925919532776, "rewards/ngrams_iou_reward/std": 0.288320928812027, "rewards/schema_keywords_iou_reward/mean": 0.7206607460975647, "rewards/schema_keywords_iou_reward/std": 0.20059600472450256, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 233.2291717529297, "completions/mean_terminated_length": 175.0370330810547, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.05089058524173, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7300722002983093, "kl": 0.048583984375, "learning_rate": 2.790031547472105e-07, "loss": -0.0096, "num_tokens": 154737509.0, "reward": 10.599185943603516, "reward_std": 0.950697124004364, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 1.4432151317596436, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.087499976158142, "rewards/judge_reward/std": 1.596806287765503, "rewards/ngrams_iou_reward/mean": 0.15044525265693665, "rewards/ngrams_iou_reward/std": 0.14027196168899536, "rewards/schema_keywords_iou_reward/mean": 0.7372815608978271, "rewards/schema_keywords_iou_reward/std": 0.14097042381763458, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 229.0, "completions/mean_terminated_length": 163.42857360839844, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 2.0542832909245123, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9810498356819153, "kl": 0.0511474609375, "learning_rate": 2.7723473878867877e-07, "loss": 0.0106, "num_tokens": 155008235.0, "reward": 10.686925888061523, "reward_std": 0.8356198072433472, "rewards/accuracy_reward/mean": 1.9375, "rewards/accuracy_reward/std": 1.4385310411453247, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.037500023841858, "rewards/judge_reward/std": 1.5343365669250488, "rewards/ngrams_iou_reward/mean": 0.24058520793914795, "rewards/ngrams_iou_reward/std": 0.2534562647342682, "rewards/schema_keywords_iou_reward/mean": 0.7202978134155273, "rewards/schema_keywords_iou_reward/std": 0.165839284658432, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374122977256775, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 225.98959350585938, "completions/mean_terminated_length": 165.96875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 2.057675996607294, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8057388067245483, "kl": 0.05419921875, "learning_rate": 2.7546979275853987e-07, "loss": -0.0055, "num_tokens": 155272329.0, "reward": 9.972620010375977, "reward_std": 1.0203925371170044, "rewards/accuracy_reward/mean": 1.8125, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319954812526703, "rewards/judge_reward/mean": 0.9437499642372131, "rewards/judge_reward/std": 1.3746252059936523, "rewards/ngrams_iou_reward/mean": 0.1345055103302002, "rewards/ngrams_iou_reward/std": 0.10358226299285889, "rewards/schema_keywords_iou_reward/mean": 0.6849886775016785, "rewards/schema_keywords_iou_reward/std": 0.1582004278898239, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 231.81771850585938, "completions/mean_terminated_length": 171.58181762695312, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 2.0610687022900764, "frac_reward_zero_std": 0.0, "grad_norm": 0.7962809801101685, "kl": 0.064208984375, "learning_rate": 2.7370834414867626e-07, "loss": 0.0154, "num_tokens": 155532526.0, "reward": 10.525264739990234, "reward_std": 1.43827486038208, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.4354166984558105, "rewards/judge_reward/std": 1.74091374874115, "rewards/ngrams_iou_reward/mean": 0.16610777378082275, "rewards/ngrams_iou_reward/std": 0.17293289303779602, "rewards/schema_keywords_iou_reward/mean": 0.7331154346466064, "rewards/schema_keywords_iou_reward/std": 0.15209774672985077, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 237.7291717529297, "completions/mean_terminated_length": 185.83999633789062, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 2.0644614079728583, "frac_reward_zero_std": 0.0, "grad_norm": 0.7897317409515381, "kl": 0.0489501953125, "learning_rate": 2.71950420396492e-07, "loss": 0.0084, "num_tokens": 155787162.0, "reward": 10.086523056030273, "reward_std": 1.0406473875045776, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.484375, "rewards/judge_reward/std": 1.633345365524292, "rewards/ngrams_iou_reward/mean": 0.17418646812438965, "rewards/ngrams_iou_reward/std": 0.1922805905342102, "rewards/schema_keywords_iou_reward/mean": 0.7040020823478699, "rewards/schema_keywords_iou_reward/std": 0.16310973465442657, "rewards/syntax_reward/mean": 0.8854166865348816, "rewards/syntax_reward/std": 0.3193511664867401, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 230.94271850585938, "completions/mean_terminated_length": 170.08929443359375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.0678541136556405, "frac_reward_zero_std": 0.03125, "grad_norm": 1.0036461353302002, "kl": 0.05255126953125, "learning_rate": 2.701960488844864e-07, "loss": 0.0311, "num_tokens": 156056161.0, "reward": 10.085514068603516, "reward_std": 1.026965856552124, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.381250023841858, "rewards/judge_reward/std": 1.6068122386932373, "rewards/ngrams_iou_reward/mean": 0.19219334423542023, "rewards/ngrams_iou_reward/std": 0.1715940535068512, "rewards/schema_keywords_iou_reward/mean": 0.7454042434692383, "rewards/schema_keywords_iou_reward/std": 0.14712008833885193, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.77084350585938, "completions/mean_terminated_length": 176.35714721679688, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 2.0712468193384224, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8230433464050293, "kl": 0.05450439453125, "learning_rate": 2.684452569398261e-07, "loss": 0.0126, "num_tokens": 156325139.0, "reward": 9.632787704467773, "reward_std": 1.7221421003341675, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.129166603088379, "rewards/judge_reward/std": 1.4990339279174805, "rewards/ngrams_iou_reward/mean": 0.18369996547698975, "rewards/ngrams_iou_reward/std": 0.21589234471321106, "rewards/schema_keywords_iou_reward/mean": 0.6907528042793274, "rewards/schema_keywords_iou_reward/std": 0.16070660948753357, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 236.7760467529297, "completions/mean_terminated_length": 186.35848999023438, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 2.0746395250212046, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8147648572921753, "kl": 0.0540771484375, "learning_rate": 2.6669807183392105e-07, "loss": 0.0125, "num_tokens": 156570490.0, "reward": 9.285051345825195, "reward_std": 2.01206636428833, "rewards/accuracy_reward/mean": 1.21875, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.8854166865348816, "rewards/format_reward/std": 0.3193511664867401, "rewards/judge_reward/mean": 1.6177082061767578, "rewards/judge_reward/std": 1.6811548471450806, "rewards/ngrams_iou_reward/mean": 0.1809982806444168, "rewards/ngrams_iou_reward/std": 0.18932490050792694, "rewards/schema_keywords_iou_reward/mean": 0.6603028178215027, "rewards/schema_keywords_iou_reward/std": 0.19428354501724243, "rewards/syntax_reward/mean": 0.6666666865348816, "rewards/syntax_reward/std": 0.4726369380950928, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 232.53125, "completions/mean_terminated_length": 162.125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 2.0780322307039865, "frac_reward_zero_std": 0.09375, "grad_norm": 0.9096114039421082, "kl": 0.052734375, "learning_rate": 2.649545207819986e-07, "loss": -0.0077, "num_tokens": 156810790.0, "reward": 10.092845916748047, "reward_std": 1.0768778324127197, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.3000000715255737, "rewards/judge_reward/std": 1.5674703121185303, "rewards/ngrams_iou_reward/mean": 0.21180333197116852, "rewards/ngrams_iou_reward/std": 0.2264113575220108, "rewards/schema_keywords_iou_reward/mean": 0.7237502932548523, "rewards/schema_keywords_iou_reward/std": 0.16918055713176727, "rewards/syntax_reward/mean": 0.8958333134651184, "rewards/syntax_reward/std": 0.30627527832984924, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 230.78125, "completions/mean_terminated_length": 175.3000030517578, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 2.0814249363867683, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8469012379646301, "kl": 0.0511474609375, "learning_rate": 2.632146309426793e-07, "loss": 0.0137, "num_tokens": 157079320.0, "reward": 9.940287590026855, "reward_std": 1.3963291645050049, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.2385417222976685, "rewards/judge_reward/std": 1.5499954223632812, "rewards/ngrams_iou_reward/mean": 0.23744980990886688, "rewards/ngrams_iou_reward/std": 0.28572940826416016, "rewards/schema_keywords_iou_reward/mean": 0.7257540822029114, "rewards/schema_keywords_iou_reward/std": 0.18131683766841888, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709408044815063, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 233.2135467529297, "completions/mean_terminated_length": 183.08334350585938, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 2.0848176420695506, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7796133160591125, "kl": 0.048583984375, "learning_rate": 2.614784294175554e-07, "loss": 0.021, "num_tokens": 157352271.0, "reward": 9.40585708618164, "reward_std": 1.4239228963851929, "rewards/accuracy_reward/mean": 1.234375, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.546875, "rewards/judge_reward/std": 1.6413154602050781, "rewards/ngrams_iou_reward/mean": 0.21132487058639526, "rewards/ngrams_iou_reward/std": 0.2195822298526764, "rewards/schema_keywords_iou_reward/mean": 0.7205728888511658, "rewards/schema_keywords_iou_reward/std": 0.16545423865318298, "rewards/syntax_reward/mean": 0.7552083134651184, "rewards/syntax_reward/std": 0.4310877025127411, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 233.375, "completions/mean_terminated_length": 184.78688049316406, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 2.0882103477523324, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7550505995750427, "kl": 0.05908203125, "learning_rate": 2.5974594325076636e-07, "loss": 0.0152, "num_tokens": 157608915.0, "reward": 10.243576049804688, "reward_std": 1.3318051099777222, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.486458420753479, "rewards/judge_reward/std": 1.6951498985290527, "rewards/ngrams_iou_reward/mean": 0.18530873954296112, "rewards/ngrams_iou_reward/std": 0.2426707148551941, "rewards/schema_keywords_iou_reward/mean": 0.6947250366210938, "rewards/schema_keywords_iou_reward/std": 0.1853850930929184, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 235.1041717529297, "completions/mean_terminated_length": 170.63829040527344, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.0916030534351147, "frac_reward_zero_std": 0.0, "grad_norm": 0.7982754111289978, "kl": 0.05426025390625, "learning_rate": 2.5801719942858065e-07, "loss": 0.0105, "num_tokens": 157850687.0, "reward": 10.229738235473633, "reward_std": 1.2643077373504639, "rewards/accuracy_reward/mean": 1.859375, "rewards/accuracy_reward/std": 1.4601210355758667, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 0.9729166030883789, "rewards/judge_reward/std": 1.4791024923324585, "rewards/ngrams_iou_reward/mean": 0.17158299684524536, "rewards/ngrams_iou_reward/std": 0.19628135859966278, "rewards/schema_keywords_iou_reward/mean": 0.7008630633354187, "rewards/schema_keywords_iou_reward/std": 0.1802559643983841, "rewards/syntax_reward/mean": 0.8958333134651184, "rewards/syntax_reward/std": 0.30627527832984924, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 231.96875, "completions/mean_terminated_length": 176.44827270507812, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 2.0949957591178965, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8009859323501587, "kl": 0.0552978515625, "learning_rate": 2.5629222487897217e-07, "loss": -0.0105, "num_tokens": 158112899.0, "reward": 10.418464660644531, "reward_std": 1.3259695768356323, "rewards/accuracy_reward/mean": 1.734375, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.2604166269302368, "rewards/judge_reward/std": 1.6432937383651733, "rewards/ngrams_iou_reward/mean": 0.18363864719867706, "rewards/ngrams_iou_reward/std": 0.18197058141231537, "rewards/schema_keywords_iou_reward/mean": 0.7400341629981995, "rewards/schema_keywords_iou_reward/std": 0.15504203736782074, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 220.9635467529297, "completions/mean_terminated_length": 163.8493194580078, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.0983884648006788, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9268183708190918, "kl": 0.05078125, "learning_rate": 2.545710464712032e-07, "loss": 0.0119, "num_tokens": 158363512.0, "reward": 10.950815200805664, "reward_std": 1.3177943229675293, "rewards/accuracy_reward/mean": 2.3125, "rewards/accuracy_reward/std": 1.2641865015029907, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 0.565625011920929, "rewards/judge_reward/std": 1.227207064628601, "rewards/ngrams_iou_reward/mean": 0.2633550465106964, "rewards/ngrams_iou_reward/std": 0.2720346450805664, "rewards/schema_keywords_iou_reward/mean": 0.7541265487670898, "rewards/schema_keywords_iou_reward/std": 0.17348556220531464, "rewards/syntax_reward/mean": 0.9010416865348816, "rewards/syntax_reward/std": 0.2993867099285126, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 236.2135467529297, "completions/mean_terminated_length": 167.6511688232422, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 2.1017811704834606, "frac_reward_zero_std": 0.0, "grad_norm": 0.7991536855697632, "kl": 0.048583984375, "learning_rate": 2.5285369101540445e-07, "loss": 0.0029, "num_tokens": 158606499.0, "reward": 10.018030166625977, "reward_std": 1.4320634603500366, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.3760417699813843, "rewards/judge_reward/std": 1.7270405292510986, "rewards/ngrams_iou_reward/mean": 0.19456326961517334, "rewards/ngrams_iou_reward/std": 0.19600439071655273, "rewards/schema_keywords_iou_reward/mean": 0.753674328327179, "rewards/schema_keywords_iou_reward/std": 0.1684902310371399, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 233.3541717529297, "completions/mean_terminated_length": 167.2653045654297, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.1051738761662424, "frac_reward_zero_std": 0.0625, "grad_norm": 0.744186520576477, "kl": 0.04931640625, "learning_rate": 2.511401852621584e-07, "loss": 0.0156, "num_tokens": 158853887.0, "reward": 9.67191219329834, "reward_std": 1.601174235343933, "rewards/accuracy_reward/mean": 1.265625, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 1.59375, "rewards/judge_reward/std": 1.626727819442749, "rewards/ngrams_iou_reward/mean": 0.16020818054676056, "rewards/ngrams_iou_reward/std": 0.1755114197731018, "rewards/schema_keywords_iou_reward/mean": 0.7044119834899902, "rewards/schema_keywords_iou_reward/std": 0.16283152997493744, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 238.640625, "completions/mean_terminated_length": 178.48837280273438, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 2.1085665818490247, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7187923192977905, "kl": 0.05029296875, "learning_rate": 2.494305559020822e-07, "loss": -0.0136, "num_tokens": 159094202.0, "reward": 9.878854751586914, "reward_std": 1.363203763961792, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.404166579246521, "rewards/judge_reward/std": 1.5824902057647705, "rewards/ngrams_iou_reward/mean": 0.2099515199661255, "rewards/ngrams_iou_reward/std": 0.22830460965633392, "rewards/schema_keywords_iou_reward/mean": 0.7407776713371277, "rewards/schema_keywords_iou_reward/std": 0.16893628239631653, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 233.9947967529297, "completions/mean_terminated_length": 181.877197265625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 2.1119592875318065, "frac_reward_zero_std": 0.0, "grad_norm": 0.8143629431724548, "kl": 0.0513916015625, "learning_rate": 2.477248295654113e-07, "loss": 0.0074, "num_tokens": 159366031.0, "reward": 9.912099838256836, "reward_std": 1.3912267684936523, "rewards/accuracy_reward/mean": 1.25, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.7645834684371948, "rewards/judge_reward/std": 1.6842962503433228, "rewards/ngrams_iou_reward/mean": 0.19294960796833038, "rewards/ngrams_iou_reward/std": 0.19338327646255493, "rewards/schema_keywords_iou_reward/mean": 0.6951916813850403, "rewards/schema_keywords_iou_reward/std": 0.17628629505634308, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 224.890625, "completions/mean_terminated_length": 166.85073852539062, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.1153519932145888, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8888816833496094, "kl": 0.057861328125, "learning_rate": 2.4602303282158613e-07, "loss": -0.0151, "num_tokens": 159610732.0, "reward": 9.979604721069336, "reward_std": 1.5318889617919922, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.53125, "rewards/judge_reward/std": 1.6811435222625732, "rewards/ngrams_iou_reward/mean": 0.22324566543102264, "rewards/ngrams_iou_reward/std": 0.26386409997940063, "rewards/schema_keywords_iou_reward/mean": 0.7094841599464417, "rewards/schema_keywords_iou_reward/std": 0.18661797046661377, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709408044815063, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 228.7760467529297, "completions/mean_terminated_length": 168.8833465576172, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 2.1187446988973706, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7286864519119263, "kl": 0.05279541015625, "learning_rate": 2.4432519217883676e-07, "loss": 0.0172, "num_tokens": 159872967.0, "reward": 10.180624961853027, "reward_std": 1.197986125946045, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.4010416269302368, "rewards/judge_reward/std": 1.6603156328201294, "rewards/ngrams_iou_reward/mean": 0.21471361815929413, "rewards/ngrams_iou_reward/std": 0.22260889410972595, "rewards/schema_keywords_iou_reward/mean": 0.7107018828392029, "rewards/schema_keywords_iou_reward/std": 0.1943087875843048, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.15625, "completions/mean_terminated_length": 169.27272033691406, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 2.1187446988973706, "frac_reward_zero_std": 0.0, "grad_norm": 0.7667890191078186, "kl": 0.0496826171875, "learning_rate": 2.4263133408377073e-07, "loss": -0.0033, "num_tokens": 160133337.0, "reward": 9.783143043518066, "reward_std": 1.2667944431304932, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 0.9197916984558105, "rewards/judge_reward/std": 1.4021762609481812, "rewards/ngrams_iou_reward/mean": 0.23491771519184113, "rewards/ngrams_iou_reward/std": 0.2556712329387665, "rewards/schema_keywords_iou_reward/mean": 0.6930161118507385, "rewards/schema_keywords_iou_reward/std": 0.20609132945537567, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 227.8854217529297, "completions/mean_terminated_length": 179.97183227539062, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 2.122137404580153, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7601056098937988, "kl": 0.05303955078125, "learning_rate": 2.409414849209612e-07, "loss": 0.0101, "num_tokens": 160418339.0, "reward": 9.874082565307617, "reward_std": 1.1106438636779785, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.455208420753479, "rewards/judge_reward/std": 1.619667649269104, "rewards/ngrams_iou_reward/mean": 0.21381787955760956, "rewards/ngrams_iou_reward/std": 0.23861117660999298, "rewards/schema_keywords_iou_reward/mean": 0.7342214584350586, "rewards/schema_keywords_iou_reward/std": 0.16247357428073883, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 228.140625, "completions/mean_terminated_length": 173.7076873779297, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.1255301102629347, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7479321956634521, "kl": 0.045654296875, "learning_rate": 2.392556710125357e-07, "loss": -0.0089, "num_tokens": 160670432.0, "reward": 9.98267936706543, "reward_std": 1.412597417831421, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.2114583253860474, "rewards/judge_reward/std": 1.5359134674072266, "rewards/ngrams_iou_reward/mean": 0.16141168773174286, "rewards/ngrams_iou_reward/std": 0.17177869379520416, "rewards/schema_keywords_iou_reward/mean": 0.7004340291023254, "rewards/schema_keywords_iou_reward/std": 0.1707422137260437, "rewards/syntax_reward/mean": 0.9114583134651184, "rewards/syntax_reward/std": 0.2848237454891205, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 230.7916717529297, "completions/mean_terminated_length": 173.96609497070312, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 2.1289228159457165, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8369835019111633, "kl": 0.051513671875, "learning_rate": 2.3757391861776583e-07, "loss": 0.0003, "num_tokens": 160930594.0, "reward": 10.62984848022461, "reward_std": 0.9464221000671387, "rewards/accuracy_reward/mean": 1.859375, "rewards/accuracy_reward/std": 1.4601210355758667, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.1177083253860474, "rewards/judge_reward/std": 1.6205805540084839, "rewards/ngrams_iou_reward/mean": 0.17505304515361786, "rewards/ngrams_iou_reward/std": 0.13711391389369965, "rewards/schema_keywords_iou_reward/mean": 0.750628650188446, "rewards/schema_keywords_iou_reward/std": 0.126149520277977, "rewards/syntax_reward/mean": 0.9270833134651184, "rewards/syntax_reward/std": 0.2606794238090515, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 229.40625, "completions/mean_terminated_length": 179.79104614257812, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 2.132315521628499, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9370481967926025, "kl": 0.0518798828125, "learning_rate": 2.3589625393265893e-07, "loss": -0.0094, "num_tokens": 161183452.0, "reward": 10.211790084838867, "reward_std": 1.0438613891601562, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.082291603088379, "rewards/judge_reward/std": 1.5180928707122803, "rewards/ngrams_iou_reward/mean": 0.19738204777240753, "rewards/ngrams_iou_reward/std": 0.20192869007587433, "rewards/schema_keywords_iou_reward/mean": 0.7144067883491516, "rewards/schema_keywords_iou_reward/std": 0.17107099294662476, "rewards/syntax_reward/mean": 0.9322916865348816, "rewards/syntax_reward/std": 0.2519015669822693, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.875, "completions/mean_terminated_length": 171.36842346191406, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 2.1357082273112806, "frac_reward_zero_std": 0.0, "grad_norm": 0.8129884600639343, "kl": 0.06085205078125, "learning_rate": 2.3422270308954933e-07, "loss": -0.0065, "num_tokens": 161430424.0, "reward": 9.431751251220703, "reward_std": 1.6479309797286987, "rewards/accuracy_reward/mean": 1.03125, "rewards/accuracy_reward/std": 1.4286017417907715, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.8739582300186157, "rewards/judge_reward/std": 1.6707364320755005, "rewards/ngrams_iou_reward/mean": 0.2038516402244568, "rewards/ngrams_iou_reward/std": 0.2492244839668274, "rewards/schema_keywords_iou_reward/mean": 0.6883153915405273, "rewards/schema_keywords_iou_reward/std": 0.18483522534370422, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 226.30209350585938, "completions/mean_terminated_length": 178.94595336914062, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.139100932994063, "frac_reward_zero_std": 0.0, "grad_norm": 0.7884067296981812, "kl": 0.0528564453125, "learning_rate": 2.3255329215669184e-07, "loss": 0.0133, "num_tokens": 161711114.0, "reward": 10.309122085571289, "reward_std": 1.2787082195281982, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.6531251668930054, "rewards/judge_reward/std": 1.7482050657272339, "rewards/ngrams_iou_reward/mean": 0.16356289386749268, "rewards/ngrams_iou_reward/std": 0.16754762828350067, "rewards/schema_keywords_iou_reward/mean": 0.7143089771270752, "rewards/schema_keywords_iou_reward/std": 0.15534213185310364, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.7291717529297, "completions/mean_terminated_length": 164.29090881347656, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 2.1424936386768447, "frac_reward_zero_std": 0.0, "grad_norm": 0.8614149689674377, "kl": 0.05145263671875, "learning_rate": 2.308880471378558e-07, "loss": 0.0029, "num_tokens": 161959408.0, "reward": 10.114229202270508, "reward_std": 1.1665129661560059, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.236458420753479, "rewards/judge_reward/std": 1.5546330213546753, "rewards/ngrams_iou_reward/mean": 0.2422330379486084, "rewards/ngrams_iou_reward/std": 0.26084792613983154, "rewards/schema_keywords_iou_reward/mean": 0.7063705325126648, "rewards/schema_keywords_iou_reward/std": 0.18503910303115845, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 227.9322967529297, "completions/mean_terminated_length": 161.45614624023438, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 2.145886344359627, "frac_reward_zero_std": 0.0, "grad_norm": 0.7746372818946838, "kl": 0.04998779296875, "learning_rate": 2.2922699397191892e-07, "loss": 0.0091, "num_tokens": 162197895.0, "reward": 10.374190330505371, "reward_std": 1.1454501152038574, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319953322410583, "rewards/judge_reward/mean": 1.2572916746139526, "rewards/judge_reward/std": 1.594443440437317, "rewards/ngrams_iou_reward/mean": 0.1949009746313095, "rewards/ngrams_iou_reward/std": 0.20823970437049866, "rewards/schema_keywords_iou_reward/mean": 0.7219976782798767, "rewards/schema_keywords_iou_reward/std": 0.1585872322320938, "rewards/syntax_reward/mean": 0.9010416865348816, "rewards/syntax_reward/std": 0.29938673973083496, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 229.58334350585938, "completions/mean_terminated_length": 167.01754760742188, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.149279050042409, "frac_reward_zero_std": 0.0625, "grad_norm": 0.760769784450531, "kl": 0.04791259765625, "learning_rate": 2.275701585324649e-07, "loss": 0.0215, "num_tokens": 162446185.0, "reward": 9.936126708984375, "reward_std": 1.6183631420135498, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.1072916984558105, "rewards/judge_reward/std": 1.5357345342636108, "rewards/ngrams_iou_reward/mean": 0.20274408161640167, "rewards/ngrams_iou_reward/std": 0.23255622386932373, "rewards/schema_keywords_iou_reward/mean": 0.7167161107063293, "rewards/schema_keywords_iou_reward/std": 0.178395614027977, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.08334350585938, "completions/mean_terminated_length": 173.49020385742188, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 2.1526717557251906, "frac_reward_zero_std": 0.0, "grad_norm": 0.7806082963943481, "kl": 0.05322265625, "learning_rate": 2.259175666273786e-07, "loss": 0.0238, "num_tokens": 162706931.0, "reward": 10.13093376159668, "reward_std": 1.2689474821090698, "rewards/accuracy_reward/mean": 1.21875, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.910416603088379, "rewards/judge_reward/std": 1.7564527988433838, "rewards/ngrams_iou_reward/mean": 0.18192081153392792, "rewards/ngrams_iou_reward/std": 0.18950755894184113, "rewards/schema_keywords_iou_reward/mean": 0.7219287753105164, "rewards/schema_keywords_iou_reward/std": 0.16136883199214935, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.41087818145751953, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.078125, "completions/mean_terminated_length": 170.5319061279297, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.156064461407973, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6880298256874084, "kl": 0.05096435546875, "learning_rate": 2.2426924399844627e-07, "loss": -0.0013, "num_tokens": 162959702.0, "reward": 10.4893217086792, "reward_std": 0.9915164709091187, "rewards/accuracy_reward/mean": 1.796875, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319953322410583, "rewards/judge_reward/mean": 1.1677082777023315, "rewards/judge_reward/std": 1.5863286256790161, "rewards/ngrams_iou_reward/mean": 0.24620918929576874, "rewards/ngrams_iou_reward/std": 0.29120418429374695, "rewards/schema_keywords_iou_reward/mean": 0.7306120991706848, "rewards/schema_keywords_iou_reward/std": 0.18557073175907135, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 226.6197967529297, "completions/mean_terminated_length": 173.0441131591797, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 2.1594571670907547, "frac_reward_zero_std": 0.0, "grad_norm": 0.8129580020904541, "kl": 0.05987548828125, "learning_rate": 2.22625216320952e-07, "loss": 0.029, "num_tokens": 163201393.0, "reward": 9.877017974853516, "reward_std": 1.464226484298706, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.318750023841858, "rewards/judge_reward/std": 1.5511058568954468, "rewards/ngrams_iou_reward/mean": 0.1622009426355362, "rewards/ngrams_iou_reward/std": 0.16541150212287903, "rewards/schema_keywords_iou_reward/mean": 0.6971076130867004, "rewards/schema_keywords_iou_reward/std": 0.15095065534114838, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 226.11459350585938, "completions/mean_terminated_length": 166.34375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 2.162849872773537, "frac_reward_zero_std": 0.0, "grad_norm": 0.8492228388786316, "kl": 0.0482177734375, "learning_rate": 2.2098550920327995e-07, "loss": 0.0183, "num_tokens": 163471067.0, "reward": 10.364118576049805, "reward_std": 0.9644243717193604, "rewards/accuracy_reward/mean": 1.8125, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.0729166269302368, "rewards/judge_reward/std": 1.4604365825653076, "rewards/ngrams_iou_reward/mean": 0.21556484699249268, "rewards/ngrams_iou_reward/std": 0.244747593998909, "rewards/schema_keywords_iou_reward/mean": 0.7110527157783508, "rewards/schema_keywords_iou_reward/std": 0.17801736295223236, "rewards/syntax_reward/mean": 0.8802083134651184, "rewards/syntax_reward/std": 0.32556670904159546, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 227.8385467529297, "completions/mean_terminated_length": 155.87037658691406, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 2.166242578456319, "frac_reward_zero_std": 0.0, "grad_norm": 0.9349397420883179, "kl": 0.0498046875, "learning_rate": 2.1935014818651403e-07, "loss": -0.0059, "num_tokens": 163721320.0, "reward": 9.254476547241211, "reward_std": 1.4794639348983765, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 1.4122743606567383, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3129251003265381, "rewards/judge_reward/mean": 1.8697916269302368, "rewards/judge_reward/std": 1.5905228853225708, "rewards/ngrams_iou_reward/mean": 0.2146693617105484, "rewards/ngrams_iou_reward/std": 0.22058157622814178, "rewards/schema_keywords_iou_reward/mean": 0.6960563659667969, "rewards/schema_keywords_iou_reward/std": 0.1801934689283371, "rewards/syntax_reward/mean": 0.7604166865348816, "rewards/syntax_reward/std": 0.4279450476169586, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 227.765625, "completions/mean_terminated_length": 177.43478393554688, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 2.169635284139101, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8616597652435303, "kl": 0.05078125, "learning_rate": 2.177191587440409e-07, "loss": 0.0107, "num_tokens": 163987819.0, "reward": 10.256109237670898, "reward_std": 1.192466139793396, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.3302083015441895, "rewards/judge_reward/std": 1.6262003183364868, "rewards/ngrams_iou_reward/mean": 0.20522741973400116, "rewards/ngrams_iou_reward/std": 0.2158416211605072, "rewards/schema_keywords_iou_reward/mean": 0.7133817672729492, "rewards/schema_keywords_iou_reward/std": 0.1599292755126953, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617789745331, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 238.65625, "completions/mean_terminated_length": 185.14892578125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.173027989821883, "frac_reward_zero_std": 0.0, "grad_norm": 0.7717929482460022, "kl": 0.0474853515625, "learning_rate": 2.1609256628115312e-07, "loss": 0.0055, "num_tokens": 164242117.0, "reward": 10.588598251342773, "reward_std": 1.005959391593933, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.2864583730697632, "rewards/judge_reward/std": 1.6562821865081787, "rewards/ngrams_iou_reward/mean": 0.17345845699310303, "rewards/ngrams_iou_reward/std": 0.1693097949028015, "rewards/schema_keywords_iou_reward/mean": 0.7224307656288147, "rewards/schema_keywords_iou_reward/std": 0.14212480187416077, "rewards/syntax_reward/mean": 0.9114583134651184, "rewards/syntax_reward/std": 0.2848237454891205, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.1979217529297, "completions/mean_terminated_length": 180.41270446777344, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 2.1764206955046648, "frac_reward_zero_std": 0.0, "grad_norm": 0.8500003218650818, "kl": 0.0587158203125, "learning_rate": 2.144703961346526e-07, "loss": -0.0186, "num_tokens": 164505345.0, "reward": 10.213310241699219, "reward_std": 1.2653663158416748, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319953322410583, "rewards/judge_reward/mean": 1.4864583015441895, "rewards/judge_reward/std": 1.6641032695770264, "rewards/ngrams_iou_reward/mean": 0.18823736906051636, "rewards/ngrams_iou_reward/std": 0.1887616515159607, "rewards/schema_keywords_iou_reward/mean": 0.7136139869689941, "rewards/schema_keywords_iou_reward/std": 0.15607380867004395, "rewards/syntax_reward/mean": 0.90625, "rewards/syntax_reward/std": 0.2922426164150238, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 229.453125, "completions/mean_terminated_length": 178.77273559570312, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 2.179813401187447, "frac_reward_zero_std": 0.0, "grad_norm": 0.8916187882423401, "kl": 0.04833984375, "learning_rate": 2.1285267357245717e-07, "loss": 0.0081, "num_tokens": 164756628.0, "reward": 9.804496765136719, "reward_std": 1.3823626041412354, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.5020833015441895, "rewards/judge_reward/std": 1.696067452430725, "rewards/ngrams_iou_reward/mean": 0.15192626416683197, "rewards/ngrams_iou_reward/std": 0.1618758738040924, "rewards/schema_keywords_iou_reward/mean": 0.6744444966316223, "rewards/schema_keywords_iou_reward/std": 0.18506720662117004, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 232.2135467529297, "completions/mean_terminated_length": 172.96363830566406, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 2.183206106870229, "frac_reward_zero_std": 0.0, "grad_norm": 0.8236421942710876, "kl": 0.0491943359375, "learning_rate": 2.1123942379320575e-07, "loss": 0.0083, "num_tokens": 165027425.0, "reward": 10.282882690429688, "reward_std": 1.2343266010284424, "rewards/accuracy_reward/mean": 1.796875, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.1041666269302368, "rewards/judge_reward/std": 1.5677319765090942, "rewards/ngrams_iou_reward/mean": 0.16224052011966705, "rewards/ngrams_iou_reward/std": 0.16947218775749207, "rewards/schema_keywords_iou_reward/mean": 0.7039744853973389, "rewards/schema_keywords_iou_reward/std": 0.1838168501853943, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617789745331, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 227.84896850585938, "completions/mean_terminated_length": 164.38983154296875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 2.186598812553011, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8078756928443909, "kl": 0.05242919921875, "learning_rate": 2.0963067192586686e-07, "loss": -0.0006, "num_tokens": 165279942.0, "reward": 10.418890953063965, "reward_std": 1.2444065809249878, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.2666666507720947, "rewards/judge_reward/std": 1.6196868419647217, "rewards/ngrams_iou_reward/mean": 0.19391894340515137, "rewards/ngrams_iou_reward/std": 0.20087465643882751, "rewards/schema_keywords_iou_reward/mean": 0.7124713063240051, "rewards/schema_keywords_iou_reward/std": 0.1541634202003479, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617789745331, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 230.75521850585938, "completions/mean_terminated_length": 180.265625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 2.189991518235793, "frac_reward_zero_std": 0.0, "grad_norm": 0.8410570025444031, "kl": 0.05267333984375, "learning_rate": 2.080264430293468e-07, "loss": 0.0224, "num_tokens": 165539137.0, "reward": 9.73422622680664, "reward_std": 1.4054325819015503, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.3854166269302368, "rewards/judge_reward/std": 1.5987547636032104, "rewards/ngrams_iou_reward/mean": 0.15677005052566528, "rewards/ngrams_iou_reward/std": 0.14917141199111938, "rewards/schema_keywords_iou_reward/mean": 0.6920388340950012, "rewards/schema_keywords_iou_reward/std": 0.14557094871997833, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 231.47396850585938, "completions/mean_terminated_length": 181.2539825439453, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 2.1933842239185752, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7155824303627014, "kl": 0.05078125, "learning_rate": 2.064267620920993e-07, "loss": 0.0222, "num_tokens": 165798674.0, "reward": 10.069981575012207, "reward_std": 1.2293603420257568, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.353124976158142, "rewards/judge_reward/std": 1.6198023557662964, "rewards/ngrams_iou_reward/mean": 0.17063365876674652, "rewards/ngrams_iou_reward/std": 0.2229927033185959, "rewards/schema_keywords_iou_reward/mean": 0.6930971145629883, "rewards/schema_keywords_iou_reward/std": 0.1565118432044983, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 233.83334350585938, "completions/mean_terminated_length": 174.1538543701172, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 2.196776929601357, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7812225818634033, "kl": 0.0545654296875, "learning_rate": 2.048316540317358e-07, "loss": 0.0067, "num_tokens": 166043046.0, "reward": 10.457633972167969, "reward_std": 1.5411099195480347, "rewards/accuracy_reward/mean": 1.890625, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 0.9677082896232605, "rewards/judge_reward/std": 1.5330216884613037, "rewards/ngrams_iou_reward/mean": 0.2063003033399582, "rewards/ngrams_iou_reward/std": 0.2220817357301712, "rewards/schema_keywords_iou_reward/mean": 0.7482075691223145, "rewards/schema_keywords_iou_reward/std": 0.16833727061748505, "rewards/syntax_reward/mean": 0.9270833134651184, "rewards/syntax_reward/std": 0.2606794238090515, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.0729217529297, "completions/mean_terminated_length": 183.8360595703125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 2.200169635284139, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9160175919532776, "kl": 0.05731201171875, "learning_rate": 2.0324114369463851e-07, "loss": 0.005, "num_tokens": 166293554.0, "reward": 10.10042953491211, "reward_std": 1.4434542655944824, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 1.2374999523162842, "rewards/judge_reward/std": 1.6349103450775146, "rewards/ngrams_iou_reward/mean": 0.18870937824249268, "rewards/ngrams_iou_reward/std": 0.18986250460147858, "rewards/schema_keywords_iou_reward/mean": 0.7231777310371399, "rewards/schema_keywords_iou_reward/std": 0.17574603855609894, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.828125, "completions/mean_terminated_length": 167.09524536132812, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.203562340966921, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7209925055503845, "kl": 0.05419921875, "learning_rate": 2.0165525585557203e-07, "loss": -0.0098, "num_tokens": 166547735.0, "reward": 9.933588981628418, "reward_std": 1.165139079093933, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.1875, "rewards/judge_reward/std": 1.5316041707992554, "rewards/ngrams_iou_reward/mean": 0.16970466077327728, "rewards/ngrams_iou_reward/std": 0.2184140682220459, "rewards/schema_keywords_iou_reward/mean": 0.6961758136749268, "rewards/schema_keywords_iou_reward/std": 0.1850985884666443, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.6875, "completions/mean_terminated_length": 173.63636779785156, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 2.206955046649703, "frac_reward_zero_std": 0.0, "grad_norm": 0.8093327283859253, "kl": 0.05279541015625, "learning_rate": 2.000740152172986e-07, "loss": 0.0032, "num_tokens": 166791287.0, "reward": 10.030677795410156, "reward_std": 1.379948616027832, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.5697916746139526, "rewards/judge_reward/std": 1.6664612293243408, "rewards/ngrams_iou_reward/mean": 0.2127888798713684, "rewards/ngrams_iou_reward/std": 0.21145182847976685, "rewards/schema_keywords_iou_reward/mean": 0.6887216567993164, "rewards/schema_keywords_iou_reward/std": 0.18004019558429718, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 231.67709350585938, "completions/mean_terminated_length": 178.1666717529297, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 2.2103477523324853, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7775216698646545, "kl": 0.05572509765625, "learning_rate": 1.984974464101928e-07, "loss": -0.0143, "num_tokens": 167031843.0, "reward": 9.285272598266602, "reward_std": 1.6169517040252686, "rewards/accuracy_reward/mean": 1.390625, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.3229166269302368, "rewards/judge_reward/std": 1.5639578104019165, "rewards/ngrams_iou_reward/mean": 0.19168925285339355, "rewards/ngrams_iou_reward/std": 0.23848891258239746, "rewards/schema_keywords_iou_reward/mean": 0.6769153475761414, "rewards/schema_keywords_iou_reward/std": 0.20699264109134674, "rewards/syntax_reward/mean": 0.703125, "rewards/syntax_reward/std": 0.4580754339694977, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.4635467529297, "completions/mean_terminated_length": 175.1269989013672, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.213740458015267, "frac_reward_zero_std": 0.0, "grad_norm": 0.8781770467758179, "kl": 0.057373046875, "learning_rate": 1.9692557399185733e-07, "loss": 0.0114, "num_tokens": 167313926.0, "reward": 9.320229530334473, "reward_std": 1.948251485824585, "rewards/accuracy_reward/mean": 1.15625, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.6895833015441895, "rewards/judge_reward/std": 1.6540275812149048, "rewards/ngrams_iou_reward/mean": 0.1513785868883133, "rewards/ngrams_iou_reward/std": 0.16304586827754974, "rewards/schema_keywords_iou_reward/mean": 0.6594755053520203, "rewards/schema_keywords_iou_reward/std": 0.1832156926393509, "rewards/syntax_reward/mean": 0.7395833134651184, "rewards/syntax_reward/std": 0.44000932574272156, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.20834350585938, "completions/mean_terminated_length": 169.122802734375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 2.2171331636980494, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7889268398284912, "kl": 0.052734375, "learning_rate": 1.953584224467418e-07, "loss": 0.0187, "num_tokens": 167556390.0, "reward": 9.462425231933594, "reward_std": 1.4842252731323242, "rewards/accuracy_reward/mean": 1.203125, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.6635416746139526, "rewards/judge_reward/std": 1.692336916923523, "rewards/ngrams_iou_reward/mean": 0.17754000425338745, "rewards/ngrams_iou_reward/std": 0.21450641751289368, "rewards/schema_keywords_iou_reward/mean": 0.7182182669639587, "rewards/schema_keywords_iou_reward/std": 0.16324110329151154, "rewards/syntax_reward/mean": 0.71875, "rewards/syntax_reward/std": 0.4507846534252167, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 234.109375, "completions/mean_terminated_length": 173.58824157714844, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 2.220525869380831, "frac_reward_zero_std": 0.0, "grad_norm": 0.7974915504455566, "kl": 0.05950927734375, "learning_rate": 1.9379601618575975e-07, "loss": -0.0002, "num_tokens": 167833755.0, "reward": 9.887392044067383, "reward_std": 1.4424798488616943, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.3989583253860474, "rewards/judge_reward/std": 1.6121914386749268, "rewards/ngrams_iou_reward/mean": 0.2391669899225235, "rewards/ngrams_iou_reward/std": 0.22775232791900635, "rewards/schema_keywords_iou_reward/mean": 0.7148900032043457, "rewards/schema_keywords_iou_reward/std": 0.18627901375293732, "rewards/syntax_reward/mean": 0.75, "rewards/syntax_reward/std": 0.4341447353363037, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 233.59896850585938, "completions/mean_terminated_length": 177.79998779296875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 2.223918575063613, "frac_reward_zero_std": 0.09375, "grad_norm": 0.8603849411010742, "kl": 0.0509033203125, "learning_rate": 1.9223837954591043e-07, "loss": 0.0124, "num_tokens": 168063928.0, "reward": 10.558252334594727, "reward_std": 0.8225307464599609, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.2510416507720947, "rewards/judge_reward/std": 1.6263854503631592, "rewards/ngrams_iou_reward/mean": 0.199326753616333, "rewards/ngrams_iou_reward/std": 0.21479791402816772, "rewards/schema_keywords_iou_reward/mean": 0.7266330718994141, "rewards/schema_keywords_iou_reward/std": 0.15448641777038574, "rewards/syntax_reward/mean": 0.9166666865348816, "rewards/syntax_reward/std": 0.27710798382759094, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 230.8072967529297, "completions/mean_terminated_length": 176.7049102783203, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 2.2273112807463953, "frac_reward_zero_std": 0.0, "grad_norm": 0.898921012878418, "kl": 0.0596923828125, "learning_rate": 1.9068553678989735e-07, "loss": 0.0083, "num_tokens": 168332763.0, "reward": 9.817299842834473, "reward_std": 1.517459750175476, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.3854166269302368, "rewards/judge_reward/std": 1.6310455799102783, "rewards/ngrams_iou_reward/mean": 0.17358805239200592, "rewards/ngrams_iou_reward/std": 0.21647387742996216, "rewards/schema_keywords_iou_reward/mean": 0.706211268901825, "rewards/schema_keywords_iou_reward/std": 0.18197491765022278, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 228.1822967529297, "completions/mean_terminated_length": 163.91378784179688, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.230703986429177, "frac_reward_zero_std": 0.15625, "grad_norm": 0.7407997250556946, "kl": 0.0443115234375, "learning_rate": 1.8913751210575247e-07, "loss": 0.0303, "num_tokens": 168578822.0, "reward": 10.83222484588623, "reward_std": 0.8497076034545898, "rewards/accuracy_reward/mean": 2.03125, "rewards/accuracy_reward/std": 1.4064408540725708, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 0.8864583373069763, "rewards/judge_reward/std": 1.4537274837493896, "rewards/ngrams_iou_reward/mean": 0.29060301184654236, "rewards/ngrams_iou_reward/std": 0.2865182161331177, "rewards/schema_keywords_iou_reward/mean": 0.7843285202980042, "rewards/schema_keywords_iou_reward/std": 0.18643473088741302, "rewards/syntax_reward/mean": 0.9270833134651184, "rewards/syntax_reward/std": 0.2606793940067291, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 226.1875, "completions/mean_terminated_length": 167.93846130371094, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 2.2340966921119594, "frac_reward_zero_std": 0.0, "grad_norm": 0.8858321905136108, "kl": 0.0560302734375, "learning_rate": 1.875943296064577e-07, "loss": 0.0149, "num_tokens": 168830036.0, "reward": 9.871698379516602, "reward_std": 1.4902757406234741, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.573958396911621, "rewards/judge_reward/std": 1.7148410081863403, "rewards/ngrams_iou_reward/mean": 0.19388537108898163, "rewards/ngrams_iou_reward/std": 0.21377505362033844, "rewards/schema_keywords_iou_reward/mean": 0.7069784998893738, "rewards/schema_keywords_iou_reward/std": 0.1920248121023178, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 228.5572967529297, "completions/mean_terminated_length": 178.51470947265625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 2.237489397794741, "frac_reward_zero_std": 0.0625, "grad_norm": 0.807306170463562, "kl": 0.05450439453125, "learning_rate": 1.8605601332957078e-07, "loss": 0.0112, "num_tokens": 169072561.0, "reward": 10.454863548278809, "reward_std": 1.1848580837249756, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.426041603088379, "rewards/judge_reward/std": 1.6805474758148193, "rewards/ngrams_iou_reward/mean": 0.20825541019439697, "rewards/ngrams_iou_reward/std": 0.2377305030822754, "rewards/schema_keywords_iou_reward/mean": 0.7226495742797852, "rewards/schema_keywords_iou_reward/std": 0.15871547162532806, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.1979217529297, "completions/mean_terminated_length": 179.83334350585938, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 2.2408821034775235, "frac_reward_zero_std": 0.03125, "grad_norm": 0.838284432888031, "kl": 0.05584716796875, "learning_rate": 1.8452258723684995e-07, "loss": 0.0051, "num_tokens": 169332465.0, "reward": 10.403704643249512, "reward_std": 1.1768511533737183, "rewards/accuracy_reward/mean": 1.890625, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 0.9166666865348816, "rewards/judge_reward/std": 1.4123364686965942, "rewards/ngrams_iou_reward/mean": 0.24140453338623047, "rewards/ngrams_iou_reward/std": 0.24270756542682648, "rewards/schema_keywords_iou_reward/mean": 0.7456325888633728, "rewards/schema_keywords_iou_reward/std": 0.17118032276630402, "rewards/syntax_reward/mean": 0.9427083134651184, "rewards/syntax_reward/std": 0.23300664126873016, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.5625, "completions/mean_terminated_length": 167.93548583984375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.2442748091603053, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8945732712745667, "kl": 0.0458984375, "learning_rate": 1.8299407521388065e-07, "loss": 0.0104, "num_tokens": 169597071.0, "reward": 9.793127059936523, "reward_std": 1.11122465133667, "rewards/accuracy_reward/mean": 1.515625, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.2666666507720947, "rewards/judge_reward/std": 1.5909868478775024, "rewards/ngrams_iou_reward/mean": 0.18944883346557617, "rewards/ngrams_iou_reward/std": 0.23082616925239563, "rewards/schema_keywords_iou_reward/mean": 0.6953443884849548, "rewards/schema_keywords_iou_reward/std": 0.18460671603679657, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374122977256775, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 228.9479217529297, "completions/mean_terminated_length": 167.96609497070312, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 2.247667514843087, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8879026770591736, "kl": 0.05145263671875, "learning_rate": 1.8147050106970434e-07, "loss": -0.0124, "num_tokens": 169842413.0, "reward": 10.525496482849121, "reward_std": 0.9911246299743652, "rewards/accuracy_reward/mean": 1.9375, "rewards/accuracy_reward/std": 1.4385310411453247, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 0.9510416984558105, "rewards/judge_reward/std": 1.5148475170135498, "rewards/ngrams_iou_reward/mean": 0.23413722217082977, "rewards/ngrams_iou_reward/std": 0.2768632769584656, "rewards/schema_keywords_iou_reward/mean": 0.7382335066795349, "rewards/schema_keywords_iou_reward/std": 0.19196105003356934, "rewards/syntax_reward/mean": 0.8854166865348816, "rewards/syntax_reward/std": 0.3193511664867401, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 226.22396850585938, "completions/mean_terminated_length": 169.37879943847656, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.2510602205258694, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7788165807723999, "kl": 0.0526123046875, "learning_rate": 1.7995188853644644e-07, "loss": 0.0022, "num_tokens": 170081838.0, "reward": 9.740299224853516, "reward_std": 1.5270227193832397, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.5208333730697632, "rewards/judge_reward/std": 1.6432578563690186, "rewards/ngrams_iou_reward/mean": 0.2268899828195572, "rewards/ngrams_iou_reward/std": 0.2802591323852539, "rewards/schema_keywords_iou_reward/mean": 0.6852838397026062, "rewards/schema_keywords_iou_reward/std": 0.19081410765647888, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 226.4166717529297, "completions/mean_terminated_length": 169.93939208984375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 2.2544529262086512, "frac_reward_zero_std": 0.0, "grad_norm": 0.7776523232460022, "kl": 0.052978515625, "learning_rate": 1.7843826126894767e-07, "loss": -0.0008, "num_tokens": 170326940.0, "reward": 9.840124130249023, "reward_std": 1.1439099311828613, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.2833333015441895, "rewards/judge_reward/std": 1.5586721897125244, "rewards/ngrams_iou_reward/mean": 0.1326083391904831, "rewards/ngrams_iou_reward/std": 0.10274671018123627, "rewards/schema_keywords_iou_reward/mean": 0.7137651443481445, "rewards/schema_keywords_iou_reward/std": 0.15430952608585358, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 232.53125, "completions/mean_terminated_length": 169.34616088867188, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 2.2578456318914335, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7692065238952637, "kl": 0.05126953125, "learning_rate": 1.7692964284439506e-07, "loss": -0.0115, "num_tokens": 170572244.0, "reward": 10.012320518493652, "reward_std": 1.1755763292312622, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.171875, "rewards/judge_reward/std": 1.529685378074646, "rewards/ngrams_iou_reward/mean": 0.17465589940547943, "rewards/ngrams_iou_reward/std": 0.21411053836345673, "rewards/schema_keywords_iou_reward/mean": 0.7230800986289978, "rewards/schema_keywords_iou_reward/std": 0.16928055882453918, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 231.1822967529297, "completions/mean_terminated_length": 170.9107208251953, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.2612383375742153, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7632495164871216, "kl": 0.04974365234375, "learning_rate": 1.7542605676195504e-07, "loss": 0.0274, "num_tokens": 170819755.0, "reward": 10.938494682312012, "reward_std": 1.224548578262329, "rewards/accuracy_reward/mean": 2.109375, "rewards/accuracy_reward/std": 1.3742263317108154, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 0.8791666030883789, "rewards/judge_reward/std": 1.499872088432312, "rewards/ngrams_iou_reward/mean": 0.28196316957473755, "rewards/ngrams_iou_reward/std": 0.33670368790626526, "rewards/schema_keywords_iou_reward/mean": 0.7263226509094238, "rewards/schema_keywords_iou_reward/std": 0.18404406309127808, "rewards/syntax_reward/mean": 0.8854166865348816, "rewards/syntax_reward/std": 0.3193511664867401, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 223.546875, "completions/mean_terminated_length": 169.4583282470703, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 2.2646310432569976, "frac_reward_zero_std": 0.0, "grad_norm": 0.7586610913276672, "kl": 0.05224609375, "learning_rate": 1.7392752644240665e-07, "loss": 0.0072, "num_tokens": 171093064.0, "reward": 9.970932006835938, "reward_std": 1.3582509756088257, "rewards/accuracy_reward/mean": 1.515625, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.3697916269302368, "rewards/judge_reward/std": 1.6221387386322021, "rewards/ngrams_iou_reward/mean": 0.1611032485961914, "rewards/ngrams_iou_reward/std": 0.16591045260429382, "rewards/schema_keywords_iou_reward/mean": 0.6952456831932068, "rewards/schema_keywords_iou_reward/std": 0.16253092885017395, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 226.625, "completions/mean_terminated_length": 176.56336975097656, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 2.2680237489397794, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7605949640274048, "kl": 0.052490234375, "learning_rate": 1.7243407522777804e-07, "loss": 0.0121, "num_tokens": 171357418.0, "reward": 9.379222869873047, "reward_std": 1.4432761669158936, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.3520832061767578, "rewards/judge_reward/std": 1.543707251548767, "rewards/ngrams_iou_reward/mean": 0.11271926015615463, "rewards/ngrams_iou_reward/std": 0.08219221979379654, "rewards/schema_keywords_iou_reward/mean": 0.6508780121803284, "rewards/schema_keywords_iou_reward/std": 0.15753115713596344, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.6510467529297, "completions/mean_terminated_length": 174.66217041015625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 2.2714164546225613, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9422287344932556, "kl": 0.0548095703125, "learning_rate": 1.709457263809812e-07, "loss": -0.0022, "num_tokens": 171628533.0, "reward": 9.887824058532715, "reward_std": 1.061447262763977, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319953322410583, "rewards/judge_reward/mean": 1.1927083730697632, "rewards/judge_reward/std": 1.4562913179397583, "rewards/ngrams_iou_reward/mean": 0.2426144927740097, "rewards/ngrams_iou_reward/std": 0.2903759479522705, "rewards/schema_keywords_iou_reward/mean": 0.7077088356018066, "rewards/schema_keywords_iou_reward/std": 0.1833214908838272, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4375, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 225.5729217529297, "completions/mean_terminated_length": 181.10256958007812, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 2.2748091603053435, "frac_reward_zero_std": 0.0, "grad_norm": 0.8446488380432129, "kl": 0.05609130859375, "learning_rate": 1.6946250308545124e-07, "loss": -0.0145, "num_tokens": 171885377.0, "reward": 10.15268611907959, "reward_std": 1.3945121765136719, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.359375, "rewards/judge_reward/std": 1.6604470014572144, "rewards/ngrams_iou_reward/mean": 0.14681832492351532, "rewards/ngrams_iou_reward/std": 0.09507815539836884, "rewards/schema_keywords_iou_reward/mean": 0.6933672428131104, "rewards/schema_keywords_iou_reward/std": 0.14193782210350037, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 173.21739196777344, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.2782018659881254, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7800244688987732, "kl": 0.0469970703125, "learning_rate": 1.6798442844478443e-07, "loss": 0.0062, "num_tokens": 172118933.0, "reward": 9.67370891571045, "reward_std": 1.0472042560577393, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.318750023841858, "rewards/judge_reward/std": 1.5226260423660278, "rewards/ngrams_iou_reward/mean": 0.15764209628105164, "rewards/ngrams_iou_reward/std": 0.14270517230033875, "rewards/schema_keywords_iou_reward/mean": 0.6962742805480957, "rewards/schema_keywords_iou_reward/std": 0.14793303608894348, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617491722107, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 232.328125, "completions/mean_terminated_length": 178.96609497070312, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 2.2815945716709076, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7791006565093994, "kl": 0.05426025390625, "learning_rate": 1.66511525482378e-07, "loss": 0.0044, "num_tokens": 172379126.0, "reward": 8.82827377319336, "reward_std": 1.839156150817871, "rewards/accuracy_reward/mean": 1.109375, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.8854166865348816, "rewards/format_reward/std": 0.3193511366844177, "rewards/judge_reward/mean": 1.5322917699813843, "rewards/judge_reward/std": 1.553310513496399, "rewards/ngrams_iou_reward/mean": 0.15178509056568146, "rewards/ngrams_iou_reward/std": 0.17529751360416412, "rewards/schema_keywords_iou_reward/mean": 0.6431560516357422, "rewards/schema_keywords_iou_reward/std": 0.21059465408325195, "rewards/syntax_reward/mean": 0.7552083134651184, "rewards/syntax_reward/std": 0.4310877025127411, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 224.578125, "completions/mean_terminated_length": 172.2083282470703, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 2.2849872773536894, "frac_reward_zero_std": 0.0, "grad_norm": 0.8128858804702759, "kl": 0.0535888671875, "learning_rate": 1.6504381714107252e-07, "loss": -0.0009, "num_tokens": 172641995.0, "reward": 10.228699684143066, "reward_std": 1.081789493560791, "rewards/accuracy_reward/mean": 1.515625, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.4656251668930054, "rewards/judge_reward/std": 1.7430987358093262, "rewards/ngrams_iou_reward/mean": 0.2092103511095047, "rewards/ngrams_iou_reward/std": 0.2111303210258484, "rewards/schema_keywords_iou_reward/mean": 0.7236554622650146, "rewards/schema_keywords_iou_reward/std": 0.17560870945453644, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 226.27084350585938, "completions/mean_terminated_length": 172.05882263183594, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.2883799830364717, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8709229230880737, "kl": 0.0489501953125, "learning_rate": 1.635813262827932e-07, "loss": -0.0044, "num_tokens": 172892601.0, "reward": 10.486810684204102, "reward_std": 1.0458903312683105, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.5958333015441895, "rewards/judge_reward/std": 1.7407896518707275, "rewards/ngrams_iou_reward/mean": 0.23615486919879913, "rewards/ngrams_iou_reward/std": 0.27449366450309753, "rewards/schema_keywords_iou_reward/mean": 0.7360718846321106, "rewards/schema_keywords_iou_reward/std": 0.16749738156795502, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 220.50521850585938, "completions/mean_terminated_length": 165.13333129882812, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 2.2917726887192535, "frac_reward_zero_std": 0.0, "grad_norm": 0.8236631155014038, "kl": 0.0528564453125, "learning_rate": 1.6212407568819565e-07, "loss": 0.0051, "num_tokens": 173152006.0, "reward": 10.164397239685059, "reward_std": 1.0730819702148438, "rewards/accuracy_reward/mean": 1.859375, "rewards/accuracy_reward/std": 1.4601210355758667, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 0.9197916984558105, "rewards/judge_reward/std": 1.432174801826477, "rewards/ngrams_iou_reward/mean": 0.16539064049720764, "rewards/ngrams_iou_reward/std": 0.167673721909523, "rewards/schema_keywords_iou_reward/mean": 0.7167143821716309, "rewards/schema_keywords_iou_reward/std": 0.15200592577457428, "rewards/syntax_reward/mean": 0.9166666865348816, "rewards/syntax_reward/std": 0.27710798382759094, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 226.91146850585938, "completions/mean_terminated_length": 165.9193572998047, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 2.2951653944020354, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8568248152732849, "kl": 0.04949951171875, "learning_rate": 1.6067208805630876e-07, "loss": -0.0087, "num_tokens": 173418977.0, "reward": 10.209228515625, "reward_std": 1.476750135421753, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.40625, "rewards/judge_reward/std": 1.6511338949203491, "rewards/ngrams_iou_reward/mean": 0.19418127834796906, "rewards/ngrams_iou_reward/std": 0.18235494196414948, "rewards/schema_keywords_iou_reward/mean": 0.7390053868293762, "rewards/schema_keywords_iou_reward/std": 0.15617138147354126, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 227.7291717529297, "completions/mean_terminated_length": 174.98507690429688, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 2.2985581000848176, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7778981328010559, "kl": 0.05072021484375, "learning_rate": 1.5922538600418317e-07, "loss": 0.0118, "num_tokens": 173680711.0, "reward": 10.251426696777344, "reward_std": 0.9671257734298706, "rewards/accuracy_reward/mean": 1.984375, "rewards/accuracy_reward/std": 1.423352599143982, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 0.7583333849906921, "rewards/judge_reward/std": 1.2550768852233887, "rewards/ngrams_iou_reward/mean": 0.23026560246944427, "rewards/ngrams_iou_reward/std": 0.25775548815727234, "rewards/schema_keywords_iou_reward/mean": 0.7128276824951172, "rewards/schema_keywords_iou_reward/std": 0.1900802105665207, "rewards/syntax_reward/mean": 0.8854166865348816, "rewards/syntax_reward/std": 0.3193511664867401, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 227.16146850585938, "completions/mean_terminated_length": 162.1525421142578, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 2.3019508057675995, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8333850502967834, "kl": 0.04974365234375, "learning_rate": 1.577839920665373e-07, "loss": 0.0053, "num_tokens": 173947748.0, "reward": 10.191568374633789, "reward_std": 1.2061436176300049, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319953322410583, "rewards/judge_reward/mean": 1.2531250715255737, "rewards/judge_reward/std": 1.579412817955017, "rewards/ngrams_iou_reward/mean": 0.25807544589042664, "rewards/ngrams_iou_reward/std": 0.2889432907104492, "rewards/schema_keywords_iou_reward/mean": 0.7345336079597473, "rewards/schema_keywords_iou_reward/std": 0.17079752683639526, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.3697967529297, "completions/mean_terminated_length": 176.13462829589844, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 2.3053435114503817, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7402075529098511, "kl": 0.054443359375, "learning_rate": 1.563479286954078e-07, "loss": 0.0119, "num_tokens": 174193903.0, "reward": 9.376510620117188, "reward_std": 1.2164082527160645, "rewards/accuracy_reward/mean": 1.296875, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.493749976158142, "rewards/judge_reward/std": 1.6122444868087769, "rewards/ngrams_iou_reward/mean": 0.1460164189338684, "rewards/ngrams_iou_reward/std": 0.11909015476703644, "rewards/schema_keywords_iou_reward/mean": 0.6648690104484558, "rewards/schema_keywords_iou_reward/std": 0.17580018937587738, "rewards/syntax_reward/mean": 0.7604166865348816, "rewards/syntax_reward/std": 0.4279450476169586, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 227.1510467529297, "completions/mean_terminated_length": 168.07937622070312, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.3087362171331636, "frac_reward_zero_std": 0.09375, "grad_norm": 0.8049189448356628, "kl": 0.054443359375, "learning_rate": 1.54917218259799e-07, "loss": 0.001, "num_tokens": 174468588.0, "reward": 10.17347526550293, "reward_std": 1.3961586952209473, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.1989582777023315, "rewards/judge_reward/std": 1.5330901145935059, "rewards/ngrams_iou_reward/mean": 0.26061275601387024, "rewards/ngrams_iou_reward/std": 0.279367595911026, "rewards/schema_keywords_iou_reward/mean": 0.7441113591194153, "rewards/schema_keywords_iou_reward/std": 0.17556717991828918, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709408044815063, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 233.4635467529297, "completions/mean_terminated_length": 171.15687561035156, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 2.312128922815946, "frac_reward_zero_std": 0.0, "grad_norm": 0.7679020762443542, "kl": 0.05255126953125, "learning_rate": 1.534918830453341e-07, "loss": 0.0145, "num_tokens": 174713045.0, "reward": 9.781676292419434, "reward_std": 1.2069448232650757, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.607291579246521, "rewards/judge_reward/std": 1.6587855815887451, "rewards/ngrams_iou_reward/mean": 0.1586281657218933, "rewards/ngrams_iou_reward/std": 0.1855659782886505, "rewards/schema_keywords_iou_reward/mean": 0.7313801646232605, "rewards/schema_keywords_iou_reward/std": 0.1343497335910797, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 229.33334350585938, "completions/mean_terminated_length": 169.2203369140625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 2.3155216284987277, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7344152927398682, "kl": 0.04833984375, "learning_rate": 1.5207194525390937e-07, "loss": 0.0039, "num_tokens": 174979347.0, "reward": 9.887033462524414, "reward_std": 1.589040756225586, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.6854166984558105, "rewards/judge_reward/std": 1.7213764190673828, "rewards/ngrams_iou_reward/mean": 0.2171822339296341, "rewards/ngrams_iou_reward/std": 0.2520003318786621, "rewards/schema_keywords_iou_reward/mean": 0.7104761600494385, "rewards/schema_keywords_iou_reward/std": 0.17726187407970428, "rewards/syntax_reward/mean": 0.703125, "rewards/syntax_reward/std": 0.4580754339694977, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 229.4375, "completions/mean_terminated_length": 175.04762268066406, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 2.31891433418151, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8135226964950562, "kl": 0.049072265625, "learning_rate": 1.5065742700334677e-07, "loss": 0.0062, "num_tokens": 175235277.0, "reward": 10.493714332580566, "reward_std": 0.9464852809906006, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.4989582300186157, "rewards/judge_reward/std": 1.722775936126709, "rewards/ngrams_iou_reward/mean": 0.16340552270412445, "rewards/ngrams_iou_reward/std": 0.15911337733268738, "rewards/schema_keywords_iou_reward/mean": 0.717808723449707, "rewards/schema_keywords_iou_reward/std": 0.1543654501438141, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 229.484375, "completions/mean_terminated_length": 169.7118682861328, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.3223070398642918, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7421500086784363, "kl": 0.05126953125, "learning_rate": 1.4924835032705063e-07, "loss": 0.0091, "num_tokens": 175494006.0, "reward": 10.2708101272583, "reward_std": 1.15019690990448, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.3354167938232422, "rewards/judge_reward/std": 1.568804383277893, "rewards/ngrams_iou_reward/mean": 0.23284603655338287, "rewards/ngrams_iou_reward/std": 0.2659146785736084, "rewards/schema_keywords_iou_reward/mean": 0.7473384737968445, "rewards/schema_keywords_iou_reward/std": 0.15770108997821808, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617789745331, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 224.67709350585938, "completions/mean_terminated_length": 170.08570861816406, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 2.325699745547074, "frac_reward_zero_std": 0.0625, "grad_norm": 0.79814612865448, "kl": 0.05517578125, "learning_rate": 1.4784473717366387e-07, "loss": -0.0071, "num_tokens": 175745776.0, "reward": 10.363814353942871, "reward_std": 1.429511547088623, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.1791666746139526, "rewards/judge_reward/std": 1.5896204710006714, "rewards/ngrams_iou_reward/mean": 0.20587284862995148, "rewards/ngrams_iou_reward/std": 0.21858274936676025, "rewards/schema_keywords_iou_reward/mean": 0.7318994402885437, "rewards/schema_keywords_iou_reward/std": 0.14758841693401337, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.1041717529297, "completions/mean_terminated_length": 176.33334350585938, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.329092451229856, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7978417277336121, "kl": 0.04705810546875, "learning_rate": 1.4644660940672627e-07, "loss": -0.0035, "num_tokens": 176002638.0, "reward": 10.791902542114258, "reward_std": 1.057331919670105, "rewards/accuracy_reward/mean": 2.0625, "rewards/accuracy_reward/std": 1.3941725492477417, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 0.8541666865348816, "rewards/judge_reward/std": 1.4455503225326538, "rewards/ngrams_iou_reward/mean": 0.2791918218135834, "rewards/ngrams_iou_reward/std": 0.29432985186576843, "rewards/schema_keywords_iou_reward/mean": 0.7470847964286804, "rewards/schema_keywords_iou_reward/std": 0.1833643764257431, "rewards/syntax_reward/mean": 0.8958333134651184, "rewards/syntax_reward/std": 0.30627527832984924, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.671875, "completions/mean_terminated_length": 176.61111450195312, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 2.3324851569126377, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6976929903030396, "kl": 0.0465087890625, "learning_rate": 1.4505398880433368e-07, "loss": 0.0103, "num_tokens": 176225391.0, "reward": 10.7340087890625, "reward_std": 0.905536413192749, "rewards/accuracy_reward/mean": 1.875, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.0906249284744263, "rewards/judge_reward/std": 1.5499756336212158, "rewards/ngrams_iou_reward/mean": 0.27227190136909485, "rewards/ngrams_iou_reward/std": 0.2805827856063843, "rewards/schema_keywords_iou_reward/mean": 0.7648606896400452, "rewards/schema_keywords_iou_reward/std": 0.14335773885250092, "rewards/syntax_reward/mean": 0.9270833134651184, "rewards/syntax_reward/std": 0.2606794238090515, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 228.25521850585938, "completions/mean_terminated_length": 159.14544677734375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 2.33587786259542, "frac_reward_zero_std": 0.0, "grad_norm": 0.8682234883308411, "kl": 0.05126953125, "learning_rate": 1.4366689705879897e-07, "loss": 0.0191, "num_tokens": 176496250.0, "reward": 10.41016960144043, "reward_std": 0.9138315916061401, "rewards/accuracy_reward/mean": 1.78125, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 1.1395832300186157, "rewards/judge_reward/std": 1.5911061763763428, "rewards/ngrams_iou_reward/mean": 0.230524942278862, "rewards/ngrams_iou_reward/std": 0.27751410007476807, "rewards/schema_keywords_iou_reward/mean": 0.707768976688385, "rewards/schema_keywords_iou_reward/std": 0.1954781711101532, "rewards/syntax_reward/mean": 0.9010416865348816, "rewards/syntax_reward/std": 0.2993867099285126, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 226.9010467529297, "completions/mean_terminated_length": 172.6119384765625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 2.339270568278202, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8170960545539856, "kl": 0.050537109375, "learning_rate": 1.422853557763144e-07, "loss": -0.017, "num_tokens": 176767941.0, "reward": 9.841815948486328, "reward_std": 1.1794288158416748, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.2916666269302368, "rewards/judge_reward/std": 1.6437517404556274, "rewards/ngrams_iou_reward/mean": 0.17980702221393585, "rewards/ngrams_iou_reward/std": 0.19772422313690186, "rewards/schema_keywords_iou_reward/mean": 0.6724249720573425, "rewards/schema_keywords_iou_reward/std": 0.1805170178413391, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 223.97396850585938, "completions/mean_terminated_length": 166.88406372070312, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 2.342663273960984, "frac_reward_zero_std": 0.0, "grad_norm": 0.7888423800468445, "kl": 0.0521240234375, "learning_rate": 1.409093864766146e-07, "loss": -0.0069, "num_tokens": 177028492.0, "reward": 9.743616104125977, "reward_std": 1.4579696655273438, "rewards/accuracy_reward/mean": 1.234375, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.728124976158142, "rewards/judge_reward/std": 1.746931552886963, "rewards/ngrams_iou_reward/mean": 0.17708851397037506, "rewards/ngrams_iou_reward/std": 0.18547914922237396, "rewards/schema_keywords_iou_reward/mean": 0.6936106085777283, "rewards/schema_keywords_iou_reward/std": 0.15990734100341797, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 228.734375, "completions/mean_terminated_length": 171.56451416015625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 2.346055979643766, "frac_reward_zero_std": 0.0, "grad_norm": 0.8737409710884094, "kl": 0.0482177734375, "learning_rate": 1.395390105926419e-07, "loss": 0.0115, "num_tokens": 177264049.0, "reward": 9.899486541748047, "reward_std": 1.2919464111328125, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.2218750715255737, "rewards/judge_reward/std": 1.499228596687317, "rewards/ngrams_iou_reward/mean": 0.19721651077270508, "rewards/ngrams_iou_reward/std": 0.24178192019462585, "rewards/schema_keywords_iou_reward/mean": 0.7220607399940491, "rewards/schema_keywords_iou_reward/std": 0.16797226667404175, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 224.5625, "completions/mean_terminated_length": 157.04917907714844, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 2.349448685326548, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8548998832702637, "kl": 0.0478515625, "learning_rate": 1.381742494702115e-07, "loss": 0.019, "num_tokens": 177506209.0, "reward": 9.958273887634277, "reward_std": 1.2610667943954468, "rewards/accuracy_reward/mean": 1.8125, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 0.9114583134651184, "rewards/judge_reward/std": 1.368866205215454, "rewards/ngrams_iou_reward/mean": 0.155812606215477, "rewards/ngrams_iou_reward/std": 0.13623353838920593, "rewards/schema_keywords_iou_reward/mean": 0.6930856704711914, "rewards/schema_keywords_iou_reward/std": 0.18558895587921143, "rewards/syntax_reward/mean": 0.8854166865348816, "rewards/syntax_reward/std": 0.3193511664867401, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 229.9791717529297, "completions/mean_terminated_length": 174.09835815429688, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 2.35284139100933, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7938417196273804, "kl": 0.05120849609375, "learning_rate": 1.3681512436768046e-07, "loss": -0.0034, "num_tokens": 177778749.0, "reward": 9.595041275024414, "reward_std": 1.386001467704773, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.254166603088379, "rewards/judge_reward/std": 1.5607701539993286, "rewards/ngrams_iou_reward/mean": 0.1886753886938095, "rewards/ngrams_iou_reward/std": 0.22449766099452972, "rewards/schema_keywords_iou_reward/mean": 0.6740732789039612, "rewards/schema_keywords_iou_reward/std": 0.1888127326965332, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 227.796875, "completions/mean_terminated_length": 162.63792419433594, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 2.356234096692112, "frac_reward_zero_std": 0.03125, "grad_norm": 1.4370090961456299, "kl": 0.09454345703125, "learning_rate": 1.3546165645561486e-07, "loss": 0.014, "num_tokens": 178030206.0, "reward": 10.734126091003418, "reward_std": 1.4708484411239624, "rewards/accuracy_reward/mean": 1.984375, "rewards/accuracy_reward/std": 1.423352599143982, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 0.9916666150093079, "rewards/judge_reward/std": 1.5722507238388062, "rewards/ngrams_iou_reward/mean": 0.25316330790519714, "rewards/ngrams_iou_reward/std": 0.28949928283691406, "rewards/schema_keywords_iou_reward/mean": 0.7476282715797424, "rewards/schema_keywords_iou_reward/std": 0.16827364265918732, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617491722107, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 223.6875, "completions/mean_terminated_length": 169.8333282470703, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 2.359626802374894, "frac_reward_zero_std": 0.0, "grad_norm": 1.0343058109283447, "kl": 0.0506591796875, "learning_rate": 1.3411386681646164e-07, "loss": 0.0168, "num_tokens": 178302048.0, "reward": 9.675437927246094, "reward_std": 1.3210453987121582, "rewards/accuracy_reward/mean": 1.28125, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.568750023841858, "rewards/judge_reward/std": 1.6527503728866577, "rewards/ngrams_iou_reward/mean": 0.20369863510131836, "rewards/ngrams_iou_reward/std": 0.23388127982616425, "rewards/schema_keywords_iou_reward/mean": 0.6936144828796387, "rewards/schema_keywords_iou_reward/std": 0.18217137455940247, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 232.921875, "completions/mean_terminated_length": 173.94444274902344, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 2.363019508057676, "frac_reward_zero_std": 0.0, "grad_norm": 0.8383036255836487, "kl": 0.04827880859375, "learning_rate": 1.3277177644421923e-07, "loss": -0.0067, "num_tokens": 178558203.0, "reward": 10.211783409118652, "reward_std": 1.1754145622253418, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.2458332777023315, "rewards/judge_reward/std": 1.5910691022872925, "rewards/ngrams_iou_reward/mean": 0.17931543290615082, "rewards/ngrams_iou_reward/std": 0.20156343281269073, "rewards/schema_keywords_iou_reward/mean": 0.7178845405578613, "rewards/schema_keywords_iou_reward/std": 0.16271428763866425, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 228.38021850585938, "completions/mean_terminated_length": 166.11863708496094, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.366412213740458, "frac_reward_zero_std": 0.0, "grad_norm": 0.868706226348877, "kl": 0.0450439453125, "learning_rate": 1.3143540624411058e-07, "loss": 0.0017, "num_tokens": 178813096.0, "reward": 10.9669189453125, "reward_std": 0.9751631021499634, "rewards/accuracy_reward/mean": 1.953125, "rewards/accuracy_reward/std": 1.433660626411438, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.1489583253860474, "rewards/judge_reward/std": 1.6859439611434937, "rewards/ngrams_iou_reward/mean": 0.2751985788345337, "rewards/ngrams_iou_reward/std": 0.3020598888397217, "rewards/schema_keywords_iou_reward/mean": 0.7688034176826477, "rewards/schema_keywords_iou_reward/std": 0.15461908280849457, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 232.45834350585938, "completions/mean_terminated_length": 170.71697998046875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 2.36980491942324, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8008247017860413, "kl": 0.0474853515625, "learning_rate": 1.3010477703225808e-07, "loss": 0.0231, "num_tokens": 179071490.0, "reward": 9.94621467590332, "reward_std": 1.3628435134887695, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.1687499284744263, "rewards/judge_reward/std": 1.5170453786849976, "rewards/ngrams_iou_reward/mean": 0.18399332463741302, "rewards/ngrams_iou_reward/std": 0.1954360157251358, "rewards/schema_keywords_iou_reward/mean": 0.7007617950439453, "rewards/schema_keywords_iou_reward/std": 0.17488126456737518, "rewards/syntax_reward/mean": 0.90625, "rewards/syntax_reward/std": 0.2922426164150238, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 228.63021850585938, "completions/mean_terminated_length": 172.58731079101562, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 2.3731976251060223, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8758860230445862, "kl": 0.05108642578125, "learning_rate": 1.287799095353584e-07, "loss": -0.02, "num_tokens": 179335545.0, "reward": 10.011551856994629, "reward_std": 1.4091638326644897, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.1854166984558105, "rewards/judge_reward/std": 1.5712720155715942, "rewards/ngrams_iou_reward/mean": 0.20084123313426971, "rewards/ngrams_iou_reward/std": 0.2128085345029831, "rewards/schema_keywords_iou_reward/mean": 0.7107097506523132, "rewards/schema_keywords_iou_reward/std": 0.19101271033287048, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 175.3333282470703, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 2.376590330788804, "frac_reward_zero_std": 0.0, "grad_norm": 0.8711112141609192, "kl": 0.0513916015625, "learning_rate": 1.2746082439036114e-07, "loss": -0.01, "num_tokens": 179584611.0, "reward": 10.601289749145508, "reward_std": 1.4305680990219116, "rewards/accuracy_reward/mean": 1.859375, "rewards/accuracy_reward/std": 1.4601210355758667, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.189062476158142, "rewards/judge_reward/std": 1.6493486166000366, "rewards/ngrams_iou_reward/mean": 0.15021224319934845, "rewards/ngrams_iou_reward/std": 0.111853688955307, "rewards/schema_keywords_iou_reward/mean": 0.7135767936706543, "rewards/schema_keywords_iou_reward/std": 0.18054687976837158, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 229.42709350585938, "completions/mean_terminated_length": 168.03448486328125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 2.379983036471586, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7459380626678467, "kl": 0.05316162109375, "learning_rate": 1.2614754214414548e-07, "loss": -0.0124, "num_tokens": 179819455.0, "reward": 10.037909507751465, "reward_std": 1.5783586502075195, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.4760416746139526, "rewards/judge_reward/std": 1.7124882936477661, "rewards/ngrams_iou_reward/mean": 0.1975182294845581, "rewards/ngrams_iou_reward/std": 0.22230957448482513, "rewards/schema_keywords_iou_reward/mean": 0.7216407656669617, "rewards/schema_keywords_iou_reward/std": 0.17364126443862915, "rewards/syntax_reward/mean": 0.7760416865348816, "rewards/syntax_reward/std": 0.41798436641693115, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 236.91146850585938, "completions/mean_terminated_length": 164.375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 2.383375742154368, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7509382367134094, "kl": 0.04656982421875, "learning_rate": 1.2484008325320171e-07, "loss": -0.0049, "num_tokens": 180061544.0, "reward": 10.346017837524414, "reward_std": 1.2580766677856445, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.303125023841858, "rewards/judge_reward/std": 1.664029836654663, "rewards/ngrams_iou_reward/mean": 0.18932688236236572, "rewards/ngrams_iou_reward/std": 0.21058551967144012, "rewards/schema_keywords_iou_reward/mean": 0.7223153710365295, "rewards/schema_keywords_iou_reward/std": 0.1559682935476303, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 225.03646850585938, "completions/mean_terminated_length": 156.9166717529297, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 2.38676844783715, "frac_reward_zero_std": 0.0, "grad_norm": 0.8159397840499878, "kl": 0.0469970703125, "learning_rate": 1.2353846808331152e-07, "loss": -0.0198, "num_tokens": 180339741.0, "reward": 9.762248039245605, "reward_std": 1.3903121948242188, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.1239583492279053, "rewards/judge_reward/std": 1.4405674934387207, "rewards/ngrams_iou_reward/mean": 0.20963692665100098, "rewards/ngrams_iou_reward/std": 0.22898539900779724, "rewards/schema_keywords_iou_reward/mean": 0.7317771911621094, "rewards/schema_keywords_iou_reward/std": 0.1707792580127716, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 232.9947967529297, "completions/mean_terminated_length": 181.13558959960938, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 2.3901611535199323, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8670368194580078, "kl": 0.04949951171875, "learning_rate": 1.2224271690923155e-07, "loss": 0.0188, "num_tokens": 180607430.0, "reward": 9.764484405517578, "reward_std": 1.4129537343978882, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.415624976158142, "rewards/judge_reward/std": 1.5871095657348633, "rewards/ngrams_iou_reward/mean": 0.15427595376968384, "rewards/ngrams_iou_reward/std": 0.14055225253105164, "rewards/schema_keywords_iou_reward/mean": 0.7060418128967285, "rewards/schema_keywords_iou_reward/std": 0.1542356312274933, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709405064582825, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 234.64584350585938, "completions/mean_terminated_length": 178.64151000976562, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 2.393553859202714, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8673786520957947, "kl": 0.0496826171875, "learning_rate": 1.2095284991437733e-07, "loss": -0.0026, "num_tokens": 180877548.0, "reward": 10.135706901550293, "reward_std": 1.1177705526351929, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.2916666269302368, "rewards/judge_reward/std": 1.6264610290527344, "rewards/ngrams_iou_reward/mean": 0.17962723970413208, "rewards/ngrams_iou_reward/std": 0.16155223548412323, "rewards/schema_keywords_iou_reward/mean": 0.6956623196601868, "rewards/schema_keywords_iou_reward/std": 0.15968675911426544, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 227.2760467529297, "completions/mean_terminated_length": 157.5178680419922, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 2.3969465648854964, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9289210438728333, "kl": 0.05633544921875, "learning_rate": 1.1966888719050827e-07, "loss": -0.0162, "num_tokens": 181124921.0, "reward": 10.07658576965332, "reward_std": 1.3057385683059692, "rewards/accuracy_reward/mean": 1.515625, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.40625, "rewards/judge_reward/std": 1.6126340627670288, "rewards/ngrams_iou_reward/mean": 0.17168551683425903, "rewards/ngrams_iou_reward/std": 0.1913423091173172, "rewards/schema_keywords_iou_reward/mean": 0.717399537563324, "rewards/schema_keywords_iou_reward/std": 0.16781795024871826, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374122977256775, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 231.40625, "completions/mean_terminated_length": 170.14544677734375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.4003392705682782, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7959769368171692, "kl": 0.04888916015625, "learning_rate": 1.1839084873741584e-07, "loss": -0.0024, "num_tokens": 181372271.0, "reward": 10.590134620666504, "reward_std": 1.2293829917907715, "rewards/accuracy_reward/mean": 1.890625, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.1177083253860474, "rewards/judge_reward/std": 1.5953134298324585, "rewards/ngrams_iou_reward/mean": 0.1911962628364563, "rewards/ngrams_iou_reward/std": 0.21829001605510712, "rewards/schema_keywords_iou_reward/mean": 0.6947707533836365, "rewards/schema_keywords_iou_reward/std": 0.18152879178524017, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 231.0416717529297, "completions/mean_terminated_length": 171.92982482910156, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 2.40373197625106, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7977736592292786, "kl": 0.04901123046875, "learning_rate": 1.1711875446261093e-07, "loss": 0.024, "num_tokens": 181615549.0, "reward": 9.258485794067383, "reward_std": 1.4227054119110107, "rewards/accuracy_reward/mean": 1.1875, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.5802083015441895, "rewards/judge_reward/std": 1.6977423429489136, "rewards/ngrams_iou_reward/mean": 0.160567507147789, "rewards/ngrams_iou_reward/std": 0.17218703031539917, "rewards/schema_keywords_iou_reward/mean": 0.6614586114883423, "rewards/schema_keywords_iou_reward/std": 0.1867782324552536, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.3697967529297, "completions/mean_terminated_length": 180.6199951171875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 2.4071246819338423, "frac_reward_zero_std": 0.0, "grad_norm": 0.7827903628349304, "kl": 0.0509033203125, "learning_rate": 1.1585262418101466e-07, "loss": -0.0111, "num_tokens": 181863648.0, "reward": 9.654977798461914, "reward_std": 1.4876811504364014, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.2843750715255737, "rewards/judge_reward/std": 1.564048171043396, "rewards/ngrams_iou_reward/mean": 0.1945483684539795, "rewards/ngrams_iou_reward/std": 0.19029931724071503, "rewards/schema_keywords_iou_reward/mean": 0.688553512096405, "rewards/schema_keywords_iou_reward/std": 0.15007208287715912, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 226.671875, "completions/mean_terminated_length": 162.15000915527344, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 2.410517387616624, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7651329636573792, "kl": 0.05133056640625, "learning_rate": 1.1459247761464907e-07, "loss": -0.0027, "num_tokens": 182133039.0, "reward": 10.302032470703125, "reward_std": 1.5651791095733643, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.4614583253860474, "rewards/judge_reward/std": 1.6414059400558472, "rewards/ngrams_iou_reward/mean": 0.16355347633361816, "rewards/ngrams_iou_reward/std": 0.1680763214826584, "rewards/schema_keywords_iou_reward/mean": 0.7311857342720032, "rewards/schema_keywords_iou_reward/std": 0.15828078985214233, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 235.6354217529297, "completions/mean_terminated_length": 179.33334350585938, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 2.4139100932994064, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8191847205162048, "kl": 0.05010986328125, "learning_rate": 1.1333833439233053e-07, "loss": 0.0164, "num_tokens": 182395193.0, "reward": 9.964228630065918, "reward_std": 1.1865429878234863, "rewards/accuracy_reward/mean": 1.515625, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.399999976158142, "rewards/judge_reward/std": 1.6097867488861084, "rewards/ngrams_iou_reward/mean": 0.18209922313690186, "rewards/ngrams_iou_reward/std": 0.19531263411045074, "rewards/schema_keywords_iou_reward/mean": 0.7060859799385071, "rewards/schema_keywords_iou_reward/std": 0.1719299852848053, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 238.33334350585938, "completions/mean_terminated_length": 175.23809814453125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 2.4173027989821882, "frac_reward_zero_std": 0.0, "grad_norm": 0.7390357851982117, "kl": 0.05181884765625, "learning_rate": 1.1209021404936303e-07, "loss": 0.0118, "num_tokens": 182659551.0, "reward": 10.194314956665039, "reward_std": 1.2547211647033691, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.5520833730697632, "rewards/judge_reward/std": 1.7786974906921387, "rewards/ngrams_iou_reward/mean": 0.21080708503723145, "rewards/ngrams_iou_reward/std": 0.2271052896976471, "rewards/schema_keywords_iou_reward/mean": 0.7074654698371887, "rewards/schema_keywords_iou_reward/std": 0.19103506207466125, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.06771850585938, "completions/mean_terminated_length": 172.203125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 2.4206955046649705, "frac_reward_zero_std": 0.0, "grad_norm": 0.7652490139007568, "kl": 0.04840087890625, "learning_rate": 1.1084813602723514e-07, "loss": 0.0006, "num_tokens": 182915278.0, "reward": 9.814687728881836, "reward_std": 1.1752740144729614, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.3979167938232422, "rewards/judge_reward/std": 1.5988200902938843, "rewards/ngrams_iou_reward/mean": 0.1953461766242981, "rewards/ngrams_iou_reward/std": 0.22362634539604187, "rewards/schema_keywords_iou_reward/mean": 0.6932992935180664, "rewards/schema_keywords_iou_reward/std": 0.18533997237682343, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 231.984375, "completions/mean_terminated_length": 172.16363525390625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 2.4240882103477523, "frac_reward_zero_std": 0.0, "grad_norm": 0.8301250338554382, "kl": 0.04754638671875, "learning_rate": 1.0961211967331596e-07, "loss": -0.0074, "num_tokens": 183199363.0, "reward": 9.52551555633545, "reward_std": 1.749449610710144, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.1385416984558105, "rewards/judge_reward/std": 1.546681523323059, "rewards/ngrams_iou_reward/mean": 0.18115724623203278, "rewards/ngrams_iou_reward/std": 0.23749279975891113, "rewards/schema_keywords_iou_reward/mean": 0.6818572878837585, "rewards/schema_keywords_iou_reward/std": 0.20238301157951355, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 230.1875, "completions/mean_terminated_length": 169.05262756347656, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 2.427480916030534, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7999146580696106, "kl": 0.05682373046875, "learning_rate": 1.0838218424055479e-07, "loss": 0.0203, "num_tokens": 183454951.0, "reward": 9.22420883178711, "reward_std": 1.6591280698776245, "rewards/accuracy_reward/mean": 1.125, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12434382736682892, "rewards/judge_reward/mean": 1.595833420753479, "rewards/judge_reward/std": 1.5747960805892944, "rewards/ngrams_iou_reward/mean": 0.20868517458438873, "rewards/ngrams_iou_reward/std": 0.2435658574104309, "rewards/schema_keywords_iou_reward/mean": 0.704064667224884, "rewards/schema_keywords_iou_reward/std": 0.16488176584243774, "rewards/syntax_reward/mean": 0.7604166865348816, "rewards/syntax_reward/std": 0.427945077419281, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.3072967529297, "completions/mean_terminated_length": 175.13113403320312, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 2.4308736217133164, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8972803950309753, "kl": 0.0540771484375, "learning_rate": 1.0715834888718072e-07, "loss": 0.017, "num_tokens": 183728124.0, "reward": 9.056427001953125, "reward_std": 1.7211236953735352, "rewards/accuracy_reward/mean": 1.234375, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.3729166984558105, "rewards/judge_reward/std": 1.5137746334075928, "rewards/ngrams_iou_reward/mean": 0.17943866550922394, "rewards/ngrams_iou_reward/std": 0.19237889349460602, "rewards/schema_keywords_iou_reward/mean": 0.6624035835266113, "rewards/schema_keywords_iou_reward/std": 0.1901015192270279, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.90625, "completions/mean_terminated_length": 178.11538696289062, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 2.4342663273960983, "frac_reward_zero_std": 0.0, "grad_norm": 1.0036295652389526, "kl": 0.0472412109375, "learning_rate": 1.0594063267640385e-07, "loss": 0.009, "num_tokens": 183981246.0, "reward": 9.552529335021973, "reward_std": 1.6266282796859741, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.29938673973083496, "rewards/judge_reward/mean": 1.2260416746139526, "rewards/judge_reward/std": 1.4965051412582397, "rewards/ngrams_iou_reward/mean": 0.1947314739227295, "rewards/ngrams_iou_reward/std": 0.18734505772590637, "rewards/schema_keywords_iou_reward/mean": 0.681755542755127, "rewards/schema_keywords_iou_reward/std": 0.17279082536697388, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 227.2760467529297, "completions/mean_terminated_length": 159.24562072753906, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 2.4376590330788805, "frac_reward_zero_std": 0.09375, "grad_norm": 0.8468599915504456, "kl": 0.04742431640625, "learning_rate": 1.0472905457611935e-07, "loss": 0.0124, "num_tokens": 184227095.0, "reward": 10.450754165649414, "reward_std": 1.128627061843872, "rewards/accuracy_reward/mean": 1.703125, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.2729166746139526, "rewards/judge_reward/std": 1.639721393585205, "rewards/ngrams_iou_reward/mean": 0.28056037425994873, "rewards/ngrams_iou_reward/std": 0.31126490235328674, "rewards/schema_keywords_iou_reward/mean": 0.7545683979988098, "rewards/schema_keywords_iou_reward/std": 0.17772221565246582, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 233.9947967529297, "completions/mean_terminated_length": 176.28302001953125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 2.4410517387616624, "frac_reward_zero_std": 0.0, "grad_norm": 0.7864798903465271, "kl": 0.049072265625, "learning_rate": 1.0352363345861065e-07, "loss": 0.0064, "num_tokens": 184481056.0, "reward": 10.130729675292969, "reward_std": 1.693893313407898, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.225000023841858, "rewards/judge_reward/std": 1.5853385925292969, "rewards/ngrams_iou_reward/mean": 0.21868503093719482, "rewards/ngrams_iou_reward/std": 0.23509831726551056, "rewards/schema_keywords_iou_reward/mean": 0.7380854487419128, "rewards/schema_keywords_iou_reward/std": 0.1845288723707199, "rewards/syntax_reward/mean": 0.890625, "rewards/syntax_reward/std": 0.3129251003265381, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 173.46340942382812, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 2.4444444444444446, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8299030065536499, "kl": 0.05426025390625, "learning_rate": 1.0232438810025728e-07, "loss": 0.0051, "num_tokens": 184725232.0, "reward": 9.906766891479492, "reward_std": 1.3348917961120605, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.306249976158142, "rewards/judge_reward/std": 1.650626540184021, "rewards/ngrams_iou_reward/mean": 0.17571872472763062, "rewards/ngrams_iou_reward/std": 0.14696857333183289, "rewards/schema_keywords_iou_reward/mean": 0.7175058722496033, "rewards/schema_keywords_iou_reward/std": 0.1605502963066101, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.94271850585938, "completions/mean_terminated_length": 174.5576934814453, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 2.4478371501272265, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7918272018432617, "kl": 0.0528564453125, "learning_rate": 1.0113133718124034e-07, "loss": 0.0044, "num_tokens": 184988717.0, "reward": 9.9998779296875, "reward_std": 1.4744625091552734, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.548958420753479, "rewards/judge_reward/std": 1.6790982484817505, "rewards/ngrams_iou_reward/mean": 0.17415569722652435, "rewards/ngrams_iou_reward/std": 0.1638064682483673, "rewards/schema_keywords_iou_reward/mean": 0.7173891067504883, "rewards/schema_keywords_iou_reward/std": 0.19489338994026184, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 227.4635467529297, "completions/mean_terminated_length": 158.1607208251953, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 2.4512298558100083, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8857631087303162, "kl": 0.0477294921875, "learning_rate": 9.994449928525323e-08, "loss": 0.0257, "num_tokens": 185268280.0, "reward": 10.480490684509277, "reward_std": 1.1980013847351074, "rewards/accuracy_reward/mean": 1.8125, "rewards/accuracy_reward/std": 1.4709222316741943, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.1270833015441895, "rewards/judge_reward/std": 1.551117181777954, "rewards/ngrams_iou_reward/mean": 0.24501590430736542, "rewards/ngrams_iou_reward/std": 0.30332469940185547, "rewards/schema_keywords_iou_reward/mean": 0.7156825661659241, "rewards/schema_keywords_iou_reward/std": 0.20725055038928986, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 238.69271850585938, "completions/mean_terminated_length": 176.88095092773438, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.4546225614927906, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7390483617782593, "kl": 0.04644775390625, "learning_rate": 9.876389289921105e-08, "loss": 0.0063, "num_tokens": 185505521.0, "reward": 10.068916320800781, "reward_std": 1.2707825899124146, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.642187476158142, "rewards/judge_reward/std": 1.7016595602035522, "rewards/ngrams_iou_reward/mean": 0.20437967777252197, "rewards/ngrams_iou_reward/std": 0.22693930566310883, "rewards/schema_keywords_iou_reward/mean": 0.7416195869445801, "rewards/schema_keywords_iou_reward/std": 0.14681266248226166, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 231.9010467529297, "completions/mean_terminated_length": 173.37501525878906, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.4580152671755724, "frac_reward_zero_std": 0.0, "grad_norm": 0.7855750322341919, "kl": 0.0531005859375, "learning_rate": 9.75895364129633e-08, "loss": 0.027, "num_tokens": 185761582.0, "reward": 10.09444522857666, "reward_std": 1.2973158359527588, "rewards/accuracy_reward/mean": 1.328125, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.7208333015441895, "rewards/judge_reward/std": 1.7439148426055908, "rewards/ngrams_iou_reward/mean": 0.18570345640182495, "rewards/ngrams_iou_reward/std": 0.20127998292446136, "rewards/schema_keywords_iou_reward/mean": 0.7222834229469299, "rewards/schema_keywords_iou_reward/std": 0.15503527224063873, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709405064582825, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 234.78646850585938, "completions/mean_terminated_length": 165.4888916015625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 2.4614079728583547, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9458563327789307, "kl": 0.0550537109375, "learning_rate": 9.642144811900737e-08, "loss": -0.0007, "num_tokens": 186004367.0, "reward": 10.42463207244873, "reward_std": 1.7838408946990967, "rewards/accuracy_reward/mean": 1.90625, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.004166603088379, "rewards/judge_reward/std": 1.5933051109313965, "rewards/ngrams_iou_reward/mean": 0.208288311958313, "rewards/ngrams_iou_reward/std": 0.21384936571121216, "rewards/schema_keywords_iou_reward/mean": 0.681968629360199, "rewards/schema_keywords_iou_reward/std": 0.1991550177335739, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617789745331, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 228.5260467529297, "completions/mean_terminated_length": 158.3148193359375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 2.4648006785411365, "frac_reward_zero_std": 0.0, "grad_norm": 0.8666093349456787, "kl": 0.04827880859375, "learning_rate": 9.5259646212203e-08, "loss": 0.0053, "num_tokens": 186256540.0, "reward": 10.028688430786133, "reward_std": 1.1719458103179932, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.0395833253860474, "rewards/judge_reward/std": 1.4620132446289062, "rewards/ngrams_iou_reward/mean": 0.16056805849075317, "rewards/ngrams_iou_reward/std": 0.14549681544303894, "rewards/schema_keywords_iou_reward/mean": 0.6795775294303894, "rewards/schema_keywords_iou_reward/std": 0.1821286678314209, "rewards/syntax_reward/mean": 0.8958333134651184, "rewards/syntax_reward/std": 0.30627527832984924, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 232.73959350585938, "completions/mean_terminated_length": 176.25001525878906, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 2.4681933842239188, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8658123016357422, "kl": 0.0506591796875, "learning_rate": 9.410414878948975e-08, "loss": -0.0004, "num_tokens": 186532670.0, "reward": 9.498924255371094, "reward_std": 1.3271961212158203, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.3729166984558105, "rewards/judge_reward/std": 1.5401419401168823, "rewards/ngrams_iou_reward/mean": 0.20621748268604279, "rewards/ngrams_iou_reward/std": 0.1968972086906433, "rewards/schema_keywords_iou_reward/mean": 0.6979143619537354, "rewards/schema_keywords_iou_reward/std": 0.18562161922454834, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 231.6510467529297, "completions/mean_terminated_length": 167.79244995117188, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 2.4715860899067006, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8021472692489624, "kl": 0.0484619140625, "learning_rate": 9.295497384960415e-08, "loss": 0.0178, "num_tokens": 186790855.0, "reward": 9.183795928955078, "reward_std": 1.7615742683410645, "rewards/accuracy_reward/mean": 1.09375, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.6572917699813843, "rewards/judge_reward/std": 1.6208875179290771, "rewards/ngrams_iou_reward/mean": 0.19544291496276855, "rewards/ngrams_iou_reward/std": 0.2379842847585678, "rewards/schema_keywords_iou_reward/mean": 0.658145010471344, "rewards/schema_keywords_iou_reward/std": 0.20171964168548584, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 235.7760467529297, "completions/mean_terminated_length": 175.1041717529297, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 2.4749787955894824, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7414120435714722, "kl": 0.04742431640625, "learning_rate": 9.181213929280046e-08, "loss": -0.0025, "num_tokens": 187067934.0, "reward": 9.736311912536621, "reward_std": 1.0463472604751587, "rewards/accuracy_reward/mean": 1.328125, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.5322917699813843, "rewards/judge_reward/std": 1.5739364624023438, "rewards/ngrams_iou_reward/mean": 0.15865731239318848, "rewards/ngrams_iou_reward/std": 0.1309078186750412, "rewards/schema_keywords_iou_reward/mean": 0.700570821762085, "rewards/schema_keywords_iou_reward/std": 0.1518615335226059, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374122977256775, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 235.9010467529297, "completions/mean_terminated_length": 168.2954559326172, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.4783715012722647, "frac_reward_zero_std": 0.0, "grad_norm": 0.7950136661529541, "kl": 0.0594482421875, "learning_rate": 9.067566292057083e-08, "loss": 0.0128, "num_tokens": 187316189.0, "reward": 10.41788387298584, "reward_std": 1.1885883808135986, "rewards/accuracy_reward/mean": 1.90625, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 0.9677083492279053, "rewards/judge_reward/std": 1.4990394115447998, "rewards/ngrams_iou_reward/mean": 0.18768586218357086, "rewards/ngrams_iou_reward/std": 0.21018990874290466, "rewards/schema_keywords_iou_reward/mean": 0.7166552543640137, "rewards/schema_keywords_iou_reward/std": 0.179171621799469, "rewards/syntax_reward/mean": 0.8958333134651184, "rewards/syntax_reward/std": 0.30627527832984924, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 229.078125, "completions/mean_terminated_length": 169.85000610351562, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 2.4817642069550465, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8846349716186523, "kl": 0.04913330078125, "learning_rate": 8.954556243536875e-08, "loss": 0.0167, "num_tokens": 187574600.0, "reward": 10.70096492767334, "reward_std": 0.9977569580078125, "rewards/accuracy_reward/mean": 1.953125, "rewards/accuracy_reward/std": 1.433660626411438, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.040624976158142, "rewards/judge_reward/std": 1.5605796575546265, "rewards/ngrams_iou_reward/mean": 0.20478315651416779, "rewards/ngrams_iou_reward/std": 0.2366413176059723, "rewards/schema_keywords_iou_reward/mean": 0.7013899683952332, "rewards/schema_keywords_iou_reward/std": 0.16342611610889435, "rewards/syntax_reward/mean": 0.8854166865348816, "rewards/syntax_reward/std": 0.3193511664867401, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 232.4010467529297, "completions/mean_terminated_length": 165.37998962402344, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 2.485156912637829, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671582341194153, "kl": 0.0496826171875, "learning_rate": 8.842185544033254e-08, "loss": 0.005, "num_tokens": 187819573.0, "reward": 10.729612350463867, "reward_std": 1.1695783138275146, "rewards/accuracy_reward/mean": 1.875, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.160416603088379, "rewards/judge_reward/std": 1.6118980646133423, "rewards/ngrams_iou_reward/mean": 0.2034282237291336, "rewards/ngrams_iou_reward/std": 0.21817801892757416, "rewards/schema_keywords_iou_reward/mean": 0.7157654762268066, "rewards/schema_keywords_iou_reward/std": 0.1572578400373459, "rewards/syntax_reward/mean": 0.9322916865348816, "rewards/syntax_reward/std": 0.2519015669822693, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 230.59375, "completions/mean_terminated_length": 173.32203674316406, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 2.4885496183206106, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7784700989723206, "kl": 0.0498046875, "learning_rate": 8.730455943901199e-08, "loss": 0.0092, "num_tokens": 188078119.0, "reward": 10.42100715637207, "reward_std": 1.2135558128356934, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.2260416746139526, "rewards/judge_reward/std": 1.6267235279083252, "rewards/ngrams_iou_reward/mean": 0.1774642914533615, "rewards/ngrams_iou_reward/std": 0.15948839485645294, "rewards/schema_keywords_iou_reward/mean": 0.7237496376037598, "rewards/schema_keywords_iou_reward/std": 0.14423418045043945, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 232.390625, "completions/mean_terminated_length": 165.33999633789062, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 2.491942324003393, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9325762987136841, "kl": 0.06427001953125, "learning_rate": 8.619369183509501e-08, "loss": 0.0081, "num_tokens": 188361838.0, "reward": 10.135982513427734, "reward_std": 1.3501394987106323, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.2927082777023315, "rewards/judge_reward/std": 1.5754493474960327, "rewards/ngrams_iou_reward/mean": 0.2360241860151291, "rewards/ngrams_iou_reward/std": 0.2756761610507965, "rewards/schema_keywords_iou_reward/mean": 0.7207911014556885, "rewards/schema_keywords_iou_reward/std": 0.18753254413604736, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 230.5729217529297, "completions/mean_terminated_length": 165.59259033203125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 2.4953350296861747, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8532559871673584, "kl": 0.04730224609375, "learning_rate": 8.508926993213711e-08, "loss": -0.023, "num_tokens": 188617248.0, "reward": 10.076135635375977, "reward_std": 1.402830958366394, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.453125, "rewards/judge_reward/std": 1.663243055343628, "rewards/ngrams_iou_reward/mean": 0.20258276164531708, "rewards/ngrams_iou_reward/std": 0.23973603546619415, "rewards/schema_keywords_iou_reward/mean": 0.722510039806366, "rewards/schema_keywords_iou_reward/std": 0.17342884838581085, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709405064582825, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 227.421875, "completions/mean_terminated_length": 159.73684692382812, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.4987277353689565, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7328648567199707, "kl": 0.044677734375, "learning_rate": 8.399131093329159e-08, "loss": 0.006, "num_tokens": 188874237.0, "reward": 10.096244812011719, "reward_std": 1.0979865789413452, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.4375, "rewards/judge_reward/std": 1.6288788318634033, "rewards/ngrams_iou_reward/mean": 0.27565690875053406, "rewards/ngrams_iou_reward/std": 0.26519760489463806, "rewards/schema_keywords_iou_reward/mean": 0.7424624562263489, "rewards/schema_keywords_iou_reward/std": 0.17983436584472656, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 229.4635467529297, "completions/mean_terminated_length": 171.08334350585938, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.502120441051739, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7401113510131836, "kl": 0.0509033203125, "learning_rate": 8.289983194104127e-08, "loss": -0.0104, "num_tokens": 189142172.0, "reward": 10.116644859313965, "reward_std": 1.1442856788635254, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.3458333015441895, "rewards/judge_reward/std": 1.6290502548217773, "rewards/ngrams_iou_reward/mean": 0.18240070343017578, "rewards/ngrams_iou_reward/std": 0.19563870131969452, "rewards/schema_keywords_iou_reward/mean": 0.690493106842041, "rewards/schema_keywords_iou_reward/std": 0.154254749417305, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 236.2447967529297, "completions/mean_terminated_length": 180.13999938964844, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.5055131467345206, "frac_reward_zero_std": 0.0, "grad_norm": 0.8020886778831482, "kl": 0.0443115234375, "learning_rate": 8.181484995693295e-08, "loss": 0.0156, "num_tokens": 189393037.0, "reward": 9.678207397460938, "reward_std": 1.3753570318222046, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.513541579246521, "rewards/judge_reward/std": 1.6877210140228271, "rewards/ngrams_iou_reward/mean": 0.14455099403858185, "rewards/ngrams_iou_reward/std": 0.13190338015556335, "rewards/schema_keywords_iou_reward/mean": 0.6784467101097107, "rewards/schema_keywords_iou_reward/std": 0.1740284264087677, "rewards/syntax_reward/mean": 0.7760416865348816, "rewards/syntax_reward/std": 0.41798436641693115, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 222.6354217529297, "completions/mean_terminated_length": 158.93939208984375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.508905852417303, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8476444482803345, "kl": 0.048583984375, "learning_rate": 8.073638188131127e-08, "loss": -0.0036, "num_tokens": 189630549.0, "reward": 10.952262878417969, "reward_std": 0.7654959559440613, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 1.4432151317596436, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319953322410583, "rewards/judge_reward/mean": 1.0968749523162842, "rewards/judge_reward/std": 1.6032657623291016, "rewards/ngrams_iou_reward/mean": 0.31224653124809265, "rewards/ngrams_iou_reward/std": 0.33729004859924316, "rewards/schema_keywords_iou_reward/mean": 0.7743914723396301, "rewards/schema_keywords_iou_reward/std": 0.16939449310302734, "rewards/syntax_reward/mean": 0.9270833134651184, "rewards/syntax_reward/std": 0.2606794238090515, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 225.42709350585938, "completions/mean_terminated_length": 164.28125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 2.5122985581000847, "frac_reward_zero_std": 0.0, "grad_norm": 0.7487633228302002, "kl": 0.05181884765625, "learning_rate": 7.966444451305727e-08, "loss": 0.0007, "num_tokens": 189914269.0, "reward": 9.275732040405273, "reward_std": 1.5130650997161865, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.3625000715255737, "rewards/judge_reward/std": 1.528319001197815, "rewards/ngrams_iou_reward/mean": 0.17545919120311737, "rewards/ngrams_iou_reward/std": 0.18279673159122467, "rewards/schema_keywords_iou_reward/mean": 0.672147274017334, "rewards/schema_keywords_iou_reward/std": 0.17726537585258484, "rewards/syntax_reward/mean": 0.7395833134651184, "rewards/syntax_reward/std": 0.44000932574272156, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.64584350585938, "completions/mean_terminated_length": 171.5454559326172, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 2.515691263782867, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8354591131210327, "kl": 0.046142578125, "learning_rate": 7.85990545493247e-08, "loss": 0.0131, "num_tokens": 190158755.0, "reward": 9.941486358642578, "reward_std": 1.0221259593963623, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.1687499284744263, "rewards/judge_reward/std": 1.480010747909546, "rewards/ngrams_iou_reward/mean": 0.18463446199893951, "rewards/ngrams_iou_reward/std": 0.22629866003990173, "rewards/schema_keywords_iou_reward/mean": 0.6901843547821045, "rewards/schema_keywords_iou_reward/std": 0.1758948415517807, "rewards/syntax_reward/mean": 0.890625, "rewards/syntax_reward/std": 0.3129251003265381, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.8854217529297, "completions/mean_terminated_length": 173.7941131591797, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 2.519083969465649, "frac_reward_zero_std": 0.0625, "grad_norm": 0.931840717792511, "kl": 0.04803466796875, "learning_rate": 7.754022858528158e-08, "loss": 0.0026, "num_tokens": 190404421.0, "reward": 9.948182106018066, "reward_std": 1.2587679624557495, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967556834220886, "rewards/judge_reward/mean": 1.2041666507720947, "rewards/judge_reward/std": 1.489081859588623, "rewards/ngrams_iou_reward/mean": 0.18103145062923431, "rewards/ngrams_iou_reward/std": 0.2149311602115631, "rewards/schema_keywords_iou_reward/mean": 0.6973579525947571, "rewards/schema_keywords_iou_reward/std": 0.1836036890745163, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 172.375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 2.5224766751484307, "frac_reward_zero_std": 0.0, "grad_norm": 0.8106144070625305, "kl": 0.05120849609375, "learning_rate": 7.648798311385058e-08, "loss": 0.0126, "num_tokens": 190671601.0, "reward": 10.091999053955078, "reward_std": 1.1760435104370117, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.4729167222976685, "rewards/judge_reward/std": 1.6918950080871582, "rewards/ngrams_iou_reward/mean": 0.19138550758361816, "rewards/ngrams_iou_reward/std": 0.18453297019004822, "rewards/schema_keywords_iou_reward/mean": 0.7151969075202942, "rewards/schema_keywords_iou_reward/std": 0.13989311456680298, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 231.28125, "completions/mean_terminated_length": 171.25001525878906, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 2.525869380831213, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7405107021331787, "kl": 0.051513671875, "learning_rate": 7.544233452545296e-08, "loss": 0.0091, "num_tokens": 190929169.0, "reward": 10.381233215332031, "reward_std": 1.3553276062011719, "rewards/accuracy_reward/mean": 1.9375, "rewards/accuracy_reward/std": 1.4385310411453247, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 0.893750011920929, "rewards/judge_reward/std": 1.4027506113052368, "rewards/ngrams_iou_reward/mean": 0.26514920592308044, "rewards/ngrams_iou_reward/std": 0.2802067995071411, "rewards/schema_keywords_iou_reward/mean": 0.7348330616950989, "rewards/schema_keywords_iou_reward/std": 0.1816176474094391, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 229.67709350585938, "completions/mean_terminated_length": 180.56715393066406, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.5292620865139948, "frac_reward_zero_std": 0.0, "grad_norm": 0.7967899441719055, "kl": 0.05279541015625, "learning_rate": 7.440329910775272e-08, "loss": 0.0267, "num_tokens": 191196869.0, "reward": 9.37468147277832, "reward_std": 1.4176892042160034, "rewards/accuracy_reward/mean": 1.015625, "rewards/accuracy_reward/std": 1.423352599143982, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.926041603088379, "rewards/judge_reward/std": 1.628203272819519, "rewards/ngrams_iou_reward/mean": 0.1339763104915619, "rewards/ngrams_iou_reward/std": 0.16137543320655823, "rewards/schema_keywords_iou_reward/mean": 0.6334131360054016, "rewards/schema_keywords_iou_reward/std": 0.17531397938728333, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 231.31771850585938, "completions/mean_terminated_length": 174.29310607910156, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.532654792196777, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7942616939544678, "kl": 0.05572509765625, "learning_rate": 7.3370893045403e-08, "loss": 0.0094, "num_tokens": 191470542.0, "reward": 9.718475341796875, "reward_std": 1.181032419204712, "rewards/accuracy_reward/mean": 1.390625, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.4520832300186157, "rewards/judge_reward/std": 1.6094602346420288, "rewards/ngrams_iou_reward/mean": 0.23152406513690948, "rewards/ngrams_iou_reward/std": 0.25633272528648376, "rewards/schema_keywords_iou_reward/mean": 0.6973665356636047, "rewards/schema_keywords_iou_reward/std": 0.1958904266357422, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 220.1666717529297, "completions/mean_terminated_length": 165.4736785888672, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 2.536047497879559, "frac_reward_zero_std": 0.0, "grad_norm": 0.8969821333885193, "kl": 0.053466796875, "learning_rate": 7.234513241979418e-08, "loss": 0.0298, "num_tokens": 191754968.0, "reward": 9.945701599121094, "reward_std": 1.2188729047775269, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.1218749284744263, "rewards/judge_reward/std": 1.5052927732467651, "rewards/ngrams_iou_reward/mean": 0.227716326713562, "rewards/ngrams_iou_reward/std": 0.24465829133987427, "rewards/schema_keywords_iou_reward/mean": 0.724234402179718, "rewards/schema_keywords_iou_reward/std": 0.18387117981910706, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 177.53846740722656, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.539440203562341, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8332445025444031, "kl": 0.05072021484375, "learning_rate": 7.132603320880293e-08, "loss": 0.0238, "num_tokens": 192006950.0, "reward": 10.584478378295898, "reward_std": 0.7771739363670349, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.4572917222976685, "rewards/judge_reward/std": 1.7451951503753662, "rewards/ngrams_iou_reward/mean": 0.1938677281141281, "rewards/ngrams_iou_reward/std": 0.17534872889518738, "rewards/schema_keywords_iou_reward/mean": 0.7312354445457458, "rewards/schema_keywords_iou_reward/std": 0.14410217106342316, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 219.4166717529297, "completions/mean_terminated_length": 158.44444274902344, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 2.542832909245123, "frac_reward_zero_std": 0.0, "grad_norm": 1.1948962211608887, "kl": 0.06610107421875, "learning_rate": 7.0313611286544e-08, "loss": 0.0255, "num_tokens": 192281140.0, "reward": 10.523789405822754, "reward_std": 1.4678176641464233, "rewards/accuracy_reward/mean": 1.875, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12434382736682892, "rewards/judge_reward/mean": 1.053125023841858, "rewards/judge_reward/std": 1.5538805723190308, "rewards/ngrams_iou_reward/mean": 0.24072913825511932, "rewards/ngrams_iou_reward/std": 0.273495078086853, "rewards/schema_keywords_iou_reward/mean": 0.723685085773468, "rewards/schema_keywords_iou_reward/std": 0.18855777382850647, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 232.765625, "completions/mean_terminated_length": 173.38888549804688, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 2.546225614927905, "frac_reward_zero_std": 0.0625, "grad_norm": 0.794619083404541, "kl": 0.04937744140625, "learning_rate": 6.930788242312252e-08, "loss": 0.0149, "num_tokens": 192534115.0, "reward": 10.028999328613281, "reward_std": 0.9106936454772949, "rewards/accuracy_reward/mean": 1.265625, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12434382736682892, "rewards/judge_reward/mean": 1.75, "rewards/judge_reward/std": 1.7639793157577515, "rewards/ngrams_iou_reward/mean": 0.1746664047241211, "rewards/ngrams_iou_reward/std": 0.18688349425792694, "rewards/schema_keywords_iou_reward/mean": 0.7241244316101074, "rewards/schema_keywords_iou_reward/std": 0.16441798210144043, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 228.5885467529297, "completions/mean_terminated_length": 178.60293579101562, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 2.549618320610687, "frac_reward_zero_std": 0.0, "grad_norm": 0.8529637455940247, "kl": 0.0672607421875, "learning_rate": 6.830886228438837e-08, "loss": 0.0324, "num_tokens": 192794484.0, "reward": 9.831564903259277, "reward_std": 1.4250506162643433, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.271875023841858, "rewards/judge_reward/std": 1.6177161931991577, "rewards/ngrams_iou_reward/mean": 0.20713980495929718, "rewards/ngrams_iou_reward/std": 0.2191702127456665, "rewards/schema_keywords_iou_reward/mean": 0.7004660964012146, "rewards/schema_keywords_iou_reward/std": 0.19089579582214355, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.387094110250473, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 225.55209350585938, "completions/mean_terminated_length": 172.4857177734375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.553011026293469, "frac_reward_zero_std": 0.0, "grad_norm": 0.8676683306694031, "kl": 0.05438232421875, "learning_rate": 6.731656643169203e-08, "loss": 0.0155, "num_tokens": 193066144.0, "reward": 10.06049919128418, "reward_std": 1.2012767791748047, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.3697916269302368, "rewards/judge_reward/std": 1.6473758220672607, "rewards/ngrams_iou_reward/mean": 0.15532894432544708, "rewards/ngrams_iou_reward/std": 0.1523953378200531, "rewards/schema_keywords_iou_reward/mean": 0.7124611735343933, "rewards/schema_keywords_iou_reward/std": 0.15182259678840637, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 227.2760467529297, "completions/mean_terminated_length": 171.15383911132812, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.556403731976251, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8062768578529358, "kl": 0.052978515625, "learning_rate": 6.633101032164273e-08, "loss": 0.0271, "num_tokens": 193349883.0, "reward": 9.640575408935547, "reward_std": 1.397627353668213, "rewards/accuracy_reward/mean": 1.234375, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.6260417699813843, "rewards/judge_reward/std": 1.6005066633224487, "rewards/ngrams_iou_reward/mean": 0.22394245862960815, "rewards/ngrams_iou_reward/std": 0.23644596338272095, "rewards/schema_keywords_iou_reward/mean": 0.721841037273407, "rewards/schema_keywords_iou_reward/std": 0.15517859160900116, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709405064582825, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 223.4375, "completions/mean_terminated_length": 165.3913116455078, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 2.559796437659033, "frac_reward_zero_std": 0.0, "grad_norm": 0.8248627781867981, "kl": 0.05535888671875, "learning_rate": 6.535220930586704e-08, "loss": -0.005, "num_tokens": 193625871.0, "reward": 9.95461368560791, "reward_std": 1.8219504356384277, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237156867981, "rewards/judge_reward/mean": 1.134895920753479, "rewards/judge_reward/std": 1.5239728689193726, "rewards/ngrams_iou_reward/mean": 0.2223329097032547, "rewards/ngrams_iou_reward/std": 0.264505535364151, "rewards/schema_keywords_iou_reward/mean": 0.7020712494850159, "rewards/schema_keywords_iou_reward/std": 0.18303938210010529, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 222.609375, "completions/mean_terminated_length": 164.41429138183594, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 2.5631891433418152, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9885637164115906, "kl": 0.05072021484375, "learning_rate": 6.438017863077022e-08, "loss": -0.015, "num_tokens": 193882350.0, "reward": 9.718646049499512, "reward_std": 0.9473025798797607, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.4812501668930054, "rewards/judge_reward/std": 1.6616594791412354, "rewards/ngrams_iou_reward/mean": 0.22844572365283966, "rewards/ngrams_iou_reward/std": 0.24667072296142578, "rewards/schema_keywords_iou_reward/mean": 0.7204082012176514, "rewards/schema_keywords_iou_reward/std": 0.15711437165737152, "rewards/syntax_reward/mean": 0.7552083134651184, "rewards/syntax_reward/std": 0.4310877025127411, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.9166717529297, "completions/mean_terminated_length": 164.94544982910156, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 2.566581849024597, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8726136684417725, "kl": 0.0572509765625, "learning_rate": 6.341493343729853e-08, "loss": -0.0145, "num_tokens": 194145170.0, "reward": 9.858200073242188, "reward_std": 1.1119365692138672, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.4614583253860474, "rewards/judge_reward/std": 1.6630804538726807, "rewards/ngrams_iou_reward/mean": 0.19695182144641876, "rewards/ngrams_iou_reward/std": 0.19910812377929688, "rewards/schema_keywords_iou_reward/mean": 0.7018726468086243, "rewards/schema_keywords_iou_reward/std": 0.16713179647922516, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.28125, "completions/max_length": 256.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 223.328125, "completions/mean_terminated_length": 170.06849670410156, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 2.569974554707379, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9962001442909241, "kl": 0.0517578125, "learning_rate": 6.24564887607032e-08, "loss": -0.0166, "num_tokens": 194398247.0, "reward": 10.280694961547852, "reward_std": 1.7415876388549805, "rewards/accuracy_reward/mean": 1.734375, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.1791666746139526, "rewards/judge_reward/std": 1.5725336074829102, "rewards/ngrams_iou_reward/mean": 0.1677711457014084, "rewards/ngrams_iou_reward/std": 0.15453937649726868, "rewards/schema_keywords_iou_reward/mean": 0.7025066018104553, "rewards/schema_keywords_iou_reward/std": 0.17257948219776154, "rewards/syntax_reward/mean": 0.890625, "rewards/syntax_reward/std": 0.3129251003265381, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 228.5572967529297, "completions/mean_terminated_length": 168.183349609375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 2.573367260390161, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7930972576141357, "kl": 0.0489501953125, "learning_rate": 6.150485953030676e-08, "loss": -0.0123, "num_tokens": 194638966.0, "reward": 10.095669746398926, "reward_std": 1.004044532775879, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.328125, "rewards/judge_reward/std": 1.6365077495574951, "rewards/ngrams_iou_reward/mean": 0.29285600781440735, "rewards/ngrams_iou_reward/std": 0.30715352296829224, "rewards/schema_keywords_iou_reward/mean": 0.7142717838287354, "rewards/schema_keywords_iou_reward/std": 0.19974182546138763, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 231.33334350585938, "completions/mean_terminated_length": 169.89089965820312, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 2.5767599660729434, "frac_reward_zero_std": 0.0, "grad_norm": 0.8199918866157532, "kl": 0.056396484375, "learning_rate": 6.056006056926976e-08, "loss": -0.0115, "num_tokens": 194879234.0, "reward": 9.298471450805664, "reward_std": 1.6757014989852905, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.3562499284744263, "rewards/judge_reward/std": 1.5455940961837769, "rewards/ngrams_iou_reward/mean": 0.15394987165927887, "rewards/ngrams_iou_reward/std": 0.17841431498527527, "rewards/schema_keywords_iou_reward/mean": 0.6611873507499695, "rewards/schema_keywords_iou_reward/std": 0.19338341057300568, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.515625, "completions/mean_terminated_length": 167.03636169433594, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 2.5801526717557253, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7982020378112793, "kl": 0.0467529296875, "learning_rate": 5.96221065943609e-08, "loss": 0.0174, "num_tokens": 195128237.0, "reward": 10.37445068359375, "reward_std": 1.1553850173950195, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9010416865348816, "rewards/format_reward/std": 0.2993867099285126, "rewards/judge_reward/mean": 1.2557291984558105, "rewards/judge_reward/std": 1.6379876136779785, "rewards/ngrams_iou_reward/mean": 0.22048604488372803, "rewards/ngrams_iou_reward/std": 0.24543122947216034, "rewards/schema_keywords_iou_reward/mean": 0.7310473918914795, "rewards/schema_keywords_iou_reward/std": 0.17419826984405518, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617789745331, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.06771850585938, "completions/mean_terminated_length": 170.90740966796875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 2.583545377438507, "frac_reward_zero_std": 0.0, "grad_norm": 0.7410261631011963, "kl": 0.05133056640625, "learning_rate": 5.869101221572653e-08, "loss": 0.0135, "num_tokens": 195389544.0, "reward": 10.523895263671875, "reward_std": 1.2919111251831055, "rewards/accuracy_reward/mean": 1.703125, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.3302083015441895, "rewards/judge_reward/std": 1.6873795986175537, "rewards/ngrams_iou_reward/mean": 0.22929047048091888, "rewards/ngrams_iou_reward/std": 0.24134749174118042, "rewards/schema_keywords_iou_reward/mean": 0.7331467270851135, "rewards/schema_keywords_iou_reward/std": 0.17022360861301422, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 235.3854217529297, "completions/mean_terminated_length": 179.88462829589844, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 2.5869380831212894, "frac_reward_zero_std": 0.0, "grad_norm": 0.8758540153503418, "kl": 0.05120849609375, "learning_rate": 5.7766791936664114e-08, "loss": 0.0017, "num_tokens": 195654674.0, "reward": 10.113618850708008, "reward_std": 1.2461812496185303, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.3739584684371948, "rewards/judge_reward/std": 1.661497712135315, "rewards/ngrams_iou_reward/mean": 0.16410622000694275, "rewards/ngrams_iou_reward/std": 0.19373612105846405, "rewards/schema_keywords_iou_reward/mean": 0.7015948295593262, "rewards/schema_keywords_iou_reward/std": 0.18312354385852814, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.3854217529297, "completions/mean_terminated_length": 178.3921661376953, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 2.590330788804071, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8413735032081604, "kl": 0.04931640625, "learning_rate": 5.68494601533957e-08, "loss": 0.0181, "num_tokens": 195915118.0, "reward": 10.260601997375488, "reward_std": 1.7184871435165405, "rewards/accuracy_reward/mean": 1.828125, "rewards/accuracy_reward/std": 1.4674979448318481, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.0364583730697632, "rewards/judge_reward/std": 1.4880057573318481, "rewards/ngrams_iou_reward/mean": 0.19699053466320038, "rewards/ngrams_iou_reward/std": 0.20010890066623688, "rewards/schema_keywords_iou_reward/mean": 0.699026882648468, "rewards/schema_keywords_iou_reward/std": 0.190683051943779, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374122977256775, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.15625, "completions/mean_terminated_length": 170.8000030517578, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.593723494486853, "frac_reward_zero_std": 0.0, "grad_norm": 0.9923948049545288, "kl": 0.04803466796875, "learning_rate": 5.5939031154844e-08, "loss": -0.0085, "num_tokens": 196158172.0, "reward": 9.317792892456055, "reward_std": 1.4478181600570679, "rewards/accuracy_reward/mean": 1.234375, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.4229167699813843, "rewards/judge_reward/std": 1.530115008354187, "rewards/ngrams_iou_reward/mean": 0.243641659617424, "rewards/ngrams_iou_reward/std": 0.25638654828071594, "rewards/schema_keywords_iou_reward/mean": 0.723109245300293, "rewards/schema_keywords_iou_reward/std": 0.17986539006233215, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 235.1197967529297, "completions/mean_terminated_length": 178.9038543701172, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.5971162001696353, "frac_reward_zero_std": 0.0, "grad_norm": 0.7398818731307983, "kl": 0.05010986328125, "learning_rate": 5.503551912240989e-08, "loss": -0.0115, "num_tokens": 196396941.0, "reward": 10.187559127807617, "reward_std": 1.2948061227798462, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.4166666269302368, "rewards/judge_reward/std": 1.7287629842758179, "rewards/ngrams_iou_reward/mean": 0.1625807285308838, "rewards/ngrams_iou_reward/std": 0.17390292882919312, "rewards/schema_keywords_iou_reward/mean": 0.691644012928009, "rewards/schema_keywords_iou_reward/std": 0.16897346079349518, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617789745331, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 232.81771850585938, "completions/mean_terminated_length": 181.81668090820312, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 2.6005089058524176, "frac_reward_zero_std": 0.0, "grad_norm": 0.8376525044441223, "kl": 0.0501708984375, "learning_rate": 5.4138938129750955e-08, "loss": -0.0206, "num_tokens": 196644970.0, "reward": 10.5343656539917, "reward_std": 1.2879223823547363, "rewards/accuracy_reward/mean": 1.75, "rewards/accuracy_reward/std": 1.482886552810669, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.3125, "rewards/judge_reward/std": 1.7917349338531494, "rewards/ngrams_iou_reward/mean": 0.2007588893175125, "rewards/ngrams_iou_reward/std": 0.2062750607728958, "rewards/schema_keywords_iou_reward/mean": 0.7138146758079529, "rewards/schema_keywords_iou_reward/std": 0.1576499342918396, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.40625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 224.9322967529297, "completions/mean_terminated_length": 178.532470703125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 2.6039016115351994, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7738091945648193, "kl": 0.05908203125, "learning_rate": 5.324930214256301e-08, "loss": 0.0135, "num_tokens": 196914789.0, "reward": 9.244377136230469, "reward_std": 1.3706905841827393, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.3432291746139526, "rewards/judge_reward/std": 1.523139476776123, "rewards/ngrams_iou_reward/mean": 0.15493525564670563, "rewards/ngrams_iou_reward/std": 0.17457790672779083, "rewards/schema_keywords_iou_reward/mean": 0.6894413828849792, "rewards/schema_keywords_iou_reward/std": 0.17119911313056946, "rewards/syntax_reward/mean": 0.7083333134651184, "rewards/syntax_reward/std": 0.4557180106639862, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 228.0260467529297, "completions/mean_terminated_length": 160.08929443359375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 2.607294317217981, "frac_reward_zero_std": 0.0, "grad_norm": 1.047885775566101, "kl": 0.04962158203125, "learning_rate": 5.236662501836192e-08, "loss": 0.0161, "num_tokens": 197163398.0, "reward": 10.566604614257812, "reward_std": 1.1829869747161865, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.4406248331069946, "rewards/judge_reward/std": 1.6960117816925049, "rewards/ngrams_iou_reward/mean": 0.27508023381233215, "rewards/ngrams_iou_reward/std": 0.28184837102890015, "rewards/schema_keywords_iou_reward/mean": 0.7644405364990234, "rewards/schema_keywords_iou_reward/std": 0.15913422405719757, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374123275279999, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 228.3541717529297, "completions/mean_terminated_length": 175.5757598876953, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.6106870229007635, "frac_reward_zero_std": 0.0, "grad_norm": 0.7960390448570251, "kl": 0.05621337890625, "learning_rate": 5.1490920506268246e-08, "loss": 0.0098, "num_tokens": 197430214.0, "reward": 10.663387298583984, "reward_std": 0.9476136565208435, "rewards/accuracy_reward/mean": 1.78125, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.2458332777023315, "rewards/judge_reward/std": 1.649871587753296, "rewards/ngrams_iou_reward/mean": 0.23380832374095917, "rewards/ngrams_iou_reward/std": 0.23092493414878845, "rewards/schema_keywords_iou_reward/mean": 0.7399954199790955, "rewards/schema_keywords_iou_reward/std": 0.16116435825824738, "rewards/syntax_reward/mean": 0.890625, "rewards/syntax_reward/std": 0.3129251003265381, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 237.21875, "completions/mean_terminated_length": 170.14285278320312, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 2.6140797285835453, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8003585338592529, "kl": 0.045166015625, "learning_rate": 5.062220224679276e-08, "loss": 0.0088, "num_tokens": 197676364.0, "reward": 9.970847129821777, "reward_std": 1.326645851135254, "rewards/accuracy_reward/mean": 1.484375, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.435416579246521, "rewards/judge_reward/std": 1.6947085857391357, "rewards/ngrams_iou_reward/mean": 0.186130091547966, "rewards/ngrams_iou_reward/std": 0.2017897218465805, "rewards/schema_keywords_iou_reward/mean": 0.7055497765541077, "rewards/schema_keywords_iou_reward/std": 0.13925690948963165, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709408044815063, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 227.22396850585938, "completions/mean_terminated_length": 168.3015899658203, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 2.617472434266327, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7832573056221008, "kl": 0.04949951171875, "learning_rate": 4.9760483771624226e-08, "loss": 0.0072, "num_tokens": 197926517.0, "reward": 10.83757209777832, "reward_std": 0.7152811288833618, "rewards/accuracy_reward/mean": 1.890625, "rewards/accuracy_reward/std": 1.4520306587219238, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.2166666984558105, "rewards/judge_reward/std": 1.6631935834884644, "rewards/ngrams_iou_reward/mean": 0.19440270960330963, "rewards/ngrams_iou_reward/std": 0.1655207872390747, "rewards/schema_keywords_iou_reward/mean": 0.7046263813972473, "rewards/schema_keywords_iou_reward/std": 0.1646820306777954, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626392364502, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 227.796875, "completions/mean_terminated_length": 165.75001525878906, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 2.6208651399491094, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7589611411094666, "kl": 0.05419921875, "learning_rate": 4.8905778503418094e-08, "loss": 0.0223, "num_tokens": 198164672.0, "reward": 10.01465129852295, "reward_std": 1.2246077060699463, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.4385415315628052, "rewards/judge_reward/std": 1.6127678155899048, "rewards/ngrams_iou_reward/mean": 0.23881137371063232, "rewards/ngrams_iou_reward/std": 0.2567511796951294, "rewards/schema_keywords_iou_reward/mean": 0.7216720581054688, "rewards/schema_keywords_iou_reward/std": 0.18256478011608124, "rewards/syntax_reward/mean": 0.9010416865348816, "rewards/syntax_reward/std": 0.2993867099285126, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 231.41146850585938, "completions/mean_terminated_length": 168.57408142089844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 2.6242578456318917, "frac_reward_zero_std": 0.0, "grad_norm": 0.7852879762649536, "kl": 0.0511474609375, "learning_rate": 4.805809975558828e-08, "loss": -0.0195, "num_tokens": 198406227.0, "reward": 10.067610740661621, "reward_std": 1.2375309467315674, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.654166579246521, "rewards/judge_reward/std": 1.6695557832717896, "rewards/ngrams_iou_reward/mean": 0.1555650532245636, "rewards/ngrams_iou_reward/std": 0.145614892244339, "rewards/schema_keywords_iou_reward/mean": 0.7078782916069031, "rewards/schema_keywords_iou_reward/std": 0.1489298939704895, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 236.6822967529297, "completions/mean_terminated_length": 169.7441864013672, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.6276505513146735, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7945829629898071, "kl": 0.04827880859375, "learning_rate": 4.721746073209892e-08, "loss": -0.0065, "num_tokens": 198635960.0, "reward": 10.154695510864258, "reward_std": 1.396093726158142, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.3041666746139526, "rewards/judge_reward/std": 1.6398131847381592, "rewards/ngrams_iou_reward/mean": 0.2355140894651413, "rewards/ngrams_iou_reward/std": 0.25436556339263916, "rewards/schema_keywords_iou_reward/mean": 0.7066801190376282, "rewards/schema_keywords_iou_reward/std": 0.18229785561561584, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 228.59896850585938, "completions/mean_terminated_length": 175.06153869628906, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 2.6310432569974553, "frac_reward_zero_std": 0.0, "grad_norm": 0.7395984530448914, "kl": 0.05352783203125, "learning_rate": 4.6383874527259335e-08, "loss": 0.0037, "num_tokens": 198906333.0, "reward": 9.744049072265625, "reward_std": 1.6791801452636719, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.4864583015441895, "rewards/judge_reward/std": 1.6482971906661987, "rewards/ngrams_iou_reward/mean": 0.14824597537517548, "rewards/ngrams_iou_reward/std": 0.09767190366983414, "rewards/schema_keywords_iou_reward/mean": 0.6958021521568298, "rewards/schema_keywords_iou_reward/std": 0.14528204500675201, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617789745331, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 235.4010467529297, "completions/mean_terminated_length": 178.45098876953125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 2.6344359626802376, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8776006698608398, "kl": 0.04547119140625, "learning_rate": 4.555735412551975e-08, "loss": 0.0082, "num_tokens": 199150892.0, "reward": 10.09450626373291, "reward_std": 1.1406550407409668, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.4697915315628052, "rewards/judge_reward/std": 1.6996222734451294, "rewards/ngrams_iou_reward/mean": 0.153548002243042, "rewards/ngrams_iou_reward/std": 0.1687966138124466, "rewards/schema_keywords_iou_reward/mean": 0.7201244235038757, "rewards/schema_keywords_iou_reward/std": 0.13689181208610535, "rewards/syntax_reward/mean": 0.90625, "rewards/syntax_reward/std": 0.2922426164150238, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 231.1041717529297, "completions/mean_terminated_length": 172.14035034179688, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 2.6378286683630194, "frac_reward_zero_std": 0.03125, "grad_norm": 0.784859299659729, "kl": 0.05413818359375, "learning_rate": 4.4737912401268894e-08, "loss": 0.0099, "num_tokens": 199422022.0, "reward": 9.923824310302734, "reward_std": 1.2624553442001343, "rewards/accuracy_reward/mean": 1.46875, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.457291603088379, "rewards/judge_reward/std": 1.646270990371704, "rewards/ngrams_iou_reward/mean": 0.22787554562091827, "rewards/ngrams_iou_reward/std": 0.2521696984767914, "rewards/schema_keywords_iou_reward/mean": 0.7032399773597717, "rewards/schema_keywords_iou_reward/std": 0.17048636078834534, "rewards/syntax_reward/mean": 0.71875, "rewards/syntax_reward/std": 0.4507846534252167, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 227.00521850585938, "completions/mean_terminated_length": 164.73770141601562, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 2.6412213740458013, "frac_reward_zero_std": 0.0625, "grad_norm": 0.803588330745697, "kl": 0.04827880859375, "learning_rate": 4.392556211863413e-08, "loss": 0.0198, "num_tokens": 199662605.0, "reward": 10.284900665283203, "reward_std": 1.3163046836853027, "rewards/accuracy_reward/mean": 1.734375, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.167708396911621, "rewards/judge_reward/std": 1.5183342695236206, "rewards/ngrams_iou_reward/mean": 0.24736814200878143, "rewards/ngrams_iou_reward/std": 0.27724945545196533, "rewards/schema_keywords_iou_reward/mean": 0.7229490280151367, "rewards/schema_keywords_iou_reward/std": 0.18140295147895813, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 234.86459350585938, "completions/mean_terminated_length": 173.1836700439453, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 2.6446140797285835, "frac_reward_zero_std": 0.0, "grad_norm": 0.8643428683280945, "kl": 0.05059814453125, "learning_rate": 4.3120315931281626e-08, "loss": -0.0043, "num_tokens": 199927755.0, "reward": 9.950311660766602, "reward_std": 1.3339027166366577, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.171875, "rewards/judge_reward/std": 1.5554802417755127, "rewards/ngrams_iou_reward/mean": 0.16798995435237885, "rewards/ngrams_iou_reward/std": 0.1696224808692932, "rewards/schema_keywords_iou_reward/mean": 0.6521124243736267, "rewards/schema_keywords_iou_reward/std": 0.1754225194454193, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617491722107, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.84896850585938, "completions/mean_terminated_length": 170.5192413330078, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 2.648006785411366, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7620438933372498, "kl": 0.05181884765625, "learning_rate": 4.2322186382220295e-08, "loss": 0.0048, "num_tokens": 200186968.0, "reward": 10.3099365234375, "reward_std": 1.0589966773986816, "rewards/accuracy_reward/mean": 1.875, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 0.984375, "rewards/judge_reward/std": 1.432153344154358, "rewards/ngrams_iou_reward/mean": 0.22657574713230133, "rewards/ngrams_iou_reward/std": 0.24579571187496185, "rewards/schema_keywords_iou_reward/mean": 0.7291938662528992, "rewards/schema_keywords_iou_reward/std": 0.15716642141342163, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 239.28125, "completions/mean_terminated_length": 187.70211791992188, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 2.6513994910941476, "frac_reward_zero_std": 0.0, "grad_norm": 0.8160983324050903, "kl": 0.05517578125, "learning_rate": 4.15311859036056e-08, "loss": 0.0152, "num_tokens": 200450260.0, "reward": 9.414203643798828, "reward_std": 1.8547065258026123, "rewards/accuracy_reward/mean": 1.171875, "rewards/accuracy_reward/std": 1.4674979448318481, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3129251003265381, "rewards/judge_reward/mean": 1.7333332300186157, "rewards/judge_reward/std": 1.6911638975143433, "rewards/ngrams_iou_reward/mean": 0.1263057440519333, "rewards/ngrams_iou_reward/std": 0.12602558732032776, "rewards/schema_keywords_iou_reward/mean": 0.6701889634132385, "rewards/schema_keywords_iou_reward/std": 0.1736993044614792, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 231.2447967529297, "completions/mean_terminated_length": 156.9791717529297, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 2.6547921967769295, "frac_reward_zero_std": 0.09375, "grad_norm": 0.8086091876029968, "kl": 0.0479736328125, "learning_rate": 4.074732681654647e-08, "loss": 0.0127, "num_tokens": 200698809.0, "reward": 10.88521957397461, "reward_std": 0.8784407377243042, "rewards/accuracy_reward/mean": 1.90625, "rewards/accuracy_reward/std": 1.4477143287658691, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.171875, "rewards/judge_reward/std": 1.6628495454788208, "rewards/ngrams_iou_reward/mean": 0.24116270244121552, "rewards/ngrams_iou_reward/std": 0.28587281703948975, "rewards/schema_keywords_iou_reward/mean": 0.7534310817718506, "rewards/schema_keywords_iou_reward/std": 0.16820427775382996, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 231.671875, "completions/mean_terminated_length": 176.83050537109375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 2.6581849024597117, "frac_reward_zero_std": 0.0, "grad_norm": 0.821800172328949, "kl": 0.04669189453125, "learning_rate": 3.997062133091283e-08, "loss": -0.0062, "num_tokens": 200945610.0, "reward": 10.369344711303711, "reward_std": 0.8163098096847534, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12434382736682892, "rewards/judge_reward/mean": 1.4166666269302368, "rewards/judge_reward/std": 1.739992618560791, "rewards/ngrams_iou_reward/mean": 0.17525805532932281, "rewards/ngrams_iou_reward/std": 0.1771496832370758, "rewards/schema_keywords_iou_reward/mean": 0.725335419178009, "rewards/schema_keywords_iou_reward/std": 0.15079626441001892, "rewards/syntax_reward/mean": 0.9166666865348816, "rewards/syntax_reward/std": 0.27710798382759094, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 234.2604217529297, "completions/mean_terminated_length": 174.15687561035156, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 2.6615776081424936, "frac_reward_zero_std": 0.03125, "grad_norm": 0.886532187461853, "kl": 0.05096435546875, "learning_rate": 3.920108154514584e-08, "loss": 0.0209, "num_tokens": 201186500.0, "reward": 9.525668144226074, "reward_std": 1.3468555212020874, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.4385417699813843, "rewards/judge_reward/std": 1.6588565111160278, "rewards/ngrams_iou_reward/mean": 0.15900704264640808, "rewards/ngrams_iou_reward/std": 0.15268053114414215, "rewards/schema_keywords_iou_reward/mean": 0.6822850704193115, "rewards/schema_keywords_iou_reward/std": 0.1843254268169403, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 234.5260467529297, "completions/mean_terminated_length": 179.6481475830078, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 2.6649703138252754, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7642063498497009, "kl": 0.0535888671875, "learning_rate": 3.843871944606969e-08, "loss": -0.0005, "num_tokens": 201422977.0, "reward": 9.842177391052246, "reward_std": 1.1633338928222656, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.5489583015441895, "rewards/judge_reward/std": 1.6464776992797852, "rewards/ngrams_iou_reward/mean": 0.17581391334533691, "rewards/ngrams_iou_reward/std": 0.1997576206922531, "rewards/schema_keywords_iou_reward/mean": 0.6778216361999512, "rewards/schema_keywords_iou_reward/std": 0.16728244721889496, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 227.0572967529297, "completions/mean_terminated_length": 160.1896514892578, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.6683630195080577, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8665434122085571, "kl": 0.0513916015625, "learning_rate": 3.7683546908703903e-08, "loss": 0.0123, "num_tokens": 201700242.0, "reward": 10.403215408325195, "reward_std": 1.4051449298858643, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9895833134651184, "rewards/format_reward/std": 0.1017945408821106, "rewards/judge_reward/mean": 1.5010417699813843, "rewards/judge_reward/std": 1.7237482070922852, "rewards/ngrams_iou_reward/mean": 0.17099829018115997, "rewards/ngrams_iou_reward/std": 0.13217546045780182, "rewards/schema_keywords_iou_reward/mean": 0.7145087122917175, "rewards/schema_keywords_iou_reward/std": 0.14650578796863556, "rewards/syntax_reward/mean": 0.8854166865348816, "rewards/syntax_reward/std": 0.3193511664867401, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 223.5572967529297, "completions/mean_terminated_length": 167.0142822265625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 2.67175572519084, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7537136077880859, "kl": 0.05035400390625, "learning_rate": 3.693557569607947e-08, "loss": -0.0143, "num_tokens": 201960785.0, "reward": 9.680522918701172, "reward_std": 1.2325820922851562, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 1.5072916746139526, "rewards/judge_reward/std": 1.6451575756072998, "rewards/ngrams_iou_reward/mean": 0.24841703474521637, "rewards/ngrams_iou_reward/std": 0.27080321311950684, "rewards/schema_keywords_iou_reward/mean": 0.7039806246757507, "rewards/schema_keywords_iou_reward/std": 0.19964197278022766, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 228.953125, "completions/mean_terminated_length": 158.0188751220703, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 2.6751484308736218, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7359159588813782, "kl": 0.05224609375, "learning_rate": 3.619481745905467e-08, "loss": -0.0072, "num_tokens": 202215710.0, "reward": 10.527286529541016, "reward_std": 1.244936466217041, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.4135417938232422, "rewards/judge_reward/std": 1.6461994647979736, "rewards/ngrams_iou_reward/mean": 0.2173522263765335, "rewards/ngrams_iou_reward/std": 0.24475474655628204, "rewards/schema_keywords_iou_reward/mean": 0.727641761302948, "rewards/schema_keywords_iou_reward/std": 0.15829172730445862, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.75521850585938, "completions/mean_terminated_length": 175.7413787841797, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 2.6785411365564036, "frac_reward_zero_std": 0.0, "grad_norm": 0.8157823085784912, "kl": 0.0538330078125, "learning_rate": 3.546128373613472e-08, "loss": 0.0123, "num_tokens": 202479645.0, "reward": 10.842353820800781, "reward_std": 1.3628987073898315, "rewards/accuracy_reward/mean": 2.171875, "rewards/accuracy_reward/std": 1.3446191549301147, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 0.8041666150093079, "rewards/judge_reward/std": 1.4112426042556763, "rewards/ngrams_iou_reward/mean": 0.18175534904003143, "rewards/ngrams_iou_reward/std": 0.1807553470134735, "rewards/schema_keywords_iou_reward/mean": 0.6980981826782227, "rewards/schema_keywords_iou_reward/std": 0.17049972712993622, "rewards/syntax_reward/mean": 0.8854166865348816, "rewards/syntax_reward/std": 0.3193511664867401, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1875, "completions/max_length": 256.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 225.21875, "completions/mean_terminated_length": 171.57142639160156, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 2.681933842239186, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8512758016586304, "kl": 0.0501708984375, "learning_rate": 3.4734985953290774e-08, "loss": 0.0156, "num_tokens": 202747233.0, "reward": 9.855181694030762, "reward_std": 1.6735355854034424, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.0, "rewards/judge_reward/std": 1.3918235301971436, "rewards/ngrams_iou_reward/mean": 0.20533967018127441, "rewards/ngrams_iou_reward/std": 0.23131264746189117, "rewards/schema_keywords_iou_reward/mean": 0.7019256949424744, "rewards/schema_keywords_iou_reward/std": 0.1682194024324417, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 228.515625, "completions/mean_terminated_length": 178.39706420898438, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.6853265479219677, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8321572542190552, "kl": 0.053955078125, "learning_rate": 3.4015935423782615e-08, "loss": 0.0062, "num_tokens": 203002356.0, "reward": 9.799124717712402, "reward_std": 1.7444722652435303, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.1322916746139526, "rewards/judge_reward/std": 1.4647691249847412, "rewards/ngrams_iou_reward/mean": 0.21127764880657196, "rewards/ngrams_iou_reward/std": 0.21255221962928772, "rewards/schema_keywords_iou_reward/mean": 0.7086798548698425, "rewards/schema_keywords_iou_reward/std": 0.17777957022190094, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 237.17709350585938, "completions/mean_terminated_length": 186.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 2.68871925360475, "frac_reward_zero_std": 0.0, "grad_norm": 0.8100751042366028, "kl": 0.04815673828125, "learning_rate": 3.330414334798265e-08, "loss": 0.0114, "num_tokens": 203244700.0, "reward": 10.07859992980957, "reward_std": 1.8040361404418945, "rewards/accuracy_reward/mean": 1.84375, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237454891205, "rewards/judge_reward/mean": 0.9437499642372131, "rewards/judge_reward/std": 1.4415533542633057, "rewards/ngrams_iou_reward/mean": 0.1991948038339615, "rewards/ngrams_iou_reward/std": 0.22453030943870544, "rewards/schema_keywords_iou_reward/mean": 0.6794045567512512, "rewards/schema_keywords_iou_reward/std": 0.1990511417388916, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374123275279999, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 228.8229217529297, "completions/mean_terminated_length": 170.45899963378906, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 2.6921119592875318, "frac_reward_zero_std": 0.09375, "grad_norm": 0.8025432825088501, "kl": 0.04986572265625, "learning_rate": 3.2599620813200835e-08, "loss": -0.0013, "num_tokens": 203494566.0, "reward": 10.21800422668457, "reward_std": 1.053748369216919, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.25, "rewards/judge_reward/std": 1.5805094242095947, "rewards/ngrams_iou_reward/mean": 0.22442756593227386, "rewards/ngrams_iou_reward/std": 0.24313528835773468, "rewards/schema_keywords_iou_reward/mean": 0.7331603169441223, "rewards/schema_keywords_iou_reward/std": 0.1587141752243042, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 225.83334350585938, "completions/mean_terminated_length": 169.55223083496094, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 2.695504664970314, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8616142272949219, "kl": 0.05548095703125, "learning_rate": 3.190237879351265e-08, "loss": 0.0163, "num_tokens": 203755000.0, "reward": 9.770280838012695, "reward_std": 1.4027414321899414, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.4864583015441895, "rewards/judge_reward/std": 1.6124593019485474, "rewards/ngrams_iou_reward/mean": 0.15964367985725403, "rewards/ngrams_iou_reward/std": 0.13148120045661926, "rewards/schema_keywords_iou_reward/mean": 0.705427885055542, "rewards/schema_keywords_iou_reward/std": 0.16674888134002686, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617491722107, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.0416717529297, "completions/mean_terminated_length": 170.46153259277344, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 2.698897370653096, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9085482954978943, "kl": 0.0479736328125, "learning_rate": 3.121242814958747e-08, "loss": 0.0019, "num_tokens": 203999328.0, "reward": 10.3851900100708, "reward_std": 1.0980074405670166, "rewards/accuracy_reward/mean": 2.046875, "rewards/accuracy_reward/std": 1.4004077911376953, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 0.7166666984558105, "rewards/judge_reward/std": 1.266951322555542, "rewards/ngrams_iou_reward/mean": 0.2135336548089981, "rewards/ngrams_iou_reward/std": 0.23764564096927643, "rewards/schema_keywords_iou_reward/mean": 0.7279059290885925, "rewards/schema_keywords_iou_reward/std": 0.18047496676445007, "rewards/syntax_reward/mean": 0.90625, "rewards/syntax_reward/std": 0.2922426164150238, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 231.6354217529297, "completions/mean_terminated_length": 173.92982482910156, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 2.7022900763358777, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7689874172210693, "kl": 0.05078125, "learning_rate": 3.052977962851999e-08, "loss": 0.0123, "num_tokens": 204255320.0, "reward": 9.975982666015625, "reward_std": 1.4311585426330566, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.105208396911621, "rewards/judge_reward/std": 1.4658321142196655, "rewards/ngrams_iou_reward/mean": 0.21070724725723267, "rewards/ngrams_iou_reward/std": 0.2546265721321106, "rewards/schema_keywords_iou_reward/mean": 0.6954822540283203, "rewards/schema_keywords_iou_reward/std": 0.1845916211605072, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 231.38021850585938, "completions/mean_terminated_length": 170.05453491210938, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 2.70568278201866, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7758039832115173, "kl": 0.04541015625, "learning_rate": 2.985444386366226e-08, "loss": 0.0153, "num_tokens": 204505443.0, "reward": 10.247539520263672, "reward_std": 0.8944751620292664, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.446874976158142, "rewards/judge_reward/std": 1.6486358642578125, "rewards/ngrams_iou_reward/mean": 0.2403329461812973, "rewards/ngrams_iou_reward/std": 0.2675497233867645, "rewards/schema_keywords_iou_reward/mean": 0.7072060704231262, "rewards/schema_keywords_iou_reward/std": 0.1796817034482956, "rewards/syntax_reward/mean": 0.9375, "rewards/syntax_reward/std": 0.24269428849220276, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 218.11459350585938, "completions/mean_terminated_length": 159.01333618164062, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 2.709075487701442, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8611348867416382, "kl": 0.04931640625, "learning_rate": 2.9186431374458588e-08, "loss": 0.0157, "num_tokens": 204766969.0, "reward": 10.22756576538086, "reward_std": 1.195054292678833, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.2062500715255737, "rewards/judge_reward/std": 1.5512070655822754, "rewards/ngrams_iou_reward/mean": 0.22698713839054108, "rewards/ngrams_iou_reward/std": 0.27447929978370667, "rewards/schema_keywords_iou_reward/mean": 0.687035858631134, "rewards/schema_keywords_iou_reward/std": 0.18475398421287537, "rewards/syntax_reward/mean": 0.8802083134651184, "rewards/syntax_reward/std": 0.32556670904159546, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 231.31771850585938, "completions/mean_terminated_length": 172.85964965820312, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.712468193384224, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9694423079490662, "kl": 0.04486083984375, "learning_rate": 2.852575256628148e-08, "loss": -0.0054, "num_tokens": 205004198.0, "reward": 10.301666259765625, "reward_std": 1.1219738721847534, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.421875, "rewards/judge_reward/std": 1.6951240301132202, "rewards/ngrams_iou_reward/mean": 0.1915406733751297, "rewards/ngrams_iou_reward/std": 0.19122008979320526, "rewards/schema_keywords_iou_reward/mean": 0.7351247668266296, "rewards/schema_keywords_iou_reward/std": 0.14636977016925812, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 237.48959350585938, "completions/mean_terminated_length": 187.6538543701172, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 2.715860899067006, "frac_reward_zero_std": 0.0, "grad_norm": 0.877247154712677, "kl": 0.05255126953125, "learning_rate": 2.7872417730269327e-08, "loss": -0.0008, "num_tokens": 205277868.0, "reward": 8.781246185302734, "reward_std": 1.3624093532562256, "rewards/accuracy_reward/mean": 1.0625, "rewards/accuracy_reward/std": 1.4385310411453247, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.5229166746139526, "rewards/judge_reward/std": 1.5419765710830688, "rewards/ngrams_iou_reward/mean": 0.1421930342912674, "rewards/ngrams_iou_reward/std": 0.16057132184505463, "rewards/schema_keywords_iou_reward/mean": 0.6713440418243408, "rewards/schema_keywords_iou_reward/std": 0.14680182933807373, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 228.67709350585938, "completions/mean_terminated_length": 163.9649200439453, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 2.719253604749788, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9356450438499451, "kl": 0.0484619140625, "learning_rate": 2.7226437043166518e-08, "loss": 0.0141, "num_tokens": 205507048.0, "reward": 10.450654029846191, "reward_std": 1.0482244491577148, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 1.4432151317596436, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 0.9864583015441895, "rewards/judge_reward/std": 1.499048113822937, "rewards/ngrams_iou_reward/mean": 0.18773432075977325, "rewards/ngrams_iou_reward/std": 0.2137177735567093, "rewards/schema_keywords_iou_reward/mean": 0.7170855402946472, "rewards/schema_keywords_iou_reward/std": 0.16205768287181854, "rewards/syntax_reward/mean": 0.8697916865348816, "rewards/syntax_reward/std": 0.3374122977256775, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 230.234375, "completions/mean_terminated_length": 160.86538696289062, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 2.72264631043257, "frac_reward_zero_std": 0.0, "grad_norm": 0.8431116342544556, "kl": 0.04827880859375, "learning_rate": 2.658782056716441e-08, "loss": 0.0215, "num_tokens": 205746631.0, "reward": 10.460691452026367, "reward_std": 1.2829865217208862, "rewards/accuracy_reward/mean": 1.734375, "rewards/accuracy_reward/std": 1.4854496717453003, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319954812526703, "rewards/judge_reward/mean": 1.2177083492279053, "rewards/judge_reward/std": 1.6172819137573242, "rewards/ngrams_iou_reward/mean": 0.2097470760345459, "rewards/ngrams_iou_reward/std": 0.20951110124588013, "rewards/schema_keywords_iou_reward/mean": 0.7478181719779968, "rewards/schema_keywords_iou_reward/std": 0.13815471529960632, "rewards/syntax_reward/mean": 0.8854166865348816, "rewards/syntax_reward/std": 0.3193511664867401, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 230.05209350585938, "completions/mean_terminated_length": 170.10345458984375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 2.726039016115352, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8298220038414001, "kl": 0.04931640625, "learning_rate": 2.5956578249745232e-08, "loss": 0.0135, "num_tokens": 205988807.0, "reward": 9.717909812927246, "reward_std": 1.4338576793670654, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.8958333134651184, "rewards/format_reward/std": 0.30627527832984924, "rewards/judge_reward/mean": 1.2645833492279053, "rewards/judge_reward/std": 1.5769264698028564, "rewards/ngrams_iou_reward/mean": 0.1546066850423813, "rewards/ngrams_iou_reward/std": 0.13991796970367432, "rewards/schema_keywords_iou_reward/mean": 0.6122611165046692, "rewards/schema_keywords_iou_reward/std": 0.19517850875854492, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 237.6510467529297, "completions/mean_terminated_length": 182.6041717529297, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 2.729431721798134, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7972800731658936, "kl": 0.04437255859375, "learning_rate": 2.5332719923526593e-08, "loss": 0.0057, "num_tokens": 206223778.0, "reward": 10.118422508239746, "reward_std": 1.5154311656951904, "rewards/accuracy_reward/mean": 1.6875, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33158352971076965, "rewards/judge_reward/mean": 1.1750000715255737, "rewards/judge_reward/std": 1.6150468587875366, "rewards/ngrams_iou_reward/mean": 0.2732184827327728, "rewards/ngrams_iou_reward/std": 0.30711308121681213, "rewards/schema_keywords_iou_reward/mean": 0.7243707776069641, "rewards/schema_keywords_iou_reward/std": 0.20680448412895203, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.0260467529297, "completions/mean_terminated_length": 169.683349609375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 2.732824427480916, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8192742466926575, "kl": 0.04974365234375, "learning_rate": 2.47162553061086e-08, "loss": 0.0076, "num_tokens": 206474367.0, "reward": 10.219111442565918, "reward_std": 0.8650211691856384, "rewards/accuracy_reward/mean": 1.703125, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.181249976158142, "rewards/judge_reward/std": 1.6162986755371094, "rewards/ngrams_iou_reward/mean": 0.24045313894748688, "rewards/ngrams_iou_reward/std": 0.2692996561527252, "rewards/schema_keywords_iou_reward/mean": 0.7411573529243469, "rewards/schema_keywords_iou_reward/std": 0.16275151073932648, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 236.09896850585938, "completions/mean_terminated_length": 183.90567016601562, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 2.736217133163698, "frac_reward_zero_std": 0.0, "grad_norm": 0.789196252822876, "kl": 0.04833984375, "learning_rate": 2.4107193999922282e-08, "loss": 0.003, "num_tokens": 206713864.0, "reward": 10.439363479614258, "reward_std": 1.2562683820724487, "rewards/accuracy_reward/mean": 1.515625, "rewards/accuracy_reward/std": 1.5038399696350098, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.6510416269302368, "rewards/judge_reward/std": 1.7941436767578125, "rewards/ngrams_iou_reward/mean": 0.16532467305660248, "rewards/ngrams_iou_reward/std": 0.16138245165348053, "rewards/schema_keywords_iou_reward/mean": 0.7323716282844543, "rewards/schema_keywords_iou_reward/std": 0.13960690796375275, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 230.2760467529297, "completions/mean_terminated_length": 173.683349609375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.73960983884648, "frac_reward_zero_std": 0.0, "grad_norm": 0.8411672115325928, "kl": 0.0491943359375, "learning_rate": 2.3505545492080392e-08, "loss": -0.006, "num_tokens": 206960649.0, "reward": 10.141775131225586, "reward_std": 1.300107717514038, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.448958396911621, "rewards/judge_reward/std": 1.6766642332077026, "rewards/ngrams_iou_reward/mean": 0.20562110841274261, "rewards/ngrams_iou_reward/std": 0.20202572643756866, "rewards/schema_keywords_iou_reward/mean": 0.7465693950653076, "rewards/schema_keywords_iou_reward/std": 0.14268946647644043, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 231.109375, "completions/mean_terminated_length": 180.1428680419922, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 2.7430025445292623, "frac_reward_zero_std": 0.0, "grad_norm": 0.8304406404495239, "kl": 0.04888916015625, "learning_rate": 2.291131915422917e-08, "loss": 0.0175, "num_tokens": 207208098.0, "reward": 10.285652160644531, "reward_std": 1.4977377653121948, "rewards/accuracy_reward/mean": 1.703125, "rewards/accuracy_reward/std": 1.4900685548782349, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.248437523841858, "rewards/judge_reward/std": 1.5989516973495483, "rewards/ngrams_iou_reward/mean": 0.20091009140014648, "rewards/ngrams_iou_reward/std": 0.23072464764118195, "rewards/schema_keywords_iou_reward/mean": 0.7076575756072998, "rewards/schema_keywords_iou_reward/std": 0.1746833175420761, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.63021850585938, "completions/mean_terminated_length": 174.81668090820312, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.746395250212044, "frac_reward_zero_std": 0.0, "grad_norm": 0.7551831007003784, "kl": 0.0504150390625, "learning_rate": 2.232452424240261e-08, "loss": -0.0024, "num_tokens": 207459757.0, "reward": 9.914801597595215, "reward_std": 1.1555606126785278, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.646875023841858, "rewards/judge_reward/std": 1.7661418914794922, "rewards/ngrams_iou_reward/mean": 0.16706879436969757, "rewards/ngrams_iou_reward/std": 0.17886139452457428, "rewards/schema_keywords_iou_reward/mean": 0.6987742781639099, "rewards/schema_keywords_iou_reward/std": 0.1862107664346695, "rewards/syntax_reward/mean": 0.7708333134651184, "rewards/syntax_reward/std": 0.421395480632782, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 227.46875, "completions/mean_terminated_length": 171.72308349609375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 2.749787955894826, "frac_reward_zero_std": 0.0625, "grad_norm": 0.801074743270874, "kl": 0.0467529296875, "learning_rate": 2.1745169896878412e-08, "loss": 0.0118, "num_tokens": 207706825.0, "reward": 9.902496337890625, "reward_std": 1.0551862716674805, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.234375, "rewards/judge_reward/std": 1.5418305397033691, "rewards/ngrams_iou_reward/mean": 0.2170732021331787, "rewards/ngrams_iou_reward/std": 0.2604008913040161, "rewards/schema_keywords_iou_reward/mean": 0.7062564492225647, "rewards/schema_keywords_iou_reward/std": 0.18300531804561615, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.95834350585938, "completions/mean_terminated_length": 166.48275756835938, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 2.753180661577608, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8271944522857666, "kl": 0.04925537109375, "learning_rate": 2.1173265142035268e-08, "loss": -0.0081, "num_tokens": 207951281.0, "reward": 10.431583404541016, "reward_std": 0.9624678492546082, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.521875023841858, "rewards/judge_reward/std": 1.7059000730514526, "rewards/ngrams_iou_reward/mean": 0.23111748695373535, "rewards/ngrams_iou_reward/std": 0.26795482635498047, "rewards/schema_keywords_iou_reward/mean": 0.7400497794151306, "rewards/schema_keywords_iou_reward/std": 0.16285938024520874, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 227.0885467529297, "completions/mean_terminated_length": 175.5507354736328, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 2.75657336726039, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9434738159179688, "kl": 0.04998779296875, "learning_rate": 2.0608818886212576e-08, "loss": -0.0015, "num_tokens": 208210768.0, "reward": 10.376981735229492, "reward_std": 1.2632579803466797, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967558324337006, "rewards/judge_reward/mean": 1.3468750715255737, "rewards/judge_reward/std": 1.687617301940918, "rewards/ngrams_iou_reward/mean": 0.15831579267978668, "rewards/ngrams_iou_reward/std": 0.12816651165485382, "rewards/schema_keywords_iou_reward/mean": 0.7436644434928894, "rewards/schema_keywords_iou_reward/std": 0.1491892784833908, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151108264923, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 226.046875, "completions/mean_terminated_length": 172.6521759033203, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 2.7599660729431723, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9309205412864685, "kl": 0.05029296875, "learning_rate": 2.0051839921571444e-08, "loss": 0.023, "num_tokens": 208455949.0, "reward": 9.948183059692383, "reward_std": 1.6617029905319214, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319954812526703, "rewards/judge_reward/mean": 1.105208396911621, "rewards/judge_reward/std": 1.4403220415115356, "rewards/ngrams_iou_reward/mean": 0.19094812870025635, "rewards/ngrams_iou_reward/std": 0.20523254573345184, "rewards/schema_keywords_iou_reward/mean": 0.6874424815177917, "rewards/schema_keywords_iou_reward/std": 0.17155030369758606, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 227.71875, "completions/mean_terminated_length": 172.46153259277344, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 2.763358778625954, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8103916049003601, "kl": 0.049560546875, "learning_rate": 1.9502336923958252e-08, "loss": 0.0121, "num_tokens": 208703569.0, "reward": 9.687254905700684, "reward_std": 1.2014250755310059, "rewards/accuracy_reward/mean": 1.53125, "rewards/accuracy_reward/std": 1.5035951137542725, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.1999999284744263, "rewards/judge_reward/std": 1.4729676246643066, "rewards/ngrams_iou_reward/mean": 0.22899210453033447, "rewards/ngrams_iou_reward/std": 0.25667378306388855, "rewards/schema_keywords_iou_reward/mean": 0.7040960192680359, "rewards/schema_keywords_iou_reward/std": 0.1919153332710266, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 232.44271850585938, "completions/mean_terminated_length": 179.3389892578125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 2.7667514843087364, "frac_reward_zero_std": 0.0, "grad_norm": 0.893337607383728, "kl": 0.05157470703125, "learning_rate": 1.8960318452768576e-08, "loss": 0.0033, "num_tokens": 208948310.0, "reward": 9.545836448669434, "reward_std": 1.1434592008590698, "rewards/accuracy_reward/mean": 1.15625, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.75, "rewards/judge_reward/std": 1.6714415550231934, "rewards/ngrams_iou_reward/mean": 0.1877732276916504, "rewards/ngrams_iou_reward/std": 0.20020487904548645, "rewards/schema_keywords_iou_reward/mean": 0.7174375057220459, "rewards/schema_keywords_iou_reward/std": 0.16095782816410065, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 231.6510467529297, "completions/mean_terminated_length": 175.39654541015625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 2.7701441899915182, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7346154451370239, "kl": 0.05474853515625, "learning_rate": 1.8425792950814867e-08, "loss": 0.0081, "num_tokens": 209217541.0, "reward": 9.735183715820312, "reward_std": 1.0118622779846191, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791763484477997, "rewards/judge_reward/mean": 1.2864583730697632, "rewards/judge_reward/std": 1.561253309249878, "rewards/ngrams_iou_reward/mean": 0.18456298112869263, "rewards/ngrams_iou_reward/std": 0.19983921945095062, "rewards/schema_keywords_iou_reward/mean": 0.7068698406219482, "rewards/schema_keywords_iou_reward/std": 0.1573977917432785, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.84896850585938, "completions/mean_terminated_length": 176.8360595703125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 2.7735368956743, "frac_reward_zero_std": 0.0, "grad_norm": 1.9997918605804443, "kl": 0.048095703125, "learning_rate": 1.789876874419416e-08, "loss": 0.0101, "num_tokens": 209473328.0, "reward": 9.663339614868164, "reward_std": 1.1597052812576294, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.1875, "rewards/judge_reward/std": 1.4616385698318481, "rewards/ngrams_iou_reward/mean": 0.1420236974954605, "rewards/ngrams_iou_reward/std": 0.09414520114660263, "rewards/schema_keywords_iou_reward/mean": 0.6775646209716797, "rewards/schema_keywords_iou_reward/std": 0.16848351061344147, "rewards/syntax_reward/mean": 0.8802083134651184, "rewards/syntax_reward/std": 0.32556667923927307, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.69271850585938, "completions/mean_terminated_length": 177.2318878173828, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 2.7769296013570823, "frac_reward_zero_std": 0.0, "grad_norm": 0.8219529390335083, "kl": 0.04876708984375, "learning_rate": 1.7379254042158952e-08, "loss": 0.0175, "num_tokens": 209749785.0, "reward": 10.2998046875, "reward_std": 1.179900884628296, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.5052083730697632, "rewards/judge_reward/std": 1.711058497428894, "rewards/ngrams_iou_reward/mean": 0.15798182785511017, "rewards/ngrams_iou_reward/std": 0.14828096330165863, "rewards/schema_keywords_iou_reward/mean": 0.6991139054298401, "rewards/schema_keywords_iou_reward/std": 0.16386336088180542, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 228.55209350585938, "completions/mean_terminated_length": 179.62318420410156, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 2.780322307039864, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8400294780731201, "kl": 0.04864501953125, "learning_rate": 1.6867256936989094e-08, "loss": -0.0111, "num_tokens": 210004645.0, "reward": 10.94138240814209, "reward_std": 0.9474401473999023, "rewards/accuracy_reward/mean": 1.984375, "rewards/accuracy_reward/std": 1.423352599143982, "rewards/format_reward/mean": 0.9791666865348816, "rewards/format_reward/std": 0.14319953322410583, "rewards/judge_reward/mean": 1.0968750715255737, "rewards/judge_reward/std": 1.6277663707733154, "rewards/ngrams_iou_reward/mean": 0.17228782176971436, "rewards/ngrams_iou_reward/std": 0.20045262575149536, "rewards/schema_keywords_iou_reward/mean": 0.7107610702514648, "rewards/schema_keywords_iou_reward/std": 0.15204009413719177, "rewards/syntax_reward/mean": 0.9322916865348816, "rewards/syntax_reward/std": 0.2519015669822693, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 231.1510467529297, "completions/mean_terminated_length": 176.48333740234375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 2.7837150127226464, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7745940089225769, "kl": 0.05242919921875, "learning_rate": 1.6362785403865488e-08, "loss": 0.0147, "num_tokens": 210265896.0, "reward": 10.513715744018555, "reward_std": 1.0029635429382324, "rewards/accuracy_reward/mean": 2.109375, "rewards/accuracy_reward/std": 1.3742263317108154, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 0.7770833373069763, "rewards/judge_reward/std": 1.394790530204773, "rewards/ngrams_iou_reward/mean": 0.13780255615711212, "rewards/ngrams_iou_reward/std": 0.11678808927536011, "rewards/schema_keywords_iou_reward/mean": 0.6863295435905457, "rewards/schema_keywords_iou_reward/std": 0.16692453622817993, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617789745331, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 222.11459350585938, "completions/mean_terminated_length": 175.67901611328125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 2.7871077184054283, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6953917145729065, "kl": 0.05474853515625, "learning_rate": 1.5865847300746415e-08, "loss": 0.0009, "num_tokens": 210534868.0, "reward": 10.181174278259277, "reward_std": 1.19087553024292, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.3291667699813843, "rewards/judge_reward/std": 1.5871152877807617, "rewards/ngrams_iou_reward/mean": 0.15444530546665192, "rewards/ngrams_iou_reward/std": 0.1584811955690384, "rewards/schema_keywords_iou_reward/mean": 0.6861035227775574, "rewards/schema_keywords_iou_reward/std": 0.14690309762954712, "rewards/syntax_reward/mean": 0.9010416865348816, "rewards/syntax_reward/std": 0.2993867099285126, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 225.31771850585938, "completions/mean_terminated_length": 170.62318420410156, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 2.7905004240882105, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7803793549537659, "kl": 0.05169677734375, "learning_rate": 1.5376450368244585e-08, "loss": 0.0098, "num_tokens": 210800375.0, "reward": 10.071551322937012, "reward_std": 1.338319182395935, "rewards/accuracy_reward/mean": 1.828125, "rewards/accuracy_reward/std": 1.4674979448318481, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 0.9208333492279053, "rewards/judge_reward/std": 1.3607755899429321, "rewards/ngrams_iou_reward/mean": 0.22475390136241913, "rewards/ngrams_iou_reward/std": 0.22944115102291107, "rewards/schema_keywords_iou_reward/mean": 0.7082552909851074, "rewards/schema_keywords_iou_reward/std": 0.19146835803985596, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 223.828125, "completions/mean_terminated_length": 170.2083282470703, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 2.7938931297709924, "frac_reward_zero_std": 0.0, "grad_norm": 0.8750317096710205, "kl": 0.04888916015625, "learning_rate": 1.4894602229506892e-08, "loss": 0.0092, "num_tokens": 211131854.0, "reward": 9.550418853759766, "reward_std": 1.3331698179244995, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9114583134651184, "rewards/format_reward/std": 0.2848237156867981, "rewards/judge_reward/mean": 1.3666667938232422, "rewards/judge_reward/std": 1.6320524215698242, "rewards/ngrams_iou_reward/mean": 0.1383988857269287, "rewards/ngrams_iou_reward/std": 0.1559116244316101, "rewards/schema_keywords_iou_reward/mean": 0.6630615592002869, "rewards/schema_keywords_iou_reward/std": 0.17483575642108917, "rewards/syntax_reward/mean": 0.7447916865348816, "rewards/syntax_reward/std": 0.4371180832386017, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 232.65625, "completions/mean_terminated_length": 175.96429443359375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 2.797285835453774, "frac_reward_zero_std": 0.0, "grad_norm": 0.7752999067306519, "kl": 0.04876708984375, "learning_rate": 1.4420310390095613e-08, "loss": -0.0008, "num_tokens": 211384688.0, "reward": 10.275988578796387, "reward_std": 1.2837908267974854, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.3052083253860474, "rewards/judge_reward/std": 1.59592866897583, "rewards/ngrams_iou_reward/mean": 0.18719150125980377, "rewards/ngrams_iou_reward/std": 0.1937071979045868, "rewards/schema_keywords_iou_reward/mean": 0.7283795475959778, "rewards/schema_keywords_iou_reward/std": 0.1398443728685379, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 224.6197967529297, "completions/mean_terminated_length": 174.58108520507812, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 2.8006785411365565, "frac_reward_zero_std": 0.0, "grad_norm": 0.7754351496696472, "kl": 0.04815673828125, "learning_rate": 1.395358223787152e-08, "loss": -0.0132, "num_tokens": 211651075.0, "reward": 9.73554801940918, "reward_std": 1.1638367176055908, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12434382736682892, "rewards/judge_reward/mean": 1.1114583015441895, "rewards/judge_reward/std": 1.400177240371704, "rewards/ngrams_iou_reward/mean": 0.14539681375026703, "rewards/ngrams_iou_reward/std": 0.09342881292104721, "rewards/schema_keywords_iou_reward/mean": 0.6693169474601746, "rewards/schema_keywords_iou_reward/std": 0.15405817329883575, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.125, "completions/mean_terminated_length": 170.28570556640625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 2.8040712468193383, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7605258822441101, "kl": 0.0487060546875, "learning_rate": 1.349442504287862e-08, "loss": 0.0005, "num_tokens": 211917817.0, "reward": 9.911356925964355, "reward_std": 1.1168954372406006, "rewards/accuracy_reward/mean": 1.671875, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.136458396911621, "rewards/judge_reward/std": 1.53777015209198, "rewards/ngrams_iou_reward/mean": 0.20890915393829346, "rewards/ngrams_iou_reward/std": 0.2559897303581238, "rewards/schema_keywords_iou_reward/mean": 0.6691139340400696, "rewards/schema_keywords_iou_reward/std": 0.19993451237678528, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.0729217529297, "completions/mean_terminated_length": 177.6666717529297, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 2.8074639525021206, "frac_reward_zero_std": 0.0, "grad_norm": 0.9287909865379333, "kl": 0.0540771484375, "learning_rate": 1.3042845957231153e-08, "loss": 0.0261, "num_tokens": 212168631.0, "reward": 10.599613189697266, "reward_std": 0.850269079208374, "rewards/accuracy_reward/mean": 1.796875, "rewards/accuracy_reward/std": 1.4741722345352173, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.207291603088379, "rewards/judge_reward/std": 1.6154195070266724, "rewards/ngrams_iou_reward/mean": 0.2431327849626541, "rewards/ngrams_iou_reward/std": 0.23520322144031525, "rewards/schema_keywords_iou_reward/mean": 0.7491877675056458, "rewards/schema_keywords_iou_reward/std": 0.14588971436023712, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 230.140625, "completions/mean_terminated_length": 174.60655212402344, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 2.8108566581849024, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7512199878692627, "kl": 0.052490234375, "learning_rate": 1.2598852015001992e-08, "loss": -0.0024, "num_tokens": 212440332.0, "reward": 9.51388931274414, "reward_std": 1.302465558052063, "rewards/accuracy_reward/mean": 1.390625, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.3666666746139526, "rewards/judge_reward/std": 1.5965057611465454, "rewards/ngrams_iou_reward/mean": 0.15965639054775238, "rewards/ngrams_iou_reward/std": 0.1501312553882599, "rewards/schema_keywords_iou_reward/mean": 0.6729816794395447, "rewards/schema_keywords_iou_reward/std": 0.1702708899974823, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 229.7447967529297, "completions/mean_terminated_length": 177.234375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 2.8142493638676847, "frac_reward_zero_std": 0.0, "grad_norm": 0.8009103536605835, "kl": 0.0555419921875, "learning_rate": 1.21624501321132e-08, "loss": 0.0027, "num_tokens": 212712923.0, "reward": 9.24618148803711, "reward_std": 1.2894132137298584, "rewards/accuracy_reward/mean": 1.15625, "rewards/accuracy_reward/std": 1.463897943496704, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.6687501668930054, "rewards/judge_reward/std": 1.6236028671264648, "rewards/ngrams_iou_reward/mean": 0.16999785602092743, "rewards/ngrams_iou_reward/std": 0.16817040741443634, "rewards/schema_keywords_iou_reward/mean": 0.6709752678871155, "rewards/schema_keywords_iou_reward/std": 0.17753638327121735, "rewards/syntax_reward/mean": 0.640625, "rewards/syntax_reward/std": 0.48107168078422546, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 229.44271850585938, "completions/mean_terminated_length": 169.57627868652344, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 2.8176420695504665, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7729120254516602, "kl": 0.04486083984375, "learning_rate": 1.1733647106228373e-08, "loss": 0.0223, "num_tokens": 212954628.0, "reward": 10.269258499145508, "reward_std": 1.451896071434021, "rewards/accuracy_reward/mean": 1.5625, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.4187499284744263, "rewards/judge_reward/std": 1.6679494380950928, "rewards/ngrams_iou_reward/mean": 0.19871513545513153, "rewards/ngrams_iou_reward/std": 0.20200590789318085, "rewards/schema_keywords_iou_reward/mean": 0.7174182534217834, "rewards/schema_keywords_iou_reward/std": 0.15652480721473694, "rewards/syntax_reward/mean": 0.8854166865348816, "rewards/syntax_reward/std": 0.3193511664867401, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.45834350585938, "completions/mean_terminated_length": 166.59649658203125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 2.8210347752332483, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8175360560417175, "kl": 0.05511474609375, "learning_rate": 1.1312449616646403e-08, "loss": 0.0001, "num_tokens": 213243568.0, "reward": 9.2261381149292, "reward_std": 1.4776942729949951, "rewards/accuracy_reward/mean": 1.21875, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.4875000715255737, "rewards/judge_reward/std": 1.5213145017623901, "rewards/ngrams_iou_reward/mean": 0.2217777967453003, "rewards/ngrams_iou_reward/std": 0.24449145793914795, "rewards/schema_keywords_iou_reward/mean": 0.7116509079933167, "rewards/schema_keywords_iou_reward/std": 0.17314039170742035, "rewards/syntax_reward/mean": 0.71875, "rewards/syntax_reward/std": 0.4507846534252167, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 226.2447967529297, "completions/mean_terminated_length": 169.43939208984375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 2.8244274809160306, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7968302369117737, "kl": 0.0560302734375, "learning_rate": 1.0898864224197945e-08, "loss": -0.01, "num_tokens": 213519003.0, "reward": 10.128589630126953, "reward_std": 1.6237467527389526, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.5187498331069946, "rewards/judge_reward/std": 1.7481365203857422, "rewards/ngrams_iou_reward/mean": 0.1967894285917282, "rewards/ngrams_iou_reward/std": 0.23628820478916168, "rewards/schema_keywords_iou_reward/mean": 0.6963832974433899, "rewards/schema_keywords_iou_reward/std": 0.18293990194797516, "rewards/syntax_reward/mean": 0.9010416865348816, "rewards/syntax_reward/std": 0.2993867099285126, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 230.3697967529297, "completions/mean_terminated_length": 168.125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 2.8278201865988124, "frac_reward_zero_std": 0.03125, "grad_norm": 0.814461350440979, "kl": 0.04779052734375, "learning_rate": 1.0492897371142728e-08, "loss": 0.0172, "num_tokens": 213774254.0, "reward": 10.13404655456543, "reward_std": 1.28468918800354, "rewards/accuracy_reward/mean": 1.875, "rewards/accuracy_reward/std": 1.4561657905578613, "rewards/format_reward/mean": 0.9739583134651184, "rewards/format_reward/std": 0.15967556834220886, "rewards/judge_reward/mean": 0.8729166984558105, "rewards/judge_reward/std": 1.3817417621612549, "rewards/ngrams_iou_reward/mean": 0.18881838023662567, "rewards/ngrams_iou_reward/std": 0.21737995743751526, "rewards/schema_keywords_iou_reward/mean": 0.7098104357719421, "rewards/schema_keywords_iou_reward/std": 0.14184333384037018, "rewards/syntax_reward/mean": 0.890625, "rewards/syntax_reward/std": 0.3129251003265381, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 233.6041717529297, "completions/mean_terminated_length": 171.686279296875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.8312128922815947, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8758096098899841, "kl": 0.0494384765625, "learning_rate": 1.009455538106968e-08, "loss": 0.0077, "num_tokens": 214041016.0, "reward": 10.869410514831543, "reward_std": 1.0659860372543335, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 1.4432151317596436, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.1531249284744263, "rewards/judge_reward/std": 1.6688367128372192, "rewards/ngrams_iou_reward/mean": 0.20626860857009888, "rewards/ngrams_iou_reward/std": 0.21767447888851166, "rewards/schema_keywords_iou_reward/mean": 0.752724826335907, "rewards/schema_keywords_iou_reward/std": 0.15796488523483276, "rewards/syntax_reward/mean": 0.890625, "rewards/syntax_reward/std": 0.3129251003265381, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 232.7135467529297, "completions/mean_terminated_length": 174.70909118652344, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 2.8346055979643765, "frac_reward_zero_std": 0.0, "grad_norm": 0.994685709476471, "kl": 0.04791259765625, "learning_rate": 9.70384445879796e-09, "loss": 0.0064, "num_tokens": 214322925.0, "reward": 9.493053436279297, "reward_std": 1.3593233823776245, "rewards/accuracy_reward/mean": 1.40625, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.3020833730697632, "rewards/judge_reward/std": 1.5371135473251343, "rewards/ngrams_iou_reward/mean": 0.17455901205539703, "rewards/ngrams_iou_reward/std": 0.20322445034980774, "rewards/schema_keywords_iou_reward/mean": 0.6830773949623108, "rewards/schema_keywords_iou_reward/std": 0.20366565883159637, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 227.59896850585938, "completions/mean_terminated_length": 166.60655212402344, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.8379983036471588, "frac_reward_zero_std": 0.09375, "grad_norm": 0.8669584393501282, "kl": 0.06304931640625, "learning_rate": 9.320770690280644e-09, "loss": 0.0123, "num_tokens": 214581736.0, "reward": 10.516397476196289, "reward_std": 1.4511950016021729, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.27710798382759094, "rewards/judge_reward/mean": 1.3072916269302368, "rewards/judge_reward/std": 1.6851519346237183, "rewards/ngrams_iou_reward/mean": 0.21887946128845215, "rewards/ngrams_iou_reward/std": 0.27335166931152344, "rewards/schema_keywords_iou_reward/mean": 0.7141836285591125, "rewards/schema_keywords_iou_reward/std": 0.2004450559616089, "rewards/syntax_reward/mean": 0.8958333134651184, "rewards/syntax_reward/std": 0.30627527832984924, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 234.125, "completions/mean_terminated_length": 183.58621215820312, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 2.8413910093299406, "frac_reward_zero_std": 0.0, "grad_norm": 0.7483617663383484, "kl": 0.04534912109375, "learning_rate": 8.945340042509797e-09, "loss": -0.0097, "num_tokens": 214817602.0, "reward": 9.69105339050293, "reward_std": 1.0965813398361206, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.4083333015441895, "rewards/judge_reward/std": 1.533955454826355, "rewards/ngrams_iou_reward/mean": 0.19904440641403198, "rewards/ngrams_iou_reward/std": 0.19336070120334625, "rewards/schema_keywords_iou_reward/mean": 0.675341784954071, "rewards/schema_keywords_iou_reward/std": 0.18150953948497772, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 229.09896850585938, "completions/mean_terminated_length": 172.69354248046875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 2.8447837150127224, "frac_reward_zero_std": 0.0625, "grad_norm": 0.829755961894989, "kl": 0.0576171875, "learning_rate": 8.577558363423553e-09, "loss": 0.0133, "num_tokens": 215087399.0, "reward": 9.749781608581543, "reward_std": 1.3438100814819336, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.3541666269302368, "rewards/judge_reward/std": 1.5272681713104248, "rewards/ngrams_iou_reward/mean": 0.20819151401519775, "rewards/ngrams_iou_reward/std": 0.22446854412555695, "rewards/schema_keywords_iou_reward/mean": 0.703048050403595, "rewards/schema_keywords_iou_reward/std": 0.16325020790100098, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 230.0, "completions/mean_terminated_length": 171.38983154296875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 2.8481764206955047, "frac_reward_zero_std": 0.0, "grad_norm": 0.8860522508621216, "kl": 0.045166015625, "learning_rate": 8.217431381815076e-09, "loss": 0.0158, "num_tokens": 215335667.0, "reward": 10.53847885131836, "reward_std": 0.9449849128723145, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.3229166269302368, "rewards/judge_reward/std": 1.724810004234314, "rewards/ngrams_iou_reward/mean": 0.1976301074028015, "rewards/ngrams_iou_reward/std": 0.1634640097618103, "rewards/schema_keywords_iou_reward/mean": 0.7262654900550842, "rewards/schema_keywords_iou_reward/std": 0.1463654488325119, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.36459350585938, "completions/mean_terminated_length": 171.07461547851562, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 2.8515691263782865, "frac_reward_zero_std": 0.0625, "grad_norm": 0.851938784122467, "kl": 0.05194091796875, "learning_rate": 7.864964707243071e-09, "loss": -0.0012, "num_tokens": 215591607.0, "reward": 10.204736709594727, "reward_std": 1.2343732118606567, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.792708396911621, "rewards/judge_reward/std": 1.7163667678833008, "rewards/ngrams_iou_reward/mean": 0.22668300569057465, "rewards/ngrams_iou_reward/std": 0.28161031007766724, "rewards/schema_keywords_iou_reward/mean": 0.7155532836914062, "rewards/schema_keywords_iou_reward/std": 0.2061089426279068, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 230.46875, "completions/mean_terminated_length": 176.93548583984375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 2.854961832061069, "frac_reward_zero_std": 0.0, "grad_norm": 0.9201685786247253, "kl": 0.0489501953125, "learning_rate": 7.520163829944803e-09, "loss": -0.0095, "num_tokens": 215836389.0, "reward": 10.466147422790527, "reward_std": 0.9774050712585449, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.6041666269302368, "rewards/judge_reward/std": 1.7863893508911133, "rewards/ngrams_iou_reward/mean": 0.16881245374679565, "rewards/ngrams_iou_reward/std": 0.17142543196678162, "rewards/schema_keywords_iou_reward/mean": 0.7140005230903625, "rewards/schema_keywords_iou_reward/std": 0.13912750780582428, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 231.2291717529297, "completions/mean_terminated_length": 172.5614013671875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 2.8583545377438506, "frac_reward_zero_std": 0.0, "grad_norm": 0.7879418134689331, "kl": 0.0601806640625, "learning_rate": 7.18303412075022e-09, "loss": 0.0027, "num_tokens": 216084131.0, "reward": 10.146032333374023, "reward_std": 1.2519474029541016, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.611458420753479, "rewards/judge_reward/std": 1.7588887214660645, "rewards/ngrams_iou_reward/mean": 0.2758134603500366, "rewards/ngrams_iou_reward/std": 0.2775636315345764, "rewards/schema_keywords_iou_reward/mean": 0.741051197052002, "rewards/schema_keywords_iou_reward/std": 0.18291877210140228, "rewards/syntax_reward/mean": 0.875, "rewards/syntax_reward/std": 0.33158352971076965, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 228.48959350585938, "completions/mean_terminated_length": 164.9310302734375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 2.861747243426633, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8038710951805115, "kl": 0.04931640625, "learning_rate": 6.853580830998629e-09, "loss": -0.008, "num_tokens": 216330711.0, "reward": 10.033955574035645, "reward_std": 1.422906756401062, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.359375, "rewards/judge_reward/std": 1.6279674768447876, "rewards/ngrams_iou_reward/mean": 0.2022130936384201, "rewards/ngrams_iou_reward/std": 0.23199529945850372, "rewards/schema_keywords_iou_reward/mean": 0.6859087944030762, "rewards/schema_keywords_iou_reward/std": 0.20755788683891296, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 226.94271850585938, "completions/mean_terminated_length": 168.828125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 2.8651399491094147, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8142340779304504, "kl": 0.0513916015625, "learning_rate": 6.531809092456597e-09, "loss": 0.0035, "num_tokens": 216589438.0, "reward": 10.123580932617188, "reward_std": 1.3469160795211792, "rewards/accuracy_reward/mean": 1.546875, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.3333333730697632, "rewards/judge_reward/std": 1.6586544513702393, "rewards/ngrams_iou_reward/mean": 0.2720637321472168, "rewards/ngrams_iou_reward/std": 0.27642861008644104, "rewards/schema_keywords_iou_reward/mean": 0.7265167236328125, "rewards/schema_keywords_iou_reward/std": 0.19213898479938507, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 228.39584350585938, "completions/mean_terminated_length": 169.11474609375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 2.8685326547921965, "frac_reward_zero_std": 0.0, "grad_norm": 0.7840889096260071, "kl": 0.0560302734375, "learning_rate": 6.217723917238127e-09, "loss": -0.0225, "num_tokens": 216848042.0, "reward": 9.879205703735352, "reward_std": 1.2013951539993286, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.556249976158142, "rewards/judge_reward/std": 1.635778784751892, "rewards/ngrams_iou_reward/mean": 0.18047381937503815, "rewards/ngrams_iou_reward/std": 0.21111249923706055, "rewards/schema_keywords_iou_reward/mean": 0.721646785736084, "rewards/schema_keywords_iou_reward/std": 0.16255588829517365, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 224.45834350585938, "completions/mean_terminated_length": 162.8307647705078, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 2.871925360474979, "frac_reward_zero_std": 0.09375, "grad_norm": 0.725601077079773, "kl": 0.05072021484375, "learning_rate": 5.9113301977266605e-09, "loss": 0.0019, "num_tokens": 217102764.0, "reward": 9.97712516784668, "reward_std": 1.1292059421539307, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12434382736682892, "rewards/judge_reward/mean": 1.5749999284744263, "rewards/judge_reward/std": 1.6543769836425781, "rewards/ngrams_iou_reward/mean": 0.2785976231098175, "rewards/ngrams_iou_reward/std": 0.31882742047309875, "rewards/schema_keywords_iou_reward/mean": 0.7412347793579102, "rewards/schema_keywords_iou_reward/std": 0.16333357989788055, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 227.88021850585938, "completions/mean_terminated_length": 179.95774841308594, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 2.875318066157761, "frac_reward_zero_std": 0.0, "grad_norm": 0.7682471871376038, "kl": 0.0599365234375, "learning_rate": 5.612632706498754e-09, "loss": -0.0164, "num_tokens": 217364179.0, "reward": 9.486684799194336, "reward_std": 1.5950515270233154, "rewards/accuracy_reward/mean": 1.359375, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24269428849220276, "rewards/judge_reward/mean": 1.4354166984558105, "rewards/judge_reward/std": 1.5007139444351196, "rewards/ngrams_iou_reward/mean": 0.18085260689258575, "rewards/ngrams_iou_reward/std": 0.18688973784446716, "rewards/schema_keywords_iou_reward/mean": 0.6954143047332764, "rewards/schema_keywords_iou_reward/std": 0.1649477183818817, "rewards/syntax_reward/mean": 0.7239583134651184, "rewards/syntax_reward/std": 0.4482063353061676, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 232.2135467529297, "completions/mean_terminated_length": 174.44644165039062, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.878710771840543, "frac_reward_zero_std": 0.0625, "grad_norm": 0.839872419834137, "kl": 0.047119140625, "learning_rate": 5.321636096249749e-09, "loss": 0.0097, "num_tokens": 217629402.0, "reward": 10.026606559753418, "reward_std": 1.2378112077713013, "rewards/accuracy_reward/mean": 1.78125, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 0.9708333015441895, "rewards/judge_reward/std": 1.4483003616333008, "rewards/ngrams_iou_reward/mean": 0.20781570672988892, "rewards/ngrams_iou_reward/std": 0.21669545769691467, "rewards/schema_keywords_iou_reward/mean": 0.7208738923072815, "rewards/schema_keywords_iou_reward/std": 0.16737507283687592, "rewards/syntax_reward/mean": 0.8489583134651184, "rewards/syntax_reward/std": 0.35902565717697144, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.63021850585938, "completions/mean_terminated_length": 181.73016357421875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 2.8821034775233247, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7749503254890442, "kl": 0.05010986328125, "learning_rate": 5.038344899721436e-09, "loss": 0.0091, "num_tokens": 217890985.0, "reward": 9.874067306518555, "reward_std": 1.22401762008667, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.5052083730697632, "rewards/judge_reward/std": 1.713382363319397, "rewards/ngrams_iou_reward/mean": 0.15400061011314392, "rewards/ngrams_iou_reward/std": 0.162291020154953, "rewards/schema_keywords_iou_reward/mean": 0.6992327570915222, "rewards/schema_keywords_iou_reward/std": 0.14314275979995728, "rewards/syntax_reward/mean": 0.7760416865348816, "rewards/syntax_reward/std": 0.41798436641693115, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.36459350585938, "completions/mean_terminated_length": 173.51612854003906, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 2.885496183206107, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7430111169815063, "kl": 0.04632568359375, "learning_rate": 4.762763529631342e-09, "loss": 0.0056, "num_tokens": 218131325.0, "reward": 10.90503215789795, "reward_std": 1.2364274263381958, "rewards/accuracy_reward/mean": 1.78125, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.3270832300186157, "rewards/judge_reward/std": 1.7352808713912964, "rewards/ngrams_iou_reward/mean": 0.3285422623157501, "rewards/ngrams_iou_reward/std": 0.33155953884124756, "rewards/schema_keywords_iou_reward/mean": 0.7712805867195129, "rewards/schema_keywords_iou_reward/std": 0.16376806795597076, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 231.4166717529297, "completions/mean_terminated_length": 173.19297790527344, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 2.888888888888889, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8028118014335632, "kl": 0.0531005859375, "learning_rate": 4.4948962786039435e-09, "loss": 0.021, "num_tokens": 218379259.0, "reward": 9.717171669006348, "reward_std": 1.4753272533416748, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.5401042699813843, "rewards/judge_reward/std": 1.6362407207489014, "rewards/ngrams_iou_reward/mean": 0.2361944317817688, "rewards/ngrams_iou_reward/std": 0.25513383746147156, "rewards/schema_keywords_iou_reward/mean": 0.7028513550758362, "rewards/schema_keywords_iou_reward/std": 0.1826753318309784, "rewards/syntax_reward/mean": 0.7395833134651184, "rewards/syntax_reward/std": 0.44000932574272156, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 230.75521850585938, "completions/mean_terminated_length": 177.82257080078125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 2.8922815945716707, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7287147045135498, "kl": 0.04461669921875, "learning_rate": 4.234747319103948e-09, "loss": -0.0069, "num_tokens": 218616176.0, "reward": 10.153785705566406, "reward_std": 1.2377736568450928, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.3463541269302368, "rewards/judge_reward/std": 1.6133238077163696, "rewards/ngrams_iou_reward/mean": 0.20911812782287598, "rewards/ngrams_iou_reward/std": 0.24675065279006958, "rewards/schema_keywords_iou_reward/mean": 0.7207088470458984, "rewards/schema_keywords_iou_reward/std": 0.1579391360282898, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 222.78646850585938, "completions/mean_terminated_length": 169.82432556152344, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.895674300254453, "frac_reward_zero_std": 0.0, "grad_norm": 1.0147181749343872, "kl": 0.0560302734375, "learning_rate": 3.982320703371067e-09, "loss": 0.0159, "num_tokens": 218870817.0, "reward": 10.29623031616211, "reward_std": 1.267899513244629, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.426041603088379, "rewards/judge_reward/std": 1.6541708707809448, "rewards/ngrams_iou_reward/mean": 0.19493229687213898, "rewards/ngrams_iou_reward/std": 0.22449705004692078, "rewards/schema_keywords_iou_reward/mean": 0.7023382782936096, "rewards/schema_keywords_iou_reward/std": 0.1617046296596527, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 233.9375, "completions/mean_terminated_length": 180.35714721679688, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 2.899067005937235, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7352743744850159, "kl": 0.05206298828125, "learning_rate": 3.737620363357286e-09, "loss": 0.0006, "num_tokens": 219125271.0, "reward": 9.908401489257812, "reward_std": 1.4790571928024292, "rewards/accuracy_reward/mean": 1.4375, "rewards/accuracy_reward/std": 1.5026154518127441, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.455208420753479, "rewards/judge_reward/std": 1.6128002405166626, "rewards/ngrams_iou_reward/mean": 0.1734592765569687, "rewards/ngrams_iou_reward/std": 0.17069180309772491, "rewards/schema_keywords_iou_reward/mean": 0.6943166851997375, "rewards/schema_keywords_iou_reward/std": 0.15144780278205872, "rewards/syntax_reward/mean": 0.859375, "rewards/syntax_reward/std": 0.3485431373119354, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 228.296875, "completions/mean_terminated_length": 170.2096710205078, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 2.902459711620017, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8843387365341187, "kl": 0.04937744140625, "learning_rate": 3.500650110665193e-09, "loss": 0.0268, "num_tokens": 219379392.0, "reward": 10.199833869934082, "reward_std": 1.684457778930664, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.165624976158142, "rewards/judge_reward/std": 1.5895817279815674, "rewards/ngrams_iou_reward/mean": 0.18941731750965118, "rewards/ngrams_iou_reward/std": 0.21189160645008087, "rewards/schema_keywords_iou_reward/mean": 0.7260410189628601, "rewards/schema_keywords_iou_reward/std": 0.15547595918178558, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617491722107, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 228.1666717529297, "completions/mean_terminated_length": 173.7846221923828, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 2.905852417302799, "frac_reward_zero_std": 0.0, "grad_norm": 0.8284968733787537, "kl": 0.059326171875, "learning_rate": 3.2714136364888065e-09, "loss": -0.0154, "num_tokens": 219648674.0, "reward": 9.798956871032715, "reward_std": 1.240302324295044, "rewards/accuracy_reward/mean": 1.609375, "rewards/accuracy_reward/std": 1.4999181032180786, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.1729166507720947, "rewards/judge_reward/std": 1.5774908065795898, "rewards/ngrams_iou_reward/mean": 0.1603587120771408, "rewards/ngrams_iou_reward/std": 0.17325206100940704, "rewards/schema_keywords_iou_reward/mean": 0.6781811714172363, "rewards/schema_keywords_iou_reward/std": 0.1808580607175827, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 228.6197967529297, "completions/mean_terminated_length": 172.55557250976562, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 2.909245122985581, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8252243399620056, "kl": 0.05096435546875, "learning_rate": 3.0499145115561177e-09, "loss": 0.0228, "num_tokens": 219904981.0, "reward": 9.61595344543457, "reward_std": 1.5049662590026855, "rewards/accuracy_reward/mean": 1.34375, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 1.4333332777023315, "rewards/judge_reward/std": 1.5175528526306152, "rewards/ngrams_iou_reward/mean": 0.2062745839357376, "rewards/ngrams_iou_reward/std": 0.24305468797683716, "rewards/schema_keywords_iou_reward/mean": 0.7096786499023438, "rewards/schema_keywords_iou_reward/std": 0.16768620908260345, "rewards/syntax_reward/mean": 0.8541666865348816, "rewards/syntax_reward/std": 0.3538617789745331, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 229.20834350585938, "completions/mean_terminated_length": 170.2666778564453, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 2.912637828668363, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7161747217178345, "kl": 0.04803466796875, "learning_rate": 2.836156186073413e-09, "loss": 0.0219, "num_tokens": 220151969.0, "reward": 10.632795333862305, "reward_std": 1.4782218933105469, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 1.4432151317596436, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.1010416746139526, "rewards/judge_reward/std": 1.596592903137207, "rewards/ngrams_iou_reward/mean": 0.18973158299922943, "rewards/ngrams_iou_reward/std": 0.17610298097133636, "rewards/schema_keywords_iou_reward/mean": 0.7253541350364685, "rewards/schema_keywords_iou_reward/std": 0.1619417518377304, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 222.203125, "completions/mean_terminated_length": 156.1692352294922, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 2.916030534351145, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8317294716835022, "kl": 0.049072265625, "learning_rate": 2.6301419896715416e-09, "loss": 0.0125, "num_tokens": 220401146.0, "reward": 10.229527473449707, "reward_std": 1.2955372333526611, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 1.513541579246521, "rewards/judge_reward/std": 1.6685020923614502, "rewards/ngrams_iou_reward/mean": 0.18112903833389282, "rewards/ngrams_iou_reward/std": 0.22131524980068207, "rewards/schema_keywords_iou_reward/mean": 0.734856128692627, "rewards/schema_keywords_iou_reward/std": 0.16876445710659027, "rewards/syntax_reward/mean": 0.8229166865348816, "rewards/syntax_reward/std": 0.3827372193336487, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 256.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 227.78646850585938, "completions/mean_terminated_length": 172.6615447998047, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 2.919423240033927, "frac_reward_zero_std": 0.09375, "grad_norm": 0.7397957444190979, "kl": 0.0501708984375, "learning_rate": 2.431875131354011e-09, "loss": 0.0152, "num_tokens": 220628937.0, "reward": 9.728015899658203, "reward_std": 1.419995665550232, "rewards/accuracy_reward/mean": 1.5, "rewards/accuracy_reward/std": 1.5039215087890625, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.25, "rewards/judge_reward/std": 1.4706909656524658, "rewards/ngrams_iou_reward/mean": 0.19492237269878387, "rewards/ngrams_iou_reward/std": 0.2064415067434311, "rewards/schema_keywords_iou_reward/mean": 0.6945516467094421, "rewards/schema_keywords_iou_reward/std": 0.18100668489933014, "rewards/syntax_reward/mean": 0.890625, "rewards/syntax_reward/std": 0.3129251003265381, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 233.02084350585938, "completions/mean_terminated_length": 177.21429443359375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 2.9228159457167093, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8061403036117554, "kl": 0.05267333984375, "learning_rate": 2.2413586994470825e-09, "loss": -0.0077, "num_tokens": 220883437.0, "reward": 10.309630393981934, "reward_std": 1.56017005443573, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2922426164150238, "rewards/judge_reward/mean": 1.3916665315628052, "rewards/judge_reward/std": 1.7087191343307495, "rewards/ngrams_iou_reward/mean": 0.1812792420387268, "rewards/ngrams_iou_reward/std": 0.16729354858398438, "rewards/schema_keywords_iou_reward/mean": 0.7304337620735168, "rewards/schema_keywords_iou_reward/std": 0.1512717753648758, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 256.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 229.84896850585938, "completions/mean_terminated_length": 159.4423065185547, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 2.926208651399491, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7568052411079407, "kl": 0.04608154296875, "learning_rate": 2.058595661551532e-09, "loss": -0.0225, "num_tokens": 221148440.0, "reward": 9.806318283081055, "reward_std": 1.3839658498764038, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.4447917938232422, "rewards/judge_reward/std": 1.681894302368164, "rewards/ngrams_iou_reward/mean": 0.20805774629116058, "rewards/ngrams_iou_reward/std": 0.2277674525976181, "rewards/schema_keywords_iou_reward/mean": 0.7034685611724854, "rewards/schema_keywords_iou_reward/std": 0.1934986412525177, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 227.6979217529297, "completions/mean_terminated_length": 176.08824157714844, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 2.929601357082273, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8621458411216736, "kl": 0.0533447265625, "learning_rate": 1.8835888644966324e-09, "loss": 0.002, "num_tokens": 221425912.0, "reward": 9.76765251159668, "reward_std": 1.487558364868164, "rewards/accuracy_reward/mean": 1.21875, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277598083019257, "rewards/judge_reward/mean": 1.6906250715255737, "rewards/judge_reward/std": 1.619228482246399, "rewards/ngrams_iou_reward/mean": 0.25248444080352783, "rewards/ngrams_iou_reward/std": 0.28124523162841797, "rewards/schema_keywords_iou_reward/mean": 0.7328751683235168, "rewards/schema_keywords_iou_reward/std": 0.18042834103107452, "rewards/syntax_reward/mean": 0.796875, "rewards/syntax_reward/std": 0.40337660908699036, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 224.234375, "completions/mean_terminated_length": 156.01638793945312, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 2.9329940627650553, "frac_reward_zero_std": 0.09375, "grad_norm": 0.830262303352356, "kl": 0.05584716796875, "learning_rate": 1.7163410342956874e-09, "loss": 0.0257, "num_tokens": 221666857.0, "reward": 10.144376754760742, "reward_std": 1.1345958709716797, "rewards/accuracy_reward/mean": 1.59375, "rewards/accuracy_reward/std": 1.500981330871582, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.28125, "rewards/judge_reward/std": 1.566689372062683, "rewards/ngrams_iou_reward/mean": 0.29390355944633484, "rewards/ngrams_iou_reward/std": 0.2980216443538666, "rewards/schema_keywords_iou_reward/mean": 0.7306806445121765, "rewards/schema_keywords_iou_reward/std": 0.19342175126075745, "rewards/syntax_reward/mean": 0.8177083134651184, "rewards/syntax_reward/std": 0.38709405064582825, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 223.55209350585938, "completions/mean_terminated_length": 165.71014404296875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 2.936386768447837, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7548751831054688, "kl": 0.0501708984375, "learning_rate": 1.5568547761034001e-09, "loss": 0.0122, "num_tokens": 221935169.0, "reward": 10.48901653289795, "reward_std": 0.9267624616622925, "rewards/accuracy_reward/mean": 1.984375, "rewards/accuracy_reward/std": 1.423352599143982, "rewards/format_reward/mean": 0.9635416865348816, "rewards/format_reward/std": 0.18791764974594116, "rewards/judge_reward/mean": 0.8843750357627869, "rewards/judge_reward/std": 1.4506778717041016, "rewards/ngrams_iou_reward/mean": 0.20734380185604095, "rewards/ngrams_iou_reward/std": 0.2557760179042816, "rewards/schema_keywords_iou_reward/mean": 0.7316716313362122, "rewards/schema_keywords_iou_reward/std": 0.1721956878900528, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 174.9091033935547, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 2.939779474130619, "frac_reward_zero_std": 0.0, "grad_norm": 0.7518686652183533, "kl": 0.05035400390625, "learning_rate": 1.4051325741756825e-09, "loss": 0.026, "num_tokens": 222189305.0, "reward": 9.681356430053711, "reward_std": 1.4373013973236084, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606793940067291, "rewards/judge_reward/mean": 1.4499999284744263, "rewards/judge_reward/std": 1.6603790521621704, "rewards/ngrams_iou_reward/mean": 0.14982594549655914, "rewards/ngrams_iou_reward/std": 0.16922703385353088, "rewards/schema_keywords_iou_reward/mean": 0.6731967329978943, "rewards/schema_keywords_iou_reward/std": 0.18222838640213013, "rewards/syntax_reward/mean": 0.765625, "rewards/syntax_reward/std": 0.4247150123119354, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 227.265625, "completions/mean_terminated_length": 167.01612854003906, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.943172179813401, "frac_reward_zero_std": 0.0, "grad_norm": 0.874667227268219, "kl": 0.05059814453125, "learning_rate": 1.2611767918306315e-09, "loss": 0.0277, "num_tokens": 222436970.0, "reward": 9.523887634277344, "reward_std": 1.2980563640594482, "rewards/accuracy_reward/mean": 1.328125, "rewards/accuracy_reward/std": 1.494016170501709, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.509374976158142, "rewards/judge_reward/std": 1.6200042963027954, "rewards/ngrams_iou_reward/mean": 0.19545917212963104, "rewards/ngrams_iou_reward/std": 0.19859513640403748, "rewards/schema_keywords_iou_reward/mean": 0.6794695854187012, "rewards/schema_keywords_iou_reward/std": 0.17642831802368164, "rewards/syntax_reward/mean": 0.71875, "rewards/syntax_reward/std": 0.4507846534252167, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 230.91146850585938, "completions/mean_terminated_length": 175.7166748046875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 2.9465648854961835, "frac_reward_zero_std": 0.0, "grad_norm": 0.8349516987800598, "kl": 0.05072021484375, "learning_rate": 1.1249896714117802e-09, "loss": 0.0102, "num_tokens": 222688287.0, "reward": 9.647517204284668, "reward_std": 1.2428374290466309, "rewards/accuracy_reward/mean": 1.625, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034871995449066, "rewards/judge_reward/mean": 1.03125, "rewards/judge_reward/std": 1.377098560333252, "rewards/ngrams_iou_reward/mean": 0.18094803392887115, "rewards/ngrams_iou_reward/std": 0.1789059042930603, "rewards/schema_keywords_iou_reward/mean": 0.7061519622802734, "rewards/schema_keywords_iou_reward/std": 0.15408116579055786, "rewards/syntax_reward/mean": 0.8645833134651184, "rewards/syntax_reward/std": 0.3430626094341278, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.63021850585938, "completions/mean_terminated_length": 176.890625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 2.9499575911789653, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8113992810249329, "kl": 0.0460205078125, "learning_rate": 9.965733342532923e-10, "loss": -0.0028, "num_tokens": 222948844.0, "reward": 9.517145156860352, "reward_std": 1.1089060306549072, "rewards/accuracy_reward/mean": 1.421875, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 1.2843749523162842, "rewards/judge_reward/std": 1.5130031108856201, "rewards/ngrams_iou_reward/mean": 0.2249247282743454, "rewards/ngrams_iou_reward/std": 0.26250746846199036, "rewards/schema_keywords_iou_reward/mean": 0.7078443169593811, "rewards/schema_keywords_iou_reward/std": 0.18789246678352356, "rewards/syntax_reward/mean": 0.828125, "rewards/syntax_reward/std": 0.37825807929039, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 228.8072967529297, "completions/mean_terminated_length": 168.98333740234375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 2.953350296861747, "frac_reward_zero_std": 0.125, "grad_norm": 0.7343104481697083, "kl": 0.050048828125, "learning_rate": 8.759297806469335e-10, "loss": 0.0031, "num_tokens": 223190613.0, "reward": 9.95028305053711, "reward_std": 1.202857255935669, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.1104166507720947, "rewards/judge_reward/std": 1.4536092281341553, "rewards/ngrams_iou_reward/mean": 0.25214359164237976, "rewards/ngrams_iou_reward/std": 0.2771517038345337, "rewards/schema_keywords_iou_reward/mean": 0.7585551738739014, "rewards/schema_keywords_iou_reward/std": 0.16264864802360535, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 228.73959350585938, "completions/mean_terminated_length": 174.21875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 2.9567430025445294, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8372417092323303, "kl": 0.0526123046875, "learning_rate": 7.630608898105961e-10, "loss": 0.0106, "num_tokens": 223475215.0, "reward": 9.459831237792969, "reward_std": 1.5192599296569824, "rewards/accuracy_reward/mean": 1.3125, "rewards/accuracy_reward/std": 1.4921258687973022, "rewards/format_reward/mean": 0.9479166865348816, "rewards/format_reward/std": 0.22277599573135376, "rewards/judge_reward/mean": 1.4500001668930054, "rewards/judge_reward/std": 1.576927900314331, "rewards/ngrams_iou_reward/mean": 0.19322486221790314, "rewards/ngrams_iou_reward/std": 0.22641266882419586, "rewards/schema_keywords_iou_reward/mean": 0.6791064143180847, "rewards/schema_keywords_iou_reward/std": 0.18441373109817505, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 256.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 230.27084350585938, "completions/mean_terminated_length": 157.1999969482422, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 2.960135708227311, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8866628408432007, "kl": 0.04766845703125, "learning_rate": 6.579684198594337e-10, "loss": 0.0097, "num_tokens": 223705667.0, "reward": 10.140524864196777, "reward_std": 1.5607578754425049, "rewards/accuracy_reward/mean": 1.65625, "rewards/accuracy_reward/std": 1.4957400560379028, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.1958333253860474, "rewards/judge_reward/std": 1.6186037063598633, "rewards/ngrams_iou_reward/mean": 0.2855432331562042, "rewards/ngrams_iou_reward/std": 0.2904534935951233, "rewards/schema_keywords_iou_reward/mean": 0.7237312197685242, "rewards/schema_keywords_iou_reward/std": 0.20870767533779144, "rewards/syntax_reward/mean": 0.8125, "rewards/syntax_reward/std": 0.3913327753543854, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 256.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 170.18182373046875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 2.963528413910093, "frac_reward_zero_std": 0.03125, "grad_norm": 0.8010122776031494, "kl": 0.047607421875, "learning_rate": 5.606540077782162e-10, "loss": -0.0081, "num_tokens": 223947029.0, "reward": 9.819700241088867, "reward_std": 1.4901862144470215, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.53125, "rewards/judge_reward/std": 1.6379857063293457, "rewards/ngrams_iou_reward/mean": 0.21023325622081757, "rewards/ngrams_iou_reward/std": 0.20331251621246338, "rewards/schema_keywords_iou_reward/mean": 0.6980075836181641, "rewards/schema_keywords_iou_reward/std": 0.1750170737504959, "rewards/syntax_reward/mean": 0.78125, "rewards/syntax_reward/std": 0.41447943449020386, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 256.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 234.38021850585938, "completions/mean_terminated_length": 179.12962341308594, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 2.9669211195928753, "frac_reward_zero_std": 0.0, "grad_norm": 0.7864583730697632, "kl": 0.0496826171875, "learning_rate": 4.711191693959615e-10, "loss": 0.0095, "num_tokens": 224186910.0, "reward": 10.345396995544434, "reward_std": 1.3196117877960205, "rewards/accuracy_reward/mean": 1.78125, "rewards/accuracy_reward/std": 1.4772489070892334, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.1708333492279053, "rewards/judge_reward/std": 1.5289126634597778, "rewards/ngrams_iou_reward/mean": 0.18702609837055206, "rewards/ngrams_iou_reward/std": 0.19828839600086212, "rewards/schema_keywords_iou_reward/mean": 0.7073294520378113, "rewards/schema_keywords_iou_reward/std": 0.15841449797153473, "rewards/syntax_reward/mean": 0.8333333134651184, "rewards/syntax_reward/std": 0.37365230917930603, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 230.6197967529297, "completions/mean_terminated_length": 173.4067840576172, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.9703138252756576, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7947092652320862, "kl": 0.05157470703125, "learning_rate": 3.8936529936217656e-10, "loss": 0.0073, "num_tokens": 224446247.0, "reward": 10.494950294494629, "reward_std": 1.403921127319336, "rewards/accuracy_reward/mean": 1.71875, "rewards/accuracy_reward/std": 1.48784339427948, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.2937499284744263, "rewards/judge_reward/std": 1.678307056427002, "rewards/ngrams_iou_reward/mean": 0.23111169040203094, "rewards/ngrams_iou_reward/std": 0.2528926432132721, "rewards/schema_keywords_iou_reward/mean": 0.7492544054985046, "rewards/schema_keywords_iou_reward/std": 0.1824713498353958, "rewards/syntax_reward/mean": 0.84375, "rewards/syntax_reward/std": 0.36404144763946533, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 231.375, "completions/mean_terminated_length": 171.57144165039062, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 2.9737065309584394, "frac_reward_zero_std": 0.0, "grad_norm": 0.8033210039138794, "kl": 0.04840087890625, "learning_rate": 3.1539367112543014e-10, "loss": 0.021, "num_tokens": 224725565.0, "reward": 9.852178573608398, "reward_std": 1.2506812810897827, "rewards/accuracy_reward/mean": 1.453125, "rewards/accuracy_reward/std": 1.5031870603561401, "rewards/format_reward/mean": 0.9322916865348816, "rewards/format_reward/std": 0.2519015669822693, "rewards/judge_reward/mean": 1.4072917699813843, "rewards/judge_reward/std": 1.686207890510559, "rewards/ngrams_iou_reward/mean": 0.19700641930103302, "rewards/ngrams_iou_reward/std": 0.22522124648094177, "rewards/schema_keywords_iou_reward/mean": 0.7103796005249023, "rewards/schema_keywords_iou_reward/std": 0.1663149893283844, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151406288147, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 227.40625, "completions/mean_terminated_length": 168.85714721679688, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 2.9770992366412212, "frac_reward_zero_std": 0.0, "grad_norm": 0.8941340446472168, "kl": 0.04827880859375, "learning_rate": 2.4920543691309137e-10, "loss": -0.0002, "num_tokens": 224973875.0, "reward": 10.635798454284668, "reward_std": 1.2636854648590088, "rewards/accuracy_reward/mean": 1.9375, "rewards/accuracy_reward/std": 1.4385310411453247, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.03125, "rewards/judge_reward/std": 1.5616685152053833, "rewards/ngrams_iou_reward/mean": 0.2631421387195587, "rewards/ngrams_iou_reward/std": 0.2935165464878082, "rewards/schema_keywords_iou_reward/mean": 0.7268218398094177, "rewards/schema_keywords_iou_reward/std": 0.19004908204078674, "rewards/syntax_reward/mean": 0.8020833134651184, "rewards/syntax_reward/std": 0.39947062730789185, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 228.4010467529297, "completions/mean_terminated_length": 171.88890075683594, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 2.9804919423240035, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8512468934059143, "kl": 0.0546875, "learning_rate": 1.9080162771378805e-10, "loss": 0.0165, "num_tokens": 225223072.0, "reward": 10.253978729248047, "reward_std": 1.1953649520874023, "rewards/accuracy_reward/mean": 1.578125, "rewards/accuracy_reward/std": 1.5018802881240845, "rewards/format_reward/mean": 0.9270833134651184, "rewards/format_reward/std": 0.2606794238090515, "rewards/judge_reward/mean": 1.410416603088379, "rewards/judge_reward/std": 1.6432299613952637, "rewards/ngrams_iou_reward/mean": 0.27121201157569885, "rewards/ngrams_iou_reward/std": 0.30734819173812866, "rewards/schema_keywords_iou_reward/mean": 0.745265543460846, "rewards/schema_keywords_iou_reward/std": 0.17529036104679108, "rewards/syntax_reward/mean": 0.7552083134651184, "rewards/syntax_reward/std": 0.4310877323150635, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 229.95834350585938, "completions/mean_terminated_length": 174.03277587890625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 2.9838846480067853, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7655709981918335, "kl": 0.048828125, "learning_rate": 1.401831532610309e-10, "loss": 0.0061, "num_tokens": 225478442.0, "reward": 9.720714569091797, "reward_std": 1.339855432510376, "rewards/accuracy_reward/mean": 1.234375, "rewards/accuracy_reward/std": 1.4801533222198486, "rewards/format_reward/mean": 0.9427083134651184, "rewards/format_reward/std": 0.23300664126873016, "rewards/judge_reward/mean": 1.712499976158142, "rewards/judge_reward/std": 1.5933927297592163, "rewards/ngrams_iou_reward/mean": 0.1733151078224182, "rewards/ngrams_iou_reward/std": 0.16093206405639648, "rewards/schema_keywords_iou_reward/mean": 0.6848986148834229, "rewards/schema_keywords_iou_reward/std": 0.14908380806446075, "rewards/syntax_reward/mean": 0.7916666865348816, "rewards/syntax_reward/std": 0.40717819333076477, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 226.5729217529297, "completions/mean_terminated_length": 172.91175842285156, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 2.9872773536895676, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9149185419082642, "kl": 0.04833984375, "learning_rate": 9.735080201922486e-11, "loss": 0.0102, "num_tokens": 225738334.0, "reward": 9.79557991027832, "reward_std": 1.243537187576294, "rewards/accuracy_reward/mean": 1.640625, "rewards/accuracy_reward/std": 1.497298002243042, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20034873485565186, "rewards/judge_reward/mean": 1.1072916984558105, "rewards/judge_reward/std": 1.4796168804168701, "rewards/ngrams_iou_reward/mean": 0.19917017221450806, "rewards/ngrams_iou_reward/std": 0.24114690721035004, "rewards/schema_keywords_iou_reward/mean": 0.6943259239196777, "rewards/schema_keywords_iou_reward/std": 0.16941417753696442, "rewards/syntax_reward/mean": 0.8072916865348816, "rewards/syntax_reward/std": 0.39545711874961853, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 256.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 226.9479217529297, "completions/mean_terminated_length": 163.03334045410156, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 2.9906700593723494, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8482857942581177, "kl": 0.049072265625, "learning_rate": 6.230524117134539e-11, "loss": 0.0099, "num_tokens": 225996432.0, "reward": 10.423307418823242, "reward_std": 0.9950863122940063, "rewards/accuracy_reward/mean": 1.921875, "rewards/accuracy_reward/std": 1.4432151317596436, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2690697908401489, "rewards/judge_reward/mean": 0.9489583373069763, "rewards/judge_reward/std": 1.4813672304153442, "rewards/ngrams_iou_reward/mean": 0.2375548630952835, "rewards/ngrams_iou_reward/std": 0.2776705324649811, "rewards/schema_keywords_iou_reward/mean": 0.7201266288757324, "rewards/schema_keywords_iou_reward/std": 0.19433225691318512, "rewards/syntax_reward/mean": 0.8802083134651184, "rewards/syntax_reward/std": 0.32556667923927307, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 226.83334350585938, "completions/mean_terminated_length": 168.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 2.9940627650551317, "frac_reward_zero_std": 0.03125, "grad_norm": 0.9077220559120178, "kl": 0.05645751953125, "learning_rate": 3.5047016608613646e-11, "loss": 0.0088, "num_tokens": 226280242.0, "reward": 9.815513610839844, "reward_std": 0.9984002113342285, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17444752156734467, "rewards/judge_reward/mean": 1.5270832777023315, "rewards/judge_reward/std": 1.6616699695587158, "rewards/ngrams_iou_reward/mean": 0.16567771136760712, "rewards/ngrams_iou_reward/std": 0.12561801075935364, "rewards/schema_keywords_iou_reward/mean": 0.7154609560966492, "rewards/schema_keywords_iou_reward/std": 0.14700330793857574, "rewards/syntax_reward/mean": 0.7864583134651184, "rewards/syntax_reward/std": 0.4108782112598419, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 256.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 228.66146850585938, "completions/mean_terminated_length": 182.07041931152344, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 2.9974554707379135, "frac_reward_zero_std": 0.03125, "grad_norm": 0.7668232321739197, "kl": 0.0498046875, "learning_rate": 1.5576552921836573e-11, "loss": 0.0003, "num_tokens": 226571285.0, "reward": 9.61807632446289, "reward_std": 1.3013298511505127, "rewards/accuracy_reward/mean": 1.375, "rewards/accuracy_reward/std": 1.4986904859542847, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21192367374897003, "rewards/judge_reward/mean": 1.4072917699813843, "rewards/judge_reward/std": 1.5182480812072754, "rewards/ngrams_iou_reward/mean": 0.19630439579486847, "rewards/ngrams_iou_reward/std": 0.20840685069561005, "rewards/schema_keywords_iou_reward/mean": 0.6905214786529541, "rewards/schema_keywords_iou_reward/std": 0.16664598882198334, "rewards/syntax_reward/mean": 0.8385416865348816, "rewards/syntax_reward/std": 0.3689151108264923, "step": 884 }, { "epoch": 2.9974554707379135, "step": 884, "total_flos": 0.0, "train_loss": 0.0003594789403539185, "train_runtime": 4902.9642, "train_samples_per_second": 5.769, "train_steps_per_second": 0.181 } ], "logging_steps": 1, "max_steps": 885, "num_input_tokens_seen": 226571285, "num_train_epochs": 3, "save_steps": 52, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }