{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3138, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 285.84375, "epoch": 0.00031867431485022306, "grad_norm": 7.580958366394043, "kl": 0.0, "learning_rate": 9.996813256851498e-07, "loss": -0.0, "reward": 0.9926098585128784, "reward_std": 0.4385250508785248, "rewards/answer_reward": 0.078125, "rewards/format_reward_gqa": 0.71875, "rewards/iou_glue_reward": 0.19573485851287842, "step": 1 }, { "completion_length": 265.53125, "epoch": 0.0006373486297004461, "grad_norm": 11.604437828063965, "kl": 0.00079345703125, "learning_rate": 9.993626513702996e-07, "loss": 0.0, "reward": 1.0492684841156006, "reward_std": 0.5438859462738037, "rewards/format_reward_tg": 0.71875, "rewards/iou_timestamp_reward": 0.23676855862140656, "rewards/pad": 0.09375, "step": 2 }, { "completion_length": 251.453125, "epoch": 0.0009560229445506692, "grad_norm": 12.72636604309082, "kl": 0.00127410888671875, "learning_rate": 9.990439770554494e-07, "loss": 0.0001, "reward": 1.0591096878051758, "reward_std": 0.5083183646202087, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 0.75, "rewards/tracking_iou_reward": 0.23098470270633698, "step": 3 }, { "completion_length": 127.15625, "epoch": 0.0012746972594008922, "grad_norm": 16.9719295501709, "kl": 0.00360107421875, "learning_rate": 9.98725302740599e-07, "loss": 0.0001, "reward": 1.0967737436294556, "reward_std": 0.32875216007232666, "rewards/format_reward_tg": 0.90625, "rewards/iou_timestamp_reward": 0.19052375853061676, "rewards/pad": 0.0, "step": 4 }, { "completion_length": 209.3125, "epoch": 0.0015933715742511153, "grad_norm": 16.323808670043945, "kl": 0.0026092529296875, "learning_rate": 9.984066284257488e-07, "loss": 0.0001, "reward": 1.1501266956329346, "reward_std": 0.2969282269477844, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 0.875, "rewards/iou_glue_reward": 0.16575169563293457, "step": 5 }, { "completion_length": 237.34375, "epoch": 0.0019120458891013384, "grad_norm": 6.7259602546691895, "kl": 0.00933837890625, "learning_rate": 9.980879541108986e-07, "loss": 0.0004, "reward": 1.2382855415344238, "reward_std": 0.2583405375480652, "rewards/pad": 0.140625, "rewards/tracking_format_reward": 0.9375, "rewards/tracking_iou_reward": 0.1601606011390686, "step": 6 }, { "completion_length": 237.921875, "epoch": 0.0022307202039515616, "grad_norm": 11.441876411437988, "kl": 0.004364013671875, "learning_rate": 9.977692797960484e-07, "loss": 0.0002, "reward": 1.0197477340698242, "reward_std": 0.5455303192138672, "rewards/format_reward_tg": 0.65625, "rewards/iou_timestamp_reward": 0.191622793674469, "rewards/pad": 0.171875, "step": 7 }, { "completion_length": 245.140625, "epoch": 0.0025493945188017845, "grad_norm": 37.0315055847168, "kl": 0.01007080078125, "learning_rate": 9.974506054811982e-07, "loss": 0.0004, "reward": 1.1466755867004395, "reward_std": 0.32557743787765503, "rewards/format_reward_tg": 0.875, "rewards/iou_timestamp_reward": 0.17792558670043945, "rewards/pad": 0.09375, "step": 8 }, { "completion_length": 269.09375, "epoch": 0.0028680688336520078, "grad_norm": 30.685070037841797, "kl": 0.01055908203125, "learning_rate": 9.97131931166348e-07, "loss": 0.0004, "reward": 1.16693115234375, "reward_std": 0.42338114976882935, "rewards/format_reward_tg": 0.8125, "rewards/iou_timestamp_reward": 0.21380607783794403, "rewards/pad": 0.140625, "step": 9 }, { "completion_length": 267.671875, "epoch": 0.0031867431485022306, "grad_norm": 6.045750617980957, "kl": 0.013916015625, "learning_rate": 9.968132568514978e-07, "loss": 0.0006, "reward": 1.0137124061584473, "reward_std": 0.5149969458580017, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.75, "rewards/iou_glue_reward": 0.13871240615844727, "step": 10 }, { "completion_length": 303.875, "epoch": 0.003505417463352454, "grad_norm": 19.550809860229492, "kl": 0.007476806640625, "learning_rate": 9.964945825366476e-07, "loss": 0.0003, "reward": 0.9201228618621826, "reward_std": 0.4545382857322693, "rewards/format_reward_tg": 0.671875, "rewards/iou_timestamp_reward": 0.24824786186218262, "rewards/pad": 0.0, "step": 11 }, { "completion_length": 305.109375, "epoch": 0.0038240917782026767, "grad_norm": 10.959343910217285, "kl": 0.009521484375, "learning_rate": 9.961759082217972e-07, "loss": 0.0004, "reward": 1.1065305471420288, "reward_std": 0.4581749141216278, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 0.796875, "rewards/tracking_iou_reward": 0.21590548753738403, "step": 12 }, { "completion_length": 254.71875, "epoch": 0.0041427660930529, "grad_norm": 61.00483322143555, "kl": 0.010986328125, "learning_rate": 9.95857233906947e-07, "loss": 0.0004, "reward": 1.1909832954406738, "reward_std": 0.3574945330619812, "rewards/format_reward_tg": 0.890625, "rewards/iou_timestamp_reward": 0.1909833550453186, "rewards/pad": 0.109375, "step": 13 }, { "completion_length": 304.65625, "epoch": 0.004461440407903123, "grad_norm": 6.863766670227051, "kl": 0.00927734375, "learning_rate": 9.955385595920968e-07, "loss": 0.0004, "reward": 1.004595398902893, "reward_std": 0.46202772855758667, "rewards/format_reward_tg": 0.78125, "rewards/iou_timestamp_reward": 0.2233453392982483, "rewards/pad": 0.0, "step": 14 }, { "completion_length": 274.21875, "epoch": 0.004780114722753346, "grad_norm": 11.535690307617188, "kl": 0.01177978515625, "learning_rate": 9.952198852772466e-07, "loss": 0.0005, "reward": 1.1242345571517944, "reward_std": 0.3550039231777191, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.828125, "rewards/iou_glue_reward": 0.17110954225063324, "step": 15 }, { "completion_length": 201.15625, "epoch": 0.005098789037603569, "grad_norm": 18.40079689025879, "kl": 0.02392578125, "learning_rate": 9.949012109623964e-07, "loss": 0.001, "reward": 1.2128632068634033, "reward_std": 0.3046284019947052, "rewards/format_reward_tg": 0.9375, "rewards/iou_timestamp_reward": 0.24411317706108093, "rewards/pad": 0.03125, "step": 16 }, { "completion_length": 322.9375, "epoch": 0.005417463352453792, "grad_norm": 23.227741241455078, "kl": 0.01165771484375, "learning_rate": 9.945825366475462e-07, "loss": 0.0005, "reward": 1.2675498723983765, "reward_std": 0.35713836550712585, "rewards/format_reward_tg": 0.875, "rewards/iou_timestamp_reward": 0.23629984259605408, "rewards/pad": 0.15625, "step": 17 }, { "completion_length": 351.0625, "epoch": 0.0057361376673040155, "grad_norm": 6.902748107910156, "kl": 0.0101318359375, "learning_rate": 9.94263862332696e-07, "loss": 0.0004, "reward": 1.127271056175232, "reward_std": 0.4339676797389984, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 0.828125, "rewards/tracking_iou_reward": 0.2053961157798767, "step": 18 }, { "completion_length": 298.671875, "epoch": 0.006054811982154238, "grad_norm": 8.274727821350098, "kl": 0.0150146484375, "learning_rate": 9.939451880178459e-07, "loss": 0.0006, "reward": 1.0681308507919312, "reward_std": 0.4059593677520752, "rewards/format_reward_tg": 0.828125, "rewards/iou_timestamp_reward": 0.24000586569309235, "rewards/pad": 0.0, "step": 19 }, { "completion_length": 316.703125, "epoch": 0.006373486297004461, "grad_norm": 5.176186561584473, "kl": 0.01361083984375, "learning_rate": 9.936265137029955e-07, "loss": 0.0005, "reward": 1.147798776626587, "reward_std": 0.20939724147319794, "rewards/format_reward_tg": 0.9375, "rewards/iou_timestamp_reward": 0.21029871702194214, "rewards/pad": 0.0, "step": 20 }, { "completion_length": 332.609375, "epoch": 0.006692160611854685, "grad_norm": 15.555620193481445, "kl": 0.0089111328125, "learning_rate": 9.933078393881453e-07, "loss": 0.0004, "reward": 1.0623363256454468, "reward_std": 0.3388970196247101, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.859375, "rewards/tracking_iou_reward": 0.20296134054660797, "step": 21 }, { "completion_length": 338.875, "epoch": 0.007010834926704908, "grad_norm": 5.417019367218018, "kl": 0.01373291015625, "learning_rate": 9.92989165073295e-07, "loss": 0.0006, "reward": 1.1889852285385132, "reward_std": 0.3602343797683716, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.890625, "rewards/tracking_iou_reward": 0.2983602285385132, "step": 22 }, { "completion_length": 301.140625, "epoch": 0.007329509241555131, "grad_norm": 8.908356666564941, "kl": 0.01507568359375, "learning_rate": 9.926704907584449e-07, "loss": 0.0006, "reward": 1.141218662261963, "reward_std": 0.31373298168182373, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.90625, "rewards/tracking_iou_reward": 0.2349686622619629, "step": 23 }, { "completion_length": 257.84375, "epoch": 0.0076481835564053535, "grad_norm": 21.931612014770508, "kl": 0.0223388671875, "learning_rate": 9.923518164435945e-07, "loss": 0.0009, "reward": 1.3234606981277466, "reward_std": 0.31268778443336487, "rewards/format_reward_tg": 0.890625, "rewards/iou_timestamp_reward": 0.1984606385231018, "rewards/pad": 0.234375, "step": 24 }, { "completion_length": 321.734375, "epoch": 0.007966857871255577, "grad_norm": 13.89256763458252, "kl": 0.033447265625, "learning_rate": 9.920331421287443e-07, "loss": 0.0013, "reward": 1.2630127668380737, "reward_std": 0.3075515627861023, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.9375, "rewards/tracking_iou_reward": 0.3255128264427185, "step": 25 }, { "completion_length": 219.9375, "epoch": 0.0082855321861058, "grad_norm": 10.740429878234863, "kl": 0.0274658203125, "learning_rate": 9.91714467813894e-07, "loss": 0.0011, "reward": 1.4887242317199707, "reward_std": 0.22904357314109802, "rewards/pad": 0.203125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.2855991721153259, "step": 26 }, { "completion_length": 276.5625, "epoch": 0.008604206500956023, "grad_norm": 15.421710968017578, "kl": 0.050048828125, "learning_rate": 9.91395793499044e-07, "loss": 0.002, "reward": 1.297201156616211, "reward_std": 0.23605427145957947, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.31282609701156616, "step": 27 }, { "completion_length": 171.46875, "epoch": 0.008922880815806247, "grad_norm": 7.487827777862549, "kl": 0.046630859375, "learning_rate": 9.910771191841937e-07, "loss": 0.0019, "reward": 1.2176066637039185, "reward_std": 0.13908687233924866, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.21760669350624084, "step": 28 }, { "completion_length": 198.078125, "epoch": 0.009241555130656469, "grad_norm": 17.258358001708984, "kl": 0.055419921875, "learning_rate": 9.907584448693435e-07, "loss": 0.0022, "reward": 1.3561272621154785, "reward_std": 0.18805494904518127, "rewards/answer_reward": 0.140625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.21550217270851135, "step": 29 }, { "completion_length": 191.953125, "epoch": 0.009560229445506692, "grad_norm": 14.358026504516602, "kl": 0.035400390625, "learning_rate": 9.904397705544933e-07, "loss": 0.0014, "reward": 1.3451197147369385, "reward_std": 0.16682513058185577, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.34511977434158325, "rewards/pad": 0.0, "step": 30 }, { "completion_length": 217.296875, "epoch": 0.009878903760356916, "grad_norm": 8.59325885772705, "kl": 0.033447265625, "learning_rate": 9.90121096239643e-07, "loss": 0.0013, "reward": 1.4028737545013428, "reward_std": 0.43754905462265015, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.953125, "rewards/tracking_iou_reward": 0.3247487545013428, "step": 31 }, { "completion_length": 262.6875, "epoch": 0.010197578075207138, "grad_norm": 13.470588684082031, "kl": 0.0269775390625, "learning_rate": 9.898024219247927e-07, "loss": 0.0011, "reward": 1.4790947437286377, "reward_std": 0.31719809770584106, "rewards/answer_reward": 0.265625, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.24471977353096008, "step": 32 }, { "completion_length": 195.5625, "epoch": 0.010516252390057362, "grad_norm": 33.77853775024414, "kl": 0.0390625, "learning_rate": 9.894837476099425e-07, "loss": 0.0016, "reward": 1.470423698425293, "reward_std": 0.25363314151763916, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3610486388206482, "step": 33 }, { "completion_length": 260.3125, "epoch": 0.010834926704907584, "grad_norm": 5.149546146392822, "kl": 0.022705078125, "learning_rate": 9.891650732950923e-07, "loss": 0.0009, "reward": 1.2387065887451172, "reward_std": 0.1923772543668747, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.2699566185474396, "step": 34 }, { "completion_length": 124.09375, "epoch": 0.011153601019757807, "grad_norm": 10.509054183959961, "kl": 0.058837890625, "learning_rate": 9.888463989802421e-07, "loss": 0.0024, "reward": 1.262293815612793, "reward_std": 0.13928033411502838, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.2622936964035034, "rewards/pad": 0.0, "step": 35 }, { "completion_length": 175.25, "epoch": 0.011472275334608031, "grad_norm": 40.40633010864258, "kl": 0.031494140625, "learning_rate": 9.88527724665392e-07, "loss": 0.0013, "reward": 1.3443725109100342, "reward_std": 0.22584381699562073, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.23499749600887299, "step": 36 }, { "completion_length": 210.0625, "epoch": 0.011790949649458253, "grad_norm": 7.93485689163208, "kl": 0.04736328125, "learning_rate": 9.882090503505418e-07, "loss": 0.0019, "reward": 1.2812135219573975, "reward_std": 0.16277143359184265, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.18746356666088104, "rewards/pad": 0.09375, "step": 37 }, { "completion_length": 232.78125, "epoch": 0.012109623964308477, "grad_norm": 9.572932243347168, "kl": 0.031982421875, "learning_rate": 9.878903760356916e-07, "loss": 0.0013, "reward": 1.2721017599105835, "reward_std": 0.2769339978694916, "rewards/answer_reward": 0.046875, "rewards/format_reward_gqa": 0.921875, "rewards/iou_glue_reward": 0.30335181951522827, "step": 38 }, { "completion_length": 159.296875, "epoch": 0.0124282982791587, "grad_norm": 15.72573471069336, "kl": 0.04833984375, "learning_rate": 9.875717017208412e-07, "loss": 0.0019, "reward": 1.2688236236572266, "reward_std": 0.18624919652938843, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.28444868326187134, "step": 39 }, { "completion_length": 201.390625, "epoch": 0.012746972594008922, "grad_norm": 18.504762649536133, "kl": 0.05224609375, "learning_rate": 9.87253027405991e-07, "loss": 0.0021, "reward": 1.2598755359649658, "reward_std": 0.3119271397590637, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.953125, "rewards/tracking_iou_reward": 0.3067505955696106, "step": 40 }, { "completion_length": 165.9375, "epoch": 0.013065646908859146, "grad_norm": 16.481534957885742, "kl": 0.080078125, "learning_rate": 9.869343530911408e-07, "loss": 0.0032, "reward": 1.279308795928955, "reward_std": 0.2580498158931732, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.21680885553359985, "rewards/pad": 0.0625, "step": 41 }, { "completion_length": 187.375, "epoch": 0.01338432122370937, "grad_norm": 18.130592346191406, "kl": 0.033935546875, "learning_rate": 9.866156787762906e-07, "loss": 0.0014, "reward": 1.4841196537017822, "reward_std": 0.2194441258907318, "rewards/pad": 0.1875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.2966196537017822, "step": 42 }, { "completion_length": 208.734375, "epoch": 0.013702995538559592, "grad_norm": 11.812344551086426, "kl": 0.033203125, "learning_rate": 9.862970044614404e-07, "loss": 0.0013, "reward": 1.5125854015350342, "reward_std": 0.38584810495376587, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 0.953125, "rewards/iou_glue_reward": 0.32508549094200134, "step": 43 }, { "completion_length": 283.6875, "epoch": 0.014021669853409816, "grad_norm": 15.581579208374023, "kl": 0.026123046875, "learning_rate": 9.859783301465902e-07, "loss": 0.001, "reward": 1.4057618379592896, "reward_std": 0.24079158902168274, "rewards/format_reward_tg": 0.953125, "rewards/iou_timestamp_reward": 0.32763683795928955, "rewards/pad": 0.125, "step": 44 }, { "completion_length": 190.4375, "epoch": 0.014340344168260038, "grad_norm": 15.17668342590332, "kl": 0.049560546875, "learning_rate": 9.8565965583174e-07, "loss": 0.002, "reward": 1.2811412811279297, "reward_std": 0.24153098464012146, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.31239134073257446, "rewards/pad": 0.0, "step": 45 }, { "completion_length": 209.984375, "epoch": 0.014659018483110261, "grad_norm": 9.36685848236084, "kl": 0.034423828125, "learning_rate": 9.853409815168898e-07, "loss": 0.0014, "reward": 1.501969575881958, "reward_std": 0.2501377463340759, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.345719575881958, "rewards/pad": 0.15625, "step": 46 }, { "completion_length": 256.015625, "epoch": 0.014977692797960485, "grad_norm": 9.444185256958008, "kl": 0.042236328125, "learning_rate": 9.850223072020394e-07, "loss": 0.0017, "reward": 1.464492917060852, "reward_std": 0.209243506193161, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.48011791706085205, "rewards/pad": 0.0, "step": 47 }, { "completion_length": 198.59375, "epoch": 0.015296367112810707, "grad_norm": 15.230579376220703, "kl": 0.048828125, "learning_rate": 9.847036328871892e-07, "loss": 0.002, "reward": 1.2543593645095825, "reward_std": 0.22551663219928741, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.26998430490493774, "rewards/pad": 0.0, "step": 48 }, { "completion_length": 251.25, "epoch": 0.01561504142766093, "grad_norm": 7.851691246032715, "kl": 0.043212890625, "learning_rate": 9.84384958572339e-07, "loss": 0.0017, "reward": 1.33894944190979, "reward_std": 0.13577714562416077, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3389494717121124, "step": 49 }, { "completion_length": 184.046875, "epoch": 0.015933715742511154, "grad_norm": 20.711200714111328, "kl": 0.05224609375, "learning_rate": 9.840662842574888e-07, "loss": 0.0021, "reward": 1.4345719814300537, "reward_std": 0.21074189245700836, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.45019692182540894, "rewards/pad": 0.0, "step": 50 }, { "completion_length": 171.546875, "epoch": 0.016252390057361378, "grad_norm": 10.879066467285156, "kl": 0.05322265625, "learning_rate": 9.837476099426386e-07, "loss": 0.0021, "reward": 1.4225932359695435, "reward_std": 0.27576547861099243, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.34446826577186584, "rewards/pad": 0.078125, "step": 51 }, { "completion_length": 335.015625, "epoch": 0.0165710643722116, "grad_norm": 9.482775688171387, "kl": 0.0224609375, "learning_rate": 9.834289356277884e-07, "loss": 0.0009, "reward": 1.5082097053527832, "reward_std": 0.1784358024597168, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.39883482456207275, "rewards/pad": 0.109375, "step": 52 }, { "completion_length": 250.703125, "epoch": 0.016889738687061822, "grad_norm": 14.3960599899292, "kl": 0.02978515625, "learning_rate": 9.831102613129382e-07, "loss": 0.0012, "reward": 1.435816764831543, "reward_std": 0.2369682788848877, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.953125, "rewards/tracking_iou_reward": 0.35769182443618774, "step": 53 }, { "completion_length": 253.4375, "epoch": 0.017208413001912046, "grad_norm": 7.352794647216797, "kl": 0.039794921875, "learning_rate": 9.82791586998088e-07, "loss": 0.0016, "reward": 1.255972146987915, "reward_std": 0.13122376799583435, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.25597214698791504, "rewards/pad": 0.0, "step": 54 }, { "completion_length": 226.0625, "epoch": 0.01752708731676227, "grad_norm": 33.61541748046875, "kl": 0.044677734375, "learning_rate": 9.824729126832376e-07, "loss": 0.0018, "reward": 1.3116388320922852, "reward_std": 0.314866840839386, "rewards/format_reward_tg": 0.9375, "rewards/iou_timestamp_reward": 0.3741387128829956, "rewards/pad": 0.0, "step": 55 }, { "completion_length": 179.171875, "epoch": 0.017845761631612493, "grad_norm": 12.783524513244629, "kl": 0.0517578125, "learning_rate": 9.821542383683875e-07, "loss": 0.0021, "reward": 1.3135948181152344, "reward_std": 0.18926963210105896, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.31359487771987915, "rewards/pad": 0.0, "step": 56 }, { "completion_length": 312.171875, "epoch": 0.018164435946462717, "grad_norm": 19.495323181152344, "kl": 0.0235595703125, "learning_rate": 9.818355640535373e-07, "loss": 0.0009, "reward": 1.2919933795928955, "reward_std": 0.20228049159049988, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.2294934093952179, "step": 57 }, { "completion_length": 246.0625, "epoch": 0.018483110261312937, "grad_norm": 5.100099086761475, "kl": 0.0302734375, "learning_rate": 9.81516889738687e-07, "loss": 0.0012, "reward": 1.3808673620224, "reward_std": 0.21135839819908142, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.2714923918247223, "step": 58 }, { "completion_length": 253.453125, "epoch": 0.01880178457616316, "grad_norm": 26.97967529296875, "kl": 0.034912109375, "learning_rate": 9.811982154238367e-07, "loss": 0.0014, "reward": 1.2082840204238892, "reward_std": 0.1777166873216629, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.23953397572040558, "step": 59 }, { "completion_length": 298.75, "epoch": 0.019120458891013385, "grad_norm": 9.286375999450684, "kl": 0.0274658203125, "learning_rate": 9.808795411089865e-07, "loss": 0.0011, "reward": 1.4248732328414917, "reward_std": 0.2562997341156006, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.953125, "rewards/tracking_iou_reward": 0.3623732328414917, "step": 60 }, { "completion_length": 325.6875, "epoch": 0.019439133205863608, "grad_norm": 11.21821403503418, "kl": 0.022216796875, "learning_rate": 9.805608667941363e-07, "loss": 0.0009, "reward": 1.3899298906326294, "reward_std": 0.22722920775413513, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4211798906326294, "step": 61 }, { "completion_length": 231.484375, "epoch": 0.019757807520713832, "grad_norm": 14.974238395690918, "kl": 0.0439453125, "learning_rate": 9.80242192479286e-07, "loss": 0.0018, "reward": 1.3300275802612305, "reward_std": 0.19355839490890503, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.34565258026123047, "rewards/pad": 0.0, "step": 62 }, { "completion_length": 194.859375, "epoch": 0.020076481835564052, "grad_norm": 6.293397426605225, "kl": 0.040771484375, "learning_rate": 9.799235181644359e-07, "loss": 0.0016, "reward": 1.4031833410263062, "reward_std": 0.15579763054847717, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.2781832814216614, "rewards/pad": 0.125, "step": 63 }, { "completion_length": 247.71875, "epoch": 0.020395156150414276, "grad_norm": 8.145806312561035, "kl": 0.038330078125, "learning_rate": 9.796048438495857e-07, "loss": 0.0015, "reward": 1.2597522735595703, "reward_std": 0.3148002624511719, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 0.9375, "rewards/tracking_iou_reward": 0.2597523331642151, "step": 64 }, { "completion_length": 251.5, "epoch": 0.0207138304652645, "grad_norm": 17.193103790283203, "kl": 0.033447265625, "learning_rate": 9.792861695347355e-07, "loss": 0.0013, "reward": 1.4773969650268555, "reward_std": 0.2409696727991104, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.39927199482917786, "rewards/pad": 0.09375, "step": 65 }, { "completion_length": 244.671875, "epoch": 0.021032504780114723, "grad_norm": 11.186094284057617, "kl": 0.033447265625, "learning_rate": 9.78967495219885e-07, "loss": 0.0013, "reward": 1.234787940979004, "reward_std": 0.3220973610877991, "rewards/format_reward_tg": 0.9375, "rewards/iou_timestamp_reward": 0.2972880005836487, "rewards/pad": 0.0, "step": 66 }, { "completion_length": 203.84375, "epoch": 0.021351179094964947, "grad_norm": 10.883271217346191, "kl": 0.039306640625, "learning_rate": 9.78648820905035e-07, "loss": 0.0016, "reward": 1.2655227184295654, "reward_std": 0.1531958132982254, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.23427259922027588, "step": 67 }, { "completion_length": 205.1875, "epoch": 0.021669853409815167, "grad_norm": 20.127857208251953, "kl": 0.03515625, "learning_rate": 9.783301465901847e-07, "loss": 0.0014, "reward": 1.4338656663894653, "reward_std": 0.22612062096595764, "rewards/pad": 0.1875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.2463657557964325, "step": 68 }, { "completion_length": 220.03125, "epoch": 0.02198852772466539, "grad_norm": 8.743712425231934, "kl": 0.029541015625, "learning_rate": 9.780114722753345e-07, "loss": 0.0012, "reward": 1.3033971786499023, "reward_std": 0.15178921818733215, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3033972382545471, "rewards/pad": 0.0, "step": 69 }, { "completion_length": 324.953125, "epoch": 0.022307202039515615, "grad_norm": 8.447614669799805, "kl": 0.022705078125, "learning_rate": 9.776927979604843e-07, "loss": 0.0009, "reward": 1.5264579057693481, "reward_std": 0.17748026549816132, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.43270793557167053, "step": 70 }, { "completion_length": 253.234375, "epoch": 0.02262587635436584, "grad_norm": 7.076296806335449, "kl": 0.0277099609375, "learning_rate": 9.773741236456341e-07, "loss": 0.0011, "reward": 1.4195590019226074, "reward_std": 0.14488717913627625, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4195590913295746, "rewards/pad": 0.0, "step": 71 }, { "completion_length": 271.90625, "epoch": 0.022944550669216062, "grad_norm": 10.374483108520508, "kl": 0.0228271484375, "learning_rate": 9.77055449330784e-07, "loss": 0.0009, "reward": 1.619044542312622, "reward_std": 0.2657039761543274, "rewards/pad": 0.21875, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4315445125102997, "step": 72 }, { "completion_length": 266.125, "epoch": 0.023263224984066286, "grad_norm": 11.071622848510742, "kl": 0.033447265625, "learning_rate": 9.767367750159337e-07, "loss": 0.0013, "reward": 1.3494877815246582, "reward_std": 0.179287850856781, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3651127815246582, "step": 73 }, { "completion_length": 186.65625, "epoch": 0.023581899298916506, "grad_norm": 22.304418563842773, "kl": 0.0284423828125, "learning_rate": 9.764181007010833e-07, "loss": 0.0012, "reward": 1.5244197845458984, "reward_std": 0.2890230417251587, "rewards/pad": 0.15625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3837948441505432, "step": 74 }, { "completion_length": 332.875, "epoch": 0.02390057361376673, "grad_norm": 5.554767608642578, "kl": 0.0257568359375, "learning_rate": 9.760994263862331e-07, "loss": 0.001, "reward": 1.4590568542480469, "reward_std": 0.12253490090370178, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4746818542480469, "step": 75 }, { "completion_length": 278.09375, "epoch": 0.024219247928616953, "grad_norm": 20.972787857055664, "kl": 0.033447265625, "learning_rate": 9.75780752071383e-07, "loss": 0.0013, "reward": 1.4379208087921143, "reward_std": 0.16233232617378235, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.34417077898979187, "step": 76 }, { "completion_length": 209.0625, "epoch": 0.024537922243467177, "grad_norm": 6.096325874328613, "kl": 0.0361328125, "learning_rate": 9.754620777565328e-07, "loss": 0.0014, "reward": 1.504381537437439, "reward_std": 0.27014750242233276, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.45750656723976135, "step": 77 }, { "completion_length": 182.0, "epoch": 0.0248565965583174, "grad_norm": 6.677753925323486, "kl": 0.031494140625, "learning_rate": 9.751434034416826e-07, "loss": 0.0013, "reward": 1.5277019739151, "reward_std": 0.22165103256702423, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3714520037174225, "rewards/pad": 0.171875, "step": 78 }, { "completion_length": 94.0625, "epoch": 0.02517527087316762, "grad_norm": 18.051889419555664, "kl": 0.0546875, "learning_rate": 9.748247291268324e-07, "loss": 0.0022, "reward": 1.4485448598861694, "reward_std": 0.2159993201494217, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.33916980028152466, "rewards/pad": 0.125, "step": 79 }, { "completion_length": 123.875, "epoch": 0.025493945188017845, "grad_norm": 12.451839447021484, "kl": 0.0537109375, "learning_rate": 9.745060548119822e-07, "loss": 0.0022, "reward": 1.2630850076675415, "reward_std": 0.2797267436981201, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.24746005237102509, "rewards/pad": 0.03125, "step": 80 }, { "completion_length": 96.53125, "epoch": 0.02581261950286807, "grad_norm": 20.299448013305664, "kl": 0.05810546875, "learning_rate": 9.74187380497132e-07, "loss": 0.0023, "reward": 1.3956706523895264, "reward_std": 0.24089336395263672, "rewards/answer_reward": 0.015625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.38004571199417114, "step": 81 }, { "completion_length": 150.8125, "epoch": 0.026131293817718292, "grad_norm": 18.910051345825195, "kl": 0.060302734375, "learning_rate": 9.738687061822816e-07, "loss": 0.0024, "reward": 1.4209949970245361, "reward_std": 0.16225898265838623, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42099499702453613, "rewards/pad": 0.0, "step": 82 }, { "completion_length": 236.71875, "epoch": 0.026449968132568516, "grad_norm": 11.744294166564941, "kl": 0.046142578125, "learning_rate": 9.735500318674314e-07, "loss": 0.0018, "reward": 1.4210379123687744, "reward_std": 0.2842930555343628, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.3741629421710968, "rewards/pad": 0.078125, "step": 83 }, { "completion_length": 171.515625, "epoch": 0.02676864244741874, "grad_norm": 13.975997924804688, "kl": 0.057373046875, "learning_rate": 9.732313575525812e-07, "loss": 0.0023, "reward": 1.4923350811004639, "reward_std": 0.20501980185508728, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47671014070510864, "step": 84 }, { "completion_length": 221.796875, "epoch": 0.02708731676226896, "grad_norm": 7.706657409667969, "kl": 0.04052734375, "learning_rate": 9.72912683237731e-07, "loss": 0.0016, "reward": 1.380906581878662, "reward_std": 0.25143831968307495, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.41215670108795166, "rewards/pad": 0.0, "step": 85 }, { "completion_length": 176.734375, "epoch": 0.027405991077119184, "grad_norm": 11.26560115814209, "kl": 0.064453125, "learning_rate": 9.725940089228808e-07, "loss": 0.0026, "reward": 1.2714415788650513, "reward_std": 0.19051113724708557, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.30269163846969604, "rewards/pad": 0.0, "step": 86 }, { "completion_length": 140.953125, "epoch": 0.027724665391969407, "grad_norm": 65.41498565673828, "kl": 0.0712890625, "learning_rate": 9.722753346080306e-07, "loss": 0.0028, "reward": 1.315810203552246, "reward_std": 0.17996680736541748, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3158102035522461, "step": 87 }, { "completion_length": 134.84375, "epoch": 0.02804333970681963, "grad_norm": 7.701048374176025, "kl": 0.0712890625, "learning_rate": 9.719566602931804e-07, "loss": 0.0029, "reward": 1.4724924564361572, "reward_std": 0.18042278289794922, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.36311742663383484, "rewards/pad": 0.109375, "step": 88 }, { "completion_length": 149.078125, "epoch": 0.028362014021669855, "grad_norm": 9.855350494384766, "kl": 0.06640625, "learning_rate": 9.716379859783302e-07, "loss": 0.0027, "reward": 1.4240100383758545, "reward_std": 0.15711775422096252, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4240100681781769, "step": 89 }, { "completion_length": 114.953125, "epoch": 0.028680688336520075, "grad_norm": 7.680419921875, "kl": 0.07373046875, "learning_rate": 9.713193116634798e-07, "loss": 0.0029, "reward": 1.4028632640838623, "reward_std": 0.20188677310943604, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.40286314487457275, "step": 90 }, { "completion_length": 194.625, "epoch": 0.0289993626513703, "grad_norm": 5.96244478225708, "kl": 0.0537109375, "learning_rate": 9.710006373486296e-07, "loss": 0.0021, "reward": 1.4750306606292725, "reward_std": 0.16800019145011902, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47503069043159485, "rewards/pad": 0.0, "step": 91 }, { "completion_length": 172.03125, "epoch": 0.029318036966220522, "grad_norm": 7.921225070953369, "kl": 0.056640625, "learning_rate": 9.706819630337794e-07, "loss": 0.0023, "reward": 1.550010323524475, "reward_std": 0.11083010584115982, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3000103235244751, "rewards/pad": 0.25, "step": 92 }, { "completion_length": 240.9375, "epoch": 0.029636711281070746, "grad_norm": 9.648941993713379, "kl": 0.04443359375, "learning_rate": 9.703632887189293e-07, "loss": 0.0018, "reward": 1.5524835586547852, "reward_std": 0.12360557913780212, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42748352885246277, "rewards/pad": 0.125, "step": 93 }, { "completion_length": 162.25, "epoch": 0.02995538559592097, "grad_norm": 20.81108283996582, "kl": 0.0751953125, "learning_rate": 9.70044614404079e-07, "loss": 0.003, "reward": 1.5567106008529663, "reward_std": 0.10027439892292023, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4317106306552887, "step": 94 }, { "completion_length": 155.59375, "epoch": 0.030274059910771194, "grad_norm": 6.367391109466553, "kl": 0.06396484375, "learning_rate": 9.697259400892289e-07, "loss": 0.0026, "reward": 1.3457229137420654, "reward_std": 0.13679799437522888, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3457227945327759, "rewards/pad": 0.0, "step": 95 }, { "completion_length": 100.40625, "epoch": 0.030592734225621414, "grad_norm": 19.097196578979492, "kl": 0.0732421875, "learning_rate": 9.694072657743787e-07, "loss": 0.0029, "reward": 1.475441575050354, "reward_std": 0.1915864497423172, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3504416048526764, "rewards/pad": 0.125, "step": 96 }, { "completion_length": 213.34375, "epoch": 0.030911408540471638, "grad_norm": 9.829632759094238, "kl": 0.047119140625, "learning_rate": 9.690885914595283e-07, "loss": 0.0019, "reward": 1.6064780950546265, "reward_std": 0.13289913535118103, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.37210309505462646, "step": 97 }, { "completion_length": 107.15625, "epoch": 0.03123008285532186, "grad_norm": 13.913726806640625, "kl": 0.10009765625, "learning_rate": 9.68769917144678e-07, "loss": 0.004, "reward": 1.4501903057098389, "reward_std": 0.1813579499721527, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45019030570983887, "rewards/pad": 0.0, "step": 98 }, { "completion_length": 246.109375, "epoch": 0.03154875717017208, "grad_norm": 5.717926025390625, "kl": 0.03564453125, "learning_rate": 9.684512428298279e-07, "loss": 0.0014, "reward": 1.3290218114852905, "reward_std": 0.15001249313354492, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.2977718412876129, "rewards/pad": 0.03125, "step": 99 }, { "completion_length": 127.84375, "epoch": 0.03186743148502231, "grad_norm": 14.6702241897583, "kl": 0.12451171875, "learning_rate": 9.681325685149777e-07, "loss": 0.005, "reward": 1.3388731479644775, "reward_std": 0.18304340541362762, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.33887311816215515, "rewards/pad": 0.0, "step": 100 }, { "completion_length": 171.640625, "epoch": 0.03218610579987253, "grad_norm": 12.464656829833984, "kl": 0.06494140625, "learning_rate": 9.678138942001273e-07, "loss": 0.0025, "reward": 1.3632150888442993, "reward_std": 0.1353316754102707, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3788400888442993, "rewards/pad": 0.0, "step": 101 }, { "completion_length": 178.390625, "epoch": 0.032504780114722756, "grad_norm": 12.3067045211792, "kl": 0.058349609375, "learning_rate": 9.67495219885277e-07, "loss": 0.0023, "reward": 1.5209242105484009, "reward_std": 0.21918198466300964, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44279927015304565, "rewards/pad": 0.078125, "step": 102 }, { "completion_length": 198.75, "epoch": 0.032823454429572976, "grad_norm": 17.653640747070312, "kl": 0.054931640625, "learning_rate": 9.67176545570427e-07, "loss": 0.0021, "reward": 1.4329454898834229, "reward_std": 0.15643572807312012, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3704456090927124, "step": 103 }, { "completion_length": 173.765625, "epoch": 0.0331421287444232, "grad_norm": 18.334651947021484, "kl": 0.06201171875, "learning_rate": 9.668578712555767e-07, "loss": 0.0025, "reward": 1.3778859376907349, "reward_std": 0.1982203871011734, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.31538593769073486, "step": 104 }, { "completion_length": 199.53125, "epoch": 0.033460803059273424, "grad_norm": 18.57367706298828, "kl": 0.05517578125, "learning_rate": 9.665391969407265e-07, "loss": 0.0022, "reward": 1.6662395000457764, "reward_std": 0.13840028643608093, "rewards/answer_reward": 0.3125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.35373955965042114, "step": 105 }, { "completion_length": 232.703125, "epoch": 0.033779477374123644, "grad_norm": 7.207043170928955, "kl": 0.0458984375, "learning_rate": 9.662205226258763e-07, "loss": 0.0018, "reward": 1.347933292388916, "reward_std": 0.09887917339801788, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3635583817958832, "step": 106 }, { "completion_length": 162.578125, "epoch": 0.03409815168897387, "grad_norm": 9.840560913085938, "kl": 0.056640625, "learning_rate": 9.659018483110261e-07, "loss": 0.0023, "reward": 1.53261137008667, "reward_std": 0.16933324933052063, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.40761134028434753, "rewards/pad": 0.125, "step": 107 }, { "completion_length": 233.71875, "epoch": 0.03441682600382409, "grad_norm": 4.3503031730651855, "kl": 0.0400390625, "learning_rate": 9.65583173996176e-07, "loss": 0.0016, "reward": 1.4072836637496948, "reward_std": 0.04489172250032425, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4072836637496948, "rewards/pad": 0.0, "step": 108 }, { "completion_length": 154.234375, "epoch": 0.03473550031867431, "grad_norm": 11.090832710266113, "kl": 0.056396484375, "learning_rate": 9.652644996813255e-07, "loss": 0.0023, "reward": 1.6570996046066284, "reward_std": 0.21542125940322876, "rewards/answer_reward": 0.28125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.39147457480430603, "step": 109 }, { "completion_length": 108.40625, "epoch": 0.03505417463352454, "grad_norm": 20.70996856689453, "kl": 0.07666015625, "learning_rate": 9.649458253664753e-07, "loss": 0.0031, "reward": 1.3417737483978271, "reward_std": 0.11198395490646362, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3417738378047943, "rewards/pad": 0.0, "step": 110 }, { "completion_length": 166.359375, "epoch": 0.03537284894837476, "grad_norm": 6.13435697555542, "kl": 0.06494140625, "learning_rate": 9.646271510516251e-07, "loss": 0.0026, "reward": 1.451405644416809, "reward_std": 0.09623386710882187, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4514055550098419, "step": 111 }, { "completion_length": 134.859375, "epoch": 0.035691523263224986, "grad_norm": 15.755718231201172, "kl": 0.0634765625, "learning_rate": 9.64308476736775e-07, "loss": 0.0025, "reward": 1.7639336585998535, "reward_std": 0.20902717113494873, "rewards/answer_reward": 0.328125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4358087182044983, "step": 112 }, { "completion_length": 215.359375, "epoch": 0.036010197578075206, "grad_norm": 13.079955101013184, "kl": 0.05615234375, "learning_rate": 9.639898024219248e-07, "loss": 0.0022, "reward": 1.4580156803131104, "reward_std": 0.1539626121520996, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.36426568031311035, "step": 113 }, { "completion_length": 174.140625, "epoch": 0.036328871892925434, "grad_norm": 8.691699028015137, "kl": 0.087890625, "learning_rate": 9.636711281070746e-07, "loss": 0.0035, "reward": 1.3323192596435547, "reward_std": 0.12468598783016205, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.34794431924819946, "rewards/pad": 0.0, "step": 114 }, { "completion_length": 190.140625, "epoch": 0.036647546207775654, "grad_norm": 7.315731048583984, "kl": 0.052734375, "learning_rate": 9.633524537922244e-07, "loss": 0.0021, "reward": 1.51028311252594, "reward_std": 0.11671674251556396, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.33840811252593994, "step": 115 }, { "completion_length": 177.40625, "epoch": 0.036966220522625874, "grad_norm": 10.40539836883545, "kl": 0.04931640625, "learning_rate": 9.630337794773742e-07, "loss": 0.002, "reward": 1.7965991497039795, "reward_std": 0.2596476078033447, "rewards/pad": 0.375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4372241795063019, "step": 116 }, { "completion_length": 197.4375, "epoch": 0.0372848948374761, "grad_norm": 17.551198959350586, "kl": 0.05615234375, "learning_rate": 9.627151051625238e-07, "loss": 0.0022, "reward": 1.487061619758606, "reward_std": 0.12751880288124084, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48706167936325073, "rewards/pad": 0.0, "step": 117 }, { "completion_length": 212.609375, "epoch": 0.03760356915232632, "grad_norm": 25.8797550201416, "kl": 0.06103515625, "learning_rate": 9.623964308476736e-07, "loss": 0.0024, "reward": 1.4586904048919678, "reward_std": 0.18688003718852997, "rewards/pad": 0.046875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4118153750896454, "step": 118 }, { "completion_length": 198.171875, "epoch": 0.03792224346717655, "grad_norm": 11.605658531188965, "kl": 0.0478515625, "learning_rate": 9.620777565328234e-07, "loss": 0.0019, "reward": 1.5306899547576904, "reward_std": 0.24100443720817566, "rewards/answer_reward": 0.1875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3431899845600128, "step": 119 }, { "completion_length": 209.46875, "epoch": 0.03824091778202677, "grad_norm": 3.406437397003174, "kl": 0.054443359375, "learning_rate": 9.617590822179732e-07, "loss": 0.0022, "reward": 1.437645435333252, "reward_std": 0.07607047259807587, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43764549493789673, "step": 120 }, { "completion_length": 162.171875, "epoch": 0.03855959209687699, "grad_norm": 23.933696746826172, "kl": 0.06396484375, "learning_rate": 9.61440407903123e-07, "loss": 0.0025, "reward": 1.439674735069275, "reward_std": 0.18641090393066406, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3146747350692749, "rewards/pad": 0.125, "step": 121 }, { "completion_length": 185.8125, "epoch": 0.038878266411727216, "grad_norm": 9.487775802612305, "kl": 0.056640625, "learning_rate": 9.611217335882728e-07, "loss": 0.0023, "reward": 1.4100788831710815, "reward_std": 0.11771468818187714, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.39445382356643677, "step": 122 }, { "completion_length": 236.921875, "epoch": 0.03919694072657744, "grad_norm": 7.253052711486816, "kl": 0.1279296875, "learning_rate": 9.608030592734226e-07, "loss": 0.0051, "reward": 1.4906271696090698, "reward_std": 0.1307528018951416, "rewards/answer_reward": 0.171875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3187522292137146, "step": 123 }, { "completion_length": 199.546875, "epoch": 0.039515615041427664, "grad_norm": 8.534823417663574, "kl": 0.064453125, "learning_rate": 9.604843849585724e-07, "loss": 0.0026, "reward": 1.3232932090759277, "reward_std": 0.17143693566322327, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.1982932984828949, "step": 124 }, { "completion_length": 101.796875, "epoch": 0.039834289356277884, "grad_norm": 7.8380818367004395, "kl": 0.08349609375, "learning_rate": 9.60165710643722e-07, "loss": 0.0033, "reward": 1.4448331594467163, "reward_std": 0.1456798017024994, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4448332190513611, "rewards/pad": 0.0, "step": 125 }, { "completion_length": 77.078125, "epoch": 0.040152963671128104, "grad_norm": 13.454174995422363, "kl": 0.10400390625, "learning_rate": 9.598470363288718e-07, "loss": 0.0042, "reward": 1.3266003131866455, "reward_std": 0.16800163686275482, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.32660022377967834, "rewards/pad": 0.0, "step": 126 }, { "completion_length": 185.6875, "epoch": 0.04047163798597833, "grad_norm": 8.160616874694824, "kl": 0.055419921875, "learning_rate": 9.595283620140216e-07, "loss": 0.0022, "reward": 1.7484498023986816, "reward_std": 0.16031304001808167, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4046997129917145, "rewards/pad": 0.34375, "step": 127 }, { "completion_length": 215.515625, "epoch": 0.04079031230082855, "grad_norm": 9.371366500854492, "kl": 0.05419921875, "learning_rate": 9.592096876991714e-07, "loss": 0.0022, "reward": 1.3254632949829102, "reward_std": 0.09307268261909485, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.32546335458755493, "rewards/pad": 0.0, "step": 128 }, { "completion_length": 222.796875, "epoch": 0.04110898661567878, "grad_norm": 16.473413467407227, "kl": 0.0625, "learning_rate": 9.588910133843212e-07, "loss": 0.0025, "reward": 1.3656400442123413, "reward_std": 0.1587304174900055, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.38126498460769653, "rewards/pad": 0.0, "step": 129 }, { "completion_length": 270.875, "epoch": 0.041427660930529, "grad_norm": 5.4612555503845215, "kl": 0.05419921875, "learning_rate": 9.58572339069471e-07, "loss": 0.0022, "reward": 1.3114277124404907, "reward_std": 0.06440472602844238, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3114277124404907, "step": 130 }, { "completion_length": 238.84375, "epoch": 0.04174633524537922, "grad_norm": 6.282101631164551, "kl": 0.04638671875, "learning_rate": 9.582536647546209e-07, "loss": 0.0019, "reward": 1.4994099140167236, "reward_std": 0.10063318908214569, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3744097948074341, "rewards/pad": 0.125, "step": 131 }, { "completion_length": 163.625, "epoch": 0.04206500956022945, "grad_norm": 8.125163078308105, "kl": 0.12109375, "learning_rate": 9.579349904397705e-07, "loss": 0.0049, "reward": 1.4386279582977295, "reward_std": 0.14473479986190796, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3917529284954071, "rewards/pad": 0.0625, "step": 132 }, { "completion_length": 231.890625, "epoch": 0.04238368387507967, "grad_norm": 8.46726131439209, "kl": 0.07275390625, "learning_rate": 9.576163161249203e-07, "loss": 0.0029, "reward": 1.523742914199829, "reward_std": 0.11945880949497223, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4143679141998291, "step": 133 }, { "completion_length": 268.34375, "epoch": 0.042702358189929894, "grad_norm": 24.495853424072266, "kl": 0.057861328125, "learning_rate": 9.5729764181007e-07, "loss": 0.0023, "reward": 1.3764296770095825, "reward_std": 0.12560050189495087, "rewards/answer_reward": 0.078125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3139297366142273, "step": 134 }, { "completion_length": 241.609375, "epoch": 0.043021032504780114, "grad_norm": 5.319091796875, "kl": 0.0498046875, "learning_rate": 9.569789674952199e-07, "loss": 0.002, "reward": 1.4918053150177002, "reward_std": 0.13128961622714996, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5074301958084106, "step": 135 }, { "completion_length": 135.6875, "epoch": 0.043339706819630335, "grad_norm": 13.785394668579102, "kl": 0.07421875, "learning_rate": 9.566602931803697e-07, "loss": 0.003, "reward": 1.3160868883132935, "reward_std": 0.19807332754135132, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3160868287086487, "step": 136 }, { "completion_length": 272.875, "epoch": 0.04365838113448056, "grad_norm": 5.691545486450195, "kl": 0.045654296875, "learning_rate": 9.563416188655195e-07, "loss": 0.0018, "reward": 1.3066635131835938, "reward_std": 0.11025525629520416, "rewards/answer_reward": 0.015625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.29103854298591614, "step": 137 }, { "completion_length": 169.25, "epoch": 0.04397705544933078, "grad_norm": 10.946203231811523, "kl": 0.06982421875, "learning_rate": 9.56022944550669e-07, "loss": 0.0028, "reward": 1.563328504562378, "reward_std": 0.21476612985134125, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45395350456237793, "rewards/pad": 0.109375, "step": 138 }, { "completion_length": 242.734375, "epoch": 0.04429572976418101, "grad_norm": 7.429897785186768, "kl": 0.047119140625, "learning_rate": 9.557042702358189e-07, "loss": 0.0019, "reward": 1.4747035503387451, "reward_std": 0.16535669565200806, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3028285801410675, "step": 139 }, { "completion_length": 281.90625, "epoch": 0.04461440407903123, "grad_norm": 8.921205520629883, "kl": 0.033447265625, "learning_rate": 9.553855959209687e-07, "loss": 0.0013, "reward": 1.7500858306884766, "reward_std": 0.13492590188980103, "rewards/pad": 0.328125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.42196089029312134, "step": 140 }, { "completion_length": 233.671875, "epoch": 0.044933078393881457, "grad_norm": 6.8118672370910645, "kl": 0.043212890625, "learning_rate": 9.550669216061185e-07, "loss": 0.0017, "reward": 1.7276577949523926, "reward_std": 0.1283380091190338, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4932827949523926, "step": 141 }, { "completion_length": 339.484375, "epoch": 0.04525175270873168, "grad_norm": 3.8644649982452393, "kl": 0.03125, "learning_rate": 9.547482472912683e-07, "loss": 0.0013, "reward": 1.7625560760498047, "reward_std": 0.17026743292808533, "rewards/pad": 0.4375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3250562250614166, "step": 142 }, { "completion_length": 324.59375, "epoch": 0.0455704270235819, "grad_norm": 3.7466437816619873, "kl": 0.037841796875, "learning_rate": 9.544295729764181e-07, "loss": 0.0015, "reward": 1.369144320487976, "reward_std": 0.06762483716011047, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3691442906856537, "step": 143 }, { "completion_length": 323.609375, "epoch": 0.045889101338432124, "grad_norm": 13.69135570526123, "kl": 0.040771484375, "learning_rate": 9.541108986615677e-07, "loss": 0.0016, "reward": 1.3979535102844238, "reward_std": 0.06589115411043167, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.39795345067977905, "rewards/pad": 0.0, "step": 144 }, { "completion_length": 299.984375, "epoch": 0.046207775653282344, "grad_norm": 13.790980339050293, "kl": 0.0615234375, "learning_rate": 9.537922243467175e-07, "loss": 0.0025, "reward": 1.449575662612915, "reward_std": 0.1409299224615097, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4495756924152374, "step": 145 }, { "completion_length": 224.8125, "epoch": 0.04652644996813257, "grad_norm": 17.20113182067871, "kl": 0.06640625, "learning_rate": 9.534735500318673e-07, "loss": 0.0027, "reward": 1.3060486316680908, "reward_std": 0.2386862337589264, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.2904236912727356, "rewards/pad": 0.03125, "step": 146 }, { "completion_length": 295.859375, "epoch": 0.04684512428298279, "grad_norm": 5.707564830780029, "kl": 0.05517578125, "learning_rate": 9.531548757170171e-07, "loss": 0.0022, "reward": 1.4157187938690186, "reward_std": 0.19549745321273804, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.33759376406669617, "step": 147 }, { "completion_length": 196.71875, "epoch": 0.04716379859783301, "grad_norm": 7.082728385925293, "kl": 0.107421875, "learning_rate": 9.528362014021669e-07, "loss": 0.0043, "reward": 1.6351996660232544, "reward_std": 0.2388363629579544, "rewards/answer_reward": 0.203125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.447699636220932, "step": 148 }, { "completion_length": 209.5, "epoch": 0.04748247291268324, "grad_norm": 4.3276190757751465, "kl": 0.0537109375, "learning_rate": 9.525175270873167e-07, "loss": 0.0022, "reward": 1.6083552837371826, "reward_std": 0.1997946947813034, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5302302837371826, "step": 149 }, { "completion_length": 224.359375, "epoch": 0.04780114722753346, "grad_norm": 21.417600631713867, "kl": 0.058349609375, "learning_rate": 9.521988527724664e-07, "loss": 0.0023, "reward": 1.4707787036895752, "reward_std": 0.13133534789085388, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47077876329421997, "step": 150 }, { "completion_length": 234.34375, "epoch": 0.04811982154238369, "grad_norm": 7.798461437225342, "kl": 0.0732421875, "learning_rate": 9.518801784576163e-07, "loss": 0.0029, "reward": 1.2438874244689941, "reward_std": 0.1509024202823639, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.24388736486434937, "rewards/pad": 0.0, "step": 151 }, { "completion_length": 228.71875, "epoch": 0.04843849585723391, "grad_norm": 7.1418585777282715, "kl": 0.06591796875, "learning_rate": 9.515615041427661e-07, "loss": 0.0026, "reward": 1.4649324417114258, "reward_std": 0.1798781454563141, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.433682382106781, "rewards/pad": 0.03125, "step": 152 }, { "completion_length": 276.234375, "epoch": 0.04875717017208413, "grad_norm": 7.342140197753906, "kl": 0.05224609375, "learning_rate": 9.512428298279159e-07, "loss": 0.0021, "reward": 1.5144888162612915, "reward_std": 0.1600225269794464, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4207388162612915, "step": 153 }, { "completion_length": 353.9375, "epoch": 0.049075844486934354, "grad_norm": 6.5262908935546875, "kl": 0.02587890625, "learning_rate": 9.509241555130656e-07, "loss": 0.0011, "reward": 1.4947094917297363, "reward_std": 0.11990465223789215, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5103343725204468, "step": 154 }, { "completion_length": 215.359375, "epoch": 0.049394518801784575, "grad_norm": 8.246253967285156, "kl": 0.060546875, "learning_rate": 9.506054811982154e-07, "loss": 0.0024, "reward": 1.4783300161361694, "reward_std": 0.16495484113693237, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.41582998633384705, "rewards/pad": 0.0625, "step": 155 }, { "completion_length": 364.953125, "epoch": 0.0497131931166348, "grad_norm": 4.4291911125183105, "kl": 0.049560546875, "learning_rate": 9.502868068833652e-07, "loss": 0.002, "reward": 1.445652723312378, "reward_std": 0.15399569272994995, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.46127766370773315, "step": 156 }, { "completion_length": 210.84375, "epoch": 0.05003186743148502, "grad_norm": 13.04565715789795, "kl": 0.049560546875, "learning_rate": 9.499681325685149e-07, "loss": 0.002, "reward": 1.5248757600784302, "reward_std": 0.16148591041564941, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4155007600784302, "rewards/pad": 0.109375, "step": 157 }, { "completion_length": 202.609375, "epoch": 0.05035054174633524, "grad_norm": 12.038368225097656, "kl": 0.05908203125, "learning_rate": 9.496494582536647e-07, "loss": 0.0023, "reward": 1.5142139196395874, "reward_std": 0.22789838910102844, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4517139196395874, "rewards/pad": 0.078125, "step": 158 }, { "completion_length": 173.71875, "epoch": 0.05066921606118547, "grad_norm": 25.960351943969727, "kl": 0.0751953125, "learning_rate": 9.493307839388145e-07, "loss": 0.003, "reward": 1.402698040008545, "reward_std": 0.17803940176963806, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.35582298040390015, "rewards/pad": 0.046875, "step": 159 }, { "completion_length": 283.125, "epoch": 0.05098789037603569, "grad_norm": 7.958873748779297, "kl": 0.04345703125, "learning_rate": 9.490121096239643e-07, "loss": 0.0017, "reward": 1.4051318168640137, "reward_std": 0.04836621135473251, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.40513181686401367, "rewards/pad": 0.0, "step": 160 }, { "completion_length": 214.46875, "epoch": 0.05130656469088592, "grad_norm": 11.376971244812012, "kl": 0.0595703125, "learning_rate": 9.48693435309114e-07, "loss": 0.0024, "reward": 1.4891833066940308, "reward_std": 0.15629437565803528, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.36418330669403076, "step": 161 }, { "completion_length": 200.84375, "epoch": 0.05162523900573614, "grad_norm": 9.97684383392334, "kl": 0.058837890625, "learning_rate": 9.483747609942638e-07, "loss": 0.0024, "reward": 1.499054193496704, "reward_std": 0.16176065802574158, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4990541636943817, "step": 162 }, { "completion_length": 206.84375, "epoch": 0.05194391332058636, "grad_norm": 14.745891571044922, "kl": 0.06591796875, "learning_rate": 9.480560866794136e-07, "loss": 0.0026, "reward": 1.8148226737976074, "reward_std": 0.13128326833248138, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5804476141929626, "step": 163 }, { "completion_length": 247.546875, "epoch": 0.052262587635436585, "grad_norm": 10.21143913269043, "kl": 0.05322265625, "learning_rate": 9.477374123645634e-07, "loss": 0.0021, "reward": 1.4356274604797363, "reward_std": 0.15665191411972046, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4512525200843811, "rewards/pad": 0.0, "step": 164 }, { "completion_length": 320.6875, "epoch": 0.052581261950286805, "grad_norm": 3.693413734436035, "kl": 0.037109375, "learning_rate": 9.474187380497131e-07, "loss": 0.0015, "reward": 1.470198392868042, "reward_std": 0.10511145740747452, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47019830346107483, "rewards/pad": 0.0, "step": 165 }, { "completion_length": 234.46875, "epoch": 0.05289993626513703, "grad_norm": 6.604076385498047, "kl": 0.0517578125, "learning_rate": 9.471000637348629e-07, "loss": 0.0021, "reward": 1.4114763736724854, "reward_std": 0.1726670116186142, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.41147640347480774, "step": 166 }, { "completion_length": 371.0625, "epoch": 0.05321861057998725, "grad_norm": 12.597403526306152, "kl": 0.042724609375, "learning_rate": 9.467813894200127e-07, "loss": 0.0017, "reward": 1.3728115558624268, "reward_std": 0.06814874708652496, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.372811496257782, "step": 167 }, { "completion_length": 206.71875, "epoch": 0.05353728489483748, "grad_norm": 14.53585433959961, "kl": 0.07275390625, "learning_rate": 9.464627151051625e-07, "loss": 0.0029, "reward": 1.4245121479034424, "reward_std": 0.2063012272119522, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3620120584964752, "step": 168 }, { "completion_length": 120.09375, "epoch": 0.0538559592096877, "grad_norm": 11.440759658813477, "kl": 0.09619140625, "learning_rate": 9.461440407903123e-07, "loss": 0.0039, "reward": 1.455107569694519, "reward_std": 0.20997771620750427, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.33010753989219666, "rewards/pad": 0.125, "step": 169 }, { "completion_length": 200.703125, "epoch": 0.05417463352453792, "grad_norm": 6.415365219116211, "kl": 0.11328125, "learning_rate": 9.458253664754621e-07, "loss": 0.0045, "reward": 1.4902559518814087, "reward_std": 0.18468467891216278, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42775604128837585, "rewards/pad": 0.0625, "step": 170 }, { "completion_length": 165.421875, "epoch": 0.05449330783938815, "grad_norm": 39.82363510131836, "kl": 0.08837890625, "learning_rate": 9.455066921606119e-07, "loss": 0.0035, "reward": 1.3934909105300903, "reward_std": 0.11051173508167267, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3934909701347351, "step": 171 }, { "completion_length": 229.671875, "epoch": 0.05481198215423837, "grad_norm": 12.203930854797363, "kl": 0.056884765625, "learning_rate": 9.451880178457617e-07, "loss": 0.0023, "reward": 1.4572014808654785, "reward_std": 0.10274035483598709, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4572015404701233, "rewards/pad": 0.0, "step": 172 }, { "completion_length": 304.453125, "epoch": 0.055130656469088594, "grad_norm": 4.776219844818115, "kl": 0.0517578125, "learning_rate": 9.448693435309114e-07, "loss": 0.0021, "reward": 1.4793224334716797, "reward_std": 0.12388372421264648, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4949474334716797, "rewards/pad": 0.0, "step": 173 }, { "completion_length": 170.953125, "epoch": 0.055449330783938815, "grad_norm": 7.443392753601074, "kl": 0.061767578125, "learning_rate": 9.445506692160612e-07, "loss": 0.0025, "reward": 1.614640474319458, "reward_std": 0.30754411220550537, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.48964059352874756, "rewards/pad": 0.140625, "step": 174 }, { "completion_length": 127.765625, "epoch": 0.055768005098789035, "grad_norm": 15.550397872924805, "kl": 0.095703125, "learning_rate": 9.44231994901211e-07, "loss": 0.0038, "reward": 1.4903450012207031, "reward_std": 0.2514011859893799, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.39659497141838074, "rewards/pad": 0.109375, "step": 175 }, { "completion_length": 259.046875, "epoch": 0.05608667941363926, "grad_norm": 56.89799880981445, "kl": 0.06103515625, "learning_rate": 9.439133205863608e-07, "loss": 0.0024, "reward": 1.428381085395813, "reward_std": 0.10250850021839142, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.428381085395813, "step": 176 }, { "completion_length": 236.390625, "epoch": 0.05640535372848948, "grad_norm": 11.407561302185059, "kl": 0.049560546875, "learning_rate": 9.435946462715104e-07, "loss": 0.002, "reward": 1.6906650066375732, "reward_std": 0.22784824669361115, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5187899470329285, "rewards/pad": 0.203125, "step": 177 }, { "completion_length": 327.375, "epoch": 0.05672402804333971, "grad_norm": 7.302003383636475, "kl": 0.037841796875, "learning_rate": 9.432759719566602e-07, "loss": 0.0015, "reward": 1.3769761323928833, "reward_std": 0.01789081282913685, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3769761621952057, "step": 178 }, { "completion_length": 197.0625, "epoch": 0.05704270235818993, "grad_norm": 8.595909118652344, "kl": 0.06982421875, "learning_rate": 9.4295729764181e-07, "loss": 0.0028, "reward": 1.488729476928711, "reward_std": 0.16121619939804077, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.48872941732406616, "rewards/pad": 0.015625, "step": 179 }, { "completion_length": 284.4375, "epoch": 0.05736137667304015, "grad_norm": 7.860208988189697, "kl": 0.053466796875, "learning_rate": 9.426386233269598e-07, "loss": 0.0022, "reward": 1.3685816526412964, "reward_std": 0.08686651289463043, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.368581622838974, "step": 180 }, { "completion_length": 257.359375, "epoch": 0.05768005098789038, "grad_norm": 6.262935638427734, "kl": 0.080078125, "learning_rate": 9.423199490121095e-07, "loss": 0.0032, "reward": 1.6427003145217896, "reward_std": 0.245242178440094, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5489503145217896, "rewards/pad": 0.125, "step": 181 }, { "completion_length": 129.53125, "epoch": 0.0579987253027406, "grad_norm": 11.011406898498535, "kl": 0.06640625, "learning_rate": 9.420012746972593e-07, "loss": 0.0027, "reward": 1.786719560623169, "reward_std": 0.15677408874034882, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.536719560623169, "rewards/pad": 0.25, "step": 182 }, { "completion_length": 234.40625, "epoch": 0.058317399617590825, "grad_norm": 10.98038101196289, "kl": 0.05810546875, "learning_rate": 9.416826003824091e-07, "loss": 0.0023, "reward": 1.3131130933761597, "reward_std": 0.21581920981407166, "rewards/format_reward_tg": 0.9375, "rewards/iou_timestamp_reward": 0.32873809337615967, "rewards/pad": 0.046875, "step": 183 }, { "completion_length": 256.359375, "epoch": 0.058636073932441045, "grad_norm": 6.828302383422852, "kl": 0.0625, "learning_rate": 9.413639260675588e-07, "loss": 0.0025, "reward": 1.2020618915557861, "reward_std": 0.07125719636678696, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.20206193625926971, "step": 184 }, { "completion_length": 223.703125, "epoch": 0.058954748247291265, "grad_norm": 8.792315483093262, "kl": 0.0654296875, "learning_rate": 9.410452517527086e-07, "loss": 0.0026, "reward": 1.465730905532837, "reward_std": 0.17245060205459595, "rewards/answer_reward": 0.03125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4344809055328369, "step": 185 }, { "completion_length": 191.0625, "epoch": 0.05927342256214149, "grad_norm": 90.65461730957031, "kl": 0.061279296875, "learning_rate": 9.407265774378584e-07, "loss": 0.0025, "reward": 1.5472661256790161, "reward_std": 0.1555110514163971, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.31289106607437134, "rewards/pad": 0.25, "step": 186 }, { "completion_length": 266.484375, "epoch": 0.05959209687699171, "grad_norm": 8.738775253295898, "kl": 0.053955078125, "learning_rate": 9.404079031230082e-07, "loss": 0.0022, "reward": 1.319126009941101, "reward_std": 0.08598145842552185, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.31912603974342346, "rewards/pad": 0.0, "step": 187 }, { "completion_length": 210.984375, "epoch": 0.05991077119184194, "grad_norm": 14.283634185791016, "kl": 0.06201171875, "learning_rate": 9.40089228808158e-07, "loss": 0.0025, "reward": 1.5603177547454834, "reward_std": 0.16099268198013306, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5759426951408386, "rewards/pad": 0.0, "step": 188 }, { "completion_length": 322.046875, "epoch": 0.06022944550669216, "grad_norm": 4.540855884552002, "kl": 0.05810546875, "learning_rate": 9.397705544933078e-07, "loss": 0.0023, "reward": 1.466914176940918, "reward_std": 0.07637807726860046, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46691423654556274, "rewards/pad": 0.0, "step": 189 }, { "completion_length": 223.703125, "epoch": 0.06054811982154239, "grad_norm": 8.010270118713379, "kl": 0.06396484375, "learning_rate": 9.394518801784576e-07, "loss": 0.0026, "reward": 1.5183391571044922, "reward_std": 0.19354218244552612, "rewards/pad": 0.046875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4870891571044922, "step": 190 }, { "completion_length": 286.796875, "epoch": 0.06086679413639261, "grad_norm": 8.864216804504395, "kl": 0.041748046875, "learning_rate": 9.391332058636074e-07, "loss": 0.0017, "reward": 1.661377191543579, "reward_std": 0.31189072132110596, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.4582522511482239, "step": 191 }, { "completion_length": 158.6875, "epoch": 0.06118546845124283, "grad_norm": 6.764616012573242, "kl": 0.07275390625, "learning_rate": 9.388145315487571e-07, "loss": 0.0029, "reward": 1.5173699855804443, "reward_std": 0.1659144163131714, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.28299516439437866, "step": 192 }, { "completion_length": 211.71875, "epoch": 0.061504142766093055, "grad_norm": 9.038619995117188, "kl": 0.0693359375, "learning_rate": 9.384958572339069e-07, "loss": 0.0028, "reward": 1.4801876544952393, "reward_std": 0.09755624830722809, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4801877737045288, "step": 193 }, { "completion_length": 150.46875, "epoch": 0.061822817080943275, "grad_norm": 67.90802001953125, "kl": 0.09228515625, "learning_rate": 9.381771829190567e-07, "loss": 0.0037, "reward": 1.4884817600250244, "reward_std": 0.12955227494239807, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.39473170042037964, "rewards/pad": 0.09375, "step": 194 }, { "completion_length": 247.203125, "epoch": 0.0621414913957935, "grad_norm": 6.690598011016846, "kl": 0.064453125, "learning_rate": 9.378585086042065e-07, "loss": 0.0026, "reward": 1.4969897270202637, "reward_std": 0.06976005434989929, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4969896674156189, "step": 195 }, { "completion_length": 181.4375, "epoch": 0.06246016571064372, "grad_norm": 8.010198593139648, "kl": 0.1103515625, "learning_rate": 9.375398342893562e-07, "loss": 0.0044, "reward": 1.564711570739746, "reward_std": 0.11871609091758728, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43971166014671326, "rewards/pad": 0.125, "step": 196 }, { "completion_length": 193.390625, "epoch": 0.06277884002549394, "grad_norm": 11.752547264099121, "kl": 0.07275390625, "learning_rate": 9.37221159974506e-07, "loss": 0.0029, "reward": 1.659630298614502, "reward_std": 0.13107186555862427, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4252552390098572, "rewards/pad": 0.234375, "step": 197 }, { "completion_length": 211.46875, "epoch": 0.06309751434034416, "grad_norm": 11.705774307250977, "kl": 0.050048828125, "learning_rate": 9.369024856596558e-07, "loss": 0.002, "reward": 1.81485915184021, "reward_std": 0.2242487072944641, "rewards/answer_reward": 0.265625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5492340922355652, "step": 198 }, { "completion_length": 264.84375, "epoch": 0.0634161886551944, "grad_norm": 11.895368576049805, "kl": 0.052490234375, "learning_rate": 9.365838113448056e-07, "loss": 0.0021, "reward": 1.466489315032959, "reward_std": 0.1925201416015625, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4196142852306366, "step": 199 }, { "completion_length": 224.015625, "epoch": 0.06373486297004462, "grad_norm": 8.283370971679688, "kl": 0.057861328125, "learning_rate": 9.362651370299553e-07, "loss": 0.0023, "reward": 1.607632040977478, "reward_std": 0.16389986872673035, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3732570707798004, "step": 200 }, { "completion_length": 329.734375, "epoch": 0.06405353728489484, "grad_norm": 3.239655017852783, "kl": 0.0306396484375, "learning_rate": 9.359464627151051e-07, "loss": 0.0012, "reward": 1.567150354385376, "reward_std": 0.06768325716257095, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.33277541399002075, "step": 201 }, { "completion_length": 195.8125, "epoch": 0.06437221159974506, "grad_norm": 24.049251556396484, "kl": 0.06982421875, "learning_rate": 9.356277884002549e-07, "loss": 0.0028, "reward": 1.3611292839050293, "reward_std": 0.13703155517578125, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3298793137073517, "rewards/pad": 0.03125, "step": 202 }, { "completion_length": 290.625, "epoch": 0.06469088591459528, "grad_norm": 9.575765609741211, "kl": 0.06298828125, "learning_rate": 9.353091140854047e-07, "loss": 0.0025, "reward": 1.4079358577728271, "reward_std": 0.08651718497276306, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4235607981681824, "rewards/pad": 0.0, "step": 203 }, { "completion_length": 230.625, "epoch": 0.06500956022944551, "grad_norm": 15.252603530883789, "kl": 0.05517578125, "learning_rate": 9.349904397705544e-07, "loss": 0.0022, "reward": 1.5752151012420654, "reward_std": 0.10122407972812653, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45021528005599976, "rewards/pad": 0.125, "step": 204 }, { "completion_length": 195.625, "epoch": 0.06532823454429573, "grad_norm": 6.833919525146484, "kl": 0.08056640625, "learning_rate": 9.346717654557042e-07, "loss": 0.0032, "reward": 1.4168448448181152, "reward_std": 0.1276351809501648, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.41684478521347046, "step": 205 }, { "completion_length": 152.125, "epoch": 0.06564690885914595, "grad_norm": 11.947805404663086, "kl": 0.0712890625, "learning_rate": 9.34353091140854e-07, "loss": 0.0028, "reward": 1.6025731563568115, "reward_std": 0.13310229778289795, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4619481861591339, "rewards/pad": 0.140625, "step": 206 }, { "completion_length": 152.140625, "epoch": 0.06596558317399617, "grad_norm": 13.497575759887695, "kl": 0.09033203125, "learning_rate": 9.340344168260039e-07, "loss": 0.0036, "reward": 1.3815317153930664, "reward_std": 0.12820778787136078, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.38153162598609924, "step": 207 }, { "completion_length": 206.203125, "epoch": 0.0662842574888464, "grad_norm": 6.831051349639893, "kl": 0.07470703125, "learning_rate": 9.337157425111536e-07, "loss": 0.003, "reward": 1.5932207107543945, "reward_std": 0.14225542545318604, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45259571075439453, "rewards/pad": 0.140625, "step": 208 }, { "completion_length": 335.296875, "epoch": 0.06660293180369663, "grad_norm": 6.019325256347656, "kl": 0.052978515625, "learning_rate": 9.333970681963034e-07, "loss": 0.0021, "reward": 1.528432846069336, "reward_std": 0.11231175065040588, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5440578460693359, "step": 209 }, { "completion_length": 253.203125, "epoch": 0.06692160611854685, "grad_norm": 7.904732704162598, "kl": 0.05517578125, "learning_rate": 9.330783938814532e-07, "loss": 0.0022, "reward": 1.5680043697357178, "reward_std": 0.10188771784305573, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5680044293403625, "rewards/pad": 0.0, "step": 210 }, { "completion_length": 202.203125, "epoch": 0.06724028043339707, "grad_norm": 6.209277153015137, "kl": 0.059326171875, "learning_rate": 9.32759719566603e-07, "loss": 0.0024, "reward": 1.5635247230529785, "reward_std": 0.1443396955728531, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5322748422622681, "rewards/pad": 0.03125, "step": 211 }, { "completion_length": 254.203125, "epoch": 0.06755895474824729, "grad_norm": 6.30163049697876, "kl": 0.048583984375, "learning_rate": 9.324410452517527e-07, "loss": 0.0019, "reward": 1.519893765449524, "reward_std": 0.23284552991390228, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4105188250541687, "rewards/pad": 0.125, "step": 212 }, { "completion_length": 271.28125, "epoch": 0.06787762906309751, "grad_norm": 15.06171989440918, "kl": 0.060791015625, "learning_rate": 9.321223709369025e-07, "loss": 0.0024, "reward": 1.4648241996765137, "reward_std": 0.07391152530908585, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4648240804672241, "rewards/pad": 0.0, "step": 213 }, { "completion_length": 197.5625, "epoch": 0.06819630337794774, "grad_norm": 17.046770095825195, "kl": 0.07080078125, "learning_rate": 9.318036966220523e-07, "loss": 0.0028, "reward": 1.5187010765075684, "reward_std": 0.1929798126220703, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.44057604670524597, "rewards/pad": 0.09375, "step": 214 }, { "completion_length": 287.140625, "epoch": 0.06851497769279796, "grad_norm": 4.82719612121582, "kl": 0.038330078125, "learning_rate": 9.314850223072021e-07, "loss": 0.0015, "reward": 1.410596251487732, "reward_std": 0.11355704069137573, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.42622125148773193, "step": 215 }, { "completion_length": 260.78125, "epoch": 0.06883365200764818, "grad_norm": 17.187711715698242, "kl": 0.052001953125, "learning_rate": 9.311663479923517e-07, "loss": 0.0021, "reward": 1.840174913406372, "reward_std": 0.11517071723937988, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49642491340637207, "rewards/pad": 0.34375, "step": 216 }, { "completion_length": 271.21875, "epoch": 0.0691523263224984, "grad_norm": 5.477590560913086, "kl": 0.052490234375, "learning_rate": 9.308476736775015e-07, "loss": 0.0021, "reward": 1.3956875801086426, "reward_std": 0.15309034287929535, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.31756263971328735, "rewards/pad": 0.09375, "step": 217 }, { "completion_length": 248.046875, "epoch": 0.06947100063734862, "grad_norm": 29.749284744262695, "kl": 0.05078125, "learning_rate": 9.305289993626513e-07, "loss": 0.002, "reward": 1.5329116582870483, "reward_std": 0.10812871158123016, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4235367476940155, "rewards/pad": 0.109375, "step": 218 }, { "completion_length": 320.375, "epoch": 0.06978967495219886, "grad_norm": 6.243754863739014, "kl": 0.0361328125, "learning_rate": 9.30210325047801e-07, "loss": 0.0014, "reward": 1.5504281520843506, "reward_std": 0.04979484900832176, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4254281222820282, "step": 219 }, { "completion_length": 232.671875, "epoch": 0.07010834926704908, "grad_norm": 12.06491756439209, "kl": 0.10205078125, "learning_rate": 9.298916507329508e-07, "loss": 0.0041, "reward": 1.6985371112823486, "reward_std": 0.19800494611263275, "rewards/answer_reward": 0.15625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5422872304916382, "step": 220 }, { "completion_length": 251.9375, "epoch": 0.0704270235818993, "grad_norm": 9.655678749084473, "kl": 0.052490234375, "learning_rate": 9.295729764181006e-07, "loss": 0.0021, "reward": 1.524951696395874, "reward_std": 0.1220092698931694, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43120163679122925, "step": 221 }, { "completion_length": 195.0, "epoch": 0.07074569789674952, "grad_norm": 11.081148147583008, "kl": 0.072265625, "learning_rate": 9.292543021032504e-07, "loss": 0.0029, "reward": 1.3704537153244019, "reward_std": 0.16427022218704224, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.33920374512672424, "step": 222 }, { "completion_length": 270.34375, "epoch": 0.07106437221159974, "grad_norm": 4.369564533233643, "kl": 0.044189453125, "learning_rate": 9.289356277884001e-07, "loss": 0.0018, "reward": 1.6683555841445923, "reward_std": 0.24814534187316895, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5121055245399475, "step": 223 }, { "completion_length": 317.8125, "epoch": 0.07138304652644997, "grad_norm": 10.045781135559082, "kl": 0.042236328125, "learning_rate": 9.286169534735499e-07, "loss": 0.0017, "reward": 1.2989885807037354, "reward_std": 0.11577681452035904, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.2677384614944458, "step": 224 }, { "completion_length": 211.5, "epoch": 0.07170172084130019, "grad_norm": 6.441803932189941, "kl": 0.06591796875, "learning_rate": 9.282982791586997e-07, "loss": 0.0026, "reward": 1.5342642068862915, "reward_std": 0.12945051491260529, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.534264087677002, "rewards/pad": 0.0, "step": 225 }, { "completion_length": 294.078125, "epoch": 0.07202039515615041, "grad_norm": 8.124902725219727, "kl": 0.0556640625, "learning_rate": 9.279796048438496e-07, "loss": 0.0022, "reward": 1.4743280410766602, "reward_std": 0.060752347111701965, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47432801127433777, "step": 226 }, { "completion_length": 259.953125, "epoch": 0.07233906947100063, "grad_norm": 22.559650421142578, "kl": 0.06005859375, "learning_rate": 9.276609305289993e-07, "loss": 0.0024, "reward": 1.4046584367752075, "reward_std": 0.16901271045207977, "rewards/pad": 0.15625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.2640334963798523, "step": 227 }, { "completion_length": 283.515625, "epoch": 0.07265774378585087, "grad_norm": 10.921895980834961, "kl": 0.056396484375, "learning_rate": 9.273422562141491e-07, "loss": 0.0023, "reward": 1.2822096347808838, "reward_std": 0.04250604659318924, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.2822096049785614, "rewards/pad": 0.0, "step": 228 }, { "completion_length": 191.0, "epoch": 0.07297641810070109, "grad_norm": 14.872382164001465, "kl": 0.0732421875, "learning_rate": 9.270235818992989e-07, "loss": 0.0029, "reward": 1.5225872993469238, "reward_std": 0.10570458322763443, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.522587239742279, "rewards/pad": 0.0, "step": 229 }, { "completion_length": 291.3125, "epoch": 0.07329509241555131, "grad_norm": 5.075259685516357, "kl": 0.04248046875, "learning_rate": 9.267049075844487e-07, "loss": 0.0017, "reward": 1.3570164442062378, "reward_std": 0.1311841905117035, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3726414740085602, "rewards/pad": 0.0, "step": 230 }, { "completion_length": 240.15625, "epoch": 0.07361376673040153, "grad_norm": 47.22294235229492, "kl": 0.0654296875, "learning_rate": 9.263862332695984e-07, "loss": 0.0026, "reward": 1.4281563758850098, "reward_std": 0.06662596762180328, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4281563460826874, "rewards/pad": 0.0, "step": 231 }, { "completion_length": 208.578125, "epoch": 0.07393244104525175, "grad_norm": 13.75542163848877, "kl": 0.06298828125, "learning_rate": 9.260675589547482e-07, "loss": 0.0025, "reward": 1.466019868850708, "reward_std": 0.1753520667552948, "rewards/answer_reward": 0.046875, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.43476995825767517, "step": 232 }, { "completion_length": 222.59375, "epoch": 0.07425111536010198, "grad_norm": 8.445175170898438, "kl": 0.07958984375, "learning_rate": 9.25748884639898e-07, "loss": 0.0032, "reward": 1.443455696105957, "reward_std": 0.21538135409355164, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.24033081531524658, "rewards/pad": 0.21875, "step": 233 }, { "completion_length": 106.125, "epoch": 0.0745697896749522, "grad_norm": 12.408512115478516, "kl": 0.10400390625, "learning_rate": 9.254302103250478e-07, "loss": 0.0042, "reward": 1.5548592805862427, "reward_std": 0.16731616854667664, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5548592209815979, "rewards/pad": 0.0, "step": 234 }, { "completion_length": 178.796875, "epoch": 0.07488846398980242, "grad_norm": 10.482146263122559, "kl": 0.06982421875, "learning_rate": 9.251115360101975e-07, "loss": 0.0028, "reward": 1.7268046140670776, "reward_std": 0.13564898073673248, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3830545246601105, "rewards/pad": 0.34375, "step": 235 }, { "completion_length": 187.765625, "epoch": 0.07520713830465264, "grad_norm": 8.685198783874512, "kl": 0.0673828125, "learning_rate": 9.247928616953473e-07, "loss": 0.0027, "reward": 1.4304745197296143, "reward_std": 0.10561563819646835, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43047448992729187, "rewards/pad": 0.0, "step": 236 }, { "completion_length": 216.78125, "epoch": 0.07552581261950286, "grad_norm": 8.801995277404785, "kl": 0.0703125, "learning_rate": 9.244741873804971e-07, "loss": 0.0028, "reward": 1.4420864582061768, "reward_std": 0.2987205684185028, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.39521145820617676, "step": 237 }, { "completion_length": 221.625, "epoch": 0.0758444869343531, "grad_norm": 9.968708038330078, "kl": 0.076171875, "learning_rate": 9.241555130656469e-07, "loss": 0.0031, "reward": 1.555429220199585, "reward_std": 0.21994394063949585, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.46167925000190735, "step": 238 }, { "completion_length": 197.375, "epoch": 0.07616316124920332, "grad_norm": 42.812137603759766, "kl": 0.07177734375, "learning_rate": 9.238368387507966e-07, "loss": 0.0029, "reward": 1.5328187942504883, "reward_std": 0.12525755167007446, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4234437346458435, "rewards/pad": 0.109375, "step": 239 }, { "completion_length": 289.734375, "epoch": 0.07648183556405354, "grad_norm": 7.226988792419434, "kl": 0.051025390625, "learning_rate": 9.235181644359464e-07, "loss": 0.002, "reward": 1.4006308317184448, "reward_std": 0.20406073331832886, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.32250580191612244, "step": 240 }, { "completion_length": 174.109375, "epoch": 0.07680050987890376, "grad_norm": 9.643024444580078, "kl": 0.0791015625, "learning_rate": 9.231994901210962e-07, "loss": 0.0032, "reward": 1.610644817352295, "reward_std": 0.10245459526777267, "rewards/pad": 0.140625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47001978754997253, "step": 241 }, { "completion_length": 216.890625, "epoch": 0.07711918419375398, "grad_norm": 10.71285343170166, "kl": 0.06298828125, "learning_rate": 9.22880815806246e-07, "loss": 0.0025, "reward": 1.7045223712921143, "reward_std": 0.17062969505786896, "rewards/answer_reward": 0.328125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3763972520828247, "step": 242 }, { "completion_length": 239.46875, "epoch": 0.07743785850860421, "grad_norm": 20.129676818847656, "kl": 0.06298828125, "learning_rate": 9.225621414913957e-07, "loss": 0.0025, "reward": 1.4195454120635986, "reward_std": 0.09258443117141724, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.41954541206359863, "step": 243 }, { "completion_length": 215.234375, "epoch": 0.07775653282345443, "grad_norm": 54.93705749511719, "kl": 0.06787109375, "learning_rate": 9.222434671765456e-07, "loss": 0.0027, "reward": 1.3729498386383057, "reward_std": 0.07894681394100189, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.37294989824295044, "rewards/pad": 0.0, "step": 244 }, { "completion_length": 248.703125, "epoch": 0.07807520713830465, "grad_norm": 4.213068962097168, "kl": 0.051025390625, "learning_rate": 9.219247928616954e-07, "loss": 0.002, "reward": 1.441326379776001, "reward_std": 0.11559078842401505, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.378826379776001, "rewards/pad": 0.0625, "step": 245 }, { "completion_length": 192.578125, "epoch": 0.07839388145315487, "grad_norm": 6.598721027374268, "kl": 0.06787109375, "learning_rate": 9.216061185468452e-07, "loss": 0.0027, "reward": 1.6929112672805786, "reward_std": 0.22981858253479004, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.44291120767593384, "rewards/pad": 0.265625, "step": 246 }, { "completion_length": 239.59375, "epoch": 0.0787125557680051, "grad_norm": 7.554116725921631, "kl": 0.07763671875, "learning_rate": 9.212874442319949e-07, "loss": 0.0031, "reward": 1.4620293378829956, "reward_std": 0.15765343606472015, "rewards/answer_reward": 0.046875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.41515427827835083, "step": 247 }, { "completion_length": 263.984375, "epoch": 0.07903123008285533, "grad_norm": 17.3502254486084, "kl": 0.056884765625, "learning_rate": 9.209687699171447e-07, "loss": 0.0023, "reward": 1.506328821182251, "reward_std": 0.10069683194160461, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.396953821182251, "step": 248 }, { "completion_length": 276.125, "epoch": 0.07934990439770555, "grad_norm": 5.626839637756348, "kl": 0.043212890625, "learning_rate": 9.206500956022945e-07, "loss": 0.0017, "reward": 1.58784019947052, "reward_std": 0.20608378946781158, "rewards/answer_reward": 0.171875, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.43159013986587524, "step": 249 }, { "completion_length": 222.15625, "epoch": 0.07966857871255577, "grad_norm": 7.744899749755859, "kl": 0.060791015625, "learning_rate": 9.203314212874442e-07, "loss": 0.0024, "reward": 1.5529812574386597, "reward_std": 0.1434810906648636, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.4592312276363373, "rewards/pad": 0.125, "step": 250 }, { "completion_length": 194.96875, "epoch": 0.07998725302740599, "grad_norm": 8.183343887329102, "kl": 0.06396484375, "learning_rate": 9.20012746972594e-07, "loss": 0.0026, "reward": 1.4947509765625, "reward_std": 0.1750367283821106, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3853759467601776, "rewards/pad": 0.109375, "step": 251 }, { "completion_length": 298.640625, "epoch": 0.08030592734225621, "grad_norm": 24.4941349029541, "kl": 0.045166015625, "learning_rate": 9.196940726577438e-07, "loss": 0.0018, "reward": 1.520071268081665, "reward_std": 0.11857470124959946, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.45757126808166504, "step": 252 }, { "completion_length": 192.84375, "epoch": 0.08062460165710644, "grad_norm": 9.189983367919922, "kl": 0.064453125, "learning_rate": 9.193753983428936e-07, "loss": 0.0026, "reward": 1.8057680130004883, "reward_std": 0.13417977094650269, "rewards/pad": 0.21875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5870180130004883, "step": 253 }, { "completion_length": 251.90625, "epoch": 0.08094327597195666, "grad_norm": 11.186515808105469, "kl": 0.052978515625, "learning_rate": 9.190567240280433e-07, "loss": 0.0021, "reward": 1.3663510084152222, "reward_std": 0.18374572694301605, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3507259488105774, "rewards/pad": 0.03125, "step": 254 }, { "completion_length": 204.75, "epoch": 0.08126195028680688, "grad_norm": 20.661352157592773, "kl": 0.0859375, "learning_rate": 9.18738049713193e-07, "loss": 0.0034, "reward": 1.3985034227371216, "reward_std": 0.0647159069776535, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3985033333301544, "rewards/pad": 0.0, "step": 255 }, { "completion_length": 184.46875, "epoch": 0.0815806246016571, "grad_norm": 9.674245834350586, "kl": 0.06396484375, "learning_rate": 9.184193753983428e-07, "loss": 0.0026, "reward": 1.5792434215545654, "reward_std": 0.21538616716861725, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.422993540763855, "rewards/pad": 0.15625, "step": 256 }, { "completion_length": 253.640625, "epoch": 0.08189929891650732, "grad_norm": 117.39447784423828, "kl": 0.068359375, "learning_rate": 9.181007010834926e-07, "loss": 0.0027, "reward": 1.5494431257247925, "reward_std": 0.10500036925077438, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5494431257247925, "step": 257 }, { "completion_length": 262.453125, "epoch": 0.08221797323135756, "grad_norm": 15.066034317016602, "kl": 0.04345703125, "learning_rate": 9.177820267686423e-07, "loss": 0.0017, "reward": 1.6858816146850586, "reward_std": 0.11864806711673737, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5765066146850586, "step": 258 }, { "completion_length": 214.140625, "epoch": 0.08253664754620778, "grad_norm": 14.4360933303833, "kl": 0.06689453125, "learning_rate": 9.174633524537921e-07, "loss": 0.0027, "reward": 1.4065463542938232, "reward_std": 0.13927623629570007, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.40654638409614563, "rewards/pad": 0.015625, "step": 259 }, { "completion_length": 136.421875, "epoch": 0.082855321861058, "grad_norm": 14.50709342956543, "kl": 0.0947265625, "learning_rate": 9.171446781389419e-07, "loss": 0.0038, "reward": 1.6555917263031006, "reward_std": 0.169988214969635, "rewards/answer_reward": 0.171875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4837167263031006, "step": 260 }, { "completion_length": 211.5625, "epoch": 0.08317399617590822, "grad_norm": 27.106956481933594, "kl": 0.055419921875, "learning_rate": 9.168260038240917e-07, "loss": 0.0022, "reward": 1.4652085304260254, "reward_std": 0.07502000033855438, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.34020841121673584, "rewards/pad": 0.125, "step": 261 }, { "completion_length": 207.0, "epoch": 0.08349267049075844, "grad_norm": 8.570914268493652, "kl": 0.05615234375, "learning_rate": 9.165073295092414e-07, "loss": 0.0022, "reward": 1.5496916770935059, "reward_std": 0.12746258080005646, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5496916770935059, "step": 262 }, { "completion_length": 290.109375, "epoch": 0.08381134480560867, "grad_norm": 10.006585121154785, "kl": 0.048583984375, "learning_rate": 9.161886551943912e-07, "loss": 0.0019, "reward": 1.3973890542984009, "reward_std": 0.13302114605903625, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4286390244960785, "step": 263 }, { "completion_length": 264.984375, "epoch": 0.0841300191204589, "grad_norm": 5.002647876739502, "kl": 0.0703125, "learning_rate": 9.158699808795411e-07, "loss": 0.0028, "reward": 1.3228042125701904, "reward_std": 0.1068376898765564, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.33842921257019043, "rewards/pad": 0.0, "step": 264 }, { "completion_length": 142.734375, "epoch": 0.08444869343530911, "grad_norm": 16.07257843017578, "kl": 0.0859375, "learning_rate": 9.155513065646909e-07, "loss": 0.0034, "reward": 1.4866881370544434, "reward_std": 0.2174568474292755, "rewards/answer_reward": 0.03125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4554380774497986, "step": 265 }, { "completion_length": 185.0, "epoch": 0.08476736775015933, "grad_norm": 14.378037452697754, "kl": 0.064453125, "learning_rate": 9.152326322498406e-07, "loss": 0.0026, "reward": 1.4063767194747925, "reward_std": 0.2589850127696991, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.37512677907943726, "rewards/pad": 0.0625, "step": 266 }, { "completion_length": 258.453125, "epoch": 0.08508604206500955, "grad_norm": 14.581949234008789, "kl": 0.04541015625, "learning_rate": 9.149139579349904e-07, "loss": 0.0018, "reward": 1.425885796546936, "reward_std": 0.09042633324861526, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.42588573694229126, "step": 267 }, { "completion_length": 190.171875, "epoch": 0.08540471637985979, "grad_norm": 8.301403999328613, "kl": 0.064453125, "learning_rate": 9.145952836201402e-07, "loss": 0.0026, "reward": 1.5484635829925537, "reward_std": 0.17337355017662048, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4390886723995209, "rewards/pad": 0.125, "step": 268 }, { "completion_length": 208.59375, "epoch": 0.08572339069471001, "grad_norm": 9.300251007080078, "kl": 0.0771484375, "learning_rate": 9.1427660930529e-07, "loss": 0.0031, "reward": 1.4223666191101074, "reward_std": 0.0689803957939148, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4223666787147522, "rewards/pad": 0.0, "step": 269 }, { "completion_length": 337.84375, "epoch": 0.08604206500956023, "grad_norm": 7.203383922576904, "kl": 0.037109375, "learning_rate": 9.139579349904397e-07, "loss": 0.0015, "reward": 1.3084739446640015, "reward_std": 0.07721705734729767, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.32409897446632385, "step": 270 }, { "completion_length": 222.78125, "epoch": 0.08636073932441045, "grad_norm": 17.609630584716797, "kl": 0.07763671875, "learning_rate": 9.136392606755895e-07, "loss": 0.0031, "reward": 1.4136435985565186, "reward_std": 0.11169826984405518, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.39801862835884094, "step": 271 }, { "completion_length": 185.484375, "epoch": 0.08667941363926067, "grad_norm": 12.315672874450684, "kl": 0.06396484375, "learning_rate": 9.133205863607393e-07, "loss": 0.0026, "reward": 1.6809338331222534, "reward_std": 0.2243930697441101, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4621838927268982, "step": 272 }, { "completion_length": 156.875, "epoch": 0.0869980879541109, "grad_norm": 11.391555786132812, "kl": 0.07666015625, "learning_rate": 9.130019120458891e-07, "loss": 0.0031, "reward": 1.4851157665252686, "reward_std": 0.11955823004245758, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48511582612991333, "rewards/pad": 0.0, "step": 273 }, { "completion_length": 142.203125, "epoch": 0.08731676226896112, "grad_norm": 11.662199020385742, "kl": 0.0849609375, "learning_rate": 9.126832377310388e-07, "loss": 0.0034, "reward": 1.606271505355835, "reward_std": 0.13789045810699463, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49689656496047974, "rewards/pad": 0.109375, "step": 274 }, { "completion_length": 214.796875, "epoch": 0.08763543658381134, "grad_norm": 8.317606925964355, "kl": 0.064453125, "learning_rate": 9.123645634161886e-07, "loss": 0.0026, "reward": 1.5963852405548096, "reward_std": 0.1202874481678009, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5026353001594543, "step": 275 }, { "completion_length": 215.0, "epoch": 0.08795411089866156, "grad_norm": 11.95240592956543, "kl": 0.06787109375, "learning_rate": 9.120458891013384e-07, "loss": 0.0027, "reward": 1.4960030317306519, "reward_std": 0.0837111845612526, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49600303173065186, "rewards/pad": 0.0, "step": 276 }, { "completion_length": 253.28125, "epoch": 0.08827278521351178, "grad_norm": 8.151103019714355, "kl": 0.06689453125, "learning_rate": 9.117272147864882e-07, "loss": 0.0027, "reward": 1.4131813049316406, "reward_std": 0.12128612399101257, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.42880627512931824, "rewards/pad": 0.0, "step": 277 }, { "completion_length": 102.109375, "epoch": 0.08859145952836202, "grad_norm": 14.23055362701416, "kl": 0.08056640625, "learning_rate": 9.114085404716379e-07, "loss": 0.0032, "reward": 1.7072069644927979, "reward_std": 0.04941394180059433, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5822069644927979, "rewards/pad": 0.125, "step": 278 }, { "completion_length": 257.078125, "epoch": 0.08891013384321224, "grad_norm": 50.22057342529297, "kl": 0.06005859375, "learning_rate": 9.110898661567877e-07, "loss": 0.0024, "reward": 1.3935086727142334, "reward_std": 0.1538911759853363, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3310086131095886, "step": 279 }, { "completion_length": 226.375, "epoch": 0.08922880815806246, "grad_norm": 6.466733455657959, "kl": 0.06396484375, "learning_rate": 9.107711918419375e-07, "loss": 0.0026, "reward": 1.5158886909484863, "reward_std": 0.0630214735865593, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5158886313438416, "step": 280 }, { "completion_length": 256.375, "epoch": 0.08954748247291268, "grad_norm": 13.729325294494629, "kl": 0.0693359375, "learning_rate": 9.104525175270872e-07, "loss": 0.0028, "reward": 1.4866538047790527, "reward_std": 0.0790114551782608, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.48665374517440796, "step": 281 }, { "completion_length": 87.53125, "epoch": 0.08986615678776291, "grad_norm": 12.788779258728027, "kl": 0.12255859375, "learning_rate": 9.10133843212237e-07, "loss": 0.0049, "reward": 1.6292393207550049, "reward_std": 0.18508005142211914, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5198642015457153, "rewards/pad": 0.109375, "step": 282 }, { "completion_length": 160.359375, "epoch": 0.09018483110261313, "grad_norm": 10.025726318359375, "kl": 0.06396484375, "learning_rate": 9.098151688973869e-07, "loss": 0.0026, "reward": 1.8845949172973633, "reward_std": 0.2100738286972046, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5720949172973633, "rewards/pad": 0.3125, "step": 283 }, { "completion_length": 234.46875, "epoch": 0.09050350541746335, "grad_norm": 16.763338088989258, "kl": 0.05419921875, "learning_rate": 9.094964945825367e-07, "loss": 0.0022, "reward": 1.6589397192001343, "reward_std": 0.19485995173454285, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4401897192001343, "rewards/pad": 0.21875, "step": 284 }, { "completion_length": 215.84375, "epoch": 0.09082217973231357, "grad_norm": 38.65030288696289, "kl": 0.055908203125, "learning_rate": 9.091778202676864e-07, "loss": 0.0022, "reward": 1.6431481838226318, "reward_std": 0.07203955203294754, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5181481838226318, "rewards/pad": 0.125, "step": 285 }, { "completion_length": 224.546875, "epoch": 0.0911408540471638, "grad_norm": 6.317727088928223, "kl": 0.052978515625, "learning_rate": 9.088591459528362e-07, "loss": 0.0021, "reward": 1.4984824657440186, "reward_std": 0.13541144132614136, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.498482346534729, "step": 286 }, { "completion_length": 241.5, "epoch": 0.09145952836201403, "grad_norm": 11.916752815246582, "kl": 0.059814453125, "learning_rate": 9.08540471637986e-07, "loss": 0.0024, "reward": 1.5799572467803955, "reward_std": 0.12866558134555817, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4862072765827179, "step": 287 }, { "completion_length": 322.78125, "epoch": 0.09177820267686425, "grad_norm": 13.54063892364502, "kl": 0.0576171875, "learning_rate": 9.082217973231358e-07, "loss": 0.0023, "reward": 1.5151567459106445, "reward_std": 0.10878373682498932, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5151568651199341, "step": 288 }, { "completion_length": 147.203125, "epoch": 0.09209687699171447, "grad_norm": 11.628636360168457, "kl": 0.08544921875, "learning_rate": 9.079031230082855e-07, "loss": 0.0034, "reward": 1.461251974105835, "reward_std": 0.18991222977638245, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3675019145011902, "rewards/pad": 0.109375, "step": 289 }, { "completion_length": 236.890625, "epoch": 0.09241555130656469, "grad_norm": 6.667603015899658, "kl": 0.052490234375, "learning_rate": 9.075844486934353e-07, "loss": 0.0021, "reward": 1.4545420408248901, "reward_std": 0.11373404413461685, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.28266704082489014, "step": 290 }, { "completion_length": 266.078125, "epoch": 0.09273422562141491, "grad_norm": 23.1329345703125, "kl": 0.052734375, "learning_rate": 9.072657743785851e-07, "loss": 0.0021, "reward": 1.391666054725647, "reward_std": 0.0459078848361969, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.391666054725647, "step": 291 }, { "completion_length": 92.171875, "epoch": 0.09305289993626514, "grad_norm": 8.88923168182373, "kl": 0.0908203125, "learning_rate": 9.069471000637349e-07, "loss": 0.0036, "reward": 1.671968698501587, "reward_std": 0.13037461042404175, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6719686985015869, "rewards/pad": 0.0, "step": 292 }, { "completion_length": 194.09375, "epoch": 0.09337157425111536, "grad_norm": 15.927318572998047, "kl": 0.072265625, "learning_rate": 9.066284257488846e-07, "loss": 0.0029, "reward": 1.463443398475647, "reward_std": 0.09356062859296799, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.46344345808029175, "step": 293 }, { "completion_length": 185.359375, "epoch": 0.09369024856596558, "grad_norm": 19.33968162536621, "kl": 0.0751953125, "learning_rate": 9.063097514340344e-07, "loss": 0.003, "reward": 1.5570530891418457, "reward_std": 0.12107745558023453, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5414280295372009, "rewards/pad": 0.015625, "step": 294 }, { "completion_length": 228.8125, "epoch": 0.0940089228808158, "grad_norm": 13.032269477844238, "kl": 0.068359375, "learning_rate": 9.059910771191841e-07, "loss": 0.0027, "reward": 1.4693000316619873, "reward_std": 0.06390385329723358, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4693000912666321, "rewards/pad": 0.0, "step": 295 }, { "completion_length": 222.296875, "epoch": 0.09432759719566602, "grad_norm": 7.67185115814209, "kl": 0.054931640625, "learning_rate": 9.056724028043339e-07, "loss": 0.0022, "reward": 1.4528157711029053, "reward_std": 0.12174256145954132, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.39031583070755005, "step": 296 }, { "completion_length": 262.03125, "epoch": 0.09464627151051626, "grad_norm": 9.583202362060547, "kl": 0.056640625, "learning_rate": 9.053537284894836e-07, "loss": 0.0023, "reward": 1.4682092666625977, "reward_std": 0.07929195463657379, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46820923686027527, "rewards/pad": 0.0, "step": 297 }, { "completion_length": 230.375, "epoch": 0.09496494582536648, "grad_norm": 7.358712196350098, "kl": 0.064453125, "learning_rate": 9.050350541746334e-07, "loss": 0.0026, "reward": 1.418821930885315, "reward_std": 0.04992415010929108, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.29382190108299255, "rewards/pad": 0.125, "step": 298 }, { "completion_length": 247.1875, "epoch": 0.0952836201402167, "grad_norm": 5.215261936187744, "kl": 0.057373046875, "learning_rate": 9.047163798597832e-07, "loss": 0.0023, "reward": 1.459688663482666, "reward_std": 0.08986417204141617, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.47531357407569885, "rewards/pad": 0.0, "step": 299 }, { "completion_length": 242.0, "epoch": 0.09560229445506692, "grad_norm": 11.121061325073242, "kl": 0.0703125, "learning_rate": 9.04397705544933e-07, "loss": 0.0028, "reward": 1.7235416173934937, "reward_std": 0.08684414625167847, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5985416769981384, "step": 300 }, { "completion_length": 220.203125, "epoch": 0.09592096876991714, "grad_norm": 15.279075622558594, "kl": 0.06640625, "learning_rate": 9.040790312300827e-07, "loss": 0.0027, "reward": 1.3698844909667969, "reward_std": 0.10006135702133179, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3855094313621521, "rewards/pad": 0.0, "step": 301 }, { "completion_length": 252.265625, "epoch": 0.09623964308476737, "grad_norm": 9.165273666381836, "kl": 0.062255859375, "learning_rate": 9.037603569152326e-07, "loss": 0.0025, "reward": 1.6778218746185303, "reward_std": 0.15550962090492249, "rewards/answer_reward": 0.359375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3184468150138855, "step": 302 }, { "completion_length": 277.03125, "epoch": 0.0965583173996176, "grad_norm": 4.425095558166504, "kl": 0.06005859375, "learning_rate": 9.034416826003824e-07, "loss": 0.0024, "reward": 1.3882136344909668, "reward_std": 0.10006508976221085, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3882136344909668, "step": 303 }, { "completion_length": 209.78125, "epoch": 0.09687699171446781, "grad_norm": 6.989660739898682, "kl": 0.06640625, "learning_rate": 9.031230082855322e-07, "loss": 0.0027, "reward": 1.6118898391723633, "reward_std": 0.1484118551015854, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4868898093700409, "step": 304 }, { "completion_length": 327.5625, "epoch": 0.09719566602931803, "grad_norm": 4.546972751617432, "kl": 0.04248046875, "learning_rate": 9.028043339706819e-07, "loss": 0.0017, "reward": 1.434516191482544, "reward_std": 0.04103871434926987, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4345163106918335, "rewards/pad": 0.0, "step": 305 }, { "completion_length": 159.15625, "epoch": 0.09751434034416825, "grad_norm": 7.944620132446289, "kl": 0.08203125, "learning_rate": 9.024856596558317e-07, "loss": 0.0033, "reward": 1.6820859909057617, "reward_std": 0.1810736060142517, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5883360505104065, "rewards/pad": 0.09375, "step": 306 }, { "completion_length": 188.140625, "epoch": 0.09783301465901849, "grad_norm": 123.36129760742188, "kl": 0.07470703125, "learning_rate": 9.021669853409815e-07, "loss": 0.003, "reward": 1.84897780418396, "reward_std": 0.17008383572101593, "rewards/answer_reward": 0.4375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.41147780418395996, "step": 307 }, { "completion_length": 237.53125, "epoch": 0.09815168897386871, "grad_norm": 11.163032531738281, "kl": 0.0654296875, "learning_rate": 9.018483110261312e-07, "loss": 0.0026, "reward": 1.4214814901351929, "reward_std": 0.14145858585834503, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42148149013519287, "rewards/pad": 0.0, "step": 308 }, { "completion_length": 262.546875, "epoch": 0.09847036328871893, "grad_norm": 9.466226577758789, "kl": 0.072265625, "learning_rate": 9.01529636711281e-07, "loss": 0.0029, "reward": 1.4231048822402954, "reward_std": 0.08867865800857544, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4231048822402954, "step": 309 }, { "completion_length": 344.421875, "epoch": 0.09878903760356915, "grad_norm": 5.422881126403809, "kl": 0.03564453125, "learning_rate": 9.012109623964308e-07, "loss": 0.0014, "reward": 1.530600666999817, "reward_std": 0.12080751359462738, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.2962256968021393, "step": 310 }, { "completion_length": 209.078125, "epoch": 0.09910771191841937, "grad_norm": 5.95306921005249, "kl": 0.07763671875, "learning_rate": 9.008922880815806e-07, "loss": 0.0031, "reward": 1.460267186164856, "reward_std": 0.0819513350725174, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4602671265602112, "rewards/pad": 0.0, "step": 311 }, { "completion_length": 396.609375, "epoch": 0.0994263862332696, "grad_norm": 3.307825803756714, "kl": 0.0296630859375, "learning_rate": 9.005736137667303e-07, "loss": 0.0012, "reward": 1.4927701950073242, "reward_std": 0.008223021402955055, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.36777013540267944, "step": 312 }, { "completion_length": 235.984375, "epoch": 0.09974506054811982, "grad_norm": 10.863787651062012, "kl": 0.061767578125, "learning_rate": 9.002549394518801e-07, "loss": 0.0025, "reward": 1.4562087059020996, "reward_std": 0.1860886812210083, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.47183382511138916, "step": 313 }, { "completion_length": 288.796875, "epoch": 0.10006373486297004, "grad_norm": 5.0147881507873535, "kl": 0.078125, "learning_rate": 8.999362651370299e-07, "loss": 0.0031, "reward": 1.5954546928405762, "reward_std": 0.18140065670013428, "rewards/answer_reward": 0.09375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.517329752445221, "step": 314 }, { "completion_length": 259.4375, "epoch": 0.10038240917782026, "grad_norm": 8.11809253692627, "kl": 0.060546875, "learning_rate": 8.996175908221797e-07, "loss": 0.0024, "reward": 1.235037922859192, "reward_std": 0.1669473648071289, "rewards/pad": 0.046875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.2037878930568695, "step": 315 }, { "completion_length": 207.078125, "epoch": 0.10070108349267048, "grad_norm": 17.18282699584961, "kl": 0.08203125, "learning_rate": 8.992989165073294e-07, "loss": 0.0033, "reward": 1.6323869228363037, "reward_std": 0.13421547412872314, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6011369228363037, "rewards/pad": 0.03125, "step": 316 }, { "completion_length": 204.21875, "epoch": 0.10101975780752072, "grad_norm": 14.23183536529541, "kl": 0.07568359375, "learning_rate": 8.989802421924792e-07, "loss": 0.003, "reward": 1.4993996620178223, "reward_std": 0.09073391556739807, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49939966201782227, "rewards/pad": 0.0, "step": 317 }, { "completion_length": 230.40625, "epoch": 0.10133843212237094, "grad_norm": 9.19079875946045, "kl": 0.1103515625, "learning_rate": 8.98661567877629e-07, "loss": 0.0044, "reward": 1.5880275964736938, "reward_std": 0.17680257558822632, "rewards/answer_reward": 0.09375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4942775368690491, "step": 318 }, { "completion_length": 300.578125, "epoch": 0.10165710643722116, "grad_norm": 4.534358501434326, "kl": 0.046875, "learning_rate": 8.983428935627788e-07, "loss": 0.0019, "reward": 1.7708877325057983, "reward_std": 0.1038035899400711, "rewards/answer_reward": 0.1875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5833878517150879, "step": 319 }, { "completion_length": 244.796875, "epoch": 0.10197578075207138, "grad_norm": 7.014145374298096, "kl": 0.0751953125, "learning_rate": 8.980242192479286e-07, "loss": 0.003, "reward": 1.6308785676956177, "reward_std": 0.07405470311641693, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6308785080909729, "step": 320 }, { "completion_length": 257.0625, "epoch": 0.1022944550669216, "grad_norm": 9.099129676818848, "kl": 0.0556640625, "learning_rate": 8.977055449330784e-07, "loss": 0.0022, "reward": 1.7587946653366089, "reward_std": 0.14023171365261078, "rewards/answer_reward": 0.203125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5556697249412537, "step": 321 }, { "completion_length": 229.03125, "epoch": 0.10261312938177183, "grad_norm": 105.9485855102539, "kl": 0.076171875, "learning_rate": 8.973868706182282e-07, "loss": 0.003, "reward": 1.485621452331543, "reward_std": 0.10602043569087982, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48562151193618774, "rewards/pad": 0.0, "step": 322 }, { "completion_length": 187.78125, "epoch": 0.10293180369662205, "grad_norm": 10.623984336853027, "kl": 0.1025390625, "learning_rate": 8.97068196303378e-07, "loss": 0.0041, "reward": 1.484621524810791, "reward_std": 0.20591312646865845, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5002465844154358, "rewards/pad": 0.0, "step": 323 }, { "completion_length": 318.578125, "epoch": 0.10325047801147227, "grad_norm": 34.81837844848633, "kl": 0.044189453125, "learning_rate": 8.967495219885277e-07, "loss": 0.0018, "reward": 1.6456243991851807, "reward_std": 0.19282664358615875, "rewards/pad": 0.21875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4424992799758911, "step": 324 }, { "completion_length": 228.359375, "epoch": 0.1035691523263225, "grad_norm": 26.958513259887695, "kl": 0.06640625, "learning_rate": 8.964308476736775e-07, "loss": 0.0027, "reward": 1.664658546447754, "reward_std": 0.06885822117328644, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4146585166454315, "rewards/pad": 0.25, "step": 325 }, { "completion_length": 161.21875, "epoch": 0.10388782664117271, "grad_norm": 13.990256309509277, "kl": 0.08203125, "learning_rate": 8.961121733588273e-07, "loss": 0.0033, "reward": 1.5286167860031128, "reward_std": 0.20374968647956848, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.497366726398468, "rewards/pad": 0.03125, "step": 326 }, { "completion_length": 377.734375, "epoch": 0.10420650095602295, "grad_norm": 5.246281623840332, "kl": 0.0263671875, "learning_rate": 8.957934990439771e-07, "loss": 0.0011, "reward": 1.5219590663909912, "reward_std": 0.02962879277765751, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.396959125995636, "step": 327 }, { "completion_length": 199.171875, "epoch": 0.10452517527087317, "grad_norm": 12.192336082458496, "kl": 0.06396484375, "learning_rate": 8.954748247291268e-07, "loss": 0.0026, "reward": 1.5578429698944092, "reward_std": 0.16721788048744202, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4328429698944092, "rewards/pad": 0.125, "step": 328 }, { "completion_length": 299.09375, "epoch": 0.10484384958572339, "grad_norm": 6.608733177185059, "kl": 0.03857421875, "learning_rate": 8.951561504142766e-07, "loss": 0.0015, "reward": 1.5235612392425537, "reward_std": 0.12073105573654175, "rewards/answer_reward": 0.15625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3673113286495209, "step": 329 }, { "completion_length": 118.078125, "epoch": 0.10516252390057361, "grad_norm": 8.536271095275879, "kl": 0.08349609375, "learning_rate": 8.948374760994264e-07, "loss": 0.0033, "reward": 1.59187912940979, "reward_std": 0.12341618537902832, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5918790698051453, "rewards/pad": 0.0, "step": 330 }, { "completion_length": 265.359375, "epoch": 0.10548119821542384, "grad_norm": 7.807065486907959, "kl": 0.05419921875, "learning_rate": 8.945188017845762e-07, "loss": 0.0022, "reward": 1.397184133529663, "reward_std": 0.06614439189434052, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.39718419313430786, "step": 331 }, { "completion_length": 214.828125, "epoch": 0.10579987253027406, "grad_norm": 19.004209518432617, "kl": 0.056640625, "learning_rate": 8.942001274697259e-07, "loss": 0.0023, "reward": 1.6144123077392578, "reward_std": 0.16007590293884277, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4581623673439026, "rewards/pad": 0.15625, "step": 332 }, { "completion_length": 267.265625, "epoch": 0.10611854684512428, "grad_norm": 14.096921920776367, "kl": 0.064453125, "learning_rate": 8.938814531548757e-07, "loss": 0.0026, "reward": 1.404233455657959, "reward_std": 0.05870746821165085, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.40423351526260376, "rewards/pad": 0.0, "step": 333 }, { "completion_length": 346.796875, "epoch": 0.1064372211599745, "grad_norm": 4.083738803863525, "kl": 0.0400390625, "learning_rate": 8.935627788400254e-07, "loss": 0.0016, "reward": 1.4418816566467285, "reward_std": 0.08311553299427032, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4262566566467285, "step": 334 }, { "completion_length": 345.625, "epoch": 0.10675589547482472, "grad_norm": 11.4860200881958, "kl": 0.042236328125, "learning_rate": 8.932441045251752e-07, "loss": 0.0017, "reward": 1.4205396175384521, "reward_std": 0.1086416095495224, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4361645579338074, "step": 335 }, { "completion_length": 330.78125, "epoch": 0.10707456978967496, "grad_norm": 6.222726821899414, "kl": 0.048828125, "learning_rate": 8.929254302103249e-07, "loss": 0.002, "reward": 1.408251166343689, "reward_std": 0.05332936346530914, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.40825116634368896, "step": 336 }, { "completion_length": 351.40625, "epoch": 0.10739324410452518, "grad_norm": 4.140896320343018, "kl": 0.034423828125, "learning_rate": 8.926067558954747e-07, "loss": 0.0014, "reward": 1.4266270399093628, "reward_std": 0.14883245527744293, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.364126980304718, "step": 337 }, { "completion_length": 282.296875, "epoch": 0.1077119184193754, "grad_norm": 12.383210182189941, "kl": 0.060302734375, "learning_rate": 8.922880815806245e-07, "loss": 0.0024, "reward": 1.5008018016815186, "reward_std": 0.1287446767091751, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4383017420768738, "rewards/pad": 0.0625, "step": 338 }, { "completion_length": 245.546875, "epoch": 0.10803059273422562, "grad_norm": 11.367423057556152, "kl": 0.07080078125, "learning_rate": 8.919694072657742e-07, "loss": 0.0028, "reward": 1.3609724044799805, "reward_std": 0.16549429297447205, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.31409746408462524, "rewards/pad": 0.046875, "step": 339 }, { "completion_length": 259.6875, "epoch": 0.10834926704907584, "grad_norm": 8.234755516052246, "kl": 0.0615234375, "learning_rate": 8.916507329509241e-07, "loss": 0.0025, "reward": 1.5450584888458252, "reward_std": 0.13844075798988342, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4825584590435028, "rewards/pad": 0.0625, "step": 340 }, { "completion_length": 173.46875, "epoch": 0.10866794136392607, "grad_norm": 21.999162673950195, "kl": 0.0966796875, "learning_rate": 8.913320586360739e-07, "loss": 0.0039, "reward": 1.5725913047790527, "reward_std": 0.22323568165302277, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5257163047790527, "step": 341 }, { "completion_length": 225.90625, "epoch": 0.1089866156787763, "grad_norm": 11.320805549621582, "kl": 0.056396484375, "learning_rate": 8.910133843212237e-07, "loss": 0.0023, "reward": 1.5097086429595947, "reward_std": 0.15257683396339417, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.2909587025642395, "step": 342 }, { "completion_length": 208.484375, "epoch": 0.10930528999362651, "grad_norm": 5.40504789352417, "kl": 0.06884765625, "learning_rate": 8.906947100063734e-07, "loss": 0.0028, "reward": 1.46799635887146, "reward_std": 0.11990050971508026, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45237141847610474, "rewards/pad": 0.015625, "step": 343 }, { "completion_length": 174.84375, "epoch": 0.10962396430847673, "grad_norm": 17.96059799194336, "kl": 0.08544921875, "learning_rate": 8.903760356915232e-07, "loss": 0.0034, "reward": 1.5519987344741821, "reward_std": 0.12879955768585205, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.44262367486953735, "step": 344 }, { "completion_length": 249.40625, "epoch": 0.10994263862332695, "grad_norm": 9.05187702178955, "kl": 0.053466796875, "learning_rate": 8.90057361376673e-07, "loss": 0.0021, "reward": 1.4286737442016602, "reward_std": 0.18673668801784515, "rewards/answer_reward": 0.140625, "rewards/format_reward_gqa": 0.953125, "rewards/iou_glue_reward": 0.334923654794693, "step": 345 }, { "completion_length": 286.1875, "epoch": 0.11026131293817719, "grad_norm": 80.49254608154297, "kl": 0.439453125, "learning_rate": 8.897386870618228e-07, "loss": 0.0175, "reward": 1.314738154411316, "reward_std": 0.1686519980430603, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.2991131544113159, "step": 346 }, { "completion_length": 260.796875, "epoch": 0.11057998725302741, "grad_norm": 13.089540481567383, "kl": 0.05322265625, "learning_rate": 8.894200127469725e-07, "loss": 0.0021, "reward": 1.3983441591262817, "reward_std": 0.20051315426826477, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.41396912932395935, "rewards/pad": 0.015625, "step": 347 }, { "completion_length": 216.28125, "epoch": 0.11089866156787763, "grad_norm": 8.443741798400879, "kl": 0.11865234375, "learning_rate": 8.891013384321223e-07, "loss": 0.0047, "reward": 1.6128265857696533, "reward_std": 0.16433218121528625, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5347015857696533, "step": 348 }, { "completion_length": 272.296875, "epoch": 0.11121733588272785, "grad_norm": 10.677943229675293, "kl": 0.060791015625, "learning_rate": 8.887826641172721e-07, "loss": 0.0024, "reward": 1.3177645206451416, "reward_std": 0.11974099278450012, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3333895206451416, "rewards/pad": 0.0, "step": 349 }, { "completion_length": 273.75, "epoch": 0.11153601019757807, "grad_norm": 24.402652740478516, "kl": 0.060302734375, "learning_rate": 8.884639898024219e-07, "loss": 0.0024, "reward": 1.3256800174713135, "reward_std": 0.1037040650844574, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.2163049727678299, "rewards/pad": 0.109375, "step": 350 }, { "completion_length": 322.640625, "epoch": 0.1118546845124283, "grad_norm": 8.839086532592773, "kl": 0.0458984375, "learning_rate": 8.881453154875716e-07, "loss": 0.0018, "reward": 1.4524686336517334, "reward_std": 0.14583848416805267, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.389968603849411, "step": 351 }, { "completion_length": 272.03125, "epoch": 0.11217335882727852, "grad_norm": 6.449258804321289, "kl": 0.0537109375, "learning_rate": 8.878266411727214e-07, "loss": 0.0021, "reward": 1.493812084197998, "reward_std": 0.16063782572746277, "rewards/answer_reward": 0.09375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.41568708419799805, "step": 352 }, { "completion_length": 241.96875, "epoch": 0.11249203314212874, "grad_norm": 7.824343204498291, "kl": 0.0654296875, "learning_rate": 8.875079668578712e-07, "loss": 0.0026, "reward": 1.4458305835723877, "reward_std": 0.08239156007766724, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4458305835723877, "step": 353 }, { "completion_length": 275.515625, "epoch": 0.11281070745697896, "grad_norm": 4.663790225982666, "kl": 0.06884765625, "learning_rate": 8.87189292543021e-07, "loss": 0.0028, "reward": 1.4097020626068115, "reward_std": 0.0783160850405693, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4097020626068115, "rewards/pad": 0.0, "step": 354 }, { "completion_length": 254.234375, "epoch": 0.11312938177182918, "grad_norm": 6.693781852722168, "kl": 0.0673828125, "learning_rate": 8.868706182281707e-07, "loss": 0.0027, "reward": 1.324296474456787, "reward_std": 0.07587652653455734, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.32429641485214233, "rewards/pad": 0.0, "step": 355 }, { "completion_length": 212.15625, "epoch": 0.11344805608667942, "grad_norm": 11.825392723083496, "kl": 0.1328125, "learning_rate": 8.865519439133205e-07, "loss": 0.0053, "reward": 1.5467828512191772, "reward_std": 0.0978686735033989, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42178288102149963, "rewards/pad": 0.125, "step": 356 }, { "completion_length": 292.421875, "epoch": 0.11376673040152964, "grad_norm": 14.117955207824707, "kl": 0.052734375, "learning_rate": 8.862332695984703e-07, "loss": 0.0021, "reward": 1.4794337749481201, "reward_std": 0.03357456997036934, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47943371534347534, "step": 357 }, { "completion_length": 367.53125, "epoch": 0.11408540471637986, "grad_norm": 4.155237674713135, "kl": 0.035888671875, "learning_rate": 8.859145952836202e-07, "loss": 0.0014, "reward": 1.467307448387146, "reward_std": 0.07496380805969238, "rewards/answer_reward": 0.03125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.43605750799179077, "step": 358 }, { "completion_length": 227.203125, "epoch": 0.11440407903123008, "grad_norm": 7.234963893890381, "kl": 0.05859375, "learning_rate": 8.855959209687699e-07, "loss": 0.0024, "reward": 1.510168194770813, "reward_std": 0.09282274544239044, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5101682543754578, "rewards/pad": 0.0, "step": 359 }, { "completion_length": 203.203125, "epoch": 0.1147227533460803, "grad_norm": 9.693031311035156, "kl": 0.146484375, "learning_rate": 8.852772466539197e-07, "loss": 0.0059, "reward": 1.6317123174667358, "reward_std": 0.09545093774795532, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5067123770713806, "rewards/pad": 0.125, "step": 360 }, { "completion_length": 253.171875, "epoch": 0.11504142766093053, "grad_norm": 9.964339256286621, "kl": 0.0595703125, "learning_rate": 8.849585723390695e-07, "loss": 0.0024, "reward": 1.4393465518951416, "reward_std": 0.16357487440109253, "rewards/pad": 0.046875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.39247167110443115, "step": 361 }, { "completion_length": 290.328125, "epoch": 0.11536010197578075, "grad_norm": 10.486550331115723, "kl": 0.04345703125, "learning_rate": 8.846398980242193e-07, "loss": 0.0017, "reward": 1.4310604333877563, "reward_std": 0.1469384729862213, "rewards/answer_reward": 0.046875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.38418546319007874, "step": 362 }, { "completion_length": 171.390625, "epoch": 0.11567877629063097, "grad_norm": 12.362403869628906, "kl": 0.09375, "learning_rate": 8.84321223709369e-07, "loss": 0.0038, "reward": 1.4338560104370117, "reward_std": 0.2927459180355072, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.21510589122772217, "step": 363 }, { "completion_length": 256.15625, "epoch": 0.1159974506054812, "grad_norm": 20.424320220947266, "kl": 0.050537109375, "learning_rate": 8.840025493945188e-07, "loss": 0.002, "reward": 1.4041390419006348, "reward_std": 0.09797428548336029, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.40413904190063477, "rewards/pad": 0.0, "step": 364 }, { "completion_length": 311.4375, "epoch": 0.11631612492033142, "grad_norm": 22.563579559326172, "kl": 0.045166015625, "learning_rate": 8.836838750796686e-07, "loss": 0.0018, "reward": 1.4641354084014893, "reward_std": 0.14457935094833374, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.37038543820381165, "step": 365 }, { "completion_length": 194.265625, "epoch": 0.11663479923518165, "grad_norm": 10.244625091552734, "kl": 0.06689453125, "learning_rate": 8.833652007648184e-07, "loss": 0.0027, "reward": 1.493947982788086, "reward_std": 0.14087332785129547, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46269798278808594, "rewards/pad": 0.03125, "step": 366 }, { "completion_length": 222.171875, "epoch": 0.11695347355003187, "grad_norm": 9.311911582946777, "kl": 0.060546875, "learning_rate": 8.830465264499681e-07, "loss": 0.0024, "reward": 1.7048492431640625, "reward_std": 0.10070245712995529, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5798492431640625, "rewards/pad": 0.125, "step": 367 }, { "completion_length": 237.078125, "epoch": 0.11727214786488209, "grad_norm": 17.15612030029297, "kl": 0.06640625, "learning_rate": 8.827278521351179e-07, "loss": 0.0027, "reward": 1.4977259635925293, "reward_std": 0.12879906594753265, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4664759337902069, "rewards/pad": 0.03125, "step": 368 }, { "completion_length": 280.4375, "epoch": 0.11759082217973231, "grad_norm": 9.260272026062012, "kl": 0.048583984375, "learning_rate": 8.824091778202677e-07, "loss": 0.0019, "reward": 1.4719653129577637, "reward_std": 0.12129160016775131, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.44071537256240845, "step": 369 }, { "completion_length": 193.40625, "epoch": 0.11790949649458253, "grad_norm": 9.00613021850586, "kl": 0.068359375, "learning_rate": 8.820905035054175e-07, "loss": 0.0027, "reward": 1.5651819705963135, "reward_std": 0.12391842901706696, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5651820302009583, "rewards/pad": 0.0, "step": 370 }, { "completion_length": 176.5625, "epoch": 0.11822817080943276, "grad_norm": 8.101126670837402, "kl": 0.07080078125, "learning_rate": 8.817718291905672e-07, "loss": 0.0028, "reward": 1.5040981769561768, "reward_std": 0.22234085202217102, "rewards/answer_reward": 0.203125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.316598117351532, "step": 371 }, { "completion_length": 194.0625, "epoch": 0.11854684512428298, "grad_norm": 24.81196403503418, "kl": 0.06884765625, "learning_rate": 8.81453154875717e-07, "loss": 0.0027, "reward": 1.6260745525360107, "reward_std": 0.1592978686094284, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5323246121406555, "step": 372 }, { "completion_length": 159.390625, "epoch": 0.1188655194391332, "grad_norm": 19.618276596069336, "kl": 0.078125, "learning_rate": 8.811344805608667e-07, "loss": 0.0031, "reward": 1.5059666633605957, "reward_std": 0.15724095702171326, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.38096657395362854, "step": 373 }, { "completion_length": 152.96875, "epoch": 0.11918419375398343, "grad_norm": 7.2891716957092285, "kl": 0.0888671875, "learning_rate": 8.808158062460164e-07, "loss": 0.0036, "reward": 1.4844080209732056, "reward_std": 0.11266922950744629, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4844079315662384, "rewards/pad": 0.0, "step": 374 }, { "completion_length": 201.46875, "epoch": 0.11950286806883365, "grad_norm": 10.135858535766602, "kl": 0.07568359375, "learning_rate": 8.804971319311662e-07, "loss": 0.003, "reward": 1.5312637090682983, "reward_std": 0.17054462432861328, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5156386494636536, "rewards/pad": 0.03125, "step": 375 }, { "completion_length": 211.265625, "epoch": 0.11982154238368388, "grad_norm": 34.30707931518555, "kl": 0.06982421875, "learning_rate": 8.80178457616316e-07, "loss": 0.0028, "reward": 1.6000438928604126, "reward_std": 0.1634560525417328, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3812939524650574, "rewards/pad": 0.21875, "step": 376 }, { "completion_length": 356.96875, "epoch": 0.1201402166985341, "grad_norm": 14.304428100585938, "kl": 0.0546875, "learning_rate": 8.798597833014659e-07, "loss": 0.0022, "reward": 1.4964876174926758, "reward_std": 0.09968910366296768, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.38711249828338623, "step": 377 }, { "completion_length": 209.75, "epoch": 0.12045889101338432, "grad_norm": 7.223150730133057, "kl": 0.06201171875, "learning_rate": 8.795411089866156e-07, "loss": 0.0025, "reward": 1.6077044010162354, "reward_std": 0.21932819485664368, "rewards/pad": 0.265625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.35770437121391296, "step": 378 }, { "completion_length": 156.109375, "epoch": 0.12077756532823454, "grad_norm": 48.64055633544922, "kl": 0.0888671875, "learning_rate": 8.792224346717654e-07, "loss": 0.0036, "reward": 1.6563538312911987, "reward_std": 0.12122064083814621, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.500103771686554, "rewards/pad": 0.15625, "step": 379 }, { "completion_length": 239.0625, "epoch": 0.12109623964308477, "grad_norm": 18.189565658569336, "kl": 0.06884765625, "learning_rate": 8.789037603569152e-07, "loss": 0.0027, "reward": 1.4749139547348022, "reward_std": 0.18104201555252075, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.506164014339447, "step": 380 }, { "completion_length": 249.109375, "epoch": 0.121414913957935, "grad_norm": 7.9685139656066895, "kl": 0.06396484375, "learning_rate": 8.78585086042065e-07, "loss": 0.0026, "reward": 1.3926160335540771, "reward_std": 0.1556987464427948, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.40824100375175476, "rewards/pad": 0.0, "step": 381 }, { "completion_length": 235.421875, "epoch": 0.12173358827278521, "grad_norm": 16.168167114257812, "kl": 0.0556640625, "learning_rate": 8.782664117272147e-07, "loss": 0.0022, "reward": 1.5087082386016846, "reward_std": 0.17400582134723663, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4462081789970398, "step": 382 }, { "completion_length": 247.765625, "epoch": 0.12205226258763544, "grad_norm": 7.005702972412109, "kl": 0.05517578125, "learning_rate": 8.779477374123645e-07, "loss": 0.0022, "reward": 1.4749236106872559, "reward_std": 0.1308993399143219, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4280485510826111, "rewards/pad": 0.046875, "step": 383 }, { "completion_length": 262.9375, "epoch": 0.12237093690248566, "grad_norm": 31.693580627441406, "kl": 0.055419921875, "learning_rate": 8.776290630975143e-07, "loss": 0.0022, "reward": 1.641816258430481, "reward_std": 0.12899675965309143, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5480663776397705, "rewards/pad": 0.109375, "step": 384 }, { "completion_length": 242.15625, "epoch": 0.12268961121733589, "grad_norm": 11.220101356506348, "kl": 0.09619140625, "learning_rate": 8.773103887826641e-07, "loss": 0.0039, "reward": 1.4588701725006104, "reward_std": 0.14389871060848236, "rewards/answer_reward": 0.046875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4119952321052551, "step": 385 }, { "completion_length": 183.5625, "epoch": 0.12300828553218611, "grad_norm": 34.87577438354492, "kl": 0.1025390625, "learning_rate": 8.769917144678138e-07, "loss": 0.0041, "reward": 1.3601453304290771, "reward_std": 0.10742165893316269, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3757703900337219, "rewards/pad": 0.0, "step": 386 }, { "completion_length": 163.171875, "epoch": 0.12332695984703633, "grad_norm": 11.912552833557129, "kl": 0.08154296875, "learning_rate": 8.766730401529636e-07, "loss": 0.0033, "reward": 1.6810275316238403, "reward_std": 0.17379043996334076, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5247775912284851, "rewards/pad": 0.15625, "step": 387 }, { "completion_length": 218.671875, "epoch": 0.12364563416188655, "grad_norm": 11.059039115905762, "kl": 0.061767578125, "learning_rate": 8.763543658381134e-07, "loss": 0.0025, "reward": 1.9036428928375244, "reward_std": 0.17060169577598572, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4973929524421692, "rewards/pad": 0.40625, "step": 388 }, { "completion_length": 271.8125, "epoch": 0.12396430847673677, "grad_norm": 10.898370742797852, "kl": 0.047607421875, "learning_rate": 8.760356915232632e-07, "loss": 0.0019, "reward": 1.503767728805542, "reward_std": 0.1309741884469986, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4881427586078644, "step": 389 }, { "completion_length": 294.71875, "epoch": 0.124282982791587, "grad_norm": 12.679078102111816, "kl": 0.050048828125, "learning_rate": 8.757170172084129e-07, "loss": 0.002, "reward": 1.5640192031860352, "reward_std": 0.07892128080129623, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.45464423298835754, "step": 390 }, { "completion_length": 206.59375, "epoch": 0.12460165710643722, "grad_norm": 9.037927627563477, "kl": 0.064453125, "learning_rate": 8.753983428935627e-07, "loss": 0.0026, "reward": 1.6654486656188965, "reward_std": 0.15028820931911469, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.43107372522354126, "step": 391 }, { "completion_length": 336.921875, "epoch": 0.12492033142128744, "grad_norm": 12.565431594848633, "kl": 0.047607421875, "learning_rate": 8.750796685787125e-07, "loss": 0.0019, "reward": 1.3818833827972412, "reward_std": 0.11399512737989426, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.39750826358795166, "step": 392 }, { "completion_length": 284.0625, "epoch": 0.12523900573613767, "grad_norm": 6.1361894607543945, "kl": 0.054443359375, "learning_rate": 8.747609942638623e-07, "loss": 0.0022, "reward": 1.4880211353302002, "reward_std": 0.21052920818328857, "rewards/answer_reward": 0.203125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3005211055278778, "step": 393 }, { "completion_length": 267.375, "epoch": 0.12555768005098789, "grad_norm": 10.701099395751953, "kl": 0.0556640625, "learning_rate": 8.74442319949012e-07, "loss": 0.0022, "reward": 1.5392203330993652, "reward_std": 0.12588217854499817, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.42984533309936523, "step": 394 }, { "completion_length": 369.78125, "epoch": 0.1258763543658381, "grad_norm": 5.1260247230529785, "kl": 0.035888671875, "learning_rate": 8.741236456341619e-07, "loss": 0.0014, "reward": 1.3278656005859375, "reward_std": 0.07222297787666321, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3434906005859375, "step": 395 }, { "completion_length": 151.984375, "epoch": 0.12619502868068833, "grad_norm": 11.32140064239502, "kl": 0.10009765625, "learning_rate": 8.738049713193117e-07, "loss": 0.004, "reward": 1.3771092891693115, "reward_std": 0.22868841886520386, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.2833593487739563, "step": 396 }, { "completion_length": 156.859375, "epoch": 0.12651370299553855, "grad_norm": 6.877212047576904, "kl": 0.0869140625, "learning_rate": 8.734862970044615e-07, "loss": 0.0035, "reward": 1.7244150638580322, "reward_std": 0.2154211401939392, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47441503405570984, "rewards/pad": 0.25, "step": 397 }, { "completion_length": 295.65625, "epoch": 0.1268323773103888, "grad_norm": 10.377167701721191, "kl": 0.0458984375, "learning_rate": 8.731676226896112e-07, "loss": 0.0018, "reward": 1.5047334432601929, "reward_std": 0.17024171352386475, "rewards/answer_reward": 0.171875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3328584134578705, "step": 398 }, { "completion_length": 256.75, "epoch": 0.12715105162523901, "grad_norm": 14.790071487426758, "kl": 0.06494140625, "learning_rate": 8.72848948374761e-07, "loss": 0.0026, "reward": 1.4586857557296753, "reward_std": 0.08622744679450989, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.45868581533432007, "step": 399 }, { "completion_length": 110.484375, "epoch": 0.12746972594008923, "grad_norm": 26.442523956298828, "kl": 0.0986328125, "learning_rate": 8.725302740599108e-07, "loss": 0.004, "reward": 1.4176801443099976, "reward_std": 0.20065349340438843, "rewards/answer_reward": 0.09375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.33955517411231995, "step": 400 }, { "completion_length": 315.40625, "epoch": 0.12778840025493945, "grad_norm": 9.916601181030273, "kl": 0.0888671875, "learning_rate": 8.722115997450606e-07, "loss": 0.0035, "reward": 1.4531784057617188, "reward_std": 0.13725990056991577, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4688034653663635, "step": 401 }, { "completion_length": 150.46875, "epoch": 0.12810707456978968, "grad_norm": 11.95114517211914, "kl": 0.0888671875, "learning_rate": 8.718929254302103e-07, "loss": 0.0036, "reward": 1.3613560199737549, "reward_std": 0.15858888626098633, "rewards/answer_reward": 0.03125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3301060199737549, "step": 402 }, { "completion_length": 275.046875, "epoch": 0.1284257488846399, "grad_norm": 9.421660423278809, "kl": 0.06103515625, "learning_rate": 8.715742511153601e-07, "loss": 0.0024, "reward": 1.4936609268188477, "reward_std": 0.1545773297548294, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.3842858672142029, "rewards/pad": 0.140625, "step": 403 }, { "completion_length": 194.421875, "epoch": 0.12874442319949012, "grad_norm": 11.03164005279541, "kl": 0.0849609375, "learning_rate": 8.712555768005099e-07, "loss": 0.0034, "reward": 1.454899787902832, "reward_std": 0.10735487937927246, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45489975810050964, "rewards/pad": 0.0, "step": 404 }, { "completion_length": 178.15625, "epoch": 0.12906309751434034, "grad_norm": 11.80660343170166, "kl": 0.07666015625, "learning_rate": 8.709369024856596e-07, "loss": 0.0031, "reward": 1.655862808227539, "reward_std": 0.16924072802066803, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5464878678321838, "rewards/pad": 0.125, "step": 405 }, { "completion_length": 237.390625, "epoch": 0.12938177182919056, "grad_norm": 11.708005905151367, "kl": 0.052490234375, "learning_rate": 8.706182281708094e-07, "loss": 0.0021, "reward": 1.5820250511169434, "reward_std": 0.18636751174926758, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4882751405239105, "step": 406 }, { "completion_length": 194.171875, "epoch": 0.1297004461440408, "grad_norm": 8.571614265441895, "kl": 0.0830078125, "learning_rate": 8.702995538559592e-07, "loss": 0.0033, "reward": 1.6992355585098267, "reward_std": 0.23588824272155762, "rewards/pad": 0.203125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4961106479167938, "step": 407 }, { "completion_length": 223.0625, "epoch": 0.13001912045889102, "grad_norm": 8.831110000610352, "kl": 0.07568359375, "learning_rate": 8.69980879541109e-07, "loss": 0.003, "reward": 1.5184645652770996, "reward_std": 0.06349757313728333, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5184646844863892, "step": 408 }, { "completion_length": 347.59375, "epoch": 0.13033779477374124, "grad_norm": 4.942235469818115, "kl": 0.032470703125, "learning_rate": 8.696622052262587e-07, "loss": 0.0013, "reward": 1.588958740234375, "reward_std": 0.07658353447914124, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4795836806297302, "step": 409 }, { "completion_length": 211.40625, "epoch": 0.13065646908859146, "grad_norm": 22.08603286743164, "kl": 0.0634765625, "learning_rate": 8.693435309114085e-07, "loss": 0.0025, "reward": 1.6493875980377197, "reward_std": 0.09739524126052856, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3993876278400421, "step": 410 }, { "completion_length": 107.5625, "epoch": 0.13097514340344169, "grad_norm": 15.189661026000977, "kl": 0.09619140625, "learning_rate": 8.690248565965583e-07, "loss": 0.0038, "reward": 1.763694167137146, "reward_std": 0.19237318634986877, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.7011940479278564, "rewards/pad": 0.0625, "step": 411 }, { "completion_length": 231.03125, "epoch": 0.1312938177182919, "grad_norm": 9.504136085510254, "kl": 0.056640625, "learning_rate": 8.68706182281708e-07, "loss": 0.0023, "reward": 1.6230957508087158, "reward_std": 0.14532506465911865, "rewards/answer_reward": 0.015625, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.6230956315994263, "step": 412 }, { "completion_length": 229.5, "epoch": 0.13161249203314213, "grad_norm": 37.026771545410156, "kl": 0.06787109375, "learning_rate": 8.683875079668577e-07, "loss": 0.0027, "reward": 1.5715396404266357, "reward_std": 0.10729014128446579, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44653964042663574, "rewards/pad": 0.125, "step": 413 }, { "completion_length": 243.46875, "epoch": 0.13193116634799235, "grad_norm": 20.16276741027832, "kl": 0.0693359375, "learning_rate": 8.680688336520075e-07, "loss": 0.0028, "reward": 1.6400110721588135, "reward_std": 0.1305033415555954, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5462610125541687, "step": 414 }, { "completion_length": 239.890625, "epoch": 0.13224984066284257, "grad_norm": 8.99725341796875, "kl": 0.06396484375, "learning_rate": 8.677501593371574e-07, "loss": 0.0025, "reward": 1.3902020454406738, "reward_std": 0.0927668884396553, "rewards/answer_reward": 0.09375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.2964521050453186, "step": 415 }, { "completion_length": 164.96875, "epoch": 0.1325685149776928, "grad_norm": 15.773292541503906, "kl": 0.0830078125, "learning_rate": 8.674314850223072e-07, "loss": 0.0033, "reward": 1.677088737487793, "reward_std": 0.21771536767482758, "rewards/answer_reward": 0.171875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5052136778831482, "step": 416 }, { "completion_length": 232.4375, "epoch": 0.13288718929254303, "grad_norm": 18.085857391357422, "kl": 0.0615234375, "learning_rate": 8.671128107074569e-07, "loss": 0.0025, "reward": 1.6815763711929321, "reward_std": 0.17520847916603088, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47845137119293213, "rewards/pad": 0.203125, "step": 417 }, { "completion_length": 200.625, "epoch": 0.13320586360739325, "grad_norm": 43.0606803894043, "kl": 0.08544921875, "learning_rate": 8.667941363926067e-07, "loss": 0.0034, "reward": 1.5185892581939697, "reward_std": 0.08287880569696426, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5185892581939697, "rewards/pad": 0.0, "step": 418 }, { "completion_length": 236.75, "epoch": 0.13352453792224347, "grad_norm": 10.71466064453125, "kl": 0.07666015625, "learning_rate": 8.664754620777565e-07, "loss": 0.0031, "reward": 1.409736156463623, "reward_std": 0.10538657009601593, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.40973618626594543, "step": 419 }, { "completion_length": 191.15625, "epoch": 0.1338432122370937, "grad_norm": 24.71515655517578, "kl": 0.07861328125, "learning_rate": 8.661567877629063e-07, "loss": 0.0031, "reward": 1.4340004920959473, "reward_std": 0.10084269940853119, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43400052189826965, "rewards/pad": 0.0, "step": 420 }, { "completion_length": 268.671875, "epoch": 0.13416188655194392, "grad_norm": 14.787396430969238, "kl": 0.057373046875, "learning_rate": 8.65838113448056e-07, "loss": 0.0023, "reward": 1.55492103099823, "reward_std": 0.08806140720844269, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5549209713935852, "rewards/pad": 0.0, "step": 421 }, { "completion_length": 234.140625, "epoch": 0.13448056086679414, "grad_norm": 35.28914260864258, "kl": 0.060302734375, "learning_rate": 8.655194391332058e-07, "loss": 0.0024, "reward": 1.6005067825317383, "reward_std": 0.11722254008054733, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6161317825317383, "rewards/pad": 0.0, "step": 422 }, { "completion_length": 158.609375, "epoch": 0.13479923518164436, "grad_norm": 10.974730491638184, "kl": 0.07666015625, "learning_rate": 8.652007648183556e-07, "loss": 0.0031, "reward": 1.6964198350906372, "reward_std": 0.10421520471572876, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4464198350906372, "rewards/pad": 0.25, "step": 423 }, { "completion_length": 237.71875, "epoch": 0.13511790949649458, "grad_norm": 6.558365345001221, "kl": 0.064453125, "learning_rate": 8.648820905035054e-07, "loss": 0.0026, "reward": 1.498208999633789, "reward_std": 0.0822606012225151, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49820899963378906, "rewards/pad": 0.0, "step": 424 }, { "completion_length": 225.234375, "epoch": 0.1354365838113448, "grad_norm": 40.3632926940918, "kl": 0.0703125, "learning_rate": 8.645634161886551e-07, "loss": 0.0028, "reward": 1.5778706073760986, "reward_std": 0.1300792247056961, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5934954881668091, "step": 425 }, { "completion_length": 362.8125, "epoch": 0.13575525812619502, "grad_norm": 7.21042013168335, "kl": 0.035888671875, "learning_rate": 8.642447418738049e-07, "loss": 0.0015, "reward": 1.5087440013885498, "reward_std": 0.03255070373415947, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5087440609931946, "rewards/pad": 0.0, "step": 426 }, { "completion_length": 242.03125, "epoch": 0.13607393244104526, "grad_norm": 11.770732879638672, "kl": 0.060546875, "learning_rate": 8.639260675589547e-07, "loss": 0.0024, "reward": 1.4291597604751587, "reward_std": 0.09730469435453415, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.30415982007980347, "rewards/pad": 0.125, "step": 427 }, { "completion_length": 291.734375, "epoch": 0.13639260675589548, "grad_norm": 19.2504940032959, "kl": 0.0595703125, "learning_rate": 8.636073932441045e-07, "loss": 0.0024, "reward": 1.5188581943511963, "reward_std": 0.06681855022907257, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3938581347465515, "step": 428 }, { "completion_length": 264.296875, "epoch": 0.1367112810707457, "grad_norm": 9.25992202758789, "kl": 0.0625, "learning_rate": 8.632887189292542e-07, "loss": 0.0025, "reward": 1.3473349809646606, "reward_std": 0.09401439875364304, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.36296001076698303, "rewards/pad": 0.0, "step": 429 }, { "completion_length": 194.921875, "epoch": 0.13702995538559593, "grad_norm": 27.35711669921875, "kl": 0.07958984375, "learning_rate": 8.62970044614404e-07, "loss": 0.0032, "reward": 1.5259891748428345, "reward_std": 0.17146514356136322, "rewards/answer_reward": 0.140625, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.40098920464515686, "step": 430 }, { "completion_length": 169.28125, "epoch": 0.13734862970044615, "grad_norm": 14.931100845336914, "kl": 0.08056640625, "learning_rate": 8.626513702995538e-07, "loss": 0.0032, "reward": 1.5208649635314941, "reward_std": 0.13065671920776367, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5208649635314941, "rewards/pad": 0.0, "step": 431 }, { "completion_length": 336.796875, "epoch": 0.13766730401529637, "grad_norm": 12.129999160766602, "kl": 0.05078125, "learning_rate": 8.623326959847035e-07, "loss": 0.002, "reward": 1.339362621307373, "reward_std": 0.12774814665317535, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3549875020980835, "rewards/pad": 0.0, "step": 432 }, { "completion_length": 167.75, "epoch": 0.13798597833014659, "grad_norm": 9.612985610961914, "kl": 0.09033203125, "learning_rate": 8.620140216698534e-07, "loss": 0.0036, "reward": 1.7826504707336426, "reward_std": 0.1886083334684372, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4857754409313202, "rewards/pad": 0.296875, "step": 433 }, { "completion_length": 365.578125, "epoch": 0.1383046526449968, "grad_norm": 5.653805255889893, "kl": 0.035888671875, "learning_rate": 8.616953473550032e-07, "loss": 0.0014, "reward": 1.5266468524932861, "reward_std": 0.03666418045759201, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5266469717025757, "rewards/pad": 0.0, "step": 434 }, { "completion_length": 261.953125, "epoch": 0.13862332695984703, "grad_norm": 14.368305206298828, "kl": 0.130859375, "learning_rate": 8.61376673040153e-07, "loss": 0.0053, "reward": 1.4595415592193604, "reward_std": 0.11646923422813416, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.36579152941703796, "step": 435 }, { "completion_length": 287.796875, "epoch": 0.13894200127469725, "grad_norm": 6.295452117919922, "kl": 0.0732421875, "learning_rate": 8.610579987253027e-07, "loss": 0.0029, "reward": 1.5171446800231934, "reward_std": 0.08354522287845612, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.39214473962783813, "step": 436 }, { "completion_length": 266.453125, "epoch": 0.1392606755895475, "grad_norm": 8.192060470581055, "kl": 0.055908203125, "learning_rate": 8.607393244104525e-07, "loss": 0.0022, "reward": 1.730386734008789, "reward_std": 0.18066661059856415, "rewards/pad": 0.1875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5585117340087891, "step": 437 }, { "completion_length": 155.703125, "epoch": 0.13957934990439771, "grad_norm": 38.294921875, "kl": 0.1015625, "learning_rate": 8.604206500956023e-07, "loss": 0.0041, "reward": 1.4056483507156372, "reward_std": 0.09939147531986237, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4056483507156372, "rewards/pad": 0.0, "step": 438 }, { "completion_length": 140.390625, "epoch": 0.13989802421924794, "grad_norm": 14.03580093383789, "kl": 0.25390625, "learning_rate": 8.601019757807521e-07, "loss": 0.0102, "reward": 1.5623379945755005, "reward_std": 0.24287503957748413, "rewards/answer_reward": 0.046875, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5310879349708557, "step": 439 }, { "completion_length": 216.875, "epoch": 0.14021669853409816, "grad_norm": 22.822328567504883, "kl": 0.06982421875, "learning_rate": 8.597833014659018e-07, "loss": 0.0028, "reward": 1.5086784362792969, "reward_std": 0.07392227649688721, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3836785554885864, "step": 440 }, { "completion_length": 176.796875, "epoch": 0.14053537284894838, "grad_norm": 43.985252380371094, "kl": 0.1015625, "learning_rate": 8.594646271510516e-07, "loss": 0.0041, "reward": 1.6859261989593506, "reward_std": 0.11247699707746506, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5765512585639954, "rewards/pad": 0.125, "step": 441 }, { "completion_length": 271.8125, "epoch": 0.1408540471637986, "grad_norm": 33.14993667602539, "kl": 0.06591796875, "learning_rate": 8.591459528362014e-07, "loss": 0.0026, "reward": 1.4822893142700195, "reward_std": 0.07760314643383026, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48228925466537476, "rewards/pad": 0.0, "step": 442 }, { "completion_length": 200.96875, "epoch": 0.14117272147864882, "grad_norm": 10.074478149414062, "kl": 0.07568359375, "learning_rate": 8.588272785213512e-07, "loss": 0.003, "reward": 1.4677287340164185, "reward_std": 0.12547984719276428, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.48335376381874084, "rewards/pad": 0.0, "step": 443 }, { "completion_length": 271.46875, "epoch": 0.14149139579349904, "grad_norm": 7.2311015129089355, "kl": 0.10009765625, "learning_rate": 8.585086042065009e-07, "loss": 0.004, "reward": 1.4389028549194336, "reward_std": 0.06437624990940094, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4389027953147888, "step": 444 }, { "completion_length": 209.609375, "epoch": 0.14181007010834926, "grad_norm": 36.39683151245117, "kl": 0.08837890625, "learning_rate": 8.581899298916507e-07, "loss": 0.0035, "reward": 1.5782438516616821, "reward_std": 0.18926000595092773, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46886885166168213, "rewards/pad": 0.109375, "step": 445 }, { "completion_length": 293.234375, "epoch": 0.14212874442319948, "grad_norm": 8.683002471923828, "kl": 0.064453125, "learning_rate": 8.578712555768005e-07, "loss": 0.0026, "reward": 1.3747520446777344, "reward_std": 0.15450097620487213, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.390377014875412, "step": 446 }, { "completion_length": 319.296875, "epoch": 0.14244741873804972, "grad_norm": 11.498235702514648, "kl": 0.06591796875, "learning_rate": 8.575525812619503e-07, "loss": 0.0027, "reward": 1.410198450088501, "reward_std": 0.0504818931221962, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4101985692977905, "step": 447 }, { "completion_length": 177.375, "epoch": 0.14276609305289995, "grad_norm": 5.232941150665283, "kl": 0.0908203125, "learning_rate": 8.572339069471e-07, "loss": 0.0036, "reward": 1.4534001350402832, "reward_std": 0.053990766406059265, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.453400194644928, "rewards/pad": 0.0, "step": 448 }, { "completion_length": 302.578125, "epoch": 0.14308476736775017, "grad_norm": 34.351104736328125, "kl": 0.051513671875, "learning_rate": 8.569152326322498e-07, "loss": 0.0021, "reward": 1.6452140808105469, "reward_std": 0.06540853530168533, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5202141404151917, "step": 449 }, { "completion_length": 176.640625, "epoch": 0.14340344168260039, "grad_norm": 9.729473114013672, "kl": 0.08203125, "learning_rate": 8.565965583173996e-07, "loss": 0.0033, "reward": 1.4867746829986572, "reward_std": 0.07899289578199387, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.486774742603302, "rewards/pad": 0.0, "step": 450 }, { "completion_length": 169.0, "epoch": 0.1437221159974506, "grad_norm": 9.015345573425293, "kl": 0.07080078125, "learning_rate": 8.562778840025495e-07, "loss": 0.0028, "reward": 1.9677799940109253, "reward_std": 0.1721034198999405, "rewards/answer_reward": 0.53125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.43653005361557007, "step": 451 }, { "completion_length": 199.875, "epoch": 0.14404079031230083, "grad_norm": 8.44957447052002, "kl": 0.076171875, "learning_rate": 8.55959209687699e-07, "loss": 0.003, "reward": 1.7060662508010864, "reward_std": 0.12800918519496918, "rewards/answer_reward": 0.203125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5029412508010864, "step": 452 }, { "completion_length": 189.890625, "epoch": 0.14435946462715105, "grad_norm": 10.389759063720703, "kl": 0.06982421875, "learning_rate": 8.556405353728489e-07, "loss": 0.0028, "reward": 1.3262500762939453, "reward_std": 0.10323597490787506, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3106251060962677, "rewards/pad": 0.015625, "step": 453 }, { "completion_length": 333.3125, "epoch": 0.14467813894200127, "grad_norm": 6.098585605621338, "kl": 0.041748046875, "learning_rate": 8.553218610579987e-07, "loss": 0.0017, "reward": 1.5114164352416992, "reward_std": 0.09109494835138321, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5270413160324097, "step": 454 }, { "completion_length": 245.8125, "epoch": 0.1449968132568515, "grad_norm": 13.058919906616211, "kl": 0.0771484375, "learning_rate": 8.550031867431485e-07, "loss": 0.0031, "reward": 1.5187058448791504, "reward_std": 0.0741514340043068, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5187058448791504, "rewards/pad": 0.0, "step": 455 }, { "completion_length": 247.71875, "epoch": 0.14531548757170173, "grad_norm": 49.04275131225586, "kl": 0.0771484375, "learning_rate": 8.546845124282982e-07, "loss": 0.0031, "reward": 1.6912260055541992, "reward_std": 0.06264584511518478, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5662259459495544, "rewards/pad": 0.125, "step": 456 }, { "completion_length": 207.390625, "epoch": 0.14563416188655195, "grad_norm": 17.323837280273438, "kl": 0.08544921875, "learning_rate": 8.54365838113448e-07, "loss": 0.0034, "reward": 1.5239768028259277, "reward_std": 0.0447956919670105, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5239768028259277, "step": 457 }, { "completion_length": 151.703125, "epoch": 0.14595283620140218, "grad_norm": 12.530643463134766, "kl": 0.09765625, "learning_rate": 8.540471637985978e-07, "loss": 0.0039, "reward": 1.595104455947876, "reward_std": 0.238202303647995, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4544795751571655, "rewards/pad": 0.15625, "step": 458 }, { "completion_length": 178.015625, "epoch": 0.1462715105162524, "grad_norm": 24.093931198120117, "kl": 0.10791015625, "learning_rate": 8.537284894837476e-07, "loss": 0.0043, "reward": 1.462281584739685, "reward_std": 0.09704331308603287, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46228155493736267, "rewards/pad": 0.0, "step": 459 }, { "completion_length": 183.578125, "epoch": 0.14659018483110262, "grad_norm": 8.605521202087402, "kl": 0.1015625, "learning_rate": 8.534098151688973e-07, "loss": 0.0041, "reward": 1.644413948059082, "reward_std": 0.19280801713466644, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.519413948059082, "step": 460 }, { "completion_length": 185.109375, "epoch": 0.14690885914595284, "grad_norm": 9.727154731750488, "kl": 0.09521484375, "learning_rate": 8.530911408540471e-07, "loss": 0.0038, "reward": 1.4764418601989746, "reward_std": 0.1266682744026184, "rewards/answer_reward": 0.015625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4608168601989746, "step": 461 }, { "completion_length": 213.90625, "epoch": 0.14722753346080306, "grad_norm": 26.93505096435547, "kl": 0.07275390625, "learning_rate": 8.527724665391969e-07, "loss": 0.0029, "reward": 1.388505220413208, "reward_std": 0.15996669232845306, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.31038016080856323, "step": 462 }, { "completion_length": 340.046875, "epoch": 0.14754620777565328, "grad_norm": 5.181948184967041, "kl": 0.04736328125, "learning_rate": 8.524537922243466e-07, "loss": 0.0019, "reward": 1.4414582252502441, "reward_std": 0.13214264810085297, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.47270816564559937, "step": 463 }, { "completion_length": 365.53125, "epoch": 0.1478648820905035, "grad_norm": 6.609642505645752, "kl": 0.03662109375, "learning_rate": 8.521351179094964e-07, "loss": 0.0015, "reward": 1.3399658203125, "reward_std": 0.1445762664079666, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.3712157905101776, "step": 464 }, { "completion_length": 132.265625, "epoch": 0.14818355640535372, "grad_norm": 43.083251953125, "kl": 0.10009765625, "learning_rate": 8.518164435946462e-07, "loss": 0.004, "reward": 1.3067584037780762, "reward_std": 0.11628291010856628, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.30675846338272095, "rewards/pad": 0.0, "step": 465 }, { "completion_length": 202.140625, "epoch": 0.14850223072020396, "grad_norm": 6.811500072479248, "kl": 0.0869140625, "learning_rate": 8.51497769279796e-07, "loss": 0.0035, "reward": 1.5276761054992676, "reward_std": 0.2129315733909607, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.35580113530158997, "rewards/pad": 0.203125, "step": 466 }, { "completion_length": 209.265625, "epoch": 0.14882090503505419, "grad_norm": 13.386691093444824, "kl": 0.07568359375, "learning_rate": 8.511790949649457e-07, "loss": 0.003, "reward": 1.4336509704589844, "reward_std": 0.19461283087730408, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.3555260896682739, "rewards/pad": 0.109375, "step": 467 }, { "completion_length": 248.515625, "epoch": 0.1491395793499044, "grad_norm": 8.822103500366211, "kl": 0.0703125, "learning_rate": 8.508604206500955e-07, "loss": 0.0028, "reward": 1.490767240524292, "reward_std": 0.06920873373746872, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49076735973358154, "rewards/pad": 0.0, "step": 468 }, { "completion_length": 251.171875, "epoch": 0.14945825366475463, "grad_norm": 10.474466323852539, "kl": 0.07470703125, "learning_rate": 8.505417463352453e-07, "loss": 0.003, "reward": 1.6712453365325928, "reward_std": 0.20213072001934052, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4524953365325928, "step": 469 }, { "completion_length": 216.984375, "epoch": 0.14977692797960485, "grad_norm": 7.846851825714111, "kl": 0.0849609375, "learning_rate": 8.502230720203951e-07, "loss": 0.0034, "reward": 1.558816909790039, "reward_std": 0.2822313904762268, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.35569196939468384, "step": 470 }, { "completion_length": 231.125, "epoch": 0.15009560229445507, "grad_norm": 38.57760238647461, "kl": 0.06884765625, "learning_rate": 8.499043977055449e-07, "loss": 0.0027, "reward": 1.519878625869751, "reward_std": 0.0842830240726471, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.39487871527671814, "rewards/pad": 0.125, "step": 471 }, { "completion_length": 143.25, "epoch": 0.1504142766093053, "grad_norm": 32.709808349609375, "kl": 0.1025390625, "learning_rate": 8.495857233906947e-07, "loss": 0.0041, "reward": 1.6208330392837524, "reward_std": 0.14068816602230072, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6364580392837524, "rewards/pad": 0.0, "step": 472 }, { "completion_length": 276.171875, "epoch": 0.1507329509241555, "grad_norm": 51.76963806152344, "kl": 0.05126953125, "learning_rate": 8.492670490758445e-07, "loss": 0.002, "reward": 1.3026278018951416, "reward_std": 0.19959302246570587, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.25575268268585205, "rewards/pad": 0.078125, "step": 473 }, { "completion_length": 240.546875, "epoch": 0.15105162523900573, "grad_norm": 12.2163724899292, "kl": 0.08935546875, "learning_rate": 8.489483747609943e-07, "loss": 0.0036, "reward": 1.482216715812683, "reward_std": 0.14593639969825745, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.41971665620803833, "step": 474 }, { "completion_length": 264.0625, "epoch": 0.15137029955385595, "grad_norm": 11.082056045532227, "kl": 0.064453125, "learning_rate": 8.48629700446144e-07, "loss": 0.0026, "reward": 1.5266757011413574, "reward_std": 0.045626167207956314, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5266757011413574, "step": 475 }, { "completion_length": 243.375, "epoch": 0.1516889738687062, "grad_norm": 10.438226699829102, "kl": 0.06201171875, "learning_rate": 8.483110261312938e-07, "loss": 0.0025, "reward": 1.5385844707489014, "reward_std": 0.13940951228141785, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.476084440946579, "step": 476 }, { "completion_length": 193.578125, "epoch": 0.15200764818355642, "grad_norm": 9.08781909942627, "kl": 0.07763671875, "learning_rate": 8.479923518164436e-07, "loss": 0.0031, "reward": 1.4773859977722168, "reward_std": 0.10028444975614548, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.477385938167572, "step": 477 }, { "completion_length": 244.546875, "epoch": 0.15232632249840664, "grad_norm": 10.916855812072754, "kl": 0.072265625, "learning_rate": 8.476736775015934e-07, "loss": 0.0029, "reward": 1.606418490409851, "reward_std": 0.17406152188777924, "rewards/answer_reward": 0.0625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5439184904098511, "step": 478 }, { "completion_length": 273.5, "epoch": 0.15264499681325686, "grad_norm": 5.499225616455078, "kl": 0.060546875, "learning_rate": 8.473550031867431e-07, "loss": 0.0024, "reward": 1.5092673301696777, "reward_std": 0.08618798106908798, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5092673301696777, "step": 479 }, { "completion_length": 236.703125, "epoch": 0.15296367112810708, "grad_norm": 19.99932098388672, "kl": 0.056396484375, "learning_rate": 8.470363288718929e-07, "loss": 0.0023, "reward": 1.6022439002990723, "reward_std": 0.13475370407104492, "rewards/answer_reward": 0.140625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4616188406944275, "step": 480 }, { "completion_length": 326.90625, "epoch": 0.1532823454429573, "grad_norm": 10.373702049255371, "kl": 0.05810546875, "learning_rate": 8.467176545570427e-07, "loss": 0.0023, "reward": 1.3631759881973267, "reward_std": 0.11231576651334763, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.37880098819732666, "step": 481 }, { "completion_length": 244.359375, "epoch": 0.15360101975780752, "grad_norm": 10.808770179748535, "kl": 0.0712890625, "learning_rate": 8.463989802421925e-07, "loss": 0.0028, "reward": 1.432814598083496, "reward_std": 0.14196133613586426, "rewards/answer_reward": 0.078125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.35468965768814087, "step": 482 }, { "completion_length": 307.0, "epoch": 0.15391969407265774, "grad_norm": 22.456748962402344, "kl": 0.0673828125, "learning_rate": 8.460803059273422e-07, "loss": 0.0027, "reward": 1.431779384613037, "reward_std": 0.07337789237499237, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43177950382232666, "step": 483 }, { "completion_length": 372.8125, "epoch": 0.15423836838750796, "grad_norm": 7.4762139320373535, "kl": 0.037841796875, "learning_rate": 8.45761631612492e-07, "loss": 0.0015, "reward": 1.5914397239685059, "reward_std": 0.0908648818731308, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.48206475377082825, "step": 484 }, { "completion_length": 319.0, "epoch": 0.15455704270235818, "grad_norm": 7.34827995300293, "kl": 0.053955078125, "learning_rate": 8.454429572976418e-07, "loss": 0.0022, "reward": 1.6976406574249268, "reward_std": 0.19344541430473328, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.49451562762260437, "rewards/pad": 0.21875, "step": 485 }, { "completion_length": 241.71875, "epoch": 0.15487571701720843, "grad_norm": 9.271265029907227, "kl": 0.07763671875, "learning_rate": 8.451242829827916e-07, "loss": 0.0031, "reward": 1.671446442604065, "reward_std": 0.125516876578331, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5620712637901306, "rewards/pad": 0.109375, "step": 486 }, { "completion_length": 307.890625, "epoch": 0.15519439133205865, "grad_norm": 5.0968017578125, "kl": 0.05615234375, "learning_rate": 8.448056086679413e-07, "loss": 0.0022, "reward": 1.4919250011444092, "reward_std": 0.07325821369886398, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5075501203536987, "step": 487 }, { "completion_length": 281.625, "epoch": 0.15551306564690887, "grad_norm": 8.375843048095703, "kl": 0.048583984375, "learning_rate": 8.444869343530911e-07, "loss": 0.0019, "reward": 1.793757677078247, "reward_std": 0.09240252524614334, "rewards/pad": 0.296875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5125076770782471, "step": 488 }, { "completion_length": 144.453125, "epoch": 0.15583173996175909, "grad_norm": 35.22758865356445, "kl": 0.1015625, "learning_rate": 8.44168260038241e-07, "loss": 0.0041, "reward": 1.523176670074463, "reward_std": 0.10526008903980255, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5231766700744629, "rewards/pad": 0.0, "step": 489 }, { "completion_length": 159.625, "epoch": 0.1561504142766093, "grad_norm": 13.647049903869629, "kl": 0.08642578125, "learning_rate": 8.438495857233908e-07, "loss": 0.0035, "reward": 1.471238136291504, "reward_std": 0.06791481375694275, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47123825550079346, "step": 490 }, { "completion_length": 306.46875, "epoch": 0.15646908859145953, "grad_norm": 8.69845962524414, "kl": 0.05615234375, "learning_rate": 8.435309114085404e-07, "loss": 0.0023, "reward": 1.3391271829605103, "reward_std": 0.0694485455751419, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.33912718296051025, "rewards/pad": 0.0, "step": 491 }, { "completion_length": 232.046875, "epoch": 0.15678776290630975, "grad_norm": 10.28249740600586, "kl": 0.06787109375, "learning_rate": 8.432122370936902e-07, "loss": 0.0027, "reward": 1.5181519985198975, "reward_std": 0.06104717031121254, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.39315202832221985, "rewards/pad": 0.125, "step": 492 }, { "completion_length": 142.765625, "epoch": 0.15710643722115997, "grad_norm": 10.863986015319824, "kl": 0.09033203125, "learning_rate": 8.4289356277884e-07, "loss": 0.0036, "reward": 1.5782145261764526, "reward_std": 0.12296774983406067, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5000894665718079, "step": 493 }, { "completion_length": 182.34375, "epoch": 0.1574251115360102, "grad_norm": 85.28020477294922, "kl": 0.0966796875, "learning_rate": 8.425748884639897e-07, "loss": 0.0038, "reward": 1.455416202545166, "reward_std": 0.08780718594789505, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.45541617274284363, "step": 494 }, { "completion_length": 146.9375, "epoch": 0.1577437858508604, "grad_norm": 34.29008865356445, "kl": 0.08837890625, "learning_rate": 8.422562141491395e-07, "loss": 0.0035, "reward": 1.630502700805664, "reward_std": 0.13765683770179749, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5055026412010193, "rewards/pad": 0.125, "step": 495 }, { "completion_length": 223.4375, "epoch": 0.15806246016571066, "grad_norm": 7.224981784820557, "kl": 0.07958984375, "learning_rate": 8.419375398342893e-07, "loss": 0.0032, "reward": 1.4344799518585205, "reward_std": 0.07560813426971436, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4344799518585205, "rewards/pad": 0.0, "step": 496 }, { "completion_length": 287.734375, "epoch": 0.15838113448056088, "grad_norm": 5.872875690460205, "kl": 0.072265625, "learning_rate": 8.416188655194391e-07, "loss": 0.0029, "reward": 1.2482273578643799, "reward_std": 0.10398498177528381, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.26385238766670227, "step": 497 }, { "completion_length": 190.484375, "epoch": 0.1586998087954111, "grad_norm": 17.003299713134766, "kl": 0.07421875, "learning_rate": 8.413001912045888e-07, "loss": 0.003, "reward": 1.635709524154663, "reward_std": 0.15149308741092682, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4325844943523407, "rewards/pad": 0.203125, "step": 498 }, { "completion_length": 167.03125, "epoch": 0.15901848311026132, "grad_norm": 26.2998104095459, "kl": 0.08447265625, "learning_rate": 8.409815168897386e-07, "loss": 0.0034, "reward": 1.633731484413147, "reward_std": 0.21327605843544006, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4306064546108246, "rewards/pad": 0.21875, "step": 499 }, { "completion_length": 300.09375, "epoch": 0.15933715742511154, "grad_norm": 5.521867752075195, "kl": 0.054931640625, "learning_rate": 8.406628425748884e-07, "loss": 0.0022, "reward": 1.4881134033203125, "reward_std": 0.034483470022678375, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4881134629249573, "step": 500 }, { "completion_length": 256.046875, "epoch": 0.15965583173996176, "grad_norm": 10.656488418579102, "kl": 0.06884765625, "learning_rate": 8.403441682600382e-07, "loss": 0.0028, "reward": 1.7126858234405518, "reward_std": 0.09016523510217667, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.6033110022544861, "step": 501 }, { "completion_length": 302.796875, "epoch": 0.15997450605481198, "grad_norm": 7.291167259216309, "kl": 0.054443359375, "learning_rate": 8.400254939451879e-07, "loss": 0.0022, "reward": 1.5585349798202515, "reward_std": 0.10088232904672623, "rewards/answer_reward": 0.078125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.48041003942489624, "step": 502 }, { "completion_length": 225.375, "epoch": 0.1602931803696622, "grad_norm": 17.202632904052734, "kl": 0.064453125, "learning_rate": 8.397068196303377e-07, "loss": 0.0026, "reward": 1.43354070186615, "reward_std": 0.03540854528546333, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4335406720638275, "step": 503 }, { "completion_length": 275.703125, "epoch": 0.16061185468451242, "grad_norm": 11.276266098022461, "kl": 0.08154296875, "learning_rate": 8.393881453154875e-07, "loss": 0.0033, "reward": 1.45798659324646, "reward_std": 0.15015798807144165, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4267365634441376, "rewards/pad": 0.046875, "step": 504 }, { "completion_length": 343.984375, "epoch": 0.16093052899936264, "grad_norm": 3.738642454147339, "kl": 0.039306640625, "learning_rate": 8.390694710006373e-07, "loss": 0.0016, "reward": 1.6080937385559082, "reward_std": 0.022641608491539955, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6080936193466187, "step": 505 }, { "completion_length": 314.25, "epoch": 0.16124920331421289, "grad_norm": 21.043798446655273, "kl": 0.04150390625, "learning_rate": 8.38750796685787e-07, "loss": 0.0017, "reward": 1.453988790512085, "reward_std": 0.08246888220310211, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.34461385011672974, "step": 506 }, { "completion_length": 187.671875, "epoch": 0.1615678776290631, "grad_norm": 7.886463642120361, "kl": 0.0859375, "learning_rate": 8.384321223709368e-07, "loss": 0.0034, "reward": 1.611316442489624, "reward_std": 0.17711031436920166, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5175663232803345, "rewards/pad": 0.09375, "step": 507 }, { "completion_length": 243.3125, "epoch": 0.16188655194391333, "grad_norm": 5.370201110839844, "kl": 0.06201171875, "learning_rate": 8.381134480560866e-07, "loss": 0.0025, "reward": 1.4405754804611206, "reward_std": 0.13271909952163696, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3155754804611206, "step": 508 }, { "completion_length": 284.84375, "epoch": 0.16220522625876355, "grad_norm": 5.100447177886963, "kl": 0.060791015625, "learning_rate": 8.377947737412365e-07, "loss": 0.0024, "reward": 1.561161756515503, "reward_std": 0.08327622711658478, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43616175651550293, "step": 509 }, { "completion_length": 351.1875, "epoch": 0.16252390057361377, "grad_norm": 3.4032809734344482, "kl": 0.037109375, "learning_rate": 8.374760994263862e-07, "loss": 0.0015, "reward": 1.5396835803985596, "reward_std": 0.06057661026716232, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43030864000320435, "step": 510 }, { "completion_length": 259.0, "epoch": 0.162842574888464, "grad_norm": 5.124754905700684, "kl": 0.0634765625, "learning_rate": 8.37157425111536e-07, "loss": 0.0025, "reward": 1.4768354892730713, "reward_std": 0.10499419271945953, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3674604296684265, "rewards/pad": 0.109375, "step": 511 }, { "completion_length": 341.5, "epoch": 0.1631612492033142, "grad_norm": 11.388672828674316, "kl": 0.038818359375, "learning_rate": 8.368387507966858e-07, "loss": 0.0015, "reward": 1.487513542175293, "reward_std": 0.04100422561168671, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4875136613845825, "step": 512 }, { "completion_length": 271.578125, "epoch": 0.16347992351816443, "grad_norm": 17.304515838623047, "kl": 0.0615234375, "learning_rate": 8.365200764818356e-07, "loss": 0.0025, "reward": 1.50333833694458, "reward_std": 0.03414406254887581, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5033382773399353, "rewards/pad": 0.0, "step": 513 }, { "completion_length": 215.875, "epoch": 0.16379859783301465, "grad_norm": 11.562214851379395, "kl": 0.08154296875, "learning_rate": 8.362014021669853e-07, "loss": 0.0033, "reward": 1.6597778797149658, "reward_std": 0.1694626361131668, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5660279989242554, "step": 514 }, { "completion_length": 148.09375, "epoch": 0.1641172721478649, "grad_norm": 7.4554901123046875, "kl": 0.1015625, "learning_rate": 8.358827278521351e-07, "loss": 0.0041, "reward": 1.4301645755767822, "reward_std": 0.07950934767723083, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4301646053791046, "rewards/pad": 0.0, "step": 515 }, { "completion_length": 233.640625, "epoch": 0.16443594646271512, "grad_norm": 25.945159912109375, "kl": 0.06982421875, "learning_rate": 8.355640535372849e-07, "loss": 0.0028, "reward": 1.3905309438705444, "reward_std": 0.14231663942337036, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.37490594387054443, "step": 516 }, { "completion_length": 276.703125, "epoch": 0.16475462077756534, "grad_norm": 23.2886905670166, "kl": 0.06494140625, "learning_rate": 8.352453792224347e-07, "loss": 0.0026, "reward": 1.654345989227295, "reward_std": 0.13660773634910583, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4824710786342621, "step": 517 }, { "completion_length": 314.53125, "epoch": 0.16507329509241556, "grad_norm": 8.694071769714355, "kl": 0.0576171875, "learning_rate": 8.349267049075844e-07, "loss": 0.0023, "reward": 1.5877230167388916, "reward_std": 0.05514439195394516, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.337723046541214, "rewards/pad": 0.25, "step": 518 }, { "completion_length": 337.296875, "epoch": 0.16539196940726578, "grad_norm": 6.448973655700684, "kl": 0.06494140625, "learning_rate": 8.346080305927342e-07, "loss": 0.0026, "reward": 1.4133797883987427, "reward_std": 0.026479611173272133, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4133797287940979, "step": 519 }, { "completion_length": 148.125, "epoch": 0.165710643722116, "grad_norm": 6.242334365844727, "kl": 0.0791015625, "learning_rate": 8.34289356277884e-07, "loss": 0.0032, "reward": 1.4564950466156006, "reward_std": 0.05193711072206497, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3314949870109558, "rewards/pad": 0.125, "step": 520 }, { "completion_length": 267.734375, "epoch": 0.16602931803696622, "grad_norm": 20.728370666503906, "kl": 0.055419921875, "learning_rate": 8.339706819630338e-07, "loss": 0.0022, "reward": 1.562483787536621, "reward_std": 0.042223796248435974, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5624837875366211, "step": 521 }, { "completion_length": 251.484375, "epoch": 0.16634799235181644, "grad_norm": 30.452831268310547, "kl": 0.08349609375, "learning_rate": 8.336520076481835e-07, "loss": 0.0033, "reward": 1.466627836227417, "reward_std": 0.08504165709018707, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46662774682044983, "rewards/pad": 0.0, "step": 522 }, { "completion_length": 285.390625, "epoch": 0.16666666666666666, "grad_norm": 7.482173919677734, "kl": 0.06689453125, "learning_rate": 8.333333333333333e-07, "loss": 0.0027, "reward": 1.398820161819458, "reward_std": 0.10944493114948273, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.39882004261016846, "rewards/pad": 0.0, "step": 523 }, { "completion_length": 212.734375, "epoch": 0.16698534098151688, "grad_norm": 14.110750198364258, "kl": 0.11669921875, "learning_rate": 8.330146590184831e-07, "loss": 0.0047, "reward": 1.3691809177398682, "reward_std": 0.15498223900794983, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.21293091773986816, "rewards/pad": 0.171875, "step": 524 }, { "completion_length": 427.375, "epoch": 0.16730401529636713, "grad_norm": 5.056336402893066, "kl": 0.031982421875, "learning_rate": 8.326959847036329e-07, "loss": 0.0013, "reward": 1.4527101516723633, "reward_std": 0.054359886795282364, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4527100920677185, "step": 525 }, { "completion_length": 248.453125, "epoch": 0.16762268961121735, "grad_norm": 5.743110179901123, "kl": 0.058349609375, "learning_rate": 8.323773103887826e-07, "loss": 0.0023, "reward": 1.8058514595031738, "reward_std": 0.02524116262793541, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6808514595031738, "step": 526 }, { "completion_length": 208.25, "epoch": 0.16794136392606757, "grad_norm": 8.54845905303955, "kl": 0.08642578125, "learning_rate": 8.320586360739325e-07, "loss": 0.0035, "reward": 1.3544914722442627, "reward_std": 0.06807970255613327, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3544915020465851, "rewards/pad": 0.0, "step": 527 }, { "completion_length": 249.640625, "epoch": 0.1682600382409178, "grad_norm": 23.088241577148438, "kl": 0.06787109375, "learning_rate": 8.317399617590823e-07, "loss": 0.0027, "reward": 1.5373032093048096, "reward_std": 0.05144185945391655, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5373032093048096, "step": 528 }, { "completion_length": 321.75, "epoch": 0.168578712555768, "grad_norm": 19.688156127929688, "kl": 0.052490234375, "learning_rate": 8.31421287444232e-07, "loss": 0.0021, "reward": 1.4172598123550415, "reward_std": 0.08434939384460449, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3235098421573639, "rewards/pad": 0.09375, "step": 529 }, { "completion_length": 293.890625, "epoch": 0.16889738687061823, "grad_norm": 11.647110939025879, "kl": 0.05224609375, "learning_rate": 8.311026131293817e-07, "loss": 0.0021, "reward": 1.609508752822876, "reward_std": 0.13939915597438812, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6251336932182312, "rewards/pad": 0.0, "step": 530 }, { "completion_length": 277.0, "epoch": 0.16921606118546845, "grad_norm": 34.01567077636719, "kl": 0.05712890625, "learning_rate": 8.307839388145315e-07, "loss": 0.0023, "reward": 1.577805995941162, "reward_std": 0.10225144028663635, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3746810853481293, "rewards/pad": 0.203125, "step": 531 }, { "completion_length": 313.71875, "epoch": 0.16953473550031867, "grad_norm": 5.426848888397217, "kl": 0.0732421875, "learning_rate": 8.304652644996813e-07, "loss": 0.0029, "reward": 1.470942497253418, "reward_std": 0.09922796487808228, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.48656758666038513, "rewards/pad": 0.0, "step": 532 }, { "completion_length": 176.21875, "epoch": 0.1698534098151689, "grad_norm": 12.14452075958252, "kl": 0.0888671875, "learning_rate": 8.30146590184831e-07, "loss": 0.0035, "reward": 1.6665148735046387, "reward_std": 0.23820053040981293, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4008897840976715, "rewards/pad": 0.265625, "step": 533 }, { "completion_length": 247.984375, "epoch": 0.1701720841300191, "grad_norm": 4.657873630523682, "kl": 0.0693359375, "learning_rate": 8.298279158699808e-07, "loss": 0.0028, "reward": 1.4569358825683594, "reward_std": 0.11433446407318115, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.36318591237068176, "rewards/pad": 0.09375, "step": 534 }, { "completion_length": 242.4375, "epoch": 0.17049075844486936, "grad_norm": 10.610453605651855, "kl": 0.0732421875, "learning_rate": 8.295092415551306e-07, "loss": 0.0029, "reward": 1.4943509101867676, "reward_std": 0.06179783120751381, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4943508505821228, "step": 535 }, { "completion_length": 264.75, "epoch": 0.17080943275971958, "grad_norm": 5.561173439025879, "kl": 0.06396484375, "learning_rate": 8.291905672402804e-07, "loss": 0.0026, "reward": 1.734907627105713, "reward_std": 0.1684616506099701, "rewards/pad": 0.328125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4067826569080353, "step": 536 }, { "completion_length": 125.484375, "epoch": 0.1711281070745698, "grad_norm": 11.028822898864746, "kl": 0.0888671875, "learning_rate": 8.288718929254301e-07, "loss": 0.0036, "reward": 1.6976487636566162, "reward_std": 0.18358567357063293, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.369523823261261, "rewards/pad": 0.34375, "step": 537 }, { "completion_length": 203.015625, "epoch": 0.17144678138942002, "grad_norm": 9.298152923583984, "kl": 0.07421875, "learning_rate": 8.285532186105799e-07, "loss": 0.003, "reward": 1.596775770187378, "reward_std": 0.14144155383110046, "rewards/pad": 0.296875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.29990074038505554, "step": 538 }, { "completion_length": 304.765625, "epoch": 0.17176545570427024, "grad_norm": 6.667783737182617, "kl": 0.046630859375, "learning_rate": 8.282345442957297e-07, "loss": 0.0019, "reward": 1.5704073905944824, "reward_std": 0.15934810042381287, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.49228233098983765, "rewards/pad": 0.09375, "step": 539 }, { "completion_length": 342.109375, "epoch": 0.17208413001912046, "grad_norm": 4.383968830108643, "kl": 0.059326171875, "learning_rate": 8.279158699808795e-07, "loss": 0.0024, "reward": 1.5653839111328125, "reward_std": 0.10253358632326126, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.33100882172584534, "rewards/pad": 0.25, "step": 540 }, { "completion_length": 386.6875, "epoch": 0.17240280433397068, "grad_norm": 8.986936569213867, "kl": 0.06298828125, "learning_rate": 8.275971956660292e-07, "loss": 0.0025, "reward": 1.4542319774627686, "reward_std": 0.1005234345793724, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.46985694766044617, "step": 541 }, { "completion_length": 246.953125, "epoch": 0.1727214786488209, "grad_norm": 7.305197715759277, "kl": 0.091796875, "learning_rate": 8.27278521351179e-07, "loss": 0.0037, "reward": 1.3748791217803955, "reward_std": 0.12443351745605469, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3748791813850403, "rewards/pad": 0.0, "step": 542 }, { "completion_length": 284.890625, "epoch": 0.17304015296367112, "grad_norm": 8.34105396270752, "kl": 0.06298828125, "learning_rate": 8.269598470363288e-07, "loss": 0.0025, "reward": 1.3564362525939941, "reward_std": 0.15209169685840607, "rewards/answer_reward": 0.0625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.2939361333847046, "step": 543 }, { "completion_length": 222.640625, "epoch": 0.17335882727852134, "grad_norm": 7.7216796875, "kl": 0.09521484375, "learning_rate": 8.266411727214786e-07, "loss": 0.0038, "reward": 1.6124627590179443, "reward_std": 0.0886450707912445, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.36246275901794434, "rewards/pad": 0.25, "step": 544 }, { "completion_length": 249.765625, "epoch": 0.17367750159337159, "grad_norm": 10.911834716796875, "kl": 0.0810546875, "learning_rate": 8.263224984066283e-07, "loss": 0.0032, "reward": 1.4904203414916992, "reward_std": 0.049107346683740616, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.490420401096344, "rewards/pad": 0.0, "step": 545 }, { "completion_length": 185.984375, "epoch": 0.1739961759082218, "grad_norm": 9.049338340759277, "kl": 0.1025390625, "learning_rate": 8.260038240917782e-07, "loss": 0.0041, "reward": 1.7010513544082642, "reward_std": 0.07424585521221161, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5760514736175537, "rewards/pad": 0.125, "step": 546 }, { "completion_length": 244.1875, "epoch": 0.17431485022307203, "grad_norm": 7.223100185394287, "kl": 0.0771484375, "learning_rate": 8.25685149776928e-07, "loss": 0.0031, "reward": 1.5383145809173584, "reward_std": 0.10397733002901077, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.553939700126648, "step": 547 }, { "completion_length": 255.75, "epoch": 0.17463352453792225, "grad_norm": 8.514344215393066, "kl": 0.072265625, "learning_rate": 8.253664754620778e-07, "loss": 0.0029, "reward": 1.389475703239441, "reward_std": 0.07426249980926514, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3738507032394409, "step": 548 }, { "completion_length": 313.265625, "epoch": 0.17495219885277247, "grad_norm": 7.112410545349121, "kl": 0.06982421875, "learning_rate": 8.250478011472275e-07, "loss": 0.0028, "reward": 1.432603120803833, "reward_std": 0.08065502345561981, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.432603120803833, "step": 549 }, { "completion_length": 192.671875, "epoch": 0.1752708731676227, "grad_norm": 16.948640823364258, "kl": 0.10107421875, "learning_rate": 8.247291268323773e-07, "loss": 0.004, "reward": 1.6080808639526367, "reward_std": 0.11161001026630402, "rewards/answer_reward": 0.375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.23308099806308746, "step": 550 }, { "completion_length": 314.828125, "epoch": 0.1755895474824729, "grad_norm": 5.996268272399902, "kl": 0.06689453125, "learning_rate": 8.244104525175271e-07, "loss": 0.0027, "reward": 1.3713405132293701, "reward_std": 0.06664109230041504, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.37134063243865967, "step": 551 }, { "completion_length": 266.265625, "epoch": 0.17590822179732313, "grad_norm": 9.64345645904541, "kl": 0.0947265625, "learning_rate": 8.240917782026769e-07, "loss": 0.0038, "reward": 1.412318229675293, "reward_std": 0.06251288205385208, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4123181998729706, "step": 552 }, { "completion_length": 397.734375, "epoch": 0.17622689611217335, "grad_norm": 7.888392448425293, "kl": 0.036376953125, "learning_rate": 8.237731038878266e-07, "loss": 0.0014, "reward": 1.5837773084640503, "reward_std": 0.043806686997413635, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4587773382663727, "rewards/pad": 0.125, "step": 553 }, { "completion_length": 215.296875, "epoch": 0.17654557042702357, "grad_norm": 9.22640609741211, "kl": 0.07763671875, "learning_rate": 8.234544295729764e-07, "loss": 0.0031, "reward": 1.536142110824585, "reward_std": 0.1304551213979721, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42676711082458496, "rewards/pad": 0.109375, "step": 554 }, { "completion_length": 223.859375, "epoch": 0.17686424474187382, "grad_norm": 29.19223976135254, "kl": 0.0849609375, "learning_rate": 8.231357552581262e-07, "loss": 0.0034, "reward": 1.5908797979354858, "reward_std": 0.06541752815246582, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5908798575401306, "rewards/pad": 0.0, "step": 555 }, { "completion_length": 319.859375, "epoch": 0.17718291905672404, "grad_norm": 8.462791442871094, "kl": 0.058837890625, "learning_rate": 8.228170809432759e-07, "loss": 0.0024, "reward": 1.4492942094802856, "reward_std": 0.06144850328564644, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.44929423928260803, "step": 556 }, { "completion_length": 317.234375, "epoch": 0.17750159337157426, "grad_norm": 9.007328033447266, "kl": 0.06494140625, "learning_rate": 8.224984066284257e-07, "loss": 0.0026, "reward": 1.3446791172027588, "reward_std": 0.13469094038009644, "rewards/answer_reward": 0.03125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3290541172027588, "step": 557 }, { "completion_length": 235.78125, "epoch": 0.17782026768642448, "grad_norm": 29.220151901245117, "kl": 0.08544921875, "learning_rate": 8.221797323135755e-07, "loss": 0.0034, "reward": 1.438340425491333, "reward_std": 0.10674619674682617, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.43834036588668823, "rewards/pad": 0.015625, "step": 558 }, { "completion_length": 174.921875, "epoch": 0.1781389420012747, "grad_norm": 27.95044708251953, "kl": 0.08837890625, "learning_rate": 8.218610579987253e-07, "loss": 0.0035, "reward": 1.6616170406341553, "reward_std": 0.10026431083679199, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6616171002388, "rewards/pad": 0.0, "step": 559 }, { "completion_length": 387.671875, "epoch": 0.17845761631612492, "grad_norm": 12.593073844909668, "kl": 0.05615234375, "learning_rate": 8.21542383683875e-07, "loss": 0.0022, "reward": 1.4177520275115967, "reward_std": 0.05932632088661194, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.41775208711624146, "step": 560 }, { "completion_length": 222.484375, "epoch": 0.17877629063097514, "grad_norm": 20.837419509887695, "kl": 0.07958984375, "learning_rate": 8.212237093690248e-07, "loss": 0.0032, "reward": 1.5061814785003662, "reward_std": 0.10648869723081589, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3968064785003662, "rewards/pad": 0.109375, "step": 561 }, { "completion_length": 238.15625, "epoch": 0.17909496494582536, "grad_norm": 7.644387722015381, "kl": 0.1123046875, "learning_rate": 8.209050350541746e-07, "loss": 0.0045, "reward": 1.3573124408721924, "reward_std": 0.06864476948976517, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3573123514652252, "step": 562 }, { "completion_length": 231.3125, "epoch": 0.17941363926067558, "grad_norm": 9.32486629486084, "kl": 0.091796875, "learning_rate": 8.205863607393244e-07, "loss": 0.0037, "reward": 1.5059893131256104, "reward_std": 0.2075025886297226, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45911431312561035, "rewards/pad": 0.046875, "step": 563 }, { "completion_length": 273.6875, "epoch": 0.17973231357552583, "grad_norm": 19.53805923461914, "kl": 0.0673828125, "learning_rate": 8.202676864244741e-07, "loss": 0.0027, "reward": 1.6068167686462402, "reward_std": 0.10275942087173462, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.513066828250885, "rewards/pad": 0.09375, "step": 564 }, { "completion_length": 290.625, "epoch": 0.18005098789037605, "grad_norm": 4.721208572387695, "kl": 0.130859375, "learning_rate": 8.19949012109624e-07, "loss": 0.0052, "reward": 1.5925767421722412, "reward_std": 0.1044643372297287, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4675767123699188, "step": 565 }, { "completion_length": 309.234375, "epoch": 0.18036966220522627, "grad_norm": 7.534607410430908, "kl": 0.0693359375, "learning_rate": 8.196303377947738e-07, "loss": 0.0028, "reward": 1.3615877628326416, "reward_std": 0.07354053854942322, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.361587792634964, "step": 566 }, { "completion_length": 232.921875, "epoch": 0.1806883365200765, "grad_norm": 18.523197174072266, "kl": 0.08056640625, "learning_rate": 8.193116634799236e-07, "loss": 0.0032, "reward": 1.4811151027679443, "reward_std": 0.1094781905412674, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3717402219772339, "rewards/pad": 0.109375, "step": 567 }, { "completion_length": 328.984375, "epoch": 0.1810070108349267, "grad_norm": 28.020334243774414, "kl": 0.0498046875, "learning_rate": 8.189929891650733e-07, "loss": 0.002, "reward": 1.4994876384735107, "reward_std": 0.0968121737241745, "rewards/pad": 0.140625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.35886263847351074, "step": 568 }, { "completion_length": 382.9375, "epoch": 0.18132568514977693, "grad_norm": 5.482840061187744, "kl": 0.045654296875, "learning_rate": 8.18674314850223e-07, "loss": 0.0018, "reward": 1.4488133192062378, "reward_std": 0.10397167503833771, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4019383192062378, "rewards/pad": 0.046875, "step": 569 }, { "completion_length": 328.40625, "epoch": 0.18164435946462715, "grad_norm": 12.763792037963867, "kl": 0.062255859375, "learning_rate": 8.183556405353728e-07, "loss": 0.0025, "reward": 1.6313751935958862, "reward_std": 0.19210481643676758, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.30325016379356384, "rewards/pad": 0.34375, "step": 570 }, { "completion_length": 365.875, "epoch": 0.18196303377947737, "grad_norm": 6.801854610443115, "kl": 0.0517578125, "learning_rate": 8.180369662205226e-07, "loss": 0.0021, "reward": 1.2952144145965576, "reward_std": 0.05773142725229263, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.29521435499191284, "rewards/pad": 0.0, "step": 571 }, { "completion_length": 301.984375, "epoch": 0.1822817080943276, "grad_norm": 9.7481689453125, "kl": 0.0654296875, "learning_rate": 8.177182919056723e-07, "loss": 0.0026, "reward": 1.4989292621612549, "reward_std": 0.14893344044685364, "rewards/answer_reward": 0.0625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4364292621612549, "step": 572 }, { "completion_length": 313.953125, "epoch": 0.1826003824091778, "grad_norm": 5.001101016998291, "kl": 0.058837890625, "learning_rate": 8.173996175908221e-07, "loss": 0.0024, "reward": 1.522552728652954, "reward_std": 0.06399345397949219, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5225528478622437, "rewards/pad": 0.0, "step": 573 }, { "completion_length": 324.203125, "epoch": 0.18291905672402806, "grad_norm": 6.752080917358398, "kl": 0.058349609375, "learning_rate": 8.170809432759719e-07, "loss": 0.0023, "reward": 1.5995622873306274, "reward_std": 0.14907418191432953, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5370622277259827, "rewards/pad": 0.078125, "step": 574 }, { "completion_length": 303.546875, "epoch": 0.18323773103887828, "grad_norm": 6.67637300491333, "kl": 0.07080078125, "learning_rate": 8.167622689611217e-07, "loss": 0.0028, "reward": 1.6408474445343018, "reward_std": 0.10487158596515656, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.422097384929657, "rewards/pad": 0.21875, "step": 575 }, { "completion_length": 222.40625, "epoch": 0.1835564053537285, "grad_norm": 7.407890796661377, "kl": 0.08544921875, "learning_rate": 8.164435946462714e-07, "loss": 0.0034, "reward": 1.5350260734558105, "reward_std": 0.12564942240715027, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3944009840488434, "rewards/pad": 0.140625, "step": 576 }, { "completion_length": 247.796875, "epoch": 0.18387507966857872, "grad_norm": 9.383212089538574, "kl": 0.091796875, "learning_rate": 8.161249203314212e-07, "loss": 0.0037, "reward": 1.7426847219467163, "reward_std": 0.1123775988817215, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5083097219467163, "step": 577 }, { "completion_length": 363.25, "epoch": 0.18419375398342894, "grad_norm": 17.369152069091797, "kl": 0.0595703125, "learning_rate": 8.15806246016571e-07, "loss": 0.0024, "reward": 1.4469183683395386, "reward_std": 0.06069286912679672, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.44691839814186096, "step": 578 }, { "completion_length": 197.15625, "epoch": 0.18451242829827916, "grad_norm": 10.02291488647461, "kl": 0.10009765625, "learning_rate": 8.154875717017208e-07, "loss": 0.004, "reward": 1.4997634887695312, "reward_std": 0.14637836813926697, "rewards/answer_reward": 0.078125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4216384291648865, "step": 579 }, { "completion_length": 319.875, "epoch": 0.18483110261312938, "grad_norm": 5.547020435333252, "kl": 0.0712890625, "learning_rate": 8.151688973868705e-07, "loss": 0.0029, "reward": 1.5564733743667603, "reward_std": 0.21002715826034546, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.587723433971405, "rewards/pad": 0.0, "step": 580 }, { "completion_length": 228.546875, "epoch": 0.1851497769279796, "grad_norm": 7.936356544494629, "kl": 0.0859375, "learning_rate": 8.148502230720203e-07, "loss": 0.0034, "reward": 1.9707231521606445, "reward_std": 0.17871692776679993, "rewards/pad": 0.453125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5332232117652893, "step": 581 }, { "completion_length": 443.796875, "epoch": 0.18546845124282982, "grad_norm": 11.128722190856934, "kl": 0.03466796875, "learning_rate": 8.145315487571701e-07, "loss": 0.0014, "reward": 1.5804551839828491, "reward_std": 0.06354731321334839, "rewards/answer_reward": 0.015625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5648301243782043, "step": 582 }, { "completion_length": 176.03125, "epoch": 0.18578712555768004, "grad_norm": 10.60787296295166, "kl": 0.08935546875, "learning_rate": 8.1421287444232e-07, "loss": 0.0036, "reward": 1.6280150413513184, "reward_std": 0.07694578170776367, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5030151009559631, "step": 583 }, { "completion_length": 329.921875, "epoch": 0.1861057998725303, "grad_norm": 12.799588203430176, "kl": 0.06494140625, "learning_rate": 8.138942001274697e-07, "loss": 0.0026, "reward": 1.241806983947754, "reward_std": 0.1604565978050232, "rewards/answer_reward": 0.015625, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.24180711805820465, "step": 584 }, { "completion_length": 455.765625, "epoch": 0.1864244741873805, "grad_norm": 15.1790132522583, "kl": 0.03515625, "learning_rate": 8.135755258126195e-07, "loss": 0.0014, "reward": 1.3431535959243774, "reward_std": 0.08880558609962463, "rewards/pad": 0.046875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.29627859592437744, "step": 585 }, { "completion_length": 440.984375, "epoch": 0.18674314850223073, "grad_norm": 12.594864845275879, "kl": 0.0341796875, "learning_rate": 8.132568514977693e-07, "loss": 0.0013, "reward": 1.6465810537338257, "reward_std": 0.024850212037563324, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5215809345245361, "step": 586 }, { "completion_length": 395.734375, "epoch": 0.18706182281708095, "grad_norm": 7.501439571380615, "kl": 0.047119140625, "learning_rate": 8.12938177182919e-07, "loss": 0.0019, "reward": 1.5232059955596924, "reward_std": 0.06539130210876465, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5232060551643372, "rewards/pad": 0.0, "step": 587 }, { "completion_length": 453.578125, "epoch": 0.18738049713193117, "grad_norm": 9.147506713867188, "kl": 0.04443359375, "learning_rate": 8.126195028680688e-07, "loss": 0.0018, "reward": 1.4919631481170654, "reward_std": 0.17542997002601624, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5232131481170654, "rewards/pad": 0.0, "step": 588 }, { "completion_length": 256.046875, "epoch": 0.1876991714467814, "grad_norm": 32.91367721557617, "kl": 0.061279296875, "learning_rate": 8.123008285532186e-07, "loss": 0.0025, "reward": 1.458876371383667, "reward_std": 0.06437614560127258, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.333876371383667, "rewards/pad": 0.125, "step": 589 }, { "completion_length": 372.671875, "epoch": 0.1880178457616316, "grad_norm": 7.198757648468018, "kl": 0.05029296875, "learning_rate": 8.119821542383684e-07, "loss": 0.002, "reward": 1.3763469457626343, "reward_std": 0.02698800340294838, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3763468563556671, "step": 590 }, { "completion_length": 326.828125, "epoch": 0.18833652007648183, "grad_norm": 7.202263355255127, "kl": 0.05810546875, "learning_rate": 8.116634799235181e-07, "loss": 0.0023, "reward": 1.6532502174377441, "reward_std": 0.10701927542686462, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5438752770423889, "step": 591 }, { "completion_length": 314.234375, "epoch": 0.18865519439133205, "grad_norm": 6.360630035400391, "kl": 0.068359375, "learning_rate": 8.113448056086679e-07, "loss": 0.0027, "reward": 1.6760071516036987, "reward_std": 0.08180226385593414, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5510071516036987, "rewards/pad": 0.125, "step": 592 }, { "completion_length": 357.375, "epoch": 0.18897386870618227, "grad_norm": 8.12671947479248, "kl": 0.05615234375, "learning_rate": 8.110261312938177e-07, "loss": 0.0023, "reward": 1.4480271339416504, "reward_std": 0.1161215677857399, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4480271339416504, "step": 593 }, { "completion_length": 367.203125, "epoch": 0.18929254302103252, "grad_norm": 5.130788326263428, "kl": 0.05322265625, "learning_rate": 8.107074569789675e-07, "loss": 0.0021, "reward": 1.4342541694641113, "reward_std": 0.1340075582265854, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.34050410985946655, "step": 594 }, { "completion_length": 161.765625, "epoch": 0.18961121733588274, "grad_norm": 10.24941349029541, "kl": 0.0908203125, "learning_rate": 8.103887826641172e-07, "loss": 0.0036, "reward": 1.6754908561706543, "reward_std": 0.12678924202919006, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5504908561706543, "rewards/pad": 0.125, "step": 595 }, { "completion_length": 260.359375, "epoch": 0.18992989165073296, "grad_norm": 19.758913040161133, "kl": 0.0810546875, "learning_rate": 8.10070108349267e-07, "loss": 0.0032, "reward": 1.43495512008667, "reward_std": 0.20263132452964783, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3568301796913147, "rewards/pad": 0.09375, "step": 596 }, { "completion_length": 468.34375, "epoch": 0.19024856596558318, "grad_norm": 5.984434127807617, "kl": 0.03515625, "learning_rate": 8.097514340344168e-07, "loss": 0.0014, "reward": 1.4179304838180542, "reward_std": 0.07944416999816895, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4335554540157318, "step": 597 }, { "completion_length": 187.96875, "epoch": 0.1905672402804334, "grad_norm": 79.40922546386719, "kl": 0.080078125, "learning_rate": 8.094327597195666e-07, "loss": 0.0032, "reward": 1.5468125343322754, "reward_std": 0.17716550827026367, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4374375343322754, "step": 598 }, { "completion_length": 276.34375, "epoch": 0.19088591459528362, "grad_norm": 13.014153480529785, "kl": 0.0654296875, "learning_rate": 8.091140854047163e-07, "loss": 0.0026, "reward": 1.6321213245391846, "reward_std": 0.18163417279720306, "rewards/pad": 0.265625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.36649632453918457, "step": 599 }, { "completion_length": 140.703125, "epoch": 0.19120458891013384, "grad_norm": 88.6712417602539, "kl": 0.08154296875, "learning_rate": 8.087954110898661e-07, "loss": 0.0033, "reward": 1.6089478731155396, "reward_std": 0.1710633635520935, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.39019784331321716, "rewards/pad": 0.21875, "step": 600 }, { "completion_length": 283.28125, "epoch": 0.19152326322498406, "grad_norm": 8.6228609085083, "kl": 0.0791015625, "learning_rate": 8.084767367750159e-07, "loss": 0.0032, "reward": 1.5642704963684082, "reward_std": 0.11657722294330597, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.454895555973053, "step": 601 }, { "completion_length": 319.125, "epoch": 0.19184193753983428, "grad_norm": 10.412154197692871, "kl": 0.06591796875, "learning_rate": 8.081580624601658e-07, "loss": 0.0026, "reward": 1.352163314819336, "reward_std": 0.07580342888832092, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3521634042263031, "rewards/pad": 0.0, "step": 602 }, { "completion_length": 278.3125, "epoch": 0.1921606118546845, "grad_norm": 7.997784614562988, "kl": 0.060546875, "learning_rate": 8.078393881453155e-07, "loss": 0.0024, "reward": 1.5723495483398438, "reward_std": 0.10167841613292694, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5723496079444885, "rewards/pad": 0.0, "step": 603 }, { "completion_length": 179.96875, "epoch": 0.19247928616953475, "grad_norm": 7.610307693481445, "kl": 0.07666015625, "learning_rate": 8.075207138304653e-07, "loss": 0.0031, "reward": 1.6623280048370361, "reward_std": 0.1471826136112213, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.41232794523239136, "rewards/pad": 0.25, "step": 604 }, { "completion_length": 243.203125, "epoch": 0.19279796048438497, "grad_norm": 9.530278205871582, "kl": 0.07275390625, "learning_rate": 8.072020395156151e-07, "loss": 0.0029, "reward": 1.700823187828064, "reward_std": 0.11023609340190887, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.46644818782806396, "step": 605 }, { "completion_length": 267.640625, "epoch": 0.1931166347992352, "grad_norm": 4.225679397583008, "kl": 0.076171875, "learning_rate": 8.068833652007649e-07, "loss": 0.003, "reward": 1.633243441581726, "reward_std": 0.1748502105474472, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4144933819770813, "step": 606 }, { "completion_length": 405.625, "epoch": 0.1934353091140854, "grad_norm": 16.529685974121094, "kl": 0.051025390625, "learning_rate": 8.065646908859146e-07, "loss": 0.002, "reward": 1.3954849243164062, "reward_std": 0.08648538589477539, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.2861098051071167, "step": 607 }, { "completion_length": 263.96875, "epoch": 0.19375398342893563, "grad_norm": 6.294305324554443, "kl": 0.0732421875, "learning_rate": 8.062460165710643e-07, "loss": 0.0029, "reward": 1.2127344608306885, "reward_std": 0.0745549276471138, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.21273449063301086, "step": 608 }, { "completion_length": 229.03125, "epoch": 0.19407265774378585, "grad_norm": 12.357077598571777, "kl": 0.068359375, "learning_rate": 8.059273422562141e-07, "loss": 0.0027, "reward": 1.5320799350738525, "reward_std": 0.0908544585108757, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5320799350738525, "step": 609 }, { "completion_length": 250.765625, "epoch": 0.19439133205863607, "grad_norm": 9.141507148742676, "kl": 0.0732421875, "learning_rate": 8.056086679413639e-07, "loss": 0.0029, "reward": 1.5585919618606567, "reward_std": 0.1449308544397354, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5273419618606567, "rewards/pad": 0.03125, "step": 610 }, { "completion_length": 320.9375, "epoch": 0.1947100063734863, "grad_norm": 4.861262798309326, "kl": 0.06005859375, "learning_rate": 8.052899936265136e-07, "loss": 0.0024, "reward": 1.5317165851593018, "reward_std": 0.15812718868255615, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.46921658515930176, "rewards/pad": 0.09375, "step": 611 }, { "completion_length": 268.875, "epoch": 0.1950286806883365, "grad_norm": 28.3754825592041, "kl": 0.06884765625, "learning_rate": 8.049713193116634e-07, "loss": 0.0028, "reward": 1.4912457466125488, "reward_std": 0.04721307381987572, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49124568700790405, "rewards/pad": 0.0, "step": 612 }, { "completion_length": 235.453125, "epoch": 0.19534735500318676, "grad_norm": 7.317533493041992, "kl": 0.0771484375, "learning_rate": 8.046526449968132e-07, "loss": 0.0031, "reward": 1.3803117275238037, "reward_std": 0.11250771582126617, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.39593687653541565, "rewards/pad": 0.0, "step": 613 }, { "completion_length": 226.03125, "epoch": 0.19566602931803698, "grad_norm": 38.98457336425781, "kl": 0.0703125, "learning_rate": 8.043339706819629e-07, "loss": 0.0028, "reward": 1.6466970443725586, "reward_std": 0.07787991315126419, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5216969847679138, "step": 614 }, { "completion_length": 232.640625, "epoch": 0.1959847036328872, "grad_norm": 9.064290046691895, "kl": 0.072265625, "learning_rate": 8.040152963671127e-07, "loss": 0.0029, "reward": 1.4208617210388184, "reward_std": 0.2527421712875366, "rewards/pad": 0.140625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.29586169123649597, "step": 615 }, { "completion_length": 233.796875, "epoch": 0.19630337794773742, "grad_norm": 5.284163951873779, "kl": 0.08740234375, "learning_rate": 8.036966220522625e-07, "loss": 0.0035, "reward": 1.4077630043029785, "reward_std": 0.13147412240505219, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.42338788509368896, "rewards/pad": 0.0, "step": 616 }, { "completion_length": 284.546875, "epoch": 0.19662205226258764, "grad_norm": 25.7481632232666, "kl": 0.072265625, "learning_rate": 8.033779477374123e-07, "loss": 0.0029, "reward": 1.5234477519989014, "reward_std": 0.060542307794094086, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5234477519989014, "step": 617 }, { "completion_length": 264.3125, "epoch": 0.19694072657743786, "grad_norm": 7.553581237792969, "kl": 0.07275390625, "learning_rate": 8.03059273422562e-07, "loss": 0.0029, "reward": 1.4492477178573608, "reward_std": 0.09408283978700638, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4492477774620056, "rewards/pad": 0.0, "step": 618 }, { "completion_length": 266.9375, "epoch": 0.19725940089228808, "grad_norm": 6.398976802825928, "kl": 0.07958984375, "learning_rate": 8.027405991077118e-07, "loss": 0.0032, "reward": 1.5198715925216675, "reward_std": 0.15117725729942322, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5354965925216675, "step": 619 }, { "completion_length": 330.71875, "epoch": 0.1975780752071383, "grad_norm": 5.78820276260376, "kl": 0.07568359375, "learning_rate": 8.024219247928616e-07, "loss": 0.003, "reward": 1.3081625699996948, "reward_std": 0.09341893345117569, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3081625699996948, "rewards/pad": 0.0, "step": 620 }, { "completion_length": 389.359375, "epoch": 0.19789674952198852, "grad_norm": 13.41144847869873, "kl": 0.055419921875, "learning_rate": 8.021032504780114e-07, "loss": 0.0022, "reward": 1.5052798986434937, "reward_std": 0.21025700867176056, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5209048390388489, "rewards/pad": 0.015625, "step": 621 }, { "completion_length": 312.203125, "epoch": 0.19821542383683874, "grad_norm": 41.177154541015625, "kl": 0.1015625, "learning_rate": 8.017845761631612e-07, "loss": 0.0041, "reward": 1.566091537475586, "reward_std": 0.1455710232257843, "rewards/answer_reward": 0.03125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5348414778709412, "step": 622 }, { "completion_length": 491.96875, "epoch": 0.198534098151689, "grad_norm": 2.699554920196533, "kl": 0.037353515625, "learning_rate": 8.01465901848311e-07, "loss": 0.0015, "reward": 1.5370073318481445, "reward_std": 0.14047783613204956, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.3182574212551117, "step": 623 }, { "completion_length": 431.703125, "epoch": 0.1988527724665392, "grad_norm": 5.324953556060791, "kl": 0.0498046875, "learning_rate": 8.011472275334608e-07, "loss": 0.002, "reward": 1.475203275680542, "reward_std": 0.15671223402023315, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.3814533054828644, "step": 624 }, { "completion_length": 307.0625, "epoch": 0.19917144678138943, "grad_norm": 6.110250949859619, "kl": 0.14453125, "learning_rate": 8.008285532186106e-07, "loss": 0.0058, "reward": 1.4999936819076538, "reward_std": 0.1042509377002716, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4999937415122986, "step": 625 }, { "completion_length": 255.578125, "epoch": 0.19949012109623965, "grad_norm": 10.814800262451172, "kl": 0.07958984375, "learning_rate": 8.005098789037603e-07, "loss": 0.0032, "reward": 1.4932688474655151, "reward_std": 0.08080501109361649, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3682689070701599, "step": 626 }, { "completion_length": 157.046875, "epoch": 0.19980879541108987, "grad_norm": 13.435721397399902, "kl": 0.1025390625, "learning_rate": 8.001912045889101e-07, "loss": 0.0041, "reward": 1.4771438837051392, "reward_std": 0.11090736836194992, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47714388370513916, "rewards/pad": 0.0, "step": 627 }, { "completion_length": 341.984375, "epoch": 0.2001274697259401, "grad_norm": 7.098702430725098, "kl": 0.0712890625, "learning_rate": 7.998725302740599e-07, "loss": 0.0028, "reward": 1.6881647109985352, "reward_std": 0.14802853763103485, "rewards/answer_reward": 0.0625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.6256648302078247, "step": 628 }, { "completion_length": 205.40625, "epoch": 0.2004461440407903, "grad_norm": 8.152799606323242, "kl": 0.08837890625, "learning_rate": 7.995538559592097e-07, "loss": 0.0035, "reward": 1.4634058475494385, "reward_std": 0.12990927696228027, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4321558475494385, "rewards/pad": 0.03125, "step": 629 }, { "completion_length": 288.375, "epoch": 0.20076481835564053, "grad_norm": 7.801412582397461, "kl": 0.07958984375, "learning_rate": 7.992351816443594e-07, "loss": 0.0032, "reward": 1.6792851686477661, "reward_std": 0.12204693257808685, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3199102282524109, "rewards/pad": 0.359375, "step": 630 }, { "completion_length": 332.359375, "epoch": 0.20108349267049075, "grad_norm": 56.13440704345703, "kl": 0.08154296875, "learning_rate": 7.989165073295092e-07, "loss": 0.0033, "reward": 1.4205741882324219, "reward_std": 0.10428917407989502, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.42057424783706665, "step": 631 }, { "completion_length": 342.0625, "epoch": 0.20140216698534097, "grad_norm": 48.81986618041992, "kl": 0.0830078125, "learning_rate": 7.98597833014659e-07, "loss": 0.0033, "reward": 1.4643616676330566, "reward_std": 0.1833011955022812, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4956117570400238, "step": 632 }, { "completion_length": 326.984375, "epoch": 0.20172084130019122, "grad_norm": 11.626622200012207, "kl": 0.06884765625, "learning_rate": 7.982791586998088e-07, "loss": 0.0027, "reward": 1.444153904914856, "reward_std": 0.10968662798404694, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44415396451950073, "rewards/pad": 0.0, "step": 633 }, { "completion_length": 498.578125, "epoch": 0.20203951561504144, "grad_norm": 9.971516609191895, "kl": 0.061767578125, "learning_rate": 7.979604843849585e-07, "loss": 0.0025, "reward": 1.4112811088562012, "reward_std": 0.22392134368419647, "rewards/format_reward_tg": 0.953125, "rewards/iou_timestamp_reward": 0.45815610885620117, "rewards/pad": 0.0, "step": 634 }, { "completion_length": 241.34375, "epoch": 0.20235818992989166, "grad_norm": 4.857548713684082, "kl": 0.08447265625, "learning_rate": 7.976418100701083e-07, "loss": 0.0034, "reward": 1.5090776681900024, "reward_std": 0.2434341311454773, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.36845266819000244, "rewards/pad": 0.171875, "step": 635 }, { "completion_length": 480.46875, "epoch": 0.20267686424474188, "grad_norm": 11.361845970153809, "kl": 0.053955078125, "learning_rate": 7.973231357552581e-07, "loss": 0.0022, "reward": 1.4582862854003906, "reward_std": 0.056097157299518585, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45828622579574585, "rewards/pad": 0.0, "step": 636 }, { "completion_length": 309.1875, "epoch": 0.2029955385595921, "grad_norm": 17.00067901611328, "kl": 0.07275390625, "learning_rate": 7.970044614404079e-07, "loss": 0.0029, "reward": 1.5798858404159546, "reward_std": 0.09062710404396057, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5798859000205994, "rewards/pad": 0.0, "step": 637 }, { "completion_length": 386.1875, "epoch": 0.20331421287444232, "grad_norm": 5.623465538024902, "kl": 0.05810546875, "learning_rate": 7.966857871255576e-07, "loss": 0.0023, "reward": 1.492444396018982, "reward_std": 0.04644552618265152, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49244433641433716, "step": 638 }, { "completion_length": 340.109375, "epoch": 0.20363288718929254, "grad_norm": 6.998098373413086, "kl": 0.0693359375, "learning_rate": 7.963671128107074e-07, "loss": 0.0028, "reward": 1.4608345031738281, "reward_std": 0.11724086850881577, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4764595329761505, "rewards/pad": 0.0, "step": 639 }, { "completion_length": 251.234375, "epoch": 0.20395156150414276, "grad_norm": 7.686694145202637, "kl": 0.103515625, "learning_rate": 7.960484384958573e-07, "loss": 0.0041, "reward": 1.6384403705596924, "reward_std": 0.12739157676696777, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5290653705596924, "rewards/pad": 0.125, "step": 640 }, { "completion_length": 280.984375, "epoch": 0.20427023581899298, "grad_norm": 9.52418327331543, "kl": 0.07958984375, "learning_rate": 7.957297641810071e-07, "loss": 0.0032, "reward": 1.3519306182861328, "reward_std": 0.13024069368839264, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.35193055868148804, "rewards/pad": 0.0, "step": 641 }, { "completion_length": 382.640625, "epoch": 0.2045889101338432, "grad_norm": 6.8839874267578125, "kl": 0.0859375, "learning_rate": 7.954110898661568e-07, "loss": 0.0034, "reward": 1.5839730501174927, "reward_std": 0.13091622292995453, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5995981097221375, "rewards/pad": 0.0, "step": 642 }, { "completion_length": 219.859375, "epoch": 0.20490758444869345, "grad_norm": 8.28393840789795, "kl": 0.08984375, "learning_rate": 7.950924155513066e-07, "loss": 0.0036, "reward": 1.5429143905639648, "reward_std": 0.17061397433280945, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5741643905639648, "rewards/pad": 0.0, "step": 643 }, { "completion_length": 446.3125, "epoch": 0.20522625876354367, "grad_norm": 7.347504615783691, "kl": 0.044189453125, "learning_rate": 7.947737412364564e-07, "loss": 0.0018, "reward": 1.511324405670166, "reward_std": 0.14396220445632935, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.417574405670166, "step": 644 }, { "completion_length": 311.671875, "epoch": 0.2055449330783939, "grad_norm": 11.247817039489746, "kl": 0.06982421875, "learning_rate": 7.944550669216062e-07, "loss": 0.0028, "reward": 1.5525568723678589, "reward_std": 0.1326773762702942, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5681818127632141, "rewards/pad": 0.0, "step": 645 }, { "completion_length": 325.859375, "epoch": 0.2058636073932441, "grad_norm": 14.660234451293945, "kl": 0.07470703125, "learning_rate": 7.941363926067559e-07, "loss": 0.003, "reward": 1.2435545921325684, "reward_std": 0.06781746447086334, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.25917962193489075, "rewards/pad": 0.0, "step": 646 }, { "completion_length": 291.4375, "epoch": 0.20618228170809433, "grad_norm": 4.295619964599609, "kl": 0.056884765625, "learning_rate": 7.938177182919057e-07, "loss": 0.0023, "reward": 1.562973976135254, "reward_std": 0.0560150146484375, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4379739761352539, "rewards/pad": 0.125, "step": 647 }, { "completion_length": 393.671875, "epoch": 0.20650095602294455, "grad_norm": 8.650598526000977, "kl": 0.048583984375, "learning_rate": 7.934990439770554e-07, "loss": 0.0019, "reward": 1.2792221307754517, "reward_std": 0.04953842982649803, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.27922216057777405, "step": 648 }, { "completion_length": 247.828125, "epoch": 0.20681963033779477, "grad_norm": 14.719244003295898, "kl": 0.0859375, "learning_rate": 7.931803696622051e-07, "loss": 0.0034, "reward": 1.6231489181518555, "reward_std": 0.13309356570243835, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4043988883495331, "rewards/pad": 0.21875, "step": 649 }, { "completion_length": 289.953125, "epoch": 0.207138304652645, "grad_norm": 10.855441093444824, "kl": 0.072265625, "learning_rate": 7.928616953473549e-07, "loss": 0.0029, "reward": 1.5013923645019531, "reward_std": 0.11773170530796051, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.40764227509498596, "rewards/pad": 0.109375, "step": 650 }, { "completion_length": 231.125, "epoch": 0.2074569789674952, "grad_norm": 9.067728042602539, "kl": 0.07177734375, "learning_rate": 7.925430210325047e-07, "loss": 0.0029, "reward": 1.5861713886260986, "reward_std": 0.12338827550411224, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5392964482307434, "rewards/pad": 0.046875, "step": 651 }, { "completion_length": 306.40625, "epoch": 0.20777565328234543, "grad_norm": 6.772564888000488, "kl": 0.06689453125, "learning_rate": 7.922243467176545e-07, "loss": 0.0027, "reward": 1.4238431453704834, "reward_std": 0.08837040513753891, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4082180857658386, "rewards/pad": 0.015625, "step": 652 }, { "completion_length": 296.8125, "epoch": 0.20809432759719568, "grad_norm": 6.618319034576416, "kl": 0.06494140625, "learning_rate": 7.919056724028042e-07, "loss": 0.0026, "reward": 1.3240604400634766, "reward_std": 0.1518493890762329, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3396855592727661, "rewards/pad": 0.0, "step": 653 }, { "completion_length": 300.421875, "epoch": 0.2084130019120459, "grad_norm": 11.761967658996582, "kl": 0.07666015625, "learning_rate": 7.91586998087954e-07, "loss": 0.0031, "reward": 1.5044121742248535, "reward_std": 0.06377843022346497, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5044121742248535, "rewards/pad": 0.0, "step": 654 }, { "completion_length": 414.296875, "epoch": 0.20873167622689612, "grad_norm": 14.195477485656738, "kl": 0.08642578125, "learning_rate": 7.912683237731038e-07, "loss": 0.0035, "reward": 1.5149987936019897, "reward_std": 0.07117429375648499, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.514998733997345, "step": 655 }, { "completion_length": 234.09375, "epoch": 0.20905035054174634, "grad_norm": 11.114954948425293, "kl": 0.07666015625, "learning_rate": 7.909496494582536e-07, "loss": 0.0031, "reward": 1.4249340295791626, "reward_std": 0.1590331494808197, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.424934059381485, "rewards/pad": 0.015625, "step": 656 }, { "completion_length": 388.84375, "epoch": 0.20936902485659656, "grad_norm": 4.132210731506348, "kl": 0.06494140625, "learning_rate": 7.906309751434033e-07, "loss": 0.0026, "reward": 1.378748893737793, "reward_std": 0.2675632834434509, "rewards/format_reward_tg": 0.953125, "rewards/iou_timestamp_reward": 0.39437395334243774, "rewards/pad": 0.03125, "step": 657 }, { "completion_length": 257.328125, "epoch": 0.20968769917144678, "grad_norm": 8.155116081237793, "kl": 0.080078125, "learning_rate": 7.903123008285531e-07, "loss": 0.0032, "reward": 1.4638173580169678, "reward_std": 0.17092521488666534, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.29194238781929016, "rewards/pad": 0.1875, "step": 658 }, { "completion_length": 364.8125, "epoch": 0.210006373486297, "grad_norm": 8.514001846313477, "kl": 0.06884765625, "learning_rate": 7.89993626513703e-07, "loss": 0.0028, "reward": 1.4162960052490234, "reward_std": 0.055003244429826736, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4162960648536682, "rewards/pad": 0.0, "step": 659 }, { "completion_length": 302.5625, "epoch": 0.21032504780114722, "grad_norm": 5.264654159545898, "kl": 0.07421875, "learning_rate": 7.896749521988528e-07, "loss": 0.003, "reward": 1.4601562023162842, "reward_std": 0.07050922513008118, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46015623211860657, "rewards/pad": 0.0, "step": 660 }, { "completion_length": 300.1875, "epoch": 0.21064372211599744, "grad_norm": 4.894284248352051, "kl": 0.06982421875, "learning_rate": 7.893562778840025e-07, "loss": 0.0028, "reward": 1.594339370727539, "reward_std": 0.08459107577800751, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5787143707275391, "rewards/pad": 0.015625, "step": 661 }, { "completion_length": 361.890625, "epoch": 0.2109623964308477, "grad_norm": 27.475067138671875, "kl": 0.0517578125, "learning_rate": 7.890376035691523e-07, "loss": 0.0021, "reward": 1.4996387958526611, "reward_std": 0.07242294400930405, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3746388852596283, "rewards/pad": 0.125, "step": 662 }, { "completion_length": 232.5625, "epoch": 0.2112810707456979, "grad_norm": 5.580474853515625, "kl": 0.08251953125, "learning_rate": 7.887189292543021e-07, "loss": 0.0033, "reward": 1.6235365867614746, "reward_std": 0.08368790149688721, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6235365867614746, "rewards/pad": 0.0, "step": 663 }, { "completion_length": 273.9375, "epoch": 0.21159974506054813, "grad_norm": 6.202267169952393, "kl": 0.0751953125, "learning_rate": 7.884002549394519e-07, "loss": 0.003, "reward": 1.5605616569519043, "reward_std": 0.07850479334592819, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5605616569519043, "rewards/pad": 0.0, "step": 664 }, { "completion_length": 287.671875, "epoch": 0.21191841937539835, "grad_norm": 14.803051948547363, "kl": 0.0673828125, "learning_rate": 7.880815806246016e-07, "loss": 0.0027, "reward": 1.4791467189788818, "reward_std": 0.19891056418418884, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3697718381881714, "step": 665 }, { "completion_length": 361.96875, "epoch": 0.21223709369024857, "grad_norm": 6.08893346786499, "kl": 0.051025390625, "learning_rate": 7.877629063097514e-07, "loss": 0.002, "reward": 1.542660117149353, "reward_std": 0.10150934010744095, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.433285117149353, "step": 666 }, { "completion_length": 272.828125, "epoch": 0.2125557680050988, "grad_norm": 10.113510131835938, "kl": 0.10400390625, "learning_rate": 7.874442319949012e-07, "loss": 0.0042, "reward": 1.5347037315368652, "reward_std": 0.05837291479110718, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5347037315368652, "rewards/pad": 0.0, "step": 667 }, { "completion_length": 348.390625, "epoch": 0.212874442319949, "grad_norm": 6.178546905517578, "kl": 0.052978515625, "learning_rate": 7.87125557680051e-07, "loss": 0.0021, "reward": 1.5060725212097168, "reward_std": 0.1880113184452057, "rewards/answer_reward": 0.15625, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.365447461605072, "step": 668 }, { "completion_length": 383.359375, "epoch": 0.21319311663479923, "grad_norm": 10.284262657165527, "kl": 0.055908203125, "learning_rate": 7.868068833652007e-07, "loss": 0.0022, "reward": 1.4481096267700195, "reward_std": 0.109918974339962, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.46373462677001953, "step": 669 }, { "completion_length": 283.953125, "epoch": 0.21351179094964945, "grad_norm": 17.789011001586914, "kl": 0.076171875, "learning_rate": 7.864882090503505e-07, "loss": 0.0031, "reward": 1.293283224105835, "reward_std": 0.12071557343006134, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.30890828371047974, "rewards/pad": 0.0, "step": 670 }, { "completion_length": 225.4375, "epoch": 0.21383046526449967, "grad_norm": 13.195700645446777, "kl": 0.08544921875, "learning_rate": 7.861695347355003e-07, "loss": 0.0034, "reward": 1.5755658149719238, "reward_std": 0.10046328604221344, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5755658745765686, "rewards/pad": 0.0, "step": 671 }, { "completion_length": 261.46875, "epoch": 0.21414913957934992, "grad_norm": 5.178892135620117, "kl": 0.0654296875, "learning_rate": 7.858508604206501e-07, "loss": 0.0026, "reward": 1.61923086643219, "reward_std": 0.06453816592693329, "rewards/pad": 0.375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.2442309409379959, "step": 672 }, { "completion_length": 316.890625, "epoch": 0.21446781389420014, "grad_norm": 12.810659408569336, "kl": 0.06640625, "learning_rate": 7.855321861057998e-07, "loss": 0.0027, "reward": 1.6415035724639893, "reward_std": 0.046421363949775696, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5165035724639893, "step": 673 }, { "completion_length": 327.6875, "epoch": 0.21478648820905036, "grad_norm": 13.431648254394531, "kl": 0.064453125, "learning_rate": 7.852135117909496e-07, "loss": 0.0026, "reward": 1.3542231321334839, "reward_std": 0.10205789655447006, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.33859819173812866, "step": 674 }, { "completion_length": 278.578125, "epoch": 0.21510516252390058, "grad_norm": 7.597766399383545, "kl": 0.07958984375, "learning_rate": 7.848948374760994e-07, "loss": 0.0032, "reward": 1.4646666049957275, "reward_std": 0.26397255063056946, "rewards/answer_reward": 0.046875, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.44904154539108276, "step": 675 }, { "completion_length": 238.296875, "epoch": 0.2154238368387508, "grad_norm": 25.516254425048828, "kl": 0.076171875, "learning_rate": 7.845761631612492e-07, "loss": 0.003, "reward": 1.469029188156128, "reward_std": 0.21218442916870117, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.39090412855148315, "rewards/pad": 0.09375, "step": 676 }, { "completion_length": 239.734375, "epoch": 0.21574251115360102, "grad_norm": 5.068884372711182, "kl": 0.087890625, "learning_rate": 7.842574888463989e-07, "loss": 0.0035, "reward": 1.4459609985351562, "reward_std": 0.10616542398929596, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3678360879421234, "step": 677 }, { "completion_length": 303.71875, "epoch": 0.21606118546845124, "grad_norm": 6.922590255737305, "kl": 0.07177734375, "learning_rate": 7.839388145315488e-07, "loss": 0.0029, "reward": 1.3253346681594849, "reward_std": 0.08155903965234756, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.34095969796180725, "step": 678 }, { "completion_length": 164.375, "epoch": 0.21637985978330146, "grad_norm": 12.585428237915039, "kl": 0.119140625, "learning_rate": 7.836201402166986e-07, "loss": 0.0048, "reward": 1.6467983722686768, "reward_std": 0.15064901113510132, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4124232530593872, "rewards/pad": 0.234375, "step": 679 }, { "completion_length": 330.078125, "epoch": 0.21669853409815168, "grad_norm": 18.918582916259766, "kl": 0.06396484375, "learning_rate": 7.833014659018483e-07, "loss": 0.0026, "reward": 1.3851571083068848, "reward_std": 0.1387689709663391, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.30703213810920715, "rewards/pad": 0.109375, "step": 680 }, { "completion_length": 298.328125, "epoch": 0.2170172084130019, "grad_norm": 8.898725509643555, "kl": 0.0751953125, "learning_rate": 7.829827915869981e-07, "loss": 0.003, "reward": 1.3700380325317383, "reward_std": 0.06074301898479462, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3700379729270935, "step": 681 }, { "completion_length": 322.640625, "epoch": 0.21733588272785215, "grad_norm": 14.546353340148926, "kl": 0.064453125, "learning_rate": 7.826641172721479e-07, "loss": 0.0026, "reward": 1.4926016330718994, "reward_std": 0.08959314972162247, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.367601603269577, "rewards/pad": 0.125, "step": 682 }, { "completion_length": 256.671875, "epoch": 0.21765455704270237, "grad_norm": 5.957823753356934, "kl": 0.08056640625, "learning_rate": 7.823454429572977e-07, "loss": 0.0032, "reward": 1.5421984195709229, "reward_std": 0.08622148633003235, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5421984791755676, "step": 683 }, { "completion_length": 307.234375, "epoch": 0.2179732313575526, "grad_norm": 6.484907150268555, "kl": 0.09033203125, "learning_rate": 7.820267686424474e-07, "loss": 0.0036, "reward": 1.4343141317367554, "reward_std": 0.09127238392829895, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.44993916153907776, "rewards/pad": 0.0, "step": 684 }, { "completion_length": 377.796875, "epoch": 0.2182919056724028, "grad_norm": 4.331064701080322, "kl": 0.053955078125, "learning_rate": 7.817080943275972e-07, "loss": 0.0022, "reward": 1.450018286705017, "reward_std": 0.07817307859659195, "rewards/pad": 0.15625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.29376834630966187, "step": 685 }, { "completion_length": 259.765625, "epoch": 0.21861057998725303, "grad_norm": 7.207475185394287, "kl": 0.07958984375, "learning_rate": 7.81389420012747e-07, "loss": 0.0032, "reward": 1.5709216594696045, "reward_std": 0.10550111532211304, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46154657006263733, "rewards/pad": 0.109375, "step": 686 }, { "completion_length": 300.4375, "epoch": 0.21892925430210325, "grad_norm": 34.08000564575195, "kl": 0.09228515625, "learning_rate": 7.810707456978967e-07, "loss": 0.0037, "reward": 1.5559680461883545, "reward_std": 0.1384851634502411, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5559679269790649, "rewards/pad": 0.015625, "step": 687 }, { "completion_length": 272.15625, "epoch": 0.21924792861695347, "grad_norm": 8.619714736938477, "kl": 0.0791015625, "learning_rate": 7.807520713830464e-07, "loss": 0.0032, "reward": 1.5306808948516846, "reward_std": 0.11045212298631668, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4681808650493622, "rewards/pad": 0.0625, "step": 688 }, { "completion_length": 356.859375, "epoch": 0.2195666029318037, "grad_norm": 13.284117698669434, "kl": 0.1142578125, "learning_rate": 7.804333970681962e-07, "loss": 0.0046, "reward": 1.48612642288208, "reward_std": 0.10852101445198059, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3923763036727905, "step": 689 }, { "completion_length": 257.390625, "epoch": 0.2198852772466539, "grad_norm": 15.725584983825684, "kl": 0.072265625, "learning_rate": 7.80114722753346e-07, "loss": 0.0029, "reward": 1.5707192420959473, "reward_std": 0.18728512525558472, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4769693613052368, "rewards/pad": 0.09375, "step": 690 }, { "completion_length": 204.015625, "epoch": 0.22020395156150413, "grad_norm": 16.46550941467285, "kl": 0.09814453125, "learning_rate": 7.797960484384958e-07, "loss": 0.0039, "reward": 1.4956278800964355, "reward_std": 0.06200110539793968, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49562788009643555, "step": 691 }, { "completion_length": 253.015625, "epoch": 0.22052262587635438, "grad_norm": 10.900519371032715, "kl": 0.09375, "learning_rate": 7.794773741236455e-07, "loss": 0.0037, "reward": 1.4226266145706177, "reward_std": 0.08624820411205292, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42262664437294006, "rewards/pad": 0.0, "step": 692 }, { "completion_length": 255.9375, "epoch": 0.2208413001912046, "grad_norm": 16.583126068115234, "kl": 0.08544921875, "learning_rate": 7.791586998087953e-07, "loss": 0.0034, "reward": 1.4481335878372192, "reward_std": 0.09301026910543442, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44813355803489685, "rewards/pad": 0.0, "step": 693 }, { "completion_length": 377.390625, "epoch": 0.22115997450605482, "grad_norm": 4.421432018280029, "kl": 0.0556640625, "learning_rate": 7.788400254939451e-07, "loss": 0.0022, "reward": 1.5260976552963257, "reward_std": 0.12610724568367004, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5417227149009705, "step": 694 }, { "completion_length": 397.59375, "epoch": 0.22147864882090504, "grad_norm": 6.189242362976074, "kl": 0.0537109375, "learning_rate": 7.785213511790949e-07, "loss": 0.0021, "reward": 1.4347798824310303, "reward_std": 0.09408040344715118, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4035297632217407, "step": 695 }, { "completion_length": 166.515625, "epoch": 0.22179732313575526, "grad_norm": 16.723281860351562, "kl": 0.08984375, "learning_rate": 7.782026768642446e-07, "loss": 0.0036, "reward": 1.554368495941162, "reward_std": 0.2356569468975067, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.28874337673187256, "rewards/pad": 0.296875, "step": 696 }, { "completion_length": 258.390625, "epoch": 0.22211599745060548, "grad_norm": 9.294581413269043, "kl": 0.06884765625, "learning_rate": 7.778840025493945e-07, "loss": 0.0028, "reward": 1.6144590377807617, "reward_std": 0.06388699263334274, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.48945894837379456, "step": 697 }, { "completion_length": 345.328125, "epoch": 0.2224346717654557, "grad_norm": 9.66220760345459, "kl": 0.05859375, "learning_rate": 7.775653282345443e-07, "loss": 0.0023, "reward": 1.3565130233764648, "reward_std": 0.18195012211799622, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.3877629339694977, "step": 698 }, { "completion_length": 250.3125, "epoch": 0.22275334608030592, "grad_norm": 6.309233665466309, "kl": 0.0869140625, "learning_rate": 7.772466539196941e-07, "loss": 0.0035, "reward": 1.4840912818908691, "reward_std": 0.10586719214916229, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.37471622228622437, "step": 699 }, { "completion_length": 351.03125, "epoch": 0.22307202039515614, "grad_norm": 21.303956985473633, "kl": 0.060546875, "learning_rate": 7.769279796048438e-07, "loss": 0.0024, "reward": 1.451223611831665, "reward_std": 0.12037895619869232, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.46684861183166504, "rewards/pad": 0.0, "step": 700 }, { "completion_length": 304.6875, "epoch": 0.22339069471000636, "grad_norm": 11.189263343811035, "kl": 0.0693359375, "learning_rate": 7.766093052899936e-07, "loss": 0.0028, "reward": 1.632739543914795, "reward_std": 0.13571488857269287, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5233644247055054, "rewards/pad": 0.125, "step": 701 }, { "completion_length": 203.375, "epoch": 0.2237093690248566, "grad_norm": 20.83745002746582, "kl": 0.09619140625, "learning_rate": 7.762906309751434e-07, "loss": 0.0039, "reward": 1.6446971893310547, "reward_std": 0.117695152759552, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6446971893310547, "rewards/pad": 0.0, "step": 702 }, { "completion_length": 168.8125, "epoch": 0.22402804333970683, "grad_norm": 10.630196571350098, "kl": 0.091796875, "learning_rate": 7.759719566602932e-07, "loss": 0.0037, "reward": 1.5714305639266968, "reward_std": 0.21625912189483643, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.46205565333366394, "step": 703 }, { "completion_length": 352.703125, "epoch": 0.22434671765455705, "grad_norm": 13.083415985107422, "kl": 0.052978515625, "learning_rate": 7.756532823454429e-07, "loss": 0.0021, "reward": 1.4256035089492798, "reward_std": 0.0655084028840065, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3006035089492798, "rewards/pad": 0.125, "step": 704 }, { "completion_length": 321.890625, "epoch": 0.22466539196940727, "grad_norm": 8.3506441116333, "kl": 0.07470703125, "learning_rate": 7.753346080305927e-07, "loss": 0.003, "reward": 1.4344754219055176, "reward_std": 0.13548129796981812, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4501004219055176, "rewards/pad": 0.0, "step": 705 }, { "completion_length": 122.609375, "epoch": 0.2249840662842575, "grad_norm": 23.514019012451172, "kl": 0.0986328125, "learning_rate": 7.750159337157425e-07, "loss": 0.0039, "reward": 1.5593478679656982, "reward_std": 0.3233591616153717, "rewards/answer_reward": 0.1875, "rewards/format_reward_gqa": 0.9375, "rewards/iou_glue_reward": 0.434347927570343, "step": 706 }, { "completion_length": 261.96875, "epoch": 0.2253027405991077, "grad_norm": 14.593097686767578, "kl": 0.080078125, "learning_rate": 7.746972594008923e-07, "loss": 0.0032, "reward": 1.6064164638519287, "reward_std": 0.2630051374435425, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4657914638519287, "rewards/pad": 0.15625, "step": 707 }, { "completion_length": 269.46875, "epoch": 0.22562141491395793, "grad_norm": 6.744601726531982, "kl": 0.08642578125, "learning_rate": 7.74378585086042e-07, "loss": 0.0034, "reward": 1.5797683000564575, "reward_std": 0.20066973567008972, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4860183000564575, "rewards/pad": 0.109375, "step": 708 }, { "completion_length": 151.21875, "epoch": 0.22594008922880815, "grad_norm": 8.127285957336426, "kl": 0.11083984375, "learning_rate": 7.740599107711918e-07, "loss": 0.0044, "reward": 1.563246250152588, "reward_std": 0.17007926106452942, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.4694961905479431, "step": 709 }, { "completion_length": 189.953125, "epoch": 0.22625876354365837, "grad_norm": 4.988349914550781, "kl": 0.0927734375, "learning_rate": 7.737412364563416e-07, "loss": 0.0037, "reward": 1.4685695171356201, "reward_std": 0.13317400217056274, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4841945171356201, "step": 710 }, { "completion_length": 166.859375, "epoch": 0.22657743785850862, "grad_norm": 18.31987190246582, "kl": 0.0888671875, "learning_rate": 7.734225621414913e-07, "loss": 0.0035, "reward": 1.537738561630249, "reward_std": 0.1872149109840393, "rewards/answer_reward": 0.171875, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.381488561630249, "step": 711 }, { "completion_length": 273.859375, "epoch": 0.22689611217335884, "grad_norm": 6.696666717529297, "kl": 0.0732421875, "learning_rate": 7.731038878266411e-07, "loss": 0.0029, "reward": 1.527390718460083, "reward_std": 0.16815558075904846, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5586407780647278, "step": 712 }, { "completion_length": 287.9375, "epoch": 0.22721478648820906, "grad_norm": 13.334807395935059, "kl": 0.06494140625, "learning_rate": 7.727852135117909e-07, "loss": 0.0026, "reward": 1.5138880014419556, "reward_std": 0.11974107474088669, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4201379418373108, "rewards/pad": 0.109375, "step": 713 }, { "completion_length": 256.140625, "epoch": 0.22753346080305928, "grad_norm": 9.865249633789062, "kl": 0.06982421875, "learning_rate": 7.724665391969407e-07, "loss": 0.0028, "reward": 1.3564677238464355, "reward_std": 0.1476736217737198, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.26271772384643555, "rewards/pad": 0.109375, "step": 714 }, { "completion_length": 207.171875, "epoch": 0.2278521351179095, "grad_norm": 15.958372116088867, "kl": 0.109375, "learning_rate": 7.721478648820904e-07, "loss": 0.0044, "reward": 1.476958155632019, "reward_std": 0.1478617787361145, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.49258315563201904, "rewards/pad": 0.0, "step": 715 }, { "completion_length": 367.640625, "epoch": 0.22817080943275972, "grad_norm": 15.024460792541504, "kl": 0.038818359375, "learning_rate": 7.718291905672403e-07, "loss": 0.0016, "reward": 1.4593737125396729, "reward_std": 0.17769308388233185, "rewards/answer_reward": 0.046875, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.42812371253967285, "step": 716 }, { "completion_length": 244.96875, "epoch": 0.22848948374760994, "grad_norm": 13.350499153137207, "kl": 0.07958984375, "learning_rate": 7.715105162523901e-07, "loss": 0.0032, "reward": 1.5930790901184082, "reward_std": 0.13378813862800598, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.48370397090911865, "rewards/pad": 0.125, "step": 717 }, { "completion_length": 182.25, "epoch": 0.22880815806246016, "grad_norm": 37.75197219848633, "kl": 0.09619140625, "learning_rate": 7.711918419375399e-07, "loss": 0.0038, "reward": 1.5131914615631104, "reward_std": 0.22410151362419128, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5131914615631104, "step": 718 }, { "completion_length": 186.75, "epoch": 0.22912683237731038, "grad_norm": 25.390933990478516, "kl": 0.10595703125, "learning_rate": 7.708731676226896e-07, "loss": 0.0042, "reward": 1.4066598415374756, "reward_std": 0.12678267061710358, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4222848117351532, "rewards/pad": 0.0, "step": 719 }, { "completion_length": 166.078125, "epoch": 0.2294455066921606, "grad_norm": 20.202503204345703, "kl": 0.1025390625, "learning_rate": 7.705544933078394e-07, "loss": 0.0041, "reward": 1.506861925125122, "reward_std": 0.12078607827425003, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5068618059158325, "step": 720 }, { "completion_length": 233.359375, "epoch": 0.22976418100701085, "grad_norm": 24.11056900024414, "kl": 0.078125, "learning_rate": 7.702358189929892e-07, "loss": 0.0031, "reward": 1.4882006645202637, "reward_std": 0.13335324823856354, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5038256049156189, "rewards/pad": 0.0, "step": 721 }, { "completion_length": 275.6875, "epoch": 0.23008285532186107, "grad_norm": 4.728917121887207, "kl": 0.07275390625, "learning_rate": 7.69917144678139e-07, "loss": 0.0029, "reward": 1.3744094371795654, "reward_std": 0.11881347000598907, "rewards/pad": 0.046875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.32753440737724304, "step": 722 }, { "completion_length": 251.9375, "epoch": 0.2304015296367113, "grad_norm": 8.52653694152832, "kl": 0.08154296875, "learning_rate": 7.695984703632887e-07, "loss": 0.0033, "reward": 1.6842291355133057, "reward_std": 0.20814046263694763, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.48110413551330566, "rewards/pad": 0.21875, "step": 723 }, { "completion_length": 306.109375, "epoch": 0.2307202039515615, "grad_norm": 11.247517585754395, "kl": 0.0888671875, "learning_rate": 7.692797960484385e-07, "loss": 0.0036, "reward": 1.4908294677734375, "reward_std": 0.06939513236284256, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3658294081687927, "step": 724 }, { "completion_length": 337.0625, "epoch": 0.23103887826641173, "grad_norm": 6.2073445320129395, "kl": 0.0791015625, "learning_rate": 7.689611217335883e-07, "loss": 0.0031, "reward": 1.376875877380371, "reward_std": 0.033518463373184204, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.25187593698501587, "step": 725 }, { "completion_length": 276.9375, "epoch": 0.23135755258126195, "grad_norm": 17.229686737060547, "kl": 0.10009765625, "learning_rate": 7.68642447418738e-07, "loss": 0.004, "reward": 1.5234543085098267, "reward_std": 0.08890549838542938, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5234543681144714, "step": 726 }, { "completion_length": 176.5625, "epoch": 0.23167622689611217, "grad_norm": 7.54797887802124, "kl": 0.10546875, "learning_rate": 7.683237731038877e-07, "loss": 0.0042, "reward": 1.566821813583374, "reward_std": 0.06920469552278519, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5668217539787292, "rewards/pad": 0.0, "step": 727 }, { "completion_length": 184.5625, "epoch": 0.2319949012109624, "grad_norm": 11.765900611877441, "kl": 0.12158203125, "learning_rate": 7.680050987890375e-07, "loss": 0.0049, "reward": 1.5227802991867065, "reward_std": 0.18534991145133972, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44465523958206177, "rewards/pad": 0.078125, "step": 728 }, { "completion_length": 327.484375, "epoch": 0.2323135755258126, "grad_norm": 7.624211311340332, "kl": 0.059814453125, "learning_rate": 7.676864244741873e-07, "loss": 0.0024, "reward": 1.540311574935913, "reward_std": 0.11048051714897156, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4309366047382355, "step": 729 }, { "completion_length": 179.96875, "epoch": 0.23263224984066283, "grad_norm": 12.753997802734375, "kl": 0.1015625, "learning_rate": 7.673677501593371e-07, "loss": 0.0041, "reward": 1.4728918075561523, "reward_std": 0.12791694700717926, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4728918671607971, "rewards/pad": 0.0, "step": 730 }, { "completion_length": 218.59375, "epoch": 0.23295092415551308, "grad_norm": 13.48316478729248, "kl": 0.10205078125, "learning_rate": 7.670490758444868e-07, "loss": 0.0041, "reward": 1.6227314472198486, "reward_std": 0.08450779318809509, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4977312684059143, "step": 731 }, { "completion_length": 158.90625, "epoch": 0.2332695984703633, "grad_norm": 159.3242950439453, "kl": 0.103515625, "learning_rate": 7.667304015296366e-07, "loss": 0.0041, "reward": 1.6386592388153076, "reward_std": 0.1035318672657013, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5136591196060181, "rewards/pad": 0.125, "step": 732 }, { "completion_length": 363.671875, "epoch": 0.23358827278521352, "grad_norm": 5.775212287902832, "kl": 0.0556640625, "learning_rate": 7.664117272147864e-07, "loss": 0.0022, "reward": 1.324002742767334, "reward_std": 0.01590358465909958, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.32400280237197876, "rewards/pad": 0.0, "step": 733 }, { "completion_length": 273.5625, "epoch": 0.23390694710006374, "grad_norm": 9.607227325439453, "kl": 0.078125, "learning_rate": 7.660930528999362e-07, "loss": 0.0031, "reward": 1.46985924243927, "reward_std": 0.11552847921848297, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4542343020439148, "rewards/pad": 0.03125, "step": 734 }, { "completion_length": 264.625, "epoch": 0.23422562141491396, "grad_norm": 11.888554573059082, "kl": 0.076171875, "learning_rate": 7.65774378585086e-07, "loss": 0.003, "reward": 1.63518226146698, "reward_std": 0.11117585748434067, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49455729126930237, "rewards/pad": 0.140625, "step": 735 }, { "completion_length": 212.28125, "epoch": 0.23454429572976418, "grad_norm": 8.31882381439209, "kl": 0.091796875, "learning_rate": 7.654557042702358e-07, "loss": 0.0037, "reward": 1.5230109691619873, "reward_std": 0.09353186190128326, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5230109691619873, "step": 736 }, { "completion_length": 247.890625, "epoch": 0.2348629700446144, "grad_norm": 8.512290954589844, "kl": 0.099609375, "learning_rate": 7.651370299553856e-07, "loss": 0.004, "reward": 1.6262836456298828, "reward_std": 0.09318853914737701, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5012836456298828, "step": 737 }, { "completion_length": 112.421875, "epoch": 0.23518164435946462, "grad_norm": 15.200469017028809, "kl": 0.12158203125, "learning_rate": 7.648183556405353e-07, "loss": 0.0049, "reward": 1.7096667289733887, "reward_std": 0.13319414854049683, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.33466672897338867, "rewards/pad": 0.375, "step": 738 }, { "completion_length": 145.859375, "epoch": 0.23550031867431484, "grad_norm": 9.961045265197754, "kl": 0.11279296875, "learning_rate": 7.644996813256851e-07, "loss": 0.0045, "reward": 1.5271918773651123, "reward_std": 0.060988496989011765, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5271918773651123, "rewards/pad": 0.0, "step": 739 }, { "completion_length": 339.8125, "epoch": 0.23581899298916506, "grad_norm": 15.928852081298828, "kl": 0.0615234375, "learning_rate": 7.641810070108349e-07, "loss": 0.0025, "reward": 1.3527032136917114, "reward_std": 0.06639071553945541, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3527032136917114, "step": 740 }, { "completion_length": 264.21875, "epoch": 0.2361376673040153, "grad_norm": 12.001644134521484, "kl": 0.08154296875, "learning_rate": 7.638623326959847e-07, "loss": 0.0033, "reward": 1.316277027130127, "reward_std": 0.14961186051368713, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.33190202713012695, "step": 741 }, { "completion_length": 178.5, "epoch": 0.23645634161886553, "grad_norm": 14.840734481811523, "kl": 0.12353515625, "learning_rate": 7.635436583811344e-07, "loss": 0.0049, "reward": 1.619857907295227, "reward_std": 0.17236697673797607, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5261078476905823, "rewards/pad": 0.125, "step": 742 }, { "completion_length": 174.9375, "epoch": 0.23677501593371575, "grad_norm": 9.551653861999512, "kl": 0.10205078125, "learning_rate": 7.632249840662842e-07, "loss": 0.0041, "reward": 1.6346447467803955, "reward_std": 0.21686974167823792, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.43151968717575073, "rewards/pad": 0.21875, "step": 743 }, { "completion_length": 309.453125, "epoch": 0.23709369024856597, "grad_norm": 18.017536163330078, "kl": 0.0703125, "learning_rate": 7.62906309751434e-07, "loss": 0.0028, "reward": 1.5284950733184814, "reward_std": 0.11433098465204239, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5128699541091919, "step": 744 }, { "completion_length": 213.6875, "epoch": 0.2374123645634162, "grad_norm": 11.528428077697754, "kl": 0.10546875, "learning_rate": 7.625876354365838e-07, "loss": 0.0042, "reward": 1.6650524139404297, "reward_std": 0.09348995238542557, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5400524139404297, "step": 745 }, { "completion_length": 309.265625, "epoch": 0.2377310388782664, "grad_norm": 20.259113311767578, "kl": 0.0712890625, "learning_rate": 7.622689611217335e-07, "loss": 0.0029, "reward": 1.4184303283691406, "reward_std": 0.10087809711694717, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.38718029856681824, "step": 746 }, { "completion_length": 388.25, "epoch": 0.23804971319311663, "grad_norm": 15.798137664794922, "kl": 0.171875, "learning_rate": 7.619502868068833e-07, "loss": 0.0069, "reward": 1.6084961891174316, "reward_std": 0.27900782227516174, "rewards/answer_reward": 0.203125, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.4366213083267212, "step": 747 }, { "completion_length": 251.15625, "epoch": 0.23836838750796685, "grad_norm": 27.81381607055664, "kl": 0.07421875, "learning_rate": 7.616316124920331e-07, "loss": 0.003, "reward": 1.7127543687820435, "reward_std": 0.13633526861667633, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46275442838668823, "rewards/pad": 0.25, "step": 748 }, { "completion_length": 234.53125, "epoch": 0.23868706182281707, "grad_norm": 21.107439041137695, "kl": 0.1171875, "learning_rate": 7.613129381771829e-07, "loss": 0.0047, "reward": 1.5439451932907104, "reward_std": 0.11229214072227478, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5439450740814209, "rewards/pad": 0.0, "step": 749 }, { "completion_length": 424.484375, "epoch": 0.2390057361376673, "grad_norm": 13.354806900024414, "kl": 0.05224609375, "learning_rate": 7.609942638623326e-07, "loss": 0.0021, "reward": 1.4798873662948608, "reward_std": 0.10450975596904755, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.49551236629486084, "rewards/pad": 0.0, "step": 750 }, { "completion_length": 335.078125, "epoch": 0.23932441045251754, "grad_norm": 7.3585333824157715, "kl": 0.0849609375, "learning_rate": 7.606755895474824e-07, "loss": 0.0034, "reward": 1.5674034357070923, "reward_std": 0.14845271408557892, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4580284357070923, "rewards/pad": 0.125, "step": 751 }, { "completion_length": 361.0625, "epoch": 0.23964308476736776, "grad_norm": 6.82198429107666, "kl": 0.07861328125, "learning_rate": 7.603569152326322e-07, "loss": 0.0032, "reward": 1.4816830158233643, "reward_std": 0.16714411973953247, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.512933075428009, "rewards/pad": 0.0, "step": 752 }, { "completion_length": 221.421875, "epoch": 0.23996175908221798, "grad_norm": 9.491703987121582, "kl": 0.10986328125, "learning_rate": 7.60038240917782e-07, "loss": 0.0044, "reward": 1.6149952411651611, "reward_std": 0.12568673491477966, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48999521136283875, "rewards/pad": 0.125, "step": 753 }, { "completion_length": 211.296875, "epoch": 0.2402804333970682, "grad_norm": 20.870731353759766, "kl": 0.10498046875, "learning_rate": 7.597195666029318e-07, "loss": 0.0042, "reward": 1.5477931499481201, "reward_std": 0.22584594786167145, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.4540431499481201, "rewards/pad": 0.125, "step": 754 }, { "completion_length": 276.09375, "epoch": 0.24059910771191842, "grad_norm": 10.502787590026855, "kl": 0.09326171875, "learning_rate": 7.594008922880816e-07, "loss": 0.0037, "reward": 1.3897948265075684, "reward_std": 0.262864351272583, "rewards/format_reward_tg": 0.953125, "rewards/iou_timestamp_reward": 0.43666979670524597, "rewards/pad": 0.0, "step": 755 }, { "completion_length": 205.359375, "epoch": 0.24091778202676864, "grad_norm": 22.47008514404297, "kl": 0.1044921875, "learning_rate": 7.590822179732314e-07, "loss": 0.0042, "reward": 1.4155709743499756, "reward_std": 0.27894821763038635, "rewards/format_reward_tg": 0.953125, "rewards/iou_timestamp_reward": 0.3843209445476532, "rewards/pad": 0.078125, "step": 756 }, { "completion_length": 271.828125, "epoch": 0.24123645634161886, "grad_norm": 11.023628234863281, "kl": 0.078125, "learning_rate": 7.587635436583812e-07, "loss": 0.0031, "reward": 1.4424021244049072, "reward_std": 0.211650550365448, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.9375, "rewards/tracking_iou_reward": 0.3799021244049072, "step": 757 }, { "completion_length": 219.71875, "epoch": 0.24155513065646908, "grad_norm": 13.070090293884277, "kl": 0.09814453125, "learning_rate": 7.584448693435309e-07, "loss": 0.0039, "reward": 1.633631944656372, "reward_std": 0.2932754456996918, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 0.9375, "rewards/tracking_iou_reward": 0.5242569446563721, "step": 758 }, { "completion_length": 165.84375, "epoch": 0.2418738049713193, "grad_norm": 11.352629661560059, "kl": 0.1533203125, "learning_rate": 7.581261950286807e-07, "loss": 0.0061, "reward": 1.5862655639648438, "reward_std": 0.16199254989624023, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.4925156235694885, "rewards/pad": 0.125, "step": 759 }, { "completion_length": 421.359375, "epoch": 0.24219247928616955, "grad_norm": 4.43811559677124, "kl": 0.0615234375, "learning_rate": 7.578075207138305e-07, "loss": 0.0025, "reward": 1.3123204708099365, "reward_std": 0.34193655848503113, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.765625, "rewards/iou_glue_reward": 0.4216954708099365, "step": 760 }, { "completion_length": 112.921875, "epoch": 0.24251115360101977, "grad_norm": 7.119170188903809, "kl": 0.12158203125, "learning_rate": 7.574888463989803e-07, "loss": 0.0049, "reward": 1.7790417671203613, "reward_std": 0.1314246654510498, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5446667671203613, "rewards/pad": 0.234375, "step": 761 }, { "completion_length": 456.25, "epoch": 0.24282982791587, "grad_norm": 3.5768837928771973, "kl": 0.046875, "learning_rate": 7.5717017208413e-07, "loss": 0.0019, "reward": 1.3192476034164429, "reward_std": 0.2558511197566986, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.875, "rewards/tracking_iou_reward": 0.44424766302108765, "step": 762 }, { "completion_length": 277.578125, "epoch": 0.2431485022307202, "grad_norm": 15.019842147827148, "kl": 0.08251953125, "learning_rate": 7.568514977692798e-07, "loss": 0.0033, "reward": 1.4773023128509521, "reward_std": 0.1882389783859253, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.953125, "rewards/tracking_iou_reward": 0.39917728304862976, "step": 763 }, { "completion_length": 241.515625, "epoch": 0.24346717654557043, "grad_norm": 36.11360549926758, "kl": 0.091796875, "learning_rate": 7.565328234544296e-07, "loss": 0.0037, "reward": 1.727245569229126, "reward_std": 0.22436536848545074, "rewards/pad": 0.1875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.555370569229126, "step": 764 }, { "completion_length": 330.703125, "epoch": 0.24378585086042065, "grad_norm": 6.199359893798828, "kl": 0.07080078125, "learning_rate": 7.562141491395793e-07, "loss": 0.0028, "reward": 1.5140249729156494, "reward_std": 0.20898380875587463, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.420274943113327, "step": 765 }, { "completion_length": 347.546875, "epoch": 0.24410452517527087, "grad_norm": 5.358139514923096, "kl": 0.08203125, "learning_rate": 7.55895474824729e-07, "loss": 0.0033, "reward": 1.5010960102081299, "reward_std": 0.15243446826934814, "rewards/format_reward_tg": 0.953125, "rewards/iou_timestamp_reward": 0.5479710102081299, "rewards/pad": 0.0, "step": 766 }, { "completion_length": 249.953125, "epoch": 0.2444231994901211, "grad_norm": 19.22927474975586, "kl": 0.10107421875, "learning_rate": 7.555768005098788e-07, "loss": 0.004, "reward": 1.586197853088379, "reward_std": 0.1058286726474762, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46119794249534607, "rewards/pad": 0.125, "step": 767 }, { "completion_length": 265.515625, "epoch": 0.2447418738049713, "grad_norm": 12.999495506286621, "kl": 0.09375, "learning_rate": 7.552581261950286e-07, "loss": 0.0038, "reward": 1.595004677772522, "reward_std": 0.17190062999725342, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.516879677772522, "rewards/pad": 0.09375, "step": 768 }, { "completion_length": 227.625, "epoch": 0.24506054811982153, "grad_norm": 10.447826385498047, "kl": 0.08349609375, "learning_rate": 7.549394518801783e-07, "loss": 0.0033, "reward": 1.4319894313812256, "reward_std": 0.20667222142219543, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3382394015789032, "step": 769 }, { "completion_length": 322.34375, "epoch": 0.24537922243467178, "grad_norm": 18.960357666015625, "kl": 0.0712890625, "learning_rate": 7.546207775653281e-07, "loss": 0.0029, "reward": 1.3130937814712524, "reward_std": 0.03393920511007309, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.31309378147125244, "rewards/pad": 0.0, "step": 770 }, { "completion_length": 217.875, "epoch": 0.245697896749522, "grad_norm": 11.330354690551758, "kl": 0.10986328125, "learning_rate": 7.543021032504779e-07, "loss": 0.0044, "reward": 1.6246107816696167, "reward_std": 0.11168281733989716, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4371107518672943, "rewards/pad": 0.1875, "step": 771 }, { "completion_length": 165.9375, "epoch": 0.24601657106437222, "grad_norm": 14.55956745147705, "kl": 0.10205078125, "learning_rate": 7.539834289356277e-07, "loss": 0.0041, "reward": 1.569342017173767, "reward_std": 0.15387828648090363, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4599670171737671, "rewards/pad": 0.125, "step": 772 }, { "completion_length": 263.03125, "epoch": 0.24633524537922244, "grad_norm": 4.5672101974487305, "kl": 0.08056640625, "learning_rate": 7.536647546207775e-07, "loss": 0.0032, "reward": 1.6846940517425537, "reward_std": 0.05211274325847626, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5596940517425537, "rewards/pad": 0.125, "step": 773 }, { "completion_length": 324.390625, "epoch": 0.24665391969407266, "grad_norm": 10.738593101501465, "kl": 0.0771484375, "learning_rate": 7.533460803059273e-07, "loss": 0.0031, "reward": 1.5184435844421387, "reward_std": 0.19243696331977844, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5496935844421387, "rewards/pad": 0.0, "step": 774 }, { "completion_length": 228.078125, "epoch": 0.24697259400892288, "grad_norm": 8.436495780944824, "kl": 0.10888671875, "learning_rate": 7.530274059910771e-07, "loss": 0.0044, "reward": 1.4948652982711792, "reward_std": 0.1509888470172882, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.510490357875824, "rewards/pad": 0.0, "step": 775 }, { "completion_length": 174.34375, "epoch": 0.2472912683237731, "grad_norm": 35.71000289916992, "kl": 0.369140625, "learning_rate": 7.527087316762269e-07, "loss": 0.0149, "reward": 1.3583810329437256, "reward_std": 0.09419108927249908, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3583810329437256, "rewards/pad": 0.0, "step": 776 }, { "completion_length": 281.59375, "epoch": 0.24760994263862332, "grad_norm": 9.188167572021484, "kl": 0.08056640625, "learning_rate": 7.523900573613766e-07, "loss": 0.0032, "reward": 1.4431560039520264, "reward_std": 0.22239582240581512, "rewards/format_reward_tg": 0.953125, "rewards/iou_timestamp_reward": 0.49003103375434875, "rewards/pad": 0.0, "step": 777 }, { "completion_length": 165.984375, "epoch": 0.24792861695347354, "grad_norm": 7.557226657867432, "kl": 0.1162109375, "learning_rate": 7.520713830465264e-07, "loss": 0.0047, "reward": 1.5415880680084229, "reward_std": 0.22100627422332764, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.46346306800842285, "rewards/pad": 0.09375, "step": 778 }, { "completion_length": 328.5, "epoch": 0.24824729126832376, "grad_norm": 12.88247013092041, "kl": 0.08740234375, "learning_rate": 7.517527087316762e-07, "loss": 0.0035, "reward": 1.4905920028686523, "reward_std": 0.046114400029182434, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4905920624732971, "rewards/pad": 0.0, "step": 779 }, { "completion_length": 326.34375, "epoch": 0.248565965583174, "grad_norm": 5.975739479064941, "kl": 0.06884765625, "learning_rate": 7.51434034416826e-07, "loss": 0.0028, "reward": 1.367580533027649, "reward_std": 0.14677515625953674, "rewards/answer_reward": 0.03125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3519555330276489, "step": 780 }, { "completion_length": 280.015625, "epoch": 0.24888463989802423, "grad_norm": 6.762017726898193, "kl": 0.076171875, "learning_rate": 7.511153601019757e-07, "loss": 0.0031, "reward": 1.6784141063690186, "reward_std": 0.08459261059761047, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5690390467643738, "rewards/pad": 0.109375, "step": 781 }, { "completion_length": 287.09375, "epoch": 0.24920331421287445, "grad_norm": 9.65837287902832, "kl": 0.083984375, "learning_rate": 7.507966857871255e-07, "loss": 0.0034, "reward": 1.3916666507720947, "reward_std": 0.05018036067485809, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3916667401790619, "rewards/pad": 0.0, "step": 782 }, { "completion_length": 252.90625, "epoch": 0.24952198852772467, "grad_norm": 15.337854385375977, "kl": 0.0810546875, "learning_rate": 7.504780114722753e-07, "loss": 0.0032, "reward": 1.4898476600646973, "reward_std": 0.17910003662109375, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.38047271966934204, "step": 783 }, { "completion_length": 221.5, "epoch": 0.2498406628425749, "grad_norm": 24.365131378173828, "kl": 0.09619140625, "learning_rate": 7.501593371574251e-07, "loss": 0.0038, "reward": 1.6819980144500732, "reward_std": 0.18875043094158173, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5726230144500732, "rewards/pad": 0.125, "step": 784 }, { "completion_length": 278.296875, "epoch": 0.2501593371574251, "grad_norm": 10.942834854125977, "kl": 0.08056640625, "learning_rate": 7.498406628425748e-07, "loss": 0.0032, "reward": 1.412353754043579, "reward_std": 0.07322700321674347, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4123537838459015, "rewards/pad": 0.0, "step": 785 }, { "completion_length": 269.3125, "epoch": 0.25047801147227533, "grad_norm": 8.739012718200684, "kl": 0.0732421875, "learning_rate": 7.495219885277246e-07, "loss": 0.0029, "reward": 1.7182860374450684, "reward_std": 0.08406949788331985, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5932859778404236, "rewards/pad": 0.125, "step": 786 }, { "completion_length": 289.34375, "epoch": 0.25079668578712555, "grad_norm": 9.329911231994629, "kl": 0.11083984375, "learning_rate": 7.492033142128744e-07, "loss": 0.0044, "reward": 1.573399305343628, "reward_std": 0.0749359279870987, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5733993053436279, "step": 787 }, { "completion_length": 469.59375, "epoch": 0.25111536010197577, "grad_norm": 6.025957107543945, "kl": 0.052001953125, "learning_rate": 7.488846398980242e-07, "loss": 0.0021, "reward": 1.4300410747528076, "reward_std": 0.08247831463813782, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43004101514816284, "rewards/pad": 0.0, "step": 788 }, { "completion_length": 302.46875, "epoch": 0.251434034416826, "grad_norm": 9.100362777709961, "kl": 0.0771484375, "learning_rate": 7.485659655831739e-07, "loss": 0.0031, "reward": 1.7954503297805786, "reward_std": 0.0859527662396431, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6704503297805786, "rewards/pad": 0.125, "step": 789 }, { "completion_length": 356.828125, "epoch": 0.2517527087316762, "grad_norm": 30.79294776916504, "kl": 0.07080078125, "learning_rate": 7.482472912683237e-07, "loss": 0.0028, "reward": 1.555119514465332, "reward_std": 0.1339489072561264, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.44574448466300964, "step": 790 }, { "completion_length": 380.453125, "epoch": 0.25207138304652643, "grad_norm": 8.440163612365723, "kl": 0.04833984375, "learning_rate": 7.479286169534736e-07, "loss": 0.0019, "reward": 1.7130658626556396, "reward_std": 0.09868164360523224, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4786908030509949, "step": 791 }, { "completion_length": 134.921875, "epoch": 0.25239005736137665, "grad_norm": 20.89670181274414, "kl": 0.1279296875, "learning_rate": 7.476099426386234e-07, "loss": 0.0051, "reward": 1.4510223865509033, "reward_std": 0.10243634879589081, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45102232694625854, "rewards/pad": 0.0, "step": 792 }, { "completion_length": 368.234375, "epoch": 0.25270873167622687, "grad_norm": 9.967374801635742, "kl": 0.1787109375, "learning_rate": 7.472912683237731e-07, "loss": 0.0071, "reward": 1.417081356048584, "reward_std": 0.09499022364616394, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.43270638585090637, "step": 793 }, { "completion_length": 348.078125, "epoch": 0.2530274059910771, "grad_norm": 9.970160484313965, "kl": 0.054443359375, "learning_rate": 7.469725940089229e-07, "loss": 0.0022, "reward": 1.6479089260101318, "reward_std": 0.21434885263442993, "rewards/answer_reward": 0.21875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.42915892601013184, "step": 794 }, { "completion_length": 443.890625, "epoch": 0.25334608030592737, "grad_norm": 5.636565208435059, "kl": 0.06005859375, "learning_rate": 7.466539196940727e-07, "loss": 0.0024, "reward": 1.5904240608215332, "reward_std": 0.1765565574169159, "rewards/answer_reward": 0.21875, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.4029240608215332, "step": 795 }, { "completion_length": 364.765625, "epoch": 0.2536647546207776, "grad_norm": 4.0662126541137695, "kl": 0.0693359375, "learning_rate": 7.463352453792225e-07, "loss": 0.0028, "reward": 1.3498592376708984, "reward_std": 0.21396957337856293, "rewards/format_reward_tg": 0.953125, "rewards/iou_timestamp_reward": 0.3967343270778656, "rewards/pad": 0.0, "step": 796 }, { "completion_length": 382.546875, "epoch": 0.2539834289356278, "grad_norm": 6.166215896606445, "kl": 0.07763671875, "learning_rate": 7.460165710643722e-07, "loss": 0.0031, "reward": 1.6516375541687012, "reward_std": 0.07979218661785126, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.651637613773346, "rewards/pad": 0.0, "step": 797 }, { "completion_length": 203.703125, "epoch": 0.25430210325047803, "grad_norm": 8.784749031066895, "kl": 0.0908203125, "learning_rate": 7.45697896749522e-07, "loss": 0.0036, "reward": 1.5942569971084595, "reward_std": 0.19434666633605957, "rewards/answer_reward": 0.203125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.39113202691078186, "step": 798 }, { "completion_length": 250.40625, "epoch": 0.25462077756532825, "grad_norm": 12.700088500976562, "kl": 0.09814453125, "learning_rate": 7.453792224346718e-07, "loss": 0.0039, "reward": 1.5345667600631714, "reward_std": 0.12197552621364594, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.425191730260849, "rewards/pad": 0.125, "step": 799 }, { "completion_length": 324.671875, "epoch": 0.25493945188017847, "grad_norm": 45.329933166503906, "kl": 0.09912109375, "learning_rate": 7.450605481198216e-07, "loss": 0.004, "reward": 1.6125965118408203, "reward_std": 0.11991085857152939, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5032214522361755, "rewards/pad": 0.125, "step": 800 }, { "completion_length": 314.046875, "epoch": 0.2552581261950287, "grad_norm": 9.435230255126953, "kl": 0.0751953125, "learning_rate": 7.447418738049713e-07, "loss": 0.003, "reward": 1.6179510354995728, "reward_std": 0.10827474296092987, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49295106530189514, "rewards/pad": 0.125, "step": 801 }, { "completion_length": 252.265625, "epoch": 0.2555768005098789, "grad_norm": 7.633427143096924, "kl": 0.09619140625, "learning_rate": 7.444231994901211e-07, "loss": 0.0038, "reward": 1.540982723236084, "reward_std": 0.14200761914253235, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.556607723236084, "rewards/pad": 0.0, "step": 802 }, { "completion_length": 227.34375, "epoch": 0.25589547482472913, "grad_norm": 113.0080795288086, "kl": 0.09814453125, "learning_rate": 7.441045251752709e-07, "loss": 0.0039, "reward": 1.6083734035491943, "reward_std": 0.10746157169342041, "rewards/pad": 0.046875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5614984035491943, "step": 803 }, { "completion_length": 353.703125, "epoch": 0.25621414913957935, "grad_norm": 13.557099342346191, "kl": 0.0703125, "learning_rate": 7.437858508604206e-07, "loss": 0.0028, "reward": 1.6362968683242798, "reward_std": 0.1419190913438797, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6519218683242798, "rewards/pad": 0.0, "step": 804 }, { "completion_length": 249.703125, "epoch": 0.25653282345442957, "grad_norm": 7.443843364715576, "kl": 0.07861328125, "learning_rate": 7.434671765455703e-07, "loss": 0.0032, "reward": 1.5101332664489746, "reward_std": 0.11596070230007172, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4788833260536194, "rewards/pad": 0.03125, "step": 805 }, { "completion_length": 300.8125, "epoch": 0.2568514977692798, "grad_norm": 5.476566791534424, "kl": 0.08740234375, "learning_rate": 7.431485022307201e-07, "loss": 0.0035, "reward": 1.482029676437378, "reward_std": 0.08476638793945312, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.48202961683273315, "step": 806 }, { "completion_length": 384.265625, "epoch": 0.25717017208413, "grad_norm": 8.814421653747559, "kl": 0.049072265625, "learning_rate": 7.428298279158699e-07, "loss": 0.002, "reward": 1.6688939332962036, "reward_std": 0.11245816946029663, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.43451887369155884, "step": 807 }, { "completion_length": 557.03125, "epoch": 0.25748884639898023, "grad_norm": 4.086426734924316, "kl": 0.025146484375, "learning_rate": 7.425111536010196e-07, "loss": 0.001, "reward": 1.4199795722961426, "reward_std": 0.0845545083284378, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.43560463190078735, "step": 808 }, { "completion_length": 185.625, "epoch": 0.25780752071383045, "grad_norm": 16.384538650512695, "kl": 0.09423828125, "learning_rate": 7.421924792861694e-07, "loss": 0.0038, "reward": 1.9109994173049927, "reward_std": 0.20538440346717834, "rewards/answer_reward": 0.34375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5828745365142822, "step": 809 }, { "completion_length": 299.78125, "epoch": 0.25812619502868067, "grad_norm": 10.312644958496094, "kl": 0.0859375, "learning_rate": 7.418738049713192e-07, "loss": 0.0034, "reward": 1.4104437828063965, "reward_std": 0.12108160555362701, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3323187530040741, "step": 810 }, { "completion_length": 283.796875, "epoch": 0.2584448693435309, "grad_norm": 8.11568546295166, "kl": 0.09521484375, "learning_rate": 7.415551306564691e-07, "loss": 0.0038, "reward": 1.3238170146942139, "reward_std": 0.11337171494960785, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.27694201469421387, "rewards/pad": 0.046875, "step": 811 }, { "completion_length": 222.328125, "epoch": 0.2587635436583811, "grad_norm": 5.870856285095215, "kl": 0.09228515625, "learning_rate": 7.412364563416188e-07, "loss": 0.0037, "reward": 1.6045132875442505, "reward_std": 0.17492227256298065, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4951382875442505, "step": 812 }, { "completion_length": 175.265625, "epoch": 0.25908221797323133, "grad_norm": 9.085700035095215, "kl": 0.1123046875, "learning_rate": 7.409177820267686e-07, "loss": 0.0045, "reward": 1.6282504796981812, "reward_std": 0.17620429396629333, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42512547969818115, "rewards/pad": 0.203125, "step": 813 }, { "completion_length": 242.953125, "epoch": 0.2594008922880816, "grad_norm": 7.416430950164795, "kl": 0.08740234375, "learning_rate": 7.405991077119184e-07, "loss": 0.0035, "reward": 1.7750552892684937, "reward_std": 0.16323842108249664, "rewards/pad": 0.15625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.6344302296638489, "step": 814 }, { "completion_length": 284.890625, "epoch": 0.25971956660293183, "grad_norm": 13.476738929748535, "kl": 0.076171875, "learning_rate": 7.402804333970682e-07, "loss": 0.003, "reward": 1.6334511041641235, "reward_std": 0.09985249489545822, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.38345110416412354, "rewards/pad": 0.25, "step": 815 }, { "completion_length": 491.9375, "epoch": 0.26003824091778205, "grad_norm": 5.566437244415283, "kl": 0.0439453125, "learning_rate": 7.399617590822179e-07, "loss": 0.0018, "reward": 1.2809858322143555, "reward_std": 0.19581392407417297, "rewards/answer_reward": 0.015625, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.2966108024120331, "step": 816 }, { "completion_length": 368.796875, "epoch": 0.26035691523263227, "grad_norm": 6.461248397827148, "kl": 0.06005859375, "learning_rate": 7.396430847673677e-07, "loss": 0.0024, "reward": 1.3516507148742676, "reward_std": 0.0791245549917221, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3516506552696228, "step": 817 }, { "completion_length": 213.984375, "epoch": 0.2606755895474825, "grad_norm": 7.895339488983154, "kl": 0.091796875, "learning_rate": 7.393244104525175e-07, "loss": 0.0037, "reward": 1.6286418437957764, "reward_std": 0.1728130280971527, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45676690340042114, "rewards/pad": 0.171875, "step": 818 }, { "completion_length": 238.34375, "epoch": 0.2609942638623327, "grad_norm": 6.759038925170898, "kl": 0.08740234375, "learning_rate": 7.390057361376673e-07, "loss": 0.0035, "reward": 1.5328681468963623, "reward_std": 0.060063887387514114, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5328680872917175, "rewards/pad": 0.0, "step": 819 }, { "completion_length": 272.8125, "epoch": 0.26131293817718293, "grad_norm": 15.274812698364258, "kl": 0.07177734375, "learning_rate": 7.38687061822817e-07, "loss": 0.0029, "reward": 1.5370078086853027, "reward_std": 0.24703297019004822, "rewards/pad": 0.21875, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.3495078682899475, "step": 820 }, { "completion_length": 297.90625, "epoch": 0.26163161249203315, "grad_norm": 46.81049346923828, "kl": 0.07958984375, "learning_rate": 7.383683875079668e-07, "loss": 0.0032, "reward": 1.7406848669052124, "reward_std": 0.13599297404289246, "rewards/pad": 0.359375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3969349265098572, "step": 821 }, { "completion_length": 189.015625, "epoch": 0.26195028680688337, "grad_norm": 7.546118259429932, "kl": 0.09326171875, "learning_rate": 7.380497131931166e-07, "loss": 0.0037, "reward": 1.5959341526031494, "reward_std": 0.05791737139225006, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47093406319618225, "step": 822 }, { "completion_length": 254.265625, "epoch": 0.2622689611217336, "grad_norm": 6.935544490814209, "kl": 0.09375, "learning_rate": 7.377310388782664e-07, "loss": 0.0037, "reward": 1.5564478635787964, "reward_std": 0.0899038165807724, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5564479231834412, "rewards/pad": 0.0, "step": 823 }, { "completion_length": 343.359375, "epoch": 0.2625876354365838, "grad_norm": 6.846329689025879, "kl": 0.06396484375, "learning_rate": 7.374123645634161e-07, "loss": 0.0026, "reward": 1.5198067426681519, "reward_std": 0.1529795527458191, "rewards/answer_reward": 0.203125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.33230674266815186, "step": 824 }, { "completion_length": 214.453125, "epoch": 0.26290630975143403, "grad_norm": 14.635757446289062, "kl": 0.09423828125, "learning_rate": 7.370936902485659e-07, "loss": 0.0038, "reward": 1.3967777490615845, "reward_std": 0.2116951197385788, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.33427777886390686, "rewards/pad": 0.0625, "step": 825 }, { "completion_length": 299.140625, "epoch": 0.26322498406628425, "grad_norm": 21.44894027709961, "kl": 0.0654296875, "learning_rate": 7.367750159337157e-07, "loss": 0.0026, "reward": 1.5391669273376465, "reward_std": 0.18289467692375183, "rewards/pad": 0.15625, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.41416695713996887, "step": 826 }, { "completion_length": 222.34375, "epoch": 0.26354365838113447, "grad_norm": 7.3377180099487305, "kl": 0.1015625, "learning_rate": 7.364563416188655e-07, "loss": 0.0041, "reward": 1.3520796298980713, "reward_std": 0.0663057416677475, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3520796298980713, "rewards/pad": 0.0, "step": 827 }, { "completion_length": 245.6875, "epoch": 0.2638623326959847, "grad_norm": 11.794517517089844, "kl": 0.150390625, "learning_rate": 7.361376673040152e-07, "loss": 0.006, "reward": 1.4975171089172363, "reward_std": 0.09263218194246292, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49751707911491394, "rewards/pad": 0.0, "step": 828 }, { "completion_length": 163.625, "epoch": 0.2641810070108349, "grad_norm": 10.353131294250488, "kl": 0.1220703125, "learning_rate": 7.35818992989165e-07, "loss": 0.0049, "reward": 1.4297380447387695, "reward_std": 0.11561650782823563, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3984881043434143, "rewards/pad": 0.03125, "step": 829 }, { "completion_length": 366.953125, "epoch": 0.26449968132568513, "grad_norm": 5.801058292388916, "kl": 0.045166015625, "learning_rate": 7.355003186743149e-07, "loss": 0.0018, "reward": 1.5223000049591064, "reward_std": 0.05760771036148071, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3972998559474945, "rewards/pad": 0.125, "step": 830 }, { "completion_length": 317.1875, "epoch": 0.26481835564053535, "grad_norm": 5.280611038208008, "kl": 0.052734375, "learning_rate": 7.351816443594647e-07, "loss": 0.0021, "reward": 1.3706518411636353, "reward_std": 0.049535803496837616, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.37065184116363525, "rewards/pad": 0.0, "step": 831 }, { "completion_length": 245.953125, "epoch": 0.2651370299553856, "grad_norm": 13.63158130645752, "kl": 0.0771484375, "learning_rate": 7.348629700446144e-07, "loss": 0.0031, "reward": 1.5924638509750366, "reward_std": 0.1569618582725525, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4987138509750366, "step": 832 }, { "completion_length": 306.5, "epoch": 0.2654557042702358, "grad_norm": 5.494880676269531, "kl": 0.0712890625, "learning_rate": 7.345442957297642e-07, "loss": 0.0028, "reward": 1.4672374725341797, "reward_std": 0.05155729874968529, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4672374725341797, "rewards/pad": 0.0, "step": 833 }, { "completion_length": 140.21875, "epoch": 0.26577437858508607, "grad_norm": 7.520358085632324, "kl": 0.1171875, "learning_rate": 7.34225621414914e-07, "loss": 0.0047, "reward": 1.6826391220092773, "reward_std": 0.14447268843650818, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5732640624046326, "step": 834 }, { "completion_length": 138.9375, "epoch": 0.2660930528999363, "grad_norm": 13.060035705566406, "kl": 0.1044921875, "learning_rate": 7.339069471000637e-07, "loss": 0.0042, "reward": 1.6643251180648804, "reward_std": 0.17719486355781555, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5705751180648804, "rewards/pad": 0.09375, "step": 835 }, { "completion_length": 344.078125, "epoch": 0.2664117272147865, "grad_norm": 11.838889122009277, "kl": 0.0712890625, "learning_rate": 7.335882727852135e-07, "loss": 0.0029, "reward": 1.3920658826828003, "reward_std": 0.11614827811717987, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.3139408528804779, "step": 836 }, { "completion_length": 219.90625, "epoch": 0.26673040152963673, "grad_norm": 15.134505271911621, "kl": 0.09716796875, "learning_rate": 7.332695984703633e-07, "loss": 0.0039, "reward": 1.1549283266067505, "reward_std": 0.12660345435142517, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.1705532819032669, "step": 837 }, { "completion_length": 292.984375, "epoch": 0.26704907584448695, "grad_norm": 7.054495334625244, "kl": 0.064453125, "learning_rate": 7.329509241555131e-07, "loss": 0.0026, "reward": 1.4501910209655762, "reward_std": 0.1041649878025055, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4345659911632538, "rewards/pad": 0.015625, "step": 838 }, { "completion_length": 229.09375, "epoch": 0.26736775015933717, "grad_norm": 11.810515403747559, "kl": 0.10498046875, "learning_rate": 7.326322498406628e-07, "loss": 0.0042, "reward": 1.550565242767334, "reward_std": 0.14647451043128967, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5661901235580444, "rewards/pad": 0.0, "step": 839 }, { "completion_length": 283.015625, "epoch": 0.2676864244741874, "grad_norm": 5.316693305969238, "kl": 0.06298828125, "learning_rate": 7.323135755258126e-07, "loss": 0.0025, "reward": 1.310988187789917, "reward_std": 0.17328143119812012, "rewards/answer_reward": 0.078125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.24848829209804535, "step": 840 }, { "completion_length": 235.046875, "epoch": 0.2680050987890376, "grad_norm": 8.124117851257324, "kl": 0.09375, "learning_rate": 7.319949012109624e-07, "loss": 0.0038, "reward": 1.5256720781326294, "reward_std": 0.09780138731002808, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5256721377372742, "step": 841 }, { "completion_length": 192.15625, "epoch": 0.26832377310388783, "grad_norm": 15.035517692565918, "kl": 0.09375, "learning_rate": 7.316762268961122e-07, "loss": 0.0038, "reward": 1.4462971687316895, "reward_std": 0.07383842766284943, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44629716873168945, "rewards/pad": 0.0, "step": 842 }, { "completion_length": 194.171875, "epoch": 0.26864244741873805, "grad_norm": 14.58903980255127, "kl": 0.09619140625, "learning_rate": 7.313575525812619e-07, "loss": 0.0039, "reward": 1.7406694889068604, "reward_std": 0.08720827102661133, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4906695485115051, "step": 843 }, { "completion_length": 204.609375, "epoch": 0.26896112173358827, "grad_norm": 15.045029640197754, "kl": 0.07568359375, "learning_rate": 7.310388782664116e-07, "loss": 0.003, "reward": 1.4478912353515625, "reward_std": 0.1405174881219864, "rewards/answer_reward": 0.03125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.41664132475852966, "step": 844 }, { "completion_length": 233.0625, "epoch": 0.2692797960484385, "grad_norm": 7.217484474182129, "kl": 0.076171875, "learning_rate": 7.307202039515614e-07, "loss": 0.0031, "reward": 1.3446323871612549, "reward_std": 0.048448316752910614, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3446323573589325, "rewards/pad": 0.0, "step": 845 }, { "completion_length": 316.203125, "epoch": 0.2695984703632887, "grad_norm": 7.464609146118164, "kl": 0.059326171875, "learning_rate": 7.304015296367112e-07, "loss": 0.0024, "reward": 1.4307100772857666, "reward_std": 0.08219650387763977, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43070995807647705, "rewards/pad": 0.0, "step": 846 }, { "completion_length": 240.34375, "epoch": 0.26991714467813893, "grad_norm": 24.719892501831055, "kl": 0.0830078125, "learning_rate": 7.300828553218609e-07, "loss": 0.0033, "reward": 1.3982957601547241, "reward_std": 0.12989813089370728, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4139207601547241, "step": 847 }, { "completion_length": 259.9375, "epoch": 0.27023581899298915, "grad_norm": 6.298829555511475, "kl": 0.064453125, "learning_rate": 7.297641810070108e-07, "loss": 0.0026, "reward": 1.451304316520691, "reward_std": 0.09524716436862946, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4669293463230133, "step": 848 }, { "completion_length": 193.5, "epoch": 0.27055449330783937, "grad_norm": 10.255414962768555, "kl": 0.09521484375, "learning_rate": 7.294455066921606e-07, "loss": 0.0038, "reward": 1.4266576766967773, "reward_std": 0.12549547851085663, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.39540770649909973, "step": 849 }, { "completion_length": 196.390625, "epoch": 0.2708731676226896, "grad_norm": 11.679965019226074, "kl": 0.10546875, "learning_rate": 7.291268323773104e-07, "loss": 0.0042, "reward": 1.5198023319244385, "reward_std": 0.17359545826911926, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5198023319244385, "rewards/pad": 0.0, "step": 850 }, { "completion_length": 215.90625, "epoch": 0.2711918419375398, "grad_norm": 12.998888969421387, "kl": 0.109375, "learning_rate": 7.288081580624601e-07, "loss": 0.0044, "reward": 1.6264359951019287, "reward_std": 0.07382266968488693, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5014359354972839, "step": 851 }, { "completion_length": 260.921875, "epoch": 0.27151051625239003, "grad_norm": 11.922432899475098, "kl": 0.0810546875, "learning_rate": 7.284894837476099e-07, "loss": 0.0032, "reward": 1.342989444732666, "reward_std": 0.15005718171596527, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.34298938512802124, "rewards/pad": 0.015625, "step": 852 }, { "completion_length": 192.109375, "epoch": 0.27182919056724025, "grad_norm": 8.800082206726074, "kl": 0.09033203125, "learning_rate": 7.281708094327597e-07, "loss": 0.0036, "reward": 1.6281015872955322, "reward_std": 0.08344338089227676, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6281015872955322, "rewards/pad": 0.0, "step": 853 }, { "completion_length": 140.328125, "epoch": 0.27214786488209053, "grad_norm": 448.08642578125, "kl": 0.0966796875, "learning_rate": 7.278521351179095e-07, "loss": 0.0039, "reward": 1.8013361692428589, "reward_std": 0.09915797412395477, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.8013361692428589, "rewards/pad": 0.0, "step": 854 }, { "completion_length": 185.484375, "epoch": 0.27246653919694075, "grad_norm": 18.73372459411621, "kl": 0.10693359375, "learning_rate": 7.275334608030592e-07, "loss": 0.0043, "reward": 1.503127932548523, "reward_std": 0.08186651021242142, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5031278729438782, "step": 855 }, { "completion_length": 183.21875, "epoch": 0.27278521351179097, "grad_norm": 7.774461269378662, "kl": 0.0927734375, "learning_rate": 7.27214786488209e-07, "loss": 0.0037, "reward": 1.4252768754959106, "reward_std": 0.07498372346162796, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42527687549591064, "rewards/pad": 0.0, "step": 856 }, { "completion_length": 146.203125, "epoch": 0.2731038878266412, "grad_norm": 19.676912307739258, "kl": 0.11279296875, "learning_rate": 7.268961121733588e-07, "loss": 0.0045, "reward": 1.4774161577224731, "reward_std": 0.08733925223350525, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47741612792015076, "step": 857 }, { "completion_length": 256.875, "epoch": 0.2734225621414914, "grad_norm": 6.925992488861084, "kl": 0.09130859375, "learning_rate": 7.265774378585086e-07, "loss": 0.0036, "reward": 1.4271292686462402, "reward_std": 0.08978510648012161, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.427129328250885, "rewards/pad": 0.0, "step": 858 }, { "completion_length": 152.0625, "epoch": 0.27374123645634163, "grad_norm": 7.952486991882324, "kl": 0.08740234375, "learning_rate": 7.262587635436583e-07, "loss": 0.0035, "reward": 1.6472711563110352, "reward_std": 0.11177260428667068, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.41289621591567993, "rewards/pad": 0.25, "step": 859 }, { "completion_length": 257.03125, "epoch": 0.27405991077119185, "grad_norm": 9.25637435913086, "kl": 0.08447265625, "learning_rate": 7.259400892288081e-07, "loss": 0.0034, "reward": 1.4652546644210815, "reward_std": 0.06389153748750687, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.46525469422340393, "step": 860 }, { "completion_length": 141.109375, "epoch": 0.27437858508604207, "grad_norm": 9.085542678833008, "kl": 0.10791015625, "learning_rate": 7.256214149139579e-07, "loss": 0.0043, "reward": 1.6769859790802002, "reward_std": 0.14516539871692657, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.614486038684845, "rewards/pad": 0.0625, "step": 861 }, { "completion_length": 237.296875, "epoch": 0.2746972594008923, "grad_norm": 7.306424140930176, "kl": 0.07958984375, "learning_rate": 7.253027405991076e-07, "loss": 0.0032, "reward": 1.5290510654449463, "reward_std": 0.08280518651008606, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5290510654449463, "rewards/pad": 0.0, "step": 862 }, { "completion_length": 193.671875, "epoch": 0.2750159337157425, "grad_norm": 12.244972229003906, "kl": 0.0849609375, "learning_rate": 7.249840662842574e-07, "loss": 0.0034, "reward": 1.6881325244903564, "reward_std": 0.16697387397289276, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4693824350833893, "rewards/pad": 0.234375, "step": 863 }, { "completion_length": 176.578125, "epoch": 0.27533460803059273, "grad_norm": 29.74585723876953, "kl": 0.07763671875, "learning_rate": 7.246653919694072e-07, "loss": 0.0031, "reward": 1.7264668941497803, "reward_std": 0.14079639315605164, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.6170920133590698, "rewards/pad": 0.140625, "step": 864 }, { "completion_length": 275.171875, "epoch": 0.27565328234544295, "grad_norm": 14.265174865722656, "kl": 0.06494140625, "learning_rate": 7.24346717654557e-07, "loss": 0.0026, "reward": 1.6074364185333252, "reward_std": 0.19189007580280304, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.38868647813796997, "step": 865 }, { "completion_length": 186.53125, "epoch": 0.27597195666029317, "grad_norm": 5.63706636428833, "kl": 0.0673828125, "learning_rate": 7.240280433397067e-07, "loss": 0.0027, "reward": 1.4767136573791504, "reward_std": 0.1421107053756714, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3517136573791504, "step": 866 }, { "completion_length": 253.46875, "epoch": 0.2762906309751434, "grad_norm": 214.72650146484375, "kl": 0.076171875, "learning_rate": 7.237093690248566e-07, "loss": 0.003, "reward": 1.5148383378982544, "reward_std": 0.11171391606330872, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5148383378982544, "step": 867 }, { "completion_length": 242.109375, "epoch": 0.2766093052899936, "grad_norm": 17.771141052246094, "kl": 0.107421875, "learning_rate": 7.233906947100064e-07, "loss": 0.0043, "reward": 1.527846097946167, "reward_std": 0.10529676079750061, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.40284615755081177, "step": 868 }, { "completion_length": 236.796875, "epoch": 0.27692797960484383, "grad_norm": 8.183026313781738, "kl": 0.0625, "learning_rate": 7.230720203951562e-07, "loss": 0.0025, "reward": 1.6474695205688477, "reward_std": 0.07472085952758789, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.39746958017349243, "rewards/pad": 0.25, "step": 869 }, { "completion_length": 97.1875, "epoch": 0.27724665391969405, "grad_norm": 14.428912162780762, "kl": 0.1298828125, "learning_rate": 7.227533460803059e-07, "loss": 0.0052, "reward": 1.3619276285171509, "reward_std": 0.1646977812051773, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.37755271792411804, "rewards/pad": 0.0, "step": 870 }, { "completion_length": 140.953125, "epoch": 0.2775653282345443, "grad_norm": 9.364572525024414, "kl": 0.166015625, "learning_rate": 7.224346717654557e-07, "loss": 0.0067, "reward": 1.5766136646270752, "reward_std": 0.1297195702791214, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4828636944293976, "rewards/pad": 0.09375, "step": 871 }, { "completion_length": 227.78125, "epoch": 0.2778840025493945, "grad_norm": 53.78718185424805, "kl": 0.08984375, "learning_rate": 7.221159974506055e-07, "loss": 0.0036, "reward": 1.5020396709442139, "reward_std": 0.06564459204673767, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5020396113395691, "rewards/pad": 0.0, "step": 872 }, { "completion_length": 215.78125, "epoch": 0.27820267686424477, "grad_norm": 19.729019165039062, "kl": 0.08349609375, "learning_rate": 7.217973231357553e-07, "loss": 0.0033, "reward": 1.5267720222473145, "reward_std": 0.13667306303977966, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.47989708185195923, "rewards/pad": 0.0625, "step": 873 }, { "completion_length": 203.4375, "epoch": 0.278521351179095, "grad_norm": 22.31565284729004, "kl": 0.07080078125, "learning_rate": 7.21478648820905e-07, "loss": 0.0028, "reward": 1.5913581848144531, "reward_std": 0.09478233754634857, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4663581848144531, "step": 874 }, { "completion_length": 168.90625, "epoch": 0.2788400254939452, "grad_norm": 8.827746391296387, "kl": 0.10986328125, "learning_rate": 7.211599745060548e-07, "loss": 0.0044, "reward": 1.4382688999176025, "reward_std": 0.1273890733718872, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4382690191268921, "rewards/pad": 0.0, "step": 875 }, { "completion_length": 296.96875, "epoch": 0.27915869980879543, "grad_norm": 7.749444484710693, "kl": 0.07421875, "learning_rate": 7.208413001912046e-07, "loss": 0.003, "reward": 1.5172462463378906, "reward_std": 0.10500482469797134, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5172461867332458, "rewards/pad": 0.0, "step": 876 }, { "completion_length": 272.921875, "epoch": 0.27947737412364565, "grad_norm": 7.46060848236084, "kl": 0.0654296875, "learning_rate": 7.205226258763544e-07, "loss": 0.0026, "reward": 1.5742771625518799, "reward_std": 0.04187481105327606, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44927719235420227, "rewards/pad": 0.125, "step": 877 }, { "completion_length": 258.484375, "epoch": 0.27979604843849587, "grad_norm": 27.48328971862793, "kl": 0.0693359375, "learning_rate": 7.202039515615041e-07, "loss": 0.0028, "reward": 1.642381191253662, "reward_std": 0.05967498570680618, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3923811912536621, "rewards/pad": 0.25, "step": 878 }, { "completion_length": 246.8125, "epoch": 0.2801147227533461, "grad_norm": 10.169364929199219, "kl": 0.06494140625, "learning_rate": 7.198852772466539e-07, "loss": 0.0026, "reward": 1.5621225833892822, "reward_std": 0.11288601160049438, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5308725833892822, "step": 879 }, { "completion_length": 247.578125, "epoch": 0.2804333970681963, "grad_norm": 8.097737312316895, "kl": 0.0703125, "learning_rate": 7.195666029318037e-07, "loss": 0.0028, "reward": 1.6014553308486938, "reward_std": 0.06341607868671417, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47645536065101624, "step": 880 }, { "completion_length": 238.84375, "epoch": 0.28075207138304653, "grad_norm": 16.25465202331543, "kl": 0.07666015625, "learning_rate": 7.192479286169535e-07, "loss": 0.0031, "reward": 1.5091499090194702, "reward_std": 0.077355295419693, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5091499090194702, "step": 881 }, { "completion_length": 206.546875, "epoch": 0.28107074569789675, "grad_norm": 26.487606048583984, "kl": 0.07177734375, "learning_rate": 7.189292543021032e-07, "loss": 0.0029, "reward": 1.6797242164611816, "reward_std": 0.10398919880390167, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.42972421646118164, "step": 882 }, { "completion_length": 191.9375, "epoch": 0.28138942001274697, "grad_norm": 11.197914123535156, "kl": 0.087890625, "learning_rate": 7.186105799872529e-07, "loss": 0.0035, "reward": 1.8811314105987549, "reward_std": 0.09780030697584152, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6311314105987549, "rewards/pad": 0.25, "step": 883 }, { "completion_length": 248.859375, "epoch": 0.2817080943275972, "grad_norm": 11.701794624328613, "kl": 0.0791015625, "learning_rate": 7.182919056724027e-07, "loss": 0.0032, "reward": 1.3616845607757568, "reward_std": 0.11649852991104126, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.37730950117111206, "step": 884 }, { "completion_length": 176.015625, "epoch": 0.2820267686424474, "grad_norm": 13.844351768493652, "kl": 0.1103515625, "learning_rate": 7.179732313575525e-07, "loss": 0.0044, "reward": 1.5564985275268555, "reward_std": 0.0744241252541542, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5564984679222107, "rewards/pad": 0.0, "step": 885 }, { "completion_length": 216.5, "epoch": 0.28234544295729763, "grad_norm": 16.154176712036133, "kl": 0.08056640625, "learning_rate": 7.176545570427023e-07, "loss": 0.0032, "reward": 1.5752778053283691, "reward_std": 0.09883183240890503, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5909029245376587, "rewards/pad": 0.0, "step": 886 }, { "completion_length": 170.671875, "epoch": 0.28266411727214785, "grad_norm": 15.331693649291992, "kl": 0.091796875, "learning_rate": 7.173358827278521e-07, "loss": 0.0037, "reward": 1.3783342838287354, "reward_std": 0.18478818237781525, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3627093434333801, "step": 887 }, { "completion_length": 302.640625, "epoch": 0.2829827915869981, "grad_norm": 11.60513973236084, "kl": 0.06787109375, "learning_rate": 7.170172084130019e-07, "loss": 0.0027, "reward": 1.373932123184204, "reward_std": 0.09270681440830231, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3739321827888489, "rewards/pad": 0.0, "step": 888 }, { "completion_length": 144.0625, "epoch": 0.2833014659018483, "grad_norm": 15.91201400756836, "kl": 0.08203125, "learning_rate": 7.166985340981517e-07, "loss": 0.0033, "reward": 1.8487534523010254, "reward_std": 0.1471112072467804, "rewards/answer_reward": 0.3125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5362533926963806, "step": 889 }, { "completion_length": 182.625, "epoch": 0.2836201402166985, "grad_norm": 19.750171661376953, "kl": 0.103515625, "learning_rate": 7.163798597833014e-07, "loss": 0.0042, "reward": 1.5224273204803467, "reward_std": 0.11883044987916946, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3974272608757019, "rewards/pad": 0.125, "step": 890 }, { "completion_length": 238.671875, "epoch": 0.28393881453154873, "grad_norm": 16.4511661529541, "kl": 0.07421875, "learning_rate": 7.160611854684512e-07, "loss": 0.003, "reward": 1.7034804821014404, "reward_std": 0.14997044205665588, "rewards/answer_reward": 0.21875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.48473045229911804, "step": 891 }, { "completion_length": 259.921875, "epoch": 0.28425748884639895, "grad_norm": 10.787720680236816, "kl": 0.08056640625, "learning_rate": 7.15742511153601e-07, "loss": 0.0032, "reward": 1.3430256843566895, "reward_std": 0.13406524062156677, "rewards/answer_reward": 0.09375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.24927571415901184, "step": 892 }, { "completion_length": 123.34375, "epoch": 0.28457616316124923, "grad_norm": 14.076187133789062, "kl": 0.09912109375, "learning_rate": 7.154238368387507e-07, "loss": 0.004, "reward": 1.7401636838912964, "reward_std": 0.17341656982898712, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4276636838912964, "rewards/pad": 0.3125, "step": 893 }, { "completion_length": 320.71875, "epoch": 0.28489483747609945, "grad_norm": 12.595187187194824, "kl": 0.0673828125, "learning_rate": 7.151051625239005e-07, "loss": 0.0027, "reward": 1.4644830226898193, "reward_std": 0.07389900088310242, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4644829034805298, "step": 894 }, { "completion_length": 217.75, "epoch": 0.28521351179094967, "grad_norm": 23.805479049682617, "kl": 0.072265625, "learning_rate": 7.147864882090503e-07, "loss": 0.0029, "reward": 1.5951404571533203, "reward_std": 0.19070225954055786, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4076404869556427, "rewards/pad": 0.1875, "step": 895 }, { "completion_length": 274.15625, "epoch": 0.2855321861057999, "grad_norm": 11.52724552154541, "kl": 0.057861328125, "learning_rate": 7.144678138942001e-07, "loss": 0.0023, "reward": 1.6295645236968994, "reward_std": 0.12622328102588654, "rewards/pad": 0.15625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4733145833015442, "step": 896 }, { "completion_length": 266.609375, "epoch": 0.2858508604206501, "grad_norm": 9.084197044372559, "kl": 0.07568359375, "learning_rate": 7.141491395793498e-07, "loss": 0.003, "reward": 1.4604337215423584, "reward_std": 0.048416588455438614, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4604337811470032, "rewards/pad": 0.0, "step": 897 }, { "completion_length": 195.125, "epoch": 0.28616953473550033, "grad_norm": 129.77342224121094, "kl": 0.09033203125, "learning_rate": 7.138304652644996e-07, "loss": 0.0036, "reward": 1.7572461366653442, "reward_std": 0.1534266173839569, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5384961366653442, "rewards/pad": 0.234375, "step": 898 }, { "completion_length": 303.796875, "epoch": 0.28648820905035055, "grad_norm": 6.984428405761719, "kl": 0.062255859375, "learning_rate": 7.135117909496494e-07, "loss": 0.0025, "reward": 1.5522358417510986, "reward_std": 0.1406959742307663, "rewards/pad": 0.21875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3334859311580658, "step": 899 }, { "completion_length": 177.53125, "epoch": 0.28680688336520077, "grad_norm": 10.488786697387695, "kl": 0.08642578125, "learning_rate": 7.131931166347992e-07, "loss": 0.0035, "reward": 1.5821726322174072, "reward_std": 0.06763388216495514, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.33217257261276245, "step": 900 }, { "completion_length": 236.484375, "epoch": 0.287125557680051, "grad_norm": 12.510342597961426, "kl": 0.083984375, "learning_rate": 7.128744423199489e-07, "loss": 0.0033, "reward": 1.6787164211273193, "reward_std": 0.1262408196926117, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5693413615226746, "step": 901 }, { "completion_length": 238.9375, "epoch": 0.2874442319949012, "grad_norm": 9.29166316986084, "kl": 0.07421875, "learning_rate": 7.125557680050987e-07, "loss": 0.003, "reward": 1.4003498554229736, "reward_std": 0.14340917766094208, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.41597479581832886, "rewards/pad": 0.0, "step": 902 }, { "completion_length": 367.359375, "epoch": 0.28776290630975143, "grad_norm": 8.235332489013672, "kl": 0.0439453125, "learning_rate": 7.122370936902485e-07, "loss": 0.0018, "reward": 1.6242144107818604, "reward_std": 0.21928203105926514, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.953125, "rewards/tracking_iou_reward": 0.5460892915725708, "step": 903 }, { "completion_length": 156.9375, "epoch": 0.28808158062460165, "grad_norm": 15.566165924072266, "kl": 0.1142578125, "learning_rate": 7.119184193753984e-07, "loss": 0.0046, "reward": 1.7355973720550537, "reward_std": 0.16478106379508972, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6730973720550537, "rewards/pad": 0.0625, "step": 904 }, { "completion_length": 344.46875, "epoch": 0.28840025493945187, "grad_norm": 5.945833206176758, "kl": 0.06298828125, "learning_rate": 7.115997450605481e-07, "loss": 0.0025, "reward": 1.3633050918579102, "reward_std": 0.12290232628583908, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.37893012166023254, "rewards/pad": 0.0, "step": 905 }, { "completion_length": 116.40625, "epoch": 0.2887189292543021, "grad_norm": 10.99114990234375, "kl": 0.1484375, "learning_rate": 7.112810707456979e-07, "loss": 0.0059, "reward": 1.7536451816558838, "reward_std": 0.169193834066391, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.628645122051239, "rewards/pad": 0.125, "step": 906 }, { "completion_length": 288.375, "epoch": 0.2890376035691523, "grad_norm": 6.997311115264893, "kl": 0.06689453125, "learning_rate": 7.109623964308477e-07, "loss": 0.0027, "reward": 1.539267897605896, "reward_std": 0.12635095417499542, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.554892897605896, "step": 907 }, { "completion_length": 315.0625, "epoch": 0.28935627788400253, "grad_norm": 8.444843292236328, "kl": 0.06591796875, "learning_rate": 7.106437221159975e-07, "loss": 0.0026, "reward": 1.4306590557098389, "reward_std": 0.094997838139534, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.44628414511680603, "rewards/pad": 0.0, "step": 908 }, { "completion_length": 232.84375, "epoch": 0.28967495219885275, "grad_norm": 10.925509452819824, "kl": 0.0751953125, "learning_rate": 7.103250478011472e-07, "loss": 0.003, "reward": 1.534977912902832, "reward_std": 0.06576971709728241, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.534977912902832, "rewards/pad": 0.0, "step": 909 }, { "completion_length": 153.984375, "epoch": 0.289993626513703, "grad_norm": 9.139330863952637, "kl": 0.0947265625, "learning_rate": 7.10006373486297e-07, "loss": 0.0038, "reward": 1.4619795083999634, "reward_std": 0.0989057868719101, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4619795083999634, "rewards/pad": 0.0, "step": 910 }, { "completion_length": 303.1875, "epoch": 0.2903123008285532, "grad_norm": 10.7050142288208, "kl": 0.06396484375, "learning_rate": 7.096876991714468e-07, "loss": 0.0026, "reward": 1.5856952667236328, "reward_std": 0.08746597915887833, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.585695207118988, "step": 911 }, { "completion_length": 228.046875, "epoch": 0.29063097514340347, "grad_norm": 8.583359718322754, "kl": 0.064453125, "learning_rate": 7.093690248565966e-07, "loss": 0.0026, "reward": 1.718181848526001, "reward_std": 0.1566476970911026, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.46818190813064575, "step": 912 }, { "completion_length": 194.46875, "epoch": 0.2909496494582537, "grad_norm": 14.333474159240723, "kl": 0.09912109375, "learning_rate": 7.090503505417463e-07, "loss": 0.004, "reward": 1.5772305727005005, "reward_std": 0.15488551557064056, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4678555428981781, "rewards/pad": 0.125, "step": 913 }, { "completion_length": 313.671875, "epoch": 0.2912683237731039, "grad_norm": 15.910902976989746, "kl": 0.0576171875, "learning_rate": 7.087316762268961e-07, "loss": 0.0023, "reward": 1.5254408121109009, "reward_std": 0.11735684424638748, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4785658121109009, "rewards/pad": 0.046875, "step": 914 }, { "completion_length": 346.765625, "epoch": 0.29158699808795413, "grad_norm": 4.648716449737549, "kl": 0.05615234375, "learning_rate": 7.084130019120459e-07, "loss": 0.0023, "reward": 1.4281669855117798, "reward_std": 0.051169656217098236, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4281669855117798, "step": 915 }, { "completion_length": 272.890625, "epoch": 0.29190567240280435, "grad_norm": 8.32939338684082, "kl": 0.08642578125, "learning_rate": 7.080943275971957e-07, "loss": 0.0035, "reward": 1.4463294744491577, "reward_std": 0.0748964250087738, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4463294744491577, "rewards/pad": 0.0, "step": 916 }, { "completion_length": 218.453125, "epoch": 0.29222434671765457, "grad_norm": 19.580448150634766, "kl": 0.07763671875, "learning_rate": 7.077756532823454e-07, "loss": 0.0031, "reward": 1.6393697261810303, "reward_std": 0.17384447157382965, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5299947261810303, "rewards/pad": 0.125, "step": 917 }, { "completion_length": 255.40625, "epoch": 0.2925430210325048, "grad_norm": 8.05824089050293, "kl": 0.07421875, "learning_rate": 7.074569789674952e-07, "loss": 0.003, "reward": 1.5728120803833008, "reward_std": 0.15007048845291138, "rewards/answer_reward": 0.140625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4321870505809784, "step": 918 }, { "completion_length": 320.453125, "epoch": 0.292861695347355, "grad_norm": 8.89892578125, "kl": 0.0517578125, "learning_rate": 7.07138304652645e-07, "loss": 0.0021, "reward": 1.5870790481567383, "reward_std": 0.1334977000951767, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5245789289474487, "step": 919 }, { "completion_length": 245.453125, "epoch": 0.29318036966220523, "grad_norm": 15.274248123168945, "kl": 0.103515625, "learning_rate": 7.068196303377948e-07, "loss": 0.0041, "reward": 1.6676104068756104, "reward_std": 0.2029799222946167, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4176103174686432, "rewards/pad": 0.25, "step": 920 }, { "completion_length": 287.90625, "epoch": 0.29349904397705545, "grad_norm": 6.506035804748535, "kl": 0.0888671875, "learning_rate": 7.065009560229445e-07, "loss": 0.0036, "reward": 1.3093442916870117, "reward_std": 0.17159739136695862, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.3405943810939789, "step": 921 }, { "completion_length": 230.25, "epoch": 0.29381771829190567, "grad_norm": 6.8670806884765625, "kl": 0.1015625, "learning_rate": 7.061822817080942e-07, "loss": 0.0041, "reward": 1.4682257175445557, "reward_std": 0.23254211246967316, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3744756579399109, "step": 922 }, { "completion_length": 216.6875, "epoch": 0.2941363926067559, "grad_norm": 13.328472137451172, "kl": 0.07861328125, "learning_rate": 7.05863607393244e-07, "loss": 0.0031, "reward": 1.7020766735076904, "reward_std": 0.0875585600733757, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5770766735076904, "step": 923 }, { "completion_length": 109.640625, "epoch": 0.2944550669216061, "grad_norm": 8.600102424621582, "kl": 0.11279296875, "learning_rate": 7.055449330783938e-07, "loss": 0.0045, "reward": 1.7592469453811646, "reward_std": 0.1416284590959549, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5248720049858093, "rewards/pad": 0.234375, "step": 924 }, { "completion_length": 282.90625, "epoch": 0.29477374123645633, "grad_norm": 7.736202716827393, "kl": 0.06689453125, "learning_rate": 7.052262587635436e-07, "loss": 0.0027, "reward": 1.674477219581604, "reward_std": 0.12267719209194183, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4244771897792816, "rewards/pad": 0.25, "step": 925 }, { "completion_length": 420.171875, "epoch": 0.29509241555130655, "grad_norm": 5.292170524597168, "kl": 0.044189453125, "learning_rate": 7.049075844486934e-07, "loss": 0.0018, "reward": 1.5019543170928955, "reward_std": 0.08003868907690048, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5019543170928955, "rewards/pad": 0.0, "step": 926 }, { "completion_length": 300.0625, "epoch": 0.2954110898661568, "grad_norm": 11.51095962524414, "kl": 0.07666015625, "learning_rate": 7.045889101338432e-07, "loss": 0.0031, "reward": 1.627131700515747, "reward_std": 0.07033547013998032, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5021317005157471, "step": 927 }, { "completion_length": 202.125, "epoch": 0.295729764181007, "grad_norm": 10.411168098449707, "kl": 0.08447265625, "learning_rate": 7.042702358189929e-07, "loss": 0.0034, "reward": 1.411263108253479, "reward_std": 0.11484085023403168, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3643881380558014, "rewards/pad": 0.046875, "step": 928 }, { "completion_length": 237.90625, "epoch": 0.2960484384958572, "grad_norm": 22.574514389038086, "kl": 0.115234375, "learning_rate": 7.039515615041427e-07, "loss": 0.0046, "reward": 1.503699779510498, "reward_std": 0.17890988290309906, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4880746901035309, "rewards/pad": 0.03125, "step": 929 }, { "completion_length": 209.921875, "epoch": 0.29636711281070743, "grad_norm": 6.581096172332764, "kl": 0.0849609375, "learning_rate": 7.036328871892925e-07, "loss": 0.0034, "reward": 1.4850600957870483, "reward_std": 0.18157213926315308, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.37568503618240356, "rewards/pad": 0.109375, "step": 930 }, { "completion_length": 385.5, "epoch": 0.29668578712555765, "grad_norm": 4.934545516967773, "kl": 0.055419921875, "learning_rate": 7.033142128744423e-07, "loss": 0.0022, "reward": 1.3227486610412598, "reward_std": 0.1512432098388672, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.24462367594242096, "step": 931 }, { "completion_length": 258.609375, "epoch": 0.29700446144040793, "grad_norm": 9.287094116210938, "kl": 0.1123046875, "learning_rate": 7.02995538559592e-07, "loss": 0.0045, "reward": 1.538500189781189, "reward_std": 0.14638297259807587, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46037521958351135, "rewards/pad": 0.078125, "step": 932 }, { "completion_length": 200.71875, "epoch": 0.29732313575525815, "grad_norm": 8.858744621276855, "kl": 0.09375, "learning_rate": 7.026768642447418e-07, "loss": 0.0038, "reward": 1.552154302597046, "reward_std": 0.07690747827291489, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5521542429924011, "step": 933 }, { "completion_length": 269.921875, "epoch": 0.29764181007010837, "grad_norm": 11.787874221801758, "kl": 0.07373046875, "learning_rate": 7.023581899298916e-07, "loss": 0.0029, "reward": 1.6634650230407715, "reward_std": 0.13022476434707642, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.36659008264541626, "rewards/pad": 0.296875, "step": 934 }, { "completion_length": 350.765625, "epoch": 0.2979604843849586, "grad_norm": 11.997943878173828, "kl": 0.0859375, "learning_rate": 7.020395156150414e-07, "loss": 0.0034, "reward": 1.3943145275115967, "reward_std": 0.18118952214717865, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3474395275115967, "step": 935 }, { "completion_length": 480.125, "epoch": 0.2982791586998088, "grad_norm": 4.998562335968018, "kl": 0.040771484375, "learning_rate": 7.017208413001911e-07, "loss": 0.0016, "reward": 1.3589204549789429, "reward_std": 0.010401003994047642, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.35892045497894287, "step": 936 }, { "completion_length": 206.6875, "epoch": 0.29859783301465903, "grad_norm": 10.145219802856445, "kl": 0.08984375, "learning_rate": 7.014021669853409e-07, "loss": 0.0036, "reward": 1.6010687351226807, "reward_std": 0.15740467607975006, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4760686159133911, "rewards/pad": 0.125, "step": 937 }, { "completion_length": 254.484375, "epoch": 0.29891650732950925, "grad_norm": 8.828103065490723, "kl": 0.08544921875, "learning_rate": 7.010834926704907e-07, "loss": 0.0034, "reward": 1.708954930305481, "reward_std": 0.05463279038667679, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.583954930305481, "rewards/pad": 0.125, "step": 938 }, { "completion_length": 208.828125, "epoch": 0.29923518164435947, "grad_norm": 9.465192794799805, "kl": 0.08349609375, "learning_rate": 7.007648183556405e-07, "loss": 0.0033, "reward": 1.4708402156829834, "reward_std": 0.15929856896400452, "rewards/answer_reward": 0.015625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4552152156829834, "step": 939 }, { "completion_length": 264.234375, "epoch": 0.2995538559592097, "grad_norm": 5.2820024490356445, "kl": 0.072265625, "learning_rate": 7.004461440407902e-07, "loss": 0.0029, "reward": 1.665083646774292, "reward_std": 0.14066141843795776, "rewards/answer_reward": 0.15625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5088337063789368, "step": 940 }, { "completion_length": 341.015625, "epoch": 0.2998725302740599, "grad_norm": 3.734297275543213, "kl": 0.05712890625, "learning_rate": 7.0012746972594e-07, "loss": 0.0023, "reward": 1.414062738418579, "reward_std": 0.04965207725763321, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.41406285762786865, "step": 941 }, { "completion_length": 198.28125, "epoch": 0.30019120458891013, "grad_norm": 13.466355323791504, "kl": 0.06982421875, "learning_rate": 6.998087954110899e-07, "loss": 0.0028, "reward": 1.7419767379760742, "reward_std": 0.22685980796813965, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.44510167837142944, "rewards/pad": 0.3125, "step": 942 }, { "completion_length": 252.84375, "epoch": 0.30050987890376035, "grad_norm": 8.96711254119873, "kl": 0.09521484375, "learning_rate": 6.994901210962397e-07, "loss": 0.0038, "reward": 1.5406872034072876, "reward_std": 0.22085599601268768, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5406872034072876, "rewards/pad": 0.03125, "step": 943 }, { "completion_length": 164.78125, "epoch": 0.3008285532186106, "grad_norm": 33.10133361816406, "kl": 0.10009765625, "learning_rate": 6.991714467813894e-07, "loss": 0.004, "reward": 1.4604114294052124, "reward_std": 0.23422494530677795, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4291614294052124, "rewards/pad": 0.046875, "step": 944 }, { "completion_length": 267.5, "epoch": 0.3011472275334608, "grad_norm": 9.718307495117188, "kl": 0.09130859375, "learning_rate": 6.988527724665392e-07, "loss": 0.0037, "reward": 1.4997873306274414, "reward_std": 0.08561849594116211, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.499787300825119, "rewards/pad": 0.0, "step": 945 }, { "completion_length": 322.53125, "epoch": 0.301465901848311, "grad_norm": 6.480549335479736, "kl": 0.08447265625, "learning_rate": 6.98534098151689e-07, "loss": 0.0034, "reward": 1.5078763961791992, "reward_std": 0.18586468696594238, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44537633657455444, "rewards/pad": 0.0625, "step": 946 }, { "completion_length": 241.09375, "epoch": 0.30178457616316123, "grad_norm": 102.33425903320312, "kl": 0.091796875, "learning_rate": 6.982154238368388e-07, "loss": 0.0037, "reward": 1.5356953144073486, "reward_std": 0.0902363583445549, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5356953144073486, "rewards/pad": 0.0, "step": 947 }, { "completion_length": 360.828125, "epoch": 0.30210325047801145, "grad_norm": 8.094786643981934, "kl": 0.07275390625, "learning_rate": 6.978967495219885e-07, "loss": 0.0029, "reward": 1.5371003150939941, "reward_std": 0.11551005393266678, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4277253746986389, "step": 948 }, { "completion_length": 185.453125, "epoch": 0.3024219247928617, "grad_norm": 14.498106002807617, "kl": 0.09765625, "learning_rate": 6.975780752071383e-07, "loss": 0.0039, "reward": 1.489546537399292, "reward_std": 0.1398780643939972, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.473921537399292, "rewards/pad": 0.015625, "step": 949 }, { "completion_length": 255.578125, "epoch": 0.3027405991077119, "grad_norm": 6.012439727783203, "kl": 0.10498046875, "learning_rate": 6.972594008922881e-07, "loss": 0.0042, "reward": 1.3009319305419922, "reward_std": 0.11145664751529694, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.2853069305419922, "rewards/pad": 0.015625, "step": 950 }, { "completion_length": 307.671875, "epoch": 0.3030592734225621, "grad_norm": 8.013301849365234, "kl": 0.07421875, "learning_rate": 6.969407265774379e-07, "loss": 0.003, "reward": 1.384726881980896, "reward_std": 0.06668713688850403, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.384726881980896, "step": 951 }, { "completion_length": 421.75, "epoch": 0.3033779477374124, "grad_norm": 8.324105262756348, "kl": 0.06494140625, "learning_rate": 6.966220522625876e-07, "loss": 0.0026, "reward": 1.3232536315917969, "reward_std": 0.1271124929189682, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.35450369119644165, "step": 952 }, { "completion_length": 200.96875, "epoch": 0.3036966220522626, "grad_norm": 13.980183601379395, "kl": 0.078125, "learning_rate": 6.963033779477374e-07, "loss": 0.0031, "reward": 1.4275410175323486, "reward_std": 0.1396736055612564, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42754098773002625, "rewards/pad": 0.0, "step": 953 }, { "completion_length": 337.875, "epoch": 0.30401529636711283, "grad_norm": 9.594482421875, "kl": 0.08203125, "learning_rate": 6.959847036328872e-07, "loss": 0.0033, "reward": 1.5653290748596191, "reward_std": 0.13285420835018158, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4715789556503296, "step": 954 }, { "completion_length": 319.203125, "epoch": 0.30433397068196305, "grad_norm": 19.658885955810547, "kl": 0.06396484375, "learning_rate": 6.95666029318037e-07, "loss": 0.0026, "reward": 1.576867938041687, "reward_std": 0.08937421441078186, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.467492938041687, "step": 955 }, { "completion_length": 446.875, "epoch": 0.30465264499681327, "grad_norm": 4.599447250366211, "kl": 0.049072265625, "learning_rate": 6.953473550031867e-07, "loss": 0.002, "reward": 1.4357045888900757, "reward_std": 0.11575381457805634, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3419545590877533, "step": 956 }, { "completion_length": 283.515625, "epoch": 0.3049713193116635, "grad_norm": 13.620803833007812, "kl": 0.0869140625, "learning_rate": 6.950286806883365e-07, "loss": 0.0035, "reward": 1.4528107643127441, "reward_std": 0.076294906437397, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45281076431274414, "rewards/pad": 0.0, "step": 957 }, { "completion_length": 236.9375, "epoch": 0.3052899936265137, "grad_norm": 9.338239669799805, "kl": 0.0859375, "learning_rate": 6.947100063734863e-07, "loss": 0.0034, "reward": 1.412969946861267, "reward_std": 0.15528462827205658, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.2879698872566223, "rewards/pad": 0.125, "step": 958 }, { "completion_length": 278.078125, "epoch": 0.30560866794136393, "grad_norm": 7.011009693145752, "kl": 0.07861328125, "learning_rate": 6.94391332058636e-07, "loss": 0.0031, "reward": 1.4641616344451904, "reward_std": 0.1217249259352684, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.47978660464286804, "step": 959 }, { "completion_length": 203.078125, "epoch": 0.30592734225621415, "grad_norm": 12.84834098815918, "kl": 0.08837890625, "learning_rate": 6.940726577437858e-07, "loss": 0.0035, "reward": 1.4572542905807495, "reward_std": 0.22139707207679749, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.4728792905807495, "rewards/pad": 0.015625, "step": 960 }, { "completion_length": 292.71875, "epoch": 0.3062460165710644, "grad_norm": 17.683250427246094, "kl": 0.08203125, "learning_rate": 6.937539834289357e-07, "loss": 0.0033, "reward": 1.4127620458602905, "reward_std": 0.11772623658180237, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3190120756626129, "step": 961 }, { "completion_length": 159.9375, "epoch": 0.3065646908859146, "grad_norm": 12.655012130737305, "kl": 0.10546875, "learning_rate": 6.934353091140854e-07, "loss": 0.0042, "reward": 1.7172095775604248, "reward_std": 0.14537625014781952, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4984596073627472, "rewards/pad": 0.234375, "step": 962 }, { "completion_length": 157.9375, "epoch": 0.3068833652007648, "grad_norm": 10.002306938171387, "kl": 0.111328125, "learning_rate": 6.931166347992351e-07, "loss": 0.0045, "reward": 1.5363670587539673, "reward_std": 0.08813170343637466, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5363670587539673, "rewards/pad": 0.0, "step": 963 }, { "completion_length": 441.34375, "epoch": 0.30720203951561503, "grad_norm": 7.946275234222412, "kl": 0.052490234375, "learning_rate": 6.927979604843849e-07, "loss": 0.0021, "reward": 1.5722520351409912, "reward_std": 0.08100847154855728, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5878770351409912, "step": 964 }, { "completion_length": 269.84375, "epoch": 0.30752071383046525, "grad_norm": 6.707942962646484, "kl": 0.0859375, "learning_rate": 6.924792861695347e-07, "loss": 0.0034, "reward": 1.5477776527404785, "reward_std": 0.10902424156665802, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.4540277421474457, "rewards/pad": 0.125, "step": 965 }, { "completion_length": 257.84375, "epoch": 0.3078393881453155, "grad_norm": 7.706878185272217, "kl": 0.07763671875, "learning_rate": 6.921606118546845e-07, "loss": 0.0031, "reward": 1.347168207168579, "reward_std": 0.07446466386318207, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3471682071685791, "step": 966 }, { "completion_length": 212.40625, "epoch": 0.3081580624601657, "grad_norm": 10.741114616394043, "kl": 0.10791015625, "learning_rate": 6.918419375398342e-07, "loss": 0.0043, "reward": 1.5717949867248535, "reward_std": 0.10261975973844528, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5717950463294983, "step": 967 }, { "completion_length": 264.390625, "epoch": 0.3084767367750159, "grad_norm": 13.138110160827637, "kl": 0.10791015625, "learning_rate": 6.91523263224984e-07, "loss": 0.0043, "reward": 1.5748289823532104, "reward_std": 0.1230107843875885, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5904540419578552, "rewards/pad": 0.0, "step": 968 }, { "completion_length": 164.9375, "epoch": 0.30879541108986613, "grad_norm": 15.702935218811035, "kl": 0.10498046875, "learning_rate": 6.912045889101338e-07, "loss": 0.0042, "reward": 1.4226768016815186, "reward_std": 0.09258827567100525, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.31330177187919617, "rewards/pad": 0.109375, "step": 969 }, { "completion_length": 234.34375, "epoch": 0.30911408540471635, "grad_norm": 7.595775604248047, "kl": 0.09130859375, "learning_rate": 6.908859145952836e-07, "loss": 0.0037, "reward": 1.6368181705474854, "reward_std": 0.13625429570674896, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5274431109428406, "rewards/pad": 0.109375, "step": 970 }, { "completion_length": 213.921875, "epoch": 0.30943275971956663, "grad_norm": 12.487693786621094, "kl": 0.1376953125, "learning_rate": 6.905672402804333e-07, "loss": 0.0055, "reward": 1.4605860710144043, "reward_std": 0.07868099212646484, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4605861306190491, "rewards/pad": 0.0, "step": 971 }, { "completion_length": 219.09375, "epoch": 0.30975143403441685, "grad_norm": 7.416385173797607, "kl": 0.11962890625, "learning_rate": 6.902485659655831e-07, "loss": 0.0048, "reward": 1.4688482284545898, "reward_std": 0.09033405780792236, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46884816884994507, "rewards/pad": 0.0, "step": 972 }, { "completion_length": 356.0, "epoch": 0.31007010834926707, "grad_norm": 28.166015625, "kl": 0.06982421875, "learning_rate": 6.899298916507329e-07, "loss": 0.0028, "reward": 1.463951826095581, "reward_std": 0.14015373587608337, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.47957685589790344, "rewards/pad": 0.0, "step": 973 }, { "completion_length": 314.265625, "epoch": 0.3103887826641173, "grad_norm": 6.10435676574707, "kl": 0.060302734375, "learning_rate": 6.896112173358827e-07, "loss": 0.0024, "reward": 1.4725921154022217, "reward_std": 0.076809361577034, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3475921154022217, "step": 974 }, { "completion_length": 303.375, "epoch": 0.3107074569789675, "grad_norm": 17.244062423706055, "kl": 0.080078125, "learning_rate": 6.892925430210324e-07, "loss": 0.0032, "reward": 1.5629791021347046, "reward_std": 0.10295584797859192, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4536040127277374, "step": 975 }, { "completion_length": 313.078125, "epoch": 0.31102613129381773, "grad_norm": 6.946135997772217, "kl": 0.062255859375, "learning_rate": 6.889738687061822e-07, "loss": 0.0025, "reward": 1.5047985315322876, "reward_std": 0.18012316524982452, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.41104856133461, "step": 976 }, { "completion_length": 163.3125, "epoch": 0.31134480560866795, "grad_norm": 21.414690017700195, "kl": 0.09423828125, "learning_rate": 6.88655194391332e-07, "loss": 0.0038, "reward": 1.8146766424179077, "reward_std": 0.1765768826007843, "rewards/answer_reward": 0.3125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5021766424179077, "step": 977 }, { "completion_length": 248.625, "epoch": 0.31166347992351817, "grad_norm": 23.901901245117188, "kl": 0.07568359375, "learning_rate": 6.883365200764818e-07, "loss": 0.003, "reward": 1.4768123626708984, "reward_std": 0.11282205581665039, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4924373924732208, "rewards/pad": 0.0, "step": 978 }, { "completion_length": 333.125, "epoch": 0.3119821542383684, "grad_norm": 5.673859119415283, "kl": 0.05712890625, "learning_rate": 6.880178457616315e-07, "loss": 0.0023, "reward": 1.4660868644714355, "reward_std": 0.08494298905134201, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4817117750644684, "step": 979 }, { "completion_length": 258.796875, "epoch": 0.3123008285532186, "grad_norm": 12.043933868408203, "kl": 0.07958984375, "learning_rate": 6.876991714467814e-07, "loss": 0.0032, "reward": 1.6999037265777588, "reward_std": 0.17478086054325104, "rewards/pad": 0.203125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.496778666973114, "step": 980 }, { "completion_length": 331.9375, "epoch": 0.31261950286806883, "grad_norm": 4.478605270385742, "kl": 0.05419921875, "learning_rate": 6.873804971319312e-07, "loss": 0.0022, "reward": 1.4835875034332275, "reward_std": 0.1486971229314804, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3898375630378723, "step": 981 }, { "completion_length": 365.6875, "epoch": 0.31293817718291905, "grad_norm": 7.0540313720703125, "kl": 0.0703125, "learning_rate": 6.87061822817081e-07, "loss": 0.0028, "reward": 1.3310976028442383, "reward_std": 0.100373774766922, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.34672248363494873, "step": 982 }, { "completion_length": 296.3125, "epoch": 0.3132568514977693, "grad_norm": 43.66001892089844, "kl": 0.087890625, "learning_rate": 6.867431485022307e-07, "loss": 0.0035, "reward": 1.3896299600601196, "reward_std": 0.1810145378112793, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.327129989862442, "rewards/pad": 0.078125, "step": 983 }, { "completion_length": 221.3125, "epoch": 0.3135755258126195, "grad_norm": 7.561755657196045, "kl": 0.07958984375, "learning_rate": 6.864244741873805e-07, "loss": 0.0032, "reward": 1.6374634504318237, "reward_std": 0.13763760030269623, "rewards/answer_reward": 0.1875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.44996345043182373, "step": 984 }, { "completion_length": 334.921875, "epoch": 0.3138942001274697, "grad_norm": 9.691668510437012, "kl": 0.076171875, "learning_rate": 6.861057998725303e-07, "loss": 0.003, "reward": 1.5845732688903809, "reward_std": 0.1470044106245041, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5064482688903809, "step": 985 }, { "completion_length": 273.375, "epoch": 0.31421287444231993, "grad_norm": 7.891676902770996, "kl": 0.07177734375, "learning_rate": 6.8578712555768e-07, "loss": 0.0029, "reward": 1.649653434753418, "reward_std": 0.20933878421783447, "rewards/pad": 0.203125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.47777846455574036, "step": 986 }, { "completion_length": 300.1875, "epoch": 0.31453154875717015, "grad_norm": 8.156953811645508, "kl": 0.07373046875, "learning_rate": 6.854684512428298e-07, "loss": 0.003, "reward": 1.531054973602295, "reward_std": 0.07905551791191101, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5310550332069397, "rewards/pad": 0.0, "step": 987 }, { "completion_length": 386.78125, "epoch": 0.3148502230720204, "grad_norm": 8.51009750366211, "kl": 0.05078125, "learning_rate": 6.851497769279796e-07, "loss": 0.002, "reward": 1.456154704093933, "reward_std": 0.05731338635087013, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4561547040939331, "rewards/pad": 0.0, "step": 988 }, { "completion_length": 310.0625, "epoch": 0.3151688973868706, "grad_norm": 9.364920616149902, "kl": 0.078125, "learning_rate": 6.848311026131294e-07, "loss": 0.0031, "reward": 1.444016933441162, "reward_std": 0.11831048130989075, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4440169036388397, "step": 989 }, { "completion_length": 266.671875, "epoch": 0.3154875717017208, "grad_norm": 9.522704124450684, "kl": 0.08251953125, "learning_rate": 6.845124282982791e-07, "loss": 0.0033, "reward": 1.562047004699707, "reward_std": 0.1287516951560974, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.45267191529273987, "step": 990 }, { "completion_length": 273.453125, "epoch": 0.3158062460165711, "grad_norm": 11.734932899475098, "kl": 0.1044921875, "learning_rate": 6.841937539834289e-07, "loss": 0.0042, "reward": 1.616694450378418, "reward_std": 0.14030656218528748, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4135693609714508, "rewards/pad": 0.203125, "step": 991 }, { "completion_length": 195.390625, "epoch": 0.3161249203314213, "grad_norm": 8.491342544555664, "kl": 0.09716796875, "learning_rate": 6.838750796685787e-07, "loss": 0.0039, "reward": 1.4952812194824219, "reward_std": 0.1000450998544693, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4952811598777771, "step": 992 }, { "completion_length": 257.5625, "epoch": 0.31644359464627153, "grad_norm": 6.126900672912598, "kl": 0.0673828125, "learning_rate": 6.835564053537285e-07, "loss": 0.0027, "reward": 1.375550389289856, "reward_std": 0.06355856359004974, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.37555044889450073, "step": 993 }, { "completion_length": 296.40625, "epoch": 0.31676226896112175, "grad_norm": 6.934712886810303, "kl": 0.0537109375, "learning_rate": 6.832377310388782e-07, "loss": 0.0021, "reward": 1.6266264915466309, "reward_std": 0.18362689018249512, "rewards/pad": 0.140625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5016264915466309, "step": 994 }, { "completion_length": 314.453125, "epoch": 0.31708094327597197, "grad_norm": 8.887853622436523, "kl": 0.06298828125, "learning_rate": 6.82919056724028e-07, "loss": 0.0025, "reward": 1.5279200077056885, "reward_std": 0.06337783485651016, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4029200077056885, "rewards/pad": 0.125, "step": 995 }, { "completion_length": 367.015625, "epoch": 0.3173996175908222, "grad_norm": 12.307510375976562, "kl": 0.05712890625, "learning_rate": 6.826003824091778e-07, "loss": 0.0023, "reward": 1.352768898010254, "reward_std": 0.10241373628377914, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3683939576148987, "step": 996 }, { "completion_length": 228.46875, "epoch": 0.3177182919056724, "grad_norm": 30.698068618774414, "kl": 0.08544921875, "learning_rate": 6.822817080943276e-07, "loss": 0.0034, "reward": 1.551741123199463, "reward_std": 0.1621362864971161, "rewards/answer_reward": 0.0625, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5048661828041077, "step": 997 }, { "completion_length": 181.734375, "epoch": 0.31803696622052263, "grad_norm": 11.052029609680176, "kl": 0.09765625, "learning_rate": 6.819630337794773e-07, "loss": 0.0039, "reward": 1.4496850967407227, "reward_std": 0.08132076263427734, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.44968509674072266, "step": 998 }, { "completion_length": 226.171875, "epoch": 0.31835564053537285, "grad_norm": 7.181490898132324, "kl": 0.091796875, "learning_rate": 6.816443594646272e-07, "loss": 0.0037, "reward": 1.499758243560791, "reward_std": 0.09113484621047974, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5153831243515015, "rewards/pad": 0.0, "step": 999 }, { "completion_length": 305.421875, "epoch": 0.3186743148502231, "grad_norm": 6.9231977462768555, "kl": 0.0693359375, "learning_rate": 6.81325685149777e-07, "loss": 0.0028, "reward": 1.4802541732788086, "reward_std": 0.10031703859567642, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3865041732788086, "step": 1000 }, { "completion_length": 341.59375, "epoch": 0.3189929891650733, "grad_norm": 6.3926520347595215, "kl": 0.047119140625, "learning_rate": 6.810070108349267e-07, "loss": 0.0019, "reward": 1.6489269733428955, "reward_std": 0.08554818481206894, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4145520329475403, "step": 1001 }, { "completion_length": 300.484375, "epoch": 0.3193116634799235, "grad_norm": 10.810160636901855, "kl": 0.1005859375, "learning_rate": 6.806883365200764e-07, "loss": 0.004, "reward": 1.564328908920288, "reward_std": 0.13377408683300018, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4549539089202881, "rewards/pad": 0.125, "step": 1002 }, { "completion_length": 297.421875, "epoch": 0.31963033779477373, "grad_norm": 6.0769243240356445, "kl": 0.0810546875, "learning_rate": 6.803696622052262e-07, "loss": 0.0032, "reward": 1.5560057163238525, "reward_std": 0.06804891675710678, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43100565671920776, "step": 1003 }, { "completion_length": 156.453125, "epoch": 0.31994901210962395, "grad_norm": 38.39896011352539, "kl": 0.138671875, "learning_rate": 6.80050987890376e-07, "loss": 0.0056, "reward": 1.532203197479248, "reward_std": 0.11442001163959503, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.532203197479248, "rewards/pad": 0.0, "step": 1004 }, { "completion_length": 117.546875, "epoch": 0.3202676864244742, "grad_norm": 7.369303226470947, "kl": 0.11962890625, "learning_rate": 6.797323135755258e-07, "loss": 0.0048, "reward": 1.6599302291870117, "reward_std": 0.10183878988027573, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.40993016958236694, "rewards/pad": 0.25, "step": 1005 }, { "completion_length": 138.515625, "epoch": 0.3205863607393244, "grad_norm": 14.212640762329102, "kl": 0.11376953125, "learning_rate": 6.794136392606755e-07, "loss": 0.0046, "reward": 1.5369373559951782, "reward_std": 0.1619691550731659, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.427562415599823, "rewards/pad": 0.125, "step": 1006 }, { "completion_length": 266.234375, "epoch": 0.3209050350541746, "grad_norm": 13.497116088867188, "kl": 0.06787109375, "learning_rate": 6.790949649458253e-07, "loss": 0.0027, "reward": 1.3564127683639526, "reward_std": 0.028475552797317505, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.35641270875930786, "step": 1007 }, { "completion_length": 250.4375, "epoch": 0.32122370936902483, "grad_norm": 15.16408634185791, "kl": 0.06640625, "learning_rate": 6.787762906309751e-07, "loss": 0.0026, "reward": 1.6797528266906738, "reward_std": 0.14224869012832642, "rewards/pad": 0.359375, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.3516278862953186, "step": 1008 }, { "completion_length": 304.734375, "epoch": 0.32154238368387505, "grad_norm": 8.194258689880371, "kl": 0.06787109375, "learning_rate": 6.784576163161249e-07, "loss": 0.0027, "reward": 1.4798452854156494, "reward_std": 0.08323159068822861, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4798452854156494, "step": 1009 }, { "completion_length": 147.609375, "epoch": 0.3218610579987253, "grad_norm": 27.742565155029297, "kl": 0.09423828125, "learning_rate": 6.781389420012746e-07, "loss": 0.0038, "reward": 2.128603935241699, "reward_std": 0.14139896631240845, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5036039352416992, "rewards/pad": 0.625, "step": 1010 }, { "completion_length": 166.9375, "epoch": 0.32217973231357555, "grad_norm": 14.250349044799805, "kl": 0.09521484375, "learning_rate": 6.778202676864244e-07, "loss": 0.0038, "reward": 1.4786567687988281, "reward_std": 0.13321171700954437, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.24428173899650574, "step": 1011 }, { "completion_length": 306.46875, "epoch": 0.32249840662842577, "grad_norm": 4.766030788421631, "kl": 0.06982421875, "learning_rate": 6.775015933715742e-07, "loss": 0.0028, "reward": 1.446626901626587, "reward_std": 0.09482300281524658, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4622519016265869, "step": 1012 }, { "completion_length": 207.09375, "epoch": 0.322817080943276, "grad_norm": 5.062275409698486, "kl": 0.11083984375, "learning_rate": 6.77182919056724e-07, "loss": 0.0044, "reward": 1.72430419921875, "reward_std": 0.10817757248878479, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.6149291396141052, "step": 1013 }, { "completion_length": 279.78125, "epoch": 0.3231357552581262, "grad_norm": 72.79349517822266, "kl": 0.072265625, "learning_rate": 6.768642447418737e-07, "loss": 0.0029, "reward": 1.5395193099975586, "reward_std": 0.22496607899665833, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.44576939940452576, "step": 1014 }, { "completion_length": 222.890625, "epoch": 0.32345442957297643, "grad_norm": 8.13277816772461, "kl": 0.07373046875, "learning_rate": 6.765455704270235e-07, "loss": 0.003, "reward": 1.5967485904693604, "reward_std": 0.10184575617313385, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3623736798763275, "step": 1015 }, { "completion_length": 203.046875, "epoch": 0.32377310388782665, "grad_norm": 7.447783946990967, "kl": 0.1044921875, "learning_rate": 6.762268961121733e-07, "loss": 0.0042, "reward": 1.6282625198364258, "reward_std": 0.11571632325649261, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5345126390457153, "step": 1016 }, { "completion_length": 128.625, "epoch": 0.3240917782026769, "grad_norm": 7.846916198730469, "kl": 0.1005859375, "learning_rate": 6.75908221797323e-07, "loss": 0.004, "reward": 1.5380330085754395, "reward_std": 0.10079097002744675, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4286580979824066, "rewards/pad": 0.109375, "step": 1017 }, { "completion_length": 328.453125, "epoch": 0.3244104525175271, "grad_norm": 6.111870288848877, "kl": 0.0546875, "learning_rate": 6.755895474824729e-07, "loss": 0.0022, "reward": 1.4796979427337646, "reward_std": 0.13149893283843994, "rewards/answer_reward": 0.03125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4484480023384094, "step": 1018 }, { "completion_length": 253.84375, "epoch": 0.3247291268323773, "grad_norm": 30.818262100219727, "kl": 0.0693359375, "learning_rate": 6.752708731676227e-07, "loss": 0.0028, "reward": 1.561851978302002, "reward_std": 0.11902665346860886, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.45247694849967957, "rewards/pad": 0.125, "step": 1019 }, { "completion_length": 249.4375, "epoch": 0.32504780114722753, "grad_norm": 16.430784225463867, "kl": 0.0732421875, "learning_rate": 6.749521988527725e-07, "loss": 0.0029, "reward": 1.5126055479049683, "reward_std": 0.12142893671989441, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.48135554790496826, "step": 1020 }, { "completion_length": 231.203125, "epoch": 0.32536647546207775, "grad_norm": 11.677145957946777, "kl": 0.08154296875, "learning_rate": 6.746335245379222e-07, "loss": 0.0033, "reward": 1.5022270679473877, "reward_std": 0.0498536080121994, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.37722718715667725, "rewards/pad": 0.125, "step": 1021 }, { "completion_length": 262.21875, "epoch": 0.325685149776928, "grad_norm": 6.868616580963135, "kl": 0.060302734375, "learning_rate": 6.74314850223072e-07, "loss": 0.0024, "reward": 1.6476733684539795, "reward_std": 0.11569195985794067, "rewards/pad": 0.15625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4914233982563019, "step": 1022 }, { "completion_length": 272.5, "epoch": 0.3260038240917782, "grad_norm": 5.200622081756592, "kl": 0.08544921875, "learning_rate": 6.739961759082218e-07, "loss": 0.0034, "reward": 1.463226556777954, "reward_std": 0.042159367352724075, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.33822667598724365, "step": 1023 }, { "completion_length": 138.390625, "epoch": 0.3263224984066284, "grad_norm": 22.599626541137695, "kl": 0.1357421875, "learning_rate": 6.736775015933716e-07, "loss": 0.0054, "reward": 1.4473992586135864, "reward_std": 0.08425348997116089, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4473992586135864, "rewards/pad": 0.0, "step": 1024 }, { "completion_length": 181.8125, "epoch": 0.32664117272147863, "grad_norm": 10.378974914550781, "kl": 0.10205078125, "learning_rate": 6.733588272785213e-07, "loss": 0.0041, "reward": 1.6889574527740479, "reward_std": 0.11436554789543152, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5795823335647583, "rewards/pad": 0.109375, "step": 1025 }, { "completion_length": 274.765625, "epoch": 0.32695984703632885, "grad_norm": 10.433825492858887, "kl": 0.06396484375, "learning_rate": 6.730401529636711e-07, "loss": 0.0026, "reward": 1.571969747543335, "reward_std": 0.132668137550354, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3219696879386902, "step": 1026 }, { "completion_length": 295.328125, "epoch": 0.3272785213511791, "grad_norm": 16.413532257080078, "kl": 0.0673828125, "learning_rate": 6.727214786488209e-07, "loss": 0.0027, "reward": 1.5735794305801392, "reward_std": 0.17031529545783997, "rewards/answer_reward": 0.21875, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.3860793709754944, "step": 1027 }, { "completion_length": 197.0, "epoch": 0.3275971956660293, "grad_norm": 81.36949157714844, "kl": 0.08349609375, "learning_rate": 6.724028043339707e-07, "loss": 0.0033, "reward": 1.575305700302124, "reward_std": 0.14447948336601257, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3409307599067688, "step": 1028 }, { "completion_length": 226.46875, "epoch": 0.3279158699808795, "grad_norm": 11.29088020324707, "kl": 0.0859375, "learning_rate": 6.720841300191204e-07, "loss": 0.0034, "reward": 1.5066144466400146, "reward_std": 0.07048650085926056, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5066144466400146, "step": 1029 }, { "completion_length": 150.375, "epoch": 0.3282345442957298, "grad_norm": 13.777542114257812, "kl": 0.09228515625, "learning_rate": 6.717654557042702e-07, "loss": 0.0037, "reward": 1.2772927284240723, "reward_std": 0.1390942633152008, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.29291781783103943, "step": 1030 }, { "completion_length": 179.109375, "epoch": 0.32855321861058, "grad_norm": 15.101614952087402, "kl": 0.09912109375, "learning_rate": 6.7144678138942e-07, "loss": 0.004, "reward": 1.6715672016143799, "reward_std": 0.12946265935897827, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5778171420097351, "rewards/pad": 0.125, "step": 1031 }, { "completion_length": 270.25, "epoch": 0.32887189292543023, "grad_norm": 7.602063179016113, "kl": 0.07763671875, "learning_rate": 6.711281070745698e-07, "loss": 0.0031, "reward": 1.615061640739441, "reward_std": 0.11350873112678528, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5213116407394409, "step": 1032 }, { "completion_length": 228.609375, "epoch": 0.32919056724028045, "grad_norm": 6.101695537567139, "kl": 0.0791015625, "learning_rate": 6.708094327597195e-07, "loss": 0.0032, "reward": 1.5489016771316528, "reward_std": 0.14920702576637268, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5176516771316528, "rewards/pad": 0.046875, "step": 1033 }, { "completion_length": 189.640625, "epoch": 0.32950924155513067, "grad_norm": 7.619702339172363, "kl": 0.095703125, "learning_rate": 6.704907584448693e-07, "loss": 0.0038, "reward": 1.3966412544250488, "reward_std": 0.08044132590293884, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.39664119482040405, "rewards/pad": 0.0, "step": 1034 }, { "completion_length": 234.75, "epoch": 0.3298279158699809, "grad_norm": 16.621103286743164, "kl": 0.08935546875, "learning_rate": 6.701720841300191e-07, "loss": 0.0036, "reward": 1.644860863685608, "reward_std": 0.20091207325458527, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5198608636856079, "rewards/pad": 0.140625, "step": 1035 }, { "completion_length": 176.734375, "epoch": 0.3301465901848311, "grad_norm": 19.77828598022461, "kl": 0.08935546875, "learning_rate": 6.69853409815169e-07, "loss": 0.0036, "reward": 1.7387616634368896, "reward_std": 0.18001246452331543, "rewards/pad": 0.15625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5825115442276001, "step": 1036 }, { "completion_length": 139.5, "epoch": 0.33046526449968133, "grad_norm": 155.10440063476562, "kl": 0.123046875, "learning_rate": 6.695347355003187e-07, "loss": 0.0049, "reward": 1.554897665977478, "reward_std": 0.13476990163326263, "rewards/answer_reward": 0.0625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4923976957798004, "step": 1037 }, { "completion_length": 306.921875, "epoch": 0.33078393881453155, "grad_norm": 6.137125492095947, "kl": 0.08251953125, "learning_rate": 6.692160611854685e-07, "loss": 0.0033, "reward": 1.5144294500350952, "reward_std": 0.04760780930519104, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5144294500350952, "step": 1038 }, { "completion_length": 249.40625, "epoch": 0.3311026131293818, "grad_norm": 5.570843696594238, "kl": 0.0751953125, "learning_rate": 6.688973868706183e-07, "loss": 0.003, "reward": 1.6632411479949951, "reward_std": 0.13550686836242676, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.41324126720428467, "step": 1039 }, { "completion_length": 315.546875, "epoch": 0.331421287444232, "grad_norm": 114.8282699584961, "kl": 0.064453125, "learning_rate": 6.68578712555768e-07, "loss": 0.0026, "reward": 1.3185205459594727, "reward_std": 0.06591153144836426, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.33414554595947266, "step": 1040 }, { "completion_length": 144.296875, "epoch": 0.3317399617590822, "grad_norm": 19.302204132080078, "kl": 0.1201171875, "learning_rate": 6.682600382409177e-07, "loss": 0.0048, "reward": 1.7855418920516968, "reward_std": 0.09190037101507187, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4105418920516968, "rewards/pad": 0.375, "step": 1041 }, { "completion_length": 193.359375, "epoch": 0.33205863607393243, "grad_norm": 12.114130973815918, "kl": 0.1005859375, "learning_rate": 6.679413639260675e-07, "loss": 0.004, "reward": 1.4832127094268799, "reward_std": 0.12668836116790771, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48321273922920227, "rewards/pad": 0.0, "step": 1042 }, { "completion_length": 97.15625, "epoch": 0.33237731038878265, "grad_norm": 34.39638900756836, "kl": 0.1279296875, "learning_rate": 6.676226896112173e-07, "loss": 0.0051, "reward": 1.7472283840179443, "reward_std": 0.10076180100440979, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6222284436225891, "rewards/pad": 0.125, "step": 1043 }, { "completion_length": 251.078125, "epoch": 0.3326959847036329, "grad_norm": 6.926640510559082, "kl": 0.10791015625, "learning_rate": 6.67304015296367e-07, "loss": 0.0043, "reward": 1.4593260288238525, "reward_std": 0.12262436747550964, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.45932596921920776, "step": 1044 }, { "completion_length": 210.65625, "epoch": 0.3330146590184831, "grad_norm": 17.774791717529297, "kl": 0.0986328125, "learning_rate": 6.669853409815168e-07, "loss": 0.0039, "reward": 1.5273455381393433, "reward_std": 0.10245544463396072, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.527345597743988, "rewards/pad": 0.0, "step": 1045 }, { "completion_length": 325.578125, "epoch": 0.3333333333333333, "grad_norm": 4.738499641418457, "kl": 0.056640625, "learning_rate": 6.666666666666666e-07, "loss": 0.0023, "reward": 1.5215575695037842, "reward_std": 0.07813851535320282, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5371826887130737, "rewards/pad": 0.0, "step": 1046 }, { "completion_length": 152.828125, "epoch": 0.33365200764818354, "grad_norm": 9.14372730255127, "kl": 0.11376953125, "learning_rate": 6.663479923518164e-07, "loss": 0.0045, "reward": 1.4708912372589111, "reward_std": 0.188470259308815, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.48651620745658875, "rewards/pad": 0.0, "step": 1047 }, { "completion_length": 110.484375, "epoch": 0.33397068196303376, "grad_norm": 9.19238567352295, "kl": 0.1328125, "learning_rate": 6.660293180369661e-07, "loss": 0.0053, "reward": 1.639833688735962, "reward_std": 0.10873033106327057, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4054586887359619, "step": 1048 }, { "completion_length": 317.78125, "epoch": 0.334289356277884, "grad_norm": 139.2799072265625, "kl": 0.1337890625, "learning_rate": 6.657106437221159e-07, "loss": 0.0053, "reward": 1.6599500179290771, "reward_std": 0.126263827085495, "rewards/pad": 0.21875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.44119998812675476, "step": 1049 }, { "completion_length": 299.421875, "epoch": 0.33460803059273425, "grad_norm": 7.264336585998535, "kl": 0.0830078125, "learning_rate": 6.653919694072657e-07, "loss": 0.0033, "reward": 1.4243035316467285, "reward_std": 0.1417325735092163, "rewards/answer_reward": 0.046875, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3930535614490509, "step": 1050 }, { "completion_length": 243.484375, "epoch": 0.33492670490758447, "grad_norm": 27.143455505371094, "kl": 0.0703125, "learning_rate": 6.650732950924155e-07, "loss": 0.0028, "reward": 1.8336070775985718, "reward_std": 0.07981158792972565, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5836069583892822, "step": 1051 }, { "completion_length": 283.359375, "epoch": 0.3352453792224347, "grad_norm": 8.567912101745605, "kl": 0.0693359375, "learning_rate": 6.647546207775652e-07, "loss": 0.0028, "reward": 1.3368821144104004, "reward_std": 0.04559015482664108, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3368821144104004, "step": 1052 }, { "completion_length": 209.0, "epoch": 0.3355640535372849, "grad_norm": 18.560606002807617, "kl": 0.10400390625, "learning_rate": 6.64435946462715e-07, "loss": 0.0042, "reward": 1.6890558004379272, "reward_std": 0.09991727769374847, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6890558004379272, "step": 1053 }, { "completion_length": 410.75, "epoch": 0.33588272785213513, "grad_norm": 16.412952423095703, "kl": 0.04931640625, "learning_rate": 6.641172721478648e-07, "loss": 0.002, "reward": 1.4114445447921753, "reward_std": 0.09891023486852646, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3645695149898529, "rewards/pad": 0.046875, "step": 1054 }, { "completion_length": 262.953125, "epoch": 0.33620140216698535, "grad_norm": 9.055387496948242, "kl": 0.083984375, "learning_rate": 6.637985978330147e-07, "loss": 0.0034, "reward": 1.5122671127319336, "reward_std": 0.10402856022119522, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5122671723365784, "step": 1055 }, { "completion_length": 208.296875, "epoch": 0.3365200764818356, "grad_norm": 10.077004432678223, "kl": 0.0810546875, "learning_rate": 6.634799235181644e-07, "loss": 0.0032, "reward": 1.6834617853164673, "reward_std": 0.10777908563613892, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4490867853164673, "rewards/pad": 0.234375, "step": 1056 }, { "completion_length": 161.75, "epoch": 0.3368387507966858, "grad_norm": 12.025467872619629, "kl": 0.0986328125, "learning_rate": 6.631612492033142e-07, "loss": 0.004, "reward": 1.5560848712921143, "reward_std": 0.19594892859458923, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46233493089675903, "rewards/pad": 0.09375, "step": 1057 }, { "completion_length": 338.34375, "epoch": 0.337157425111536, "grad_norm": 11.654243469238281, "kl": 0.0537109375, "learning_rate": 6.62842574888464e-07, "loss": 0.0021, "reward": 1.517212152481079, "reward_std": 0.06006244942545891, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5172120928764343, "step": 1058 }, { "completion_length": 357.046875, "epoch": 0.33747609942638623, "grad_norm": 10.765448570251465, "kl": 0.06787109375, "learning_rate": 6.625239005736138e-07, "loss": 0.0027, "reward": 1.3917007446289062, "reward_std": 0.11147765070199966, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4073258936405182, "step": 1059 }, { "completion_length": 162.109375, "epoch": 0.33779477374123645, "grad_norm": 16.117137908935547, "kl": 0.10302734375, "learning_rate": 6.622052262587635e-07, "loss": 0.0041, "reward": 1.499964714050293, "reward_std": 0.23960351943969727, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.37496474385261536, "rewards/pad": 0.140625, "step": 1060 }, { "completion_length": 258.34375, "epoch": 0.3381134480560867, "grad_norm": 7.214158058166504, "kl": 0.08984375, "learning_rate": 6.618865519439133e-07, "loss": 0.0036, "reward": 1.595639705657959, "reward_std": 0.09060221910476685, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.470639705657959, "step": 1061 }, { "completion_length": 353.171875, "epoch": 0.3384321223709369, "grad_norm": 8.844647407531738, "kl": 0.0498046875, "learning_rate": 6.615678776290631e-07, "loss": 0.002, "reward": 1.5343294143676758, "reward_std": 0.1305486261844635, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.29995444416999817, "step": 1062 }, { "completion_length": 342.921875, "epoch": 0.3387507966857871, "grad_norm": 10.557599067687988, "kl": 0.0908203125, "learning_rate": 6.612492033142129e-07, "loss": 0.0036, "reward": 1.603943109512329, "reward_std": 0.08996868133544922, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4789431691169739, "rewards/pad": 0.125, "step": 1063 }, { "completion_length": 219.890625, "epoch": 0.33906947100063733, "grad_norm": 10.76680850982666, "kl": 0.09521484375, "learning_rate": 6.609305289993626e-07, "loss": 0.0038, "reward": 1.5738226175308228, "reward_std": 0.22833140194416046, "rewards/answer_reward": 0.203125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.38632258772850037, "step": 1064 }, { "completion_length": 187.28125, "epoch": 0.33938814531548755, "grad_norm": 8.700113296508789, "kl": 0.1181640625, "learning_rate": 6.606118546845124e-07, "loss": 0.0047, "reward": 1.413580060005188, "reward_std": 0.11447126418352127, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.413580060005188, "step": 1065 }, { "completion_length": 266.84375, "epoch": 0.3397068196303378, "grad_norm": 19.892236709594727, "kl": 0.07861328125, "learning_rate": 6.602931803696622e-07, "loss": 0.0031, "reward": 1.4972413778305054, "reward_std": 0.15969689190387726, "rewards/answer_reward": 0.171875, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.34099140763282776, "step": 1066 }, { "completion_length": 318.78125, "epoch": 0.340025493945188, "grad_norm": 6.460472583770752, "kl": 0.0849609375, "learning_rate": 6.59974506054812e-07, "loss": 0.0034, "reward": 1.3890538215637207, "reward_std": 0.1033376008272171, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4046788811683655, "step": 1067 }, { "completion_length": 224.34375, "epoch": 0.3403441682600382, "grad_norm": 10.880087852478027, "kl": 0.09423828125, "learning_rate": 6.596558317399617e-07, "loss": 0.0038, "reward": 1.6705816984176636, "reward_std": 0.17944398522377014, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5612067580223083, "rewards/pad": 0.125, "step": 1068 }, { "completion_length": 202.859375, "epoch": 0.3406628425748885, "grad_norm": 12.304518699645996, "kl": 0.119140625, "learning_rate": 6.593371574251115e-07, "loss": 0.0048, "reward": 1.6697089672088623, "reward_std": 0.10854683816432953, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6697089076042175, "rewards/pad": 0.0, "step": 1069 }, { "completion_length": 233.8125, "epoch": 0.3409815168897387, "grad_norm": 15.415122985839844, "kl": 0.0986328125, "learning_rate": 6.590184831102613e-07, "loss": 0.0039, "reward": 1.6353888511657715, "reward_std": 0.23096291720867157, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.43226397037506104, "rewards/pad": 0.21875, "step": 1070 }, { "completion_length": 235.84375, "epoch": 0.34130019120458893, "grad_norm": 20.07423973083496, "kl": 0.09765625, "learning_rate": 6.586998087954111e-07, "loss": 0.0039, "reward": 1.5326218605041504, "reward_std": 0.10475602746009827, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43887192010879517, "step": 1071 }, { "completion_length": 360.171875, "epoch": 0.34161886551943915, "grad_norm": 18.752527236938477, "kl": 0.0576171875, "learning_rate": 6.583811344805608e-07, "loss": 0.0023, "reward": 1.5809378623962402, "reward_std": 0.11181546002626419, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4715629518032074, "rewards/pad": 0.125, "step": 1072 }, { "completion_length": 268.5, "epoch": 0.3419375398342894, "grad_norm": 7.56311559677124, "kl": 0.0712890625, "learning_rate": 6.580624601657106e-07, "loss": 0.0029, "reward": 1.630232572555542, "reward_std": 0.1588975340127945, "rewards/pad": 0.203125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.42710763216018677, "step": 1073 }, { "completion_length": 208.84375, "epoch": 0.3422562141491396, "grad_norm": 47.703773498535156, "kl": 0.09033203125, "learning_rate": 6.577437858508605e-07, "loss": 0.0036, "reward": 1.799774408340454, "reward_std": 0.13436491787433624, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5653994679450989, "step": 1074 }, { "completion_length": 368.671875, "epoch": 0.3425748884639898, "grad_norm": 8.87077808380127, "kl": 0.06494140625, "learning_rate": 6.574251115360103e-07, "loss": 0.0026, "reward": 1.5052409172058105, "reward_std": 0.1388981193304062, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5208659768104553, "step": 1075 }, { "completion_length": 194.671875, "epoch": 0.34289356277884003, "grad_norm": 8.328705787658691, "kl": 0.126953125, "learning_rate": 6.5710643722116e-07, "loss": 0.0051, "reward": 1.571947693824768, "reward_std": 0.10731147229671478, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5719476938247681, "rewards/pad": 0.0, "step": 1076 }, { "completion_length": 171.859375, "epoch": 0.34321223709369025, "grad_norm": 34.82987594604492, "kl": 0.0986328125, "learning_rate": 6.567877629063098e-07, "loss": 0.0039, "reward": 1.6688724756240845, "reward_std": 0.17240235209465027, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5126224756240845, "step": 1077 }, { "completion_length": 262.25, "epoch": 0.3435309114085405, "grad_norm": 13.668285369873047, "kl": 0.0712890625, "learning_rate": 6.564690885914596e-07, "loss": 0.0029, "reward": 1.568549633026123, "reward_std": 0.2681657671928406, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 0.953125, "rewards/tracking_iou_reward": 0.5216747522354126, "step": 1078 }, { "completion_length": 150.703125, "epoch": 0.3438495857233907, "grad_norm": 23.212413787841797, "kl": 0.1142578125, "learning_rate": 6.561504142766092e-07, "loss": 0.0046, "reward": 1.6253793239593506, "reward_std": 0.109074667096138, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6253793239593506, "rewards/pad": 0.0, "step": 1079 }, { "completion_length": 169.578125, "epoch": 0.3441682600382409, "grad_norm": 12.365942001342773, "kl": 0.10546875, "learning_rate": 6.55831739961759e-07, "loss": 0.0042, "reward": 1.4926584959030151, "reward_std": 0.1042180210351944, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.38328349590301514, "rewards/pad": 0.109375, "step": 1080 }, { "completion_length": 335.625, "epoch": 0.34448693435309113, "grad_norm": 11.104180335998535, "kl": 0.072265625, "learning_rate": 6.555130656469088e-07, "loss": 0.0029, "reward": 1.6401329040527344, "reward_std": 0.12838345766067505, "rewards/pad": 0.140625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4995078444480896, "step": 1081 }, { "completion_length": 284.8125, "epoch": 0.34480560866794135, "grad_norm": 11.32220458984375, "kl": 0.0791015625, "learning_rate": 6.551943913320586e-07, "loss": 0.0032, "reward": 1.522026777267456, "reward_std": 0.1750384420156479, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.42827683687210083, "step": 1082 }, { "completion_length": 226.515625, "epoch": 0.3451242829827916, "grad_norm": 3.8591694831848145, "kl": 0.1064453125, "learning_rate": 6.548757170172083e-07, "loss": 0.0043, "reward": 1.390260934829712, "reward_std": 0.1007055938243866, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.2808860242366791, "rewards/pad": 0.125, "step": 1083 }, { "completion_length": 168.015625, "epoch": 0.3454429572976418, "grad_norm": 29.858274459838867, "kl": 0.09765625, "learning_rate": 6.545570427023581e-07, "loss": 0.0039, "reward": 1.3682701587677002, "reward_std": 0.14364153146743774, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.38389521837234497, "rewards/pad": 0.0, "step": 1084 }, { "completion_length": 322.296875, "epoch": 0.345761631612492, "grad_norm": 6.255033493041992, "kl": 0.08203125, "learning_rate": 6.542383683875079e-07, "loss": 0.0033, "reward": 1.3874914646148682, "reward_std": 0.14402329921722412, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.40311652421951294, "step": 1085 }, { "completion_length": 211.0, "epoch": 0.34608030592734224, "grad_norm": 11.35192584991455, "kl": 0.12890625, "learning_rate": 6.539196940726577e-07, "loss": 0.0052, "reward": 1.5528533458709717, "reward_std": 0.10497722029685974, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5528532862663269, "step": 1086 }, { "completion_length": 242.859375, "epoch": 0.34639898024219246, "grad_norm": 10.361661911010742, "kl": 0.08837890625, "learning_rate": 6.536010197578074e-07, "loss": 0.0035, "reward": 1.5939083099365234, "reward_std": 0.22381728887557983, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.45328325033187866, "rewards/pad": 0.15625, "step": 1087 }, { "completion_length": 267.234375, "epoch": 0.3467176545570427, "grad_norm": 12.8701753616333, "kl": 0.08447265625, "learning_rate": 6.532823454429572e-07, "loss": 0.0034, "reward": 1.414771318435669, "reward_std": 0.16493813693523407, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.43039631843566895, "rewards/pad": 0.0, "step": 1088 }, { "completion_length": 280.21875, "epoch": 0.34703632887189295, "grad_norm": 12.832812309265137, "kl": 0.1015625, "learning_rate": 6.52963671128107e-07, "loss": 0.0041, "reward": 1.3166179656982422, "reward_std": 0.10120934247970581, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.30099302530288696, "step": 1089 }, { "completion_length": 390.125, "epoch": 0.34735500318674317, "grad_norm": 10.07620906829834, "kl": 0.044677734375, "learning_rate": 6.526449968132568e-07, "loss": 0.0018, "reward": 1.6574950218200684, "reward_std": 0.0385076180100441, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.40749502182006836, "step": 1090 }, { "completion_length": 331.1875, "epoch": 0.3476736775015934, "grad_norm": 12.087833404541016, "kl": 0.06298828125, "learning_rate": 6.523263224984065e-07, "loss": 0.0025, "reward": 1.497642159461975, "reward_std": 0.1338558793067932, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5132672190666199, "rewards/pad": 0.0, "step": 1091 }, { "completion_length": 305.421875, "epoch": 0.3479923518164436, "grad_norm": 5.817469596862793, "kl": 0.0908203125, "learning_rate": 6.520076481835563e-07, "loss": 0.0036, "reward": 1.504326581954956, "reward_std": 0.04959062859416008, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.504326581954956, "rewards/pad": 0.0, "step": 1092 }, { "completion_length": 410.53125, "epoch": 0.34831102613129383, "grad_norm": 7.265234470367432, "kl": 0.0517578125, "learning_rate": 6.516889738687062e-07, "loss": 0.0021, "reward": 1.4915004968643188, "reward_std": 0.118180051445961, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.39775052666664124, "step": 1093 }, { "completion_length": 277.375, "epoch": 0.34862970044614405, "grad_norm": 4.923130512237549, "kl": 0.07763671875, "learning_rate": 6.51370299553856e-07, "loss": 0.0031, "reward": 1.48738694190979, "reward_std": 0.14238008856773376, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3936370015144348, "step": 1094 }, { "completion_length": 300.453125, "epoch": 0.3489483747609943, "grad_norm": 5.92523193359375, "kl": 0.0830078125, "learning_rate": 6.510516252390057e-07, "loss": 0.0033, "reward": 1.5825649499893188, "reward_std": 0.1137721836566925, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.47318994998931885, "step": 1095 }, { "completion_length": 289.015625, "epoch": 0.3492670490758445, "grad_norm": 40.995445251464844, "kl": 0.07568359375, "learning_rate": 6.507329509241555e-07, "loss": 0.003, "reward": 1.5625817775726318, "reward_std": 0.061178095638751984, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43758174777030945, "step": 1096 }, { "completion_length": 306.546875, "epoch": 0.3495857233906947, "grad_norm": 9.393498420715332, "kl": 0.0791015625, "learning_rate": 6.504142766093053e-07, "loss": 0.0032, "reward": 1.4845378398895264, "reward_std": 0.09015172719955444, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4845377206802368, "rewards/pad": 0.0, "step": 1097 }, { "completion_length": 341.359375, "epoch": 0.34990439770554493, "grad_norm": 19.660934448242188, "kl": 0.06689453125, "learning_rate": 6.500956022944551e-07, "loss": 0.0027, "reward": 1.3632309436798096, "reward_std": 0.1405061036348343, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.37885600328445435, "step": 1098 }, { "completion_length": 215.171875, "epoch": 0.35022307202039515, "grad_norm": 6.55889892578125, "kl": 0.09912109375, "learning_rate": 6.497769279796048e-07, "loss": 0.004, "reward": 1.5062105655670166, "reward_std": 0.13409946858882904, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5062106251716614, "step": 1099 }, { "completion_length": 414.453125, "epoch": 0.3505417463352454, "grad_norm": 5.437408924102783, "kl": 0.060791015625, "learning_rate": 6.494582536647546e-07, "loss": 0.0024, "reward": 1.4838175773620605, "reward_std": 0.04797791689634323, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.48381760716438293, "step": 1100 }, { "completion_length": 451.140625, "epoch": 0.3508604206500956, "grad_norm": 7.171418190002441, "kl": 0.06005859375, "learning_rate": 6.491395793499044e-07, "loss": 0.0024, "reward": 1.4617466926574707, "reward_std": 0.042779937386512756, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4617466330528259, "step": 1101 }, { "completion_length": 185.0, "epoch": 0.3511790949649458, "grad_norm": 8.356879234313965, "kl": 0.11669921875, "learning_rate": 6.488209050350542e-07, "loss": 0.0047, "reward": 1.632096767425537, "reward_std": 0.16088390350341797, "rewards/pad": 0.21875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.41334670782089233, "step": 1102 }, { "completion_length": 311.703125, "epoch": 0.35149776927979604, "grad_norm": 9.635887145996094, "kl": 0.08251953125, "learning_rate": 6.485022307202039e-07, "loss": 0.0033, "reward": 1.5167649984359741, "reward_std": 0.08570042997598648, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5167650580406189, "rewards/pad": 0.0, "step": 1103 }, { "completion_length": 422.015625, "epoch": 0.35181644359464626, "grad_norm": 16.782541275024414, "kl": 0.04931640625, "learning_rate": 6.481835564053537e-07, "loss": 0.002, "reward": 1.4156150817871094, "reward_std": 0.0803535133600235, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3062400221824646, "step": 1104 }, { "completion_length": 340.796875, "epoch": 0.3521351179094965, "grad_norm": 13.343717575073242, "kl": 0.134765625, "learning_rate": 6.478648820905035e-07, "loss": 0.0054, "reward": 1.3510937690734863, "reward_std": 0.2080368846654892, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.9375, "rewards/tracking_iou_reward": 0.41359373927116394, "step": 1105 }, { "completion_length": 413.28125, "epoch": 0.3524537922243467, "grad_norm": 14.27628231048584, "kl": 0.0693359375, "learning_rate": 6.475462077756533e-07, "loss": 0.0028, "reward": 1.4851425886154175, "reward_std": 0.14510664343833923, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5163924694061279, "step": 1106 }, { "completion_length": 186.953125, "epoch": 0.3527724665391969, "grad_norm": 58.4865608215332, "kl": 0.10546875, "learning_rate": 6.47227533460803e-07, "loss": 0.0042, "reward": 1.7755894660949707, "reward_std": 0.18116974830627441, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4162144660949707, "rewards/pad": 0.375, "step": 1107 }, { "completion_length": 299.9375, "epoch": 0.35309114085404714, "grad_norm": 6.266265392303467, "kl": 0.068359375, "learning_rate": 6.469088591459528e-07, "loss": 0.0027, "reward": 1.6438732147216797, "reward_std": 0.3781220614910126, "rewards/pad": 0.375, "rewards/tracking_format_reward": 0.9375, "rewards/tracking_iou_reward": 0.3313731849193573, "step": 1108 }, { "completion_length": 228.34375, "epoch": 0.3534098151688974, "grad_norm": 7.116986274719238, "kl": 0.0927734375, "learning_rate": 6.465901848311026e-07, "loss": 0.0037, "reward": 1.5064152479171753, "reward_std": 0.04102984443306923, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5064153075218201, "rewards/pad": 0.0, "step": 1109 }, { "completion_length": 240.015625, "epoch": 0.35372848948374763, "grad_norm": 24.907663345336914, "kl": 0.0859375, "learning_rate": 6.462715105162523e-07, "loss": 0.0034, "reward": 1.4360811710357666, "reward_std": 0.17298558354377747, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4517062306404114, "step": 1110 }, { "completion_length": 298.5, "epoch": 0.35404716379859785, "grad_norm": 39.48649215698242, "kl": 0.109375, "learning_rate": 6.459528362014021e-07, "loss": 0.0044, "reward": 1.4850733280181885, "reward_std": 0.10610349476337433, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3756982386112213, "rewards/pad": 0.109375, "step": 1111 }, { "completion_length": 384.453125, "epoch": 0.3543658381134481, "grad_norm": 11.466015815734863, "kl": 0.06201171875, "learning_rate": 6.45634161886552e-07, "loss": 0.0025, "reward": 1.6032086610794067, "reward_std": 0.18208816647529602, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.38445860147476196, "step": 1112 }, { "completion_length": 240.921875, "epoch": 0.3546845124282983, "grad_norm": 13.035429954528809, "kl": 0.08984375, "learning_rate": 6.453154875717018e-07, "loss": 0.0036, "reward": 1.5618797540664673, "reward_std": 0.1373620331287384, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.43687981367111206, "step": 1113 }, { "completion_length": 292.515625, "epoch": 0.3550031867431485, "grad_norm": 99.24024200439453, "kl": 0.0810546875, "learning_rate": 6.449968132568515e-07, "loss": 0.0032, "reward": 1.5708576440811157, "reward_std": 0.11023964732885361, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.586482584476471, "rewards/pad": 0.0, "step": 1114 }, { "completion_length": 226.4375, "epoch": 0.35532186105799873, "grad_norm": 49.051387786865234, "kl": 0.1064453125, "learning_rate": 6.446781389420013e-07, "loss": 0.0043, "reward": 1.342435359954834, "reward_std": 0.14247508347034454, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.34243538975715637, "rewards/pad": 0.015625, "step": 1115 }, { "completion_length": 195.265625, "epoch": 0.35564053537284895, "grad_norm": 21.15662956237793, "kl": 0.091796875, "learning_rate": 6.443594646271511e-07, "loss": 0.0037, "reward": 1.7528045177459717, "reward_std": 0.24256056547164917, "rewards/answer_reward": 0.328125, "rewards/format_reward_gqa": 0.953125, "rewards/iou_glue_reward": 0.4715544283390045, "step": 1116 }, { "completion_length": 260.078125, "epoch": 0.3559592096876992, "grad_norm": 7.468287944793701, "kl": 0.08154296875, "learning_rate": 6.440407903123009e-07, "loss": 0.0033, "reward": 1.5833702087402344, "reward_std": 0.2094673067331314, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4739951491355896, "step": 1117 }, { "completion_length": 219.46875, "epoch": 0.3562778840025494, "grad_norm": 15.786979675292969, "kl": 0.10693359375, "learning_rate": 6.437221159974505e-07, "loss": 0.0043, "reward": 1.3431097269058228, "reward_std": 0.216970756649971, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.35873472690582275, "step": 1118 }, { "completion_length": 393.140625, "epoch": 0.3565965583173996, "grad_norm": 8.422698974609375, "kl": 0.07373046875, "learning_rate": 6.434034416826003e-07, "loss": 0.003, "reward": 1.5210447311401367, "reward_std": 0.0758834108710289, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5210448503494263, "step": 1119 }, { "completion_length": 266.890625, "epoch": 0.35691523263224983, "grad_norm": 19.675073623657227, "kl": 0.0888671875, "learning_rate": 6.430847673677501e-07, "loss": 0.0036, "reward": 1.4346909523010254, "reward_std": 0.15363526344299316, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4503158926963806, "rewards/pad": 0.0, "step": 1120 }, { "completion_length": 204.578125, "epoch": 0.35723390694710005, "grad_norm": 15.127815246582031, "kl": 0.0869140625, "learning_rate": 6.427660930528999e-07, "loss": 0.0035, "reward": 1.678032398223877, "reward_std": 0.17766845226287842, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3342824876308441, "rewards/pad": 0.34375, "step": 1121 }, { "completion_length": 208.203125, "epoch": 0.3575525812619503, "grad_norm": 14.863090515136719, "kl": 0.1064453125, "learning_rate": 6.424474187380496e-07, "loss": 0.0043, "reward": 1.639469861984253, "reward_std": 0.14503180980682373, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5144699215888977, "rewards/pad": 0.125, "step": 1122 }, { "completion_length": 290.828125, "epoch": 0.3578712555768005, "grad_norm": 12.484613418579102, "kl": 0.099609375, "learning_rate": 6.421287444231994e-07, "loss": 0.004, "reward": 1.412679672241211, "reward_std": 0.0679410845041275, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.41267964243888855, "rewards/pad": 0.0, "step": 1123 }, { "completion_length": 260.8125, "epoch": 0.3581899298916507, "grad_norm": 4.807104110717773, "kl": 0.142578125, "learning_rate": 6.418100701083492e-07, "loss": 0.0057, "reward": 1.489712119102478, "reward_std": 0.05640888959169388, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4897121489048004, "rewards/pad": 0.0, "step": 1124 }, { "completion_length": 400.609375, "epoch": 0.35850860420650094, "grad_norm": 10.27554702758789, "kl": 0.068359375, "learning_rate": 6.41491395793499e-07, "loss": 0.0027, "reward": 1.5083768367767334, "reward_std": 0.06959781795740128, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.38337668776512146, "rewards/pad": 0.125, "step": 1125 }, { "completion_length": 227.84375, "epoch": 0.35882727852135116, "grad_norm": 8.963057518005371, "kl": 0.109375, "learning_rate": 6.411727214786487e-07, "loss": 0.0044, "reward": 1.6768724918365479, "reward_std": 0.12050767987966537, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44249746203422546, "rewards/pad": 0.234375, "step": 1126 }, { "completion_length": 224.09375, "epoch": 0.3591459528362014, "grad_norm": 12.289810180664062, "kl": 0.10009765625, "learning_rate": 6.408540471637985e-07, "loss": 0.004, "reward": 1.565162181854248, "reward_std": 0.10991180688142776, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.33078712224960327, "step": 1127 }, { "completion_length": 421.34375, "epoch": 0.35946462715105165, "grad_norm": 4.199387073516846, "kl": 0.044677734375, "learning_rate": 6.405353728489483e-07, "loss": 0.0018, "reward": 1.6045417785644531, "reward_std": 0.06083228066563606, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4795418679714203, "step": 1128 }, { "completion_length": 217.96875, "epoch": 0.3597833014659019, "grad_norm": 13.499602317810059, "kl": 0.09326171875, "learning_rate": 6.402166985340981e-07, "loss": 0.0037, "reward": 1.736114263534546, "reward_std": 0.14537036418914795, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.6267393827438354, "step": 1129 }, { "completion_length": 290.921875, "epoch": 0.3601019757807521, "grad_norm": 20.29520034790039, "kl": 0.07373046875, "learning_rate": 6.398980242192478e-07, "loss": 0.0029, "reward": 1.541394829750061, "reward_std": 0.13326993584632874, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.43201982975006104, "rewards/pad": 0.125, "step": 1130 }, { "completion_length": 358.734375, "epoch": 0.3604206500956023, "grad_norm": 6.83867883682251, "kl": 0.07421875, "learning_rate": 6.395793499043977e-07, "loss": 0.003, "reward": 1.656518816947937, "reward_std": 0.09795132279396057, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.42214375734329224, "rewards/pad": 0.25, "step": 1131 }, { "completion_length": 255.9375, "epoch": 0.36073932441045253, "grad_norm": 9.172416687011719, "kl": 0.09228515625, "learning_rate": 6.392606755895475e-07, "loss": 0.0037, "reward": 1.353350281715393, "reward_std": 0.10851083695888519, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.36897528171539307, "rewards/pad": 0.0, "step": 1132 }, { "completion_length": 253.890625, "epoch": 0.36105799872530275, "grad_norm": 10.658964157104492, "kl": 0.07763671875, "learning_rate": 6.389420012746973e-07, "loss": 0.0031, "reward": 1.5653178691864014, "reward_std": 0.16297033429145813, "rewards/answer_reward": 0.09375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.471567839384079, "step": 1133 }, { "completion_length": 167.703125, "epoch": 0.361376673040153, "grad_norm": 11.124542236328125, "kl": 0.1044921875, "learning_rate": 6.38623326959847e-07, "loss": 0.0042, "reward": 1.7909257411956787, "reward_std": 0.1615280658006668, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5565507411956787, "step": 1134 }, { "completion_length": 308.296875, "epoch": 0.3616953473550032, "grad_norm": 8.000903129577637, "kl": 0.07666015625, "learning_rate": 6.383046526449968e-07, "loss": 0.0031, "reward": 1.5040984153747559, "reward_std": 0.08258339762687683, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5197234153747559, "rewards/pad": 0.0, "step": 1135 }, { "completion_length": 110.34375, "epoch": 0.3620140216698534, "grad_norm": 23.70810890197754, "kl": 0.1259765625, "learning_rate": 6.379859783301466e-07, "loss": 0.0051, "reward": 1.719245433807373, "reward_std": 0.2565840482711792, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.594245433807373, "rewards/pad": 0.140625, "step": 1136 }, { "completion_length": 305.5625, "epoch": 0.36233269598470363, "grad_norm": 9.16686725616455, "kl": 0.09521484375, "learning_rate": 6.376673040152964e-07, "loss": 0.0038, "reward": 1.5536150932312012, "reward_std": 0.08969675004482269, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5536150336265564, "rewards/pad": 0.0, "step": 1137 }, { "completion_length": 154.703125, "epoch": 0.36265137029955385, "grad_norm": 36.27172088623047, "kl": 0.11962890625, "learning_rate": 6.373486297004461e-07, "loss": 0.0048, "reward": 1.5816446542739868, "reward_std": 0.2370244562625885, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3785196840763092, "rewards/pad": 0.21875, "step": 1138 }, { "completion_length": 368.28125, "epoch": 0.3629700446144041, "grad_norm": 9.339393615722656, "kl": 0.0810546875, "learning_rate": 6.370299553855959e-07, "loss": 0.0032, "reward": 1.4603204727172852, "reward_std": 0.15351173281669617, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.49157047271728516, "step": 1139 }, { "completion_length": 211.109375, "epoch": 0.3632887189292543, "grad_norm": 10.896927833557129, "kl": 0.10595703125, "learning_rate": 6.367112810707457e-07, "loss": 0.0042, "reward": 1.655792474746704, "reward_std": 0.12964951992034912, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3745424747467041, "rewards/pad": 0.28125, "step": 1140 }, { "completion_length": 322.25, "epoch": 0.3636073932441045, "grad_norm": 7.350358486175537, "kl": 0.083984375, "learning_rate": 6.363926067558954e-07, "loss": 0.0034, "reward": 1.4856479167938232, "reward_std": 0.0888536125421524, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.48564794659614563, "step": 1141 }, { "completion_length": 148.03125, "epoch": 0.36392606755895474, "grad_norm": 18.382137298583984, "kl": 0.1318359375, "learning_rate": 6.360739324410452e-07, "loss": 0.0053, "reward": 1.7307488918304443, "reward_std": 0.13341474533081055, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6213740110397339, "rewards/pad": 0.109375, "step": 1142 }, { "completion_length": 216.484375, "epoch": 0.36424474187380496, "grad_norm": 9.965367317199707, "kl": 0.08642578125, "learning_rate": 6.35755258126195e-07, "loss": 0.0034, "reward": 1.7189104557037354, "reward_std": 0.22512134909629822, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.34391042590141296, "rewards/pad": 0.390625, "step": 1143 }, { "completion_length": 205.03125, "epoch": 0.3645634161886552, "grad_norm": 12.729777336120605, "kl": 0.11767578125, "learning_rate": 6.354365838113448e-07, "loss": 0.0047, "reward": 1.4384092092514038, "reward_std": 0.15935580432415009, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4540341794490814, "rewards/pad": 0.0, "step": 1144 }, { "completion_length": 180.859375, "epoch": 0.3648820905035054, "grad_norm": 16.469371795654297, "kl": 0.107421875, "learning_rate": 6.351179094964945e-07, "loss": 0.0043, "reward": 1.4881618022918701, "reward_std": 0.14285990595817566, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5037867426872253, "rewards/pad": 0.0, "step": 1145 }, { "completion_length": 212.1875, "epoch": 0.3652007648183556, "grad_norm": 15.975790023803711, "kl": 0.1171875, "learning_rate": 6.347992351816443e-07, "loss": 0.0047, "reward": 1.677734613418579, "reward_std": 0.11066339164972305, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5683596134185791, "rewards/pad": 0.125, "step": 1146 }, { "completion_length": 321.84375, "epoch": 0.36551943913320584, "grad_norm": 7.0683369636535645, "kl": 0.07470703125, "learning_rate": 6.344805608667941e-07, "loss": 0.003, "reward": 1.570380449295044, "reward_std": 0.18008087575435638, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.49225541949272156, "step": 1147 }, { "completion_length": 272.15625, "epoch": 0.3658381134480561, "grad_norm": 20.573633193969727, "kl": 0.10205078125, "learning_rate": 6.34161886551944e-07, "loss": 0.0041, "reward": 1.6857969760894775, "reward_std": 0.1163695752620697, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5764220356941223, "step": 1148 }, { "completion_length": 276.328125, "epoch": 0.36615678776290633, "grad_norm": 13.387776374816895, "kl": 0.083984375, "learning_rate": 6.338432122370936e-07, "loss": 0.0034, "reward": 1.8507922887802124, "reward_std": 0.15579496324062347, "rewards/pad": 0.3125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5382922887802124, "step": 1149 }, { "completion_length": 300.03125, "epoch": 0.36647546207775655, "grad_norm": 10.872426986694336, "kl": 0.07763671875, "learning_rate": 6.335245379222435e-07, "loss": 0.0031, "reward": 1.4849570989608765, "reward_std": 0.11124669015407562, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.37558209896087646, "step": 1150 }, { "completion_length": 148.578125, "epoch": 0.3667941363926068, "grad_norm": 12.177960395812988, "kl": 0.134765625, "learning_rate": 6.332058636073933e-07, "loss": 0.0054, "reward": 1.610792636871338, "reward_std": 0.15256120264530182, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5014176964759827, "rewards/pad": 0.125, "step": 1151 }, { "completion_length": 315.015625, "epoch": 0.367112810707457, "grad_norm": 5.850987911224365, "kl": 0.0751953125, "learning_rate": 6.328871892925431e-07, "loss": 0.003, "reward": 1.6577093601226807, "reward_std": 0.04746227711439133, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.40770936012268066, "step": 1152 }, { "completion_length": 322.078125, "epoch": 0.3674314850223072, "grad_norm": 12.73082447052002, "kl": 0.08251953125, "learning_rate": 6.325685149776928e-07, "loss": 0.0033, "reward": 1.3450021743774414, "reward_std": 0.12023515999317169, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.36062711477279663, "rewards/pad": 0.0, "step": 1153 }, { "completion_length": 214.46875, "epoch": 0.36775015933715743, "grad_norm": 20.796520233154297, "kl": 0.10400390625, "learning_rate": 6.322498406628426e-07, "loss": 0.0042, "reward": 1.5687785148620605, "reward_std": 0.09632916003465652, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5687785148620605, "step": 1154 }, { "completion_length": 232.375, "epoch": 0.36806883365200765, "grad_norm": 9.744770050048828, "kl": 0.1044921875, "learning_rate": 6.319311663479924e-07, "loss": 0.0042, "reward": 1.6491217613220215, "reward_std": 0.11243297159671783, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6491218209266663, "rewards/pad": 0.0, "step": 1155 }, { "completion_length": 274.15625, "epoch": 0.3683875079668579, "grad_norm": 6.4882941246032715, "kl": 0.0810546875, "learning_rate": 6.316124920331422e-07, "loss": 0.0032, "reward": 1.8337061405181885, "reward_std": 0.05185786634683609, "rewards/pad": 0.375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4587061405181885, "step": 1156 }, { "completion_length": 258.109375, "epoch": 0.3687061822817081, "grad_norm": 7.578658580780029, "kl": 0.10400390625, "learning_rate": 6.312938177182919e-07, "loss": 0.0042, "reward": 1.5650343894958496, "reward_std": 0.1916087567806244, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.5962843894958496, "step": 1157 }, { "completion_length": 146.390625, "epoch": 0.3690248565965583, "grad_norm": 9.791213035583496, "kl": 0.1318359375, "learning_rate": 6.309751434034416e-07, "loss": 0.0053, "reward": 1.6399765014648438, "reward_std": 0.12476305663585663, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6399766206741333, "rewards/pad": 0.0, "step": 1158 }, { "completion_length": 354.1875, "epoch": 0.36934353091140854, "grad_norm": 7.83534049987793, "kl": 0.0869140625, "learning_rate": 6.306564690885914e-07, "loss": 0.0035, "reward": 1.4030874967575073, "reward_std": 0.04663660749793053, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4030875563621521, "step": 1159 }, { "completion_length": 207.25, "epoch": 0.36966220522625876, "grad_norm": 6.344560146331787, "kl": 0.1259765625, "learning_rate": 6.303377947737412e-07, "loss": 0.005, "reward": 1.7244865894317627, "reward_std": 0.12499900907278061, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.5057364702224731, "step": 1160 }, { "completion_length": 215.921875, "epoch": 0.369980879541109, "grad_norm": 9.869667053222656, "kl": 0.11474609375, "learning_rate": 6.300191204588909e-07, "loss": 0.0046, "reward": 1.5617637634277344, "reward_std": 0.1978577971458435, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.46801382303237915, "rewards/pad": 0.109375, "step": 1161 }, { "completion_length": 308.4375, "epoch": 0.3702995538559592, "grad_norm": 7.597326755523682, "kl": 0.08544921875, "learning_rate": 6.297004461440407e-07, "loss": 0.0034, "reward": 1.5469553470611572, "reward_std": 0.10091152042150497, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5625803470611572, "rewards/pad": 0.0, "step": 1162 }, { "completion_length": 261.265625, "epoch": 0.3706182281708094, "grad_norm": 7.743888854980469, "kl": 0.1064453125, "learning_rate": 6.293817718291905e-07, "loss": 0.0043, "reward": 1.524780511856079, "reward_std": 0.1921815574169159, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4779054522514343, "rewards/pad": 0.0625, "step": 1163 }, { "completion_length": 361.046875, "epoch": 0.37093690248565964, "grad_norm": 4.7014617919921875, "kl": 0.07568359375, "learning_rate": 6.290630975143403e-07, "loss": 0.003, "reward": 1.4223251342773438, "reward_std": 0.048480890691280365, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.42232510447502136, "step": 1164 }, { "completion_length": 212.40625, "epoch": 0.37125557680050986, "grad_norm": 6.080229759216309, "kl": 0.091796875, "learning_rate": 6.2874442319949e-07, "loss": 0.0037, "reward": 1.5960010290145874, "reward_std": 0.11411029100418091, "rewards/answer_reward": 0.34375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.252250999212265, "step": 1165 }, { "completion_length": 353.0625, "epoch": 0.3715742511153601, "grad_norm": 12.403491973876953, "kl": 0.06787109375, "learning_rate": 6.284257488846398e-07, "loss": 0.0027, "reward": 1.3574970960617065, "reward_std": 0.08834861218929291, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.37312212586402893, "rewards/pad": 0.0, "step": 1166 }, { "completion_length": 251.390625, "epoch": 0.37189292543021035, "grad_norm": 15.094289779663086, "kl": 0.10595703125, "learning_rate": 6.281070745697896e-07, "loss": 0.0042, "reward": 1.5393974781036377, "reward_std": 0.14277897775173187, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5550224781036377, "rewards/pad": 0.0, "step": 1167 }, { "completion_length": 219.515625, "epoch": 0.3722115997450606, "grad_norm": 10.170633316040039, "kl": 0.11669921875, "learning_rate": 6.277884002549393e-07, "loss": 0.0047, "reward": 1.4170877933502197, "reward_std": 0.11784126609563828, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4327127933502197, "step": 1168 }, { "completion_length": 200.75, "epoch": 0.3725302740599108, "grad_norm": 14.906290054321289, "kl": 0.1171875, "learning_rate": 6.274697259400892e-07, "loss": 0.0047, "reward": 1.5319913625717163, "reward_std": 0.11990997195243835, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4226164221763611, "step": 1169 }, { "completion_length": 267.609375, "epoch": 0.372848948374761, "grad_norm": 13.22391128540039, "kl": 0.08251953125, "learning_rate": 6.27151051625239e-07, "loss": 0.0033, "reward": 1.6729364395141602, "reward_std": 0.1706743836402893, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5010614395141602, "rewards/pad": 0.171875, "step": 1170 }, { "completion_length": 158.421875, "epoch": 0.37316762268961123, "grad_norm": 29.263126373291016, "kl": 0.1357421875, "learning_rate": 6.268323773103888e-07, "loss": 0.0054, "reward": 1.6222381591796875, "reward_std": 0.1717788130044937, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6066131591796875, "rewards/pad": 0.03125, "step": 1171 }, { "completion_length": 332.09375, "epoch": 0.37348629700446145, "grad_norm": 6.8517985343933105, "kl": 0.06982421875, "learning_rate": 6.265137029955385e-07, "loss": 0.0028, "reward": 1.7126264572143555, "reward_std": 0.175262451171875, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.556376576423645, "step": 1172 }, { "completion_length": 221.40625, "epoch": 0.3738049713193117, "grad_norm": 9.87380599975586, "kl": 0.1337890625, "learning_rate": 6.261950286806883e-07, "loss": 0.0053, "reward": 1.3858520984649658, "reward_std": 0.07210617512464523, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.38585203886032104, "rewards/pad": 0.0, "step": 1173 }, { "completion_length": 336.1875, "epoch": 0.3741236456341619, "grad_norm": 15.57335090637207, "kl": 0.08837890625, "learning_rate": 6.258763543658381e-07, "loss": 0.0035, "reward": 1.4978127479553223, "reward_std": 0.18304958939552307, "rewards/pad": 0.046875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4665627181529999, "step": 1174 }, { "completion_length": 386.296875, "epoch": 0.3744423199490121, "grad_norm": 6.890348434448242, "kl": 0.078125, "learning_rate": 6.255576800509879e-07, "loss": 0.0031, "reward": 1.4197403192520142, "reward_std": 0.19700422883033752, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.32599037885665894, "step": 1175 }, { "completion_length": 226.4375, "epoch": 0.37476099426386233, "grad_norm": 7.337767124176025, "kl": 0.1376953125, "learning_rate": 6.252390057361376e-07, "loss": 0.0055, "reward": 1.4902740716934204, "reward_std": 0.08827096223831177, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5058990120887756, "step": 1176 }, { "completion_length": 207.796875, "epoch": 0.37507966857871256, "grad_norm": 35.47562789916992, "kl": 0.140625, "learning_rate": 6.249203314212874e-07, "loss": 0.0056, "reward": 1.5208287239074707, "reward_std": 0.1746503859758377, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5208288431167603, "step": 1177 }, { "completion_length": 177.046875, "epoch": 0.3753983428935628, "grad_norm": 8.447120666503906, "kl": 0.1435546875, "learning_rate": 6.246016571064372e-07, "loss": 0.0058, "reward": 1.6760151386260986, "reward_std": 0.11283519864082336, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5510150790214539, "rewards/pad": 0.125, "step": 1178 }, { "completion_length": 372.421875, "epoch": 0.375717017208413, "grad_norm": 8.715481758117676, "kl": 0.099609375, "learning_rate": 6.24282982791587e-07, "loss": 0.004, "reward": 1.5441935062408447, "reward_std": 0.08572202920913696, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5598185658454895, "step": 1179 }, { "completion_length": 358.90625, "epoch": 0.3760356915232632, "grad_norm": 15.392694473266602, "kl": 0.10986328125, "learning_rate": 6.239643084767367e-07, "loss": 0.0044, "reward": 1.3878960609436035, "reward_std": 0.08293549716472626, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4035210609436035, "step": 1180 }, { "completion_length": 277.671875, "epoch": 0.37635436583811344, "grad_norm": 188.2808380126953, "kl": 0.130859375, "learning_rate": 6.236456341618865e-07, "loss": 0.0053, "reward": 1.5231683254241943, "reward_std": 0.1508413404226303, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5387933254241943, "step": 1181 }, { "completion_length": 415.734375, "epoch": 0.37667304015296366, "grad_norm": 10.059452056884766, "kl": 0.0732421875, "learning_rate": 6.233269598470363e-07, "loss": 0.0029, "reward": 1.50213623046875, "reward_std": 0.04022324085235596, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.37713623046875, "step": 1182 }, { "completion_length": 385.234375, "epoch": 0.3769917144678139, "grad_norm": 4.8774285316467285, "kl": 0.08056640625, "learning_rate": 6.230082855321861e-07, "loss": 0.0032, "reward": 1.5162938833236694, "reward_std": 0.06330114603042603, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5162937641143799, "step": 1183 }, { "completion_length": 274.0625, "epoch": 0.3773103887826641, "grad_norm": 9.35615062713623, "kl": 0.099609375, "learning_rate": 6.226896112173358e-07, "loss": 0.004, "reward": 1.544995903968811, "reward_std": 0.11332038044929504, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.43562090396881104, "step": 1184 }, { "completion_length": 275.453125, "epoch": 0.3776290630975143, "grad_norm": 11.464639663696289, "kl": 0.11328125, "learning_rate": 6.223709369024856e-07, "loss": 0.0045, "reward": 1.4839555025100708, "reward_std": 0.16924186050891876, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4995805025100708, "rewards/pad": 0.0, "step": 1185 }, { "completion_length": 320.609375, "epoch": 0.37794773741236454, "grad_norm": 8.494017601013184, "kl": 0.07470703125, "learning_rate": 6.220522625876354e-07, "loss": 0.003, "reward": 1.5268003940582275, "reward_std": 0.07187595218420029, "rewards/answer_reward": 0.140625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.38617539405822754, "step": 1186 }, { "completion_length": 152.828125, "epoch": 0.3782664117272148, "grad_norm": 11.815348625183105, "kl": 0.138671875, "learning_rate": 6.217335882727853e-07, "loss": 0.0055, "reward": 1.6774251461029053, "reward_std": 0.10706285387277603, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.55242520570755, "rewards/pad": 0.125, "step": 1187 }, { "completion_length": 235.515625, "epoch": 0.37858508604206503, "grad_norm": 13.886299133300781, "kl": 0.12451171875, "learning_rate": 6.21414913957935e-07, "loss": 0.005, "reward": 1.4992268085479736, "reward_std": 0.1577257215976715, "rewards/answer_reward": 0.046875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4523519277572632, "step": 1188 }, { "completion_length": 262.796875, "epoch": 0.37890376035691525, "grad_norm": 7.124457359313965, "kl": 0.087890625, "learning_rate": 6.210962396430848e-07, "loss": 0.0035, "reward": 1.6487696170806885, "reward_std": 0.11338422447443008, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4456444978713989, "rewards/pad": 0.203125, "step": 1189 }, { "completion_length": 329.03125, "epoch": 0.3792224346717655, "grad_norm": 7.236481666564941, "kl": 0.126953125, "learning_rate": 6.207775653282346e-07, "loss": 0.0051, "reward": 1.5534617900848389, "reward_std": 0.05858501046895981, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5534616708755493, "step": 1190 }, { "completion_length": 324.421875, "epoch": 0.3795411089866157, "grad_norm": 11.887310028076172, "kl": 0.095703125, "learning_rate": 6.204588910133844e-07, "loss": 0.0038, "reward": 1.5425376892089844, "reward_std": 0.09648825228214264, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3081625998020172, "rewards/pad": 0.25, "step": 1191 }, { "completion_length": 102.515625, "epoch": 0.3798597833014659, "grad_norm": 11.819649696350098, "kl": 0.185546875, "learning_rate": 6.201402166985341e-07, "loss": 0.0074, "reward": 1.5959168672561646, "reward_std": 0.12162573635578156, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5959168672561646, "rewards/pad": 0.0, "step": 1192 }, { "completion_length": 342.609375, "epoch": 0.38017845761631613, "grad_norm": 12.393624305725098, "kl": 0.07470703125, "learning_rate": 6.198215423836839e-07, "loss": 0.003, "reward": 1.4370999336242676, "reward_std": 0.09190460294485092, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4370998442173004, "step": 1193 }, { "completion_length": 262.828125, "epoch": 0.38049713193116635, "grad_norm": 7.728225231170654, "kl": 0.11767578125, "learning_rate": 6.195028680688337e-07, "loss": 0.0047, "reward": 1.4948556423187256, "reward_std": 0.07122111320495605, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3698556125164032, "step": 1194 }, { "completion_length": 217.4375, "epoch": 0.3808158062460166, "grad_norm": 9.003069877624512, "kl": 0.142578125, "learning_rate": 6.191841937539835e-07, "loss": 0.0057, "reward": 1.6949548721313477, "reward_std": 0.19778338074684143, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5699548721313477, "rewards/pad": 0.140625, "step": 1195 }, { "completion_length": 205.515625, "epoch": 0.3811344805608668, "grad_norm": 10.357717514038086, "kl": 0.1279296875, "learning_rate": 6.188655194391332e-07, "loss": 0.0051, "reward": 1.5979564189910889, "reward_std": 0.1371481865644455, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6135815382003784, "rewards/pad": 0.0, "step": 1196 }, { "completion_length": 144.28125, "epoch": 0.381453154875717, "grad_norm": 25.7012939453125, "kl": 0.15234375, "learning_rate": 6.185468451242829e-07, "loss": 0.0061, "reward": 1.5692253112792969, "reward_std": 0.08479062467813492, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4442253112792969, "rewards/pad": 0.125, "step": 1197 }, { "completion_length": 235.46875, "epoch": 0.38177182919056724, "grad_norm": 9.451396942138672, "kl": 0.1240234375, "learning_rate": 6.182281708094327e-07, "loss": 0.005, "reward": 1.5140479803085327, "reward_std": 0.047571178525686264, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5140478610992432, "step": 1198 }, { "completion_length": 328.328125, "epoch": 0.38209050350541746, "grad_norm": 6.12078332901001, "kl": 0.06396484375, "learning_rate": 6.179094964945824e-07, "loss": 0.0026, "reward": 1.356693983078003, "reward_std": 0.11052098125219345, "rewards/answer_reward": 0.140625, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.23169396817684174, "step": 1199 }, { "completion_length": 233.75, "epoch": 0.3824091778202677, "grad_norm": 13.038230895996094, "kl": 0.10693359375, "learning_rate": 6.175908221797322e-07, "loss": 0.0043, "reward": 1.5692347288131714, "reward_std": 0.107730433344841, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4286096692085266, "rewards/pad": 0.140625, "step": 1200 }, { "completion_length": 378.09375, "epoch": 0.3827278521351179, "grad_norm": 15.127769470214844, "kl": 0.0654296875, "learning_rate": 6.17272147864882e-07, "loss": 0.0026, "reward": 1.5065699815750122, "reward_std": 0.14783266186714172, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4440700113773346, "step": 1201 }, { "completion_length": 332.796875, "epoch": 0.3830465264499681, "grad_norm": 7.491520881652832, "kl": 0.083984375, "learning_rate": 6.169534735500318e-07, "loss": 0.0034, "reward": 1.448569893836975, "reward_std": 0.13246291875839233, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3548198640346527, "step": 1202 }, { "completion_length": 346.9375, "epoch": 0.38336520076481834, "grad_norm": 9.888702392578125, "kl": 0.08984375, "learning_rate": 6.166347992351815e-07, "loss": 0.0036, "reward": 1.416809320449829, "reward_std": 0.20294572412967682, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.953125, "rewards/tracking_iou_reward": 0.4636843800544739, "step": 1203 }, { "completion_length": 200.375, "epoch": 0.38368387507966856, "grad_norm": 14.056575775146484, "kl": 0.126953125, "learning_rate": 6.163161249203313e-07, "loss": 0.0051, "reward": 1.5999902486801147, "reward_std": 0.06742437183856964, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47499027848243713, "rewards/pad": 0.125, "step": 1204 }, { "completion_length": 347.796875, "epoch": 0.3840025493945188, "grad_norm": 7.76228141784668, "kl": 0.0732421875, "learning_rate": 6.159974506054811e-07, "loss": 0.0029, "reward": 1.6015219688415527, "reward_std": 0.1911095827817917, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5077719688415527, "step": 1205 }, { "completion_length": 294.546875, "epoch": 0.384321223709369, "grad_norm": 15.17544937133789, "kl": 0.103515625, "learning_rate": 6.15678776290631e-07, "loss": 0.0041, "reward": 1.4424169063568115, "reward_std": 0.07422250509262085, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4424169063568115, "step": 1206 }, { "completion_length": 97.765625, "epoch": 0.3846398980242193, "grad_norm": 11.784152030944824, "kl": 0.16015625, "learning_rate": 6.153601019757807e-07, "loss": 0.0064, "reward": 1.7995710372924805, "reward_std": 0.1624748706817627, "rewards/answer_reward": 0.375, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.45582109689712524, "step": 1207 }, { "completion_length": 192.078125, "epoch": 0.3849585723390695, "grad_norm": 19.06814956665039, "kl": 0.1416015625, "learning_rate": 6.150414276609305e-07, "loss": 0.0056, "reward": 1.4531996250152588, "reward_std": 0.13103961944580078, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.406324565410614, "rewards/pad": 0.046875, "step": 1208 }, { "completion_length": 281.359375, "epoch": 0.3852772466539197, "grad_norm": 9.539800643920898, "kl": 0.0908203125, "learning_rate": 6.147227533460803e-07, "loss": 0.0036, "reward": 1.3577734231948853, "reward_std": 0.2437555491924286, "rewards/format_reward_tg": 0.953125, "rewards/iou_timestamp_reward": 0.3577733635902405, "rewards/pad": 0.046875, "step": 1209 }, { "completion_length": 295.640625, "epoch": 0.38559592096876993, "grad_norm": 7.663999557495117, "kl": 0.0869140625, "learning_rate": 6.144040790312301e-07, "loss": 0.0035, "reward": 1.4731587171554565, "reward_std": 0.11033116281032562, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5044087171554565, "step": 1210 }, { "completion_length": 198.0625, "epoch": 0.38591459528362015, "grad_norm": 12.535527229309082, "kl": 0.1357421875, "learning_rate": 6.140854047163798e-07, "loss": 0.0054, "reward": 1.341176152229309, "reward_std": 0.06295371055603027, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3411761522293091, "step": 1211 }, { "completion_length": 309.921875, "epoch": 0.3862332695984704, "grad_norm": 6.9051103591918945, "kl": 0.08984375, "learning_rate": 6.137667304015296e-07, "loss": 0.0036, "reward": 1.4991049766540527, "reward_std": 0.04289538785815239, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49910494685173035, "rewards/pad": 0.0, "step": 1212 }, { "completion_length": 203.25, "epoch": 0.3865519439133206, "grad_norm": 15.828720092773438, "kl": 0.1494140625, "learning_rate": 6.134480560866794e-07, "loss": 0.006, "reward": 1.5709058046340942, "reward_std": 0.15641742944717407, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46153074502944946, "rewards/pad": 0.109375, "step": 1213 }, { "completion_length": 192.6875, "epoch": 0.3868706182281708, "grad_norm": 10.661425590515137, "kl": 0.119140625, "learning_rate": 6.131293817718292e-07, "loss": 0.0048, "reward": 1.7105867862701416, "reward_std": 0.18071383237838745, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6324617862701416, "rewards/pad": 0.09375, "step": 1214 }, { "completion_length": 303.71875, "epoch": 0.38718929254302104, "grad_norm": 11.888273239135742, "kl": 0.10205078125, "learning_rate": 6.128107074569789e-07, "loss": 0.0041, "reward": 1.4614834785461426, "reward_std": 0.15480071306228638, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4302334785461426, "step": 1215 }, { "completion_length": 373.265625, "epoch": 0.38750796685787126, "grad_norm": 6.659944534301758, "kl": 0.0654296875, "learning_rate": 6.124920331421287e-07, "loss": 0.0026, "reward": 1.3879508972167969, "reward_std": 0.03459502011537552, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.38795098662376404, "rewards/pad": 0.0, "step": 1216 }, { "completion_length": 281.328125, "epoch": 0.3878266411727215, "grad_norm": 18.89515495300293, "kl": 0.0869140625, "learning_rate": 6.121733588272785e-07, "loss": 0.0035, "reward": 1.3299328088760376, "reward_std": 0.08711449801921844, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3299327492713928, "step": 1217 }, { "completion_length": 273.03125, "epoch": 0.3881453154875717, "grad_norm": 36.573307037353516, "kl": 0.10107421875, "learning_rate": 6.118546845124283e-07, "loss": 0.004, "reward": 1.700791835784912, "reward_std": 0.0859699547290802, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5914169549942017, "rewards/pad": 0.109375, "step": 1218 }, { "completion_length": 284.71875, "epoch": 0.3884639898024219, "grad_norm": 7.265768527984619, "kl": 0.11083984375, "learning_rate": 6.11536010197578e-07, "loss": 0.0044, "reward": 1.5254764556884766, "reward_std": 0.04545580968260765, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5254765152931213, "step": 1219 }, { "completion_length": 408.3125, "epoch": 0.38878266411727214, "grad_norm": 3.9388315677642822, "kl": 0.0654296875, "learning_rate": 6.112173358827278e-07, "loss": 0.0026, "reward": 1.3649344444274902, "reward_std": 0.04178182780742645, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.364934504032135, "step": 1220 }, { "completion_length": 349.359375, "epoch": 0.38910133843212236, "grad_norm": 5.0478973388671875, "kl": 0.07080078125, "learning_rate": 6.108986615678776e-07, "loss": 0.0028, "reward": 1.4779832363128662, "reward_std": 0.0777759924530983, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3686083257198334, "step": 1221 }, { "completion_length": 308.421875, "epoch": 0.3894200127469726, "grad_norm": 6.850950717926025, "kl": 0.09375, "learning_rate": 6.105799872530274e-07, "loss": 0.0038, "reward": 1.4696778059005737, "reward_std": 0.06025514006614685, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46967771649360657, "rewards/pad": 0.0, "step": 1222 }, { "completion_length": 224.59375, "epoch": 0.3897386870618228, "grad_norm": 9.790903091430664, "kl": 0.095703125, "learning_rate": 6.102613129381771e-07, "loss": 0.0038, "reward": 1.7676475048065186, "reward_std": 0.19071504473686218, "rewards/answer_reward": 0.3125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.45514750480651855, "step": 1223 }, { "completion_length": 252.84375, "epoch": 0.390057361376673, "grad_norm": 15.577993392944336, "kl": 0.09814453125, "learning_rate": 6.09942638623327e-07, "loss": 0.0039, "reward": 1.5877152681350708, "reward_std": 0.13476546108722687, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3377152681350708, "step": 1224 }, { "completion_length": 228.5625, "epoch": 0.39037603569152324, "grad_norm": 10.700752258300781, "kl": 0.11669921875, "learning_rate": 6.096239643084768e-07, "loss": 0.0047, "reward": 1.5201473236083984, "reward_std": 0.09742455929517746, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5201473832130432, "rewards/pad": 0.0, "step": 1225 }, { "completion_length": 255.1875, "epoch": 0.3906947100063735, "grad_norm": 24.338665008544922, "kl": 0.09814453125, "learning_rate": 6.093052899936266e-07, "loss": 0.0039, "reward": 1.564429759979248, "reward_std": 0.15615159273147583, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.47067975997924805, "rewards/pad": 0.109375, "step": 1226 }, { "completion_length": 262.765625, "epoch": 0.39101338432122373, "grad_norm": 17.822742462158203, "kl": 0.09814453125, "learning_rate": 6.089866156787763e-07, "loss": 0.0039, "reward": 1.5323125123977661, "reward_std": 0.09317489713430405, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5323124527931213, "step": 1227 }, { "completion_length": 167.21875, "epoch": 0.39133205863607395, "grad_norm": 18.925622940063477, "kl": 0.1201171875, "learning_rate": 6.086679413639261e-07, "loss": 0.0048, "reward": 1.6316561698913574, "reward_std": 0.15108171105384827, "rewards/answer_reward": 0.21875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.41290608048439026, "step": 1228 }, { "completion_length": 367.5, "epoch": 0.3916507329509242, "grad_norm": 5.661565780639648, "kl": 0.0732421875, "learning_rate": 6.083492670490759e-07, "loss": 0.0029, "reward": 1.4048430919647217, "reward_std": 0.11436806619167328, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.29546812176704407, "step": 1229 }, { "completion_length": 165.53125, "epoch": 0.3919694072657744, "grad_norm": 11.724607467651367, "kl": 0.146484375, "learning_rate": 6.080305927342257e-07, "loss": 0.0059, "reward": 1.4511432647705078, "reward_std": 0.08234697580337524, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.451143354177475, "rewards/pad": 0.0, "step": 1230 }, { "completion_length": 284.28125, "epoch": 0.3922880815806246, "grad_norm": 7.128300666809082, "kl": 0.07763671875, "learning_rate": 6.077119184193754e-07, "loss": 0.0031, "reward": 1.6145113706588745, "reward_std": 0.10974004864692688, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5051363110542297, "step": 1231 }, { "completion_length": 295.9375, "epoch": 0.39260675589547483, "grad_norm": 8.443071365356445, "kl": 0.08984375, "learning_rate": 6.073932441045252e-07, "loss": 0.0036, "reward": 1.4106472730636597, "reward_std": 0.16361010074615479, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.4418972134590149, "rewards/pad": 0.0, "step": 1232 }, { "completion_length": 194.109375, "epoch": 0.39292543021032506, "grad_norm": 13.870003700256348, "kl": 0.1279296875, "learning_rate": 6.07074569789675e-07, "loss": 0.0051, "reward": 1.374561071395874, "reward_std": 0.06559236347675323, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.374561071395874, "rewards/pad": 0.0, "step": 1233 }, { "completion_length": 285.28125, "epoch": 0.3932441045251753, "grad_norm": 11.3886137008667, "kl": 0.07421875, "learning_rate": 6.067558954748247e-07, "loss": 0.003, "reward": 1.4970085620880127, "reward_std": 0.15465247631072998, "rewards/pad": 0.140625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3563835024833679, "step": 1234 }, { "completion_length": 303.234375, "epoch": 0.3935627788400255, "grad_norm": 6.8807148933410645, "kl": 0.0830078125, "learning_rate": 6.064372211599745e-07, "loss": 0.0033, "reward": 1.4436665773391724, "reward_std": 0.08115865290164948, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.44366663694381714, "step": 1235 }, { "completion_length": 360.5, "epoch": 0.3938814531548757, "grad_norm": 162.79302978515625, "kl": 0.060546875, "learning_rate": 6.061185468451242e-07, "loss": 0.0024, "reward": 1.3667831420898438, "reward_std": 0.01843656599521637, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.24178317189216614, "step": 1236 }, { "completion_length": 240.015625, "epoch": 0.39420012746972594, "grad_norm": 8.241400718688965, "kl": 0.0927734375, "learning_rate": 6.05799872530274e-07, "loss": 0.0037, "reward": 1.536606788635254, "reward_std": 0.16440913081169128, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.48973172903060913, "step": 1237 }, { "completion_length": 215.53125, "epoch": 0.39451880178457616, "grad_norm": 37.798824310302734, "kl": 0.1005859375, "learning_rate": 6.054811982154237e-07, "loss": 0.004, "reward": 1.6488937139511108, "reward_std": 0.1047549918293953, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5238937139511108, "step": 1238 }, { "completion_length": 248.921875, "epoch": 0.3948374760994264, "grad_norm": 19.88370132446289, "kl": 0.12451171875, "learning_rate": 6.051625239005735e-07, "loss": 0.005, "reward": 1.4146902561187744, "reward_std": 0.1598891019821167, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3678152859210968, "rewards/pad": 0.0625, "step": 1239 }, { "completion_length": 303.359375, "epoch": 0.3951561504142766, "grad_norm": 18.375629425048828, "kl": 0.087890625, "learning_rate": 6.048438495857233e-07, "loss": 0.0035, "reward": 1.5469294786453247, "reward_std": 0.06787046790122986, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5469294190406799, "rewards/pad": 0.0, "step": 1240 }, { "completion_length": 195.78125, "epoch": 0.3954748247291268, "grad_norm": 13.132492065429688, "kl": 0.1123046875, "learning_rate": 6.045251752708731e-07, "loss": 0.0045, "reward": 1.5397614240646362, "reward_std": 0.14754366874694824, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.44601136445999146, "rewards/pad": 0.125, "step": 1241 }, { "completion_length": 203.59375, "epoch": 0.39579349904397704, "grad_norm": 16.14130210876465, "kl": 0.09033203125, "learning_rate": 6.042065009560228e-07, "loss": 0.0036, "reward": 1.5089311599731445, "reward_std": 0.13677260279655457, "rewards/answer_reward": 0.140625, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.383931040763855, "step": 1242 }, { "completion_length": 341.375, "epoch": 0.39611217335882726, "grad_norm": 7.483922004699707, "kl": 0.0654296875, "learning_rate": 6.038878266411726e-07, "loss": 0.0026, "reward": 1.6305183172225952, "reward_std": 0.04471452906727791, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6305183172225952, "rewards/pad": 0.0, "step": 1243 }, { "completion_length": 385.09375, "epoch": 0.3964308476736775, "grad_norm": 4.253243446350098, "kl": 0.058837890625, "learning_rate": 6.035691523263225e-07, "loss": 0.0024, "reward": 1.4404851198196411, "reward_std": 0.09122411906719208, "rewards/pad": 0.046875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3936101198196411, "step": 1244 }, { "completion_length": 324.625, "epoch": 0.3967495219885277, "grad_norm": 12.449899673461914, "kl": 0.09130859375, "learning_rate": 6.032504780114723e-07, "loss": 0.0036, "reward": 1.3082168102264404, "reward_std": 0.1398642361164093, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3238418400287628, "step": 1245 }, { "completion_length": 205.71875, "epoch": 0.397068196303378, "grad_norm": 5.796684741973877, "kl": 0.09912109375, "learning_rate": 6.02931803696622e-07, "loss": 0.004, "reward": 1.5298190116882324, "reward_std": 0.06253817677497864, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4048190116882324, "rewards/pad": 0.125, "step": 1246 }, { "completion_length": 254.40625, "epoch": 0.3973868706182282, "grad_norm": 8.309307098388672, "kl": 0.1572265625, "learning_rate": 6.026131293817718e-07, "loss": 0.0063, "reward": 1.5799627304077148, "reward_std": 0.06893587857484818, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4549628496170044, "step": 1247 }, { "completion_length": 227.40625, "epoch": 0.3977055449330784, "grad_norm": 15.03865909576416, "kl": 0.10302734375, "learning_rate": 6.022944550669216e-07, "loss": 0.0041, "reward": 1.6695342063903809, "reward_std": 0.12210407853126526, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5445340871810913, "rewards/pad": 0.125, "step": 1248 }, { "completion_length": 195.484375, "epoch": 0.39802421924792863, "grad_norm": 12.202496528625488, "kl": 0.09912109375, "learning_rate": 6.019757807520714e-07, "loss": 0.004, "reward": 1.6603246927261353, "reward_std": 0.17108845710754395, "rewards/answer_reward": 0.140625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5196996927261353, "step": 1249 }, { "completion_length": 404.640625, "epoch": 0.39834289356277885, "grad_norm": 4.665548324584961, "kl": 0.055419921875, "learning_rate": 6.016571064372211e-07, "loss": 0.0022, "reward": 1.3473976850509644, "reward_std": 0.03374429792165756, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.34739768505096436, "step": 1250 }, { "completion_length": 306.078125, "epoch": 0.3986615678776291, "grad_norm": 11.606754302978516, "kl": 0.07763671875, "learning_rate": 6.013384321223709e-07, "loss": 0.0031, "reward": 1.5133811235427856, "reward_std": 0.029662977904081345, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5133810639381409, "rewards/pad": 0.0, "step": 1251 }, { "completion_length": 312.015625, "epoch": 0.3989802421924793, "grad_norm": 6.143039226531982, "kl": 0.08203125, "learning_rate": 6.010197578075207e-07, "loss": 0.0033, "reward": 1.588003158569336, "reward_std": 0.06406011432409286, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5880030989646912, "step": 1252 }, { "completion_length": 233.0, "epoch": 0.3992989165073295, "grad_norm": 87.68891143798828, "kl": 0.41796875, "learning_rate": 6.007010834926705e-07, "loss": 0.0168, "reward": 1.5894889831542969, "reward_std": 0.13853877782821655, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4801139235496521, "step": 1253 }, { "completion_length": 368.859375, "epoch": 0.39961759082217974, "grad_norm": 5.574906826019287, "kl": 0.049072265625, "learning_rate": 6.003824091778202e-07, "loss": 0.002, "reward": 1.417471170425415, "reward_std": 0.07500630617141724, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.43309611082077026, "step": 1254 }, { "completion_length": 430.4375, "epoch": 0.39993626513702996, "grad_norm": 4.724660873413086, "kl": 0.042236328125, "learning_rate": 6.0006373486297e-07, "loss": 0.0017, "reward": 1.5073049068450928, "reward_std": 0.042998362332582474, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.38230496644973755, "step": 1255 }, { "completion_length": 270.03125, "epoch": 0.4002549394518802, "grad_norm": 10.774860382080078, "kl": 0.09423828125, "learning_rate": 5.997450605481198e-07, "loss": 0.0038, "reward": 1.4578254222869873, "reward_std": 0.06299932301044464, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4578254222869873, "step": 1256 }, { "completion_length": 343.953125, "epoch": 0.4005736137667304, "grad_norm": 19.947551727294922, "kl": 0.05908203125, "learning_rate": 5.994263862332696e-07, "loss": 0.0024, "reward": 1.423647403717041, "reward_std": 0.10177726298570633, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.36114731431007385, "step": 1257 }, { "completion_length": 260.625, "epoch": 0.4008922880815806, "grad_norm": 13.201912879943848, "kl": 0.0927734375, "learning_rate": 5.991077119184193e-07, "loss": 0.0037, "reward": 1.6356565952301025, "reward_std": 0.0843108594417572, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5106565952301025, "step": 1258 }, { "completion_length": 246.203125, "epoch": 0.40121096239643084, "grad_norm": 10.583910942077637, "kl": 0.11572265625, "learning_rate": 5.987890376035691e-07, "loss": 0.0046, "reward": 1.322936773300171, "reward_std": 0.09079517424106598, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3229368031024933, "rewards/pad": 0.0, "step": 1259 }, { "completion_length": 252.765625, "epoch": 0.40152963671128106, "grad_norm": 14.975433349609375, "kl": 0.08544921875, "learning_rate": 5.984703632887189e-07, "loss": 0.0034, "reward": 1.4862103462219238, "reward_std": 0.04628577083349228, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48621028661727905, "rewards/pad": 0.0, "step": 1260 }, { "completion_length": 273.09375, "epoch": 0.4018483110261313, "grad_norm": 9.01754379272461, "kl": 0.07568359375, "learning_rate": 5.981516889738687e-07, "loss": 0.003, "reward": 1.4627516269683838, "reward_std": 0.18093456327915192, "rewards/pad": 0.140625, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.3533766269683838, "step": 1261 }, { "completion_length": 236.515625, "epoch": 0.4021669853409815, "grad_norm": 10.139057159423828, "kl": 0.080078125, "learning_rate": 5.978330146590184e-07, "loss": 0.0032, "reward": 1.6219439506530762, "reward_std": 0.14522536098957062, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.37194395065307617, "step": 1262 }, { "completion_length": 364.71875, "epoch": 0.4024856596558317, "grad_norm": 23.10379981994629, "kl": 0.06689453125, "learning_rate": 5.975143403441683e-07, "loss": 0.0027, "reward": 1.3567779064178467, "reward_std": 0.0712767094373703, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.35677778720855713, "step": 1263 }, { "completion_length": 208.328125, "epoch": 0.40280433397068194, "grad_norm": 18.585952758789062, "kl": 0.10546875, "learning_rate": 5.971956660293181e-07, "loss": 0.0042, "reward": 1.637681245803833, "reward_std": 0.1087605282664299, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.528306245803833, "rewards/pad": 0.125, "step": 1264 }, { "completion_length": 301.5, "epoch": 0.40312300828553216, "grad_norm": 8.328240394592285, "kl": 0.09130859375, "learning_rate": 5.968769917144678e-07, "loss": 0.0037, "reward": 1.3431166410446167, "reward_std": 0.1047779768705368, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3587416112422943, "step": 1265 }, { "completion_length": 238.015625, "epoch": 0.40344168260038243, "grad_norm": 9.131882667541504, "kl": 0.0966796875, "learning_rate": 5.965583173996176e-07, "loss": 0.0039, "reward": 1.5039496421813965, "reward_std": 0.0670909658074379, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5039497017860413, "step": 1266 }, { "completion_length": 262.109375, "epoch": 0.40376035691523265, "grad_norm": 9.783851623535156, "kl": 0.083984375, "learning_rate": 5.962396430847674e-07, "loss": 0.0034, "reward": 1.7425625324249268, "reward_std": 0.08505283296108246, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49256250262260437, "rewards/pad": 0.25, "step": 1267 }, { "completion_length": 196.40625, "epoch": 0.4040790312300829, "grad_norm": 17.94754409790039, "kl": 0.10888671875, "learning_rate": 5.959209687699172e-07, "loss": 0.0044, "reward": 1.6453616619110107, "reward_std": 0.0707654058933258, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5203617215156555, "step": 1268 }, { "completion_length": 204.0, "epoch": 0.4043977055449331, "grad_norm": 8.018806457519531, "kl": 0.09912109375, "learning_rate": 5.956022944550669e-07, "loss": 0.004, "reward": 1.431675910949707, "reward_std": 0.05108753964304924, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.30667582154273987, "rewards/pad": 0.125, "step": 1269 }, { "completion_length": 338.890625, "epoch": 0.4047163798597833, "grad_norm": 7.1575703620910645, "kl": 0.06640625, "learning_rate": 5.952836201402167e-07, "loss": 0.0027, "reward": 1.3842601776123047, "reward_std": 0.03410865738987923, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3842601478099823, "rewards/pad": 0.0, "step": 1270 }, { "completion_length": 289.5, "epoch": 0.40503505417463354, "grad_norm": 8.842584609985352, "kl": 0.08544921875, "learning_rate": 5.949649458253665e-07, "loss": 0.0034, "reward": 1.524345874786377, "reward_std": 0.05037612468004227, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5243459939956665, "step": 1271 }, { "completion_length": 138.84375, "epoch": 0.40535372848948376, "grad_norm": 12.332841873168945, "kl": 0.130859375, "learning_rate": 5.946462715105163e-07, "loss": 0.0053, "reward": 1.454542875289917, "reward_std": 0.11645975708961487, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45454293489456177, "rewards/pad": 0.0, "step": 1272 }, { "completion_length": 91.953125, "epoch": 0.405672402804334, "grad_norm": 11.617120742797852, "kl": 0.138671875, "learning_rate": 5.94327597195666e-07, "loss": 0.0055, "reward": 1.7734625339508057, "reward_std": 0.0727016031742096, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6484625339508057, "rewards/pad": 0.125, "step": 1273 }, { "completion_length": 254.703125, "epoch": 0.4059910771191842, "grad_norm": 18.43532943725586, "kl": 0.08837890625, "learning_rate": 5.940089228808158e-07, "loss": 0.0035, "reward": 1.6537601947784424, "reward_std": 0.12687700986862183, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5912603139877319, "rewards/pad": 0.0625, "step": 1274 }, { "completion_length": 222.0, "epoch": 0.4063097514340344, "grad_norm": 10.08658504486084, "kl": 0.10498046875, "learning_rate": 5.936902485659655e-07, "loss": 0.0042, "reward": 1.674185872077942, "reward_std": 0.1543225795030594, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5648109316825867, "step": 1275 }, { "completion_length": 333.1875, "epoch": 0.40662842574888464, "grad_norm": 19.096281051635742, "kl": 0.078125, "learning_rate": 5.933715742511153e-07, "loss": 0.0031, "reward": 1.3640334606170654, "reward_std": 0.04004434123635292, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.36403337121009827, "step": 1276 }, { "completion_length": 152.875, "epoch": 0.40694710006373486, "grad_norm": 8.920004844665527, "kl": 0.12353515625, "learning_rate": 5.93052899936265e-07, "loss": 0.0049, "reward": 1.5706473588943481, "reward_std": 0.10923536121845245, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5706473588943481, "rewards/pad": 0.0, "step": 1277 }, { "completion_length": 246.859375, "epoch": 0.4072657743785851, "grad_norm": 13.205419540405273, "kl": 0.09228515625, "learning_rate": 5.927342256214148e-07, "loss": 0.0037, "reward": 1.4794493913650513, "reward_std": 0.05123839154839516, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47944939136505127, "rewards/pad": 0.0, "step": 1278 }, { "completion_length": 253.328125, "epoch": 0.4075844486934353, "grad_norm": 6.297164440155029, "kl": 0.11376953125, "learning_rate": 5.924155513065646e-07, "loss": 0.0046, "reward": 1.3850655555725098, "reward_std": 0.07302466779947281, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.38506558537483215, "rewards/pad": 0.0, "step": 1279 }, { "completion_length": 202.90625, "epoch": 0.4079031230082855, "grad_norm": 4.877528190612793, "kl": 0.1123046875, "learning_rate": 5.920968769917144e-07, "loss": 0.0045, "reward": 1.4636024236679077, "reward_std": 0.10132384300231934, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3386024236679077, "rewards/pad": 0.125, "step": 1280 }, { "completion_length": 256.96875, "epoch": 0.40822179732313574, "grad_norm": 16.753814697265625, "kl": 0.0966796875, "learning_rate": 5.917782026768641e-07, "loss": 0.0039, "reward": 1.5623836517333984, "reward_std": 0.08463288843631744, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5623837113380432, "rewards/pad": 0.0, "step": 1281 }, { "completion_length": 140.40625, "epoch": 0.40854047163798596, "grad_norm": 8.194445610046387, "kl": 0.11279296875, "learning_rate": 5.91459528362014e-07, "loss": 0.0045, "reward": 1.5440839529037476, "reward_std": 0.10215876996517181, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.41908398270606995, "rewards/pad": 0.125, "step": 1282 }, { "completion_length": 308.765625, "epoch": 0.4088591459528362, "grad_norm": 6.8862385749816895, "kl": 0.0859375, "learning_rate": 5.911408540471638e-07, "loss": 0.0035, "reward": 1.499886155128479, "reward_std": 0.09754335880279541, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3905111253261566, "step": 1283 }, { "completion_length": 205.046875, "epoch": 0.4091778202676864, "grad_norm": 13.631743431091309, "kl": 0.09619140625, "learning_rate": 5.908221797323136e-07, "loss": 0.0038, "reward": 1.5217753648757935, "reward_std": 0.1388101577758789, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5217753648757935, "rewards/pad": 0.0, "step": 1284 }, { "completion_length": 339.546875, "epoch": 0.4094964945825367, "grad_norm": 5.26681661605835, "kl": 0.061767578125, "learning_rate": 5.905035054174633e-07, "loss": 0.0025, "reward": 1.5010402202606201, "reward_std": 0.04901232570409775, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3760402202606201, "rewards/pad": 0.125, "step": 1285 }, { "completion_length": 380.78125, "epoch": 0.4098151688973869, "grad_norm": 7.436905384063721, "kl": 0.068359375, "learning_rate": 5.901848311026131e-07, "loss": 0.0027, "reward": 1.6161510944366455, "reward_std": 0.2011728286743164, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5380261540412903, "step": 1286 }, { "completion_length": 266.265625, "epoch": 0.4101338432122371, "grad_norm": 13.028887748718262, "kl": 0.08935546875, "learning_rate": 5.898661567877629e-07, "loss": 0.0036, "reward": 1.3868389129638672, "reward_std": 0.04168622940778732, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3868389427661896, "rewards/pad": 0.0, "step": 1287 }, { "completion_length": 374.234375, "epoch": 0.41045251752708733, "grad_norm": 8.806180000305176, "kl": 0.06396484375, "learning_rate": 5.895474824729127e-07, "loss": 0.0026, "reward": 1.3828778266906738, "reward_std": 0.15111730992794037, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.41412773728370667, "step": 1288 }, { "completion_length": 309.984375, "epoch": 0.41077119184193756, "grad_norm": 5.311010360717773, "kl": 0.06982421875, "learning_rate": 5.892288081580624e-07, "loss": 0.0028, "reward": 1.6289262771606445, "reward_std": 0.061325907707214355, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.503926157951355, "step": 1289 }, { "completion_length": 310.125, "epoch": 0.4110898661567878, "grad_norm": 6.758955001831055, "kl": 0.08203125, "learning_rate": 5.889101338432122e-07, "loss": 0.0033, "reward": 1.640629529953003, "reward_std": 0.10892556607723236, "rewards/pad": 0.15625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.48437952995300293, "step": 1290 }, { "completion_length": 323.109375, "epoch": 0.411408540471638, "grad_norm": 4.773209571838379, "kl": 0.0771484375, "learning_rate": 5.88591459528362e-07, "loss": 0.0031, "reward": 1.4355682134628296, "reward_std": 0.15114247798919678, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.4668181538581848, "rewards/pad": 0.0, "step": 1291 }, { "completion_length": 267.65625, "epoch": 0.4117272147864882, "grad_norm": 10.178011894226074, "kl": 0.09326171875, "learning_rate": 5.882727852135117e-07, "loss": 0.0037, "reward": 1.4803059101104736, "reward_std": 0.05648474767804146, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48030588030815125, "rewards/pad": 0.0, "step": 1292 }, { "completion_length": 189.1875, "epoch": 0.41204588910133844, "grad_norm": 10.093996047973633, "kl": 0.095703125, "learning_rate": 5.879541108986615e-07, "loss": 0.0038, "reward": 1.7790437936782837, "reward_std": 0.15045028924942017, "rewards/pad": 0.34375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43529391288757324, "step": 1293 }, { "completion_length": 289.765625, "epoch": 0.41236456341618866, "grad_norm": 50.02298355102539, "kl": 0.10107421875, "learning_rate": 5.876354365838113e-07, "loss": 0.004, "reward": 1.5010088682174683, "reward_std": 0.13455824553966522, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.516633927822113, "step": 1294 }, { "completion_length": 246.734375, "epoch": 0.4126832377310389, "grad_norm": 36.4998779296875, "kl": 0.08056640625, "learning_rate": 5.873167622689611e-07, "loss": 0.0032, "reward": 1.3580009937286377, "reward_std": 0.05124887079000473, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.35800105333328247, "rewards/pad": 0.0, "step": 1295 }, { "completion_length": 324.390625, "epoch": 0.4130019120458891, "grad_norm": 9.309545516967773, "kl": 0.060791015625, "learning_rate": 5.869980879541108e-07, "loss": 0.0024, "reward": 1.502913475036621, "reward_std": 0.06350995600223541, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.37791338562965393, "rewards/pad": 0.125, "step": 1296 }, { "completion_length": 166.875, "epoch": 0.4133205863607393, "grad_norm": 10.437187194824219, "kl": 0.09521484375, "learning_rate": 5.866794136392606e-07, "loss": 0.0038, "reward": 1.823692798614502, "reward_std": 0.11942209303379059, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49556779861450195, "rewards/pad": 0.328125, "step": 1297 }, { "completion_length": 334.578125, "epoch": 0.41363926067558954, "grad_norm": 15.77004337310791, "kl": 0.08251953125, "learning_rate": 5.863607393244104e-07, "loss": 0.0033, "reward": 1.4892622232437134, "reward_std": 0.08658753335475922, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4892622232437134, "rewards/pad": 0.0, "step": 1298 }, { "completion_length": 276.96875, "epoch": 0.41395793499043976, "grad_norm": 10.103075981140137, "kl": 0.09130859375, "learning_rate": 5.860420650095602e-07, "loss": 0.0036, "reward": 1.6745097637176514, "reward_std": 0.06824688613414764, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5495096445083618, "step": 1299 }, { "completion_length": 222.203125, "epoch": 0.41427660930529, "grad_norm": 13.353841781616211, "kl": 0.103515625, "learning_rate": 5.8572339069471e-07, "loss": 0.0041, "reward": 1.5671412944793701, "reward_std": 0.21640679240226746, "rewards/pad": 0.15625, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.44214126467704773, "step": 1300 }, { "completion_length": 206.265625, "epoch": 0.4145952836201402, "grad_norm": 17.48212242126465, "kl": 0.11962890625, "learning_rate": 5.854047163798598e-07, "loss": 0.0048, "reward": 1.500386357307434, "reward_std": 0.14595331251621246, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.37538638710975647, "rewards/pad": 0.125, "step": 1301 }, { "completion_length": 234.625, "epoch": 0.4149139579349904, "grad_norm": 11.124568939208984, "kl": 0.11572265625, "learning_rate": 5.850860420650096e-07, "loss": 0.0046, "reward": 1.4751906394958496, "reward_std": 0.11550100892782211, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4751906394958496, "rewards/pad": 0.0, "step": 1302 }, { "completion_length": 261.25, "epoch": 0.41523263224984064, "grad_norm": 7.58814811706543, "kl": 0.08642578125, "learning_rate": 5.847673677501594e-07, "loss": 0.0035, "reward": 1.567017912864685, "reward_std": 0.15186744928359985, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.45764294266700745, "rewards/pad": 0.125, "step": 1303 }, { "completion_length": 390.40625, "epoch": 0.41555130656469086, "grad_norm": 16.442684173583984, "kl": 0.06640625, "learning_rate": 5.844486934353091e-07, "loss": 0.0027, "reward": 1.398057460784912, "reward_std": 0.060513533651828766, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3980574607849121, "step": 1304 }, { "completion_length": 286.890625, "epoch": 0.41586998087954113, "grad_norm": 8.115775108337402, "kl": 0.08935546875, "learning_rate": 5.841300191204589e-07, "loss": 0.0036, "reward": 1.5188145637512207, "reward_std": 0.05607176199555397, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5188145637512207, "step": 1305 }, { "completion_length": 213.28125, "epoch": 0.41618865519439135, "grad_norm": 10.0141019821167, "kl": 0.1025390625, "learning_rate": 5.838113448056087e-07, "loss": 0.0041, "reward": 1.6528501510620117, "reward_std": 0.11340200901031494, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6528501510620117, "step": 1306 }, { "completion_length": 430.546875, "epoch": 0.4165073295092416, "grad_norm": 7.513751983642578, "kl": 0.051513671875, "learning_rate": 5.834926704907585e-07, "loss": 0.0021, "reward": 1.5538432598114014, "reward_std": 0.04181980714201927, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5538431406021118, "step": 1307 }, { "completion_length": 354.734375, "epoch": 0.4168260038240918, "grad_norm": 7.730433940887451, "kl": 0.06005859375, "learning_rate": 5.831739961759082e-07, "loss": 0.0024, "reward": 1.5449497699737549, "reward_std": 0.10346663743257523, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4668247699737549, "rewards/pad": 0.078125, "step": 1308 }, { "completion_length": 284.78125, "epoch": 0.417144678138942, "grad_norm": 11.019664764404297, "kl": 0.0908203125, "learning_rate": 5.82855321861058e-07, "loss": 0.0036, "reward": 1.6366920471191406, "reward_std": 0.22725805640220642, "rewards/pad": 0.1875, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4804421067237854, "step": 1309 }, { "completion_length": 238.125, "epoch": 0.41746335245379224, "grad_norm": 28.53036880493164, "kl": 0.09912109375, "learning_rate": 5.825366475462078e-07, "loss": 0.004, "reward": 1.4334297180175781, "reward_std": 0.09850458055734634, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.40217968821525574, "step": 1310 }, { "completion_length": 390.1875, "epoch": 0.41778202676864246, "grad_norm": 5.816764831542969, "kl": 0.0634765625, "learning_rate": 5.822179732313576e-07, "loss": 0.0025, "reward": 1.5051462650299072, "reward_std": 0.16161230206489563, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.48952123522758484, "step": 1311 }, { "completion_length": 234.140625, "epoch": 0.4181007010834927, "grad_norm": 10.69921875, "kl": 0.09814453125, "learning_rate": 5.818992989165073e-07, "loss": 0.0039, "reward": 1.806492805480957, "reward_std": 0.11196824908256531, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6814928650856018, "rewards/pad": 0.125, "step": 1312 }, { "completion_length": 172.796875, "epoch": 0.4184193753983429, "grad_norm": 19.093624114990234, "kl": 0.1064453125, "learning_rate": 5.815806246016571e-07, "loss": 0.0043, "reward": 1.6470550298690796, "reward_std": 0.13487771153450012, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.428305059671402, "rewards/pad": 0.21875, "step": 1313 }, { "completion_length": 474.78125, "epoch": 0.4187380497131931, "grad_norm": 10.676046371459961, "kl": 0.045654296875, "learning_rate": 5.812619502868069e-07, "loss": 0.0018, "reward": 1.5626897811889648, "reward_std": 0.12818527221679688, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4533146917819977, "step": 1314 }, { "completion_length": 353.265625, "epoch": 0.41905672402804334, "grad_norm": 12.62907886505127, "kl": 0.06494140625, "learning_rate": 5.809432759719566e-07, "loss": 0.0026, "reward": 1.5759479999542236, "reward_std": 0.11017276346683502, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.45094799995422363, "step": 1315 }, { "completion_length": 274.0625, "epoch": 0.41937539834289356, "grad_norm": 19.575292587280273, "kl": 0.08154296875, "learning_rate": 5.806246016571063e-07, "loss": 0.0033, "reward": 1.4159862995147705, "reward_std": 0.0481281504034996, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4159863591194153, "rewards/pad": 0.0, "step": 1316 }, { "completion_length": 287.0625, "epoch": 0.4196940726577438, "grad_norm": 9.858039855957031, "kl": 0.09423828125, "learning_rate": 5.803059273422561e-07, "loss": 0.0038, "reward": 1.6656700372695923, "reward_std": 0.142438605427742, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5094200372695923, "rewards/pad": 0.171875, "step": 1317 }, { "completion_length": 270.609375, "epoch": 0.420012746972594, "grad_norm": 7.314291954040527, "kl": 0.09423828125, "learning_rate": 5.799872530274059e-07, "loss": 0.0038, "reward": 1.5594706535339355, "reward_std": 0.07420562207698822, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5594705939292908, "rewards/pad": 0.0, "step": 1318 }, { "completion_length": 284.703125, "epoch": 0.4203314212874442, "grad_norm": 15.29833698272705, "kl": 0.0908203125, "learning_rate": 5.796685787125558e-07, "loss": 0.0036, "reward": 1.506531000137329, "reward_std": 0.04399479925632477, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3815310001373291, "rewards/pad": 0.125, "step": 1319 }, { "completion_length": 324.671875, "epoch": 0.42065009560229444, "grad_norm": 13.741570472717285, "kl": 0.064453125, "learning_rate": 5.793499043977055e-07, "loss": 0.0026, "reward": 1.5349571704864502, "reward_std": 0.18845012784004211, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4724571704864502, "step": 1320 }, { "completion_length": 165.75, "epoch": 0.42096876991714466, "grad_norm": 13.279805183410645, "kl": 0.1064453125, "learning_rate": 5.790312300828553e-07, "loss": 0.0043, "reward": 1.528536319732666, "reward_std": 0.11498822271823883, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5285362601280212, "rewards/pad": 0.0, "step": 1321 }, { "completion_length": 169.109375, "epoch": 0.4212874442319949, "grad_norm": 16.651105880737305, "kl": 0.10595703125, "learning_rate": 5.787125557680051e-07, "loss": 0.0042, "reward": 1.8210078477859497, "reward_std": 0.040907394140958786, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5710077881813049, "step": 1322 }, { "completion_length": 259.84375, "epoch": 0.4216061185468451, "grad_norm": 13.567558288574219, "kl": 0.09375, "learning_rate": 5.783938814531548e-07, "loss": 0.0038, "reward": 1.519647240638733, "reward_std": 0.151037335395813, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3165222406387329, "rewards/pad": 0.21875, "step": 1323 }, { "completion_length": 370.0625, "epoch": 0.4219247928616954, "grad_norm": 8.145339965820312, "kl": 0.06982421875, "learning_rate": 5.780752071383046e-07, "loss": 0.0028, "reward": 1.4528758525848389, "reward_std": 0.07324043661355972, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4528758227825165, "rewards/pad": 0.0, "step": 1324 }, { "completion_length": 208.84375, "epoch": 0.4222434671765456, "grad_norm": 33.94546127319336, "kl": 0.1142578125, "learning_rate": 5.777565328234544e-07, "loss": 0.0046, "reward": 1.355173110961914, "reward_std": 0.06881211698055267, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.35517311096191406, "rewards/pad": 0.0, "step": 1325 }, { "completion_length": 168.203125, "epoch": 0.4225621414913958, "grad_norm": 9.615571022033691, "kl": 0.1142578125, "learning_rate": 5.774378585086042e-07, "loss": 0.0046, "reward": 1.8198182582855225, "reward_std": 0.23303106427192688, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.554193377494812, "rewards/pad": 0.265625, "step": 1326 }, { "completion_length": 172.359375, "epoch": 0.42288081580624604, "grad_norm": 7.548245429992676, "kl": 0.1279296875, "learning_rate": 5.771191841937539e-07, "loss": 0.0051, "reward": 1.42877197265625, "reward_std": 0.07921026647090912, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42877188324928284, "rewards/pad": 0.0, "step": 1327 }, { "completion_length": 306.546875, "epoch": 0.42319949012109626, "grad_norm": 16.646623611450195, "kl": 0.07470703125, "learning_rate": 5.768005098789037e-07, "loss": 0.003, "reward": 1.5173559188842773, "reward_std": 0.07084318995475769, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5173557996749878, "rewards/pad": 0.0, "step": 1328 }, { "completion_length": 393.875, "epoch": 0.4235181644359465, "grad_norm": 5.775397300720215, "kl": 0.07666015625, "learning_rate": 5.764818355640535e-07, "loss": 0.0031, "reward": 1.3757777214050293, "reward_std": 0.08598428219556808, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3914026618003845, "step": 1329 }, { "completion_length": 197.78125, "epoch": 0.4238368387507967, "grad_norm": 40.24833297729492, "kl": 0.09521484375, "learning_rate": 5.761631612492033e-07, "loss": 0.0038, "reward": 1.6771042346954346, "reward_std": 0.12677818536758423, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5677293539047241, "rewards/pad": 0.125, "step": 1330 }, { "completion_length": 340.328125, "epoch": 0.4241555130656469, "grad_norm": 8.642611503601074, "kl": 0.0849609375, "learning_rate": 5.75844486934353e-07, "loss": 0.0034, "reward": 1.409783124923706, "reward_std": 0.14267167448997498, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.42540812492370605, "rewards/pad": 0.0, "step": 1331 }, { "completion_length": 254.890625, "epoch": 0.42447418738049714, "grad_norm": 11.313599586486816, "kl": 0.08056640625, "learning_rate": 5.755258126195028e-07, "loss": 0.0032, "reward": 1.4439899921417236, "reward_std": 0.055654481053352356, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.31898996233940125, "rewards/pad": 0.125, "step": 1332 }, { "completion_length": 206.21875, "epoch": 0.42479286169534736, "grad_norm": 37.72480010986328, "kl": 0.12451171875, "learning_rate": 5.752071383046526e-07, "loss": 0.005, "reward": 1.554432988166809, "reward_std": 0.06340833753347397, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5544329881668091, "rewards/pad": 0.0, "step": 1333 }, { "completion_length": 276.078125, "epoch": 0.4251115360101976, "grad_norm": 10.532992362976074, "kl": 0.0859375, "learning_rate": 5.748884639898024e-07, "loss": 0.0034, "reward": 1.5329124927520752, "reward_std": 0.07358253002166748, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5329123735427856, "step": 1334 }, { "completion_length": 315.671875, "epoch": 0.4254302103250478, "grad_norm": 4.332948207855225, "kl": 0.07958984375, "learning_rate": 5.745697896749521e-07, "loss": 0.0032, "reward": 1.4253344535827637, "reward_std": 0.10386821627616882, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.20658442378044128, "step": 1335 }, { "completion_length": 212.71875, "epoch": 0.425748884639898, "grad_norm": 15.159989356994629, "kl": 0.1123046875, "learning_rate": 5.742511153601019e-07, "loss": 0.0045, "reward": 1.6377997398376465, "reward_std": 0.14908741414546967, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6534247398376465, "rewards/pad": 0.0, "step": 1336 }, { "completion_length": 248.8125, "epoch": 0.42606755895474824, "grad_norm": 11.552496910095215, "kl": 0.09716796875, "learning_rate": 5.739324410452517e-07, "loss": 0.0039, "reward": 1.4865126609802246, "reward_std": 0.15525057911872864, "rewards/answer_reward": 0.015625, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.5021377801895142, "step": 1337 }, { "completion_length": 216.625, "epoch": 0.42638623326959846, "grad_norm": 10.185986518859863, "kl": 0.103515625, "learning_rate": 5.736137667304016e-07, "loss": 0.0041, "reward": 1.363529086112976, "reward_std": 0.09389464557170868, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.37915414571762085, "rewards/pad": 0.0, "step": 1338 }, { "completion_length": 301.71875, "epoch": 0.4267049075844487, "grad_norm": 4.8207783699035645, "kl": 0.0625, "learning_rate": 5.732950924155513e-07, "loss": 0.0025, "reward": 1.6868584156036377, "reward_std": 0.09036528319120407, "rewards/pad": 0.34375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.34310850501060486, "step": 1339 }, { "completion_length": 109.8125, "epoch": 0.4270235818992989, "grad_norm": 11.68543815612793, "kl": 0.12109375, "learning_rate": 5.729764181007011e-07, "loss": 0.0049, "reward": 1.8774080276489258, "reward_std": 0.15159153938293457, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.611782968044281, "rewards/pad": 0.265625, "step": 1340 }, { "completion_length": 201.4375, "epoch": 0.4273422562141491, "grad_norm": 12.874130249023438, "kl": 0.0986328125, "learning_rate": 5.726577437858509e-07, "loss": 0.0039, "reward": 1.6547398567199707, "reward_std": 0.12241728603839874, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5766147375106812, "rewards/pad": 0.078125, "step": 1341 }, { "completion_length": 214.984375, "epoch": 0.42766093052899934, "grad_norm": 9.551128387451172, "kl": 0.0830078125, "learning_rate": 5.723390694710007e-07, "loss": 0.0033, "reward": 1.462345004081726, "reward_std": 0.12054663896560669, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4779700040817261, "step": 1342 }, { "completion_length": 192.515625, "epoch": 0.42797960484384956, "grad_norm": 39.426368713378906, "kl": 0.09375, "learning_rate": 5.720203951561504e-07, "loss": 0.0037, "reward": 1.5890249013900757, "reward_std": 0.06130727380514145, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4640248715877533, "step": 1343 }, { "completion_length": 216.390625, "epoch": 0.42829827915869984, "grad_norm": 6.589493274688721, "kl": 0.083984375, "learning_rate": 5.717017208413002e-07, "loss": 0.0034, "reward": 1.6650612354278564, "reward_std": 0.07460354268550873, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.41506117582321167, "step": 1344 }, { "completion_length": 245.09375, "epoch": 0.42861695347355006, "grad_norm": 11.974091529846191, "kl": 0.11376953125, "learning_rate": 5.7138304652645e-07, "loss": 0.0045, "reward": 1.5561902523040771, "reward_std": 0.15294793248176575, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5718152523040771, "step": 1345 }, { "completion_length": 184.59375, "epoch": 0.4289356277884003, "grad_norm": 26.964982986450195, "kl": 0.4609375, "learning_rate": 5.710643722115998e-07, "loss": 0.0185, "reward": 1.5439296960830688, "reward_std": 0.1391330063343048, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5595547556877136, "rewards/pad": 0.0, "step": 1346 }, { "completion_length": 257.4375, "epoch": 0.4292543021032505, "grad_norm": 14.968935012817383, "kl": 0.1484375, "learning_rate": 5.707456978967495e-07, "loss": 0.006, "reward": 1.455926537513733, "reward_std": 0.14180481433868408, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4715515375137329, "step": 1347 }, { "completion_length": 301.484375, "epoch": 0.4295729764181007, "grad_norm": 31.82322120666504, "kl": 0.0703125, "learning_rate": 5.704270235818993e-07, "loss": 0.0028, "reward": 1.5665318965911865, "reward_std": 0.029603153467178345, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4415319263935089, "step": 1348 }, { "completion_length": 337.578125, "epoch": 0.42989165073295094, "grad_norm": 4.555129528045654, "kl": 0.06640625, "learning_rate": 5.701083492670491e-07, "loss": 0.0027, "reward": 1.360124945640564, "reward_std": 0.041842639446258545, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.36012494564056396, "step": 1349 }, { "completion_length": 247.921875, "epoch": 0.43021032504780116, "grad_norm": 24.405990600585938, "kl": 0.08447265625, "learning_rate": 5.697896749521989e-07, "loss": 0.0034, "reward": 1.51039719581604, "reward_std": 0.11081908643245697, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4010222554206848, "step": 1350 }, { "completion_length": 310.265625, "epoch": 0.4305289993626514, "grad_norm": 11.049640655517578, "kl": 0.08203125, "learning_rate": 5.694710006373486e-07, "loss": 0.0033, "reward": 1.4907029867172241, "reward_std": 0.04949101805686951, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4907029867172241, "rewards/pad": 0.0, "step": 1351 }, { "completion_length": 262.125, "epoch": 0.4308476736775016, "grad_norm": 4.974499702453613, "kl": 0.0947265625, "learning_rate": 5.691523263224984e-07, "loss": 0.0038, "reward": 1.4859580993652344, "reward_std": 0.07267333567142487, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3609580993652344, "rewards/pad": 0.125, "step": 1352 }, { "completion_length": 168.984375, "epoch": 0.4311663479923518, "grad_norm": 8.621573448181152, "kl": 0.1484375, "learning_rate": 5.688336520076482e-07, "loss": 0.006, "reward": 1.674235224723816, "reward_std": 0.09904008358716965, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6742351651191711, "rewards/pad": 0.0, "step": 1353 }, { "completion_length": 196.71875, "epoch": 0.43148502230720204, "grad_norm": 10.181594848632812, "kl": 0.11669921875, "learning_rate": 5.685149776927978e-07, "loss": 0.0047, "reward": 1.6469101905822754, "reward_std": 0.12676161527633667, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5219101905822754, "rewards/pad": 0.125, "step": 1354 }, { "completion_length": 322.140625, "epoch": 0.43180369662205226, "grad_norm": 4.490674018859863, "kl": 0.068359375, "learning_rate": 5.681963033779476e-07, "loss": 0.0027, "reward": 1.5910611152648926, "reward_std": 0.04071514308452606, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4660611152648926, "step": 1355 }, { "completion_length": 188.0, "epoch": 0.4321223709369025, "grad_norm": 11.657907485961914, "kl": 0.10888671875, "learning_rate": 5.678776290630974e-07, "loss": 0.0043, "reward": 1.5474567413330078, "reward_std": 0.06650790572166443, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.547456681728363, "rewards/pad": 0.0, "step": 1356 }, { "completion_length": 240.078125, "epoch": 0.4324410452517527, "grad_norm": 7.6790618896484375, "kl": 0.07958984375, "learning_rate": 5.675589547482473e-07, "loss": 0.0032, "reward": 1.6534273624420166, "reward_std": 0.14597870409488678, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4503023326396942, "rewards/pad": 0.203125, "step": 1357 }, { "completion_length": 191.546875, "epoch": 0.4327597195666029, "grad_norm": 9.91802978515625, "kl": 0.1494140625, "learning_rate": 5.67240280433397e-07, "loss": 0.006, "reward": 1.6715829372406006, "reward_std": 0.22884416580200195, "rewards/answer_reward": 0.171875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.499707967042923, "step": 1358 }, { "completion_length": 284.390625, "epoch": 0.43307839388145314, "grad_norm": 9.172077178955078, "kl": 0.08837890625, "learning_rate": 5.669216061185468e-07, "loss": 0.0035, "reward": 1.5910166501998901, "reward_std": 0.1787361055612564, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.49726665019989014, "step": 1359 }, { "completion_length": 363.84375, "epoch": 0.43339706819630336, "grad_norm": 6.0271148681640625, "kl": 0.0693359375, "learning_rate": 5.666029318036966e-07, "loss": 0.0028, "reward": 1.524344563484192, "reward_std": 0.07330280542373657, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.41496962308883667, "step": 1360 }, { "completion_length": 199.5625, "epoch": 0.4337157425111536, "grad_norm": 15.283858299255371, "kl": 0.1376953125, "learning_rate": 5.662842574888464e-07, "loss": 0.0055, "reward": 1.664130687713623, "reward_std": 0.10358402132987976, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.539130687713623, "rewards/pad": 0.125, "step": 1361 }, { "completion_length": 288.171875, "epoch": 0.4340344168260038, "grad_norm": 7.310180187225342, "kl": 0.0986328125, "learning_rate": 5.659655831739961e-07, "loss": 0.0039, "reward": 1.2990243434906006, "reward_std": 0.07281455397605896, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3146492838859558, "rewards/pad": 0.0, "step": 1362 }, { "completion_length": 230.265625, "epoch": 0.434353091140854, "grad_norm": 11.217733383178711, "kl": 0.08056640625, "learning_rate": 5.656469088591459e-07, "loss": 0.0032, "reward": 1.649712324142456, "reward_std": 0.075341135263443, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.524712324142456, "rewards/pad": 0.125, "step": 1363 }, { "completion_length": 202.203125, "epoch": 0.4346717654557043, "grad_norm": 49.92980194091797, "kl": 0.447265625, "learning_rate": 5.653282345442957e-07, "loss": 0.0179, "reward": 1.4197627305984497, "reward_std": 0.08784756064414978, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4197627604007721, "rewards/pad": 0.0, "step": 1364 }, { "completion_length": 146.1875, "epoch": 0.4349904397705545, "grad_norm": 7.490056991577148, "kl": 0.126953125, "learning_rate": 5.650095602294455e-07, "loss": 0.0051, "reward": 1.607069492340088, "reward_std": 0.08210156857967377, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4820695221424103, "rewards/pad": 0.125, "step": 1365 }, { "completion_length": 245.921875, "epoch": 0.43530911408540474, "grad_norm": 16.46823501586914, "kl": 0.09423828125, "learning_rate": 5.646908859145952e-07, "loss": 0.0038, "reward": 1.568756103515625, "reward_std": 0.08668509870767593, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5687560439109802, "rewards/pad": 0.0, "step": 1366 }, { "completion_length": 236.859375, "epoch": 0.43562778840025496, "grad_norm": 4.543325901031494, "kl": 0.10546875, "learning_rate": 5.64372211599745e-07, "loss": 0.0042, "reward": 1.3611669540405273, "reward_std": 0.022801637649536133, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3611670136451721, "rewards/pad": 0.0, "step": 1367 }, { "completion_length": 253.046875, "epoch": 0.4359464627151052, "grad_norm": 37.60184860229492, "kl": 0.09619140625, "learning_rate": 5.640535372848948e-07, "loss": 0.0038, "reward": 1.5685096979141235, "reward_std": 0.044750314205884933, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5685096383094788, "rewards/pad": 0.0, "step": 1368 }, { "completion_length": 321.796875, "epoch": 0.4362651370299554, "grad_norm": 10.251585960388184, "kl": 0.07373046875, "learning_rate": 5.637348629700446e-07, "loss": 0.003, "reward": 1.5848073959350586, "reward_std": 0.07786726951599121, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5848073959350586, "step": 1369 }, { "completion_length": 258.515625, "epoch": 0.4365838113448056, "grad_norm": 13.010295867919922, "kl": 0.11669921875, "learning_rate": 5.634161886551943e-07, "loss": 0.0047, "reward": 1.402256727218628, "reward_std": 0.06264589726924896, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4022567868232727, "rewards/pad": 0.0, "step": 1370 }, { "completion_length": 205.53125, "epoch": 0.43690248565965584, "grad_norm": 13.601364135742188, "kl": 0.09130859375, "learning_rate": 5.630975143403441e-07, "loss": 0.0037, "reward": 1.7684476375579834, "reward_std": 0.07154097408056259, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5184476971626282, "step": 1371 }, { "completion_length": 368.734375, "epoch": 0.43722115997450606, "grad_norm": 7.428386211395264, "kl": 0.06982421875, "learning_rate": 5.627788400254939e-07, "loss": 0.0028, "reward": 1.5593843460083008, "reward_std": 0.140476793050766, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.450009286403656, "step": 1372 }, { "completion_length": 107.90625, "epoch": 0.4375398342893563, "grad_norm": 22.303043365478516, "kl": 0.12255859375, "learning_rate": 5.624601657106437e-07, "loss": 0.0049, "reward": 1.643021821975708, "reward_std": 0.2257276475429535, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5648967623710632, "rewards/pad": 0.078125, "step": 1373 }, { "completion_length": 149.328125, "epoch": 0.4378585086042065, "grad_norm": 16.431352615356445, "kl": 0.1083984375, "learning_rate": 5.621414913957934e-07, "loss": 0.0043, "reward": 1.3097245693206787, "reward_std": 0.10590263456106186, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3097245693206787, "rewards/pad": 0.0, "step": 1374 }, { "completion_length": 298.609375, "epoch": 0.4381771829190567, "grad_norm": 17.105371475219727, "kl": 0.07470703125, "learning_rate": 5.618228170809432e-07, "loss": 0.003, "reward": 1.5687129497528076, "reward_std": 0.047115493565797806, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5687129497528076, "step": 1375 }, { "completion_length": 233.234375, "epoch": 0.43849585723390694, "grad_norm": 8.532923698425293, "kl": 0.11865234375, "learning_rate": 5.615041427660931e-07, "loss": 0.0048, "reward": 1.371286392211914, "reward_std": 0.15830770134925842, "rewards/answer_reward": 0.046875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3244113028049469, "step": 1376 }, { "completion_length": 247.5625, "epoch": 0.43881453154875716, "grad_norm": 10.543306350708008, "kl": 0.08154296875, "learning_rate": 5.611854684512429e-07, "loss": 0.0033, "reward": 1.3614822626113892, "reward_std": 0.14806589484214783, "rewards/pad": 0.046875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.33023232221603394, "step": 1377 }, { "completion_length": 110.71875, "epoch": 0.4391332058636074, "grad_norm": 27.713855743408203, "kl": 0.125, "learning_rate": 5.608667941363926e-07, "loss": 0.005, "reward": 1.7164729833602905, "reward_std": 0.17640821635723114, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4352230131626129, "rewards/pad": 0.28125, "step": 1378 }, { "completion_length": 403.421875, "epoch": 0.4394518801784576, "grad_norm": 4.868521213531494, "kl": 0.04248046875, "learning_rate": 5.605481198215424e-07, "loss": 0.0017, "reward": 1.3028991222381592, "reward_std": 0.027830326929688454, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3028990626335144, "step": 1379 }, { "completion_length": 206.03125, "epoch": 0.4397705544933078, "grad_norm": 12.161081314086914, "kl": 0.08349609375, "learning_rate": 5.602294455066922e-07, "loss": 0.0033, "reward": 1.6735880374908447, "reward_std": 0.08616338670253754, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5485880374908447, "rewards/pad": 0.125, "step": 1380 }, { "completion_length": 195.390625, "epoch": 0.44008922880815804, "grad_norm": 17.812496185302734, "kl": 0.234375, "learning_rate": 5.59910771191842e-07, "loss": 0.0094, "reward": 1.4883697032928467, "reward_std": 0.08894871920347214, "rewards/pad": 0.140625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.34774452447891235, "step": 1381 }, { "completion_length": 316.25, "epoch": 0.44040790312300826, "grad_norm": 9.531152725219727, "kl": 0.078125, "learning_rate": 5.595920968769917e-07, "loss": 0.0031, "reward": 1.4265236854553223, "reward_std": 0.04695095121860504, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42652374505996704, "rewards/pad": 0.0, "step": 1382 }, { "completion_length": 320.0, "epoch": 0.44072657743785854, "grad_norm": 7.157126426696777, "kl": 0.0751953125, "learning_rate": 5.592734225621415e-07, "loss": 0.003, "reward": 1.3335322141647339, "reward_std": 0.050246525555849075, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.20853224396705627, "step": 1383 }, { "completion_length": 213.734375, "epoch": 0.44104525175270876, "grad_norm": 11.768050193786621, "kl": 0.1123046875, "learning_rate": 5.589547482472913e-07, "loss": 0.0045, "reward": 1.51654851436615, "reward_std": 0.10998847335577011, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4540485143661499, "step": 1384 }, { "completion_length": 353.890625, "epoch": 0.441363926067559, "grad_norm": 7.449849605560303, "kl": 0.08251953125, "learning_rate": 5.586360739324411e-07, "loss": 0.0033, "reward": 1.274510145187378, "reward_std": 0.01488026138395071, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.2745102047920227, "step": 1385 }, { "completion_length": 165.375, "epoch": 0.4416826003824092, "grad_norm": 23.854345321655273, "kl": 0.12109375, "learning_rate": 5.583173996175908e-07, "loss": 0.0048, "reward": 1.5486022233963013, "reward_std": 0.2040618658065796, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5642272233963013, "step": 1386 }, { "completion_length": 160.5625, "epoch": 0.4420012746972594, "grad_norm": 82.924560546875, "kl": 0.1025390625, "learning_rate": 5.579987253027406e-07, "loss": 0.0041, "reward": 1.7493038177490234, "reward_std": 0.0807715579867363, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6243036985397339, "rewards/pad": 0.125, "step": 1387 }, { "completion_length": 348.265625, "epoch": 0.44231994901210964, "grad_norm": 14.425209045410156, "kl": 0.068359375, "learning_rate": 5.576800509878904e-07, "loss": 0.0027, "reward": 1.4635679721832275, "reward_std": 0.0678384006023407, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46356791257858276, "rewards/pad": 0.0, "step": 1388 }, { "completion_length": 161.71875, "epoch": 0.44263862332695986, "grad_norm": 11.94912338256836, "kl": 0.1083984375, "learning_rate": 5.573613766730401e-07, "loss": 0.0043, "reward": 1.7208974361419678, "reward_std": 0.09452605247497559, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47089749574661255, "rewards/pad": 0.25, "step": 1389 }, { "completion_length": 259.5625, "epoch": 0.4429572976418101, "grad_norm": 18.560396194458008, "kl": 0.07763671875, "learning_rate": 5.570427023581899e-07, "loss": 0.0031, "reward": 1.8574298620224, "reward_std": 0.1501571536064148, "rewards/answer_reward": 0.328125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5293048024177551, "step": 1390 }, { "completion_length": 126.78125, "epoch": 0.4432759719566603, "grad_norm": 11.051600456237793, "kl": 0.140625, "learning_rate": 5.567240280433397e-07, "loss": 0.0056, "reward": 1.5313197374343872, "reward_std": 0.21828460693359375, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.35944464802742004, "rewards/pad": 0.171875, "step": 1391 }, { "completion_length": 205.59375, "epoch": 0.4435946462715105, "grad_norm": 14.702584266662598, "kl": 0.1005859375, "learning_rate": 5.564053537284895e-07, "loss": 0.004, "reward": 1.6007747650146484, "reward_std": 0.1382533311843872, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5851497054100037, "rewards/pad": 0.015625, "step": 1392 }, { "completion_length": 213.046875, "epoch": 0.44391332058636074, "grad_norm": 11.537291526794434, "kl": 0.10546875, "learning_rate": 5.560866794136391e-07, "loss": 0.0042, "reward": 1.8047747611999512, "reward_std": 0.15652026236057281, "rewards/answer_reward": 0.203125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.6016498804092407, "step": 1393 }, { "completion_length": 294.125, "epoch": 0.44423199490121096, "grad_norm": 14.856083869934082, "kl": 0.0810546875, "learning_rate": 5.557680050987889e-07, "loss": 0.0032, "reward": 1.5364224910736084, "reward_std": 0.04746140539646149, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5364224314689636, "step": 1394 }, { "completion_length": 312.5, "epoch": 0.4445506692160612, "grad_norm": 10.558499336242676, "kl": 0.099609375, "learning_rate": 5.554493307839388e-07, "loss": 0.004, "reward": 1.3890564441680908, "reward_std": 0.10178478062152863, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4046815037727356, "step": 1395 }, { "completion_length": 276.171875, "epoch": 0.4448693435309114, "grad_norm": 8.246567726135254, "kl": 0.1337890625, "learning_rate": 5.551306564690886e-07, "loss": 0.0053, "reward": 1.415022611618042, "reward_std": 0.11157543957233429, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4150225818157196, "rewards/pad": 0.0, "step": 1396 }, { "completion_length": 226.796875, "epoch": 0.4451880178457616, "grad_norm": 13.543415069580078, "kl": 0.107421875, "learning_rate": 5.548119821542383e-07, "loss": 0.0043, "reward": 1.5432687997817993, "reward_std": 0.10068956017494202, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5432687997817993, "rewards/pad": 0.0, "step": 1397 }, { "completion_length": 218.734375, "epoch": 0.44550669216061184, "grad_norm": 7.59999942779541, "kl": 0.107421875, "learning_rate": 5.544933078393881e-07, "loss": 0.0043, "reward": 1.5217714309692383, "reward_std": 0.07551935315132141, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5217715501785278, "step": 1398 }, { "completion_length": 210.984375, "epoch": 0.44582536647546206, "grad_norm": 7.47714376449585, "kl": 0.12109375, "learning_rate": 5.541746335245379e-07, "loss": 0.0048, "reward": 1.561887264251709, "reward_std": 0.08502572774887085, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.452512264251709, "rewards/pad": 0.109375, "step": 1399 }, { "completion_length": 181.28125, "epoch": 0.4461440407903123, "grad_norm": 18.773975372314453, "kl": 0.11083984375, "learning_rate": 5.538559592096877e-07, "loss": 0.0044, "reward": 1.6833388805389404, "reward_std": 0.18605932593345642, "rewards/answer_reward": 0.28125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4177139401435852, "step": 1400 }, { "completion_length": 204.671875, "epoch": 0.4464627151051625, "grad_norm": 7.299523830413818, "kl": 0.10986328125, "learning_rate": 5.535372848948374e-07, "loss": 0.0044, "reward": 1.4669969081878662, "reward_std": 0.10377576947212219, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.48262184858322144, "step": 1401 }, { "completion_length": 335.828125, "epoch": 0.4467813894200127, "grad_norm": 29.090721130371094, "kl": 0.08349609375, "learning_rate": 5.532186105799872e-07, "loss": 0.0033, "reward": 1.482242465019226, "reward_std": 0.05453872308135033, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4822424054145813, "rewards/pad": 0.0, "step": 1402 }, { "completion_length": 298.25, "epoch": 0.447100063734863, "grad_norm": 11.817734718322754, "kl": 0.07568359375, "learning_rate": 5.52899936265137e-07, "loss": 0.003, "reward": 1.6147228479385376, "reward_std": 0.19429726898670197, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3334728479385376, "rewards/pad": 0.28125, "step": 1403 }, { "completion_length": 179.03125, "epoch": 0.4474187380497132, "grad_norm": 17.679716110229492, "kl": 0.10302734375, "learning_rate": 5.525812619502868e-07, "loss": 0.0041, "reward": 1.5345510244369507, "reward_std": 0.16566292941570282, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4408009946346283, "step": 1404 }, { "completion_length": 226.75, "epoch": 0.44773741236456344, "grad_norm": 27.14039421081543, "kl": 0.10693359375, "learning_rate": 5.522625876354365e-07, "loss": 0.0043, "reward": 1.5934033393859863, "reward_std": 0.1432866007089615, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.48402824997901917, "step": 1405 }, { "completion_length": 223.625, "epoch": 0.44805608667941366, "grad_norm": 5.414300441741943, "kl": 0.08935546875, "learning_rate": 5.519439133205863e-07, "loss": 0.0036, "reward": 1.5930595397949219, "reward_std": 0.11963119357824326, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4836845099925995, "step": 1406 }, { "completion_length": 278.21875, "epoch": 0.4483747609942639, "grad_norm": 10.535198211669922, "kl": 0.09814453125, "learning_rate": 5.516252390057361e-07, "loss": 0.0039, "reward": 1.5685639381408691, "reward_std": 0.09999606013298035, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5685639381408691, "rewards/pad": 0.0, "step": 1407 }, { "completion_length": 319.0625, "epoch": 0.4486934353091141, "grad_norm": 7.539035797119141, "kl": 0.059326171875, "learning_rate": 5.513065646908859e-07, "loss": 0.0024, "reward": 1.7355012893676758, "reward_std": 0.06021437793970108, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.48550137877464294, "step": 1408 }, { "completion_length": 156.65625, "epoch": 0.4490121096239643, "grad_norm": 48.58030700683594, "kl": 0.0986328125, "learning_rate": 5.509878903760356e-07, "loss": 0.0039, "reward": 1.8095495700836182, "reward_std": 0.12096580862998962, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5907995700836182, "rewards/pad": 0.21875, "step": 1409 }, { "completion_length": 204.453125, "epoch": 0.44933078393881454, "grad_norm": 9.988531112670898, "kl": 0.08544921875, "learning_rate": 5.506692160611854e-07, "loss": 0.0034, "reward": 1.593253493309021, "reward_std": 0.22704046964645386, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4838784635066986, "rewards/pad": 0.109375, "step": 1410 }, { "completion_length": 273.0625, "epoch": 0.44964945825366476, "grad_norm": 7.341174125671387, "kl": 0.0859375, "learning_rate": 5.503505417463352e-07, "loss": 0.0034, "reward": 1.6155000925064087, "reward_std": 0.08141268044710159, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4905000925064087, "step": 1411 }, { "completion_length": 343.21875, "epoch": 0.449968132568515, "grad_norm": 11.9456148147583, "kl": 0.0625, "learning_rate": 5.50031867431485e-07, "loss": 0.0025, "reward": 1.387474536895752, "reward_std": 0.12038862705230713, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3093494772911072, "step": 1412 }, { "completion_length": 213.390625, "epoch": 0.4502868068833652, "grad_norm": 13.47103214263916, "kl": 0.1044921875, "learning_rate": 5.497131931166347e-07, "loss": 0.0042, "reward": 1.6264615058898926, "reward_std": 0.14993353188037872, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5170865058898926, "step": 1413 }, { "completion_length": 219.453125, "epoch": 0.4506054811982154, "grad_norm": 9.811759948730469, "kl": 0.080078125, "learning_rate": 5.493945188017846e-07, "loss": 0.0032, "reward": 1.663762092590332, "reward_std": 0.21020281314849854, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4293871521949768, "step": 1414 }, { "completion_length": 415.078125, "epoch": 0.45092415551306564, "grad_norm": 3.461106777191162, "kl": 0.05224609375, "learning_rate": 5.490758444869344e-07, "loss": 0.0021, "reward": 1.4688377380371094, "reward_std": 0.0422951802611351, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.34383776783943176, "rewards/pad": 0.125, "step": 1415 }, { "completion_length": 335.234375, "epoch": 0.45124282982791586, "grad_norm": 6.391787052154541, "kl": 0.06982421875, "learning_rate": 5.487571701720841e-07, "loss": 0.0028, "reward": 1.430022120475769, "reward_std": 0.22225743532180786, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.44564709067344666, "step": 1416 }, { "completion_length": 371.890625, "epoch": 0.4515615041427661, "grad_norm": 10.911314964294434, "kl": 0.05908203125, "learning_rate": 5.484384958572339e-07, "loss": 0.0024, "reward": 1.6407203674316406, "reward_std": 0.05482962727546692, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5157203078269958, "step": 1417 }, { "completion_length": 291.5, "epoch": 0.4518801784576163, "grad_norm": 16.564706802368164, "kl": 0.08642578125, "learning_rate": 5.481198215423837e-07, "loss": 0.0035, "reward": 1.4472219944000244, "reward_std": 0.06772038340568542, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4472220242023468, "rewards/pad": 0.0, "step": 1418 }, { "completion_length": 304.78125, "epoch": 0.4521988527724665, "grad_norm": 5.892740249633789, "kl": 0.07373046875, "learning_rate": 5.478011472275335e-07, "loss": 0.0029, "reward": 1.537081003189087, "reward_std": 0.06439124047756195, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4120810627937317, "rewards/pad": 0.125, "step": 1419 }, { "completion_length": 248.75, "epoch": 0.45251752708731674, "grad_norm": 5.977435111999512, "kl": 0.09765625, "learning_rate": 5.474824729126832e-07, "loss": 0.0039, "reward": 1.4967403411865234, "reward_std": 0.12663181126117706, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49674031138420105, "rewards/pad": 0.0, "step": 1420 }, { "completion_length": 290.265625, "epoch": 0.45283620140216696, "grad_norm": 12.139585494995117, "kl": 0.0810546875, "learning_rate": 5.47163798597833e-07, "loss": 0.0032, "reward": 1.588675618171692, "reward_std": 0.12400297820568085, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.46367567777633667, "step": 1421 }, { "completion_length": 150.5625, "epoch": 0.45315487571701724, "grad_norm": 26.84971809387207, "kl": 0.10693359375, "learning_rate": 5.468451242829828e-07, "loss": 0.0043, "reward": 1.7052375078201294, "reward_std": 0.10772273689508438, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.7052374482154846, "rewards/pad": 0.0, "step": 1422 }, { "completion_length": 286.390625, "epoch": 0.45347355003186746, "grad_norm": 16.370525360107422, "kl": 0.09814453125, "learning_rate": 5.465264499681326e-07, "loss": 0.0039, "reward": 1.5122170448303223, "reward_std": 0.11250067502260208, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.2622169852256775, "step": 1423 }, { "completion_length": 194.15625, "epoch": 0.4537922243467177, "grad_norm": 9.194384574890137, "kl": 0.09912109375, "learning_rate": 5.462077756532823e-07, "loss": 0.004, "reward": 1.6611933708190918, "reward_std": 0.06436239928007126, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.41119328141212463, "rewards/pad": 0.25, "step": 1424 }, { "completion_length": 412.109375, "epoch": 0.4541108986615679, "grad_norm": 8.52474594116211, "kl": 0.06103515625, "learning_rate": 5.458891013384321e-07, "loss": 0.0024, "reward": 1.3822156190872192, "reward_std": 0.04052000492811203, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.38221555948257446, "step": 1425 }, { "completion_length": 251.15625, "epoch": 0.4544295729764181, "grad_norm": 6.234942436218262, "kl": 0.078125, "learning_rate": 5.455704270235819e-07, "loss": 0.0031, "reward": 1.6243338584899902, "reward_std": 0.039573561400175095, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49933379888534546, "step": 1426 }, { "completion_length": 224.28125, "epoch": 0.45474824729126834, "grad_norm": 9.928245544433594, "kl": 0.10400390625, "learning_rate": 5.452517527087317e-07, "loss": 0.0042, "reward": 1.652978539466858, "reward_std": 0.13306963443756104, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.6686034798622131, "step": 1427 }, { "completion_length": 262.84375, "epoch": 0.45506692160611856, "grad_norm": 13.236774444580078, "kl": 0.09228515625, "learning_rate": 5.449330783938814e-07, "loss": 0.0037, "reward": 1.5027391910552979, "reward_std": 0.06213487684726715, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5027391314506531, "rewards/pad": 0.0, "step": 1428 }, { "completion_length": 359.8125, "epoch": 0.4553855959209688, "grad_norm": 7.582083702087402, "kl": 0.068359375, "learning_rate": 5.446144040790312e-07, "loss": 0.0027, "reward": 1.538433313369751, "reward_std": 0.03907448425889015, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.413433313369751, "rewards/pad": 0.125, "step": 1429 }, { "completion_length": 214.046875, "epoch": 0.455704270235819, "grad_norm": 15.639304161071777, "kl": 0.087890625, "learning_rate": 5.44295729764181e-07, "loss": 0.0035, "reward": 1.746893286705017, "reward_std": 0.1430896818637848, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6375182867050171, "rewards/pad": 0.109375, "step": 1430 }, { "completion_length": 286.3125, "epoch": 0.4560229445506692, "grad_norm": 4.884230136871338, "kl": 0.07958984375, "learning_rate": 5.439770554493309e-07, "loss": 0.0032, "reward": 1.6326934099197388, "reward_std": 0.032782621681690216, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5076934099197388, "step": 1431 }, { "completion_length": 308.984375, "epoch": 0.45634161886551944, "grad_norm": 8.61745834350586, "kl": 0.06640625, "learning_rate": 5.436583811344804e-07, "loss": 0.0027, "reward": 1.4891072511672974, "reward_std": 0.05865410715341568, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48910731077194214, "rewards/pad": 0.0, "step": 1432 }, { "completion_length": 285.5625, "epoch": 0.45666029318036966, "grad_norm": 10.135014533996582, "kl": 0.1123046875, "learning_rate": 5.433397068196303e-07, "loss": 0.0045, "reward": 1.805755853652954, "reward_std": 0.15974077582359314, "rewards/answer_reward": 0.484375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3213808536529541, "step": 1433 }, { "completion_length": 328.390625, "epoch": 0.4569789674952199, "grad_norm": 9.830632209777832, "kl": 0.123046875, "learning_rate": 5.430210325047801e-07, "loss": 0.0049, "reward": 1.503233551979065, "reward_std": 0.11444251239299774, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5032335519790649, "step": 1434 }, { "completion_length": 224.59375, "epoch": 0.4572976418100701, "grad_norm": 10.255071640014648, "kl": 0.08642578125, "learning_rate": 5.427023581899299e-07, "loss": 0.0035, "reward": 1.5475623607635498, "reward_std": 0.04728948324918747, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.42256247997283936, "step": 1435 }, { "completion_length": 272.0625, "epoch": 0.4576163161249203, "grad_norm": 9.047293663024902, "kl": 0.107421875, "learning_rate": 5.423836838750796e-07, "loss": 0.0043, "reward": 1.5126659870147705, "reward_std": 0.09375756978988647, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5126661062240601, "step": 1436 }, { "completion_length": 289.34375, "epoch": 0.45793499043977054, "grad_norm": 9.918564796447754, "kl": 0.119140625, "learning_rate": 5.420650095602294e-07, "loss": 0.0048, "reward": 1.3692153692245483, "reward_std": 0.08862542361021042, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3692152798175812, "rewards/pad": 0.0, "step": 1437 }, { "completion_length": 370.71875, "epoch": 0.45825366475462076, "grad_norm": 6.384302616119385, "kl": 0.07177734375, "learning_rate": 5.417463352453792e-07, "loss": 0.0029, "reward": 1.4721190929412842, "reward_std": 0.08346770703792572, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47211918234825134, "rewards/pad": 0.0, "step": 1438 }, { "completion_length": 235.65625, "epoch": 0.458572339069471, "grad_norm": 29.600440979003906, "kl": 0.083984375, "learning_rate": 5.41427660930529e-07, "loss": 0.0034, "reward": 1.87379789352417, "reward_std": 0.09178641438484192, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6394227743148804, "rewards/pad": 0.234375, "step": 1439 }, { "completion_length": 311.25, "epoch": 0.4588910133843212, "grad_norm": 11.616978645324707, "kl": 0.060302734375, "learning_rate": 5.411089866156787e-07, "loss": 0.0024, "reward": 1.6346428394317627, "reward_std": 0.14542795717716217, "rewards/pad": 0.3125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.33776795864105225, "step": 1440 }, { "completion_length": 308.84375, "epoch": 0.4592096876991714, "grad_norm": 7.0898942947387695, "kl": 0.068359375, "learning_rate": 5.407903123008285e-07, "loss": 0.0027, "reward": 1.5286662578582764, "reward_std": 0.08710530400276184, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4192911386489868, "rewards/pad": 0.125, "step": 1441 }, { "completion_length": 146.25, "epoch": 0.4595283620140217, "grad_norm": 17.23931884765625, "kl": 0.119140625, "learning_rate": 5.404716379859783e-07, "loss": 0.0048, "reward": 1.466902256011963, "reward_std": 0.04941733554005623, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3419022560119629, "step": 1442 }, { "completion_length": 218.71875, "epoch": 0.4598470363288719, "grad_norm": 8.337162017822266, "kl": 0.09326171875, "learning_rate": 5.401529636711281e-07, "loss": 0.0037, "reward": 1.692416787147522, "reward_std": 0.09146659076213837, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44241684675216675, "rewards/pad": 0.25, "step": 1443 }, { "completion_length": 239.65625, "epoch": 0.46016571064372214, "grad_norm": 15.837130546569824, "kl": 0.08203125, "learning_rate": 5.398342893562778e-07, "loss": 0.0033, "reward": 1.5374739170074463, "reward_std": 0.06341423094272614, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5374739170074463, "rewards/pad": 0.0, "step": 1444 }, { "completion_length": 164.625, "epoch": 0.46048438495857236, "grad_norm": 11.028009414672852, "kl": 0.126953125, "learning_rate": 5.395156150414276e-07, "loss": 0.0051, "reward": 1.506535530090332, "reward_std": 0.16987767815589905, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.28778553009033203, "rewards/pad": 0.21875, "step": 1445 }, { "completion_length": 324.359375, "epoch": 0.4608030592734226, "grad_norm": 9.85593318939209, "kl": 0.07080078125, "learning_rate": 5.391969407265774e-07, "loss": 0.0028, "reward": 1.5697029829025269, "reward_std": 0.05483391135931015, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44470298290252686, "rewards/pad": 0.125, "step": 1446 }, { "completion_length": 321.03125, "epoch": 0.4611217335882728, "grad_norm": 10.324029922485352, "kl": 0.05859375, "learning_rate": 5.388782664117271e-07, "loss": 0.0023, "reward": 1.5770617723464966, "reward_std": 0.1372072696685791, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4676867425441742, "step": 1447 }, { "completion_length": 356.53125, "epoch": 0.461440407903123, "grad_norm": 5.90736198425293, "kl": 0.06689453125, "learning_rate": 5.385595920968769e-07, "loss": 0.0027, "reward": 1.6476682424545288, "reward_std": 0.07913592457771301, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5226683020591736, "rewards/pad": 0.125, "step": 1448 }, { "completion_length": 268.75, "epoch": 0.46175908221797324, "grad_norm": 24.653217315673828, "kl": 0.08251953125, "learning_rate": 5.382409177820267e-07, "loss": 0.0033, "reward": 1.5845396518707275, "reward_std": 0.12653054296970367, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.47516465187072754, "rewards/pad": 0.125, "step": 1449 }, { "completion_length": 336.8125, "epoch": 0.46207775653282346, "grad_norm": 14.810794830322266, "kl": 0.07177734375, "learning_rate": 5.379222434671765e-07, "loss": 0.0029, "reward": 1.5406912565231323, "reward_std": 0.21528227627277374, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.4469412863254547, "rewards/pad": 0.125, "step": 1450 }, { "completion_length": 215.84375, "epoch": 0.4623964308476737, "grad_norm": 19.48642349243164, "kl": 0.10498046875, "learning_rate": 5.376035691523262e-07, "loss": 0.0042, "reward": 1.615641474723816, "reward_std": 0.14932119846343994, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5062664747238159, "rewards/pad": 0.125, "step": 1451 }, { "completion_length": 154.0625, "epoch": 0.4627151051625239, "grad_norm": 15.968269348144531, "kl": 0.1064453125, "learning_rate": 5.372848948374761e-07, "loss": 0.0043, "reward": 1.4699654579162598, "reward_std": 0.10306274145841599, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4699653387069702, "rewards/pad": 0.0, "step": 1452 }, { "completion_length": 304.03125, "epoch": 0.4630337794773741, "grad_norm": 6.2782745361328125, "kl": 0.078125, "learning_rate": 5.369662205226259e-07, "loss": 0.0031, "reward": 1.429635763168335, "reward_std": 0.03694413974881172, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.30463579297065735, "step": 1453 }, { "completion_length": 313.109375, "epoch": 0.46335245379222434, "grad_norm": 8.989459991455078, "kl": 0.076171875, "learning_rate": 5.366475462077757e-07, "loss": 0.003, "reward": 1.55964994430542, "reward_std": 0.07360120117664337, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5596500635147095, "rewards/pad": 0.0, "step": 1454 }, { "completion_length": 209.8125, "epoch": 0.46367112810707456, "grad_norm": 19.859041213989258, "kl": 0.1005859375, "learning_rate": 5.363288718929254e-07, "loss": 0.004, "reward": 1.6786119937896729, "reward_std": 0.0755452886223793, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5536119341850281, "rewards/pad": 0.125, "step": 1455 }, { "completion_length": 233.203125, "epoch": 0.4639898024219248, "grad_norm": 13.783255577087402, "kl": 0.095703125, "learning_rate": 5.360101975780752e-07, "loss": 0.0038, "reward": 1.7623509168624878, "reward_std": 0.10795766860246658, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5123509168624878, "step": 1456 }, { "completion_length": 231.46875, "epoch": 0.464308476736775, "grad_norm": 9.37671184539795, "kl": 0.09326171875, "learning_rate": 5.35691523263225e-07, "loss": 0.0037, "reward": 1.4515711069107056, "reward_std": 0.19940702617168427, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.34219613671302795, "step": 1457 }, { "completion_length": 324.125, "epoch": 0.4646271510516252, "grad_norm": 9.546353340148926, "kl": 0.0673828125, "learning_rate": 5.353728489483748e-07, "loss": 0.0027, "reward": 1.5384440422058105, "reward_std": 0.10423227399587631, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.41344398260116577, "step": 1458 }, { "completion_length": 232.90625, "epoch": 0.46494582536647544, "grad_norm": 11.147302627563477, "kl": 0.0947265625, "learning_rate": 5.350541746335245e-07, "loss": 0.0038, "reward": 1.5543948411941528, "reward_std": 0.06055128574371338, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5543947815895081, "rewards/pad": 0.0, "step": 1459 }, { "completion_length": 308.25, "epoch": 0.46526449968132566, "grad_norm": 12.76003646850586, "kl": 0.07470703125, "learning_rate": 5.347355003186743e-07, "loss": 0.003, "reward": 1.5120352506637573, "reward_std": 0.07145994901657104, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5120352506637573, "step": 1460 }, { "completion_length": 315.890625, "epoch": 0.4655831739961759, "grad_norm": 7.023419380187988, "kl": 0.076171875, "learning_rate": 5.344168260038241e-07, "loss": 0.003, "reward": 1.4683029651641846, "reward_std": 0.09806257486343384, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.48392796516418457, "step": 1461 }, { "completion_length": 329.484375, "epoch": 0.46590184831102616, "grad_norm": 27.741416931152344, "kl": 0.07421875, "learning_rate": 5.340981516889739e-07, "loss": 0.003, "reward": 1.455172061920166, "reward_std": 0.07932863384485245, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.330172061920166, "rewards/pad": 0.125, "step": 1462 }, { "completion_length": 196.984375, "epoch": 0.4662205226258764, "grad_norm": 7.4176506996154785, "kl": 0.09423828125, "learning_rate": 5.337794773741236e-07, "loss": 0.0038, "reward": 1.753882646560669, "reward_std": 0.10934875905513763, "rewards/answer_reward": 0.34375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.41013264656066895, "step": 1463 }, { "completion_length": 179.03125, "epoch": 0.4665391969407266, "grad_norm": 17.609155654907227, "kl": 0.0986328125, "learning_rate": 5.334608030592734e-07, "loss": 0.004, "reward": 1.5946422815322876, "reward_std": 0.19063492119312286, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4852672815322876, "rewards/pad": 0.109375, "step": 1464 }, { "completion_length": 264.390625, "epoch": 0.4668578712555768, "grad_norm": 59.53977584838867, "kl": 0.09130859375, "learning_rate": 5.331421287444232e-07, "loss": 0.0036, "reward": 1.5025644302368164, "reward_std": 0.1165265142917633, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5181894898414612, "rewards/pad": 0.0, "step": 1465 }, { "completion_length": 244.765625, "epoch": 0.46717654557042704, "grad_norm": 32.96870803833008, "kl": 0.1025390625, "learning_rate": 5.32823454429573e-07, "loss": 0.0041, "reward": 1.5927139520645142, "reward_std": 0.1837758719921112, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5145889520645142, "step": 1466 }, { "completion_length": 249.59375, "epoch": 0.46749521988527726, "grad_norm": 37.91212463378906, "kl": 0.10791015625, "learning_rate": 5.325047801147227e-07, "loss": 0.0043, "reward": 1.4021697044372559, "reward_std": 0.12167386710643768, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.41779476404190063, "step": 1467 }, { "completion_length": 359.40625, "epoch": 0.4678138942001275, "grad_norm": 13.006011962890625, "kl": 0.061767578125, "learning_rate": 5.321861057998725e-07, "loss": 0.0025, "reward": 1.580627202987671, "reward_std": 0.20081724226474762, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5181272029876709, "rewards/pad": 0.09375, "step": 1468 }, { "completion_length": 257.640625, "epoch": 0.4681325685149777, "grad_norm": 22.143430709838867, "kl": 0.11083984375, "learning_rate": 5.318674314850224e-07, "loss": 0.0044, "reward": 1.6454774141311646, "reward_std": 0.09021544456481934, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5361024737358093, "rewards/pad": 0.109375, "step": 1469 }, { "completion_length": 351.6875, "epoch": 0.4684512428298279, "grad_norm": 18.117698669433594, "kl": 0.0615234375, "learning_rate": 5.315487571701722e-07, "loss": 0.0025, "reward": 1.7443134784698486, "reward_std": 0.07033313810825348, "rewards/answer_reward": 0.375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.36931341886520386, "step": 1470 }, { "completion_length": 297.453125, "epoch": 0.46876991714467814, "grad_norm": 5.271533966064453, "kl": 0.0986328125, "learning_rate": 5.312300828553218e-07, "loss": 0.0039, "reward": 1.6119842529296875, "reward_std": 0.05973343178629875, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6119842529296875, "rewards/pad": 0.0, "step": 1471 }, { "completion_length": 235.96875, "epoch": 0.46908859145952836, "grad_norm": 9.944218635559082, "kl": 0.111328125, "learning_rate": 5.309114085404716e-07, "loss": 0.0045, "reward": 1.5511842966079712, "reward_std": 0.1317683756351471, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4418092370033264, "rewards/pad": 0.109375, "step": 1472 }, { "completion_length": 246.828125, "epoch": 0.4694072657743786, "grad_norm": 9.702787399291992, "kl": 0.08837890625, "learning_rate": 5.305927342256214e-07, "loss": 0.0035, "reward": 1.7182908058166504, "reward_std": 0.1070544645190239, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6245408654212952, "step": 1473 }, { "completion_length": 401.765625, "epoch": 0.4697259400892288, "grad_norm": 7.4017229080200195, "kl": 0.06396484375, "learning_rate": 5.302740599107712e-07, "loss": 0.0026, "reward": 1.312766671180725, "reward_std": 0.16063524782657623, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.34401655197143555, "rewards/pad": 0.0, "step": 1474 }, { "completion_length": 218.765625, "epoch": 0.470044614404079, "grad_norm": 12.235037803649902, "kl": 0.07958984375, "learning_rate": 5.299553855959209e-07, "loss": 0.0032, "reward": 1.5451993942260742, "reward_std": 0.12426760792732239, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.451449453830719, "rewards/pad": 0.109375, "step": 1475 }, { "completion_length": 251.0625, "epoch": 0.47036328871892924, "grad_norm": 13.679093360900879, "kl": 0.08447265625, "learning_rate": 5.296367112810707e-07, "loss": 0.0034, "reward": 1.7477211952209473, "reward_std": 0.08329816907644272, "rewards/answer_reward": 0.140625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.6070961952209473, "step": 1476 }, { "completion_length": 302.515625, "epoch": 0.47068196303377946, "grad_norm": 9.420421600341797, "kl": 0.0771484375, "learning_rate": 5.293180369662205e-07, "loss": 0.0031, "reward": 1.6046003103256226, "reward_std": 0.049083903431892395, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.47960028052330017, "step": 1477 }, { "completion_length": 444.046875, "epoch": 0.4710006373486297, "grad_norm": 5.747177600860596, "kl": 0.064453125, "learning_rate": 5.289993626513702e-07, "loss": 0.0026, "reward": 1.641414761543274, "reward_std": 0.11404988169670105, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5945398807525635, "step": 1478 }, { "completion_length": 365.953125, "epoch": 0.4713193116634799, "grad_norm": 13.27202033996582, "kl": 0.08642578125, "learning_rate": 5.2868068833652e-07, "loss": 0.0035, "reward": 1.4590954780578613, "reward_std": 0.08784317970275879, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4590955078601837, "step": 1479 }, { "completion_length": 229.6875, "epoch": 0.4716379859783301, "grad_norm": 15.232290267944336, "kl": 0.0986328125, "learning_rate": 5.283620140216698e-07, "loss": 0.0039, "reward": 1.6809430122375488, "reward_std": 0.17516812682151794, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5715680122375488, "step": 1480 }, { "completion_length": 225.96875, "epoch": 0.4719566602931804, "grad_norm": 12.24894905090332, "kl": 0.080078125, "learning_rate": 5.280433397068196e-07, "loss": 0.0032, "reward": 1.6003769636154175, "reward_std": 0.1401960253715515, "rewards/answer_reward": 0.015625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.584752082824707, "step": 1481 }, { "completion_length": 241.15625, "epoch": 0.4722753346080306, "grad_norm": 8.365073204040527, "kl": 0.10888671875, "learning_rate": 5.277246653919693e-07, "loss": 0.0044, "reward": 1.6874208450317383, "reward_std": 0.09517542272806168, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5624208450317383, "rewards/pad": 0.125, "step": 1482 }, { "completion_length": 357.015625, "epoch": 0.47259400892288084, "grad_norm": 2.965028762817383, "kl": 0.078125, "learning_rate": 5.274059910771191e-07, "loss": 0.0031, "reward": 1.4896973371505737, "reward_std": 0.08937929570674896, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48969733715057373, "rewards/pad": 0.0, "step": 1483 }, { "completion_length": 258.78125, "epoch": 0.47291268323773106, "grad_norm": 8.647948265075684, "kl": 0.083984375, "learning_rate": 5.270873167622689e-07, "loss": 0.0034, "reward": 1.5113916397094727, "reward_std": 0.23726826906204224, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.3082665801048279, "step": 1484 }, { "completion_length": 277.171875, "epoch": 0.4732313575525813, "grad_norm": 10.09041690826416, "kl": 0.0810546875, "learning_rate": 5.267686424474187e-07, "loss": 0.0032, "reward": 1.7379040718078613, "reward_std": 0.21050803363323212, "rewards/pad": 0.375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.37852901220321655, "step": 1485 }, { "completion_length": 289.796875, "epoch": 0.4735500318674315, "grad_norm": 5.892612934112549, "kl": 0.095703125, "learning_rate": 5.264499681325684e-07, "loss": 0.0038, "reward": 1.6408710479736328, "reward_std": 0.156442791223526, "rewards/pad": 0.21875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.43774598836898804, "step": 1486 }, { "completion_length": 357.96875, "epoch": 0.4738687061822817, "grad_norm": 5.740538597106934, "kl": 0.06982421875, "learning_rate": 5.261312938177182e-07, "loss": 0.0028, "reward": 1.7562711238861084, "reward_std": 0.10480953752994537, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5375211238861084, "rewards/pad": 0.21875, "step": 1487 }, { "completion_length": 349.109375, "epoch": 0.47418738049713194, "grad_norm": 275.7038269042969, "kl": 0.09228515625, "learning_rate": 5.25812619502868e-07, "loss": 0.0037, "reward": 1.5019524097442627, "reward_std": 0.06322328746318817, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5019525289535522, "step": 1488 }, { "completion_length": 228.03125, "epoch": 0.47450605481198216, "grad_norm": 13.978322982788086, "kl": 0.08154296875, "learning_rate": 5.254939451880179e-07, "loss": 0.0033, "reward": 1.7376821041107178, "reward_std": 0.10818063467741013, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6126821637153625, "rewards/pad": 0.125, "step": 1489 }, { "completion_length": 278.046875, "epoch": 0.4748247291268324, "grad_norm": 31.306400299072266, "kl": 0.0791015625, "learning_rate": 5.251752708731676e-07, "loss": 0.0032, "reward": 1.5066144466400146, "reward_std": 0.052641309797763824, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3816143274307251, "rewards/pad": 0.125, "step": 1490 }, { "completion_length": 273.21875, "epoch": 0.4751434034416826, "grad_norm": 5.694839954376221, "kl": 0.09375, "learning_rate": 5.248565965583174e-07, "loss": 0.0038, "reward": 1.6695730686187744, "reward_std": 0.14564523100852966, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5758230686187744, "rewards/pad": 0.109375, "step": 1491 }, { "completion_length": 231.21875, "epoch": 0.4754620777565328, "grad_norm": 15.175251960754395, "kl": 0.107421875, "learning_rate": 5.245379222434672e-07, "loss": 0.0043, "reward": 1.5302845239639282, "reward_std": 0.21586251258850098, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5615345239639282, "rewards/pad": 0.0, "step": 1492 }, { "completion_length": 348.546875, "epoch": 0.47578075207138304, "grad_norm": 6.989543914794922, "kl": 0.0703125, "learning_rate": 5.24219247928617e-07, "loss": 0.0028, "reward": 1.6768548488616943, "reward_std": 0.11237145215272903, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5674799084663391, "rewards/pad": 0.125, "step": 1493 }, { "completion_length": 319.53125, "epoch": 0.47609942638623326, "grad_norm": 9.460558891296387, "kl": 0.09228515625, "learning_rate": 5.239005736137667e-07, "loss": 0.0037, "reward": 1.5112285614013672, "reward_std": 0.09450381994247437, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.511228621006012, "step": 1494 }, { "completion_length": 253.765625, "epoch": 0.4764181007010835, "grad_norm": 6.729824066162109, "kl": 0.09912109375, "learning_rate": 5.235818992989165e-07, "loss": 0.004, "reward": 1.6795250177383423, "reward_std": 0.07208022475242615, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5701500177383423, "step": 1495 }, { "completion_length": 191.921875, "epoch": 0.4767367750159337, "grad_norm": 9.541268348693848, "kl": 0.109375, "learning_rate": 5.232632249840663e-07, "loss": 0.0044, "reward": 1.529311180114746, "reward_std": 0.07672906666994095, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5293111801147461, "rewards/pad": 0.0, "step": 1496 }, { "completion_length": 327.21875, "epoch": 0.4770554493307839, "grad_norm": 6.8143086433410645, "kl": 0.091796875, "learning_rate": 5.229445506692161e-07, "loss": 0.0037, "reward": 1.588439702987671, "reward_std": 0.04701199382543564, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5884397029876709, "rewards/pad": 0.0, "step": 1497 }, { "completion_length": 174.578125, "epoch": 0.47737412364563414, "grad_norm": 13.120146751403809, "kl": 0.1083984375, "learning_rate": 5.226258763543658e-07, "loss": 0.0043, "reward": 1.6799719333648682, "reward_std": 0.05143211781978607, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6799719333648682, "rewards/pad": 0.0, "step": 1498 }, { "completion_length": 362.4375, "epoch": 0.47769279796048436, "grad_norm": 11.334291458129883, "kl": 0.0908203125, "learning_rate": 5.223072020395156e-07, "loss": 0.0036, "reward": 1.4830890893936157, "reward_std": 0.046455733478069305, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4830890893936157, "rewards/pad": 0.0, "step": 1499 }, { "completion_length": 227.25, "epoch": 0.4780114722753346, "grad_norm": 13.76636791229248, "kl": 0.095703125, "learning_rate": 5.219885277246654e-07, "loss": 0.0038, "reward": 1.5293161869049072, "reward_std": 0.11343329399824142, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5293163061141968, "rewards/pad": 0.0, "step": 1500 }, { "completion_length": 354.734375, "epoch": 0.47833014659018486, "grad_norm": 6.618967533111572, "kl": 0.0830078125, "learning_rate": 5.216698534098152e-07, "loss": 0.0033, "reward": 1.4522448778152466, "reward_std": 0.22254043817520142, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.4522448182106018, "rewards/pad": 0.03125, "step": 1501 }, { "completion_length": 322.3125, "epoch": 0.4786488209050351, "grad_norm": 30.195968627929688, "kl": 0.07080078125, "learning_rate": 5.213511790949649e-07, "loss": 0.0028, "reward": 1.5755908489227295, "reward_std": 0.14302876591682434, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4818408489227295, "step": 1502 }, { "completion_length": 273.265625, "epoch": 0.4789674952198853, "grad_norm": 5.307925701141357, "kl": 0.087890625, "learning_rate": 5.210325047801147e-07, "loss": 0.0035, "reward": 1.4843538999557495, "reward_std": 0.2061702311038971, "rewards/format_reward_tg": 0.9375, "rewards/iou_timestamp_reward": 0.5468538999557495, "rewards/pad": 0.0, "step": 1503 }, { "completion_length": 289.5, "epoch": 0.4792861695347355, "grad_norm": 19.900287628173828, "kl": 0.1220703125, "learning_rate": 5.207138304652645e-07, "loss": 0.0049, "reward": 1.44891357421875, "reward_std": 0.07818566262722015, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44891366362571716, "rewards/pad": 0.0, "step": 1504 }, { "completion_length": 287.671875, "epoch": 0.47960484384958574, "grad_norm": 14.69202709197998, "kl": 0.09814453125, "learning_rate": 5.203951561504143e-07, "loss": 0.0039, "reward": 1.3902406692504883, "reward_std": 0.07330122590065002, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.39024072885513306, "step": 1505 }, { "completion_length": 332.484375, "epoch": 0.47992351816443596, "grad_norm": 39.345054626464844, "kl": 0.0693359375, "learning_rate": 5.20076481835564e-07, "loss": 0.0028, "reward": 1.4343560934066772, "reward_std": 0.06460077315568924, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.32498109340667725, "step": 1506 }, { "completion_length": 374.046875, "epoch": 0.4802421924792862, "grad_norm": 4.585975646972656, "kl": 0.09814453125, "learning_rate": 5.197578075207139e-07, "loss": 0.0039, "reward": 1.3336130380630493, "reward_std": 0.0820106789469719, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.34923800826072693, "step": 1507 }, { "completion_length": 232.140625, "epoch": 0.4805608667941364, "grad_norm": 16.264991760253906, "kl": 0.07861328125, "learning_rate": 5.194391332058637e-07, "loss": 0.0031, "reward": 1.7201967239379883, "reward_std": 0.08893473446369171, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47019678354263306, "step": 1508 }, { "completion_length": 288.140625, "epoch": 0.4808795411089866, "grad_norm": 33.29106140136719, "kl": 0.08349609375, "learning_rate": 5.191204588910135e-07, "loss": 0.0033, "reward": 1.4425323009490967, "reward_std": 0.04038984328508377, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4425322413444519, "step": 1509 }, { "completion_length": 342.90625, "epoch": 0.48119821542383684, "grad_norm": 9.950427055358887, "kl": 0.0625, "learning_rate": 5.188017845761632e-07, "loss": 0.0025, "reward": 1.8057146072387695, "reward_std": 0.1636984646320343, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.46196460723876953, "rewards/pad": 0.359375, "step": 1510 }, { "completion_length": 214.515625, "epoch": 0.48151688973868706, "grad_norm": 8.171236991882324, "kl": 0.09326171875, "learning_rate": 5.184831102613129e-07, "loss": 0.0037, "reward": 1.408177375793457, "reward_std": 0.05359324812889099, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.2831774652004242, "step": 1511 }, { "completion_length": 258.65625, "epoch": 0.4818355640535373, "grad_norm": 9.692242622375488, "kl": 0.1015625, "learning_rate": 5.181644359464627e-07, "loss": 0.0041, "reward": 1.5358140468597412, "reward_std": 0.13970480859279633, "rewards/answer_reward": 0.09375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4576890170574188, "step": 1512 }, { "completion_length": 269.0625, "epoch": 0.4821542383683875, "grad_norm": 10.316642761230469, "kl": 0.099609375, "learning_rate": 5.178457616316124e-07, "loss": 0.004, "reward": 1.5783674716949463, "reward_std": 0.10503272712230682, "rewards/answer_reward": 0.171875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4064924716949463, "step": 1513 }, { "completion_length": 294.5, "epoch": 0.4824729126832377, "grad_norm": 26.83660888671875, "kl": 0.09423828125, "learning_rate": 5.175270873167622e-07, "loss": 0.0038, "reward": 1.566218614578247, "reward_std": 0.13746222853660583, "rewards/pad": 0.046875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5193435549736023, "step": 1514 }, { "completion_length": 303.984375, "epoch": 0.48279158699808794, "grad_norm": 6.991064548492432, "kl": 0.07861328125, "learning_rate": 5.17208413001912e-07, "loss": 0.0031, "reward": 1.561284065246582, "reward_std": 0.17649711668491364, "rewards/answer_reward": 0.078125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.49878406524658203, "step": 1515 }, { "completion_length": 395.78125, "epoch": 0.48311026131293816, "grad_norm": 7.837182521820068, "kl": 0.06298828125, "learning_rate": 5.168897386870618e-07, "loss": 0.0025, "reward": 1.5242811441421509, "reward_std": 0.04641805589199066, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3992811143398285, "rewards/pad": 0.125, "step": 1516 }, { "completion_length": 219.359375, "epoch": 0.4834289356277884, "grad_norm": 7.788435935974121, "kl": 0.11572265625, "learning_rate": 5.165710643722115e-07, "loss": 0.0046, "reward": 1.4695744514465332, "reward_std": 0.11476314067840576, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3601993918418884, "step": 1517 }, { "completion_length": 222.625, "epoch": 0.4837476099426386, "grad_norm": 23.84461784362793, "kl": 0.10400390625, "learning_rate": 5.162523900573613e-07, "loss": 0.0042, "reward": 1.7240705490112305, "reward_std": 0.10388059914112091, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5990704298019409, "rewards/pad": 0.125, "step": 1518 }, { "completion_length": 170.78125, "epoch": 0.4840662842574888, "grad_norm": 15.905930519104004, "kl": 0.099609375, "learning_rate": 5.159337157425111e-07, "loss": 0.004, "reward": 1.6322693824768066, "reward_std": 0.10194779187440872, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5228945016860962, "rewards/pad": 0.125, "step": 1519 }, { "completion_length": 306.546875, "epoch": 0.4843849585723391, "grad_norm": 24.74367332458496, "kl": 0.08740234375, "learning_rate": 5.156150414276609e-07, "loss": 0.0035, "reward": 1.5158793926239014, "reward_std": 0.07431940734386444, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.39087939262390137, "rewards/pad": 0.125, "step": 1520 }, { "completion_length": 284.390625, "epoch": 0.4847036328871893, "grad_norm": 11.4489164352417, "kl": 0.130859375, "learning_rate": 5.152963671128106e-07, "loss": 0.0052, "reward": 1.5866434574127197, "reward_std": 0.07465530186891556, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4616435170173645, "step": 1521 }, { "completion_length": 286.921875, "epoch": 0.48502230720203954, "grad_norm": 5.68887996673584, "kl": 0.07421875, "learning_rate": 5.149776927979604e-07, "loss": 0.003, "reward": 1.4670679569244385, "reward_std": 0.11055351793766022, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3576929569244385, "rewards/pad": 0.125, "step": 1522 }, { "completion_length": 331.75, "epoch": 0.48534098151688976, "grad_norm": 4.68539571762085, "kl": 0.0673828125, "learning_rate": 5.146590184831102e-07, "loss": 0.0027, "reward": 1.4538731575012207, "reward_std": 0.06285285204648972, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3288731276988983, "step": 1523 }, { "completion_length": 333.78125, "epoch": 0.48565965583174, "grad_norm": 8.88425350189209, "kl": 0.095703125, "learning_rate": 5.1434034416826e-07, "loss": 0.0038, "reward": 1.3173884153366089, "reward_std": 0.04951098561286926, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3173884153366089, "step": 1524 }, { "completion_length": 424.8125, "epoch": 0.4859783301465902, "grad_norm": 5.125090599060059, "kl": 0.05224609375, "learning_rate": 5.140216698534097e-07, "loss": 0.0021, "reward": 1.418252944946289, "reward_std": 0.06199708208441734, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.43387794494628906, "step": 1525 }, { "completion_length": 233.015625, "epoch": 0.4862970044614404, "grad_norm": 10.070802688598633, "kl": 0.08642578125, "learning_rate": 5.137029955385595e-07, "loss": 0.0035, "reward": 1.6166040897369385, "reward_std": 0.168365940451622, "rewards/answer_reward": 0.3125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3197290897369385, "step": 1526 }, { "completion_length": 239.21875, "epoch": 0.48661567877629064, "grad_norm": 23.041690826416016, "kl": 0.099609375, "learning_rate": 5.133843212237094e-07, "loss": 0.004, "reward": 1.650726556777954, "reward_std": 0.11059662699699402, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5257264971733093, "rewards/pad": 0.125, "step": 1527 }, { "completion_length": 126.6875, "epoch": 0.48693435309114086, "grad_norm": 12.056344985961914, "kl": 0.1142578125, "learning_rate": 5.130656469088592e-07, "loss": 0.0046, "reward": 1.7706551551818848, "reward_std": 0.12458785623311996, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.52065509557724, "step": 1528 }, { "completion_length": 180.921875, "epoch": 0.4872530274059911, "grad_norm": 5.845155715942383, "kl": 0.11328125, "learning_rate": 5.127469725940089e-07, "loss": 0.0045, "reward": 1.4621466398239136, "reward_std": 0.09760545194149017, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46214666962623596, "rewards/pad": 0.0, "step": 1529 }, { "completion_length": 183.96875, "epoch": 0.4875717017208413, "grad_norm": 18.358531951904297, "kl": 0.09814453125, "learning_rate": 5.124282982791587e-07, "loss": 0.0039, "reward": 1.92790687084198, "reward_std": 0.19410569965839386, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6154068112373352, "rewards/pad": 0.328125, "step": 1530 }, { "completion_length": 244.03125, "epoch": 0.4878903760356915, "grad_norm": 12.977493286132812, "kl": 0.0830078125, "learning_rate": 5.121096239643085e-07, "loss": 0.0033, "reward": 1.7430362701416016, "reward_std": 0.12946908175945282, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5711612105369568, "rewards/pad": 0.171875, "step": 1531 }, { "completion_length": 219.734375, "epoch": 0.48820905035054174, "grad_norm": 8.308573722839355, "kl": 0.10546875, "learning_rate": 5.117909496494583e-07, "loss": 0.0042, "reward": 1.6283482313156128, "reward_std": 0.08693157136440277, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6283482313156128, "rewards/pad": 0.0, "step": 1532 }, { "completion_length": 243.03125, "epoch": 0.48852772466539196, "grad_norm": 12.113231658935547, "kl": 0.10205078125, "learning_rate": 5.11472275334608e-07, "loss": 0.0041, "reward": 1.3734705448150635, "reward_std": 0.07666440308094025, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3734704852104187, "rewards/pad": 0.0, "step": 1533 }, { "completion_length": 221.328125, "epoch": 0.4888463989802422, "grad_norm": 15.715192794799805, "kl": 0.1025390625, "learning_rate": 5.111536010197578e-07, "loss": 0.0041, "reward": 1.5486443042755127, "reward_std": 0.14197894930839539, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4236442446708679, "rewards/pad": 0.125, "step": 1534 }, { "completion_length": 329.125, "epoch": 0.4891650732950924, "grad_norm": 18.260419845581055, "kl": 0.06591796875, "learning_rate": 5.108349267049076e-07, "loss": 0.0026, "reward": 1.6931008100509644, "reward_std": 0.04175805300474167, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.44310078024864197, "step": 1535 }, { "completion_length": 384.28125, "epoch": 0.4894837476099426, "grad_norm": 3.997661590576172, "kl": 0.06640625, "learning_rate": 5.105162523900574e-07, "loss": 0.0027, "reward": 1.495345115661621, "reward_std": 0.1974174678325653, "rewards/format_reward_tg": 0.953125, "rewards/iou_timestamp_reward": 0.5422199964523315, "rewards/pad": 0.0, "step": 1536 }, { "completion_length": 328.578125, "epoch": 0.48980242192479284, "grad_norm": 6.118681907653809, "kl": 0.07763671875, "learning_rate": 5.101975780752071e-07, "loss": 0.0031, "reward": 1.459146499633789, "reward_std": 0.11266092956066132, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.47477149963378906, "step": 1537 }, { "completion_length": 221.484375, "epoch": 0.49012109623964306, "grad_norm": 32.38083267211914, "kl": 0.10107421875, "learning_rate": 5.098789037603569e-07, "loss": 0.0041, "reward": 1.5192437171936035, "reward_std": 0.10911397635936737, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.40986865758895874, "step": 1538 }, { "completion_length": 275.5625, "epoch": 0.4904397705544933, "grad_norm": 25.792707443237305, "kl": 0.11376953125, "learning_rate": 5.095602294455067e-07, "loss": 0.0045, "reward": 1.474266767501831, "reward_std": 0.04674249142408371, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47426676750183105, "step": 1539 }, { "completion_length": 173.75, "epoch": 0.49075844486934356, "grad_norm": 15.709648132324219, "kl": 0.1318359375, "learning_rate": 5.092415551306564e-07, "loss": 0.0053, "reward": 1.5435810089111328, "reward_std": 0.09169697761535645, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4342060983181, "rewards/pad": 0.109375, "step": 1540 }, { "completion_length": 279.578125, "epoch": 0.4910771191841938, "grad_norm": 8.689448356628418, "kl": 0.0791015625, "learning_rate": 5.089228808158062e-07, "loss": 0.0032, "reward": 1.5602798461914062, "reward_std": 0.16906121373176575, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.40402984619140625, "step": 1541 }, { "completion_length": 340.546875, "epoch": 0.491395793499044, "grad_norm": 15.626315116882324, "kl": 0.07373046875, "learning_rate": 5.08604206500956e-07, "loss": 0.0029, "reward": 1.3878772258758545, "reward_std": 0.05019243061542511, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3878771960735321, "step": 1542 }, { "completion_length": 210.53125, "epoch": 0.4917144678138942, "grad_norm": 28.460586547851562, "kl": 0.10791015625, "learning_rate": 5.082855321861058e-07, "loss": 0.0043, "reward": 1.3582227230072021, "reward_std": 0.05405221879482269, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3582227826118469, "rewards/pad": 0.0, "step": 1543 }, { "completion_length": 231.71875, "epoch": 0.49203314212874444, "grad_norm": 52.28375244140625, "kl": 0.08984375, "learning_rate": 5.079668578712555e-07, "loss": 0.0036, "reward": 1.8166325092315674, "reward_std": 0.22572393715381622, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6291325688362122, "rewards/pad": 0.203125, "step": 1544 }, { "completion_length": 159.40625, "epoch": 0.49235181644359466, "grad_norm": 7.311517715454102, "kl": 0.099609375, "learning_rate": 5.076481835564054e-07, "loss": 0.004, "reward": 1.329869270324707, "reward_std": 0.09352267533540726, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3298693001270294, "step": 1545 }, { "completion_length": 223.578125, "epoch": 0.4926704907584449, "grad_norm": 14.767241477966309, "kl": 0.09765625, "learning_rate": 5.073295092415552e-07, "loss": 0.0039, "reward": 1.7060375213623047, "reward_std": 0.12457602471113205, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.6122875213623047, "step": 1546 }, { "completion_length": 273.546875, "epoch": 0.4929891650732951, "grad_norm": 9.330548286437988, "kl": 0.12890625, "learning_rate": 5.07010834926705e-07, "loss": 0.0051, "reward": 1.4542279243469238, "reward_std": 0.13715635240077972, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4698530435562134, "step": 1547 }, { "completion_length": 257.03125, "epoch": 0.4933078393881453, "grad_norm": 25.97972869873047, "kl": 0.099609375, "learning_rate": 5.066921606118547e-07, "loss": 0.004, "reward": 1.437532663345337, "reward_std": 0.11327797174453735, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.45315760374069214, "rewards/pad": 0.0, "step": 1548 }, { "completion_length": 238.40625, "epoch": 0.49362651370299554, "grad_norm": 6.355741024017334, "kl": 0.07080078125, "learning_rate": 5.063734862970045e-07, "loss": 0.0028, "reward": 1.877377986907959, "reward_std": 0.11931359767913818, "rewards/answer_reward": 0.484375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3930028975009918, "step": 1549 }, { "completion_length": 372.40625, "epoch": 0.49394518801784576, "grad_norm": 9.344226837158203, "kl": 0.078125, "learning_rate": 5.060548119821542e-07, "loss": 0.0031, "reward": 1.4983468055725098, "reward_std": 0.14175045490264893, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5139718055725098, "step": 1550 }, { "completion_length": 255.0625, "epoch": 0.494263862332696, "grad_norm": 22.564973831176758, "kl": 0.1015625, "learning_rate": 5.05736137667304e-07, "loss": 0.0041, "reward": 1.4562467336654663, "reward_std": 0.16175200045108795, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.4874967336654663, "step": 1551 }, { "completion_length": 288.609375, "epoch": 0.4945825366475462, "grad_norm": 6.467219352722168, "kl": 0.08349609375, "learning_rate": 5.054174633524537e-07, "loss": 0.0033, "reward": 1.5488793849945068, "reward_std": 0.06914226710796356, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5488794445991516, "step": 1552 }, { "completion_length": 267.984375, "epoch": 0.4949012109623964, "grad_norm": 6.855160236358643, "kl": 0.0830078125, "learning_rate": 5.050987890376035e-07, "loss": 0.0033, "reward": 1.58836030960083, "reward_std": 0.06652949750423431, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4633602797985077, "step": 1553 }, { "completion_length": 228.734375, "epoch": 0.49521988527724664, "grad_norm": 19.235485076904297, "kl": 0.10498046875, "learning_rate": 5.047801147227533e-07, "loss": 0.0042, "reward": 1.5286242961883545, "reward_std": 0.11664712429046631, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.2942492365837097, "step": 1554 }, { "completion_length": 280.9375, "epoch": 0.49553855959209686, "grad_norm": 10.115842819213867, "kl": 0.09423828125, "learning_rate": 5.044614404079031e-07, "loss": 0.0038, "reward": 1.548297643661499, "reward_std": 0.11133519560098648, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5639227032661438, "rewards/pad": 0.0, "step": 1555 }, { "completion_length": 226.234375, "epoch": 0.4958572339069471, "grad_norm": 22.632034301757812, "kl": 0.10986328125, "learning_rate": 5.041427660930528e-07, "loss": 0.0044, "reward": 1.5520951747894287, "reward_std": 0.11214704811573029, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5520952343940735, "rewards/pad": 0.0, "step": 1556 }, { "completion_length": 387.8125, "epoch": 0.4961759082217973, "grad_norm": 4.392566204071045, "kl": 0.07080078125, "learning_rate": 5.038240917782026e-07, "loss": 0.0028, "reward": 1.5486336946487427, "reward_std": 0.04238799214363098, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5486336946487427, "rewards/pad": 0.0, "step": 1557 }, { "completion_length": 266.71875, "epoch": 0.4964945825366475, "grad_norm": 10.745803833007812, "kl": 0.0947265625, "learning_rate": 5.035054174633524e-07, "loss": 0.0038, "reward": 1.51939058303833, "reward_std": 0.10485051572322845, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5350156426429749, "rewards/pad": 0.0, "step": 1558 }, { "completion_length": 287.21875, "epoch": 0.49681325685149774, "grad_norm": 9.209283828735352, "kl": 0.08935546875, "learning_rate": 5.031867431485022e-07, "loss": 0.0036, "reward": 1.5379369258880615, "reward_std": 0.28919923305511475, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.953125, "rewards/iou_glue_reward": 0.4598119258880615, "step": 1559 }, { "completion_length": 279.828125, "epoch": 0.497131931166348, "grad_norm": 9.263772964477539, "kl": 0.0927734375, "learning_rate": 5.028680688336519e-07, "loss": 0.0037, "reward": 1.4220798015594482, "reward_std": 0.10979984700679779, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4377047121524811, "rewards/pad": 0.0, "step": 1560 }, { "completion_length": 182.3125, "epoch": 0.49745060548119824, "grad_norm": 14.146515846252441, "kl": 0.11572265625, "learning_rate": 5.025493945188017e-07, "loss": 0.0046, "reward": 1.6618396043777466, "reward_std": 0.09788843989372253, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5368396043777466, "rewards/pad": 0.125, "step": 1561 }, { "completion_length": 191.421875, "epoch": 0.49776927979604846, "grad_norm": 53.63630676269531, "kl": 0.09521484375, "learning_rate": 5.022307202039515e-07, "loss": 0.0038, "reward": 1.9397457838058472, "reward_std": 0.10246553272008896, "rewards/answer_reward": 0.5, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.43974587321281433, "step": 1562 }, { "completion_length": 250.953125, "epoch": 0.4980879541108987, "grad_norm": 6.227761745452881, "kl": 0.09521484375, "learning_rate": 5.019120458891013e-07, "loss": 0.0038, "reward": 1.4951188564300537, "reward_std": 0.14482280611991882, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5263688564300537, "step": 1563 }, { "completion_length": 416.875, "epoch": 0.4984066284257489, "grad_norm": 5.780639171600342, "kl": 0.059814453125, "learning_rate": 5.01593371574251e-07, "loss": 0.0024, "reward": 1.532064437866211, "reward_std": 0.03820374235510826, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5320644378662109, "step": 1564 }, { "completion_length": 221.484375, "epoch": 0.4987253027405991, "grad_norm": 15.810646057128906, "kl": 0.1015625, "learning_rate": 5.012746972594009e-07, "loss": 0.0041, "reward": 1.7277759313583374, "reward_std": 0.0686957985162735, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6027759313583374, "rewards/pad": 0.125, "step": 1565 }, { "completion_length": 297.9375, "epoch": 0.49904397705544934, "grad_norm": 8.922152519226074, "kl": 0.08642578125, "learning_rate": 5.009560229445507e-07, "loss": 0.0035, "reward": 1.2540583610534668, "reward_std": 0.31059730052948, "rewards/format_reward_tg": 0.921875, "rewards/iou_timestamp_reward": 0.316558301448822, "rewards/pad": 0.015625, "step": 1566 }, { "completion_length": 220.203125, "epoch": 0.49936265137029956, "grad_norm": 17.810441970825195, "kl": 0.09228515625, "learning_rate": 5.006373486297005e-07, "loss": 0.0037, "reward": 1.598185420036316, "reward_std": 0.1599069982767105, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3638104498386383, "step": 1567 }, { "completion_length": 271.53125, "epoch": 0.4996813256851498, "grad_norm": 5.525754928588867, "kl": 0.080078125, "learning_rate": 5.003186743148502e-07, "loss": 0.0032, "reward": 1.5842467546463013, "reward_std": 0.12647908926010132, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5998716950416565, "step": 1568 }, { "completion_length": 277.890625, "epoch": 0.5, "grad_norm": 31.218156814575195, "kl": 0.0859375, "learning_rate": 5e-07, "loss": 0.0034, "reward": 1.6523618698120117, "reward_std": 0.19241999089717865, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5586118102073669, "step": 1569 }, { "completion_length": 268.0625, "epoch": 0.5003186743148502, "grad_norm": 5.818955898284912, "kl": 0.08984375, "learning_rate": 4.996813256851498e-07, "loss": 0.0036, "reward": 1.4741897583007812, "reward_std": 0.21641533076763153, "rewards/format_reward_tg": 0.953125, "rewards/iou_timestamp_reward": 0.5210647583007812, "rewards/pad": 0.0, "step": 1570 }, { "completion_length": 230.578125, "epoch": 0.5006373486297004, "grad_norm": 8.602928161621094, "kl": 0.09765625, "learning_rate": 4.993626513702995e-07, "loss": 0.0039, "reward": 1.467624545097351, "reward_std": 0.217521071434021, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.32699960470199585, "rewards/pad": 0.15625, "step": 1571 }, { "completion_length": 345.484375, "epoch": 0.5009560229445507, "grad_norm": 5.404134750366211, "kl": 0.07763671875, "learning_rate": 4.990439770554493e-07, "loss": 0.0031, "reward": 1.6908576488494873, "reward_std": 0.07052527368068695, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5658575892448425, "rewards/pad": 0.125, "step": 1572 }, { "completion_length": 289.703125, "epoch": 0.5012746972594009, "grad_norm": 11.262680053710938, "kl": 0.0986328125, "learning_rate": 4.987253027405991e-07, "loss": 0.004, "reward": 1.4238890409469604, "reward_std": 0.13102203607559204, "rewards/answer_reward": 0.03125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.39263904094696045, "step": 1573 }, { "completion_length": 204.421875, "epoch": 0.5015933715742511, "grad_norm": 16.025836944580078, "kl": 0.11279296875, "learning_rate": 4.984066284257489e-07, "loss": 0.0045, "reward": 1.6819837093353271, "reward_std": 0.11338447034358978, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5726087093353271, "rewards/pad": 0.109375, "step": 1574 }, { "completion_length": 283.59375, "epoch": 0.5019120458891013, "grad_norm": 7.817051887512207, "kl": 0.0986328125, "learning_rate": 4.980879541108986e-07, "loss": 0.0039, "reward": 1.5237798690795898, "reward_std": 0.0966147929430008, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4144047498703003, "step": 1575 }, { "completion_length": 272.65625, "epoch": 0.5022307202039515, "grad_norm": 10.048602104187012, "kl": 0.083984375, "learning_rate": 4.977692797960484e-07, "loss": 0.0034, "reward": 1.7033123970031738, "reward_std": 0.09860232472419739, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.45331239700317383, "step": 1576 }, { "completion_length": 201.5625, "epoch": 0.5025493945188018, "grad_norm": 15.853896141052246, "kl": 0.10400390625, "learning_rate": 4.974506054811982e-07, "loss": 0.0042, "reward": 1.5546913146972656, "reward_std": 0.1912136971950531, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5703162550926208, "step": 1577 }, { "completion_length": 317.140625, "epoch": 0.502868068833652, "grad_norm": 6.66535758972168, "kl": 0.07763671875, "learning_rate": 4.97131931166348e-07, "loss": 0.0031, "reward": 1.323586106300354, "reward_std": 0.2054770141839981, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.953125, "rewards/tracking_iou_reward": 0.37046104669570923, "step": 1578 }, { "completion_length": 285.765625, "epoch": 0.5031867431485022, "grad_norm": 11.269193649291992, "kl": 0.09912109375, "learning_rate": 4.968132568514977e-07, "loss": 0.004, "reward": 1.6706132888793945, "reward_std": 0.14314395189285278, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5612383484840393, "step": 1579 }, { "completion_length": 269.6875, "epoch": 0.5035054174633524, "grad_norm": 11.386666297912598, "kl": 0.09228515625, "learning_rate": 4.964945825366475e-07, "loss": 0.0037, "reward": 1.4375762939453125, "reward_std": 0.15949112176895142, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.3438262939453125, "step": 1580 }, { "completion_length": 359.984375, "epoch": 0.5038240917782026, "grad_norm": 13.61768913269043, "kl": 0.0654296875, "learning_rate": 4.961759082217972e-07, "loss": 0.0026, "reward": 1.4767695665359497, "reward_std": 0.23325678706169128, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.953125, "rewards/tracking_iou_reward": 0.5236445069313049, "step": 1581 }, { "completion_length": 218.921875, "epoch": 0.5041427660930529, "grad_norm": 8.900588035583496, "kl": 0.09814453125, "learning_rate": 4.95857233906947e-07, "loss": 0.0039, "reward": 1.6065030097961426, "reward_std": 0.06527124345302582, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.606502890586853, "step": 1582 }, { "completion_length": 216.53125, "epoch": 0.5044614404079031, "grad_norm": 8.777047157287598, "kl": 0.1337890625, "learning_rate": 4.955385595920969e-07, "loss": 0.0053, "reward": 1.6224644184112549, "reward_std": 0.06472902745008469, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49746444821357727, "step": 1583 }, { "completion_length": 398.28125, "epoch": 0.5047801147227533, "grad_norm": 5.128695011138916, "kl": 0.05078125, "learning_rate": 4.952198852772467e-07, "loss": 0.002, "reward": 1.6772174835205078, "reward_std": 0.13002997636795044, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4584675431251526, "step": 1584 }, { "completion_length": 224.328125, "epoch": 0.5050987890376035, "grad_norm": 8.237434387207031, "kl": 0.083984375, "learning_rate": 4.949012109623964e-07, "loss": 0.0034, "reward": 1.6073098182678223, "reward_std": 0.10495173186063766, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.13855993747711182, "rewards/pad": 0.46875, "step": 1585 }, { "completion_length": 253.96875, "epoch": 0.5054174633524537, "grad_norm": 7.674633026123047, "kl": 0.1083984375, "learning_rate": 4.945825366475462e-07, "loss": 0.0043, "reward": 1.4445209503173828, "reward_std": 0.16878455877304077, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4601460099220276, "rewards/pad": 0.0, "step": 1586 }, { "completion_length": 205.28125, "epoch": 0.505736137667304, "grad_norm": 5.523696422576904, "kl": 0.09326171875, "learning_rate": 4.94263862332696e-07, "loss": 0.0037, "reward": 1.7708675861358643, "reward_std": 0.0980464294552803, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.645867645740509, "step": 1587 }, { "completion_length": 387.375, "epoch": 0.5060548119821542, "grad_norm": 8.26123332977295, "kl": 0.0625, "learning_rate": 4.939451880178458e-07, "loss": 0.0025, "reward": 1.524539828300476, "reward_std": 0.12007015943527222, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4932897686958313, "step": 1588 }, { "completion_length": 383.890625, "epoch": 0.5063734862970045, "grad_norm": 5.443143367767334, "kl": 0.06005859375, "learning_rate": 4.936265137029955e-07, "loss": 0.0024, "reward": 1.693671464920044, "reward_std": 0.04399287700653076, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.44367146492004395, "step": 1589 }, { "completion_length": 339.703125, "epoch": 0.5066921606118547, "grad_norm": 3.934993028640747, "kl": 0.08251953125, "learning_rate": 4.933078393881453e-07, "loss": 0.0033, "reward": 1.6259467601776123, "reward_std": 0.13442574441432953, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5165717005729675, "step": 1590 }, { "completion_length": 164.15625, "epoch": 0.507010834926705, "grad_norm": 15.19997787475586, "kl": 0.1259765625, "learning_rate": 4.929891650732951e-07, "loss": 0.0051, "reward": 1.5974699258804321, "reward_std": 0.1301935464143753, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5037198662757874, "step": 1591 }, { "completion_length": 229.828125, "epoch": 0.5073295092415552, "grad_norm": 6.299907684326172, "kl": 0.1103515625, "learning_rate": 4.926704907584449e-07, "loss": 0.0044, "reward": 1.4970672130584717, "reward_std": 0.12784096598625183, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3876921534538269, "step": 1592 }, { "completion_length": 341.140625, "epoch": 0.5076481835564054, "grad_norm": 5.9435343742370605, "kl": 0.0947265625, "learning_rate": 4.923518164435946e-07, "loss": 0.0038, "reward": 1.5198999643325806, "reward_std": 0.10769708454608917, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4105249047279358, "step": 1593 }, { "completion_length": 308.359375, "epoch": 0.5079668578712556, "grad_norm": 18.813356399536133, "kl": 0.060546875, "learning_rate": 4.920331421287444e-07, "loss": 0.0024, "reward": 1.5813418626785278, "reward_std": 0.1093776598572731, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.47196686267852783, "rewards/pad": 0.125, "step": 1594 }, { "completion_length": 273.34375, "epoch": 0.5082855321861058, "grad_norm": 10.208179473876953, "kl": 0.08251953125, "learning_rate": 4.917144678138942e-07, "loss": 0.0033, "reward": 1.6122044324874878, "reward_std": 0.23450268805027008, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.39345434308052063, "rewards/pad": 0.25, "step": 1595 }, { "completion_length": 263.703125, "epoch": 0.5086042065009561, "grad_norm": 7.797765731811523, "kl": 0.076171875, "learning_rate": 4.91395793499044e-07, "loss": 0.003, "reward": 1.4443575143814087, "reward_std": 0.11956124752759933, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4599825143814087, "rewards/pad": 0.0, "step": 1596 }, { "completion_length": 394.984375, "epoch": 0.5089228808158063, "grad_norm": 11.725752830505371, "kl": 0.0654296875, "learning_rate": 4.910771191841937e-07, "loss": 0.0026, "reward": 1.3768037557601929, "reward_std": 0.07617262005805969, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3768037259578705, "step": 1597 }, { "completion_length": 338.140625, "epoch": 0.5092415551306565, "grad_norm": 4.875799655914307, "kl": 0.099609375, "learning_rate": 4.907584448693435e-07, "loss": 0.004, "reward": 1.4290649890899658, "reward_std": 0.042309802025556564, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4290649890899658, "rewards/pad": 0.0, "step": 1598 }, { "completion_length": 289.796875, "epoch": 0.5095602294455067, "grad_norm": 13.580358505249023, "kl": 0.09033203125, "learning_rate": 4.904397705544932e-07, "loss": 0.0036, "reward": 1.3642786741256714, "reward_std": 0.09266138076782227, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3799036741256714, "step": 1599 }, { "completion_length": 292.46875, "epoch": 0.5098789037603569, "grad_norm": 21.963088989257812, "kl": 0.07958984375, "learning_rate": 4.90121096239643e-07, "loss": 0.0032, "reward": 1.4895875453948975, "reward_std": 0.05440334975719452, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.36458754539489746, "step": 1600 }, { "completion_length": 233.984375, "epoch": 0.5101975780752072, "grad_norm": 15.617948532104492, "kl": 0.0966796875, "learning_rate": 4.898024219247928e-07, "loss": 0.0039, "reward": 1.5592432022094727, "reward_std": 0.1363193839788437, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4498681426048279, "rewards/pad": 0.125, "step": 1601 }, { "completion_length": 168.90625, "epoch": 0.5105162523900574, "grad_norm": 10.946663856506348, "kl": 0.10498046875, "learning_rate": 4.894837476099425e-07, "loss": 0.0042, "reward": 1.5720787048339844, "reward_std": 0.08019654452800751, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5720787644386292, "step": 1602 }, { "completion_length": 323.90625, "epoch": 0.5108349267049076, "grad_norm": 11.409135818481445, "kl": 0.07080078125, "learning_rate": 4.891650732950924e-07, "loss": 0.0028, "reward": 1.6570066213607788, "reward_std": 0.11523165553808212, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5476316809654236, "rewards/pad": 0.125, "step": 1603 }, { "completion_length": 323.5, "epoch": 0.5111536010197578, "grad_norm": 11.713022232055664, "kl": 0.07958984375, "learning_rate": 4.888463989802422e-07, "loss": 0.0032, "reward": 1.5533918142318726, "reward_std": 0.13724485039710999, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5690168142318726, "step": 1604 }, { "completion_length": 246.78125, "epoch": 0.511472275334608, "grad_norm": 11.342105865478516, "kl": 0.08740234375, "learning_rate": 4.88527724665392e-07, "loss": 0.0035, "reward": 1.4041597843170166, "reward_std": 0.08910918235778809, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.40415966510772705, "rewards/pad": 0.0, "step": 1605 }, { "completion_length": 287.859375, "epoch": 0.5117909496494583, "grad_norm": 11.058752059936523, "kl": 0.07568359375, "learning_rate": 4.882090503505417e-07, "loss": 0.003, "reward": 1.6646125316619873, "reward_std": 0.10868878662586212, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5552375912666321, "step": 1606 }, { "completion_length": 334.265625, "epoch": 0.5121096239643085, "grad_norm": 9.2514066696167, "kl": 0.058837890625, "learning_rate": 4.878903760356915e-07, "loss": 0.0024, "reward": 1.4564669132232666, "reward_std": 0.14030423760414124, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3314668834209442, "step": 1607 }, { "completion_length": 367.28125, "epoch": 0.5124282982791587, "grad_norm": 8.841236114501953, "kl": 0.06494140625, "learning_rate": 4.875717017208413e-07, "loss": 0.0026, "reward": 1.513685703277588, "reward_std": 0.0567639134824276, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5136858224868774, "step": 1608 }, { "completion_length": 218.296875, "epoch": 0.5127469725940089, "grad_norm": 21.75227928161621, "kl": 0.09423828125, "learning_rate": 4.872530274059911e-07, "loss": 0.0038, "reward": 1.4949934482574463, "reward_std": 0.2251434624195099, "rewards/answer_reward": 0.09375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4168684482574463, "step": 1609 }, { "completion_length": 267.765625, "epoch": 0.5130656469088591, "grad_norm": 13.175375938415527, "kl": 0.0849609375, "learning_rate": 4.869343530911408e-07, "loss": 0.0034, "reward": 1.5545045137405396, "reward_std": 0.055992916226387024, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5545045137405396, "rewards/pad": 0.0, "step": 1610 }, { "completion_length": 184.625, "epoch": 0.5133843212237094, "grad_norm": 6.863095760345459, "kl": 0.0986328125, "learning_rate": 4.866156787762906e-07, "loss": 0.0039, "reward": 1.5448052883148193, "reward_std": 0.0887579694390297, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5448052287101746, "rewards/pad": 0.0, "step": 1611 }, { "completion_length": 171.703125, "epoch": 0.5137029955385596, "grad_norm": 5.797022342681885, "kl": 0.1640625, "learning_rate": 4.862970044614404e-07, "loss": 0.0066, "reward": 1.6547784805297852, "reward_std": 0.12803807854652405, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.420403391122818, "step": 1612 }, { "completion_length": 229.53125, "epoch": 0.5140216698534098, "grad_norm": 4.80680513381958, "kl": 0.08251953125, "learning_rate": 4.859783301465902e-07, "loss": 0.0033, "reward": 1.9011766910552979, "reward_std": 0.10118616372346878, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5261766910552979, "rewards/pad": 0.375, "step": 1613 }, { "completion_length": 385.375, "epoch": 0.51434034416826, "grad_norm": 12.173733711242676, "kl": 0.06103515625, "learning_rate": 4.856596558317399e-07, "loss": 0.0024, "reward": 1.4999946355819702, "reward_std": 0.042640700936317444, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4999946355819702, "rewards/pad": 0.0, "step": 1614 }, { "completion_length": 324.078125, "epoch": 0.5146590184831102, "grad_norm": 4.943187236785889, "kl": 0.10791015625, "learning_rate": 4.853409815168897e-07, "loss": 0.0043, "reward": 1.5149937868118286, "reward_std": 0.10412536561489105, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4056188762187958, "rewards/pad": 0.125, "step": 1615 }, { "completion_length": 427.796875, "epoch": 0.5149776927979605, "grad_norm": 4.6593451499938965, "kl": 0.05322265625, "learning_rate": 4.850223072020395e-07, "loss": 0.0021, "reward": 1.4270119667053223, "reward_std": 0.09866784512996674, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4426369071006775, "step": 1616 }, { "completion_length": 149.375, "epoch": 0.5152963671128107, "grad_norm": 18.755388259887695, "kl": 0.10791015625, "learning_rate": 4.847036328871893e-07, "loss": 0.0043, "reward": 1.5256495475769043, "reward_std": 0.07746165245771408, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5256494879722595, "rewards/pad": 0.0, "step": 1617 }, { "completion_length": 444.65625, "epoch": 0.5156150414276609, "grad_norm": 35.75686264038086, "kl": 0.052978515625, "learning_rate": 4.84384958572339e-07, "loss": 0.0021, "reward": 1.4138743877410889, "reward_std": 0.1107860878109932, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.3201243281364441, "step": 1618 }, { "completion_length": 384.34375, "epoch": 0.5159337157425111, "grad_norm": 9.533431053161621, "kl": 0.068359375, "learning_rate": 4.840662842574888e-07, "loss": 0.0027, "reward": 1.4957008361816406, "reward_std": 0.12621979415416718, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5269508957862854, "rewards/pad": 0.0, "step": 1619 }, { "completion_length": 382.53125, "epoch": 0.5162523900573613, "grad_norm": 16.5041446685791, "kl": 0.05908203125, "learning_rate": 4.837476099426385e-07, "loss": 0.0024, "reward": 1.5703582763671875, "reward_std": 0.11693361401557922, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.585983157157898, "step": 1620 }, { "completion_length": 335.921875, "epoch": 0.5165710643722116, "grad_norm": 16.340742111206055, "kl": 0.10302734375, "learning_rate": 4.834289356277884e-07, "loss": 0.0041, "reward": 1.368363380432129, "reward_std": 0.059988975524902344, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3683633506298065, "step": 1621 }, { "completion_length": 340.953125, "epoch": 0.5168897386870618, "grad_norm": 10.794461250305176, "kl": 0.1845703125, "learning_rate": 4.831102613129382e-07, "loss": 0.0074, "reward": 1.4841980934143066, "reward_std": 0.10662591457366943, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4841980040073395, "step": 1622 }, { "completion_length": 302.21875, "epoch": 0.517208413001912, "grad_norm": 6.696167469024658, "kl": 0.0791015625, "learning_rate": 4.82791586998088e-07, "loss": 0.0032, "reward": 1.4154529571533203, "reward_std": 0.16440361738204956, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.32170307636260986, "rewards/pad": 0.125, "step": 1623 }, { "completion_length": 123.1875, "epoch": 0.5175270873167622, "grad_norm": 34.49293518066406, "kl": 0.11181640625, "learning_rate": 4.824729126832377e-07, "loss": 0.0045, "reward": 1.6380038261413574, "reward_std": 0.12282894551753998, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5130038857460022, "step": 1624 }, { "completion_length": 113.046875, "epoch": 0.5178457616316124, "grad_norm": 68.6580810546875, "kl": 0.1337890625, "learning_rate": 4.821542383683875e-07, "loss": 0.0054, "reward": 1.6568090915679932, "reward_std": 0.20202675461769104, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5630591511726379, "rewards/pad": 0.09375, "step": 1625 }, { "completion_length": 355.78125, "epoch": 0.5181644359464627, "grad_norm": 7.60537576675415, "kl": 0.07373046875, "learning_rate": 4.818355640535373e-07, "loss": 0.0029, "reward": 1.4764320850372314, "reward_std": 0.12306798994541168, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.36705708503723145, "step": 1626 }, { "completion_length": 363.03125, "epoch": 0.5184831102613129, "grad_norm": 15.738922119140625, "kl": 0.0810546875, "learning_rate": 4.815168897386871e-07, "loss": 0.0032, "reward": 1.503629446029663, "reward_std": 0.20149904489517212, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5192545056343079, "step": 1627 }, { "completion_length": 266.53125, "epoch": 0.5188017845761632, "grad_norm": 5.414165496826172, "kl": 0.1162109375, "learning_rate": 4.811982154238368e-07, "loss": 0.0046, "reward": 1.5420563220977783, "reward_std": 0.08500739932060242, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4170563519001007, "rewards/pad": 0.125, "step": 1628 }, { "completion_length": 229.15625, "epoch": 0.5191204588910134, "grad_norm": 8.028788566589355, "kl": 0.09716796875, "learning_rate": 4.808795411089866e-07, "loss": 0.0039, "reward": 1.7718290090560913, "reward_std": 0.07292530685663223, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6468290686607361, "rewards/pad": 0.125, "step": 1629 }, { "completion_length": 397.921875, "epoch": 0.5194391332058637, "grad_norm": 30.3446044921875, "kl": 0.06640625, "learning_rate": 4.805608667941364e-07, "loss": 0.0027, "reward": 1.5163317918777466, "reward_std": 0.08876568078994751, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5319567918777466, "rewards/pad": 0.0, "step": 1630 }, { "completion_length": 286.65625, "epoch": 0.5197578075207139, "grad_norm": 54.19609451293945, "kl": 0.07275390625, "learning_rate": 4.802421924792862e-07, "loss": 0.0029, "reward": 1.5980496406555176, "reward_std": 0.17015178501605988, "rewards/answer_reward": 0.15625, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.45742470026016235, "step": 1631 }, { "completion_length": 232.015625, "epoch": 0.5200764818355641, "grad_norm": 8.370911598205566, "kl": 0.10302734375, "learning_rate": 4.799235181644359e-07, "loss": 0.0041, "reward": 1.528737187385559, "reward_std": 0.1446613371372223, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4193621873855591, "rewards/pad": 0.125, "step": 1632 }, { "completion_length": 366.9375, "epoch": 0.5203951561504143, "grad_norm": 12.012307167053223, "kl": 0.07958984375, "learning_rate": 4.796048438495857e-07, "loss": 0.0032, "reward": 1.5256717205047607, "reward_std": 0.07025659084320068, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5256717205047607, "step": 1633 }, { "completion_length": 235.0625, "epoch": 0.5207138304652645, "grad_norm": 40.669578552246094, "kl": 0.099609375, "learning_rate": 4.792861695347355e-07, "loss": 0.004, "reward": 1.5062785148620605, "reward_std": 0.14399857819080353, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.39690354466438293, "step": 1634 }, { "completion_length": 225.34375, "epoch": 0.5210325047801148, "grad_norm": 14.326337814331055, "kl": 0.091796875, "learning_rate": 4.789674952198852e-07, "loss": 0.0037, "reward": 1.7521679401397705, "reward_std": 0.16071289777755737, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5177929997444153, "rewards/pad": 0.234375, "step": 1635 }, { "completion_length": 397.890625, "epoch": 0.521351179094965, "grad_norm": 12.720609664916992, "kl": 0.061279296875, "learning_rate": 4.78648820905035e-07, "loss": 0.0025, "reward": 1.395108699798584, "reward_std": 0.06852993369102478, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.39510878920555115, "step": 1636 }, { "completion_length": 520.375, "epoch": 0.5216698534098152, "grad_norm": 4.460532188415527, "kl": 0.04052734375, "learning_rate": 4.783301465901848e-07, "loss": 0.0016, "reward": 1.4853415489196777, "reward_std": 0.03336441516876221, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4853416085243225, "step": 1637 }, { "completion_length": 204.171875, "epoch": 0.5219885277246654, "grad_norm": 25.1767578125, "kl": 0.09619140625, "learning_rate": 4.780114722753345e-07, "loss": 0.0039, "reward": 1.520871877670288, "reward_std": 0.05909598991274834, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3958718776702881, "step": 1638 }, { "completion_length": 330.984375, "epoch": 0.5223072020395156, "grad_norm": 4.884491920471191, "kl": 0.09423828125, "learning_rate": 4.776927979604843e-07, "loss": 0.0038, "reward": 1.6307636499404907, "reward_std": 0.08659254014492035, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6307636499404907, "rewards/pad": 0.0, "step": 1639 }, { "completion_length": 280.90625, "epoch": 0.5226258763543659, "grad_norm": 11.95295238494873, "kl": 0.07861328125, "learning_rate": 4.773741236456342e-07, "loss": 0.0032, "reward": 1.5487451553344727, "reward_std": 0.11208067834377289, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3143702447414398, "rewards/pad": 0.25, "step": 1640 }, { "completion_length": 393.484375, "epoch": 0.5229445506692161, "grad_norm": 9.313077926635742, "kl": 0.06298828125, "learning_rate": 4.770554493307839e-07, "loss": 0.0025, "reward": 1.4793241024017334, "reward_std": 0.10382667183876038, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.369949072599411, "rewards/pad": 0.125, "step": 1641 }, { "completion_length": 271.015625, "epoch": 0.5232632249840663, "grad_norm": 11.412222862243652, "kl": 0.080078125, "learning_rate": 4.7673677501593366e-07, "loss": 0.0032, "reward": 1.6363248825073242, "reward_std": 0.13290798664093018, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5269498825073242, "rewards/pad": 0.125, "step": 1642 }, { "completion_length": 284.734375, "epoch": 0.5235818992989165, "grad_norm": 18.375911712646484, "kl": 0.09375, "learning_rate": 4.7641810070108347e-07, "loss": 0.0038, "reward": 1.5433933734893799, "reward_std": 0.05979537591338158, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5433933734893799, "rewards/pad": 0.0, "step": 1643 }, { "completion_length": 453.34375, "epoch": 0.5239005736137667, "grad_norm": 14.870288848876953, "kl": 0.048828125, "learning_rate": 4.760994263862332e-07, "loss": 0.002, "reward": 1.3990988731384277, "reward_std": 0.1635962575674057, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.953125, "rewards/tracking_iou_reward": 0.3209739923477173, "step": 1644 }, { "completion_length": 217.828125, "epoch": 0.524219247928617, "grad_norm": 10.764137268066406, "kl": 0.09912109375, "learning_rate": 4.7578075207138303e-07, "loss": 0.004, "reward": 1.5573375225067139, "reward_std": 0.05173371732234955, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5573375225067139, "rewards/pad": 0.0, "step": 1645 }, { "completion_length": 237.171875, "epoch": 0.5245379222434672, "grad_norm": 124.7045669555664, "kl": 0.0947265625, "learning_rate": 4.754620777565328e-07, "loss": 0.0038, "reward": 1.6781928539276123, "reward_std": 0.06751322746276855, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5531928539276123, "step": 1646 }, { "completion_length": 294.71875, "epoch": 0.5248565965583174, "grad_norm": 12.334156036376953, "kl": 0.11083984375, "learning_rate": 4.751434034416826e-07, "loss": 0.0044, "reward": 1.528046727180481, "reward_std": 0.09726923704147339, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5436716079711914, "step": 1647 }, { "completion_length": 314.109375, "epoch": 0.5251752708731676, "grad_norm": 29.347471237182617, "kl": 0.07373046875, "learning_rate": 4.7482472912683235e-07, "loss": 0.003, "reward": 1.6609556674957275, "reward_std": 0.04223080724477768, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5359556674957275, "rewards/pad": 0.125, "step": 1648 }, { "completion_length": 163.21875, "epoch": 0.5254939451880178, "grad_norm": 6.873000621795654, "kl": 0.11376953125, "learning_rate": 4.7450605481198215e-07, "loss": 0.0045, "reward": 1.8783053159713745, "reward_std": 0.11644686013460159, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.6439304351806641, "step": 1649 }, { "completion_length": 287.171875, "epoch": 0.5258126195028681, "grad_norm": 11.115781784057617, "kl": 0.09130859375, "learning_rate": 4.741873804971319e-07, "loss": 0.0037, "reward": 1.7113351821899414, "reward_std": 0.11662662774324417, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.6019601225852966, "step": 1650 }, { "completion_length": 320.15625, "epoch": 0.5261312938177183, "grad_norm": 8.5549898147583, "kl": 0.08203125, "learning_rate": 4.738687061822817e-07, "loss": 0.0033, "reward": 1.419663667678833, "reward_std": 0.14064767956733704, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4509136974811554, "step": 1651 }, { "completion_length": 346.859375, "epoch": 0.5264499681325685, "grad_norm": 26.915172576904297, "kl": 0.07177734375, "learning_rate": 4.7355003186743147e-07, "loss": 0.0029, "reward": 1.6085193157196045, "reward_std": 0.17502236366271973, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5147693157196045, "step": 1652 }, { "completion_length": 202.125, "epoch": 0.5267686424474187, "grad_norm": 27.095932006835938, "kl": 0.1201171875, "learning_rate": 4.732313575525813e-07, "loss": 0.0048, "reward": 1.517913579940796, "reward_std": 0.13210374116897583, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.40853869915008545, "rewards/pad": 0.125, "step": 1653 }, { "completion_length": 302.140625, "epoch": 0.5270873167622689, "grad_norm": 11.207296371459961, "kl": 0.08203125, "learning_rate": 4.7291268323773103e-07, "loss": 0.0033, "reward": 1.5230052471160889, "reward_std": 0.14820341765880585, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.2886302173137665, "step": 1654 }, { "completion_length": 158.53125, "epoch": 0.5274059910771192, "grad_norm": 9.38110065460205, "kl": 0.16796875, "learning_rate": 4.7259400892288084e-07, "loss": 0.0067, "reward": 1.4777312278747559, "reward_std": 0.10828878730535507, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4777311682701111, "rewards/pad": 0.0, "step": 1655 }, { "completion_length": 231.46875, "epoch": 0.5277246653919694, "grad_norm": 94.59187316894531, "kl": 0.10986328125, "learning_rate": 4.722753346080306e-07, "loss": 0.0044, "reward": 1.632595419883728, "reward_std": 0.11153492331504822, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.585720419883728, "rewards/pad": 0.046875, "step": 1656 }, { "completion_length": 251.0625, "epoch": 0.5280433397068196, "grad_norm": 11.970016479492188, "kl": 0.0849609375, "learning_rate": 4.719566602931804e-07, "loss": 0.0034, "reward": 1.5200518369674683, "reward_std": 0.06651220470666885, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.520051896572113, "step": 1657 }, { "completion_length": 290.203125, "epoch": 0.5283620140216698, "grad_norm": 5.539214134216309, "kl": 0.07861328125, "learning_rate": 4.716379859783301e-07, "loss": 0.0032, "reward": 1.2751659154891968, "reward_std": 0.10172433406114578, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.29079097509384155, "step": 1658 }, { "completion_length": 320.609375, "epoch": 0.52868068833652, "grad_norm": 14.202303886413574, "kl": 0.087890625, "learning_rate": 4.713193116634799e-07, "loss": 0.0035, "reward": 1.5370453596115112, "reward_std": 0.089061439037323, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4276703894138336, "rewards/pad": 0.109375, "step": 1659 }, { "completion_length": 290.359375, "epoch": 0.5289993626513703, "grad_norm": 7.756679534912109, "kl": 0.09130859375, "learning_rate": 4.7100063734862966e-07, "loss": 0.0037, "reward": 1.3637516498565674, "reward_std": 0.10193685442209244, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.36375173926353455, "step": 1660 }, { "completion_length": 230.8125, "epoch": 0.5293180369662205, "grad_norm": 12.022960662841797, "kl": 0.09912109375, "learning_rate": 4.706819630337794e-07, "loss": 0.004, "reward": 1.688589096069336, "reward_std": 0.09793002158403397, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43858909606933594, "rewards/pad": 0.25, "step": 1661 }, { "completion_length": 269.140625, "epoch": 0.5296367112810707, "grad_norm": 9.252043724060059, "kl": 0.09033203125, "learning_rate": 4.703632887189292e-07, "loss": 0.0036, "reward": 1.3499665260314941, "reward_std": 0.031937532126903534, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.34996655583381653, "rewards/pad": 0.0, "step": 1662 }, { "completion_length": 317.265625, "epoch": 0.5299553855959209, "grad_norm": 7.217067241668701, "kl": 0.10595703125, "learning_rate": 4.70044614404079e-07, "loss": 0.0042, "reward": 1.4520938396453857, "reward_std": 0.13367217779159546, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4677189290523529, "step": 1663 }, { "completion_length": 293.28125, "epoch": 0.5302740599107711, "grad_norm": 8.803386688232422, "kl": 0.09423828125, "learning_rate": 4.697259400892288e-07, "loss": 0.0038, "reward": 1.5019474029541016, "reward_std": 0.054961420595645905, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5019474029541016, "rewards/pad": 0.0, "step": 1664 }, { "completion_length": 327.90625, "epoch": 0.5305927342256214, "grad_norm": 5.697347640991211, "kl": 0.1064453125, "learning_rate": 4.6940726577437853e-07, "loss": 0.0043, "reward": 1.683156967163086, "reward_std": 0.15118834376335144, "rewards/answer_reward": 0.140625, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5581569671630859, "step": 1665 }, { "completion_length": 316.8125, "epoch": 0.5309114085404716, "grad_norm": 3.3667261600494385, "kl": 0.072265625, "learning_rate": 4.6908859145952834e-07, "loss": 0.0029, "reward": 1.341244101524353, "reward_std": 0.04248841851949692, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3412441611289978, "step": 1666 }, { "completion_length": 276.625, "epoch": 0.5312300828553218, "grad_norm": 9.462784767150879, "kl": 0.09228515625, "learning_rate": 4.687699171446781e-07, "loss": 0.0037, "reward": 1.3527380228042603, "reward_std": 0.060129985213279724, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.35273808240890503, "rewards/pad": 0.0, "step": 1667 }, { "completion_length": 299.484375, "epoch": 0.5315487571701721, "grad_norm": 6.0697150230407715, "kl": 0.0869140625, "learning_rate": 4.684512428298279e-07, "loss": 0.0035, "reward": 1.400463581085205, "reward_std": 0.08055169880390167, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4160885810852051, "rewards/pad": 0.0, "step": 1668 }, { "completion_length": 231.203125, "epoch": 0.5318674314850224, "grad_norm": 13.91861629486084, "kl": 0.10595703125, "learning_rate": 4.6813256851497766e-07, "loss": 0.0042, "reward": 1.6042506694793701, "reward_std": 0.06380566954612732, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6042506694793701, "rewards/pad": 0.0, "step": 1669 }, { "completion_length": 266.671875, "epoch": 0.5321861057998726, "grad_norm": 13.381567001342773, "kl": 0.0947265625, "learning_rate": 4.6781389420012746e-07, "loss": 0.0038, "reward": 1.5504193305969238, "reward_std": 0.14849324524402618, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.45666933059692383, "step": 1670 }, { "completion_length": 378.53125, "epoch": 0.5325047801147228, "grad_norm": 10.656835556030273, "kl": 0.0712890625, "learning_rate": 4.674952198852772e-07, "loss": 0.0029, "reward": 1.446120262145996, "reward_std": 0.11353107541799545, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.46174532175064087, "step": 1671 }, { "completion_length": 262.234375, "epoch": 0.532823454429573, "grad_norm": 20.61830711364746, "kl": 0.1005859375, "learning_rate": 4.67176545570427e-07, "loss": 0.004, "reward": 1.6212286949157715, "reward_std": 0.062337301671504974, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6212287545204163, "step": 1672 }, { "completion_length": 228.296875, "epoch": 0.5331421287444232, "grad_norm": 9.58569622039795, "kl": 0.10009765625, "learning_rate": 4.668578712555768e-07, "loss": 0.004, "reward": 1.542300820350647, "reward_std": 0.08927084505558014, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5423007607460022, "rewards/pad": 0.0, "step": 1673 }, { "completion_length": 243.328125, "epoch": 0.5334608030592735, "grad_norm": 8.100432395935059, "kl": 0.0732421875, "learning_rate": 4.665391969407266e-07, "loss": 0.0029, "reward": 1.8803434371948242, "reward_std": 0.10601752996444702, "rewards/answer_reward": 0.484375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.39596837759017944, "step": 1674 }, { "completion_length": 287.1875, "epoch": 0.5337794773741237, "grad_norm": 7.46446418762207, "kl": 0.11083984375, "learning_rate": 4.6622052262587634e-07, "loss": 0.0044, "reward": 1.512256383895874, "reward_std": 0.12309861183166504, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5122564435005188, "rewards/pad": 0.0, "step": 1675 }, { "completion_length": 304.25, "epoch": 0.5340981516889739, "grad_norm": 12.087656021118164, "kl": 0.12060546875, "learning_rate": 4.6590184831102615e-07, "loss": 0.0048, "reward": 1.4665218591690063, "reward_std": 0.08359487354755402, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46652188897132874, "rewards/pad": 0.0, "step": 1676 }, { "completion_length": 333.96875, "epoch": 0.5344168260038241, "grad_norm": 5.124676704406738, "kl": 0.0712890625, "learning_rate": 4.6558317399617585e-07, "loss": 0.0029, "reward": 1.5634325742721558, "reward_std": 0.04829826205968857, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.438432514667511, "rewards/pad": 0.125, "step": 1677 }, { "completion_length": 297.390625, "epoch": 0.5347355003186743, "grad_norm": 33.74581527709961, "kl": 0.064453125, "learning_rate": 4.6526449968132566e-07, "loss": 0.0026, "reward": 1.5150209665298462, "reward_std": 0.07572810351848602, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3900209963321686, "step": 1678 }, { "completion_length": 392.421875, "epoch": 0.5350541746335246, "grad_norm": 5.853975772857666, "kl": 0.05517578125, "learning_rate": 4.649458253664754e-07, "loss": 0.0022, "reward": 1.466147780418396, "reward_std": 0.1339925229549408, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4817728102207184, "step": 1679 }, { "completion_length": 263.0625, "epoch": 0.5353728489483748, "grad_norm": 29.1920108795166, "kl": 0.09814453125, "learning_rate": 4.646271510516252e-07, "loss": 0.0039, "reward": 1.4998505115509033, "reward_std": 0.0841817855834961, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4998505711555481, "rewards/pad": 0.0, "step": 1680 }, { "completion_length": 299.109375, "epoch": 0.535691523263225, "grad_norm": 9.483532905578613, "kl": 0.08056640625, "learning_rate": 4.6430847673677497e-07, "loss": 0.0032, "reward": 1.566691279411316, "reward_std": 0.11589021980762482, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5823163390159607, "rewards/pad": 0.0, "step": 1681 }, { "completion_length": 213.34375, "epoch": 0.5360101975780752, "grad_norm": 21.55391502380371, "kl": 0.11083984375, "learning_rate": 4.639898024219248e-07, "loss": 0.0044, "reward": 1.4185980558395386, "reward_std": 0.08017615228891373, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4185979962348938, "rewards/pad": 0.0, "step": 1682 }, { "completion_length": 247.609375, "epoch": 0.5363288718929254, "grad_norm": 7.853524208068848, "kl": 0.078125, "learning_rate": 4.6367112810707453e-07, "loss": 0.0031, "reward": 1.6442821025848389, "reward_std": 0.08468535542488098, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6442821025848389, "step": 1683 }, { "completion_length": 116.953125, "epoch": 0.5366475462077757, "grad_norm": 15.663179397583008, "kl": 0.1103515625, "learning_rate": 4.6335245379222434e-07, "loss": 0.0044, "reward": 1.6685657501220703, "reward_std": 0.12723290920257568, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5435657501220703, "rewards/pad": 0.125, "step": 1684 }, { "completion_length": 374.3125, "epoch": 0.5369662205226259, "grad_norm": 6.08555269241333, "kl": 0.05908203125, "learning_rate": 4.630337794773741e-07, "loss": 0.0024, "reward": 1.594588041305542, "reward_std": 0.08949033915996552, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.344588041305542, "step": 1685 }, { "completion_length": 211.9375, "epoch": 0.5372848948374761, "grad_norm": 6.44908332824707, "kl": 0.10009765625, "learning_rate": 4.627151051625239e-07, "loss": 0.004, "reward": 1.6699178218841553, "reward_std": 0.10696688294410706, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5449178814888, "rewards/pad": 0.125, "step": 1686 }, { "completion_length": 265.53125, "epoch": 0.5376035691523263, "grad_norm": 12.869318008422852, "kl": 0.0830078125, "learning_rate": 4.6239643084767365e-07, "loss": 0.0033, "reward": 1.3589563369750977, "reward_std": 0.10501168668270111, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3745812475681305, "step": 1687 }, { "completion_length": 222.53125, "epoch": 0.5379222434671765, "grad_norm": 8.010903358459473, "kl": 0.10693359375, "learning_rate": 4.6207775653282346e-07, "loss": 0.0043, "reward": 1.8639793395996094, "reward_std": 0.08176255226135254, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.7389793992042542, "step": 1688 }, { "completion_length": 320.578125, "epoch": 0.5382409177820268, "grad_norm": 22.186399459838867, "kl": 0.08447265625, "learning_rate": 4.617590822179732e-07, "loss": 0.0034, "reward": 1.3118007183074951, "reward_std": 0.05238881707191467, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.31180059909820557, "step": 1689 }, { "completion_length": 249.984375, "epoch": 0.538559592096877, "grad_norm": 15.33934211730957, "kl": 0.1826171875, "learning_rate": 4.61440407903123e-07, "loss": 0.0073, "reward": 1.4747562408447266, "reward_std": 0.12676838040351868, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4903812110424042, "rewards/pad": 0.0, "step": 1690 }, { "completion_length": 254.5625, "epoch": 0.5388782664117272, "grad_norm": 16.812482833862305, "kl": 0.1591796875, "learning_rate": 4.611217335882728e-07, "loss": 0.0064, "reward": 1.4752761125564575, "reward_std": 0.11461857706308365, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4752761423587799, "rewards/pad": 0.0, "step": 1691 }, { "completion_length": 221.171875, "epoch": 0.5391969407265774, "grad_norm": 10.983870506286621, "kl": 0.09765625, "learning_rate": 4.608030592734226e-07, "loss": 0.0039, "reward": 1.8936082124710083, "reward_std": 0.12493997067213058, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6436082124710083, "step": 1692 }, { "completion_length": 209.359375, "epoch": 0.5395156150414276, "grad_norm": 14.340348243713379, "kl": 0.0927734375, "learning_rate": 4.6048438495857234e-07, "loss": 0.0037, "reward": 1.4879741668701172, "reward_std": 0.07082471251487732, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48797428607940674, "rewards/pad": 0.0, "step": 1693 }, { "completion_length": 257.390625, "epoch": 0.5398342893562779, "grad_norm": 103.97561645507812, "kl": 0.09375, "learning_rate": 4.601657106437221e-07, "loss": 0.0038, "reward": 1.7300111055374146, "reward_std": 0.06880944967269897, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6050111055374146, "rewards/pad": 0.125, "step": 1694 }, { "completion_length": 346.125, "epoch": 0.5401529636711281, "grad_norm": 20.965011596679688, "kl": 0.0751953125, "learning_rate": 4.598470363288719e-07, "loss": 0.003, "reward": 1.4433112144470215, "reward_std": 0.03581881895661354, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44331109523773193, "rewards/pad": 0.0, "step": 1695 }, { "completion_length": 268.53125, "epoch": 0.5404716379859783, "grad_norm": 13.639444351196289, "kl": 0.08984375, "learning_rate": 4.5952836201402165e-07, "loss": 0.0036, "reward": 1.5070383548736572, "reward_std": 0.16513481736183167, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.2882883548736572, "rewards/pad": 0.25, "step": 1696 }, { "completion_length": 260.453125, "epoch": 0.5407903123008285, "grad_norm": 14.117385864257812, "kl": 0.080078125, "learning_rate": 4.592096876991714e-07, "loss": 0.0032, "reward": 1.620898723602295, "reward_std": 0.12866953015327454, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.417773574590683, "rewards/pad": 0.203125, "step": 1697 }, { "completion_length": 265.515625, "epoch": 0.5411089866156787, "grad_norm": 6.444183349609375, "kl": 0.1181640625, "learning_rate": 4.5889101338432116e-07, "loss": 0.0047, "reward": 1.491443395614624, "reward_std": 0.06407006084918976, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49144336581230164, "step": 1698 }, { "completion_length": 247.890625, "epoch": 0.541427660930529, "grad_norm": 10.815384864807129, "kl": 0.10400390625, "learning_rate": 4.5857233906947097e-07, "loss": 0.0042, "reward": 1.6102548837661743, "reward_std": 0.15115009248256683, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.6258799433708191, "step": 1699 }, { "completion_length": 306.15625, "epoch": 0.5417463352453792, "grad_norm": 16.33805274963379, "kl": 0.076171875, "learning_rate": 4.582536647546207e-07, "loss": 0.003, "reward": 1.3535027503967285, "reward_std": 0.08354859054088593, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.24412773549556732, "step": 1700 }, { "completion_length": 162.90625, "epoch": 0.5420650095602294, "grad_norm": 9.865619659423828, "kl": 0.150390625, "learning_rate": 4.5793499043977053e-07, "loss": 0.006, "reward": 1.7482905387878418, "reward_std": 0.13975197076797485, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.529540479183197, "rewards/pad": 0.234375, "step": 1701 }, { "completion_length": 157.546875, "epoch": 0.5423836838750796, "grad_norm": 12.057559967041016, "kl": 0.0947265625, "learning_rate": 4.576163161249203e-07, "loss": 0.0038, "reward": 1.8558648824691772, "reward_std": 0.1299760341644287, "rewards/answer_reward": 0.375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.48086488246917725, "step": 1702 }, { "completion_length": 249.71875, "epoch": 0.5427023581899298, "grad_norm": 18.16450309753418, "kl": 0.07080078125, "learning_rate": 4.572976418100701e-07, "loss": 0.0028, "reward": 1.7190160751342773, "reward_std": 0.08569124341011047, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4690161347389221, "step": 1703 }, { "completion_length": 307.59375, "epoch": 0.5430210325047801, "grad_norm": 6.907443523406982, "kl": 0.08984375, "learning_rate": 4.5697896749521984e-07, "loss": 0.0036, "reward": 1.523721694946289, "reward_std": 0.12501972913742065, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5237216353416443, "rewards/pad": 0.0, "step": 1704 }, { "completion_length": 254.9375, "epoch": 0.5433397068196303, "grad_norm": 9.634760856628418, "kl": 0.08447265625, "learning_rate": 4.5666029318036965e-07, "loss": 0.0034, "reward": 1.637330412864685, "reward_std": 0.09492364525794983, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5279552936553955, "rewards/pad": 0.125, "step": 1705 }, { "completion_length": 387.03125, "epoch": 0.5436583811344805, "grad_norm": 8.362167358398438, "kl": 0.052490234375, "learning_rate": 4.563416188655194e-07, "loss": 0.0021, "reward": 1.4359925985336304, "reward_std": 0.04619568586349487, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43599265813827515, "rewards/pad": 0.0, "step": 1706 }, { "completion_length": 306.765625, "epoch": 0.5439770554493308, "grad_norm": 8.562478065490723, "kl": 0.08154296875, "learning_rate": 4.560229445506692e-07, "loss": 0.0033, "reward": 1.3705644607543945, "reward_std": 0.07248996943235397, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3861893117427826, "rewards/pad": 0.0, "step": 1707 }, { "completion_length": 308.296875, "epoch": 0.5442957297641811, "grad_norm": 10.349902153015137, "kl": 0.1083984375, "learning_rate": 4.5570427023581896e-07, "loss": 0.0043, "reward": 1.5466220378875732, "reward_std": 0.049023132771253586, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5466219782829285, "rewards/pad": 0.0, "step": 1708 }, { "completion_length": 155.03125, "epoch": 0.5446144040790313, "grad_norm": 17.148969650268555, "kl": 0.1318359375, "learning_rate": 4.5538559592096877e-07, "loss": 0.0053, "reward": 1.6405951976776123, "reward_std": 0.14014357328414917, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5155952572822571, "rewards/pad": 0.125, "step": 1709 }, { "completion_length": 306.78125, "epoch": 0.5449330783938815, "grad_norm": 19.449230194091797, "kl": 0.07421875, "learning_rate": 4.550669216061185e-07, "loss": 0.003, "reward": 1.7386236190795898, "reward_std": 0.06590403616428375, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48862361907958984, "rewards/pad": 0.25, "step": 1710 }, { "completion_length": 266.96875, "epoch": 0.5452517527087317, "grad_norm": 8.654415130615234, "kl": 0.0830078125, "learning_rate": 4.5474824729126833e-07, "loss": 0.0033, "reward": 1.7879807949066162, "reward_std": 0.07044827193021774, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5379809141159058, "step": 1711 }, { "completion_length": 189.875, "epoch": 0.5455704270235819, "grad_norm": 10.486869812011719, "kl": 0.1044921875, "learning_rate": 4.544295729764181e-07, "loss": 0.0042, "reward": 1.7235859632492065, "reward_std": 0.09536559879779816, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5985859632492065, "rewards/pad": 0.125, "step": 1712 }, { "completion_length": 199.296875, "epoch": 0.5458891013384322, "grad_norm": 7.5744123458862305, "kl": 0.10302734375, "learning_rate": 4.541108986615679e-07, "loss": 0.0041, "reward": 1.4725016355514526, "reward_std": 0.09483238309621811, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.363126665353775, "rewards/pad": 0.125, "step": 1713 }, { "completion_length": 267.671875, "epoch": 0.5462077756532824, "grad_norm": 67.24296569824219, "kl": 0.1552734375, "learning_rate": 4.5379222434671765e-07, "loss": 0.0062, "reward": 1.489917516708374, "reward_std": 0.055628370493650436, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4899175763130188, "rewards/pad": 0.0, "step": 1714 }, { "completion_length": 262.203125, "epoch": 0.5465264499681326, "grad_norm": 7.187012195587158, "kl": 0.08447265625, "learning_rate": 4.5347355003186745e-07, "loss": 0.0034, "reward": 1.5411200523376465, "reward_std": 0.05903906747698784, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5411199927330017, "step": 1715 }, { "completion_length": 235.75, "epoch": 0.5468451242829828, "grad_norm": 24.71690559387207, "kl": 0.0771484375, "learning_rate": 4.531548757170172e-07, "loss": 0.0031, "reward": 1.7826646566390991, "reward_std": 0.12565046548843384, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5326645970344543, "rewards/pad": 0.25, "step": 1716 }, { "completion_length": 183.4375, "epoch": 0.547163798597833, "grad_norm": 9.081765174865723, "kl": 0.1015625, "learning_rate": 4.5283620140216696e-07, "loss": 0.0041, "reward": 1.3880410194396973, "reward_std": 0.02689719945192337, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3880411386489868, "step": 1717 }, { "completion_length": 210.640625, "epoch": 0.5474824729126833, "grad_norm": 7.534451007843018, "kl": 0.0927734375, "learning_rate": 4.525175270873167e-07, "loss": 0.0037, "reward": 1.4608581066131592, "reward_std": 0.10575767606496811, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.35148316621780396, "rewards/pad": 0.125, "step": 1718 }, { "completion_length": 325.765625, "epoch": 0.5478011472275335, "grad_norm": 12.512798309326172, "kl": 0.0703125, "learning_rate": 4.521988527724665e-07, "loss": 0.0028, "reward": 1.5565497875213623, "reward_std": 0.15284176170825958, "rewards/answer_reward": 0.21875, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3534247875213623, "step": 1719 }, { "completion_length": 358.3125, "epoch": 0.5481198215423837, "grad_norm": 5.614782333374023, "kl": 0.0712890625, "learning_rate": 4.518801784576163e-07, "loss": 0.0029, "reward": 1.4160782098770142, "reward_std": 0.14821010828018188, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.44732820987701416, "rewards/pad": 0.0, "step": 1720 }, { "completion_length": 258.953125, "epoch": 0.5484384958572339, "grad_norm": 11.684313774108887, "kl": 0.07177734375, "learning_rate": 4.515615041427661e-07, "loss": 0.0029, "reward": 1.5642802715301514, "reward_std": 0.10995316505432129, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.45490533113479614, "step": 1721 }, { "completion_length": 247.78125, "epoch": 0.5487571701720841, "grad_norm": 54.16145706176758, "kl": 0.08447265625, "learning_rate": 4.5124282982791584e-07, "loss": 0.0034, "reward": 1.5749287605285645, "reward_std": 0.09817046672105789, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44992879033088684, "rewards/pad": 0.125, "step": 1722 }, { "completion_length": 263.578125, "epoch": 0.5490758444869344, "grad_norm": 12.113851547241211, "kl": 0.1845703125, "learning_rate": 4.509241555130656e-07, "loss": 0.0074, "reward": 1.623182773590088, "reward_std": 0.05416783317923546, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49818283319473267, "step": 1723 }, { "completion_length": 214.21875, "epoch": 0.5493945188017846, "grad_norm": 5.456802845001221, "kl": 0.0810546875, "learning_rate": 4.506054811982154e-07, "loss": 0.0032, "reward": 1.621408224105835, "reward_std": 0.0672256350517273, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49640828371047974, "step": 1724 }, { "completion_length": 195.09375, "epoch": 0.5497131931166348, "grad_norm": 13.368043899536133, "kl": 0.11279296875, "learning_rate": 4.5028680688336515e-07, "loss": 0.0045, "reward": 1.6206947565078735, "reward_std": 0.11809299886226654, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.636319637298584, "rewards/pad": 0.0, "step": 1725 }, { "completion_length": 192.515625, "epoch": 0.550031867431485, "grad_norm": 15.275333404541016, "kl": 0.1025390625, "learning_rate": 4.4996813256851496e-07, "loss": 0.0041, "reward": 1.5296623706817627, "reward_std": 0.07286571711301804, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4046623706817627, "step": 1726 }, { "completion_length": 295.0, "epoch": 0.5503505417463352, "grad_norm": 10.07372760772705, "kl": 0.07421875, "learning_rate": 4.496494582536647e-07, "loss": 0.003, "reward": 1.5608832836151123, "reward_std": 0.1302630454301834, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4515083432197571, "step": 1727 }, { "completion_length": 221.890625, "epoch": 0.5506692160611855, "grad_norm": 18.7092227935791, "kl": 0.08642578125, "learning_rate": 4.493307839388145e-07, "loss": 0.0035, "reward": 1.7104350328445435, "reward_std": 0.07333327829837799, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46043500304222107, "rewards/pad": 0.25, "step": 1728 }, { "completion_length": 279.703125, "epoch": 0.5509878903760357, "grad_norm": 9.025415420532227, "kl": 0.09033203125, "learning_rate": 4.490121096239643e-07, "loss": 0.0036, "reward": 1.6692687273025513, "reward_std": 0.07121677696704865, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6692686080932617, "rewards/pad": 0.0, "step": 1729 }, { "completion_length": 155.78125, "epoch": 0.5513065646908859, "grad_norm": 29.62628746032715, "kl": 0.10986328125, "learning_rate": 4.486934353091141e-07, "loss": 0.0044, "reward": 1.5663038492202759, "reward_std": 0.1585964560508728, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5819288492202759, "rewards/pad": 0.0, "step": 1730 }, { "completion_length": 141.46875, "epoch": 0.5516252390057361, "grad_norm": 20.791046142578125, "kl": 0.1220703125, "learning_rate": 4.4837476099426384e-07, "loss": 0.0049, "reward": 1.6928167343139648, "reward_std": 0.05160044878721237, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6928167343139648, "rewards/pad": 0.0, "step": 1731 }, { "completion_length": 316.265625, "epoch": 0.5519439133205863, "grad_norm": 8.608355522155762, "kl": 0.08984375, "learning_rate": 4.4805608667941364e-07, "loss": 0.0036, "reward": 1.4499294757843018, "reward_std": 0.07365255802869797, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.449929416179657, "rewards/pad": 0.0, "step": 1732 }, { "completion_length": 260.3125, "epoch": 0.5522625876354366, "grad_norm": 11.485649108886719, "kl": 0.087890625, "learning_rate": 4.477374123645634e-07, "loss": 0.0035, "reward": 1.5133521556854248, "reward_std": 0.045910757035017014, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5133520364761353, "rewards/pad": 0.0, "step": 1733 }, { "completion_length": 331.859375, "epoch": 0.5525812619502868, "grad_norm": 7.711301803588867, "kl": 0.0771484375, "learning_rate": 4.474187380497132e-07, "loss": 0.0031, "reward": 1.431697130203247, "reward_std": 0.07672516256570816, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43169713020324707, "rewards/pad": 0.0, "step": 1734 }, { "completion_length": 141.65625, "epoch": 0.552899936265137, "grad_norm": 10.256484985351562, "kl": 0.10791015625, "learning_rate": 4.4710006373486296e-07, "loss": 0.0043, "reward": 1.7603859901428223, "reward_std": 0.08859126269817352, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.510386049747467, "step": 1735 }, { "completion_length": 270.296875, "epoch": 0.5532186105799872, "grad_norm": 32.26211929321289, "kl": 0.0791015625, "learning_rate": 4.467813894200127e-07, "loss": 0.0032, "reward": 1.6208715438842773, "reward_std": 0.0996231734752655, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.38649648427963257, "step": 1736 }, { "completion_length": 211.734375, "epoch": 0.5535372848948374, "grad_norm": 7.994901180267334, "kl": 0.10595703125, "learning_rate": 4.4646271510516247e-07, "loss": 0.0042, "reward": 1.7499022483825684, "reward_std": 0.09591798484325409, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49990230798721313, "rewards/pad": 0.25, "step": 1737 }, { "completion_length": 201.34375, "epoch": 0.5538559592096877, "grad_norm": 7.6049628257751465, "kl": 0.099609375, "learning_rate": 4.4614404079031227e-07, "loss": 0.004, "reward": 1.4028677940368652, "reward_std": 0.12666383385658264, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3872428238391876, "step": 1738 }, { "completion_length": 259.28125, "epoch": 0.5541746335245379, "grad_norm": 7.623438358306885, "kl": 0.1005859375, "learning_rate": 4.4582536647546203e-07, "loss": 0.004, "reward": 1.636760950088501, "reward_std": 0.12068548798561096, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.652385950088501, "step": 1739 }, { "completion_length": 293.875, "epoch": 0.5544933078393881, "grad_norm": 137.72137451171875, "kl": 0.0810546875, "learning_rate": 4.4550669216061183e-07, "loss": 0.0032, "reward": 1.3920228481292725, "reward_std": 0.1445450484752655, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4076477885246277, "rewards/pad": 0.0, "step": 1740 }, { "completion_length": 150.578125, "epoch": 0.5548119821542383, "grad_norm": 16.826099395751953, "kl": 0.115234375, "learning_rate": 4.451880178457616e-07, "loss": 0.0046, "reward": 1.7300306558609009, "reward_std": 0.09280145913362503, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.6050306558609009, "step": 1741 }, { "completion_length": 315.109375, "epoch": 0.5551306564690885, "grad_norm": 5.719409465789795, "kl": 0.0703125, "learning_rate": 4.448693435309114e-07, "loss": 0.0028, "reward": 1.417891025543213, "reward_std": 0.05220051109790802, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4178909659385681, "step": 1742 }, { "completion_length": 105.234375, "epoch": 0.5554493307839388, "grad_norm": 23.73946762084961, "kl": 0.109375, "learning_rate": 4.4455066921606115e-07, "loss": 0.0044, "reward": 1.599583387374878, "reward_std": 0.0803811177611351, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47458338737487793, "rewards/pad": 0.125, "step": 1743 }, { "completion_length": 199.203125, "epoch": 0.555768005098789, "grad_norm": 13.549830436706543, "kl": 0.1162109375, "learning_rate": 4.4423199490121096e-07, "loss": 0.0046, "reward": 1.552984595298767, "reward_std": 0.1160770133137703, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4279846251010895, "step": 1744 }, { "completion_length": 147.515625, "epoch": 0.5560866794136392, "grad_norm": 22.141952514648438, "kl": 0.1259765625, "learning_rate": 4.439133205863607e-07, "loss": 0.005, "reward": 1.5688796043395996, "reward_std": 0.10179469734430313, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5688797235488892, "rewards/pad": 0.0, "step": 1745 }, { "completion_length": 203.09375, "epoch": 0.5564053537284895, "grad_norm": 13.100513458251953, "kl": 0.103515625, "learning_rate": 4.435946462715105e-07, "loss": 0.0042, "reward": 1.5506198406219482, "reward_std": 0.11961662769317627, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.550619900226593, "step": 1746 }, { "completion_length": 330.734375, "epoch": 0.5567240280433398, "grad_norm": 5.324476718902588, "kl": 0.06884765625, "learning_rate": 4.4327597195666027e-07, "loss": 0.0028, "reward": 1.4494538307189941, "reward_std": 0.13746029138565063, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.34007877111434937, "step": 1747 }, { "completion_length": 210.171875, "epoch": 0.55704270235819, "grad_norm": 22.65328598022461, "kl": 0.09375, "learning_rate": 4.429572976418101e-07, "loss": 0.0037, "reward": 1.7961393594741821, "reward_std": 0.08024761080741882, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4211394190788269, "rewards/pad": 0.375, "step": 1748 }, { "completion_length": 206.53125, "epoch": 0.5573613766730402, "grad_norm": 46.93434143066406, "kl": 0.119140625, "learning_rate": 4.4263862332695983e-07, "loss": 0.0048, "reward": 1.575434684753418, "reward_std": 0.10544592142105103, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.575434684753418, "step": 1749 }, { "completion_length": 166.828125, "epoch": 0.5576800509878904, "grad_norm": 9.36785888671875, "kl": 0.09033203125, "learning_rate": 4.4231994901210964e-07, "loss": 0.0036, "reward": 1.5459160804748535, "reward_std": 0.08506603538990021, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.29591605067253113, "step": 1750 }, { "completion_length": 225.109375, "epoch": 0.5579987253027406, "grad_norm": 17.232309341430664, "kl": 0.1181640625, "learning_rate": 4.420012746972594e-07, "loss": 0.0047, "reward": 1.61324143409729, "reward_std": 0.16581512987613678, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5038663744926453, "rewards/pad": 0.125, "step": 1751 }, { "completion_length": 331.640625, "epoch": 0.5583173996175909, "grad_norm": 7.584392070770264, "kl": 0.0693359375, "learning_rate": 4.416826003824092e-07, "loss": 0.0028, "reward": 1.5683870315551758, "reward_std": 0.18173089623451233, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.490261971950531, "step": 1752 }, { "completion_length": 406.953125, "epoch": 0.5586360739324411, "grad_norm": 11.046274185180664, "kl": 0.0810546875, "learning_rate": 4.4136392606755895e-07, "loss": 0.0032, "reward": 1.5528819561004639, "reward_std": 0.06767392158508301, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42788198590278625, "rewards/pad": 0.125, "step": 1753 }, { "completion_length": 217.9375, "epoch": 0.5589547482472913, "grad_norm": 13.496262550354004, "kl": 0.0927734375, "learning_rate": 4.4104525175270876e-07, "loss": 0.0037, "reward": 1.387298345565796, "reward_std": 0.0605352446436882, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3872983157634735, "rewards/pad": 0.0, "step": 1754 }, { "completion_length": 334.15625, "epoch": 0.5592734225621415, "grad_norm": 38.26420211791992, "kl": 0.10302734375, "learning_rate": 4.407265774378585e-07, "loss": 0.0041, "reward": 1.5812796354293823, "reward_std": 0.10023447871208191, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5812796950340271, "rewards/pad": 0.0, "step": 1755 }, { "completion_length": 267.375, "epoch": 0.5595920968769917, "grad_norm": 17.166568756103516, "kl": 0.09521484375, "learning_rate": 4.404079031230082e-07, "loss": 0.0038, "reward": 1.739503026008606, "reward_std": 0.08325989544391632, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6301279664039612, "rewards/pad": 0.109375, "step": 1756 }, { "completion_length": 272.671875, "epoch": 0.559910771191842, "grad_norm": 8.726228713989258, "kl": 0.08154296875, "learning_rate": 4.40089228808158e-07, "loss": 0.0033, "reward": 1.4452790021896362, "reward_std": 0.10831916332244873, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.429654061794281, "rewards/pad": 0.015625, "step": 1757 }, { "completion_length": 276.875, "epoch": 0.5602294455066922, "grad_norm": 12.648775100708008, "kl": 0.06591796875, "learning_rate": 4.397705544933078e-07, "loss": 0.0026, "reward": 1.4636542797088623, "reward_std": 0.26348677277565, "rewards/answer_reward": 0.15625, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3230292499065399, "step": 1758 }, { "completion_length": 279.109375, "epoch": 0.5605481198215424, "grad_norm": 7.031533718109131, "kl": 0.09033203125, "learning_rate": 4.394518801784576e-07, "loss": 0.0036, "reward": 1.6584200859069824, "reward_std": 0.10096648335456848, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5334200859069824, "rewards/pad": 0.125, "step": 1759 }, { "completion_length": 105.8125, "epoch": 0.5608667941363926, "grad_norm": 18.536157608032227, "kl": 0.11767578125, "learning_rate": 4.3913320586360734e-07, "loss": 0.0047, "reward": 1.5089656114578247, "reward_std": 0.07973577827215195, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3839656114578247, "rewards/pad": 0.125, "step": 1760 }, { "completion_length": 217.3125, "epoch": 0.5611854684512428, "grad_norm": 9.231001853942871, "kl": 0.10107421875, "learning_rate": 4.3881453154875715e-07, "loss": 0.004, "reward": 1.5179157257080078, "reward_std": 0.1328127682209015, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.42416560649871826, "rewards/pad": 0.109375, "step": 1761 }, { "completion_length": 197.875, "epoch": 0.5615041427660931, "grad_norm": 8.673807144165039, "kl": 0.09375, "learning_rate": 4.384958572339069e-07, "loss": 0.0038, "reward": 1.443040370941162, "reward_std": 0.1960524320602417, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.38054025173187256, "rewards/pad": 0.078125, "step": 1762 }, { "completion_length": 255.921875, "epoch": 0.5618228170809433, "grad_norm": 11.084033012390137, "kl": 0.08154296875, "learning_rate": 4.381771829190567e-07, "loss": 0.0033, "reward": 1.4985325336456299, "reward_std": 0.10977034270763397, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4985325038433075, "rewards/pad": 0.0, "step": 1763 }, { "completion_length": 154.3125, "epoch": 0.5621414913957935, "grad_norm": 16.184070587158203, "kl": 0.10595703125, "learning_rate": 4.3785850860420646e-07, "loss": 0.0042, "reward": 1.5756832361221313, "reward_std": 0.0854271948337555, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.45068323612213135, "step": 1764 }, { "completion_length": 284.046875, "epoch": 0.5624601657106437, "grad_norm": 22.491928100585938, "kl": 0.076171875, "learning_rate": 4.3753983428935627e-07, "loss": 0.0031, "reward": 1.7509132623672485, "reward_std": 0.12830114364624023, "rewards/answer_reward": 0.21875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5321633219718933, "step": 1765 }, { "completion_length": 295.640625, "epoch": 0.5627788400254939, "grad_norm": 9.5074462890625, "kl": 0.07373046875, "learning_rate": 4.37221159974506e-07, "loss": 0.003, "reward": 1.5447643995285034, "reward_std": 0.0879368782043457, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5447643995285034, "rewards/pad": 0.0, "step": 1766 }, { "completion_length": 255.265625, "epoch": 0.5630975143403442, "grad_norm": 7.456171035766602, "kl": 0.099609375, "learning_rate": 4.3690248565965583e-07, "loss": 0.004, "reward": 1.75307297706604, "reward_std": 0.09735266864299774, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5030728578567505, "rewards/pad": 0.25, "step": 1767 }, { "completion_length": 280.421875, "epoch": 0.5634161886551944, "grad_norm": 24.25010108947754, "kl": 0.076171875, "learning_rate": 4.365838113448056e-07, "loss": 0.0031, "reward": 1.6167089939117432, "reward_std": 0.0686415284872055, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.36670899391174316, "rewards/pad": 0.25, "step": 1768 }, { "completion_length": 247.296875, "epoch": 0.5637348629700446, "grad_norm": 13.49270248413086, "kl": 0.0693359375, "learning_rate": 4.362651370299554e-07, "loss": 0.0028, "reward": 1.6864056587219238, "reward_std": 0.1645689159631729, "rewards/answer_reward": 0.296875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.38953065872192383, "step": 1769 }, { "completion_length": 339.109375, "epoch": 0.5640535372848948, "grad_norm": 21.020076751708984, "kl": 0.080078125, "learning_rate": 4.3594646271510514e-07, "loss": 0.0032, "reward": 1.3722314834594727, "reward_std": 0.15145091712474823, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.38785648345947266, "step": 1770 }, { "completion_length": 236.453125, "epoch": 0.564372211599745, "grad_norm": 19.411277770996094, "kl": 0.0947265625, "learning_rate": 4.3562778840025495e-07, "loss": 0.0038, "reward": 1.4857559204101562, "reward_std": 0.11996018886566162, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.36075592041015625, "rewards/pad": 0.125, "step": 1771 }, { "completion_length": 354.15625, "epoch": 0.5646908859145953, "grad_norm": 8.80309009552002, "kl": 0.05126953125, "learning_rate": 4.353091140854047e-07, "loss": 0.0021, "reward": 1.5560991764068604, "reward_std": 0.10942699015140533, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.44672417640686035, "step": 1772 }, { "completion_length": 349.953125, "epoch": 0.5650095602294455, "grad_norm": 7.837812900543213, "kl": 0.06298828125, "learning_rate": 4.349904397705545e-07, "loss": 0.0025, "reward": 1.5426127910614014, "reward_std": 0.04760386049747467, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.41761288046836853, "rewards/pad": 0.125, "step": 1773 }, { "completion_length": 233.96875, "epoch": 0.5653282345442957, "grad_norm": 9.303118705749512, "kl": 0.1025390625, "learning_rate": 4.3467176545570427e-07, "loss": 0.0041, "reward": 1.7672696113586426, "reward_std": 0.09113365411758423, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.642269492149353, "rewards/pad": 0.125, "step": 1774 }, { "completion_length": 272.28125, "epoch": 0.5656469088591459, "grad_norm": 9.206984519958496, "kl": 0.076171875, "learning_rate": 4.34353091140854e-07, "loss": 0.003, "reward": 1.6474246978759766, "reward_std": 0.07027721405029297, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5224246978759766, "step": 1775 }, { "completion_length": 212.359375, "epoch": 0.5659655831739961, "grad_norm": 5.96937894821167, "kl": 0.138671875, "learning_rate": 4.340344168260038e-07, "loss": 0.0056, "reward": 1.5690696239471436, "reward_std": 0.10176509618759155, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5690696239471436, "rewards/pad": 0.0, "step": 1776 }, { "completion_length": 139.25, "epoch": 0.5662842574888464, "grad_norm": 9.70727825164795, "kl": 0.10400390625, "learning_rate": 4.337157425111536e-07, "loss": 0.0042, "reward": 1.7603824138641357, "reward_std": 0.05401366576552391, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.7603825330734253, "rewards/pad": 0.0, "step": 1777 }, { "completion_length": 272.46875, "epoch": 0.5666029318036966, "grad_norm": 56.87026596069336, "kl": 0.078125, "learning_rate": 4.3339706819630333e-07, "loss": 0.0031, "reward": 1.4824305772781372, "reward_std": 0.0942038744688034, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46680548787117004, "rewards/pad": 0.015625, "step": 1778 }, { "completion_length": 198.09375, "epoch": 0.5669216061185468, "grad_norm": 7.219006061553955, "kl": 0.11474609375, "learning_rate": 4.3307839388145314e-07, "loss": 0.0046, "reward": 1.5830705165863037, "reward_std": 0.14112816751003265, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4580705165863037, "rewards/pad": 0.125, "step": 1779 }, { "completion_length": 311.171875, "epoch": 0.567240280433397, "grad_norm": 52.35779571533203, "kl": 0.0830078125, "learning_rate": 4.327597195666029e-07, "loss": 0.0033, "reward": 1.5457806587219238, "reward_std": 0.07235310226678848, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42078062891960144, "rewards/pad": 0.125, "step": 1780 }, { "completion_length": 175.4375, "epoch": 0.5675589547482472, "grad_norm": 8.508112907409668, "kl": 0.103515625, "learning_rate": 4.324410452517527e-07, "loss": 0.0041, "reward": 1.7337126731872559, "reward_std": 0.12397350370883942, "rewards/answer_reward": 0.21875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5149626731872559, "step": 1781 }, { "completion_length": 265.078125, "epoch": 0.5678776290630975, "grad_norm": 11.246129989624023, "kl": 0.08544921875, "learning_rate": 4.3212237093690246e-07, "loss": 0.0034, "reward": 1.5527467727661133, "reward_std": 0.0398021899163723, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4277467727661133, "rewards/pad": 0.125, "step": 1782 }, { "completion_length": 412.265625, "epoch": 0.5681963033779477, "grad_norm": 4.389920711517334, "kl": 0.04931640625, "learning_rate": 4.3180369662205226e-07, "loss": 0.002, "reward": 1.6300678253173828, "reward_std": 0.0944734513759613, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.39569270610809326, "step": 1783 }, { "completion_length": 245.546875, "epoch": 0.5685149776927979, "grad_norm": 18.353273391723633, "kl": 0.11181640625, "learning_rate": 4.31485022307202e-07, "loss": 0.0045, "reward": 1.658485770225525, "reward_std": 0.12130609154701233, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5334858298301697, "step": 1784 }, { "completion_length": 215.734375, "epoch": 0.5688336520076482, "grad_norm": 9.696588516235352, "kl": 0.10400390625, "learning_rate": 4.3116634799235177e-07, "loss": 0.0042, "reward": 1.6243197917938232, "reward_std": 0.17657245695590973, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5461949110031128, "rewards/pad": 0.09375, "step": 1785 }, { "completion_length": 294.421875, "epoch": 0.5691523263224985, "grad_norm": 5.507444858551025, "kl": 0.10107421875, "learning_rate": 4.308476736775016e-07, "loss": 0.0041, "reward": 1.4492971897125244, "reward_std": 0.15551123023033142, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4805472195148468, "step": 1786 }, { "completion_length": 303.421875, "epoch": 0.5694710006373487, "grad_norm": 5.179250240325928, "kl": 0.083984375, "learning_rate": 4.3052899936265133e-07, "loss": 0.0034, "reward": 1.6566708087921143, "reward_std": 0.09154780209064484, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5316708087921143, "step": 1787 }, { "completion_length": 208.765625, "epoch": 0.5697896749521989, "grad_norm": 12.69097900390625, "kl": 0.08447265625, "learning_rate": 4.3021032504780114e-07, "loss": 0.0034, "reward": 1.6494231224060059, "reward_std": 0.09254056215286255, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6494231224060059, "rewards/pad": 0.0, "step": 1788 }, { "completion_length": 460.21875, "epoch": 0.5701083492670491, "grad_norm": 2.9629247188568115, "kl": 0.051025390625, "learning_rate": 4.298916507329509e-07, "loss": 0.002, "reward": 1.5042239427566528, "reward_std": 0.03330201655626297, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.37922394275665283, "step": 1789 }, { "completion_length": 217.03125, "epoch": 0.5704270235818993, "grad_norm": 9.85554027557373, "kl": 0.09765625, "learning_rate": 4.295729764181007e-07, "loss": 0.0039, "reward": 1.6437140703201294, "reward_std": 0.09522366523742676, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5187140703201294, "step": 1790 }, { "completion_length": 144.40625, "epoch": 0.5707456978967496, "grad_norm": 19.401777267456055, "kl": 0.216796875, "learning_rate": 4.2925430210325045e-07, "loss": 0.0087, "reward": 1.585303783416748, "reward_std": 0.1496884524822235, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.600928783416748, "rewards/pad": 0.0, "step": 1791 }, { "completion_length": 267.890625, "epoch": 0.5710643722115998, "grad_norm": 9.033243179321289, "kl": 0.0927734375, "learning_rate": 4.2893562778840026e-07, "loss": 0.0037, "reward": 1.5676125288009644, "reward_std": 0.10527728497982025, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5832374691963196, "rewards/pad": 0.0, "step": 1792 }, { "completion_length": 275.578125, "epoch": 0.57138304652645, "grad_norm": 51.64488220214844, "kl": 0.0849609375, "learning_rate": 4.2861695347355e-07, "loss": 0.0034, "reward": 1.3531391620635986, "reward_std": 0.0575980469584465, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.35313910245895386, "rewards/pad": 0.0, "step": 1793 }, { "completion_length": 220.46875, "epoch": 0.5717017208413002, "grad_norm": 20.05837059020996, "kl": 0.10791015625, "learning_rate": 4.282982791586998e-07, "loss": 0.0043, "reward": 1.5053462982177734, "reward_std": 0.05837986245751381, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5053463578224182, "rewards/pad": 0.0, "step": 1794 }, { "completion_length": 222.203125, "epoch": 0.5720203951561504, "grad_norm": 16.75486183166504, "kl": 0.107421875, "learning_rate": 4.279796048438495e-07, "loss": 0.0043, "reward": 1.4838123321533203, "reward_std": 0.10905701667070389, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4994373917579651, "step": 1795 }, { "completion_length": 141.515625, "epoch": 0.5723390694710007, "grad_norm": 13.598223686218262, "kl": 0.1494140625, "learning_rate": 4.2766093052899933e-07, "loss": 0.0059, "reward": 1.5000587701797485, "reward_std": 0.07967067509889603, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5000587701797485, "rewards/pad": 0.0, "step": 1796 }, { "completion_length": 311.8125, "epoch": 0.5726577437858509, "grad_norm": 15.51604175567627, "kl": 0.08203125, "learning_rate": 4.273422562141491e-07, "loss": 0.0033, "reward": 1.5402470827102661, "reward_std": 0.12160883098840714, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4308720827102661, "rewards/pad": 0.125, "step": 1797 }, { "completion_length": 267.734375, "epoch": 0.5729764181007011, "grad_norm": 16.46525001525879, "kl": 0.083984375, "learning_rate": 4.270235818992989e-07, "loss": 0.0034, "reward": 1.640853762626648, "reward_std": 0.05473409965634346, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6408537030220032, "rewards/pad": 0.0, "step": 1798 }, { "completion_length": 256.234375, "epoch": 0.5732950924155513, "grad_norm": 27.168062210083008, "kl": 0.099609375, "learning_rate": 4.2670490758444865e-07, "loss": 0.004, "reward": 1.5514403581619263, "reward_std": 0.11094315350055695, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5045653581619263, "rewards/pad": 0.046875, "step": 1799 }, { "completion_length": 225.265625, "epoch": 0.5736137667304015, "grad_norm": 18.198102951049805, "kl": 0.11669921875, "learning_rate": 4.2638623326959845e-07, "loss": 0.0047, "reward": 1.4886753559112549, "reward_std": 0.11343041062355042, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5043004155158997, "rewards/pad": 0.0, "step": 1800 }, { "completion_length": 397.796875, "epoch": 0.5739324410452518, "grad_norm": 11.939931869506836, "kl": 0.052978515625, "learning_rate": 4.260675589547482e-07, "loss": 0.0021, "reward": 1.3815760612487793, "reward_std": 0.0875547006726265, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3972010016441345, "step": 1801 }, { "completion_length": 262.390625, "epoch": 0.574251115360102, "grad_norm": 31.43267250061035, "kl": 0.0791015625, "learning_rate": 4.25748884639898e-07, "loss": 0.0032, "reward": 1.619990587234497, "reward_std": 0.1106300875544548, "rewards/pad": 0.1875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43249058723449707, "step": 1802 }, { "completion_length": 98.484375, "epoch": 0.5745697896749522, "grad_norm": 8.432551383972168, "kl": 0.1357421875, "learning_rate": 4.2543021032504777e-07, "loss": 0.0054, "reward": 1.5922178030014038, "reward_std": 0.10581833869218826, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5922178030014038, "rewards/pad": 0.0, "step": 1803 }, { "completion_length": 214.65625, "epoch": 0.5748884639898024, "grad_norm": 15.14476203918457, "kl": 0.1064453125, "learning_rate": 4.251115360101976e-07, "loss": 0.0043, "reward": 1.5079833269119263, "reward_std": 0.18713060021400452, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3673582673072815, "rewards/pad": 0.15625, "step": 1804 }, { "completion_length": 244.984375, "epoch": 0.5752071383046526, "grad_norm": 31.882089614868164, "kl": 0.244140625, "learning_rate": 4.2479286169534733e-07, "loss": 0.0098, "reward": 1.6286325454711914, "reward_std": 0.10635349899530411, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.519257664680481, "rewards/pad": 0.125, "step": 1805 }, { "completion_length": 282.375, "epoch": 0.5755258126195029, "grad_norm": 15.163368225097656, "kl": 0.07470703125, "learning_rate": 4.2447418738049714e-07, "loss": 0.003, "reward": 1.6580278873443604, "reward_std": 0.17273136973381042, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4236530363559723, "step": 1806 }, { "completion_length": 275.171875, "epoch": 0.5758444869343531, "grad_norm": 9.724393844604492, "kl": 0.0966796875, "learning_rate": 4.241555130656469e-07, "loss": 0.0039, "reward": 1.5033131837844849, "reward_std": 0.13871508836746216, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5189381837844849, "rewards/pad": 0.0, "step": 1807 }, { "completion_length": 247.203125, "epoch": 0.5761631612492033, "grad_norm": 15.421486854553223, "kl": 0.2265625, "learning_rate": 4.238368387507967e-07, "loss": 0.009, "reward": 1.5908052921295166, "reward_std": 0.16774819791316986, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3876802325248718, "rewards/pad": 0.21875, "step": 1808 }, { "completion_length": 381.65625, "epoch": 0.5764818355640535, "grad_norm": 4.462833404541016, "kl": 0.08154296875, "learning_rate": 4.2351816443594645e-07, "loss": 0.0033, "reward": 1.5021185874938965, "reward_std": 0.07689405232667923, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5021185278892517, "rewards/pad": 0.0, "step": 1809 }, { "completion_length": 247.03125, "epoch": 0.5768005098789037, "grad_norm": 7.219703674316406, "kl": 0.12060546875, "learning_rate": 4.2319949012109626e-07, "loss": 0.0048, "reward": 1.5395386219024658, "reward_std": 0.07849196344614029, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.539538562297821, "rewards/pad": 0.0, "step": 1810 }, { "completion_length": 306.890625, "epoch": 0.577119184193754, "grad_norm": 8.923142433166504, "kl": 0.08544921875, "learning_rate": 4.22880815806246e-07, "loss": 0.0034, "reward": 1.4586997032165527, "reward_std": 0.04761248081922531, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4586997330188751, "rewards/pad": 0.0, "step": 1811 }, { "completion_length": 204.171875, "epoch": 0.5774378585086042, "grad_norm": 6.285178184509277, "kl": 0.1181640625, "learning_rate": 4.225621414913958e-07, "loss": 0.0047, "reward": 1.7298730611801147, "reward_std": 0.09434516727924347, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.7298730611801147, "rewards/pad": 0.0, "step": 1812 }, { "completion_length": 156.25, "epoch": 0.5777565328234544, "grad_norm": 14.29250431060791, "kl": 0.115234375, "learning_rate": 4.2224346717654557e-07, "loss": 0.0046, "reward": 1.761228322982788, "reward_std": 0.07904788106679916, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6362283229827881, "rewards/pad": 0.125, "step": 1813 }, { "completion_length": 260.34375, "epoch": 0.5780752071383046, "grad_norm": 15.537161827087402, "kl": 0.10205078125, "learning_rate": 4.219247928616954e-07, "loss": 0.0041, "reward": 1.5445449352264404, "reward_std": 0.1415569931268692, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.43516993522644043, "step": 1814 }, { "completion_length": 289.515625, "epoch": 0.5783938814531548, "grad_norm": 9.268240928649902, "kl": 0.07421875, "learning_rate": 4.216061185468451e-07, "loss": 0.003, "reward": 1.4811573028564453, "reward_std": 0.05475914850831032, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48115718364715576, "rewards/pad": 0.0, "step": 1815 }, { "completion_length": 235.4375, "epoch": 0.5787125557680051, "grad_norm": 12.025527954101562, "kl": 0.09521484375, "learning_rate": 4.2128744423199483e-07, "loss": 0.0038, "reward": 1.5596461296081543, "reward_std": 0.09580011665821075, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4346460700035095, "step": 1816 }, { "completion_length": 287.515625, "epoch": 0.5790312300828553, "grad_norm": 6.452342987060547, "kl": 0.07568359375, "learning_rate": 4.2096876991714464e-07, "loss": 0.003, "reward": 1.7021329402923584, "reward_std": 0.05965135246515274, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45213305950164795, "rewards/pad": 0.25, "step": 1817 }, { "completion_length": 238.75, "epoch": 0.5793499043977055, "grad_norm": 28.46439552307129, "kl": 0.1552734375, "learning_rate": 4.206500956022944e-07, "loss": 0.0062, "reward": 1.6458709239959717, "reward_std": 0.08807423710823059, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6458709239959717, "rewards/pad": 0.0, "step": 1818 }, { "completion_length": 230.328125, "epoch": 0.5796685787125557, "grad_norm": 41.31342315673828, "kl": 0.26171875, "learning_rate": 4.203314212874442e-07, "loss": 0.0104, "reward": 1.5134882926940918, "reward_std": 0.09538456797599792, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3884884715080261, "rewards/pad": 0.125, "step": 1819 }, { "completion_length": 187.453125, "epoch": 0.579987253027406, "grad_norm": 12.76840877532959, "kl": 0.11083984375, "learning_rate": 4.2001274697259396e-07, "loss": 0.0044, "reward": 1.51198410987854, "reward_std": 0.10140029340982437, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.52760910987854, "rewards/pad": 0.0, "step": 1820 }, { "completion_length": 275.921875, "epoch": 0.5803059273422562, "grad_norm": 20.936594009399414, "kl": 0.07861328125, "learning_rate": 4.1969407265774376e-07, "loss": 0.0031, "reward": 1.5541658401489258, "reward_std": 0.1162695437669754, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.31979086995124817, "step": 1821 }, { "completion_length": 423.9375, "epoch": 0.5806246016571064, "grad_norm": 5.021227836608887, "kl": 0.04296875, "learning_rate": 4.193753983428935e-07, "loss": 0.0017, "reward": 1.3449110984802246, "reward_std": 0.09172512590885162, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.36053603887557983, "step": 1822 }, { "completion_length": 233.15625, "epoch": 0.5809432759719566, "grad_norm": 12.918827056884766, "kl": 0.09228515625, "learning_rate": 4.190567240280433e-07, "loss": 0.0037, "reward": 1.6571413278579712, "reward_std": 0.10717593878507614, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.532141387462616, "step": 1823 }, { "completion_length": 250.203125, "epoch": 0.5812619502868069, "grad_norm": 26.820283889770508, "kl": 0.095703125, "learning_rate": 4.187380497131931e-07, "loss": 0.0038, "reward": 1.663448691368103, "reward_std": 0.18507176637649536, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5540737509727478, "step": 1824 }, { "completion_length": 248.46875, "epoch": 0.5815806246016572, "grad_norm": 12.499021530151367, "kl": 0.09228515625, "learning_rate": 4.184193753983429e-07, "loss": 0.0037, "reward": 1.4607033729553223, "reward_std": 0.15350082516670227, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.30445337295532227, "rewards/pad": 0.15625, "step": 1825 }, { "completion_length": 185.390625, "epoch": 0.5818992989165074, "grad_norm": 14.832054138183594, "kl": 0.1357421875, "learning_rate": 4.1810070108349264e-07, "loss": 0.0054, "reward": 1.7510137557983398, "reward_std": 0.11052236706018448, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6260136961936951, "rewards/pad": 0.125, "step": 1826 }, { "completion_length": 237.546875, "epoch": 0.5822179732313576, "grad_norm": 6.5988335609436035, "kl": 0.0908203125, "learning_rate": 4.1778202676864245e-07, "loss": 0.0036, "reward": 1.6146430969238281, "reward_std": 0.05205903202295303, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4896431565284729, "rewards/pad": 0.125, "step": 1827 }, { "completion_length": 201.65625, "epoch": 0.5825366475462078, "grad_norm": 13.932860374450684, "kl": 0.09716796875, "learning_rate": 4.174633524537922e-07, "loss": 0.0039, "reward": 1.6670434474945068, "reward_std": 0.09753619134426117, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5576685667037964, "rewards/pad": 0.109375, "step": 1828 }, { "completion_length": 275.890625, "epoch": 0.582855321861058, "grad_norm": 6.20548677444458, "kl": 0.08203125, "learning_rate": 4.17144678138942e-07, "loss": 0.0033, "reward": 1.488236427307129, "reward_std": 0.05150597542524338, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4882364869117737, "rewards/pad": 0.0, "step": 1829 }, { "completion_length": 187.609375, "epoch": 0.5831739961759083, "grad_norm": 9.856452941894531, "kl": 0.111328125, "learning_rate": 4.1682600382409176e-07, "loss": 0.0045, "reward": 1.603241205215454, "reward_std": 0.11433658003807068, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6032410860061646, "step": 1830 }, { "completion_length": 208.890625, "epoch": 0.5834926704907585, "grad_norm": 8.903815269470215, "kl": 0.126953125, "learning_rate": 4.1650732950924157e-07, "loss": 0.0051, "reward": 1.6346218585968018, "reward_std": 0.12258712202310562, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.650246798992157, "rewards/pad": 0.0, "step": 1831 }, { "completion_length": 252.90625, "epoch": 0.5838113448056087, "grad_norm": 6.527516841888428, "kl": 0.080078125, "learning_rate": 4.161886551943913e-07, "loss": 0.0032, "reward": 1.6859986782073975, "reward_std": 0.058990031480789185, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43599867820739746, "step": 1832 }, { "completion_length": 368.953125, "epoch": 0.5841300191204589, "grad_norm": 24.48158073425293, "kl": 0.06396484375, "learning_rate": 4.1586998087954113e-07, "loss": 0.0026, "reward": 1.3795561790466309, "reward_std": 0.06779491156339645, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.37955614924430847, "step": 1833 }, { "completion_length": 282.96875, "epoch": 0.5844486934353091, "grad_norm": 10.014896392822266, "kl": 0.0654296875, "learning_rate": 4.1555130656469083e-07, "loss": 0.0026, "reward": 1.4391634464263916, "reward_std": 0.09068316221237183, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4391633868217468, "rewards/pad": 0.0, "step": 1834 }, { "completion_length": 388.375, "epoch": 0.5847673677501594, "grad_norm": 10.559937477111816, "kl": 0.059814453125, "learning_rate": 4.1523263224984064e-07, "loss": 0.0024, "reward": 1.4498090744018555, "reward_std": 0.030296865850687027, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.44980907440185547, "step": 1835 }, { "completion_length": 330.359375, "epoch": 0.5850860420650096, "grad_norm": 5.408528804779053, "kl": 0.0556640625, "learning_rate": 4.149139579349904e-07, "loss": 0.0022, "reward": 1.639885663986206, "reward_std": 0.06358183920383453, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5148857831954956, "step": 1836 }, { "completion_length": 249.515625, "epoch": 0.5854047163798598, "grad_norm": 15.045970916748047, "kl": 0.091796875, "learning_rate": 4.145952836201402e-07, "loss": 0.0037, "reward": 1.5363945960998535, "reward_std": 0.14407403767108917, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5520195960998535, "rewards/pad": 0.0, "step": 1837 }, { "completion_length": 302.1875, "epoch": 0.58572339069471, "grad_norm": 12.751809120178223, "kl": 0.10986328125, "learning_rate": 4.1427660930528995e-07, "loss": 0.0044, "reward": 1.50230872631073, "reward_std": 0.04466398060321808, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.37730872631073, "step": 1838 }, { "completion_length": 202.234375, "epoch": 0.5860420650095602, "grad_norm": 9.511930465698242, "kl": 0.11865234375, "learning_rate": 4.1395793499043976e-07, "loss": 0.0048, "reward": 1.65510892868042, "reward_std": 0.06541073322296143, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6551089286804199, "step": 1839 }, { "completion_length": 236.03125, "epoch": 0.5863607393244105, "grad_norm": 16.85823631286621, "kl": 0.107421875, "learning_rate": 4.136392606755895e-07, "loss": 0.0043, "reward": 1.6708178520202637, "reward_std": 0.12455782294273376, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5770677924156189, "rewards/pad": 0.125, "step": 1840 }, { "completion_length": 302.78125, "epoch": 0.5866794136392607, "grad_norm": 11.518096923828125, "kl": 0.08447265625, "learning_rate": 4.133205863607393e-07, "loss": 0.0034, "reward": 1.3582675457000732, "reward_std": 0.11208176612854004, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.35826757550239563, "rewards/pad": 0.0, "step": 1841 }, { "completion_length": 243.609375, "epoch": 0.5869980879541109, "grad_norm": 4.969910144805908, "kl": 0.0849609375, "learning_rate": 4.130019120458891e-07, "loss": 0.0034, "reward": 1.4788875579833984, "reward_std": 0.1530902087688446, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.38513755798339844, "step": 1842 }, { "completion_length": 281.15625, "epoch": 0.5873167622689611, "grad_norm": 25.539875030517578, "kl": 0.08544921875, "learning_rate": 4.126832377310389e-07, "loss": 0.0034, "reward": 1.5895493030548096, "reward_std": 0.09465925395488739, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5895493626594543, "rewards/pad": 0.0, "step": 1843 }, { "completion_length": 232.171875, "epoch": 0.5876354365838113, "grad_norm": 16.575340270996094, "kl": 0.08349609375, "learning_rate": 4.1236456341618864e-07, "loss": 0.0033, "reward": 1.4185080528259277, "reward_std": 0.20313654839992523, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.4341329336166382, "rewards/pad": 0.015625, "step": 1844 }, { "completion_length": 199.8125, "epoch": 0.5879541108986616, "grad_norm": 16.62732696533203, "kl": 0.1171875, "learning_rate": 4.1204588910133844e-07, "loss": 0.0047, "reward": 1.666072130203247, "reward_std": 0.08273019641637802, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5410721898078918, "step": 1845 }, { "completion_length": 316.03125, "epoch": 0.5882727852135118, "grad_norm": 7.145381927490234, "kl": 0.0693359375, "learning_rate": 4.117272147864882e-07, "loss": 0.0028, "reward": 1.601731777191162, "reward_std": 0.047059301286935806, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47673168778419495, "step": 1846 }, { "completion_length": 235.078125, "epoch": 0.588591459528362, "grad_norm": 13.02017879486084, "kl": 0.08203125, "learning_rate": 4.1140854047163795e-07, "loss": 0.0033, "reward": 1.4461121559143066, "reward_std": 0.08059521019458771, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3211122155189514, "step": 1847 }, { "completion_length": 190.203125, "epoch": 0.5889101338432122, "grad_norm": 7.518156051635742, "kl": 0.0927734375, "learning_rate": 4.1108986615678776e-07, "loss": 0.0037, "reward": 1.5900521278381348, "reward_std": 0.06868287175893784, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.46505212783813477, "step": 1848 }, { "completion_length": 313.59375, "epoch": 0.5892288081580624, "grad_norm": 6.491325855255127, "kl": 0.07568359375, "learning_rate": 4.107711918419375e-07, "loss": 0.003, "reward": 1.5029609203338623, "reward_std": 0.04590877890586853, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.37796100974082947, "step": 1849 }, { "completion_length": 295.8125, "epoch": 0.5895474824729127, "grad_norm": 11.342720985412598, "kl": 0.07763671875, "learning_rate": 4.104525175270873e-07, "loss": 0.0031, "reward": 1.5426108837127686, "reward_std": 0.06933946907520294, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.29261088371276855, "step": 1850 }, { "completion_length": 248.25, "epoch": 0.5898661567877629, "grad_norm": 8.002965927124023, "kl": 0.09033203125, "learning_rate": 4.1013384321223707e-07, "loss": 0.0036, "reward": 1.4224798679351807, "reward_std": 0.09290251135826111, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42247986793518066, "rewards/pad": 0.0, "step": 1851 }, { "completion_length": 225.25, "epoch": 0.5901848311026131, "grad_norm": 27.69028091430664, "kl": 0.06982421875, "learning_rate": 4.098151688973869e-07, "loss": 0.0028, "reward": 1.8487733602523804, "reward_std": 0.17965924739837646, "rewards/pad": 0.375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.48939836025238037, "step": 1852 }, { "completion_length": 288.5, "epoch": 0.5905035054174633, "grad_norm": 9.17288589477539, "kl": 0.0869140625, "learning_rate": 4.0949649458253663e-07, "loss": 0.0035, "reward": 1.4427770376205444, "reward_std": 0.0336136668920517, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4427770972251892, "rewards/pad": 0.0, "step": 1853 }, { "completion_length": 241.296875, "epoch": 0.5908221797323135, "grad_norm": 8.414031028747559, "kl": 0.09228515625, "learning_rate": 4.091778202676864e-07, "loss": 0.0037, "reward": 1.7099772691726685, "reward_std": 0.07219240069389343, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5849772691726685, "rewards/pad": 0.125, "step": 1854 }, { "completion_length": 145.625, "epoch": 0.5911408540471638, "grad_norm": 8.692977905273438, "kl": 0.1005859375, "learning_rate": 4.0885914595283614e-07, "loss": 0.004, "reward": 1.5729535818099976, "reward_std": 0.13664300739765167, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4323284924030304, "rewards/pad": 0.15625, "step": 1855 }, { "completion_length": 259.796875, "epoch": 0.591459528362014, "grad_norm": 9.177119255065918, "kl": 0.0859375, "learning_rate": 4.0854047163798595e-07, "loss": 0.0034, "reward": 1.6044448614120483, "reward_std": 0.09958793222904205, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.49506986141204834, "step": 1856 }, { "completion_length": 221.984375, "epoch": 0.5917782026768642, "grad_norm": 9.245564460754395, "kl": 0.0947265625, "learning_rate": 4.082217973231357e-07, "loss": 0.0038, "reward": 1.5397629737854004, "reward_std": 0.12672016024589539, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4303880035877228, "step": 1857 }, { "completion_length": 201.09375, "epoch": 0.5920968769917144, "grad_norm": 11.010499000549316, "kl": 0.10009765625, "learning_rate": 4.079031230082855e-07, "loss": 0.004, "reward": 1.8728845119476318, "reward_std": 0.09205446392297745, "rewards/answer_reward": 0.265625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.6072594523429871, "step": 1858 }, { "completion_length": 208.859375, "epoch": 0.5924155513065646, "grad_norm": 97.43705749511719, "kl": 0.12890625, "learning_rate": 4.0758444869343526e-07, "loss": 0.0052, "reward": 1.571459174156189, "reward_std": 0.10175631195306778, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.571459174156189, "step": 1859 }, { "completion_length": 255.28125, "epoch": 0.5927342256214149, "grad_norm": 7.342701435089111, "kl": 0.08203125, "learning_rate": 4.0726577437858507e-07, "loss": 0.0033, "reward": 1.6075087785720825, "reward_std": 0.0989253893494606, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4981337785720825, "rewards/pad": 0.109375, "step": 1860 }, { "completion_length": 144.578125, "epoch": 0.5930528999362651, "grad_norm": 13.898255348205566, "kl": 0.12255859375, "learning_rate": 4.069471000637348e-07, "loss": 0.0049, "reward": 1.6271339654922485, "reward_std": 0.0684891790151596, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5021339654922485, "rewards/pad": 0.125, "step": 1861 }, { "completion_length": 325.75, "epoch": 0.5933715742511153, "grad_norm": 8.965490341186523, "kl": 0.06591796875, "learning_rate": 4.0662842574888463e-07, "loss": 0.0026, "reward": 1.6634330749511719, "reward_std": 0.09572862088680267, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5384331941604614, "rewards/pad": 0.125, "step": 1862 }, { "completion_length": 293.0, "epoch": 0.5936902485659655, "grad_norm": 6.07974910736084, "kl": 0.1279296875, "learning_rate": 4.063097514340344e-07, "loss": 0.0051, "reward": 1.4634603261947632, "reward_std": 0.05717071518301964, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.46346038579940796, "step": 1863 }, { "completion_length": 103.890625, "epoch": 0.5940089228808159, "grad_norm": 9.67024040222168, "kl": 0.130859375, "learning_rate": 4.059910771191842e-07, "loss": 0.0052, "reward": 1.7005484104156494, "reward_std": 0.09798851609230042, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5755484700202942, "rewards/pad": 0.125, "step": 1864 }, { "completion_length": 259.671875, "epoch": 0.5943275971956661, "grad_norm": 15.222319602966309, "kl": 0.0888671875, "learning_rate": 4.0567240280433395e-07, "loss": 0.0036, "reward": 1.5262441635131836, "reward_std": 0.13098961114883423, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.30749407410621643, "step": 1865 }, { "completion_length": 300.15625, "epoch": 0.5946462715105163, "grad_norm": 7.883068561553955, "kl": 0.09521484375, "learning_rate": 4.0535372848948375e-07, "loss": 0.0038, "reward": 1.6139923334121704, "reward_std": 0.06202385574579239, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6139923334121704, "step": 1866 }, { "completion_length": 188.03125, "epoch": 0.5949649458253665, "grad_norm": 11.199960708618164, "kl": 0.11181640625, "learning_rate": 4.050350541746335e-07, "loss": 0.0045, "reward": 1.4685306549072266, "reward_std": 0.07563184201717377, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4685305953025818, "step": 1867 }, { "completion_length": 317.65625, "epoch": 0.5952836201402167, "grad_norm": 13.962400436401367, "kl": 0.06298828125, "learning_rate": 4.047163798597833e-07, "loss": 0.0025, "reward": 1.5315433740615845, "reward_std": 0.07140391319990158, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3909183442592621, "rewards/pad": 0.140625, "step": 1868 }, { "completion_length": 167.90625, "epoch": 0.595602294455067, "grad_norm": 12.310955047607422, "kl": 0.10302734375, "learning_rate": 4.0439770554493307e-07, "loss": 0.0041, "reward": 1.6179437637329102, "reward_std": 0.07316721975803375, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3679437041282654, "step": 1869 }, { "completion_length": 307.828125, "epoch": 0.5959209687699172, "grad_norm": 4.975736141204834, "kl": 0.0791015625, "learning_rate": 4.040790312300829e-07, "loss": 0.0032, "reward": 1.3248085975646973, "reward_std": 0.03344433754682541, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3248084485530853, "step": 1870 }, { "completion_length": 211.75, "epoch": 0.5962396430847674, "grad_norm": 9.022954940795898, "kl": 0.0849609375, "learning_rate": 4.0376035691523263e-07, "loss": 0.0034, "reward": 1.4232125282287598, "reward_std": 0.07294245809316635, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.423212468624115, "step": 1871 }, { "completion_length": 297.09375, "epoch": 0.5965583173996176, "grad_norm": 19.87626075744629, "kl": 0.0751953125, "learning_rate": 4.0344168260038244e-07, "loss": 0.003, "reward": 1.5290082693099976, "reward_std": 0.08635507524013519, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.40400823950767517, "rewards/pad": 0.125, "step": 1872 }, { "completion_length": 364.375, "epoch": 0.5968769917144678, "grad_norm": 8.226881980895996, "kl": 0.087890625, "learning_rate": 4.0312300828553214e-07, "loss": 0.0035, "reward": 1.4459209442138672, "reward_std": 0.1533610224723816, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.47717100381851196, "step": 1873 }, { "completion_length": 150.03125, "epoch": 0.5971956660293181, "grad_norm": 16.274169921875, "kl": 0.126953125, "learning_rate": 4.0280433397068195e-07, "loss": 0.0051, "reward": 1.5381252765655518, "reward_std": 0.10919656604528427, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.41312527656555176, "rewards/pad": 0.125, "step": 1874 }, { "completion_length": 202.6875, "epoch": 0.5975143403441683, "grad_norm": 9.773507118225098, "kl": 0.10888671875, "learning_rate": 4.024856596558317e-07, "loss": 0.0044, "reward": 1.7387768030166626, "reward_std": 0.11466390639543533, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.6137768030166626, "step": 1875 }, { "completion_length": 304.234375, "epoch": 0.5978330146590185, "grad_norm": 7.625086784362793, "kl": 0.07470703125, "learning_rate": 4.0216698534098145e-07, "loss": 0.003, "reward": 1.4400463104248047, "reward_std": 0.08653664588928223, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4556713104248047, "rewards/pad": 0.0, "step": 1876 }, { "completion_length": 151.234375, "epoch": 0.5981516889738687, "grad_norm": 14.312983512878418, "kl": 0.10693359375, "learning_rate": 4.0184831102613126e-07, "loss": 0.0043, "reward": 1.6709468364715576, "reward_std": 0.07913589477539062, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.42094680666923523, "step": 1877 }, { "completion_length": 207.3125, "epoch": 0.5984703632887189, "grad_norm": 7.628540515899658, "kl": 0.1044921875, "learning_rate": 4.01529636711281e-07, "loss": 0.0042, "reward": 1.4818041324615479, "reward_std": 0.09821852296590805, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48180416226387024, "rewards/pad": 0.0, "step": 1878 }, { "completion_length": 140.609375, "epoch": 0.5987890376035692, "grad_norm": 10.626641273498535, "kl": 0.1982421875, "learning_rate": 4.012109623964308e-07, "loss": 0.0079, "reward": 1.6922886371612549, "reward_std": 0.13950839638710022, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.7079135179519653, "rewards/pad": 0.0, "step": 1879 }, { "completion_length": 294.15625, "epoch": 0.5991077119184194, "grad_norm": 25.593908309936523, "kl": 0.07861328125, "learning_rate": 4.008922880815806e-07, "loss": 0.0031, "reward": 1.4315825700759888, "reward_std": 0.1311132311820984, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.44720759987831116, "rewards/pad": 0.0, "step": 1880 }, { "completion_length": 200.328125, "epoch": 0.5994263862332696, "grad_norm": 24.06759262084961, "kl": 0.11279296875, "learning_rate": 4.005736137667304e-07, "loss": 0.0045, "reward": 1.5628618001937866, "reward_std": 0.09779863059520721, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5628618001937866, "step": 1881 }, { "completion_length": 243.4375, "epoch": 0.5997450605481198, "grad_norm": 5.812214374542236, "kl": 0.0986328125, "learning_rate": 4.0025493945188014e-07, "loss": 0.0039, "reward": 1.586617112159729, "reward_std": 0.13455936312675476, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.602242112159729, "rewards/pad": 0.0, "step": 1882 }, { "completion_length": 212.484375, "epoch": 0.60006373486297, "grad_norm": 11.938089370727539, "kl": 0.09423828125, "learning_rate": 3.9993626513702994e-07, "loss": 0.0038, "reward": 1.4001901149749756, "reward_std": 0.054095618426799774, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.2751900553703308, "rewards/pad": 0.125, "step": 1883 }, { "completion_length": 267.765625, "epoch": 0.6003824091778203, "grad_norm": 22.59400177001953, "kl": 0.07666015625, "learning_rate": 3.996175908221797e-07, "loss": 0.0031, "reward": 1.3393089771270752, "reward_std": 0.05717310309410095, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.33930888772010803, "step": 1884 }, { "completion_length": 341.578125, "epoch": 0.6007010834926705, "grad_norm": 21.02568244934082, "kl": 0.06787109375, "learning_rate": 3.992989165073295e-07, "loss": 0.0027, "reward": 1.4882479906082153, "reward_std": 0.1340409815311432, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5038730502128601, "rewards/pad": 0.0, "step": 1885 }, { "completion_length": 181.578125, "epoch": 0.6010197578075207, "grad_norm": 10.017570495605469, "kl": 0.1298828125, "learning_rate": 3.9898024219247926e-07, "loss": 0.0052, "reward": 1.3889895677566528, "reward_std": 0.07805732637643814, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.38898953795433044, "rewards/pad": 0.0, "step": 1886 }, { "completion_length": 305.75, "epoch": 0.6013384321223709, "grad_norm": 9.133565902709961, "kl": 0.10693359375, "learning_rate": 3.9866156787762907e-07, "loss": 0.0043, "reward": 1.6056489944458008, "reward_std": 0.08965301513671875, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.48064911365509033, "step": 1887 }, { "completion_length": 155.5, "epoch": 0.6016571064372211, "grad_norm": 9.378506660461426, "kl": 0.12255859375, "learning_rate": 3.983428935627788e-07, "loss": 0.0049, "reward": 1.60020112991333, "reward_std": 0.11282531917095184, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4908260703086853, "step": 1888 }, { "completion_length": 257.46875, "epoch": 0.6019757807520714, "grad_norm": 8.208368301391602, "kl": 0.06982421875, "learning_rate": 3.980242192479286e-07, "loss": 0.0028, "reward": 1.8248472213745117, "reward_std": 0.061266396194696426, "rewards/answer_reward": 0.375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4498470425605774, "step": 1889 }, { "completion_length": 162.140625, "epoch": 0.6022944550669216, "grad_norm": 13.764080047607422, "kl": 0.11181640625, "learning_rate": 3.977055449330784e-07, "loss": 0.0045, "reward": 1.4256174564361572, "reward_std": 0.08565089106559753, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.425617516040802, "rewards/pad": 0.0, "step": 1890 }, { "completion_length": 287.6875, "epoch": 0.6026131293817718, "grad_norm": 20.85105323791504, "kl": 0.0869140625, "learning_rate": 3.973868706182282e-07, "loss": 0.0035, "reward": 1.375194787979126, "reward_std": 0.17042842507362366, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.359569787979126, "step": 1891 }, { "completion_length": 296.953125, "epoch": 0.602931803696622, "grad_norm": 15.295384407043457, "kl": 0.0771484375, "learning_rate": 3.9706819630337794e-07, "loss": 0.0031, "reward": 1.5884289741516113, "reward_std": 0.09348006546497345, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4634290039539337, "step": 1892 }, { "completion_length": 157.0625, "epoch": 0.6032504780114722, "grad_norm": 48.732383728027344, "kl": 0.10888671875, "learning_rate": 3.967495219885277e-07, "loss": 0.0044, "reward": 1.6936684846878052, "reward_std": 0.09765303134918213, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.56866854429245, "step": 1893 }, { "completion_length": 155.9375, "epoch": 0.6035691523263225, "grad_norm": 9.036518096923828, "kl": 0.0966796875, "learning_rate": 3.9643084767367745e-07, "loss": 0.0039, "reward": 1.5201268196105957, "reward_std": 0.1085783839225769, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4107518196105957, "rewards/pad": 0.125, "step": 1894 }, { "completion_length": 423.0625, "epoch": 0.6038878266411727, "grad_norm": 6.770268440246582, "kl": 0.058837890625, "learning_rate": 3.9611217335882726e-07, "loss": 0.0023, "reward": 1.5838446617126465, "reward_std": 0.11189889162778854, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4744696021080017, "step": 1895 }, { "completion_length": 243.03125, "epoch": 0.6042065009560229, "grad_norm": 17.663448333740234, "kl": 0.1171875, "learning_rate": 3.95793499043977e-07, "loss": 0.0047, "reward": 1.535946011543274, "reward_std": 0.06312038749456406, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5359460115432739, "rewards/pad": 0.0, "step": 1896 }, { "completion_length": 149.265625, "epoch": 0.6045251752708731, "grad_norm": 25.105934143066406, "kl": 0.11279296875, "learning_rate": 3.954748247291268e-07, "loss": 0.0045, "reward": 1.7034320831298828, "reward_std": 0.17257440090179443, "rewards/answer_reward": 0.15625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5471821427345276, "step": 1897 }, { "completion_length": 237.578125, "epoch": 0.6048438495857233, "grad_norm": 47.23228073120117, "kl": 0.076171875, "learning_rate": 3.9515615041427657e-07, "loss": 0.003, "reward": 1.3625296354293823, "reward_std": 0.05569884181022644, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3625296950340271, "rewards/pad": 0.0, "step": 1898 }, { "completion_length": 181.34375, "epoch": 0.6051625239005736, "grad_norm": 24.187870025634766, "kl": 0.11572265625, "learning_rate": 3.948374760994264e-07, "loss": 0.0046, "reward": 1.5649663209915161, "reward_std": 0.14966291189193726, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4087163209915161, "rewards/pad": 0.15625, "step": 1899 }, { "completion_length": 260.171875, "epoch": 0.6054811982154238, "grad_norm": 13.711686134338379, "kl": 0.08251953125, "learning_rate": 3.9451880178457613e-07, "loss": 0.0033, "reward": 1.4515687227249146, "reward_std": 0.03165704011917114, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45156872272491455, "rewards/pad": 0.0, "step": 1900 }, { "completion_length": 283.40625, "epoch": 0.605799872530274, "grad_norm": 10.902450561523438, "kl": 0.07763671875, "learning_rate": 3.9420012746972594e-07, "loss": 0.0031, "reward": 1.4659996032714844, "reward_std": 0.07243816554546356, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4659996032714844, "rewards/pad": 0.0, "step": 1901 }, { "completion_length": 131.78125, "epoch": 0.6061185468451242, "grad_norm": 14.222344398498535, "kl": 0.119140625, "learning_rate": 3.938814531548757e-07, "loss": 0.0048, "reward": 1.5577516555786133, "reward_std": 0.09155458956956863, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43275171518325806, "rewards/pad": 0.125, "step": 1902 }, { "completion_length": 165.375, "epoch": 0.6064372211599746, "grad_norm": 42.20652770996094, "kl": 0.10693359375, "learning_rate": 3.935627788400255e-07, "loss": 0.0043, "reward": 1.5626838207244873, "reward_std": 0.14869019389152527, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4845588803291321, "rewards/pad": 0.078125, "step": 1903 }, { "completion_length": 341.890625, "epoch": 0.6067558954748248, "grad_norm": 15.570034980773926, "kl": 0.06201171875, "learning_rate": 3.9324410452517525e-07, "loss": 0.0025, "reward": 1.4920532703399658, "reward_std": 0.11405228078365326, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5076783299446106, "rewards/pad": 0.0, "step": 1904 }, { "completion_length": 250.53125, "epoch": 0.607074569789675, "grad_norm": 6.26237678527832, "kl": 0.0830078125, "learning_rate": 3.9292543021032506e-07, "loss": 0.0033, "reward": 1.6416082382202148, "reward_std": 0.15905556082725525, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.42285817861557007, "step": 1905 }, { "completion_length": 206.390625, "epoch": 0.6073932441045252, "grad_norm": 15.79101276397705, "kl": 0.0947265625, "learning_rate": 3.926067558954748e-07, "loss": 0.0038, "reward": 1.5966687202453613, "reward_std": 0.13306990265846252, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47166866064071655, "rewards/pad": 0.125, "step": 1906 }, { "completion_length": 155.0, "epoch": 0.6077119184193754, "grad_norm": 16.358394622802734, "kl": 0.11279296875, "learning_rate": 3.922880815806246e-07, "loss": 0.0045, "reward": 1.724739909172058, "reward_std": 0.11366938054561615, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4747399091720581, "rewards/pad": 0.25, "step": 1907 }, { "completion_length": 253.15625, "epoch": 0.6080305927342257, "grad_norm": 28.876209259033203, "kl": 0.07763671875, "learning_rate": 3.919694072657744e-07, "loss": 0.0031, "reward": 1.5843945741653442, "reward_std": 0.1402738243341446, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.490644633769989, "step": 1908 }, { "completion_length": 265.671875, "epoch": 0.6083492670490759, "grad_norm": 30.23357391357422, "kl": 0.08935546875, "learning_rate": 3.9165073295092413e-07, "loss": 0.0036, "reward": 1.5527620315551758, "reward_std": 0.10428045690059662, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.31838709115982056, "step": 1909 }, { "completion_length": 185.734375, "epoch": 0.6086679413639261, "grad_norm": 12.290230751037598, "kl": 0.10205078125, "learning_rate": 3.9133205863607394e-07, "loss": 0.0041, "reward": 1.577617883682251, "reward_std": 0.08793454617261887, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.577617883682251, "rewards/pad": 0.0, "step": 1910 }, { "completion_length": 255.359375, "epoch": 0.6089866156787763, "grad_norm": 18.367904663085938, "kl": 0.10205078125, "learning_rate": 3.910133843212237e-07, "loss": 0.0041, "reward": 1.344871997833252, "reward_std": 0.13884970545768738, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.37612199783325195, "step": 1911 }, { "completion_length": 287.296875, "epoch": 0.6093052899936265, "grad_norm": 11.594067573547363, "kl": 0.07568359375, "learning_rate": 3.906947100063735e-07, "loss": 0.003, "reward": 1.5959113836288452, "reward_std": 0.07247813045978546, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4709113836288452, "step": 1912 }, { "completion_length": 258.34375, "epoch": 0.6096239643084768, "grad_norm": 6.591706275939941, "kl": 0.07080078125, "learning_rate": 3.903760356915232e-07, "loss": 0.0028, "reward": 1.65617036819458, "reward_std": 0.05984325334429741, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5311704277992249, "rewards/pad": 0.125, "step": 1913 }, { "completion_length": 198.046875, "epoch": 0.609942638623327, "grad_norm": 10.927465438842773, "kl": 0.09326171875, "learning_rate": 3.90057361376673e-07, "loss": 0.0037, "reward": 1.6630644798278809, "reward_std": 0.0670454129576683, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6630644798278809, "rewards/pad": 0.0, "step": 1914 }, { "completion_length": 253.65625, "epoch": 0.6102613129381772, "grad_norm": 16.31411361694336, "kl": 0.1025390625, "learning_rate": 3.8973868706182276e-07, "loss": 0.0041, "reward": 1.5014548301696777, "reward_std": 0.10171373188495636, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5170798897743225, "step": 1915 }, { "completion_length": 253.40625, "epoch": 0.6105799872530274, "grad_norm": 6.231085777282715, "kl": 0.103515625, "learning_rate": 3.8942001274697257e-07, "loss": 0.0041, "reward": 1.4525947570800781, "reward_std": 0.06625127792358398, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45259469747543335, "rewards/pad": 0.0, "step": 1916 }, { "completion_length": 216.78125, "epoch": 0.6108986615678776, "grad_norm": 10.79334831237793, "kl": 0.08740234375, "learning_rate": 3.891013384321223e-07, "loss": 0.0035, "reward": 1.8049113750457764, "reward_std": 0.08902693539857864, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6799114346504211, "rewards/pad": 0.125, "step": 1917 }, { "completion_length": 283.734375, "epoch": 0.6112173358827279, "grad_norm": 9.601179122924805, "kl": 0.08203125, "learning_rate": 3.8878266411727213e-07, "loss": 0.0033, "reward": 1.535913348197937, "reward_std": 0.0846046507358551, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.426538348197937, "step": 1918 }, { "completion_length": 321.453125, "epoch": 0.6115360101975781, "grad_norm": 13.384991645812988, "kl": 0.07275390625, "learning_rate": 3.884639898024219e-07, "loss": 0.0029, "reward": 1.493588924407959, "reward_std": 0.03804008662700653, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49358880519866943, "step": 1919 }, { "completion_length": 184.3125, "epoch": 0.6118546845124283, "grad_norm": 13.001421928405762, "kl": 0.10009765625, "learning_rate": 3.881453154875717e-07, "loss": 0.004, "reward": 1.6277797222137451, "reward_std": 0.18806394934654236, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5496547818183899, "step": 1920 }, { "completion_length": 330.90625, "epoch": 0.6121733588272785, "grad_norm": 10.868631362915039, "kl": 0.05859375, "learning_rate": 3.8782664117272144e-07, "loss": 0.0023, "reward": 1.4844664335250854, "reward_std": 0.058168746531009674, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3594663739204407, "step": 1921 }, { "completion_length": 247.5625, "epoch": 0.6124920331421287, "grad_norm": 13.617033004760742, "kl": 0.1484375, "learning_rate": 3.8750796685787125e-07, "loss": 0.0059, "reward": 1.4870398044586182, "reward_std": 0.12303633987903595, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.3932897746562958, "rewards/pad": 0.125, "step": 1922 }, { "completion_length": 196.28125, "epoch": 0.612810707456979, "grad_norm": 17.17580795288086, "kl": 0.083984375, "learning_rate": 3.87189292543021e-07, "loss": 0.0033, "reward": 1.5076944828033447, "reward_std": 0.12078005820512772, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5233194828033447, "step": 1923 }, { "completion_length": 139.265625, "epoch": 0.6131293817718292, "grad_norm": 6.496094703674316, "kl": 0.1162109375, "learning_rate": 3.868706182281708e-07, "loss": 0.0046, "reward": 1.4235994815826416, "reward_std": 0.053943369537591934, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4235994815826416, "rewards/pad": 0.0, "step": 1924 }, { "completion_length": 288.4375, "epoch": 0.6134480560866794, "grad_norm": 29.971853256225586, "kl": 0.08203125, "learning_rate": 3.8655194391332057e-07, "loss": 0.0033, "reward": 1.4659730195999146, "reward_std": 0.05683894455432892, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4659729599952698, "rewards/pad": 0.0, "step": 1925 }, { "completion_length": 183.3125, "epoch": 0.6137667304015296, "grad_norm": 13.859814643859863, "kl": 0.10205078125, "learning_rate": 3.8623326959847037e-07, "loss": 0.0041, "reward": 1.5560696125030518, "reward_std": 0.10703746974468231, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.571694552898407, "rewards/pad": 0.0, "step": 1926 }, { "completion_length": 272.53125, "epoch": 0.6140854047163798, "grad_norm": 18.45200538635254, "kl": 0.0830078125, "learning_rate": 3.8591459528362013e-07, "loss": 0.0033, "reward": 1.680010199546814, "reward_std": 0.11167501658201218, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.39876025915145874, "rewards/pad": 0.28125, "step": 1927 }, { "completion_length": 214.59375, "epoch": 0.6144040790312301, "grad_norm": 21.060997009277344, "kl": 0.0888671875, "learning_rate": 3.8559592096876993e-07, "loss": 0.0035, "reward": 1.52708101272583, "reward_std": 0.1346757560968399, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3708308935165405, "rewards/pad": 0.15625, "step": 1928 }, { "completion_length": 322.53125, "epoch": 0.6147227533460803, "grad_norm": 16.372791290283203, "kl": 0.07666015625, "learning_rate": 3.852772466539197e-07, "loss": 0.0031, "reward": 1.5177533626556396, "reward_std": 0.0657130628824234, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5177534818649292, "step": 1929 }, { "completion_length": 199.5625, "epoch": 0.6150414276609305, "grad_norm": 23.437456130981445, "kl": 0.09423828125, "learning_rate": 3.849585723390695e-07, "loss": 0.0038, "reward": 1.479830265045166, "reward_std": 0.10783857107162476, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47983020544052124, "rewards/pad": 0.0, "step": 1930 }, { "completion_length": 253.46875, "epoch": 0.6153601019757807, "grad_norm": 8.0709867477417, "kl": 0.09521484375, "learning_rate": 3.8463989802421925e-07, "loss": 0.0038, "reward": 1.6359091997146606, "reward_std": 0.054778601974248886, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5109091997146606, "step": 1931 }, { "completion_length": 330.0625, "epoch": 0.615678776290631, "grad_norm": 5.908551216125488, "kl": 0.06884765625, "learning_rate": 3.84321223709369e-07, "loss": 0.0028, "reward": 1.397073745727539, "reward_std": 0.03605101630091667, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3970736861228943, "step": 1932 }, { "completion_length": 313.6875, "epoch": 0.6159974506054812, "grad_norm": 5.9190897941589355, "kl": 0.08740234375, "learning_rate": 3.8400254939451876e-07, "loss": 0.0035, "reward": 1.4032435417175293, "reward_std": 0.11116233468055725, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4032435417175293, "step": 1933 }, { "completion_length": 244.328125, "epoch": 0.6163161249203314, "grad_norm": 8.51202392578125, "kl": 0.103515625, "learning_rate": 3.8368387507966856e-07, "loss": 0.0041, "reward": 1.6362656354904175, "reward_std": 0.08885425329208374, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6362656950950623, "step": 1934 }, { "completion_length": 209.75, "epoch": 0.6166347992351816, "grad_norm": 14.257871627807617, "kl": 0.10986328125, "learning_rate": 3.833652007648183e-07, "loss": 0.0044, "reward": 1.492276668548584, "reward_std": 0.08433869481086731, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.36727672815322876, "step": 1935 }, { "completion_length": 203.90625, "epoch": 0.6169534735500318, "grad_norm": 15.767688751220703, "kl": 0.1005859375, "learning_rate": 3.830465264499681e-07, "loss": 0.004, "reward": 1.728574514389038, "reward_std": 0.1251630187034607, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6191996335983276, "rewards/pad": 0.125, "step": 1936 }, { "completion_length": 218.09375, "epoch": 0.617272147864882, "grad_norm": 11.859417915344238, "kl": 0.1240234375, "learning_rate": 3.827278521351179e-07, "loss": 0.005, "reward": 1.3472200632095337, "reward_std": 0.050142768770456314, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3472200930118561, "step": 1937 }, { "completion_length": 273.96875, "epoch": 0.6175908221797323, "grad_norm": 9.644183158874512, "kl": 0.09521484375, "learning_rate": 3.8240917782026763e-07, "loss": 0.0038, "reward": 1.8013083934783936, "reward_std": 0.06755261868238449, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.551308274269104, "rewards/pad": 0.25, "step": 1938 }, { "completion_length": 314.0625, "epoch": 0.6179094964945825, "grad_norm": 5.385295391082764, "kl": 0.07177734375, "learning_rate": 3.8209050350541744e-07, "loss": 0.0029, "reward": 1.4403769969940186, "reward_std": 0.08441875874996185, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.45600199699401855, "step": 1939 }, { "completion_length": 283.09375, "epoch": 0.6182281708094327, "grad_norm": 16.010988235473633, "kl": 0.09033203125, "learning_rate": 3.817718291905672e-07, "loss": 0.0036, "reward": 1.4347320795059204, "reward_std": 0.05241646617650986, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4347321391105652, "step": 1940 }, { "completion_length": 159.171875, "epoch": 0.6185468451242829, "grad_norm": 12.74837875366211, "kl": 0.11279296875, "learning_rate": 3.81453154875717e-07, "loss": 0.0045, "reward": 1.7130329608917236, "reward_std": 0.09416225552558899, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5880329012870789, "step": 1941 }, { "completion_length": 246.609375, "epoch": 0.6188655194391333, "grad_norm": 10.222420692443848, "kl": 0.0869140625, "learning_rate": 3.8113448056086675e-07, "loss": 0.0035, "reward": 1.6772123575210571, "reward_std": 0.0816313698887825, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4428373873233795, "rewards/pad": 0.234375, "step": 1942 }, { "completion_length": 103.359375, "epoch": 0.6191841937539835, "grad_norm": 8.743064880371094, "kl": 0.140625, "learning_rate": 3.8081580624601656e-07, "loss": 0.0056, "reward": 1.548558235168457, "reward_std": 0.12006276845932007, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5485581755638123, "rewards/pad": 0.0, "step": 1943 }, { "completion_length": 334.34375, "epoch": 0.6195028680688337, "grad_norm": 11.024765968322754, "kl": 0.1640625, "learning_rate": 3.804971319311663e-07, "loss": 0.0066, "reward": 1.461504340171814, "reward_std": 0.19224317371845245, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.4927542805671692, "rewards/pad": 0.0, "step": 1944 }, { "completion_length": 194.65625, "epoch": 0.6198215423836839, "grad_norm": 13.036958694458008, "kl": 0.1455078125, "learning_rate": 3.801784576163161e-07, "loss": 0.0058, "reward": 1.6175357103347778, "reward_std": 0.11850807815790176, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5081607103347778, "step": 1945 }, { "completion_length": 337.9375, "epoch": 0.6201402166985341, "grad_norm": 6.2965264320373535, "kl": 0.0693359375, "learning_rate": 3.798597833014659e-07, "loss": 0.0028, "reward": 1.675209641456604, "reward_std": 0.045450374484062195, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.675209641456604, "step": 1946 }, { "completion_length": 209.671875, "epoch": 0.6204588910133844, "grad_norm": 10.050835609436035, "kl": 0.095703125, "learning_rate": 3.795411089866157e-07, "loss": 0.0038, "reward": 1.769952416419983, "reward_std": 0.13773831725120544, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5355774760246277, "step": 1947 }, { "completion_length": 296.0, "epoch": 0.6207775653282346, "grad_norm": 8.063725471496582, "kl": 0.0869140625, "learning_rate": 3.7922243467176544e-07, "loss": 0.0035, "reward": 1.4973119497299194, "reward_std": 0.1238342821598053, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3879369795322418, "rewards/pad": 0.125, "step": 1948 }, { "completion_length": 237.25, "epoch": 0.6210962396430848, "grad_norm": 11.939338684082031, "kl": 0.08740234375, "learning_rate": 3.7890376035691524e-07, "loss": 0.0035, "reward": 1.6560659408569336, "reward_std": 0.05877237021923065, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5310658812522888, "step": 1949 }, { "completion_length": 338.9375, "epoch": 0.621414913957935, "grad_norm": 9.682485580444336, "kl": 0.06787109375, "learning_rate": 3.78585086042065e-07, "loss": 0.0027, "reward": 1.4578567743301392, "reward_std": 0.10045262426137924, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.34848177433013916, "rewards/pad": 0.125, "step": 1950 }, { "completion_length": 244.734375, "epoch": 0.6217335882727852, "grad_norm": 31.29572296142578, "kl": 0.0908203125, "learning_rate": 3.782664117272148e-07, "loss": 0.0036, "reward": 1.561888337135315, "reward_std": 0.04258840158581734, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5618883371353149, "rewards/pad": 0.0, "step": 1951 }, { "completion_length": 234.125, "epoch": 0.6220522625876355, "grad_norm": 10.641196250915527, "kl": 0.1513671875, "learning_rate": 3.779477374123645e-07, "loss": 0.0061, "reward": 1.4462183713912964, "reward_std": 0.15027868747711182, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.414968341588974, "rewards/pad": 0.03125, "step": 1952 }, { "completion_length": 203.34375, "epoch": 0.6223709369024857, "grad_norm": 15.608134269714355, "kl": 0.0966796875, "learning_rate": 3.776290630975143e-07, "loss": 0.0039, "reward": 1.7943435907363892, "reward_std": 0.09442506730556488, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6693435311317444, "step": 1953 }, { "completion_length": 238.421875, "epoch": 0.6226896112173359, "grad_norm": 38.01367950439453, "kl": 0.0927734375, "learning_rate": 3.7731038878266407e-07, "loss": 0.0037, "reward": 1.5012284517288208, "reward_std": 0.10308201611042023, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4231035113334656, "step": 1954 }, { "completion_length": 200.796875, "epoch": 0.6230082855321861, "grad_norm": 8.269294738769531, "kl": 0.07861328125, "learning_rate": 3.769917144678139e-07, "loss": 0.0032, "reward": 1.824209451675415, "reward_std": 0.12756913900375366, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.589834451675415, "step": 1955 }, { "completion_length": 212.171875, "epoch": 0.6233269598470363, "grad_norm": 17.209590911865234, "kl": 0.10107421875, "learning_rate": 3.7667304015296363e-07, "loss": 0.004, "reward": 1.6338493824005127, "reward_std": 0.06915850192308426, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6338493824005127, "step": 1956 }, { "completion_length": 275.015625, "epoch": 0.6236456341618866, "grad_norm": 6.851185321807861, "kl": 0.07373046875, "learning_rate": 3.7635436583811344e-07, "loss": 0.0029, "reward": 1.5390303134918213, "reward_std": 0.0869000107049942, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.2890303134918213, "step": 1957 }, { "completion_length": 155.125, "epoch": 0.6239643084767368, "grad_norm": 11.542649269104004, "kl": 0.1201171875, "learning_rate": 3.760356915232632e-07, "loss": 0.0048, "reward": 1.5345096588134766, "reward_std": 0.05417778715491295, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5345095992088318, "rewards/pad": 0.0, "step": 1958 }, { "completion_length": 207.5625, "epoch": 0.624282982791587, "grad_norm": 6.008335590362549, "kl": 0.0966796875, "learning_rate": 3.75717017208413e-07, "loss": 0.0039, "reward": 1.4731647968292236, "reward_std": 0.14521664381027222, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.39503979682922363, "rewards/pad": 0.078125, "step": 1959 }, { "completion_length": 252.5625, "epoch": 0.6246016571064372, "grad_norm": 10.692580223083496, "kl": 0.076171875, "learning_rate": 3.7539834289356275e-07, "loss": 0.003, "reward": 1.6467187404632568, "reward_std": 0.11047782748937607, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44359374046325684, "rewards/pad": 0.203125, "step": 1960 }, { "completion_length": 359.296875, "epoch": 0.6249203314212874, "grad_norm": 4.429586887359619, "kl": 0.05029296875, "learning_rate": 3.7507966857871256e-07, "loss": 0.002, "reward": 1.327553629875183, "reward_std": 0.09251774847507477, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3431786596775055, "rewards/pad": 0.0, "step": 1961 }, { "completion_length": 294.515625, "epoch": 0.6252390057361377, "grad_norm": 12.322518348693848, "kl": 0.0712890625, "learning_rate": 3.747609942638623e-07, "loss": 0.0029, "reward": 1.5070735216140747, "reward_std": 0.1569286286830902, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5226985216140747, "step": 1962 }, { "completion_length": 208.21875, "epoch": 0.6255576800509879, "grad_norm": 24.573041915893555, "kl": 0.08837890625, "learning_rate": 3.744423199490121e-07, "loss": 0.0035, "reward": 1.5982944965362549, "reward_std": 0.14915892481803894, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5045445561408997, "step": 1963 }, { "completion_length": 275.953125, "epoch": 0.6258763543658381, "grad_norm": 11.793495178222656, "kl": 0.08837890625, "learning_rate": 3.7412364563416187e-07, "loss": 0.0035, "reward": 1.4760801792144775, "reward_std": 0.041609302163124084, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3510802984237671, "step": 1964 }, { "completion_length": 218.25, "epoch": 0.6261950286806883, "grad_norm": 13.615195274353027, "kl": 0.11669921875, "learning_rate": 3.738049713193117e-07, "loss": 0.0047, "reward": 1.6353042125701904, "reward_std": 0.09656595438718796, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5103042125701904, "step": 1965 }, { "completion_length": 289.375, "epoch": 0.6265137029955385, "grad_norm": 5.561672210693359, "kl": 0.08056640625, "learning_rate": 3.7348629700446143e-07, "loss": 0.0032, "reward": 1.458938717842102, "reward_std": 0.08781175315380096, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4589385986328125, "step": 1966 }, { "completion_length": 220.921875, "epoch": 0.6268323773103888, "grad_norm": 8.867315292358398, "kl": 0.11572265625, "learning_rate": 3.7316762268961124e-07, "loss": 0.0046, "reward": 1.5917067527770996, "reward_std": 0.08042184263467789, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5917068123817444, "rewards/pad": 0.0, "step": 1967 }, { "completion_length": 218.890625, "epoch": 0.627151051625239, "grad_norm": 14.069080352783203, "kl": 0.07763671875, "learning_rate": 3.72848948374761e-07, "loss": 0.0031, "reward": 1.7388994693756104, "reward_std": 0.09234826266765594, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4888995289802551, "rewards/pad": 0.25, "step": 1968 }, { "completion_length": 312.96875, "epoch": 0.6274697259400892, "grad_norm": 7.027201175689697, "kl": 0.12060546875, "learning_rate": 3.725302740599108e-07, "loss": 0.0048, "reward": 1.5714778900146484, "reward_std": 0.14681343734264374, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.46210283041000366, "rewards/pad": 0.125, "step": 1969 }, { "completion_length": 245.671875, "epoch": 0.6277884002549394, "grad_norm": 7.419850826263428, "kl": 0.068359375, "learning_rate": 3.7221159974506056e-07, "loss": 0.0027, "reward": 1.7859978675842285, "reward_std": 0.0766286849975586, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6609978675842285, "rewards/pad": 0.125, "step": 1970 }, { "completion_length": 163.28125, "epoch": 0.6281070745697896, "grad_norm": 54.53138732910156, "kl": 0.10888671875, "learning_rate": 3.718929254302103e-07, "loss": 0.0044, "reward": 1.4890398979187012, "reward_std": 0.11502894014120102, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3796647787094116, "rewards/pad": 0.125, "step": 1971 }, { "completion_length": 350.671875, "epoch": 0.6284257488846399, "grad_norm": 6.750097274780273, "kl": 0.062255859375, "learning_rate": 3.7157425111536006e-07, "loss": 0.0025, "reward": 1.429856300354004, "reward_std": 0.0958351194858551, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.44548124074935913, "step": 1972 }, { "completion_length": 216.578125, "epoch": 0.6287444231994901, "grad_norm": 7.220076084136963, "kl": 0.07763671875, "learning_rate": 3.712555768005098e-07, "loss": 0.0031, "reward": 1.8001503944396973, "reward_std": 0.09091947972774506, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4251503646373749, "rewards/pad": 0.375, "step": 1973 }, { "completion_length": 227.6875, "epoch": 0.6290630975143403, "grad_norm": 5.458504676818848, "kl": 0.10693359375, "learning_rate": 3.709369024856596e-07, "loss": 0.0043, "reward": 1.4395153522491455, "reward_std": 0.08207175880670547, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43951529264450073, "rewards/pad": 0.0, "step": 1974 }, { "completion_length": 151.265625, "epoch": 0.6293817718291905, "grad_norm": 13.328695297241211, "kl": 0.1220703125, "learning_rate": 3.706182281708094e-07, "loss": 0.0049, "reward": 1.6766632795333862, "reward_std": 0.09797574579715729, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.676663339138031, "rewards/pad": 0.0, "step": 1975 }, { "completion_length": 231.625, "epoch": 0.6297004461440407, "grad_norm": 10.131150245666504, "kl": 0.1328125, "learning_rate": 3.702995538559592e-07, "loss": 0.0053, "reward": 1.8385590314865112, "reward_std": 0.1282750517129898, "rewards/answer_reward": 0.375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.47918403148651123, "step": 1976 }, { "completion_length": 307.25, "epoch": 0.630019120458891, "grad_norm": 7.387079238891602, "kl": 0.0673828125, "learning_rate": 3.6998087954110894e-07, "loss": 0.0027, "reward": 1.5480023622512817, "reward_std": 0.04613717645406723, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.42300236225128174, "step": 1977 }, { "completion_length": 164.6875, "epoch": 0.6303377947737412, "grad_norm": 16.791282653808594, "kl": 0.1416015625, "learning_rate": 3.6966220522625875e-07, "loss": 0.0057, "reward": 1.5016502141952515, "reward_std": 0.08761771768331528, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5016502737998962, "step": 1978 }, { "completion_length": 218.96875, "epoch": 0.6306564690885914, "grad_norm": 72.0509033203125, "kl": 0.07958984375, "learning_rate": 3.693435309114085e-07, "loss": 0.0032, "reward": 1.9234390258789062, "reward_std": 0.22262617945671082, "rewards/pad": 0.265625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.6734391450881958, "step": 1979 }, { "completion_length": 337.03125, "epoch": 0.6309751434034416, "grad_norm": 13.148605346679688, "kl": 0.07275390625, "learning_rate": 3.690248565965583e-07, "loss": 0.0029, "reward": 1.4018738269805908, "reward_std": 0.17414557933807373, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.4331238567829132, "rewards/pad": 0.0, "step": 1980 }, { "completion_length": 182.375, "epoch": 0.631293817718292, "grad_norm": 7.003880977630615, "kl": 0.0908203125, "learning_rate": 3.6870618228170806e-07, "loss": 0.0036, "reward": 1.6010947227478027, "reward_std": 0.11983469128608704, "rewards/answer_reward": 0.375, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.2573447823524475, "step": 1981 }, { "completion_length": 270.84375, "epoch": 0.6316124920331422, "grad_norm": 5.914927005767822, "kl": 0.07958984375, "learning_rate": 3.6838750796685787e-07, "loss": 0.0032, "reward": 1.6006536483764648, "reward_std": 0.11984487622976303, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.49127864837646484, "rewards/pad": 0.125, "step": 1982 }, { "completion_length": 222.578125, "epoch": 0.6319311663479924, "grad_norm": 6.8138747215271, "kl": 0.10888671875, "learning_rate": 3.680688336520076e-07, "loss": 0.0043, "reward": 1.5949468612670898, "reward_std": 0.08582202345132828, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5949469208717346, "rewards/pad": 0.0, "step": 1983 }, { "completion_length": 243.03125, "epoch": 0.6322498406628426, "grad_norm": 12.201217651367188, "kl": 0.078125, "learning_rate": 3.6775015933715743e-07, "loss": 0.0031, "reward": 1.477726936340332, "reward_std": 0.1586262583732605, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.49335187673568726, "rewards/pad": 0.0, "step": 1984 }, { "completion_length": 279.671875, "epoch": 0.6325685149776928, "grad_norm": 14.337632179260254, "kl": 0.0859375, "learning_rate": 3.674314850223072e-07, "loss": 0.0034, "reward": 1.4611310958862305, "reward_std": 0.07045906037092209, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.46113121509552, "step": 1985 }, { "completion_length": 313.609375, "epoch": 0.6328871892925431, "grad_norm": 28.399967193603516, "kl": 0.08154296875, "learning_rate": 3.67112810707457e-07, "loss": 0.0033, "reward": 1.359786868095398, "reward_std": 0.09078215062618256, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.35978686809539795, "step": 1986 }, { "completion_length": 152.953125, "epoch": 0.6332058636073933, "grad_norm": 21.749130249023438, "kl": 0.1005859375, "learning_rate": 3.6679413639260674e-07, "loss": 0.004, "reward": 1.3360384702682495, "reward_std": 0.047141361981630325, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3360384404659271, "rewards/pad": 0.0, "step": 1987 }, { "completion_length": 294.984375, "epoch": 0.6335245379222435, "grad_norm": 12.006050109863281, "kl": 0.07421875, "learning_rate": 3.6647546207775655e-07, "loss": 0.003, "reward": 1.4102833271026611, "reward_std": 0.2350875735282898, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.2696583569049835, "step": 1988 }, { "completion_length": 259.015625, "epoch": 0.6338432122370937, "grad_norm": 7.643406867980957, "kl": 0.1044921875, "learning_rate": 3.661567877629063e-07, "loss": 0.0042, "reward": 1.5448561906814575, "reward_std": 0.09191156923770905, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4354812204837799, "step": 1989 }, { "completion_length": 110.65625, "epoch": 0.6341618865519439, "grad_norm": 11.28622817993164, "kl": 0.1533203125, "learning_rate": 3.658381134480561e-07, "loss": 0.0061, "reward": 1.6380500793457031, "reward_std": 0.18105697631835938, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5286750793457031, "rewards/pad": 0.125, "step": 1990 }, { "completion_length": 173.828125, "epoch": 0.6344805608667942, "grad_norm": 6.512990951538086, "kl": 0.099609375, "learning_rate": 3.655194391332058e-07, "loss": 0.004, "reward": 1.6445505619049072, "reward_std": 0.054843753576278687, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5195505619049072, "rewards/pad": 0.125, "step": 1991 }, { "completion_length": 306.859375, "epoch": 0.6347992351816444, "grad_norm": 10.722943305969238, "kl": 0.076171875, "learning_rate": 3.652007648183556e-07, "loss": 0.003, "reward": 1.3924834728240967, "reward_std": 0.09814517199993134, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.28310853242874146, "step": 1992 }, { "completion_length": 194.40625, "epoch": 0.6351179094964946, "grad_norm": 7.510833740234375, "kl": 0.1328125, "learning_rate": 3.648820905035054e-07, "loss": 0.0053, "reward": 1.677827000617981, "reward_std": 0.06774307787418365, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.677827000617981, "rewards/pad": 0.0, "step": 1993 }, { "completion_length": 275.46875, "epoch": 0.6354365838113448, "grad_norm": 35.3467903137207, "kl": 0.08544921875, "learning_rate": 3.645634161886552e-07, "loss": 0.0034, "reward": 1.5682491064071655, "reward_std": 0.1375354379415512, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4588741362094879, "step": 1994 }, { "completion_length": 217.609375, "epoch": 0.635755258126195, "grad_norm": 8.333303451538086, "kl": 0.09375, "learning_rate": 3.6424474187380494e-07, "loss": 0.0038, "reward": 1.6891238689422607, "reward_std": 0.07034970819950104, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.564123809337616, "rewards/pad": 0.125, "step": 1995 }, { "completion_length": 230.8125, "epoch": 0.6360739324410453, "grad_norm": 9.261439323425293, "kl": 0.10546875, "learning_rate": 3.6392606755895474e-07, "loss": 0.0042, "reward": 1.5057934522628784, "reward_std": 0.10754223167896271, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5057935118675232, "rewards/pad": 0.0, "step": 1996 }, { "completion_length": 332.25, "epoch": 0.6363926067558955, "grad_norm": 5.428790092468262, "kl": 0.07177734375, "learning_rate": 3.636073932441045e-07, "loss": 0.0029, "reward": 1.567945122718811, "reward_std": 0.06191801652312279, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.567945122718811, "step": 1997 }, { "completion_length": 323.0, "epoch": 0.6367112810707457, "grad_norm": 29.642379760742188, "kl": 0.109375, "learning_rate": 3.632887189292543e-07, "loss": 0.0044, "reward": 1.7203319072723389, "reward_std": 0.07314572483301163, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4703318774700165, "step": 1998 }, { "completion_length": 223.3125, "epoch": 0.6370299553855959, "grad_norm": 9.734238624572754, "kl": 0.10986328125, "learning_rate": 3.6297004461440406e-07, "loss": 0.0044, "reward": 1.5392111539840698, "reward_std": 0.11153466254472733, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5392111539840698, "rewards/pad": 0.0, "step": 1999 }, { "completion_length": 267.5, "epoch": 0.6373486297004461, "grad_norm": 12.07893180847168, "kl": 0.09326171875, "learning_rate": 3.626513702995538e-07, "loss": 0.0037, "reward": 1.5638694763183594, "reward_std": 0.09723970293998718, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43886953592300415, "rewards/pad": 0.125, "step": 2000 }, { "completion_length": 313.0625, "epoch": 0.6376673040152964, "grad_norm": 5.8718414306640625, "kl": 0.07373046875, "learning_rate": 3.623326959847036e-07, "loss": 0.0029, "reward": 1.4742928743362427, "reward_std": 0.05352408438920975, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47429290413856506, "step": 2001 }, { "completion_length": 314.875, "epoch": 0.6379859783301466, "grad_norm": 72.57586669921875, "kl": 0.08740234375, "learning_rate": 3.6201402166985337e-07, "loss": 0.0035, "reward": 1.5000325441360474, "reward_std": 0.06527876853942871, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5000325441360474, "rewards/pad": 0.0, "step": 2002 }, { "completion_length": 199.53125, "epoch": 0.6383046526449968, "grad_norm": 10.587024688720703, "kl": 0.1474609375, "learning_rate": 3.616953473550032e-07, "loss": 0.0059, "reward": 1.5684709548950195, "reward_std": 0.12079137563705444, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.31847089529037476, "rewards/pad": 0.25, "step": 2003 }, { "completion_length": 318.40625, "epoch": 0.638623326959847, "grad_norm": 4.737164497375488, "kl": 0.0712890625, "learning_rate": 3.6137667304015293e-07, "loss": 0.0028, "reward": 1.5824774503707886, "reward_std": 0.09609906375408173, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.45747750997543335, "step": 2004 }, { "completion_length": 286.640625, "epoch": 0.6389420012746972, "grad_norm": 6.416104793548584, "kl": 0.09912109375, "learning_rate": 3.6105799872530274e-07, "loss": 0.004, "reward": 1.5904369354248047, "reward_std": 0.12192674726247787, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.6060618758201599, "step": 2005 }, { "completion_length": 360.890625, "epoch": 0.6392606755895475, "grad_norm": 12.107686042785645, "kl": 0.06884765625, "learning_rate": 3.607393244104525e-07, "loss": 0.0027, "reward": 1.4735157489776611, "reward_std": 0.10779435932636261, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4891408085823059, "step": 2006 }, { "completion_length": 263.96875, "epoch": 0.6395793499043977, "grad_norm": 11.123519897460938, "kl": 0.09423828125, "learning_rate": 3.604206500956023e-07, "loss": 0.0038, "reward": 1.5251493453979492, "reward_std": 0.04218164086341858, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5251492857933044, "step": 2007 }, { "completion_length": 291.46875, "epoch": 0.6398980242192479, "grad_norm": 8.76659107208252, "kl": 0.08056640625, "learning_rate": 3.6010197578075206e-07, "loss": 0.0032, "reward": 1.5642863512039185, "reward_std": 0.11715750396251678, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.45491135120391846, "rewards/pad": 0.125, "step": 2008 }, { "completion_length": 358.78125, "epoch": 0.6402166985340981, "grad_norm": 6.300568580627441, "kl": 0.06787109375, "learning_rate": 3.5978330146590186e-07, "loss": 0.0027, "reward": 1.526787281036377, "reward_std": 0.19017034769058228, "rewards/pad": 0.15625, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4017874002456665, "step": 2009 }, { "completion_length": 296.671875, "epoch": 0.6405353728489483, "grad_norm": 7.317353248596191, "kl": 0.08251953125, "learning_rate": 3.594646271510516e-07, "loss": 0.0033, "reward": 1.5332716703414917, "reward_std": 0.09363551437854767, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5332717299461365, "rewards/pad": 0.0, "step": 2010 }, { "completion_length": 178.515625, "epoch": 0.6408540471637986, "grad_norm": 38.86016845703125, "kl": 0.1005859375, "learning_rate": 3.5914595283620137e-07, "loss": 0.004, "reward": 1.6575322151184082, "reward_std": 0.04223339259624481, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4075321555137634, "step": 2011 }, { "completion_length": 235.984375, "epoch": 0.6411727214786488, "grad_norm": 24.452747344970703, "kl": 0.0869140625, "learning_rate": 3.588272785213511e-07, "loss": 0.0035, "reward": 1.6948299407958984, "reward_std": 0.15844303369522095, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5229548811912537, "step": 2012 }, { "completion_length": 246.625, "epoch": 0.641491395793499, "grad_norm": 17.739437103271484, "kl": 0.09228515625, "learning_rate": 3.5850860420650093e-07, "loss": 0.0037, "reward": 1.663374423980713, "reward_std": 0.10271087288856506, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6789993643760681, "rewards/pad": 0.0, "step": 2013 }, { "completion_length": 275.296875, "epoch": 0.6418100701083492, "grad_norm": 10.146001815795898, "kl": 0.0869140625, "learning_rate": 3.581899298916507e-07, "loss": 0.0035, "reward": 1.8067395687103271, "reward_std": 0.08705069869756699, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5723645687103271, "step": 2014 }, { "completion_length": 331.625, "epoch": 0.6421287444231994, "grad_norm": 7.079098701477051, "kl": 0.0693359375, "learning_rate": 3.578712555768005e-07, "loss": 0.0028, "reward": 1.4350299835205078, "reward_std": 0.1440443992614746, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.45065492391586304, "rewards/pad": 0.0, "step": 2015 }, { "completion_length": 279.125, "epoch": 0.6424474187380497, "grad_norm": 6.421546459197998, "kl": 0.07763671875, "learning_rate": 3.5755258126195025e-07, "loss": 0.0031, "reward": 1.558060646057129, "reward_std": 0.07677186280488968, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4330606460571289, "rewards/pad": 0.125, "step": 2016 }, { "completion_length": 269.84375, "epoch": 0.6427660930528999, "grad_norm": 17.23097801208496, "kl": 0.09326171875, "learning_rate": 3.5723390694710005e-07, "loss": 0.0037, "reward": 1.6398184299468994, "reward_std": 0.07573511451482773, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5148184299468994, "rewards/pad": 0.125, "step": 2017 }, { "completion_length": 314.546875, "epoch": 0.6430847673677501, "grad_norm": 7.973112106323242, "kl": 0.07080078125, "learning_rate": 3.569152326322498e-07, "loss": 0.0028, "reward": 1.4390485286712646, "reward_std": 0.03923648223280907, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3140484690666199, "rewards/pad": 0.125, "step": 2018 }, { "completion_length": 232.8125, "epoch": 0.6434034416826003, "grad_norm": 7.682769298553467, "kl": 0.08203125, "learning_rate": 3.565965583173996e-07, "loss": 0.0033, "reward": 1.6641254425048828, "reward_std": 0.09908575564622879, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.42975035309791565, "rewards/pad": 0.25, "step": 2019 }, { "completion_length": 201.984375, "epoch": 0.6437221159974506, "grad_norm": 99.12747192382812, "kl": 0.1005859375, "learning_rate": 3.5627788400254937e-07, "loss": 0.004, "reward": 1.4531182050704956, "reward_std": 0.15648740530014038, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4531181752681732, "step": 2020 }, { "completion_length": 271.8125, "epoch": 0.6440407903123009, "grad_norm": 9.223620414733887, "kl": 0.07421875, "learning_rate": 3.559592096876992e-07, "loss": 0.003, "reward": 1.5330138206481934, "reward_std": 0.06132769584655762, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.40801379084587097, "rewards/pad": 0.125, "step": 2021 }, { "completion_length": 284.625, "epoch": 0.6443594646271511, "grad_norm": 7.2176079750061035, "kl": 0.07763671875, "learning_rate": 3.5564053537284893e-07, "loss": 0.0031, "reward": 1.4767729043960571, "reward_std": 0.06308045983314514, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47677284479141235, "rewards/pad": 0.0, "step": 2022 }, { "completion_length": 310.28125, "epoch": 0.6446781389420013, "grad_norm": 11.201861381530762, "kl": 0.1455078125, "learning_rate": 3.5532186105799874e-07, "loss": 0.0058, "reward": 1.4627937078475952, "reward_std": 0.1969190090894699, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3534187078475952, "step": 2023 }, { "completion_length": 293.828125, "epoch": 0.6449968132568515, "grad_norm": 4.923059940338135, "kl": 0.0791015625, "learning_rate": 3.550031867431485e-07, "loss": 0.0032, "reward": 1.4457471370697021, "reward_std": 0.053632702678442, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.44574716687202454, "step": 2024 }, { "completion_length": 272.671875, "epoch": 0.6453154875717018, "grad_norm": 6.5525360107421875, "kl": 0.0830078125, "learning_rate": 3.546845124282983e-07, "loss": 0.0033, "reward": 1.4404683113098145, "reward_std": 0.10515347123146057, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.45609331130981445, "step": 2025 }, { "completion_length": 238.734375, "epoch": 0.645634161886552, "grad_norm": 11.033935546875, "kl": 0.107421875, "learning_rate": 3.5436583811344805e-07, "loss": 0.0043, "reward": 1.7628023624420166, "reward_std": 0.21090787649154663, "rewards/pad": 0.3125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4659273624420166, "step": 2026 }, { "completion_length": 183.1875, "epoch": 0.6459528362014022, "grad_norm": 8.789874076843262, "kl": 0.099609375, "learning_rate": 3.5404716379859786e-07, "loss": 0.004, "reward": 1.8745747804641724, "reward_std": 0.08702093362808228, "rewards/pad": 0.375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4995748698711395, "step": 2027 }, { "completion_length": 457.5, "epoch": 0.6462715105162524, "grad_norm": 8.265464782714844, "kl": 0.05615234375, "learning_rate": 3.537284894837476e-07, "loss": 0.0022, "reward": 1.4592204093933105, "reward_std": 0.0653558000922203, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.45922043919563293, "step": 2028 }, { "completion_length": 259.765625, "epoch": 0.6465901848311026, "grad_norm": 14.02660083770752, "kl": 0.08740234375, "learning_rate": 3.534098151688974e-07, "loss": 0.0035, "reward": 1.4943106174468994, "reward_std": 0.09233476966619492, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4943106174468994, "rewards/pad": 0.0, "step": 2029 }, { "completion_length": 321.1875, "epoch": 0.6469088591459529, "grad_norm": 99.69380187988281, "kl": 0.08251953125, "learning_rate": 3.530911408540471e-07, "loss": 0.0033, "reward": 1.5039708614349365, "reward_std": 0.16264577209949493, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5195959210395813, "rewards/pad": 0.0, "step": 2030 }, { "completion_length": 274.03125, "epoch": 0.6472275334608031, "grad_norm": 8.511700630187988, "kl": 0.07470703125, "learning_rate": 3.527724665391969e-07, "loss": 0.003, "reward": 1.6471614837646484, "reward_std": 0.11210751533508301, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5377864241600037, "rewards/pad": 0.125, "step": 2031 }, { "completion_length": 223.65625, "epoch": 0.6475462077756533, "grad_norm": 22.564176559448242, "kl": 0.08251953125, "learning_rate": 3.524537922243467e-07, "loss": 0.0033, "reward": 1.6479836702346802, "reward_std": 0.1305704116821289, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47610872983932495, "step": 2032 }, { "completion_length": 311.484375, "epoch": 0.6478648820905035, "grad_norm": 13.289634704589844, "kl": 0.08154296875, "learning_rate": 3.5213511790949644e-07, "loss": 0.0033, "reward": 1.714613914489746, "reward_std": 0.13245388865470886, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.6208638548851013, "step": 2033 }, { "completion_length": 210.953125, "epoch": 0.6481835564053537, "grad_norm": 18.591171264648438, "kl": 0.1015625, "learning_rate": 3.5181644359464624e-07, "loss": 0.0041, "reward": 1.7396042346954346, "reward_std": 0.07380440831184387, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6146041750907898, "rewards/pad": 0.125, "step": 2034 }, { "completion_length": 348.328125, "epoch": 0.648502230720204, "grad_norm": 7.386063575744629, "kl": 0.06494140625, "learning_rate": 3.51497769279796e-07, "loss": 0.0026, "reward": 1.6278009414672852, "reward_std": 0.05366513878107071, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.37780094146728516, "rewards/pad": 0.25, "step": 2035 }, { "completion_length": 221.359375, "epoch": 0.6488209050350542, "grad_norm": 15.518906593322754, "kl": 0.0927734375, "learning_rate": 3.511790949649458e-07, "loss": 0.0037, "reward": 1.6019225120544434, "reward_std": 0.07224541902542114, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6019225120544434, "rewards/pad": 0.0, "step": 2036 }, { "completion_length": 193.90625, "epoch": 0.6491395793499044, "grad_norm": 22.68230628967285, "kl": 0.08056640625, "learning_rate": 3.5086042065009556e-07, "loss": 0.0032, "reward": 1.8990789651870728, "reward_std": 0.08138591051101685, "rewards/answer_reward": 0.5, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.39907896518707275, "step": 2037 }, { "completion_length": 316.15625, "epoch": 0.6494582536647546, "grad_norm": 15.482177734375, "kl": 0.0693359375, "learning_rate": 3.5054174633524537e-07, "loss": 0.0028, "reward": 1.4039890766143799, "reward_std": 0.06646262854337692, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4039890170097351, "step": 2038 }, { "completion_length": 350.6875, "epoch": 0.6497769279796048, "grad_norm": 9.26961898803711, "kl": 0.06201171875, "learning_rate": 3.502230720203951e-07, "loss": 0.0025, "reward": 1.6099941730499268, "reward_std": 0.061674658209085464, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6099941730499268, "step": 2039 }, { "completion_length": 407.15625, "epoch": 0.6500956022944551, "grad_norm": 4.745171546936035, "kl": 0.0732421875, "learning_rate": 3.499043977055449e-07, "loss": 0.0029, "reward": 1.4271552562713623, "reward_std": 0.12896493077278137, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4427802562713623, "rewards/pad": 0.0, "step": 2040 }, { "completion_length": 223.984375, "epoch": 0.6504142766093053, "grad_norm": 13.607216835021973, "kl": 0.0712890625, "learning_rate": 3.495857233906947e-07, "loss": 0.0029, "reward": 1.6901793479919434, "reward_std": 0.07591713964939117, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44017940759658813, "rewards/pad": 0.25, "step": 2041 }, { "completion_length": 311.078125, "epoch": 0.6507329509241555, "grad_norm": 7.090723514556885, "kl": 0.06787109375, "learning_rate": 3.492670490758445e-07, "loss": 0.0027, "reward": 1.6069159507751465, "reward_std": 0.21890851855278015, "rewards/pad": 0.34375, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.29441598057746887, "step": 2042 }, { "completion_length": 345.390625, "epoch": 0.6510516252390057, "grad_norm": 26.19813346862793, "kl": 0.06396484375, "learning_rate": 3.4894837476099424e-07, "loss": 0.0026, "reward": 1.6184431314468384, "reward_std": 0.10701720416545868, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.35281819105148315, "rewards/pad": 0.265625, "step": 2043 }, { "completion_length": 291.828125, "epoch": 0.651370299553856, "grad_norm": 10.5043306350708, "kl": 0.07470703125, "learning_rate": 3.4862970044614405e-07, "loss": 0.003, "reward": 1.4766829013824463, "reward_std": 0.07539892196655273, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.35168278217315674, "rewards/pad": 0.125, "step": 2044 }, { "completion_length": 198.4375, "epoch": 0.6516889738687062, "grad_norm": 7.1742658615112305, "kl": 0.11328125, "learning_rate": 3.483110261312938e-07, "loss": 0.0045, "reward": 1.6048579216003418, "reward_std": 0.08010513335466385, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.6204828023910522, "step": 2045 }, { "completion_length": 218.125, "epoch": 0.6520076481835564, "grad_norm": 12.691168785095215, "kl": 0.09619140625, "learning_rate": 3.479923518164436e-07, "loss": 0.0038, "reward": 1.5560691356658936, "reward_std": 0.08447499573230743, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4310690462589264, "rewards/pad": 0.125, "step": 2046 }, { "completion_length": 283.890625, "epoch": 0.6523263224984066, "grad_norm": 24.001325607299805, "kl": 0.08740234375, "learning_rate": 3.4767367750159336e-07, "loss": 0.0035, "reward": 1.343886375427246, "reward_std": 0.12355609983205795, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3595114052295685, "step": 2047 }, { "completion_length": 307.515625, "epoch": 0.6526449968132568, "grad_norm": 5.191986083984375, "kl": 0.0673828125, "learning_rate": 3.4735500318674317e-07, "loss": 0.0027, "reward": 1.5148013830184937, "reward_std": 0.0558304563164711, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.38980141282081604, "rewards/pad": 0.125, "step": 2048 }, { "completion_length": 335.328125, "epoch": 0.652963671128107, "grad_norm": 11.43508529663086, "kl": 0.21875, "learning_rate": 3.470363288718929e-07, "loss": 0.0087, "reward": 1.516359806060791, "reward_std": 0.13375940918922424, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.422609806060791, "step": 2049 }, { "completion_length": 221.71875, "epoch": 0.6532823454429573, "grad_norm": 12.406441688537598, "kl": 0.07373046875, "learning_rate": 3.467176545570427e-07, "loss": 0.003, "reward": 1.6535775661468506, "reward_std": 0.09638911485671997, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5285775661468506, "rewards/pad": 0.125, "step": 2050 }, { "completion_length": 325.890625, "epoch": 0.6536010197578075, "grad_norm": 13.22891616821289, "kl": 0.062255859375, "learning_rate": 3.4639898024219243e-07, "loss": 0.0025, "reward": 1.6790329217910767, "reward_std": 0.09938417375087738, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.44465792179107666, "step": 2051 }, { "completion_length": 138.46875, "epoch": 0.6539196940726577, "grad_norm": 16.85652732849121, "kl": 0.1298828125, "learning_rate": 3.4608030592734224e-07, "loss": 0.0052, "reward": 1.7120802402496338, "reward_std": 0.09974721819162369, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.7120803594589233, "rewards/pad": 0.0, "step": 2052 }, { "completion_length": 282.78125, "epoch": 0.6542383683875079, "grad_norm": 20.882482528686523, "kl": 0.083984375, "learning_rate": 3.45761631612492e-07, "loss": 0.0033, "reward": 1.5891286134719849, "reward_std": 0.15410195291042328, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5110036134719849, "step": 2053 }, { "completion_length": 251.390625, "epoch": 0.6545570427023581, "grad_norm": 13.868054389953613, "kl": 0.0927734375, "learning_rate": 3.454429572976418e-07, "loss": 0.0037, "reward": 1.6373168230056763, "reward_std": 0.07170312851667404, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.637316882610321, "step": 2054 }, { "completion_length": 259.09375, "epoch": 0.6548757170172084, "grad_norm": 19.36661148071289, "kl": 0.08642578125, "learning_rate": 3.4512428298279155e-07, "loss": 0.0035, "reward": 1.507773995399475, "reward_std": 0.12215866148471832, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3983990550041199, "step": 2055 }, { "completion_length": 215.59375, "epoch": 0.6551943913320586, "grad_norm": 8.673489570617676, "kl": 0.0927734375, "learning_rate": 3.4480560866794136e-07, "loss": 0.0037, "reward": 1.5000026226043701, "reward_std": 0.05920401215553284, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.37500256299972534, "step": 2056 }, { "completion_length": 215.34375, "epoch": 0.6555130656469088, "grad_norm": 18.26003646850586, "kl": 0.09130859375, "learning_rate": 3.444869343530911e-07, "loss": 0.0036, "reward": 1.7287582159042358, "reward_std": 0.12288461625576019, "rewards/answer_reward": 0.15625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5725082159042358, "step": 2057 }, { "completion_length": 265.703125, "epoch": 0.655831739961759, "grad_norm": 11.515850067138672, "kl": 0.07666015625, "learning_rate": 3.441682600382409e-07, "loss": 0.0031, "reward": 1.430114507675171, "reward_std": 0.13782833516597748, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3051145076751709, "rewards/pad": 0.140625, "step": 2058 }, { "completion_length": 289.265625, "epoch": 0.6561504142766093, "grad_norm": 7.043642044067383, "kl": 0.0927734375, "learning_rate": 3.438495857233907e-07, "loss": 0.0037, "reward": 1.5840719938278198, "reward_std": 0.15137723088264465, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5996969938278198, "step": 2059 }, { "completion_length": 187.375, "epoch": 0.6564690885914596, "grad_norm": 150.14413452148438, "kl": 0.1044921875, "learning_rate": 3.435309114085405e-07, "loss": 0.0042, "reward": 1.5122089385986328, "reward_std": 0.09084093570709229, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.512208878993988, "step": 2060 }, { "completion_length": 346.515625, "epoch": 0.6567877629063098, "grad_norm": 8.237125396728516, "kl": 0.064453125, "learning_rate": 3.4321223709369024e-07, "loss": 0.0026, "reward": 1.3773670196533203, "reward_std": 0.0877017229795456, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4086169898509979, "step": 2061 }, { "completion_length": 387.421875, "epoch": 0.65710643722116, "grad_norm": 14.507115364074707, "kl": 0.07861328125, "learning_rate": 3.4289356277884e-07, "loss": 0.0031, "reward": 1.3933491706848145, "reward_std": 0.13431811332702637, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4089740514755249, "step": 2062 }, { "completion_length": 355.015625, "epoch": 0.6574251115360102, "grad_norm": 14.55032730102539, "kl": 0.07177734375, "learning_rate": 3.425748884639898e-07, "loss": 0.0029, "reward": 1.2918533086776733, "reward_std": 0.08444277942180634, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.32310330867767334, "rewards/pad": 0.0, "step": 2063 }, { "completion_length": 294.796875, "epoch": 0.6577437858508605, "grad_norm": 12.294832229614258, "kl": 0.0810546875, "learning_rate": 3.4225621414913955e-07, "loss": 0.0032, "reward": 1.4564380645751953, "reward_std": 0.08586239069700241, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4564380347728729, "step": 2064 }, { "completion_length": 426.53125, "epoch": 0.6580624601657107, "grad_norm": 5.074521541595459, "kl": 0.05078125, "learning_rate": 3.4193753983428936e-07, "loss": 0.002, "reward": 1.4866843223571777, "reward_std": 0.13601148128509521, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.2679342031478882, "step": 2065 }, { "completion_length": 304.984375, "epoch": 0.6583811344805609, "grad_norm": 32.57131576538086, "kl": 0.09619140625, "learning_rate": 3.416188655194391e-07, "loss": 0.0038, "reward": 1.3942128419876099, "reward_std": 0.1586059182882309, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.42546284198760986, "rewards/pad": 0.0, "step": 2066 }, { "completion_length": 214.171875, "epoch": 0.6586998087954111, "grad_norm": 7.813204765319824, "kl": 0.10009765625, "learning_rate": 3.413001912045889e-07, "loss": 0.004, "reward": 1.6111717224121094, "reward_std": 0.09173314273357391, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6111717820167542, "rewards/pad": 0.0, "step": 2067 }, { "completion_length": 244.546875, "epoch": 0.6590184831102613, "grad_norm": 10.449907302856445, "kl": 0.09521484375, "learning_rate": 3.409815168897387e-07, "loss": 0.0038, "reward": 1.6441354751586914, "reward_std": 0.15547554194927216, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5347604751586914, "step": 2068 }, { "completion_length": 344.46875, "epoch": 0.6593371574251116, "grad_norm": 7.0181660652160645, "kl": 0.059326171875, "learning_rate": 3.406628425748885e-07, "loss": 0.0024, "reward": 1.453713297843933, "reward_std": 0.08794372528791428, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.32871323823928833, "step": 2069 }, { "completion_length": 175.140625, "epoch": 0.6596558317399618, "grad_norm": 10.922768592834473, "kl": 0.107421875, "learning_rate": 3.403441682600382e-07, "loss": 0.0043, "reward": 1.7466790676116943, "reward_std": 0.12864170968532562, "rewards/pad": 0.296875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4498040974140167, "step": 2070 }, { "completion_length": 393.125, "epoch": 0.659974506054812, "grad_norm": 4.762542247772217, "kl": 0.06591796875, "learning_rate": 3.40025493945188e-07, "loss": 0.0026, "reward": 1.5451923608779907, "reward_std": 0.10473742336034775, "rewards/answer_reward": 0.03125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5139423608779907, "step": 2071 }, { "completion_length": 382.59375, "epoch": 0.6602931803696622, "grad_norm": 9.2803373336792, "kl": 0.07373046875, "learning_rate": 3.3970681963033774e-07, "loss": 0.003, "reward": 1.369894027709961, "reward_std": 0.1183726117014885, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.38551896810531616, "rewards/pad": 0.0, "step": 2072 }, { "completion_length": 401.953125, "epoch": 0.6606118546845124, "grad_norm": 10.371095657348633, "kl": 0.06640625, "learning_rate": 3.3938814531548755e-07, "loss": 0.0026, "reward": 1.4870697259902954, "reward_std": 0.11239289492368698, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5026946663856506, "step": 2073 }, { "completion_length": 335.0, "epoch": 0.6609305289993627, "grad_norm": 17.193138122558594, "kl": 0.083984375, "learning_rate": 3.390694710006373e-07, "loss": 0.0034, "reward": 1.389042854309082, "reward_std": 0.10139884054660797, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4046678841114044, "rewards/pad": 0.0, "step": 2074 }, { "completion_length": 448.859375, "epoch": 0.6612492033142129, "grad_norm": 3.9179635047912598, "kl": 0.04833984375, "learning_rate": 3.387507966857871e-07, "loss": 0.0019, "reward": 1.4355924129486084, "reward_std": 0.11411859840154648, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.45121729373931885, "rewards/pad": 0.0, "step": 2075 }, { "completion_length": 218.078125, "epoch": 0.6615678776290631, "grad_norm": 10.488283157348633, "kl": 0.11083984375, "learning_rate": 3.3843212237093687e-07, "loss": 0.0044, "reward": 1.6504709720611572, "reward_std": 0.07368594408035278, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.650471031665802, "rewards/pad": 0.0, "step": 2076 }, { "completion_length": 330.015625, "epoch": 0.6618865519439133, "grad_norm": 5.749670505523682, "kl": 0.08154296875, "learning_rate": 3.3811344805608667e-07, "loss": 0.0033, "reward": 1.6113598346710205, "reward_std": 0.08943575620651245, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.6269847750663757, "step": 2077 }, { "completion_length": 230.9375, "epoch": 0.6622052262587635, "grad_norm": 12.148361206054688, "kl": 0.07861328125, "learning_rate": 3.3779477374123643e-07, "loss": 0.0031, "reward": 1.5412075519561768, "reward_std": 0.17028933763504028, "rewards/answer_reward": 0.140625, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4162076413631439, "step": 2078 }, { "completion_length": 165.96875, "epoch": 0.6625239005736138, "grad_norm": 40.98452377319336, "kl": 0.1142578125, "learning_rate": 3.3747609942638623e-07, "loss": 0.0046, "reward": 1.661864995956421, "reward_std": 0.14011311531066895, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5524899363517761, "step": 2079 }, { "completion_length": 239.84375, "epoch": 0.662842574888464, "grad_norm": 6.925748348236084, "kl": 0.0888671875, "learning_rate": 3.37157425111536e-07, "loss": 0.0036, "reward": 1.6524909734725952, "reward_std": 0.09404580295085907, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4024909734725952, "rewards/pad": 0.25, "step": 2080 }, { "completion_length": 327.015625, "epoch": 0.6631612492033142, "grad_norm": 12.143636703491211, "kl": 0.0791015625, "learning_rate": 3.368387507966858e-07, "loss": 0.0032, "reward": 1.5867152214050293, "reward_std": 0.12560738623142242, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.6023401021957397, "step": 2081 }, { "completion_length": 278.578125, "epoch": 0.6634799235181644, "grad_norm": 6.4376749992370605, "kl": 0.091796875, "learning_rate": 3.3652007648183555e-07, "loss": 0.0037, "reward": 1.5167558193206787, "reward_std": 0.08888687193393707, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5167558193206787, "step": 2082 }, { "completion_length": 334.828125, "epoch": 0.6637985978330146, "grad_norm": 10.173569679260254, "kl": 0.095703125, "learning_rate": 3.3620140216698536e-07, "loss": 0.0038, "reward": 1.6784933805465698, "reward_std": 0.18250435590744019, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.953125, "rewards/tracking_iou_reward": 0.6003684401512146, "step": 2083 }, { "completion_length": 233.4375, "epoch": 0.6641172721478649, "grad_norm": 7.580698013305664, "kl": 0.1416015625, "learning_rate": 3.358827278521351e-07, "loss": 0.0056, "reward": 1.357182264328003, "reward_std": 0.13714271783828735, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.38843220472335815, "rewards/pad": 0.0, "step": 2084 }, { "completion_length": 275.4375, "epoch": 0.6644359464627151, "grad_norm": 27.354007720947266, "kl": 0.1357421875, "learning_rate": 3.355640535372849e-07, "loss": 0.0055, "reward": 1.4340639114379883, "reward_std": 0.1587117612361908, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.35593900084495544, "rewards/pad": 0.09375, "step": 2085 }, { "completion_length": 226.390625, "epoch": 0.6647546207775653, "grad_norm": 11.366966247558594, "kl": 0.0830078125, "learning_rate": 3.3524537922243467e-07, "loss": 0.0033, "reward": 1.5681848526000977, "reward_std": 0.13581296801567078, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4588099718093872, "step": 2086 }, { "completion_length": 180.953125, "epoch": 0.6650732950924155, "grad_norm": 15.23033332824707, "kl": 0.09912109375, "learning_rate": 3.349267049075845e-07, "loss": 0.004, "reward": 1.6432744264602661, "reward_std": 0.1856716275215149, "rewards/answer_reward": 0.21875, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.45577436685562134, "step": 2087 }, { "completion_length": 334.546875, "epoch": 0.6653919694072657, "grad_norm": 5.13187837600708, "kl": 0.06640625, "learning_rate": 3.3460803059273423e-07, "loss": 0.0027, "reward": 1.5802624225616455, "reward_std": 0.09438225626945496, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4708874225616455, "rewards/pad": 0.125, "step": 2088 }, { "completion_length": 392.328125, "epoch": 0.665710643722116, "grad_norm": 14.914807319641113, "kl": 0.05810546875, "learning_rate": 3.34289356277884e-07, "loss": 0.0023, "reward": 1.4108269214630127, "reward_std": 0.07436959445476532, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4108269214630127, "step": 2089 }, { "completion_length": 307.3125, "epoch": 0.6660293180369662, "grad_norm": 11.673267364501953, "kl": 0.0732421875, "learning_rate": 3.3397068196303374e-07, "loss": 0.0029, "reward": 1.2841676473617554, "reward_std": 0.11732315272092819, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.28416767716407776, "rewards/pad": 0.015625, "step": 2090 }, { "completion_length": 515.8125, "epoch": 0.6663479923518164, "grad_norm": 3.9852609634399414, "kl": 0.037109375, "learning_rate": 3.336520076481835e-07, "loss": 0.0015, "reward": 1.4397344589233398, "reward_std": 0.12903329730033875, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.377234548330307, "step": 2091 }, { "completion_length": 323.90625, "epoch": 0.6666666666666666, "grad_norm": 6.877256870269775, "kl": 0.08642578125, "learning_rate": 3.333333333333333e-07, "loss": 0.0034, "reward": 1.566868782043457, "reward_std": 0.09986849129199982, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5668688416481018, "rewards/pad": 0.0, "step": 2092 }, { "completion_length": 424.84375, "epoch": 0.6669853409815168, "grad_norm": 4.864638805389404, "kl": 0.050537109375, "learning_rate": 3.3301465901848305e-07, "loss": 0.002, "reward": 1.5969088077545166, "reward_std": 0.04110327735543251, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4719088077545166, "step": 2093 }, { "completion_length": 186.890625, "epoch": 0.6673040152963671, "grad_norm": 9.601995468139648, "kl": 0.0966796875, "learning_rate": 3.3269598470363286e-07, "loss": 0.0039, "reward": 1.6393721103668213, "reward_std": 0.12142668664455414, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5143721103668213, "rewards/pad": 0.125, "step": 2094 }, { "completion_length": 355.21875, "epoch": 0.6676226896112173, "grad_norm": 9.643255233764648, "kl": 0.0732421875, "learning_rate": 3.323773103887826e-07, "loss": 0.0029, "reward": 1.5644805431365967, "reward_std": 0.045724526047706604, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4394805133342743, "step": 2095 }, { "completion_length": 376.03125, "epoch": 0.6679413639260675, "grad_norm": 6.067450523376465, "kl": 0.053955078125, "learning_rate": 3.320586360739324e-07, "loss": 0.0022, "reward": 1.5394401550292969, "reward_std": 0.15540587902069092, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5706900954246521, "step": 2096 }, { "completion_length": 266.0625, "epoch": 0.6682600382409177, "grad_norm": 13.3276948928833, "kl": 0.06201171875, "learning_rate": 3.317399617590822e-07, "loss": 0.0025, "reward": 1.539819359779358, "reward_std": 0.1312125325202942, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4304443299770355, "step": 2097 }, { "completion_length": 269.125, "epoch": 0.668578712555768, "grad_norm": 14.501863479614258, "kl": 0.09375, "learning_rate": 3.31421287444232e-07, "loss": 0.0038, "reward": 1.5091478824615479, "reward_std": 0.09442204236984253, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5247730016708374, "rewards/pad": 0.0, "step": 2098 }, { "completion_length": 321.046875, "epoch": 0.6688973868706183, "grad_norm": 4.701894760131836, "kl": 0.0732421875, "learning_rate": 3.3110261312938174e-07, "loss": 0.0029, "reward": 1.5360662937164307, "reward_std": 0.19051328301429749, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4423162639141083, "rewards/pad": 0.109375, "step": 2099 }, { "completion_length": 242.984375, "epoch": 0.6692160611854685, "grad_norm": 14.593493461608887, "kl": 0.103515625, "learning_rate": 3.3078393881453154e-07, "loss": 0.0041, "reward": 1.5114970207214355, "reward_std": 0.11804000288248062, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5271220207214355, "step": 2100 }, { "completion_length": 270.296875, "epoch": 0.6695347355003187, "grad_norm": 11.10335636138916, "kl": 0.0830078125, "learning_rate": 3.304652644996813e-07, "loss": 0.0033, "reward": 1.50400972366333, "reward_std": 0.09469564259052277, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5040097236633301, "rewards/pad": 0.0, "step": 2101 }, { "completion_length": 310.46875, "epoch": 0.6698534098151689, "grad_norm": 10.861712455749512, "kl": 0.09716796875, "learning_rate": 3.301465901848311e-07, "loss": 0.0039, "reward": 1.2648383378982544, "reward_std": 0.1338602602481842, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.2804632782936096, "rewards/pad": 0.0, "step": 2102 }, { "completion_length": 215.671875, "epoch": 0.6701720841300192, "grad_norm": 17.331504821777344, "kl": 0.08837890625, "learning_rate": 3.2982791586998086e-07, "loss": 0.0035, "reward": 1.326276183128357, "reward_std": 0.08590371906757355, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.32627618312835693, "step": 2103 }, { "completion_length": 242.3125, "epoch": 0.6704907584448694, "grad_norm": 14.294371604919434, "kl": 0.091796875, "learning_rate": 3.2950924155513067e-07, "loss": 0.0037, "reward": 1.5341930389404297, "reward_std": 0.10620848834514618, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5498180985450745, "rewards/pad": 0.0, "step": 2104 }, { "completion_length": 276.578125, "epoch": 0.6708094327597196, "grad_norm": 5.867783546447754, "kl": 0.0703125, "learning_rate": 3.291905672402804e-07, "loss": 0.0028, "reward": 1.6233067512512207, "reward_std": 0.057741183787584305, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4983067512512207, "rewards/pad": 0.125, "step": 2105 }, { "completion_length": 380.515625, "epoch": 0.6711281070745698, "grad_norm": 10.394988059997559, "kl": 0.06298828125, "learning_rate": 3.2887189292543023e-07, "loss": 0.0025, "reward": 1.4723341464996338, "reward_std": 0.2224801480770111, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.39420920610427856, "step": 2106 }, { "completion_length": 131.578125, "epoch": 0.67144678138942, "grad_norm": 10.822446823120117, "kl": 0.103515625, "learning_rate": 3.2855321861058e-07, "loss": 0.0041, "reward": 1.6905455589294434, "reward_std": 0.20311430096626282, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5342955589294434, "rewards/pad": 0.15625, "step": 2107 }, { "completion_length": 220.984375, "epoch": 0.6717654557042703, "grad_norm": 18.684009552001953, "kl": 0.0791015625, "learning_rate": 3.282345442957298e-07, "loss": 0.0032, "reward": 1.6459847688674927, "reward_std": 0.12653863430023193, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5209848284721375, "rewards/pad": 0.125, "step": 2108 }, { "completion_length": 252.4375, "epoch": 0.6720841300191205, "grad_norm": 6.713516712188721, "kl": 0.083984375, "learning_rate": 3.279158699808795e-07, "loss": 0.0034, "reward": 1.2996958494186401, "reward_std": 0.0696154534816742, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.17469587922096252, "step": 2109 }, { "completion_length": 313.6875, "epoch": 0.6724028043339707, "grad_norm": 35.75420379638672, "kl": 0.0703125, "learning_rate": 3.275971956660293e-07, "loss": 0.0028, "reward": 1.653608798980713, "reward_std": 0.05071251094341278, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5286088585853577, "rewards/pad": 0.125, "step": 2110 }, { "completion_length": 396.140625, "epoch": 0.6727214786488209, "grad_norm": 6.315822601318359, "kl": 0.057373046875, "learning_rate": 3.2727852135117905e-07, "loss": 0.0023, "reward": 1.5398814678192139, "reward_std": 0.1398405134677887, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.44613149762153625, "step": 2111 }, { "completion_length": 275.28125, "epoch": 0.6730401529636711, "grad_norm": 18.19247055053711, "kl": 0.0859375, "learning_rate": 3.2695984703632886e-07, "loss": 0.0034, "reward": 1.7365256547927856, "reward_std": 0.18420612812042236, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5177757740020752, "step": 2112 }, { "completion_length": 216.171875, "epoch": 0.6733588272785214, "grad_norm": 17.706764221191406, "kl": 0.08349609375, "learning_rate": 3.266411727214786e-07, "loss": 0.0033, "reward": 1.43001389503479, "reward_std": 0.11733804643154144, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.44563889503479004, "step": 2113 }, { "completion_length": 280.859375, "epoch": 0.6736775015933716, "grad_norm": 12.78309154510498, "kl": 0.07177734375, "learning_rate": 3.263224984066284e-07, "loss": 0.0029, "reward": 1.4484806060791016, "reward_std": 0.1356436014175415, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.46410566568374634, "rewards/pad": 0.0, "step": 2114 }, { "completion_length": 200.109375, "epoch": 0.6739961759082218, "grad_norm": 10.1220064163208, "kl": 0.099609375, "learning_rate": 3.2600382409177817e-07, "loss": 0.004, "reward": 1.534574270248413, "reward_std": 0.13635796308517456, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4251992702484131, "step": 2115 }, { "completion_length": 333.5, "epoch": 0.674314850223072, "grad_norm": 13.432465553283691, "kl": 0.07763671875, "learning_rate": 3.25685149776928e-07, "loss": 0.0031, "reward": 1.4835143089294434, "reward_std": 0.1128297820687294, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.37413930892944336, "step": 2116 }, { "completion_length": 295.078125, "epoch": 0.6746335245379222, "grad_norm": 6.980226993560791, "kl": 0.07421875, "learning_rate": 3.2536647546207773e-07, "loss": 0.003, "reward": 1.530266523361206, "reward_std": 0.05441119521856308, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.530266523361206, "rewards/pad": 0.0, "step": 2117 }, { "completion_length": 268.21875, "epoch": 0.6749521988527725, "grad_norm": 5.705792427062988, "kl": 0.083984375, "learning_rate": 3.2504780114722754e-07, "loss": 0.0034, "reward": 1.416893482208252, "reward_std": 0.07435958832502365, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4168934226036072, "step": 2118 }, { "completion_length": 309.875, "epoch": 0.6752708731676227, "grad_norm": 7.612441539764404, "kl": 0.08154296875, "learning_rate": 3.247291268323773e-07, "loss": 0.0033, "reward": 1.6166479587554932, "reward_std": 0.18765589594841003, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5228978991508484, "rewards/pad": 0.109375, "step": 2119 }, { "completion_length": 255.640625, "epoch": 0.6755895474824729, "grad_norm": 13.338653564453125, "kl": 0.0830078125, "learning_rate": 3.244104525175271e-07, "loss": 0.0033, "reward": 1.6340851783752441, "reward_std": 0.12460452318191528, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5247102379798889, "rewards/pad": 0.125, "step": 2120 }, { "completion_length": 187.09375, "epoch": 0.6759082217973231, "grad_norm": 65.28882598876953, "kl": 0.11572265625, "learning_rate": 3.2409177820267686e-07, "loss": 0.0046, "reward": 1.5655567646026611, "reward_std": 0.08916109800338745, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5655567049980164, "rewards/pad": 0.0, "step": 2121 }, { "completion_length": 208.203125, "epoch": 0.6762268961121733, "grad_norm": 12.348464012145996, "kl": 0.08837890625, "learning_rate": 3.2377310388782666e-07, "loss": 0.0035, "reward": 1.7495148181915283, "reward_std": 0.09711165726184845, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4995148181915283, "step": 2122 }, { "completion_length": 207.9375, "epoch": 0.6765455704270236, "grad_norm": 7.536280155181885, "kl": 0.0703125, "learning_rate": 3.234544295729764e-07, "loss": 0.0028, "reward": 1.8844282627105713, "reward_std": 0.06893973052501678, "rewards/answer_reward": 0.375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5094282627105713, "step": 2123 }, { "completion_length": 253.8125, "epoch": 0.6768642447418738, "grad_norm": 6.380032062530518, "kl": 0.0849609375, "learning_rate": 3.2313575525812617e-07, "loss": 0.0034, "reward": 1.497983694076538, "reward_std": 0.0429433137178421, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49798381328582764, "rewards/pad": 0.0, "step": 2124 }, { "completion_length": 241.640625, "epoch": 0.677182919056724, "grad_norm": 8.918638229370117, "kl": 0.09228515625, "learning_rate": 3.22817080943276e-07, "loss": 0.0037, "reward": 1.4526426792144775, "reward_std": 0.09334304183721542, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4526427388191223, "step": 2125 }, { "completion_length": 252.984375, "epoch": 0.6775015933715742, "grad_norm": 14.118407249450684, "kl": 0.09228515625, "learning_rate": 3.2249840662842573e-07, "loss": 0.0037, "reward": 1.5071016550064087, "reward_std": 0.1294305920600891, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5227266550064087, "step": 2126 }, { "completion_length": 297.984375, "epoch": 0.6778202676864244, "grad_norm": 15.682186126708984, "kl": 0.0771484375, "learning_rate": 3.2217973231357554e-07, "loss": 0.0031, "reward": 1.5055906772613525, "reward_std": 0.1808309257030487, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.41184061765670776, "rewards/pad": 0.125, "step": 2127 }, { "completion_length": 317.921875, "epoch": 0.6781389420012747, "grad_norm": 10.266303062438965, "kl": 0.0693359375, "learning_rate": 3.2186105799872524e-07, "loss": 0.0028, "reward": 1.4385040998458862, "reward_std": 0.057395100593566895, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.18850407004356384, "step": 2128 }, { "completion_length": 344.046875, "epoch": 0.6784576163161249, "grad_norm": 8.247588157653809, "kl": 0.11572265625, "learning_rate": 3.2154238368387505e-07, "loss": 0.0046, "reward": 1.4720286130905151, "reward_std": 0.0752357468008995, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47202861309051514, "rewards/pad": 0.0, "step": 2129 }, { "completion_length": 197.46875, "epoch": 0.6787762906309751, "grad_norm": 30.725082397460938, "kl": 0.0986328125, "learning_rate": 3.212237093690248e-07, "loss": 0.0039, "reward": 1.365682601928711, "reward_std": 0.05450920760631561, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3656827509403229, "rewards/pad": 0.0, "step": 2130 }, { "completion_length": 254.5625, "epoch": 0.6790949649458253, "grad_norm": 10.495698928833008, "kl": 0.10595703125, "learning_rate": 3.209050350541746e-07, "loss": 0.0042, "reward": 1.5750397443771362, "reward_std": 0.08169522881507874, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.575039803981781, "rewards/pad": 0.0, "step": 2131 }, { "completion_length": 306.203125, "epoch": 0.6794136392606756, "grad_norm": 16.586490631103516, "kl": 0.07373046875, "learning_rate": 3.2058636073932436e-07, "loss": 0.0029, "reward": 1.5276161432266235, "reward_std": 0.07765782624483109, "rewards/answer_reward": 0.21875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.30886608362197876, "step": 2132 }, { "completion_length": 205.203125, "epoch": 0.6797323135755258, "grad_norm": 35.02643966674805, "kl": 0.10302734375, "learning_rate": 3.2026768642447417e-07, "loss": 0.0041, "reward": 1.581708312034607, "reward_std": 0.09050709009170532, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5817083120346069, "step": 2133 }, { "completion_length": 232.484375, "epoch": 0.680050987890376, "grad_norm": 27.2739200592041, "kl": 0.0869140625, "learning_rate": 3.199490121096239e-07, "loss": 0.0035, "reward": 1.488992691040039, "reward_std": 0.08124648034572601, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.36399269104003906, "rewards/pad": 0.125, "step": 2134 }, { "completion_length": 351.234375, "epoch": 0.6803696622052262, "grad_norm": 8.70663070678711, "kl": 0.0771484375, "learning_rate": 3.1963033779477373e-07, "loss": 0.0031, "reward": 1.446648120880127, "reward_std": 0.03918899595737457, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4466480612754822, "step": 2135 }, { "completion_length": 297.484375, "epoch": 0.6806883365200764, "grad_norm": 25.142175674438477, "kl": 0.062255859375, "learning_rate": 3.193116634799235e-07, "loss": 0.0025, "reward": 1.6298236846923828, "reward_std": 0.10432236641645432, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.39544859528541565, "step": 2136 }, { "completion_length": 224.484375, "epoch": 0.6810070108349267, "grad_norm": 7.5887627601623535, "kl": 0.0830078125, "learning_rate": 3.189929891650733e-07, "loss": 0.0033, "reward": 1.6977254152297974, "reward_std": 0.0781145989894867, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5727253556251526, "step": 2137 }, { "completion_length": 242.59375, "epoch": 0.681325685149777, "grad_norm": 16.79660415649414, "kl": 0.09130859375, "learning_rate": 3.1867431485022304e-07, "loss": 0.0037, "reward": 1.3872432708740234, "reward_std": 0.06355497241020203, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3872433304786682, "rewards/pad": 0.0, "step": 2138 }, { "completion_length": 162.296875, "epoch": 0.6816443594646272, "grad_norm": 14.901323318481445, "kl": 0.11474609375, "learning_rate": 3.1835564053537285e-07, "loss": 0.0046, "reward": 1.5048975944519043, "reward_std": 0.12836740911006927, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4892726242542267, "rewards/pad": 0.015625, "step": 2139 }, { "completion_length": 202.75, "epoch": 0.6819630337794774, "grad_norm": 29.60565757751465, "kl": 0.09716796875, "learning_rate": 3.180369662205226e-07, "loss": 0.0039, "reward": 1.6599183082580566, "reward_std": 0.10319438576698303, "rewards/answer_reward": 0.09375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5661684274673462, "step": 2140 }, { "completion_length": 303.40625, "epoch": 0.6822817080943276, "grad_norm": 20.659942626953125, "kl": 0.06201171875, "learning_rate": 3.177182919056724e-07, "loss": 0.0025, "reward": 1.590092420578003, "reward_std": 0.11927967518568039, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5119674801826477, "rewards/pad": 0.078125, "step": 2141 }, { "completion_length": 231.1875, "epoch": 0.6826003824091779, "grad_norm": 12.383918762207031, "kl": 0.08203125, "learning_rate": 3.1739961759082217e-07, "loss": 0.0033, "reward": 1.486088514328003, "reward_std": 0.06917048245668411, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.36108851432800293, "rewards/pad": 0.125, "step": 2142 }, { "completion_length": 244.96875, "epoch": 0.6829190567240281, "grad_norm": 4.997034549713135, "kl": 0.09521484375, "learning_rate": 3.17080943275972e-07, "loss": 0.0038, "reward": 1.5020737648010254, "reward_std": 0.06794846802949905, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.37707382440567017, "step": 2143 }, { "completion_length": 206.140625, "epoch": 0.6832377310388783, "grad_norm": 13.505306243896484, "kl": 0.08203125, "learning_rate": 3.1676226896112173e-07, "loss": 0.0033, "reward": 1.6951855421066284, "reward_std": 0.07091044634580612, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4451856017112732, "step": 2144 }, { "completion_length": 355.734375, "epoch": 0.6835564053537285, "grad_norm": 19.45197296142578, "kl": 0.1767578125, "learning_rate": 3.1644359464627153e-07, "loss": 0.0071, "reward": 1.5212619304656982, "reward_std": 0.09661325812339783, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5368869304656982, "step": 2145 }, { "completion_length": 378.578125, "epoch": 0.6838750796685787, "grad_norm": 10.446003913879395, "kl": 0.052978515625, "learning_rate": 3.161249203314213e-07, "loss": 0.0021, "reward": 1.5361788272857666, "reward_std": 0.15842914581298828, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4424287974834442, "step": 2146 }, { "completion_length": 224.28125, "epoch": 0.684193753983429, "grad_norm": 16.769617080688477, "kl": 0.09130859375, "learning_rate": 3.158062460165711e-07, "loss": 0.0037, "reward": 1.5884971618652344, "reward_std": 0.1478859782218933, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6041221618652344, "rewards/pad": 0.0, "step": 2147 }, { "completion_length": 209.46875, "epoch": 0.6845124282982792, "grad_norm": 14.459205627441406, "kl": 0.0966796875, "learning_rate": 3.154875717017208e-07, "loss": 0.0039, "reward": 1.5993014574050903, "reward_std": 0.1308264136314392, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6149263978004456, "rewards/pad": 0.0, "step": 2148 }, { "completion_length": 166.890625, "epoch": 0.6848311026131294, "grad_norm": 12.089001655578613, "kl": 0.1171875, "learning_rate": 3.151688973868706e-07, "loss": 0.0047, "reward": 1.6179935932159424, "reward_std": 0.14248764514923096, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3836185932159424, "rewards/pad": 0.25, "step": 2149 }, { "completion_length": 262.203125, "epoch": 0.6851497769279796, "grad_norm": 25.056171417236328, "kl": 0.06884765625, "learning_rate": 3.1485022307202036e-07, "loss": 0.0028, "reward": 1.6065418720245361, "reward_std": 0.1253398358821869, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5752919912338257, "step": 2150 }, { "completion_length": 309.8125, "epoch": 0.6854684512428298, "grad_norm": 12.220285415649414, "kl": 0.064453125, "learning_rate": 3.1453154875717016e-07, "loss": 0.0026, "reward": 1.4543453454971313, "reward_std": 0.12550035119056702, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.34497037529945374, "rewards/pad": 0.125, "step": 2151 }, { "completion_length": 238.40625, "epoch": 0.6857871255576801, "grad_norm": 9.260493278503418, "kl": 0.09228515625, "learning_rate": 3.142128744423199e-07, "loss": 0.0037, "reward": 1.6811506748199463, "reward_std": 0.12158751487731934, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5405256748199463, "rewards/pad": 0.140625, "step": 2152 }, { "completion_length": 247.828125, "epoch": 0.6861057998725303, "grad_norm": 16.205915451049805, "kl": 0.11328125, "learning_rate": 3.1389420012746967e-07, "loss": 0.0045, "reward": 1.6481878757476807, "reward_std": 0.09217728674411774, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5231878757476807, "step": 2153 }, { "completion_length": 317.828125, "epoch": 0.6864244741873805, "grad_norm": 9.023106575012207, "kl": 0.07275390625, "learning_rate": 3.135755258126195e-07, "loss": 0.0029, "reward": 1.5768007040023804, "reward_std": 0.09698163717985153, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4518006443977356, "rewards/pad": 0.125, "step": 2154 }, { "completion_length": 239.109375, "epoch": 0.6867431485022307, "grad_norm": 9.951905250549316, "kl": 0.115234375, "learning_rate": 3.1325685149776923e-07, "loss": 0.0046, "reward": 1.6389799118041992, "reward_std": 0.2226027250289917, "rewards/format_reward_tg": 0.953125, "rewards/iou_timestamp_reward": 0.5608548521995544, "rewards/pad": 0.125, "step": 2155 }, { "completion_length": 201.875, "epoch": 0.687061822817081, "grad_norm": 13.409597396850586, "kl": 0.087890625, "learning_rate": 3.1293817718291904e-07, "loss": 0.0035, "reward": 1.8152843713760376, "reward_std": 0.149655282497406, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.5965343713760376, "step": 2156 }, { "completion_length": 171.328125, "epoch": 0.6873804971319312, "grad_norm": 12.817052841186523, "kl": 0.1357421875, "learning_rate": 3.126195028680688e-07, "loss": 0.0054, "reward": 1.5036648511886597, "reward_std": 0.1525655835866928, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4099148213863373, "rewards/pad": 0.109375, "step": 2157 }, { "completion_length": 221.375, "epoch": 0.6876991714467814, "grad_norm": 10.492629051208496, "kl": 0.10498046875, "learning_rate": 3.123008285532186e-07, "loss": 0.0042, "reward": 1.6159346103668213, "reward_std": 0.19899283349514008, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5221847295761108, "step": 2158 }, { "completion_length": 344.296875, "epoch": 0.6880178457616316, "grad_norm": 7.73666524887085, "kl": 0.11279296875, "learning_rate": 3.1198215423836836e-07, "loss": 0.0045, "reward": 1.3504173755645752, "reward_std": 0.09438289701938629, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.36604249477386475, "rewards/pad": 0.0, "step": 2159 }, { "completion_length": 256.203125, "epoch": 0.6883365200764818, "grad_norm": 23.789533615112305, "kl": 0.07666015625, "learning_rate": 3.1166347992351816e-07, "loss": 0.0031, "reward": 1.567455530166626, "reward_std": 0.056159138679504395, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3174554705619812, "step": 2160 }, { "completion_length": 197.8125, "epoch": 0.688655194391332, "grad_norm": 10.632448196411133, "kl": 0.091796875, "learning_rate": 3.113448056086679e-07, "loss": 0.0037, "reward": 1.5515515804290771, "reward_std": 0.14573228359222412, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47342658042907715, "step": 2161 }, { "completion_length": 266.59375, "epoch": 0.6889738687061823, "grad_norm": 18.066492080688477, "kl": 0.0771484375, "learning_rate": 3.110261312938177e-07, "loss": 0.0031, "reward": 1.5058605670928955, "reward_std": 0.10740061849355698, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.38086065649986267, "step": 2162 }, { "completion_length": 209.8125, "epoch": 0.6892925430210325, "grad_norm": 16.501205444335938, "kl": 0.08544921875, "learning_rate": 3.107074569789675e-07, "loss": 0.0034, "reward": 1.7047884464263916, "reward_std": 0.07390573620796204, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4547883868217468, "step": 2163 }, { "completion_length": 153.609375, "epoch": 0.6896112173358827, "grad_norm": 11.284491539001465, "kl": 0.11962890625, "learning_rate": 3.103887826641173e-07, "loss": 0.0048, "reward": 1.6686841249465942, "reward_std": 0.055707305669784546, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6686841249465942, "rewards/pad": 0.0, "step": 2164 }, { "completion_length": 222.65625, "epoch": 0.6899298916507329, "grad_norm": 19.294475555419922, "kl": 0.0947265625, "learning_rate": 3.1007010834926704e-07, "loss": 0.0038, "reward": 1.5718317031860352, "reward_std": 0.15449334681034088, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5093317031860352, "rewards/pad": 0.0625, "step": 2165 }, { "completion_length": 262.125, "epoch": 0.6902485659655831, "grad_norm": 10.915752410888672, "kl": 0.08642578125, "learning_rate": 3.0975143403441685e-07, "loss": 0.0035, "reward": 1.6148755550384521, "reward_std": 0.09238608181476593, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4898754954338074, "step": 2166 }, { "completion_length": 253.265625, "epoch": 0.6905672402804334, "grad_norm": 20.450952529907227, "kl": 0.1025390625, "learning_rate": 3.094327597195666e-07, "loss": 0.0041, "reward": 1.6529321670532227, "reward_std": 0.05808892846107483, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5279321670532227, "step": 2167 }, { "completion_length": 267.890625, "epoch": 0.6908859145952836, "grad_norm": 9.19316577911377, "kl": 0.07080078125, "learning_rate": 3.0911408540471635e-07, "loss": 0.0028, "reward": 1.5765550136566162, "reward_std": 0.12725546956062317, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4671799838542938, "step": 2168 }, { "completion_length": 202.6875, "epoch": 0.6912045889101338, "grad_norm": 19.015901565551758, "kl": 0.11083984375, "learning_rate": 3.087954110898661e-07, "loss": 0.0044, "reward": 1.5361312627792358, "reward_std": 0.14626570045948029, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5517563223838806, "rewards/pad": 0.0, "step": 2169 }, { "completion_length": 184.296875, "epoch": 0.691523263224984, "grad_norm": 14.142531394958496, "kl": 0.0888671875, "learning_rate": 3.084767367750159e-07, "loss": 0.0036, "reward": 1.4200398921966553, "reward_std": 0.1185888797044754, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.2950398325920105, "step": 2170 }, { "completion_length": 324.59375, "epoch": 0.6918419375398343, "grad_norm": 10.305068016052246, "kl": 0.07275390625, "learning_rate": 3.0815806246016567e-07, "loss": 0.0029, "reward": 1.5552586317062378, "reward_std": 0.09846571087837219, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5708836317062378, "step": 2171 }, { "completion_length": 312.921875, "epoch": 0.6921606118546845, "grad_norm": 7.395571708679199, "kl": 0.06298828125, "learning_rate": 3.078393881453155e-07, "loss": 0.0025, "reward": 1.5822372436523438, "reward_std": 0.13736650347709656, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4259873032569885, "rewards/pad": 0.15625, "step": 2172 }, { "completion_length": 200.625, "epoch": 0.6924792861695347, "grad_norm": 13.26996898651123, "kl": 0.10498046875, "learning_rate": 3.0752071383046523e-07, "loss": 0.0042, "reward": 1.5964655876159668, "reward_std": 0.04902723804116249, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5964655876159668, "rewards/pad": 0.0, "step": 2173 }, { "completion_length": 196.3125, "epoch": 0.6927979604843849, "grad_norm": 19.934968948364258, "kl": 0.10498046875, "learning_rate": 3.0720203951561504e-07, "loss": 0.0042, "reward": 1.6381475925445557, "reward_std": 0.11138331890106201, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5131475925445557, "rewards/pad": 0.125, "step": 2174 }, { "completion_length": 140.171875, "epoch": 0.6931166347992351, "grad_norm": 13.879480361938477, "kl": 0.1142578125, "learning_rate": 3.068833652007648e-07, "loss": 0.0046, "reward": 1.5448079109191895, "reward_std": 0.09733173996210098, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5448079109191895, "rewards/pad": 0.0, "step": 2175 }, { "completion_length": 261.96875, "epoch": 0.6934353091140854, "grad_norm": 10.17612075805664, "kl": 0.076171875, "learning_rate": 3.065646908859146e-07, "loss": 0.0031, "reward": 1.7340949773788452, "reward_std": 0.06729496270418167, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48409509658813477, "rewards/pad": 0.25, "step": 2176 }, { "completion_length": 238.140625, "epoch": 0.6937539834289357, "grad_norm": 9.040535926818848, "kl": 0.08447265625, "learning_rate": 3.0624601657106435e-07, "loss": 0.0034, "reward": 1.5748553276062012, "reward_std": 0.07171687483787537, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5748553276062012, "rewards/pad": 0.0, "step": 2177 }, { "completion_length": 219.59375, "epoch": 0.6940726577437859, "grad_norm": 8.737715721130371, "kl": 0.16015625, "learning_rate": 3.0592734225621416e-07, "loss": 0.0064, "reward": 1.494781255722046, "reward_std": 0.08365820348262787, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49478113651275635, "step": 2178 }, { "completion_length": 274.984375, "epoch": 0.6943913320586361, "grad_norm": 27.758153915405273, "kl": 0.0771484375, "learning_rate": 3.056086679413639e-07, "loss": 0.0031, "reward": 1.5322539806365967, "reward_std": 0.09594956040382385, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5478790402412415, "rewards/pad": 0.0, "step": 2179 }, { "completion_length": 278.5, "epoch": 0.6947100063734863, "grad_norm": 8.370512962341309, "kl": 0.08544921875, "learning_rate": 3.052899936265137e-07, "loss": 0.0034, "reward": 1.5229417085647583, "reward_std": 0.09253714978694916, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5385667085647583, "step": 2180 }, { "completion_length": 155.171875, "epoch": 0.6950286806883366, "grad_norm": 38.28666687011719, "kl": 0.1123046875, "learning_rate": 3.049713193116635e-07, "loss": 0.0045, "reward": 1.7286458015441895, "reward_std": 0.15620869398117065, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5098958015441895, "rewards/pad": 0.234375, "step": 2181 }, { "completion_length": 345.28125, "epoch": 0.6953473550031868, "grad_norm": 9.88138484954834, "kl": 0.05419921875, "learning_rate": 3.046526449968133e-07, "loss": 0.0022, "reward": 1.5823705196380615, "reward_std": 0.1268407106399536, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4104955792427063, "step": 2182 }, { "completion_length": 283.203125, "epoch": 0.695666029318037, "grad_norm": 19.334638595581055, "kl": 0.06494140625, "learning_rate": 3.0433397068196304e-07, "loss": 0.0026, "reward": 1.5107245445251465, "reward_std": 0.09902769327163696, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.40134957432746887, "rewards/pad": 0.125, "step": 2183 }, { "completion_length": 257.359375, "epoch": 0.6959847036328872, "grad_norm": 7.854503154754639, "kl": 0.07763671875, "learning_rate": 3.0401529636711284e-07, "loss": 0.0031, "reward": 1.5665841102600098, "reward_std": 0.16165412962436676, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.488459050655365, "step": 2184 }, { "completion_length": 102.671875, "epoch": 0.6963033779477374, "grad_norm": 18.900053024291992, "kl": 0.1240234375, "learning_rate": 3.036966220522626e-07, "loss": 0.005, "reward": 1.6835215091705322, "reward_std": 0.05428507551550865, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.683521568775177, "rewards/pad": 0.0, "step": 2185 }, { "completion_length": 171.46875, "epoch": 0.6966220522625877, "grad_norm": 12.507633209228516, "kl": 0.09619140625, "learning_rate": 3.0337794773741235e-07, "loss": 0.0039, "reward": 1.5419970750808716, "reward_std": 0.08573351800441742, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5419970154762268, "rewards/pad": 0.0, "step": 2186 }, { "completion_length": 291.046875, "epoch": 0.6969407265774379, "grad_norm": 12.020310401916504, "kl": 0.0771484375, "learning_rate": 3.030592734225621e-07, "loss": 0.0031, "reward": 1.5739872455596924, "reward_std": 0.07773323357105255, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5739873647689819, "step": 2187 }, { "completion_length": 192.609375, "epoch": 0.6972594008922881, "grad_norm": 66.1197509765625, "kl": 0.2138671875, "learning_rate": 3.0274059910771186e-07, "loss": 0.0085, "reward": 1.4427610635757446, "reward_std": 0.12355436384677887, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4583861231803894, "rewards/pad": 0.0, "step": 2188 }, { "completion_length": 233.828125, "epoch": 0.6975780752071383, "grad_norm": 15.46832275390625, "kl": 0.11328125, "learning_rate": 3.0242192479286167e-07, "loss": 0.0045, "reward": 1.5915331840515137, "reward_std": 0.11270304024219513, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6071581840515137, "rewards/pad": 0.0, "step": 2189 }, { "completion_length": 304.765625, "epoch": 0.6978967495219885, "grad_norm": 11.094467163085938, "kl": 0.1494140625, "learning_rate": 3.021032504780114e-07, "loss": 0.006, "reward": 1.4202561378479004, "reward_std": 0.09753061085939407, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4202560782432556, "rewards/pad": 0.0, "step": 2190 }, { "completion_length": 205.84375, "epoch": 0.6982154238368388, "grad_norm": 12.484258651733398, "kl": 0.08154296875, "learning_rate": 3.017845761631612e-07, "loss": 0.0033, "reward": 1.753688931465149, "reward_std": 0.07068890333175659, "rewards/answer_reward": 0.375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3786890208721161, "step": 2191 }, { "completion_length": 192.96875, "epoch": 0.698534098151689, "grad_norm": 16.866378784179688, "kl": 0.11083984375, "learning_rate": 3.01465901848311e-07, "loss": 0.0044, "reward": 1.4792733192443848, "reward_std": 0.12700484693050385, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.38552331924438477, "rewards/pad": 0.125, "step": 2192 }, { "completion_length": 367.0625, "epoch": 0.6988527724665392, "grad_norm": 6.141723155975342, "kl": 0.08349609375, "learning_rate": 3.011472275334608e-07, "loss": 0.0033, "reward": 1.4423532485961914, "reward_std": 0.0903225690126419, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.457978218793869, "step": 2193 }, { "completion_length": 239.140625, "epoch": 0.6991714467813894, "grad_norm": 7.0455732345581055, "kl": 0.087890625, "learning_rate": 3.0082855321861054e-07, "loss": 0.0035, "reward": 1.6241099834442139, "reward_std": 0.0879855751991272, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49911004304885864, "rewards/pad": 0.125, "step": 2194 }, { "completion_length": 280.640625, "epoch": 0.6994901210962396, "grad_norm": 17.386064529418945, "kl": 0.07275390625, "learning_rate": 3.0050987890376035e-07, "loss": 0.0029, "reward": 1.50416898727417, "reward_std": 0.08903595060110092, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4260439872741699, "step": 2195 }, { "completion_length": 147.21875, "epoch": 0.6998087954110899, "grad_norm": 13.106439590454102, "kl": 0.10888671875, "learning_rate": 3.001912045889101e-07, "loss": 0.0044, "reward": 1.604874849319458, "reward_std": 0.1985802948474884, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4173748195171356, "rewards/pad": 0.203125, "step": 2196 }, { "completion_length": 192.84375, "epoch": 0.7001274697259401, "grad_norm": 16.086668014526367, "kl": 0.09130859375, "learning_rate": 2.998725302740599e-07, "loss": 0.0036, "reward": 1.608708381652832, "reward_std": 0.057043883949518204, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6087083220481873, "rewards/pad": 0.0, "step": 2197 }, { "completion_length": 232.875, "epoch": 0.7004461440407903, "grad_norm": 10.739706993103027, "kl": 0.0908203125, "learning_rate": 2.9955385595920966e-07, "loss": 0.0036, "reward": 1.5825889110565186, "reward_std": 0.07782239466905594, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5825889110565186, "rewards/pad": 0.0, "step": 2198 }, { "completion_length": 114.265625, "epoch": 0.7007648183556405, "grad_norm": 50.676753997802734, "kl": 0.11865234375, "learning_rate": 2.9923518164435947e-07, "loss": 0.0047, "reward": 1.7058982849121094, "reward_std": 0.19980685412883759, "rewards/answer_reward": 0.140625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5652732849121094, "step": 2199 }, { "completion_length": 191.40625, "epoch": 0.7010834926704907, "grad_norm": 15.388541221618652, "kl": 0.0849609375, "learning_rate": 2.989165073295092e-07, "loss": 0.0034, "reward": 1.5542172193527222, "reward_std": 0.06731617450714111, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3042171895503998, "step": 2200 }, { "completion_length": 230.0625, "epoch": 0.701402166985341, "grad_norm": 10.774901390075684, "kl": 0.0859375, "learning_rate": 2.9859783301465903e-07, "loss": 0.0034, "reward": 1.502295970916748, "reward_std": 0.10654985159635544, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.39292100071907043, "rewards/pad": 0.125, "step": 2201 }, { "completion_length": 252.640625, "epoch": 0.7017208413001912, "grad_norm": 7.72211217880249, "kl": 0.08056640625, "learning_rate": 2.982791586998088e-07, "loss": 0.0032, "reward": 1.6727725267410278, "reward_std": 0.129264697432518, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5633975863456726, "step": 2202 }, { "completion_length": 414.328125, "epoch": 0.7020395156150414, "grad_norm": 9.224809646606445, "kl": 0.06982421875, "learning_rate": 2.979604843849586e-07, "loss": 0.0028, "reward": 1.4881536960601807, "reward_std": 0.16598357260227203, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3787786662578583, "step": 2203 }, { "completion_length": 289.796875, "epoch": 0.7023581899298916, "grad_norm": 7.270951271057129, "kl": 0.083984375, "learning_rate": 2.9764181007010835e-07, "loss": 0.0034, "reward": 1.5620064735412598, "reward_std": 0.07426491379737854, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5620065331459045, "rewards/pad": 0.0, "step": 2204 }, { "completion_length": 199.078125, "epoch": 0.7026768642447419, "grad_norm": 8.401556968688965, "kl": 0.1005859375, "learning_rate": 2.9732313575525815e-07, "loss": 0.004, "reward": 1.50137197971344, "reward_std": 0.09893225878477097, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5013719797134399, "step": 2205 }, { "completion_length": 241.796875, "epoch": 0.7029955385595921, "grad_norm": 13.952909469604492, "kl": 0.08642578125, "learning_rate": 2.970044614404079e-07, "loss": 0.0035, "reward": 1.5688952207565308, "reward_std": 0.22908249497413635, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.490770161151886, "step": 2206 }, { "completion_length": 305.25, "epoch": 0.7033142128744423, "grad_norm": 11.196319580078125, "kl": 0.07470703125, "learning_rate": 2.9668578712555766e-07, "loss": 0.003, "reward": 1.534885048866272, "reward_std": 0.10356839001178741, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.550510048866272, "step": 2207 }, { "completion_length": 167.203125, "epoch": 0.7036328871892925, "grad_norm": 7.589974403381348, "kl": 0.107421875, "learning_rate": 2.963671128107074e-07, "loss": 0.0043, "reward": 1.5548350811004639, "reward_std": 0.14660724997520447, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.38296017050743103, "rewards/pad": 0.171875, "step": 2208 }, { "completion_length": 317.625, "epoch": 0.7039515615041427, "grad_norm": 5.4483962059021, "kl": 0.078125, "learning_rate": 2.960484384958572e-07, "loss": 0.0031, "reward": 1.4976832866668701, "reward_std": 0.12796911597251892, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5289332866668701, "rewards/pad": 0.0, "step": 2209 }, { "completion_length": 262.40625, "epoch": 0.704270235818993, "grad_norm": 31.277687072753906, "kl": 0.1015625, "learning_rate": 2.95729764181007e-07, "loss": 0.0041, "reward": 1.6184595823287964, "reward_std": 0.17132757604122162, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.6497095823287964, "step": 2210 }, { "completion_length": 238.25, "epoch": 0.7045889101338432, "grad_norm": 12.485980033874512, "kl": 0.0869140625, "learning_rate": 2.954110898661568e-07, "loss": 0.0035, "reward": 1.5100154876708984, "reward_std": 0.04877918213605881, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.38501548767089844, "step": 2211 }, { "completion_length": 194.09375, "epoch": 0.7049075844486934, "grad_norm": 9.918503761291504, "kl": 0.0908203125, "learning_rate": 2.9509241555130654e-07, "loss": 0.0036, "reward": 1.5856094360351562, "reward_std": 0.07165414094924927, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4606093764305115, "step": 2212 }, { "completion_length": 336.96875, "epoch": 0.7052262587635436, "grad_norm": 5.346742153167725, "kl": 0.0654296875, "learning_rate": 2.9477374123645634e-07, "loss": 0.0026, "reward": 1.417320966720581, "reward_std": 0.14546999335289001, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.40169599652290344, "rewards/pad": 0.03125, "step": 2213 }, { "completion_length": 274.9375, "epoch": 0.7055449330783938, "grad_norm": 12.8856201171875, "kl": 0.072265625, "learning_rate": 2.944550669216061e-07, "loss": 0.0029, "reward": 1.5375027656555176, "reward_std": 0.11782203614711761, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5531278252601624, "step": 2214 }, { "completion_length": 251.359375, "epoch": 0.705863607393244, "grad_norm": 9.751949310302734, "kl": 0.0830078125, "learning_rate": 2.9413639260675585e-07, "loss": 0.0033, "reward": 1.5835912227630615, "reward_std": 0.10557487607002258, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5210912227630615, "rewards/pad": 0.0625, "step": 2215 }, { "completion_length": 152.375, "epoch": 0.7061822817080943, "grad_norm": 10.901607513427734, "kl": 0.10009765625, "learning_rate": 2.9381771829190566e-07, "loss": 0.004, "reward": 1.6085469722747803, "reward_std": 0.15042608976364136, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.45229700207710266, "rewards/pad": 0.171875, "step": 2216 }, { "completion_length": 296.390625, "epoch": 0.7065009560229446, "grad_norm": 10.667808532714844, "kl": 0.07861328125, "learning_rate": 2.934990439770554e-07, "loss": 0.0031, "reward": 1.5774935483932495, "reward_std": 0.060238465666770935, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4524935781955719, "step": 2217 }, { "completion_length": 209.09375, "epoch": 0.7068196303377948, "grad_norm": 7.474569797515869, "kl": 0.07861328125, "learning_rate": 2.931803696622052e-07, "loss": 0.0031, "reward": 1.9297740459442139, "reward_std": 0.1745995581150055, "rewards/answer_reward": 0.359375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5860241055488586, "step": 2218 }, { "completion_length": 301.015625, "epoch": 0.707138304652645, "grad_norm": 11.891040802001953, "kl": 0.07080078125, "learning_rate": 2.92861695347355e-07, "loss": 0.0028, "reward": 1.4260377883911133, "reward_std": 0.12861579656600952, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.44166284799575806, "rewards/pad": 0.0, "step": 2219 }, { "completion_length": 387.03125, "epoch": 0.7074569789674953, "grad_norm": 12.634696960449219, "kl": 0.0830078125, "learning_rate": 2.925430210325048e-07, "loss": 0.0033, "reward": 1.5856103897094727, "reward_std": 0.13534234464168549, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.49186044931411743, "step": 2220 }, { "completion_length": 266.484375, "epoch": 0.7077756532823455, "grad_norm": 8.80695629119873, "kl": 0.083984375, "learning_rate": 2.9222434671765454e-07, "loss": 0.0033, "reward": 1.4818050861358643, "reward_std": 0.13370054960250854, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.48180508613586426, "step": 2221 }, { "completion_length": 235.59375, "epoch": 0.7080943275971957, "grad_norm": 20.21587562561035, "kl": 0.0869140625, "learning_rate": 2.9190567240280434e-07, "loss": 0.0035, "reward": 1.457129955291748, "reward_std": 0.15341603755950928, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.47275495529174805, "rewards/pad": 0.0, "step": 2222 }, { "completion_length": 143.390625, "epoch": 0.7084130019120459, "grad_norm": 21.482358932495117, "kl": 0.125, "learning_rate": 2.915869980879541e-07, "loss": 0.005, "reward": 1.4829232692718506, "reward_std": 0.14291679859161377, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4985482692718506, "rewards/pad": 0.0, "step": 2223 }, { "completion_length": 318.28125, "epoch": 0.7087316762268961, "grad_norm": 9.289806365966797, "kl": 0.0830078125, "learning_rate": 2.912683237731039e-07, "loss": 0.0033, "reward": 1.5218733549118042, "reward_std": 0.1469164788722992, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4281233549118042, "rewards/pad": 0.109375, "step": 2224 }, { "completion_length": 321.234375, "epoch": 0.7090503505417464, "grad_norm": 7.08596658706665, "kl": 0.07470703125, "learning_rate": 2.9094964945825366e-07, "loss": 0.003, "reward": 1.5650315284729004, "reward_std": 0.06518622487783432, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5650315284729004, "rewards/pad": 0.0, "step": 2225 }, { "completion_length": 190.640625, "epoch": 0.7093690248565966, "grad_norm": 15.268865585327148, "kl": 0.087890625, "learning_rate": 2.9063097514340346e-07, "loss": 0.0035, "reward": 1.6178052425384521, "reward_std": 0.10819917172193527, "rewards/pad": 0.1875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4303053319454193, "step": 2226 }, { "completion_length": 257.265625, "epoch": 0.7096876991714468, "grad_norm": 8.498229026794434, "kl": 0.0966796875, "learning_rate": 2.9031230082855317e-07, "loss": 0.0039, "reward": 1.4241349697113037, "reward_std": 0.143952414393425, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4241349399089813, "step": 2227 }, { "completion_length": 190.03125, "epoch": 0.710006373486297, "grad_norm": 9.867719650268555, "kl": 0.10888671875, "learning_rate": 2.8999362651370297e-07, "loss": 0.0044, "reward": 1.3704220056533813, "reward_std": 0.08321335911750793, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3860470652580261, "rewards/pad": 0.0, "step": 2228 }, { "completion_length": 243.09375, "epoch": 0.7103250478011472, "grad_norm": 6.65676212310791, "kl": 0.059814453125, "learning_rate": 2.896749521988527e-07, "loss": 0.0024, "reward": 1.8582676649093628, "reward_std": 0.20621733367443085, "rewards/pad": 0.484375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3895176947116852, "step": 2229 }, { "completion_length": 259.859375, "epoch": 0.7106437221159975, "grad_norm": 12.128743171691895, "kl": 0.09716796875, "learning_rate": 2.8935627788400253e-07, "loss": 0.0039, "reward": 1.594397783279419, "reward_std": 0.10106487572193146, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.594397783279419, "rewards/pad": 0.0, "step": 2230 }, { "completion_length": 306.15625, "epoch": 0.7109623964308477, "grad_norm": 11.669578552246094, "kl": 0.06396484375, "learning_rate": 2.890376035691523e-07, "loss": 0.0026, "reward": 1.7158432006835938, "reward_std": 0.08693593740463257, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.6064682006835938, "step": 2231 }, { "completion_length": 206.515625, "epoch": 0.7112810707456979, "grad_norm": 18.729063034057617, "kl": 0.12890625, "learning_rate": 2.887189292543021e-07, "loss": 0.0051, "reward": 1.4813673496246338, "reward_std": 0.12282691895961761, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.371992290019989, "step": 2232 }, { "completion_length": 315.859375, "epoch": 0.7115997450605481, "grad_norm": 29.33302116394043, "kl": 0.09326171875, "learning_rate": 2.8840025493945185e-07, "loss": 0.0037, "reward": 1.4940911531448364, "reward_std": 0.05092655122280121, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4940911531448364, "rewards/pad": 0.0, "step": 2233 }, { "completion_length": 266.46875, "epoch": 0.7119184193753983, "grad_norm": 21.585269927978516, "kl": 0.1015625, "learning_rate": 2.8808158062460166e-07, "loss": 0.0041, "reward": 1.5710127353668213, "reward_std": 0.17600609362125397, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5397627949714661, "rewards/pad": 0.046875, "step": 2234 }, { "completion_length": 267.984375, "epoch": 0.7122370936902486, "grad_norm": 7.6832451820373535, "kl": 0.08251953125, "learning_rate": 2.877629063097514e-07, "loss": 0.0033, "reward": 1.5236074924468994, "reward_std": 0.1719309240579605, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5548574924468994, "step": 2235 }, { "completion_length": 152.515625, "epoch": 0.7125557680050988, "grad_norm": 20.99796485900879, "kl": 0.125, "learning_rate": 2.874442319949012e-07, "loss": 0.005, "reward": 1.5960841178894043, "reward_std": 0.0930885523557663, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5960839986801147, "rewards/pad": 0.0, "step": 2236 }, { "completion_length": 343.09375, "epoch": 0.712874442319949, "grad_norm": 21.34062385559082, "kl": 0.0693359375, "learning_rate": 2.8712555768005097e-07, "loss": 0.0028, "reward": 1.4536378383636475, "reward_std": 0.10192979127168655, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4692626893520355, "step": 2237 }, { "completion_length": 207.6875, "epoch": 0.7131931166347992, "grad_norm": 46.177391052246094, "kl": 0.10498046875, "learning_rate": 2.868068833652008e-07, "loss": 0.0042, "reward": 1.6383215188980103, "reward_std": 0.13554789125919342, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4508214294910431, "rewards/pad": 0.1875, "step": 2238 }, { "completion_length": 195.5, "epoch": 0.7135117909496494, "grad_norm": 15.51397705078125, "kl": 0.09228515625, "learning_rate": 2.8648820905035053e-07, "loss": 0.0037, "reward": 1.394288420677185, "reward_std": 0.05121123790740967, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3942883610725403, "step": 2239 }, { "completion_length": 258.84375, "epoch": 0.7138304652644997, "grad_norm": 16.36289405822754, "kl": 0.0771484375, "learning_rate": 2.8616953473550034e-07, "loss": 0.0031, "reward": 1.529281735420227, "reward_std": 0.19541539251804352, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.35740670561790466, "rewards/pad": 0.1875, "step": 2240 }, { "completion_length": 304.640625, "epoch": 0.7141491395793499, "grad_norm": 7.12017822265625, "kl": 0.0751953125, "learning_rate": 2.858508604206501e-07, "loss": 0.003, "reward": 1.5130261182785034, "reward_std": 0.08079716563224792, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5130261182785034, "rewards/pad": 0.0, "step": 2241 }, { "completion_length": 299.890625, "epoch": 0.7144678138942001, "grad_norm": 14.374198913574219, "kl": 0.11279296875, "learning_rate": 2.855321861057999e-07, "loss": 0.0045, "reward": 1.4909179210662842, "reward_std": 0.10724478960037231, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5065428018569946, "step": 2242 }, { "completion_length": 316.75, "epoch": 0.7147864882090503, "grad_norm": 11.825737953186035, "kl": 0.07568359375, "learning_rate": 2.8521351179094965e-07, "loss": 0.003, "reward": 1.5616931915283203, "reward_std": 0.17490637302398682, "rewards/pad": 0.21875, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.37419313192367554, "step": 2243 }, { "completion_length": 357.015625, "epoch": 0.7151051625239006, "grad_norm": 4.220314979553223, "kl": 0.06591796875, "learning_rate": 2.8489483747609946e-07, "loss": 0.0026, "reward": 1.5746936798095703, "reward_std": 0.04713768512010574, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5746935606002808, "step": 2244 }, { "completion_length": 194.375, "epoch": 0.7154238368387508, "grad_norm": 9.533937454223633, "kl": 0.08154296875, "learning_rate": 2.845761631612492e-07, "loss": 0.0033, "reward": 1.9403507709503174, "reward_std": 0.06749410927295685, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5653507709503174, "rewards/pad": 0.375, "step": 2245 }, { "completion_length": 331.3125, "epoch": 0.715742511153601, "grad_norm": 7.48617696762085, "kl": 0.0634765625, "learning_rate": 2.842574888463989e-07, "loss": 0.0025, "reward": 1.603176236152649, "reward_std": 0.1028289645910263, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4313012361526489, "step": 2246 }, { "completion_length": 250.5625, "epoch": 0.7160611854684512, "grad_norm": 10.276695251464844, "kl": 0.10888671875, "learning_rate": 2.839388145315487e-07, "loss": 0.0044, "reward": 1.653708815574646, "reward_std": 0.1339433491230011, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.544333815574646, "rewards/pad": 0.109375, "step": 2247 }, { "completion_length": 412.28125, "epoch": 0.7163798597833014, "grad_norm": 12.946741104125977, "kl": 0.062255859375, "learning_rate": 2.836201402166985e-07, "loss": 0.0025, "reward": 1.4689414501190186, "reward_std": 0.11642967909574509, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4845663905143738, "step": 2248 }, { "completion_length": 282.09375, "epoch": 0.7166985340981517, "grad_norm": 7.711841583251953, "kl": 0.0888671875, "learning_rate": 2.833014659018483e-07, "loss": 0.0035, "reward": 1.492098331451416, "reward_std": 0.11170074343681335, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.38272321224212646, "step": 2249 }, { "completion_length": 196.390625, "epoch": 0.7170172084130019, "grad_norm": 13.00545597076416, "kl": 0.12255859375, "learning_rate": 2.8298279158699804e-07, "loss": 0.0049, "reward": 1.6409862041473389, "reward_std": 0.12882104516029358, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.6566110849380493, "step": 2250 }, { "completion_length": 226.328125, "epoch": 0.7173358827278521, "grad_norm": 14.536446571350098, "kl": 0.0986328125, "learning_rate": 2.8266411727214784e-07, "loss": 0.0039, "reward": 1.617206335067749, "reward_std": 0.08918547630310059, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.507831335067749, "rewards/pad": 0.125, "step": 2251 }, { "completion_length": 316.0, "epoch": 0.7176545570427023, "grad_norm": 29.476350784301758, "kl": 0.07958984375, "learning_rate": 2.823454429572976e-07, "loss": 0.0032, "reward": 1.5252597332000732, "reward_std": 0.10496915876865387, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5408847332000732, "step": 2252 }, { "completion_length": 218.40625, "epoch": 0.7179732313575525, "grad_norm": 39.07975387573242, "kl": 0.1259765625, "learning_rate": 2.820267686424474e-07, "loss": 0.005, "reward": 1.6677427291870117, "reward_std": 0.0700826495885849, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5427427291870117, "step": 2253 }, { "completion_length": 382.328125, "epoch": 0.7182919056724028, "grad_norm": 33.672271728515625, "kl": 0.06201171875, "learning_rate": 2.8170809432759716e-07, "loss": 0.0025, "reward": 1.567516803741455, "reward_std": 0.10253050923347473, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4581417441368103, "step": 2254 }, { "completion_length": 265.90625, "epoch": 0.718610579987253, "grad_norm": 7.391505718231201, "kl": 0.07958984375, "learning_rate": 2.8138942001274697e-07, "loss": 0.0032, "reward": 1.600637674331665, "reward_std": 0.08258798718452454, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3506375551223755, "rewards/pad": 0.25, "step": 2255 }, { "completion_length": 215.796875, "epoch": 0.7189292543021033, "grad_norm": 12.145453453063965, "kl": 0.09716796875, "learning_rate": 2.810707456978967e-07, "loss": 0.0039, "reward": 1.5579832792282104, "reward_std": 0.06622102111577988, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4329833388328552, "step": 2256 }, { "completion_length": 157.46875, "epoch": 0.7192479286169535, "grad_norm": 26.469045639038086, "kl": 0.12109375, "learning_rate": 2.8075207138304653e-07, "loss": 0.0048, "reward": 1.8562612533569336, "reward_std": 0.19814197719097137, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6687612533569336, "rewards/pad": 0.1875, "step": 2257 }, { "completion_length": 164.8125, "epoch": 0.7195666029318037, "grad_norm": 12.529221534729004, "kl": 0.125, "learning_rate": 2.804333970681963e-07, "loss": 0.005, "reward": 1.5287246704101562, "reward_std": 0.1076161339879036, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4037245512008667, "rewards/pad": 0.125, "step": 2258 }, { "completion_length": 393.375, "epoch": 0.719885277246654, "grad_norm": 7.355383396148682, "kl": 0.0498046875, "learning_rate": 2.801147227533461e-07, "loss": 0.002, "reward": 1.5239171981811523, "reward_std": 0.1284276247024536, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.46141716837882996, "step": 2259 }, { "completion_length": 157.8125, "epoch": 0.7202039515615042, "grad_norm": 11.706124305725098, "kl": 0.10498046875, "learning_rate": 2.7979604843849584e-07, "loss": 0.0042, "reward": 1.599716067314148, "reward_std": 0.28268083930015564, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5215910077095032, "rewards/pad": 0.109375, "step": 2260 }, { "completion_length": 216.34375, "epoch": 0.7205226258763544, "grad_norm": 19.255443572998047, "kl": 0.10986328125, "learning_rate": 2.7947737412364565e-07, "loss": 0.0044, "reward": 1.5750744342803955, "reward_std": 0.06895896792411804, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5750743746757507, "rewards/pad": 0.0, "step": 2261 }, { "completion_length": 200.203125, "epoch": 0.7208413001912046, "grad_norm": 12.548673629760742, "kl": 0.11083984375, "learning_rate": 2.791586998087954e-07, "loss": 0.0044, "reward": 1.5564448833465576, "reward_std": 0.09696632623672485, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4314449429512024, "step": 2262 }, { "completion_length": 211.109375, "epoch": 0.7211599745060548, "grad_norm": 16.141681671142578, "kl": 0.12255859375, "learning_rate": 2.788400254939452e-07, "loss": 0.0049, "reward": 1.605320930480957, "reward_std": 0.1554790735244751, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5115709900856018, "step": 2263 }, { "completion_length": 172.0, "epoch": 0.7214786488209051, "grad_norm": 11.038640975952148, "kl": 0.11669921875, "learning_rate": 2.7852135117909496e-07, "loss": 0.0047, "reward": 1.8946161270141602, "reward_std": 0.08197048306465149, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5196161866188049, "rewards/pad": 0.375, "step": 2264 }, { "completion_length": 165.359375, "epoch": 0.7217973231357553, "grad_norm": 24.56603240966797, "kl": 0.08984375, "learning_rate": 2.7820267686424477e-07, "loss": 0.0036, "reward": 1.5874215364456177, "reward_std": 0.12280981987714767, "rewards/answer_reward": 0.28125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3061715364456177, "step": 2265 }, { "completion_length": 241.953125, "epoch": 0.7221159974506055, "grad_norm": 13.44584846496582, "kl": 0.08203125, "learning_rate": 2.7788400254939447e-07, "loss": 0.0033, "reward": 1.4582545757293701, "reward_std": 0.04218538850545883, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.45825451612472534, "step": 2266 }, { "completion_length": 273.109375, "epoch": 0.7224346717654557, "grad_norm": 5.412264347076416, "kl": 0.0859375, "learning_rate": 2.775653282345443e-07, "loss": 0.0034, "reward": 1.7483843564987183, "reward_std": 0.07010708749294281, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.49838435649871826, "step": 2267 }, { "completion_length": 203.796875, "epoch": 0.722753346080306, "grad_norm": 12.466039657592773, "kl": 0.1083984375, "learning_rate": 2.7724665391969403e-07, "loss": 0.0043, "reward": 1.511660099029541, "reward_std": 0.047675177454948425, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.511660099029541, "step": 2268 }, { "completion_length": 299.453125, "epoch": 0.7230720203951562, "grad_norm": 41.059959411621094, "kl": 0.076171875, "learning_rate": 2.7692797960484384e-07, "loss": 0.003, "reward": 1.6285617351531982, "reward_std": 0.10453909635543823, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5035616159439087, "rewards/pad": 0.125, "step": 2269 }, { "completion_length": 216.765625, "epoch": 0.7233906947100064, "grad_norm": 9.3422212600708, "kl": 0.0869140625, "learning_rate": 2.766093052899936e-07, "loss": 0.0035, "reward": 1.7067334651947021, "reward_std": 0.11028394103050232, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4723585247993469, "step": 2270 }, { "completion_length": 307.953125, "epoch": 0.7237093690248566, "grad_norm": 12.578081130981445, "kl": 0.08251953125, "learning_rate": 2.762906309751434e-07, "loss": 0.0033, "reward": 1.5892455577850342, "reward_std": 0.23204761743545532, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.9375, "rewards/tracking_iou_reward": 0.5267455577850342, "step": 2271 }, { "completion_length": 187.015625, "epoch": 0.7240280433397068, "grad_norm": 14.036943435668945, "kl": 0.10498046875, "learning_rate": 2.7597195666029316e-07, "loss": 0.0042, "reward": 1.6664389371871948, "reward_std": 0.10293828696012497, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6664389967918396, "step": 2272 }, { "completion_length": 210.96875, "epoch": 0.724346717654557, "grad_norm": 25.174528121948242, "kl": 0.10400390625, "learning_rate": 2.7565328234544296e-07, "loss": 0.0042, "reward": 1.5564359426498413, "reward_std": 0.06936076283454895, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43143588304519653, "rewards/pad": 0.125, "step": 2273 }, { "completion_length": 155.6875, "epoch": 0.7246653919694073, "grad_norm": 4.558319091796875, "kl": 0.134765625, "learning_rate": 2.753346080305927e-07, "loss": 0.0054, "reward": 1.5315523147583008, "reward_std": 0.0986432284116745, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.422177255153656, "rewards/pad": 0.125, "step": 2274 }, { "completion_length": 313.28125, "epoch": 0.7249840662842575, "grad_norm": 9.25501537322998, "kl": 0.060546875, "learning_rate": 2.750159337157425e-07, "loss": 0.0024, "reward": 1.6354743242263794, "reward_std": 0.1479807049036026, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5417243242263794, "rewards/pad": 0.125, "step": 2275 }, { "completion_length": 112.90625, "epoch": 0.7253027405991077, "grad_norm": 19.316740036010742, "kl": 0.107421875, "learning_rate": 2.746972594008923e-07, "loss": 0.0043, "reward": 1.7034404277801514, "reward_std": 0.05819811671972275, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.453440397977829, "rewards/pad": 0.25, "step": 2276 }, { "completion_length": 339.046875, "epoch": 0.7256214149139579, "grad_norm": 6.1517815589904785, "kl": 0.07763671875, "learning_rate": 2.7437858508604203e-07, "loss": 0.0031, "reward": 1.485317349433899, "reward_std": 0.15594914555549622, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.37594231963157654, "step": 2277 }, { "completion_length": 237.09375, "epoch": 0.7259400892288081, "grad_norm": 22.668804168701172, "kl": 0.0888671875, "learning_rate": 2.7405991077119184e-07, "loss": 0.0036, "reward": 1.654649019241333, "reward_std": 0.14173540472984314, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5921490788459778, "rewards/pad": 0.0625, "step": 2278 }, { "completion_length": 179.484375, "epoch": 0.7262587635436584, "grad_norm": 13.707090377807617, "kl": 0.11328125, "learning_rate": 2.737412364563416e-07, "loss": 0.0045, "reward": 1.6936432123184204, "reward_std": 0.1634928286075592, "rewards/answer_reward": 0.265625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4280182123184204, "step": 2279 }, { "completion_length": 212.328125, "epoch": 0.7265774378585086, "grad_norm": 11.429169654846191, "kl": 0.111328125, "learning_rate": 2.734225621414914e-07, "loss": 0.0044, "reward": 1.50600004196167, "reward_std": 0.13330598175525665, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5216250419616699, "rewards/pad": 0.0, "step": 2280 }, { "completion_length": 312.8125, "epoch": 0.7268961121733588, "grad_norm": 9.42291259765625, "kl": 0.0751953125, "learning_rate": 2.7310388782664115e-07, "loss": 0.003, "reward": 1.5466002225875854, "reward_std": 0.10822071880102158, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4372251033782959, "step": 2281 }, { "completion_length": 324.96875, "epoch": 0.727214786488209, "grad_norm": 13.664058685302734, "kl": 0.06396484375, "learning_rate": 2.7278521351179096e-07, "loss": 0.0026, "reward": 1.5122783184051514, "reward_std": 0.09490574896335602, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.40290340781211853, "step": 2282 }, { "completion_length": 159.9375, "epoch": 0.7275334608030593, "grad_norm": 8.65300464630127, "kl": 0.10009765625, "learning_rate": 2.724665391969407e-07, "loss": 0.004, "reward": 1.7262582778930664, "reward_std": 0.08647529780864716, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.47625815868377686, "step": 2283 }, { "completion_length": 281.9375, "epoch": 0.7278521351179095, "grad_norm": 14.41006088256836, "kl": 0.076171875, "learning_rate": 2.721478648820905e-07, "loss": 0.003, "reward": 1.639107346534729, "reward_std": 0.10988225042819977, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5297324657440186, "rewards/pad": 0.125, "step": 2284 }, { "completion_length": 111.6875, "epoch": 0.7281708094327597, "grad_norm": 24.305002212524414, "kl": 0.138671875, "learning_rate": 2.718291905672402e-07, "loss": 0.0055, "reward": 1.6760501861572266, "reward_std": 0.19735486805438995, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5823003053665161, "rewards/pad": 0.109375, "step": 2285 }, { "completion_length": 331.8125, "epoch": 0.7284894837476099, "grad_norm": 5.85117769241333, "kl": 0.0625, "learning_rate": 2.7151051625239003e-07, "loss": 0.0025, "reward": 1.3858469724655151, "reward_std": 0.0686102956533432, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.38584691286087036, "step": 2286 }, { "completion_length": 303.90625, "epoch": 0.7288081580624601, "grad_norm": 32.89085388183594, "kl": 0.11181640625, "learning_rate": 2.711918419375398e-07, "loss": 0.0045, "reward": 1.343575119972229, "reward_std": 0.09186676144599915, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.343575119972229, "rewards/pad": 0.0, "step": 2287 }, { "completion_length": 224.28125, "epoch": 0.7291268323773104, "grad_norm": 16.360694885253906, "kl": 0.109375, "learning_rate": 2.708731676226896e-07, "loss": 0.0044, "reward": 1.630078673362732, "reward_std": 0.09970824420452118, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.6457037329673767, "step": 2288 }, { "completion_length": 212.65625, "epoch": 0.7294455066921606, "grad_norm": 90.82035064697266, "kl": 0.099609375, "learning_rate": 2.7055449330783934e-07, "loss": 0.004, "reward": 1.3920116424560547, "reward_std": 0.06360248476266861, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3920116424560547, "rewards/pad": 0.0, "step": 2289 }, { "completion_length": 381.640625, "epoch": 0.7297641810070108, "grad_norm": 11.928213119506836, "kl": 0.05517578125, "learning_rate": 2.7023581899298915e-07, "loss": 0.0022, "reward": 1.4683802127838135, "reward_std": 0.06154194846749306, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4683801829814911, "rewards/pad": 0.0, "step": 2290 }, { "completion_length": 294.578125, "epoch": 0.730082855321861, "grad_norm": 25.566471099853516, "kl": 0.09228515625, "learning_rate": 2.699171446781389e-07, "loss": 0.0037, "reward": 1.4241492748260498, "reward_std": 0.15201619267463684, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4553992748260498, "step": 2291 }, { "completion_length": 361.625, "epoch": 0.7304015296367112, "grad_norm": 38.10639190673828, "kl": 0.059326171875, "learning_rate": 2.695984703632887e-07, "loss": 0.0024, "reward": 1.5291061401367188, "reward_std": 0.11724649369716644, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45098114013671875, "rewards/pad": 0.078125, "step": 2292 }, { "completion_length": 207.078125, "epoch": 0.7307202039515615, "grad_norm": 15.148269653320312, "kl": 0.1298828125, "learning_rate": 2.6927979604843847e-07, "loss": 0.0052, "reward": 1.7448357343673706, "reward_std": 0.19660058617591858, "rewards/pad": 0.203125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5573357343673706, "step": 2293 }, { "completion_length": 113.921875, "epoch": 0.7310388782664117, "grad_norm": 26.689838409423828, "kl": 0.134765625, "learning_rate": 2.689611217335883e-07, "loss": 0.0054, "reward": 1.82138991355896, "reward_std": 0.1653740108013153, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.57138991355896, "step": 2294 }, { "completion_length": 281.15625, "epoch": 0.731357552581262, "grad_norm": 10.188474655151367, "kl": 0.091796875, "learning_rate": 2.6864244741873803e-07, "loss": 0.0037, "reward": 1.4889552593231201, "reward_std": 0.12610562145709991, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5045802593231201, "rewards/pad": 0.0, "step": 2295 }, { "completion_length": 318.375, "epoch": 0.7316762268961122, "grad_norm": 5.968549728393555, "kl": 0.06396484375, "learning_rate": 2.6832377310388783e-07, "loss": 0.0026, "reward": 1.3989291191101074, "reward_std": 0.16393840312957764, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3051791191101074, "step": 2296 }, { "completion_length": 303.234375, "epoch": 0.7319949012109624, "grad_norm": 10.959392547607422, "kl": 0.06396484375, "learning_rate": 2.680050987890376e-07, "loss": 0.0026, "reward": 1.6067535877227783, "reward_std": 0.1501012146472931, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5130035877227783, "step": 2297 }, { "completion_length": 268.875, "epoch": 0.7323135755258127, "grad_norm": 11.456298828125, "kl": 0.08447265625, "learning_rate": 2.676864244741874e-07, "loss": 0.0034, "reward": 1.4659645557403564, "reward_std": 0.11348327994346619, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46596449613571167, "rewards/pad": 0.0, "step": 2298 }, { "completion_length": 230.703125, "epoch": 0.7326322498406629, "grad_norm": 41.293704986572266, "kl": 0.123046875, "learning_rate": 2.6736775015933715e-07, "loss": 0.0049, "reward": 1.5611324310302734, "reward_std": 0.12068554759025574, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5298823714256287, "rewards/pad": 0.03125, "step": 2299 }, { "completion_length": 248.234375, "epoch": 0.7329509241555131, "grad_norm": 18.223880767822266, "kl": 0.080078125, "learning_rate": 2.6704907584448696e-07, "loss": 0.0032, "reward": 1.879724383354187, "reward_std": 0.1355212926864624, "rewards/pad": 0.375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5203494429588318, "step": 2300 }, { "completion_length": 153.40625, "epoch": 0.7332695984703633, "grad_norm": 22.09395980834961, "kl": 0.115234375, "learning_rate": 2.667304015296367e-07, "loss": 0.0046, "reward": 1.470009207725525, "reward_std": 0.14496728777885437, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4700092077255249, "rewards/pad": 0.0, "step": 2301 }, { "completion_length": 210.1875, "epoch": 0.7335882727852135, "grad_norm": 9.805888175964355, "kl": 0.109375, "learning_rate": 2.664117272147865e-07, "loss": 0.0044, "reward": 1.6735904216766357, "reward_std": 0.08329583704471588, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.548590362071991, "rewards/pad": 0.125, "step": 2302 }, { "completion_length": 106.1875, "epoch": 0.7339069471000638, "grad_norm": 15.89492416381836, "kl": 0.1728515625, "learning_rate": 2.6609305289993627e-07, "loss": 0.0069, "reward": 1.5772063732147217, "reward_std": 0.13957607746124268, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45220643281936646, "rewards/pad": 0.125, "step": 2303 }, { "completion_length": 268.859375, "epoch": 0.734225621414914, "grad_norm": 9.0073881149292, "kl": 0.08544921875, "learning_rate": 2.657743785850861e-07, "loss": 0.0034, "reward": 1.5893383026123047, "reward_std": 0.10896583646535873, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4955883026123047, "step": 2304 }, { "completion_length": 243.046875, "epoch": 0.7345442957297642, "grad_norm": 6.57396936416626, "kl": 0.09130859375, "learning_rate": 2.654557042702358e-07, "loss": 0.0036, "reward": 1.7307865619659424, "reward_std": 0.10434369742870331, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6057864427566528, "rewards/pad": 0.125, "step": 2305 }, { "completion_length": 288.90625, "epoch": 0.7348629700446144, "grad_norm": 5.094200134277344, "kl": 0.064453125, "learning_rate": 2.651370299553856e-07, "loss": 0.0026, "reward": 1.5313658714294434, "reward_std": 0.10805720835924149, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4376158118247986, "rewards/pad": 0.09375, "step": 2306 }, { "completion_length": 193.25, "epoch": 0.7351816443594646, "grad_norm": 43.746063232421875, "kl": 0.0947265625, "learning_rate": 2.6481835564053534e-07, "loss": 0.0038, "reward": 1.545185923576355, "reward_std": 0.15073031187057495, "rewards/answer_reward": 0.078125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4670608937740326, "step": 2307 }, { "completion_length": 220.703125, "epoch": 0.7355003186743149, "grad_norm": 11.71030330657959, "kl": 0.11865234375, "learning_rate": 2.644996813256851e-07, "loss": 0.0048, "reward": 1.5706113576889038, "reward_std": 0.1904187798500061, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4612364172935486, "step": 2308 }, { "completion_length": 197.125, "epoch": 0.7358189929891651, "grad_norm": 23.52389907836914, "kl": 0.09716796875, "learning_rate": 2.641810070108349e-07, "loss": 0.0039, "reward": 1.6343523263931274, "reward_std": 0.14043483138084412, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5093523263931274, "rewards/pad": 0.125, "step": 2309 }, { "completion_length": 292.390625, "epoch": 0.7361376673040153, "grad_norm": 23.054824829101562, "kl": 0.06640625, "learning_rate": 2.6386233269598466e-07, "loss": 0.0027, "reward": 1.565664529800415, "reward_std": 0.163468599319458, "rewards/pad": 0.1875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.39378952980041504, "step": 2310 }, { "completion_length": 217.765625, "epoch": 0.7364563416188655, "grad_norm": 9.4763765335083, "kl": 0.076171875, "learning_rate": 2.6354365838113446e-07, "loss": 0.003, "reward": 1.7144595384597778, "reward_std": 0.10374715924263, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4800845682621002, "rewards/pad": 0.234375, "step": 2311 }, { "completion_length": 234.046875, "epoch": 0.7367750159337157, "grad_norm": 14.154435157775879, "kl": 0.1064453125, "learning_rate": 2.632249840662842e-07, "loss": 0.0043, "reward": 1.5973759889602661, "reward_std": 0.13200649619102478, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4880009591579437, "rewards/pad": 0.125, "step": 2312 }, { "completion_length": 242.140625, "epoch": 0.737093690248566, "grad_norm": 9.745589256286621, "kl": 0.0869140625, "learning_rate": 2.62906309751434e-07, "loss": 0.0035, "reward": 1.4887791872024536, "reward_std": 0.0891360491514206, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3637791574001312, "step": 2313 }, { "completion_length": 182.015625, "epoch": 0.7374123645634162, "grad_norm": 55.06594467163086, "kl": 0.09765625, "learning_rate": 2.625876354365838e-07, "loss": 0.0039, "reward": 1.7070443630218506, "reward_std": 0.072098508477211, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5820443630218506, "step": 2314 }, { "completion_length": 253.75, "epoch": 0.7377310388782664, "grad_norm": 34.84463882446289, "kl": 0.078125, "learning_rate": 2.622689611217336e-07, "loss": 0.0031, "reward": 1.5879429578781128, "reward_std": 0.15225547552108765, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4316929578781128, "rewards/pad": 0.15625, "step": 2315 }, { "completion_length": 223.125, "epoch": 0.7380497131931166, "grad_norm": 8.495549201965332, "kl": 0.0810546875, "learning_rate": 2.6195028680688334e-07, "loss": 0.0032, "reward": 1.5012521743774414, "reward_std": 0.14926576614379883, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.34500211477279663, "step": 2316 }, { "completion_length": 147.171875, "epoch": 0.7383683875079669, "grad_norm": 7.767884254455566, "kl": 0.10498046875, "learning_rate": 2.6163161249203315e-07, "loss": 0.0042, "reward": 1.7319977283477783, "reward_std": 0.04964713752269745, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.6069977283477783, "step": 2317 }, { "completion_length": 240.015625, "epoch": 0.7386870618228171, "grad_norm": 41.17066955566406, "kl": 0.07373046875, "learning_rate": 2.613129381771829e-07, "loss": 0.0029, "reward": 1.4253292083740234, "reward_std": 0.13328561186790466, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4409542679786682, "step": 2318 }, { "completion_length": 106.796875, "epoch": 0.7390057361376673, "grad_norm": 428.2945556640625, "kl": 1.484375, "learning_rate": 2.609942638623327e-07, "loss": 0.0595, "reward": 1.5589778423309326, "reward_std": 0.1439211368560791, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.433977872133255, "rewards/pad": 0.125, "step": 2319 }, { "completion_length": 243.734375, "epoch": 0.7393244104525175, "grad_norm": 30.449155807495117, "kl": 0.08642578125, "learning_rate": 2.6067558954748246e-07, "loss": 0.0034, "reward": 1.480348825454712, "reward_std": 0.11358199268579483, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3553488254547119, "step": 2320 }, { "completion_length": 334.75, "epoch": 0.7396430847673677, "grad_norm": 8.932828903198242, "kl": 0.07275390625, "learning_rate": 2.6035691523263227e-07, "loss": 0.0029, "reward": 1.5339847803115845, "reward_std": 0.06272891163825989, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5339848399162292, "step": 2321 }, { "completion_length": 280.109375, "epoch": 0.739961759082218, "grad_norm": 14.46866512298584, "kl": 0.0712890625, "learning_rate": 2.60038240917782e-07, "loss": 0.0028, "reward": 1.4635485410690308, "reward_std": 0.0832393690943718, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3385485112667084, "rewards/pad": 0.125, "step": 2322 }, { "completion_length": 303.859375, "epoch": 0.7402804333970682, "grad_norm": 7.381771564483643, "kl": 0.052734375, "learning_rate": 2.5971956660293183e-07, "loss": 0.0021, "reward": 1.7793185710906982, "reward_std": 0.07244474440813065, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.529318630695343, "step": 2323 }, { "completion_length": 136.515625, "epoch": 0.7405991077119184, "grad_norm": 9.891907691955566, "kl": 0.126953125, "learning_rate": 2.594008922880816e-07, "loss": 0.0051, "reward": 1.4616608619689941, "reward_std": 0.07319878041744232, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4616607427597046, "rewards/pad": 0.0, "step": 2324 }, { "completion_length": 243.015625, "epoch": 0.7409177820267686, "grad_norm": 23.877626419067383, "kl": 0.0849609375, "learning_rate": 2.5908221797323134e-07, "loss": 0.0034, "reward": 1.4740469455718994, "reward_std": 0.21762168407440186, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3802970051765442, "step": 2325 }, { "completion_length": 286.09375, "epoch": 0.7412364563416188, "grad_norm": 6.268517971038818, "kl": 0.06640625, "learning_rate": 2.587635436583811e-07, "loss": 0.0027, "reward": 1.504499912261963, "reward_std": 0.0882675051689148, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3951248824596405, "rewards/pad": 0.125, "step": 2326 }, { "completion_length": 194.625, "epoch": 0.741555130656469, "grad_norm": 6.7995429039001465, "kl": 0.10595703125, "learning_rate": 2.584448693435309e-07, "loss": 0.0042, "reward": 1.5988112688064575, "reward_std": 0.11524415016174316, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4738112986087799, "rewards/pad": 0.125, "step": 2327 }, { "completion_length": 111.0625, "epoch": 0.7418738049713193, "grad_norm": 18.028059005737305, "kl": 0.10791015625, "learning_rate": 2.5812619502868065e-07, "loss": 0.0043, "reward": 1.900662899017334, "reward_std": 0.12899482250213623, "rewards/answer_reward": 0.375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5256630182266235, "step": 2328 }, { "completion_length": 199.359375, "epoch": 0.7421924792861695, "grad_norm": 24.136371612548828, "kl": 0.0966796875, "learning_rate": 2.5780752071383046e-07, "loss": 0.0039, "reward": 1.6215674877166748, "reward_std": 0.11621560156345367, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5121925473213196, "rewards/pad": 0.125, "step": 2329 }, { "completion_length": 317.078125, "epoch": 0.7425111536010197, "grad_norm": 5.354953765869141, "kl": 0.078125, "learning_rate": 2.574888463989802e-07, "loss": 0.0031, "reward": 1.3678014278411865, "reward_std": 0.03476231172680855, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3678014278411865, "step": 2330 }, { "completion_length": 203.90625, "epoch": 0.7428298279158699, "grad_norm": 13.329392433166504, "kl": 0.109375, "learning_rate": 2.5717017208413e-07, "loss": 0.0044, "reward": 1.6113553047180176, "reward_std": 0.12751217186450958, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5488553047180176, "step": 2331 }, { "completion_length": 350.703125, "epoch": 0.7431485022307202, "grad_norm": 14.267176628112793, "kl": 0.07666015625, "learning_rate": 2.568514977692798e-07, "loss": 0.0031, "reward": 1.491163969039917, "reward_std": 0.03700249269604683, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49116405844688416, "step": 2332 }, { "completion_length": 192.890625, "epoch": 0.7434671765455704, "grad_norm": 24.421035766601562, "kl": 0.0849609375, "learning_rate": 2.565328234544296e-07, "loss": 0.0034, "reward": 1.6897307634353638, "reward_std": 0.06476552784442902, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43973079323768616, "step": 2333 }, { "completion_length": 242.234375, "epoch": 0.7437858508604207, "grad_norm": 10.342123031616211, "kl": 0.0830078125, "learning_rate": 2.5621414913957934e-07, "loss": 0.0033, "reward": 1.543015956878662, "reward_std": 0.09575996547937393, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5430158972740173, "rewards/pad": 0.0, "step": 2334 }, { "completion_length": 141.1875, "epoch": 0.7441045251752709, "grad_norm": 10.832265853881836, "kl": 0.1162109375, "learning_rate": 2.5589547482472914e-07, "loss": 0.0046, "reward": 1.7195611000061035, "reward_std": 0.10429985076189041, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5945611000061035, "rewards/pad": 0.125, "step": 2335 }, { "completion_length": 282.234375, "epoch": 0.7444231994901211, "grad_norm": 10.534006118774414, "kl": 0.08984375, "learning_rate": 2.555768005098789e-07, "loss": 0.0036, "reward": 1.4840242862701416, "reward_std": 0.06869726628065109, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3590242266654968, "step": 2336 }, { "completion_length": 243.046875, "epoch": 0.7447418738049714, "grad_norm": 8.319096565246582, "kl": 0.0927734375, "learning_rate": 2.552581261950287e-07, "loss": 0.0037, "reward": 1.5507136583328247, "reward_std": 0.12487111240625381, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5663386583328247, "step": 2337 }, { "completion_length": 192.734375, "epoch": 0.7450605481198216, "grad_norm": 9.370587348937988, "kl": 0.103515625, "learning_rate": 2.5493945188017846e-07, "loss": 0.0041, "reward": 1.6424996852874756, "reward_std": 0.09512320160865784, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5174996852874756, "step": 2338 }, { "completion_length": 214.1875, "epoch": 0.7453792224346718, "grad_norm": 13.986674308776855, "kl": 0.08984375, "learning_rate": 2.546207775653282e-07, "loss": 0.0036, "reward": 1.5424704551696777, "reward_std": 0.036111533641815186, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5424704551696777, "rewards/pad": 0.0, "step": 2339 }, { "completion_length": 142.5, "epoch": 0.745697896749522, "grad_norm": 11.817384719848633, "kl": 0.1171875, "learning_rate": 2.54302103250478e-07, "loss": 0.0047, "reward": 1.5492959022521973, "reward_std": 0.10030539333820343, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5649209022521973, "rewards/pad": 0.0, "step": 2340 }, { "completion_length": 179.484375, "epoch": 0.7460165710643722, "grad_norm": 10.856732368469238, "kl": 0.11376953125, "learning_rate": 2.5398342893562777e-07, "loss": 0.0046, "reward": 1.516695261001587, "reward_std": 0.08297878503799438, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3916953206062317, "step": 2341 }, { "completion_length": 218.71875, "epoch": 0.7463352453792225, "grad_norm": 23.0338077545166, "kl": 0.08544921875, "learning_rate": 2.536647546207776e-07, "loss": 0.0034, "reward": 1.5617284774780273, "reward_std": 0.09402166306972504, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.45235344767570496, "step": 2342 }, { "completion_length": 149.796875, "epoch": 0.7466539196940727, "grad_norm": 11.982497215270996, "kl": 0.10546875, "learning_rate": 2.5334608030592733e-07, "loss": 0.0042, "reward": 1.6303856372833252, "reward_std": 0.17340770363807678, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5835106372833252, "rewards/pad": 0.046875, "step": 2343 }, { "completion_length": 346.765625, "epoch": 0.7469725940089229, "grad_norm": 6.833131313323975, "kl": 0.05517578125, "learning_rate": 2.530274059910771e-07, "loss": 0.0022, "reward": 1.2404136657714844, "reward_std": 0.1525970995426178, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.2716636657714844, "rewards/pad": 0.0, "step": 2344 }, { "completion_length": 192.640625, "epoch": 0.7472912683237731, "grad_norm": 9.509944915771484, "kl": 0.087890625, "learning_rate": 2.5270873167622684e-07, "loss": 0.0035, "reward": 1.6787506341934204, "reward_std": 0.057284578680992126, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4287506341934204, "step": 2345 }, { "completion_length": 268.9375, "epoch": 0.7476099426386233, "grad_norm": 33.24864959716797, "kl": 0.07666015625, "learning_rate": 2.5239005736137665e-07, "loss": 0.0031, "reward": 1.7066154479980469, "reward_std": 0.11836501955986023, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5972404479980469, "step": 2346 }, { "completion_length": 223.25, "epoch": 0.7479286169534736, "grad_norm": 21.398069381713867, "kl": 0.080078125, "learning_rate": 2.520713830465264e-07, "loss": 0.0032, "reward": 1.3415656089782715, "reward_std": 0.06385195255279541, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.34156566858291626, "rewards/pad": 0.0, "step": 2347 }, { "completion_length": 223.0, "epoch": 0.7482472912683238, "grad_norm": 10.627423286437988, "kl": 0.0859375, "learning_rate": 2.517527087316762e-07, "loss": 0.0034, "reward": 1.5851722955703735, "reward_std": 0.06106871739029884, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46017220616340637, "rewards/pad": 0.125, "step": 2348 }, { "completion_length": 222.15625, "epoch": 0.748565965583174, "grad_norm": 94.22887420654297, "kl": 0.10009765625, "learning_rate": 2.5143403441682596e-07, "loss": 0.004, "reward": 1.6433560848236084, "reward_std": 0.14962291717529297, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5652311444282532, "step": 2349 }, { "completion_length": 202.609375, "epoch": 0.7488846398980242, "grad_norm": 52.1790657043457, "kl": 0.08154296875, "learning_rate": 2.5111536010197577e-07, "loss": 0.0033, "reward": 1.7037962675094604, "reward_std": 0.16860975325107574, "rewards/answer_reward": 0.140625, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.57879638671875, "step": 2350 }, { "completion_length": 229.140625, "epoch": 0.7492033142128744, "grad_norm": 6.283476829528809, "kl": 0.0908203125, "learning_rate": 2.507966857871255e-07, "loss": 0.0036, "reward": 1.5790940523147583, "reward_std": 0.14463365077972412, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5634690523147583, "rewards/pad": 0.015625, "step": 2351 }, { "completion_length": 183.25, "epoch": 0.7495219885277247, "grad_norm": 9.41490650177002, "kl": 0.1044921875, "learning_rate": 2.5047801147227533e-07, "loss": 0.0042, "reward": 1.7538079023361206, "reward_std": 0.11042501032352448, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6131829023361206, "rewards/pad": 0.140625, "step": 2352 }, { "completion_length": 267.78125, "epoch": 0.7498406628425749, "grad_norm": 15.176810264587402, "kl": 0.07763671875, "learning_rate": 2.501593371574251e-07, "loss": 0.0031, "reward": 1.6882057189941406, "reward_std": 0.04086899757385254, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5632058382034302, "step": 2353 }, { "completion_length": 241.53125, "epoch": 0.7501593371574251, "grad_norm": 8.03711223602295, "kl": 0.09326171875, "learning_rate": 2.498406628425749e-07, "loss": 0.0037, "reward": 1.7172772884368896, "reward_std": 0.13023245334625244, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46727728843688965, "rewards/pad": 0.25, "step": 2354 }, { "completion_length": 317.484375, "epoch": 0.7504780114722753, "grad_norm": 12.903837203979492, "kl": 0.06640625, "learning_rate": 2.4952198852772465e-07, "loss": 0.0027, "reward": 1.5770788192749023, "reward_std": 0.035880204290151596, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5770787000656128, "step": 2355 }, { "completion_length": 234.71875, "epoch": 0.7507966857871256, "grad_norm": 11.72558307647705, "kl": 0.08740234375, "learning_rate": 2.4920331421287445e-07, "loss": 0.0035, "reward": 1.4853765964508057, "reward_std": 0.09491764008998871, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5010015964508057, "step": 2356 }, { "completion_length": 268.03125, "epoch": 0.7511153601019758, "grad_norm": 10.013938903808594, "kl": 0.0751953125, "learning_rate": 2.488846398980242e-07, "loss": 0.003, "reward": 1.596638798713684, "reward_std": 0.043350279331207275, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5966388583183289, "rewards/pad": 0.0, "step": 2357 }, { "completion_length": 199.015625, "epoch": 0.751434034416826, "grad_norm": 7.718585968017578, "kl": 0.115234375, "learning_rate": 2.48565965583174e-07, "loss": 0.0046, "reward": 1.4091236591339111, "reward_std": 0.07670184969902039, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.40912362933158875, "rewards/pad": 0.0, "step": 2358 }, { "completion_length": 198.34375, "epoch": 0.7517527087316762, "grad_norm": 18.98740577697754, "kl": 0.09619140625, "learning_rate": 2.4824729126832377e-07, "loss": 0.0038, "reward": 1.7292869091033936, "reward_std": 0.13945679366588593, "rewards/answer_reward": 0.21875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5105368494987488, "step": 2359 }, { "completion_length": 152.421875, "epoch": 0.7520713830465264, "grad_norm": 11.604713439941406, "kl": 0.10693359375, "learning_rate": 2.479286169534735e-07, "loss": 0.0043, "reward": 1.7420519590377808, "reward_std": 0.09547455608844757, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49205198884010315, "rewards/pad": 0.25, "step": 2360 }, { "completion_length": 300.265625, "epoch": 0.7523900573613767, "grad_norm": 10.19469165802002, "kl": 0.1982421875, "learning_rate": 2.4760994263862333e-07, "loss": 0.0079, "reward": 1.6528478860855103, "reward_std": 0.06703270226716995, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.527847945690155, "step": 2361 }, { "completion_length": 177.453125, "epoch": 0.7527087316762269, "grad_norm": 11.560866355895996, "kl": 0.1064453125, "learning_rate": 2.472912683237731e-07, "loss": 0.0043, "reward": 1.546684980392456, "reward_std": 0.16256418824195862, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.43731003999710083, "step": 2362 }, { "completion_length": 150.296875, "epoch": 0.7530274059910771, "grad_norm": 15.46419620513916, "kl": 0.10693359375, "learning_rate": 2.469725940089229e-07, "loss": 0.0043, "reward": 1.5246479511260986, "reward_std": 0.17527268826961517, "rewards/answer_reward": 0.046875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.47777289152145386, "step": 2363 }, { "completion_length": 155.6875, "epoch": 0.7533460803059273, "grad_norm": 21.624813079833984, "kl": 0.12353515625, "learning_rate": 2.4665391969407264e-07, "loss": 0.0049, "reward": 1.560457706451416, "reward_std": 0.14548426866531372, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.576082706451416, "step": 2364 }, { "completion_length": 146.671875, "epoch": 0.7536647546207775, "grad_norm": 13.269795417785645, "kl": 0.095703125, "learning_rate": 2.4633524537922245e-07, "loss": 0.0038, "reward": 1.6994431018829346, "reward_std": 0.08348685503005981, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4494430422782898, "rewards/pad": 0.25, "step": 2365 }, { "completion_length": 230.625, "epoch": 0.7539834289356278, "grad_norm": 27.45595932006836, "kl": 0.0927734375, "learning_rate": 2.460165710643722e-07, "loss": 0.0037, "reward": 1.59385347366333, "reward_std": 0.16194438934326172, "rewards/answer_reward": 0.203125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4063534140586853, "step": 2366 }, { "completion_length": 185.765625, "epoch": 0.754302103250478, "grad_norm": 3.6742031574249268, "kl": 0.12158203125, "learning_rate": 2.45697896749522e-07, "loss": 0.0049, "reward": 1.4821596145629883, "reward_std": 0.05528702214360237, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48215949535369873, "rewards/pad": 0.0, "step": 2367 }, { "completion_length": 99.921875, "epoch": 0.7546207775653282, "grad_norm": 7.76267671585083, "kl": 0.12890625, "learning_rate": 2.4537922243467177e-07, "loss": 0.0051, "reward": 1.4437228441238403, "reward_std": 0.16315695643424988, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.45934778451919556, "rewards/pad": 0.0, "step": 2368 }, { "completion_length": 257.046875, "epoch": 0.7549394518801784, "grad_norm": 9.175533294677734, "kl": 0.0751953125, "learning_rate": 2.450605481198215e-07, "loss": 0.003, "reward": 1.649479866027832, "reward_std": 0.1376563012599945, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.524479866027832, "step": 2369 }, { "completion_length": 251.6875, "epoch": 0.7552581261950286, "grad_norm": 15.279226303100586, "kl": 0.09619140625, "learning_rate": 2.447418738049713e-07, "loss": 0.0039, "reward": 1.6367669105529785, "reward_std": 0.11726522445678711, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4961419701576233, "rewards/pad": 0.140625, "step": 2370 }, { "completion_length": 230.90625, "epoch": 0.7555768005098789, "grad_norm": 17.72090721130371, "kl": 0.0947265625, "learning_rate": 2.444231994901211e-07, "loss": 0.0038, "reward": 1.4760117530822754, "reward_std": 0.17733457684516907, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5072617530822754, "step": 2371 }, { "completion_length": 200.296875, "epoch": 0.7558954748247291, "grad_norm": 17.369386672973633, "kl": 0.1064453125, "learning_rate": 2.4410452517527084e-07, "loss": 0.0042, "reward": 1.7264306545257568, "reward_std": 0.13656508922576904, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5076805949211121, "rewards/pad": 0.21875, "step": 2372 }, { "completion_length": 178.359375, "epoch": 0.7562141491395793, "grad_norm": 7.309889793395996, "kl": 0.09619140625, "learning_rate": 2.4378585086042064e-07, "loss": 0.0038, "reward": 1.7076586484909058, "reward_std": 0.08293125033378601, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.582658588886261, "step": 2373 }, { "completion_length": 289.359375, "epoch": 0.7565328234544296, "grad_norm": 11.607328414916992, "kl": 0.0966796875, "learning_rate": 2.434671765455704e-07, "loss": 0.0039, "reward": 1.4018770456314087, "reward_std": 0.1482263058423996, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4331270456314087, "step": 2374 }, { "completion_length": 208.671875, "epoch": 0.7568514977692798, "grad_norm": 16.60245132446289, "kl": 0.09716796875, "learning_rate": 2.431485022307202e-07, "loss": 0.0039, "reward": 1.4880285263061523, "reward_std": 0.07213404774665833, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.48802849650382996, "step": 2375 }, { "completion_length": 269.8125, "epoch": 0.7571701720841301, "grad_norm": 5.0662760734558105, "kl": 0.07080078125, "learning_rate": 2.4282982791586996e-07, "loss": 0.0028, "reward": 1.5826705694198608, "reward_std": 0.08732300996780396, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.47329556941986084, "step": 2376 }, { "completion_length": 290.34375, "epoch": 0.7574888463989803, "grad_norm": 14.884427070617676, "kl": 0.059814453125, "learning_rate": 2.4251115360101976e-07, "loss": 0.0024, "reward": 1.7173967361450195, "reward_std": 0.10928597301244736, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6080216765403748, "rewards/pad": 0.109375, "step": 2377 }, { "completion_length": 288.875, "epoch": 0.7578075207138305, "grad_norm": 6.457363128662109, "kl": 0.0703125, "learning_rate": 2.421924792861695e-07, "loss": 0.0028, "reward": 1.7575962543487549, "reward_std": 0.07571257650852203, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.38259628415107727, "rewards/pad": 0.375, "step": 2378 }, { "completion_length": 169.484375, "epoch": 0.7581261950286807, "grad_norm": 18.78461265563965, "kl": 0.11572265625, "learning_rate": 2.4187380497131927e-07, "loss": 0.0046, "reward": 1.405072808265686, "reward_std": 0.11775682866573334, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3894478380680084, "rewards/pad": 0.015625, "step": 2379 }, { "completion_length": 235.234375, "epoch": 0.758444869343531, "grad_norm": 11.631933212280273, "kl": 0.07373046875, "learning_rate": 2.415551306564691e-07, "loss": 0.0029, "reward": 1.5759958028793335, "reward_std": 0.32423967123031616, "rewards/format_reward_tg": 0.953125, "rewards/iou_timestamp_reward": 0.3259957730770111, "rewards/pad": 0.296875, "step": 2380 }, { "completion_length": 190.09375, "epoch": 0.7587635436583812, "grad_norm": 10.003499984741211, "kl": 0.09375, "learning_rate": 2.4123645634161883e-07, "loss": 0.0038, "reward": 1.4936577081680298, "reward_std": 0.0779888927936554, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.368657648563385, "rewards/pad": 0.125, "step": 2381 }, { "completion_length": 207.703125, "epoch": 0.7590822179732314, "grad_norm": 8.13753604888916, "kl": 0.091796875, "learning_rate": 2.4091778202676864e-07, "loss": 0.0037, "reward": 1.6867070198059082, "reward_std": 0.10507681965827942, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5617070198059082, "rewards/pad": 0.125, "step": 2382 }, { "completion_length": 350.234375, "epoch": 0.7594008922880816, "grad_norm": 4.736698150634766, "kl": 0.048583984375, "learning_rate": 2.405991077119184e-07, "loss": 0.0019, "reward": 1.3926687240600586, "reward_std": 0.046149738132953644, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3926687240600586, "step": 2383 }, { "completion_length": 226.28125, "epoch": 0.7597195666029318, "grad_norm": 15.912691116333008, "kl": 0.09228515625, "learning_rate": 2.402804333970682e-07, "loss": 0.0037, "reward": 1.5672473907470703, "reward_std": 0.09506212174892426, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5672474503517151, "step": 2384 }, { "completion_length": 285.9375, "epoch": 0.760038240917782, "grad_norm": 8.352356910705566, "kl": 0.06396484375, "learning_rate": 2.3996175908221796e-07, "loss": 0.0026, "reward": 1.492800235748291, "reward_std": 0.1296931505203247, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5084253549575806, "step": 2385 }, { "completion_length": 207.90625, "epoch": 0.7603569152326323, "grad_norm": 11.334542274475098, "kl": 0.125, "learning_rate": 2.3964308476736776e-07, "loss": 0.005, "reward": 1.5617526769638062, "reward_std": 0.17248529195785522, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.43675270676612854, "rewards/pad": 0.140625, "step": 2386 }, { "completion_length": 222.9375, "epoch": 0.7606755895474825, "grad_norm": 9.127038955688477, "kl": 0.08447265625, "learning_rate": 2.393244104525175e-07, "loss": 0.0034, "reward": 1.5596591234207153, "reward_std": 0.08980844914913177, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5596591830253601, "rewards/pad": 0.0, "step": 2387 }, { "completion_length": 310.453125, "epoch": 0.7609942638623327, "grad_norm": 8.183820724487305, "kl": 0.068359375, "learning_rate": 2.3900573613766727e-07, "loss": 0.0027, "reward": 1.5660204887390137, "reward_std": 0.12294905632734299, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5816453695297241, "step": 2388 }, { "completion_length": 279.1875, "epoch": 0.7613129381771829, "grad_norm": 6.7372846603393555, "kl": 0.060546875, "learning_rate": 2.386870618228171e-07, "loss": 0.0024, "reward": 1.493821382522583, "reward_std": 0.11108361184597015, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3844463527202606, "step": 2389 }, { "completion_length": 153.015625, "epoch": 0.7616316124920331, "grad_norm": 8.310419082641602, "kl": 0.103515625, "learning_rate": 2.3836838750796683e-07, "loss": 0.0041, "reward": 1.6152290105819702, "reward_std": 0.1089547723531723, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3652289807796478, "rewards/pad": 0.25, "step": 2390 }, { "completion_length": 181.953125, "epoch": 0.7619502868068834, "grad_norm": 21.235824584960938, "kl": 0.11181640625, "learning_rate": 2.380497131931166e-07, "loss": 0.0045, "reward": 1.490411400794983, "reward_std": 0.11310833692550659, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4904113709926605, "rewards/pad": 0.0, "step": 2391 }, { "completion_length": 192.09375, "epoch": 0.7622689611217336, "grad_norm": 22.150575637817383, "kl": 0.1005859375, "learning_rate": 2.377310388782664e-07, "loss": 0.004, "reward": 1.3348021507263184, "reward_std": 0.12773533165454865, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.31917715072631836, "rewards/pad": 0.015625, "step": 2392 }, { "completion_length": 90.96875, "epoch": 0.7625876354365838, "grad_norm": 10.078266143798828, "kl": 0.1357421875, "learning_rate": 2.3741236456341617e-07, "loss": 0.0055, "reward": 1.7374215126037598, "reward_std": 0.09872999042272568, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.7374216318130493, "rewards/pad": 0.0, "step": 2393 }, { "completion_length": 287.53125, "epoch": 0.762906309751434, "grad_norm": 12.642020225524902, "kl": 0.06982421875, "learning_rate": 2.3709369024856595e-07, "loss": 0.0028, "reward": 1.5701704025268555, "reward_std": 0.10735349357128143, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47642046213150024, "step": 2394 }, { "completion_length": 303.734375, "epoch": 0.7632249840662843, "grad_norm": 6.236365795135498, "kl": 0.0712890625, "learning_rate": 2.3677501593371573e-07, "loss": 0.0029, "reward": 1.4030518531799316, "reward_std": 0.05589941516518593, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4030519127845764, "rewards/pad": 0.0, "step": 2395 }, { "completion_length": 208.3125, "epoch": 0.7635436583811345, "grad_norm": 20.96642303466797, "kl": 0.08642578125, "learning_rate": 2.3645634161886551e-07, "loss": 0.0035, "reward": 1.8169206380844116, "reward_std": 0.13159474730491638, "rewards/answer_reward": 0.3125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5200456976890564, "step": 2396 }, { "completion_length": 337.28125, "epoch": 0.7638623326959847, "grad_norm": 6.730218887329102, "kl": 0.05859375, "learning_rate": 2.361376673040153e-07, "loss": 0.0024, "reward": 1.4287188053131104, "reward_std": 0.030213970690965652, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4287187457084656, "step": 2397 }, { "completion_length": 270.734375, "epoch": 0.7641810070108349, "grad_norm": 9.62966537475586, "kl": 0.083984375, "learning_rate": 2.3581899298916505e-07, "loss": 0.0034, "reward": 1.7294020652770996, "reward_std": 0.14015543460845947, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5731521248817444, "rewards/pad": 0.15625, "step": 2398 }, { "completion_length": 317.75, "epoch": 0.7644996813256851, "grad_norm": 8.748055458068848, "kl": 0.06787109375, "learning_rate": 2.3550031867431483e-07, "loss": 0.0027, "reward": 1.4744210243225098, "reward_std": 0.10222569853067398, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4900459349155426, "step": 2399 }, { "completion_length": 207.265625, "epoch": 0.7648183556405354, "grad_norm": 14.838871955871582, "kl": 0.0986328125, "learning_rate": 2.351816443594646e-07, "loss": 0.0039, "reward": 1.5980523824691772, "reward_std": 0.08159129321575165, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6136773824691772, "rewards/pad": 0.0, "step": 2400 }, { "completion_length": 195.15625, "epoch": 0.7651370299553856, "grad_norm": 10.845717430114746, "kl": 0.09521484375, "learning_rate": 2.348629700446144e-07, "loss": 0.0038, "reward": 1.7929725646972656, "reward_std": 0.09547235071659088, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.6679726243019104, "step": 2401 }, { "completion_length": 190.25, "epoch": 0.7654557042702358, "grad_norm": 10.455423355102539, "kl": 0.111328125, "learning_rate": 2.3454429572976417e-07, "loss": 0.0045, "reward": 1.7947559356689453, "reward_std": 0.17043697834014893, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5447558164596558, "step": 2402 }, { "completion_length": 149.828125, "epoch": 0.765774378585086, "grad_norm": 21.63300895690918, "kl": 0.11279296875, "learning_rate": 2.3422562141491395e-07, "loss": 0.0045, "reward": 1.5531684160232544, "reward_std": 0.0856570452451706, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.428168386220932, "rewards/pad": 0.125, "step": 2403 }, { "completion_length": 210.390625, "epoch": 0.7660930528999362, "grad_norm": 28.16766357421875, "kl": 0.0830078125, "learning_rate": 2.3390694710006373e-07, "loss": 0.0033, "reward": 1.4170100688934326, "reward_std": 0.0960836410522461, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4170100688934326, "rewards/pad": 0.0, "step": 2404 }, { "completion_length": 276.203125, "epoch": 0.7664117272147865, "grad_norm": 7.249959945678711, "kl": 0.07373046875, "learning_rate": 2.335882727852135e-07, "loss": 0.0029, "reward": 1.4971795082092285, "reward_std": 0.036122873425483704, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49717944860458374, "step": 2405 }, { "completion_length": 394.9375, "epoch": 0.7667304015296367, "grad_norm": 4.664658069610596, "kl": 0.04638671875, "learning_rate": 2.332695984703633e-07, "loss": 0.0019, "reward": 1.3320510387420654, "reward_std": 0.06756319850683212, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3164260983467102, "step": 2406 }, { "completion_length": 243.921875, "epoch": 0.7670490758444869, "grad_norm": 7.838922023773193, "kl": 0.1376953125, "learning_rate": 2.3295092415551307e-07, "loss": 0.0055, "reward": 1.678990125656128, "reward_std": 0.13129065930843353, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.44461509585380554, "step": 2407 }, { "completion_length": 270.34375, "epoch": 0.7673677501593371, "grad_norm": 7.979779243469238, "kl": 0.0712890625, "learning_rate": 2.3263224984066283e-07, "loss": 0.0028, "reward": 1.5450541973114014, "reward_std": 0.11059615761041641, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.2950543165206909, "step": 2408 }, { "completion_length": 220.84375, "epoch": 0.7676864244741873, "grad_norm": 8.696356773376465, "kl": 0.0888671875, "learning_rate": 2.323135755258126e-07, "loss": 0.0036, "reward": 1.5947052240371704, "reward_std": 0.13390368223190308, "rewards/pad": 0.15625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4384552240371704, "step": 2409 }, { "completion_length": 292.03125, "epoch": 0.7680050987890376, "grad_norm": 6.767152309417725, "kl": 0.076171875, "learning_rate": 2.319949012109624e-07, "loss": 0.003, "reward": 1.6036784648895264, "reward_std": 0.08758947253227234, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.49430355429649353, "rewards/pad": 0.125, "step": 2410 }, { "completion_length": 271.140625, "epoch": 0.7683237731038878, "grad_norm": 23.512475967407227, "kl": 0.06982421875, "learning_rate": 2.3167622689611217e-07, "loss": 0.0028, "reward": 1.5783004760742188, "reward_std": 0.12401663511991501, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46892544627189636, "rewards/pad": 0.109375, "step": 2411 }, { "completion_length": 282.5, "epoch": 0.768642447418738, "grad_norm": 12.914742469787598, "kl": 0.07666015625, "learning_rate": 2.3135755258126195e-07, "loss": 0.0031, "reward": 1.5491721630096436, "reward_std": 0.12556979060173035, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5647971034049988, "step": 2412 }, { "completion_length": 263.421875, "epoch": 0.7689611217335883, "grad_norm": 25.450551986694336, "kl": 0.08837890625, "learning_rate": 2.3103887826641173e-07, "loss": 0.0035, "reward": 1.4551992416381836, "reward_std": 0.059495627880096436, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.455199271440506, "rewards/pad": 0.0, "step": 2413 }, { "completion_length": 266.90625, "epoch": 0.7692797960484385, "grad_norm": 8.406025886535645, "kl": 0.0830078125, "learning_rate": 2.307202039515615e-07, "loss": 0.0033, "reward": 1.5796451568603516, "reward_std": 0.12612810730934143, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4702701270580292, "step": 2414 }, { "completion_length": 261.5, "epoch": 0.7695984703632888, "grad_norm": 15.752683639526367, "kl": 0.10498046875, "learning_rate": 2.304015296367113e-07, "loss": 0.0042, "reward": 1.632777214050293, "reward_std": 0.18482252955436707, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49215221405029297, "rewards/pad": 0.140625, "step": 2415 }, { "completion_length": 271.46875, "epoch": 0.769917144678139, "grad_norm": 6.456062316894531, "kl": 0.0751953125, "learning_rate": 2.3008285532186105e-07, "loss": 0.003, "reward": 1.5452253818511963, "reward_std": 0.13085828721523285, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4358502924442291, "step": 2416 }, { "completion_length": 239.734375, "epoch": 0.7702358189929892, "grad_norm": 23.77017593383789, "kl": 0.09619140625, "learning_rate": 2.2976418100701083e-07, "loss": 0.0038, "reward": 1.5104858875274658, "reward_std": 0.13161501288414001, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5261107683181763, "step": 2417 }, { "completion_length": 236.46875, "epoch": 0.7705544933078394, "grad_norm": 6.151467323303223, "kl": 0.08984375, "learning_rate": 2.2944550669216058e-07, "loss": 0.0036, "reward": 1.5481237173080444, "reward_std": 0.18944257497787476, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.45437365770339966, "step": 2418 }, { "completion_length": 184.203125, "epoch": 0.7708731676226896, "grad_norm": 10.371918678283691, "kl": 0.09130859375, "learning_rate": 2.2912683237731036e-07, "loss": 0.0037, "reward": 1.4940639734268188, "reward_std": 0.08268411457538605, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4940640330314636, "step": 2419 }, { "completion_length": 291.65625, "epoch": 0.7711918419375399, "grad_norm": 7.256937503814697, "kl": 0.06982421875, "learning_rate": 2.2880815806246014e-07, "loss": 0.0028, "reward": 1.4115976095199585, "reward_std": 0.038619644939899445, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4115976095199585, "rewards/pad": 0.0, "step": 2420 }, { "completion_length": 273.34375, "epoch": 0.7715105162523901, "grad_norm": 63.12538146972656, "kl": 0.08056640625, "learning_rate": 2.2848948374760992e-07, "loss": 0.0032, "reward": 1.4360949993133545, "reward_std": 0.03568316251039505, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4360950291156769, "step": 2421 }, { "completion_length": 340.1875, "epoch": 0.7718291905672403, "grad_norm": 8.08768367767334, "kl": 0.06396484375, "learning_rate": 2.281708094327597e-07, "loss": 0.0025, "reward": 1.5484213829040527, "reward_std": 0.1030707061290741, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.43904638290405273, "rewards/pad": 0.125, "step": 2422 }, { "completion_length": 194.046875, "epoch": 0.7721478648820905, "grad_norm": 12.11047077178955, "kl": 0.0869140625, "learning_rate": 2.2785213511790948e-07, "loss": 0.0035, "reward": 1.5583499670028687, "reward_std": 0.04292614012956619, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3083499073982239, "step": 2423 }, { "completion_length": 268.515625, "epoch": 0.7724665391969407, "grad_norm": 28.141538619995117, "kl": 0.091796875, "learning_rate": 2.2753346080305926e-07, "loss": 0.0037, "reward": 1.4259402751922607, "reward_std": 0.15826770663261414, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4259403347969055, "step": 2424 }, { "completion_length": 207.0625, "epoch": 0.772785213511791, "grad_norm": 16.846036911010742, "kl": 0.10400390625, "learning_rate": 2.2721478648820904e-07, "loss": 0.0042, "reward": 1.5542716979980469, "reward_std": 0.1610041856765747, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.44489675760269165, "rewards/pad": 0.125, "step": 2425 }, { "completion_length": 186.0625, "epoch": 0.7731038878266412, "grad_norm": 12.483802795410156, "kl": 0.0888671875, "learning_rate": 2.2689611217335882e-07, "loss": 0.0036, "reward": 1.666634202003479, "reward_std": 0.050067052245140076, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.666634202003479, "step": 2426 }, { "completion_length": 200.03125, "epoch": 0.7734225621414914, "grad_norm": 11.658013343811035, "kl": 0.0986328125, "learning_rate": 2.265774378585086e-07, "loss": 0.0039, "reward": 1.7182862758636475, "reward_std": 0.06905974447727203, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.46828627586364746, "step": 2427 }, { "completion_length": 233.90625, "epoch": 0.7737412364563416, "grad_norm": 7.01774787902832, "kl": 0.087890625, "learning_rate": 2.2625876354365836e-07, "loss": 0.0035, "reward": 1.5699102878570557, "reward_std": 0.15561619400978088, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.96875, "rewards/iou_glue_reward": 0.4761601686477661, "step": 2428 }, { "completion_length": 175.859375, "epoch": 0.7740599107711919, "grad_norm": 25.093128204345703, "kl": 0.11181640625, "learning_rate": 2.2594008922880814e-07, "loss": 0.0045, "reward": 1.3243741989135742, "reward_std": 0.13296332955360413, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.33999931812286377, "step": 2429 }, { "completion_length": 232.96875, "epoch": 0.7743785850860421, "grad_norm": 13.453207969665527, "kl": 0.08544921875, "learning_rate": 2.2562141491395792e-07, "loss": 0.0034, "reward": 1.6193019151687622, "reward_std": 0.12025083601474762, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.509926974773407, "step": 2430 }, { "completion_length": 298.03125, "epoch": 0.7746972594008923, "grad_norm": 6.668832778930664, "kl": 0.068359375, "learning_rate": 2.253027405991077e-07, "loss": 0.0027, "reward": 1.5202369689941406, "reward_std": 0.04842230677604675, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.39523690938949585, "step": 2431 }, { "completion_length": 258.515625, "epoch": 0.7750159337157425, "grad_norm": 10.22235107421875, "kl": 0.078125, "learning_rate": 2.2498406628425748e-07, "loss": 0.0031, "reward": 1.458024501800537, "reward_std": 0.058063872158527374, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3330245614051819, "step": 2432 }, { "completion_length": 385.96875, "epoch": 0.7753346080305927, "grad_norm": 9.327546119689941, "kl": 0.056884765625, "learning_rate": 2.2466539196940726e-07, "loss": 0.0023, "reward": 1.412497639656067, "reward_std": 0.04687918350100517, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4124976098537445, "step": 2433 }, { "completion_length": 253.546875, "epoch": 0.775653282345443, "grad_norm": 9.379130363464355, "kl": 0.10546875, "learning_rate": 2.2434671765455704e-07, "loss": 0.0042, "reward": 1.610476016998291, "reward_std": 0.10061073303222656, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6104759573936462, "step": 2434 }, { "completion_length": 286.59375, "epoch": 0.7759719566602932, "grad_norm": 13.452876091003418, "kl": 0.08544921875, "learning_rate": 2.2402804333970682e-07, "loss": 0.0034, "reward": 1.4705870151519775, "reward_std": 0.10605093836784363, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4862120449542999, "step": 2435 }, { "completion_length": 287.171875, "epoch": 0.7762906309751434, "grad_norm": 11.824004173278809, "kl": 0.08251953125, "learning_rate": 2.237093690248566e-07, "loss": 0.0033, "reward": 1.4830718040466309, "reward_std": 0.08090022951364517, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.37369686365127563, "rewards/pad": 0.125, "step": 2436 }, { "completion_length": 307.96875, "epoch": 0.7766093052899936, "grad_norm": 12.284720420837402, "kl": 0.06640625, "learning_rate": 2.2339069471000636e-07, "loss": 0.0027, "reward": 1.5076161623001099, "reward_std": 0.08636889606714249, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4919911026954651, "step": 2437 }, { "completion_length": 199.875, "epoch": 0.7769279796048438, "grad_norm": 13.545829772949219, "kl": 0.1328125, "learning_rate": 2.2307202039515614e-07, "loss": 0.0053, "reward": 1.460900068283081, "reward_std": 0.07126356661319733, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.46090012788772583, "rewards/pad": 0.0, "step": 2438 }, { "completion_length": 228.984375, "epoch": 0.777246653919694, "grad_norm": 36.11991500854492, "kl": 0.09033203125, "learning_rate": 2.2275334608030592e-07, "loss": 0.0036, "reward": 1.6342846155166626, "reward_std": 0.1392236053943634, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5405345559120178, "rewards/pad": 0.09375, "step": 2439 }, { "completion_length": 314.921875, "epoch": 0.7775653282345443, "grad_norm": 3.917736530303955, "kl": 0.055419921875, "learning_rate": 2.224346717654557e-07, "loss": 0.0022, "reward": 1.6305112838745117, "reward_std": 0.09095712006092072, "rewards/pad": 0.140625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4898863732814789, "step": 2440 }, { "completion_length": 244.375, "epoch": 0.7778840025493945, "grad_norm": 11.698638916015625, "kl": 0.09423828125, "learning_rate": 2.2211599745060548e-07, "loss": 0.0038, "reward": 1.5138826370239258, "reward_std": 0.042464740574359894, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5138826966285706, "step": 2441 }, { "completion_length": 197.640625, "epoch": 0.7782026768642447, "grad_norm": 13.141349792480469, "kl": 0.11328125, "learning_rate": 2.2179732313575526e-07, "loss": 0.0045, "reward": 1.6041779518127441, "reward_std": 0.09736239165067673, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6041779518127441, "rewards/pad": 0.0, "step": 2442 }, { "completion_length": 254.28125, "epoch": 0.7785213511790949, "grad_norm": 3.905799627304077, "kl": 0.08203125, "learning_rate": 2.2147864882090504e-07, "loss": 0.0033, "reward": 1.56472909450531, "reward_std": 0.04493452236056328, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43972909450531006, "step": 2443 }, { "completion_length": 217.59375, "epoch": 0.7788400254939452, "grad_norm": 32.241065979003906, "kl": 0.076171875, "learning_rate": 2.2115997450605482e-07, "loss": 0.003, "reward": 1.9587368965148926, "reward_std": 0.1502833217382431, "rewards/pad": 0.4375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5212369561195374, "step": 2444 }, { "completion_length": 276.984375, "epoch": 0.7791586998087954, "grad_norm": 13.728992462158203, "kl": 0.072265625, "learning_rate": 2.208413001912046e-07, "loss": 0.0029, "reward": 1.7761509418487549, "reward_std": 0.14614000916481018, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5574009418487549, "step": 2445 }, { "completion_length": 344.59375, "epoch": 0.7794773741236456, "grad_norm": 15.054845809936523, "kl": 0.06982421875, "learning_rate": 2.2052262587635438e-07, "loss": 0.0028, "reward": 1.475869059562683, "reward_std": 0.04769786819815636, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4758690595626831, "step": 2446 }, { "completion_length": 223.125, "epoch": 0.7797960484384958, "grad_norm": 7.250488758087158, "kl": 0.1015625, "learning_rate": 2.202039515615041e-07, "loss": 0.0041, "reward": 1.597805380821228, "reward_std": 0.18581557273864746, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42593032121658325, "rewards/pad": 0.171875, "step": 2447 }, { "completion_length": 325.015625, "epoch": 0.780114722753346, "grad_norm": 6.287956237792969, "kl": 0.08056640625, "learning_rate": 2.198852772466539e-07, "loss": 0.0032, "reward": 1.547116994857788, "reward_std": 0.037797972559928894, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5471169948577881, "rewards/pad": 0.0, "step": 2448 }, { "completion_length": 208.078125, "epoch": 0.7804333970681963, "grad_norm": 13.886652946472168, "kl": 0.10400390625, "learning_rate": 2.1956660293180367e-07, "loss": 0.0042, "reward": 1.5772626399993896, "reward_std": 0.02991614118218422, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4522625803947449, "rewards/pad": 0.125, "step": 2449 }, { "completion_length": 204.296875, "epoch": 0.7807520713830465, "grad_norm": 22.697803497314453, "kl": 0.08251953125, "learning_rate": 2.1924792861695345e-07, "loss": 0.0033, "reward": 1.5583763122558594, "reward_std": 0.07844780385494232, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43337637186050415, "rewards/pad": 0.125, "step": 2450 }, { "completion_length": 147.875, "epoch": 0.7810707456978967, "grad_norm": 19.415813446044922, "kl": 0.1005859375, "learning_rate": 2.1892925430210323e-07, "loss": 0.004, "reward": 1.5877751111984253, "reward_std": 0.077241450548172, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5877752304077148, "rewards/pad": 0.0, "step": 2451 }, { "completion_length": 358.546875, "epoch": 0.781389420012747, "grad_norm": 28.12146759033203, "kl": 0.056884765625, "learning_rate": 2.18610579987253e-07, "loss": 0.0023, "reward": 1.4711517095565796, "reward_std": 0.0944700762629509, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4867766499519348, "step": 2452 }, { "completion_length": 204.546875, "epoch": 0.7817080943275972, "grad_norm": 22.27834701538086, "kl": 0.0986328125, "learning_rate": 2.182919056724028e-07, "loss": 0.0039, "reward": 1.2419202327728271, "reward_std": 0.04451475292444229, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.24192023277282715, "rewards/pad": 0.0, "step": 2453 }, { "completion_length": 146.03125, "epoch": 0.7820267686424475, "grad_norm": 19.50977325439453, "kl": 0.1083984375, "learning_rate": 2.1797323135755257e-07, "loss": 0.0043, "reward": 1.8846417665481567, "reward_std": 0.0772833451628685, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.6346417665481567, "step": 2454 }, { "completion_length": 232.625, "epoch": 0.7823454429572977, "grad_norm": 10.139874458312988, "kl": 0.0751953125, "learning_rate": 2.1765455704270235e-07, "loss": 0.003, "reward": 1.4167118072509766, "reward_std": 0.07653843611478806, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.41671180725097656, "rewards/pad": 0.0, "step": 2455 }, { "completion_length": 153.390625, "epoch": 0.7826641172721479, "grad_norm": 23.29401969909668, "kl": 0.109375, "learning_rate": 2.1733588272785213e-07, "loss": 0.0044, "reward": 1.5589635372161865, "reward_std": 0.1103920266032219, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5589635372161865, "rewards/pad": 0.0, "step": 2456 }, { "completion_length": 230.0625, "epoch": 0.7829827915869981, "grad_norm": 13.375690460205078, "kl": 0.0947265625, "learning_rate": 2.170172084130019e-07, "loss": 0.0038, "reward": 1.518528938293457, "reward_std": 0.048551395535469055, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.518528938293457, "rewards/pad": 0.0, "step": 2457 }, { "completion_length": 191.640625, "epoch": 0.7833014659018483, "grad_norm": 8.836167335510254, "kl": 0.11328125, "learning_rate": 2.1669853409815167e-07, "loss": 0.0045, "reward": 1.5738965272903442, "reward_std": 0.08173016458749771, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5738966464996338, "rewards/pad": 0.0, "step": 2458 }, { "completion_length": 151.890625, "epoch": 0.7836201402166986, "grad_norm": 15.70499324798584, "kl": 0.19140625, "learning_rate": 2.1637985978330145e-07, "loss": 0.0077, "reward": 1.6416845321655273, "reward_std": 0.11252018809318542, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5166846513748169, "step": 2459 }, { "completion_length": 296.609375, "epoch": 0.7839388145315488, "grad_norm": 6.298742294311523, "kl": 0.06884765625, "learning_rate": 2.1606118546845123e-07, "loss": 0.0028, "reward": 1.72971773147583, "reward_std": 0.04087499901652336, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6047176718711853, "step": 2460 }, { "completion_length": 276.859375, "epoch": 0.784257488846399, "grad_norm": 6.319355010986328, "kl": 0.07861328125, "learning_rate": 2.15742511153601e-07, "loss": 0.0031, "reward": 1.771275520324707, "reward_std": 0.10621865093708038, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5369004607200623, "step": 2461 }, { "completion_length": 285.140625, "epoch": 0.7845761631612492, "grad_norm": 17.279361724853516, "kl": 0.1044921875, "learning_rate": 2.154238368387508e-07, "loss": 0.0042, "reward": 1.602461576461792, "reward_std": 0.045225173234939575, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6024615168571472, "rewards/pad": 0.0, "step": 2462 }, { "completion_length": 345.984375, "epoch": 0.7848948374760994, "grad_norm": 8.349935531616211, "kl": 0.0634765625, "learning_rate": 2.1510516252390057e-07, "loss": 0.0025, "reward": 1.5142486095428467, "reward_std": 0.06760507822036743, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5142486691474915, "rewards/pad": 0.0, "step": 2463 }, { "completion_length": 381.5625, "epoch": 0.7852135117909497, "grad_norm": 13.86041259765625, "kl": 0.0673828125, "learning_rate": 2.1478648820905035e-07, "loss": 0.0027, "reward": 1.4394135475158691, "reward_std": 0.03722523897886276, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4394136071205139, "step": 2464 }, { "completion_length": 258.828125, "epoch": 0.7855321861057999, "grad_norm": 6.705070495605469, "kl": 0.1279296875, "learning_rate": 2.1446781389420013e-07, "loss": 0.0051, "reward": 1.3438005447387695, "reward_std": 0.12043680995702744, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.37505069375038147, "rewards/pad": 0.0, "step": 2465 }, { "completion_length": 194.75, "epoch": 0.7858508604206501, "grad_norm": 13.5197172164917, "kl": 0.0888671875, "learning_rate": 2.141491395793499e-07, "loss": 0.0036, "reward": 1.8589537143707275, "reward_std": 0.07028871774673462, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6089537739753723, "rewards/pad": 0.25, "step": 2466 }, { "completion_length": 124.28125, "epoch": 0.7861695347355003, "grad_norm": 10.612565040588379, "kl": 0.103515625, "learning_rate": 2.1383046526449967e-07, "loss": 0.0041, "reward": 1.6125763654708862, "reward_std": 0.09464748203754425, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4875763952732086, "step": 2467 }, { "completion_length": 243.03125, "epoch": 0.7864882090503506, "grad_norm": 10.93760871887207, "kl": 0.09912109375, "learning_rate": 2.1351179094964945e-07, "loss": 0.004, "reward": 1.4043235778808594, "reward_std": 0.11670240759849548, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.419948548078537, "step": 2468 }, { "completion_length": 323.078125, "epoch": 0.7868068833652008, "grad_norm": 11.92921257019043, "kl": 0.08642578125, "learning_rate": 2.1319311663479923e-07, "loss": 0.0035, "reward": 1.4428250789642334, "reward_std": 0.047589220106601715, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4428250789642334, "step": 2469 }, { "completion_length": 293.140625, "epoch": 0.787125557680051, "grad_norm": 5.340349197387695, "kl": 0.07763671875, "learning_rate": 2.12874442319949e-07, "loss": 0.0031, "reward": 1.5190386772155762, "reward_std": 0.04808990657329559, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5190386176109314, "rewards/pad": 0.0, "step": 2470 }, { "completion_length": 161.796875, "epoch": 0.7874442319949012, "grad_norm": 23.361160278320312, "kl": 0.11279296875, "learning_rate": 2.125557680050988e-07, "loss": 0.0045, "reward": 1.4743843078613281, "reward_std": 0.09407757222652435, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47438421845436096, "rewards/pad": 0.0, "step": 2471 }, { "completion_length": 255.15625, "epoch": 0.7877629063097514, "grad_norm": 12.378373146057129, "kl": 0.08642578125, "learning_rate": 2.1223709369024857e-07, "loss": 0.0034, "reward": 1.305260419845581, "reward_std": 0.05898979306221008, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.30526041984558105, "step": 2472 }, { "completion_length": 286.140625, "epoch": 0.7880815806246017, "grad_norm": 20.72404670715332, "kl": 0.0869140625, "learning_rate": 2.1191841937539835e-07, "loss": 0.0035, "reward": 1.698275089263916, "reward_std": 0.07133931666612625, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.573275089263916, "step": 2473 }, { "completion_length": 297.484375, "epoch": 0.7884002549394519, "grad_norm": 8.229775428771973, "kl": 0.0654296875, "learning_rate": 2.1159974506054813e-07, "loss": 0.0026, "reward": 1.6661851406097412, "reward_std": 0.10517589747905731, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5568101406097412, "step": 2474 }, { "completion_length": 282.421875, "epoch": 0.7887189292543021, "grad_norm": 30.26580238342285, "kl": 0.07275390625, "learning_rate": 2.112810707456979e-07, "loss": 0.0029, "reward": 1.5597797632217407, "reward_std": 0.10693672299385071, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4972797632217407, "step": 2475 }, { "completion_length": 307.28125, "epoch": 0.7890376035691523, "grad_norm": 183.1418914794922, "kl": 0.0576171875, "learning_rate": 2.109623964308477e-07, "loss": 0.0023, "reward": 1.5799769163131714, "reward_std": 0.05586535483598709, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.32997697591781616, "step": 2476 }, { "completion_length": 283.734375, "epoch": 0.7893562778840025, "grad_norm": 20.426721572875977, "kl": 0.09033203125, "learning_rate": 2.1064372211599742e-07, "loss": 0.0036, "reward": 1.4326732158660889, "reward_std": 0.05556642264127731, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4326731562614441, "rewards/pad": 0.0, "step": 2477 }, { "completion_length": 347.671875, "epoch": 0.7896749521988528, "grad_norm": 7.112791538238525, "kl": 0.109375, "learning_rate": 2.103250478011472e-07, "loss": 0.0044, "reward": 1.5783393383026123, "reward_std": 0.16577112674713135, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.6095892786979675, "rewards/pad": 0.0, "step": 2478 }, { "completion_length": 197.96875, "epoch": 0.789993626513703, "grad_norm": 7.802004814147949, "kl": 0.1123046875, "learning_rate": 2.1000637348629698e-07, "loss": 0.0045, "reward": 1.6445430517196655, "reward_std": 0.06837106496095657, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5195431113243103, "rewards/pad": 0.125, "step": 2479 }, { "completion_length": 287.765625, "epoch": 0.7903123008285532, "grad_norm": 6.5852742195129395, "kl": 0.0888671875, "learning_rate": 2.0968769917144676e-07, "loss": 0.0036, "reward": 1.404374122619629, "reward_std": 0.038719989359378815, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.40437403321266174, "step": 2480 }, { "completion_length": 375.984375, "epoch": 0.7906309751434034, "grad_norm": 21.665122985839844, "kl": 0.07373046875, "learning_rate": 2.0936902485659654e-07, "loss": 0.003, "reward": 1.4499353170394897, "reward_std": 0.06959246844053268, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.44993534684181213, "step": 2481 }, { "completion_length": 362.21875, "epoch": 0.7909496494582536, "grad_norm": 4.866166114807129, "kl": 0.05615234375, "learning_rate": 2.0905035054174632e-07, "loss": 0.0022, "reward": 1.5529453754425049, "reward_std": 0.053469765931367874, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5529453754425049, "step": 2482 }, { "completion_length": 154.125, "epoch": 0.7912683237731039, "grad_norm": 9.112666130065918, "kl": 0.09765625, "learning_rate": 2.087316762268961e-07, "loss": 0.0039, "reward": 1.6775808334350586, "reward_std": 0.07012620568275452, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5525809526443481, "rewards/pad": 0.125, "step": 2483 }, { "completion_length": 243.015625, "epoch": 0.7915869980879541, "grad_norm": 9.58671760559082, "kl": 0.08251953125, "learning_rate": 2.0841300191204588e-07, "loss": 0.0033, "reward": 1.7703495025634766, "reward_std": 0.10711017996072769, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5359745025634766, "step": 2484 }, { "completion_length": 379.703125, "epoch": 0.7919056724028043, "grad_norm": 8.184744834899902, "kl": 0.06005859375, "learning_rate": 2.0809432759719566e-07, "loss": 0.0024, "reward": 1.3782320022583008, "reward_std": 0.030829811468720436, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.25323206186294556, "step": 2485 }, { "completion_length": 243.953125, "epoch": 0.7922243467176545, "grad_norm": 13.434769630432129, "kl": 0.083984375, "learning_rate": 2.0777565328234542e-07, "loss": 0.0034, "reward": 1.5161212682724, "reward_std": 0.16218221187591553, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5317463278770447, "step": 2486 }, { "completion_length": 190.1875, "epoch": 0.7925430210325047, "grad_norm": 13.576552391052246, "kl": 0.091796875, "learning_rate": 2.074569789674952e-07, "loss": 0.0037, "reward": 1.6686426401138306, "reward_std": 0.041518136858940125, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5436426997184753, "rewards/pad": 0.125, "step": 2487 }, { "completion_length": 201.40625, "epoch": 0.792861695347355, "grad_norm": 24.43422508239746, "kl": 0.1103515625, "learning_rate": 2.0713830465264498e-07, "loss": 0.0044, "reward": 1.7355256080627441, "reward_std": 0.11372970044612885, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.6261506080627441, "step": 2488 }, { "completion_length": 222.03125, "epoch": 0.7931803696622052, "grad_norm": 12.765557289123535, "kl": 0.09033203125, "learning_rate": 2.0681963033779476e-07, "loss": 0.0036, "reward": 1.6278808116912842, "reward_std": 0.09283436834812164, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5028807520866394, "rewards/pad": 0.125, "step": 2489 }, { "completion_length": 145.171875, "epoch": 0.7934990439770554, "grad_norm": 12.70199966430664, "kl": 0.123046875, "learning_rate": 2.0650095602294454e-07, "loss": 0.0049, "reward": 1.6430946588516235, "reward_std": 0.0791141539812088, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6430947184562683, "rewards/pad": 0.0, "step": 2490 }, { "completion_length": 260.375, "epoch": 0.7938177182919057, "grad_norm": 10.663228988647461, "kl": 0.0927734375, "learning_rate": 2.0618228170809432e-07, "loss": 0.0037, "reward": 1.3959004878997803, "reward_std": 0.09733359515666962, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3959004580974579, "step": 2491 }, { "completion_length": 401.734375, "epoch": 0.794136392606756, "grad_norm": 24.024709701538086, "kl": 0.04931640625, "learning_rate": 2.058636073932441e-07, "loss": 0.002, "reward": 1.450697898864746, "reward_std": 0.026952486485242844, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.32569780945777893, "step": 2492 }, { "completion_length": 221.40625, "epoch": 0.7944550669216062, "grad_norm": 9.899370193481445, "kl": 0.09765625, "learning_rate": 2.0554493307839388e-07, "loss": 0.0039, "reward": 1.6808327436447144, "reward_std": 0.11763062328100204, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.44645780324935913, "step": 2493 }, { "completion_length": 407.140625, "epoch": 0.7947737412364564, "grad_norm": 22.02533721923828, "kl": 0.052978515625, "learning_rate": 2.0522625876354366e-07, "loss": 0.0021, "reward": 1.3636521100997925, "reward_std": 0.027319133281707764, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3636520504951477, "step": 2494 }, { "completion_length": 198.53125, "epoch": 0.7950924155513066, "grad_norm": 9.314778327941895, "kl": 0.08837890625, "learning_rate": 2.0490758444869344e-07, "loss": 0.0035, "reward": 1.634337306022644, "reward_std": 0.09219536185264587, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3843373954296112, "rewards/pad": 0.25, "step": 2495 }, { "completion_length": 110.8125, "epoch": 0.7954110898661568, "grad_norm": 7.935081481933594, "kl": 0.10986328125, "learning_rate": 2.045889101338432e-07, "loss": 0.0044, "reward": 1.6727542877197266, "reward_std": 0.06368027627468109, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.42275434732437134, "rewards/pad": 0.25, "step": 2496 }, { "completion_length": 145.6875, "epoch": 0.795729764181007, "grad_norm": 9.061888694763184, "kl": 0.1103515625, "learning_rate": 2.0427023581899297e-07, "loss": 0.0044, "reward": 1.5904161930084229, "reward_std": 0.05827527120709419, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5904163122177124, "rewards/pad": 0.0, "step": 2497 }, { "completion_length": 256.625, "epoch": 0.7960484384958573, "grad_norm": 8.258540153503418, "kl": 0.083984375, "learning_rate": 2.0395156150414276e-07, "loss": 0.0034, "reward": 1.4634907245635986, "reward_std": 0.08649997413158417, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4634907841682434, "rewards/pad": 0.0, "step": 2498 }, { "completion_length": 207.59375, "epoch": 0.7963671128107075, "grad_norm": 10.039529800415039, "kl": 0.11474609375, "learning_rate": 2.0363288718929254e-07, "loss": 0.0046, "reward": 1.4351905584335327, "reward_std": 0.08591999113559723, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4195655584335327, "step": 2499 }, { "completion_length": 341.109375, "epoch": 0.7966857871255577, "grad_norm": 7.5028395652771, "kl": 0.058837890625, "learning_rate": 2.0331421287444232e-07, "loss": 0.0024, "reward": 1.5099382400512695, "reward_std": 0.0751279965043068, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.38493824005126953, "step": 2500 }, { "completion_length": 395.671875, "epoch": 0.7970044614404079, "grad_norm": 7.090115070343018, "kl": 0.059326171875, "learning_rate": 2.029955385595921e-07, "loss": 0.0024, "reward": 1.5254924297332764, "reward_std": 0.07720702141523361, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5254924297332764, "rewards/pad": 0.0, "step": 2501 }, { "completion_length": 207.59375, "epoch": 0.7973231357552581, "grad_norm": 7.278848171234131, "kl": 0.0849609375, "learning_rate": 2.0267686424474188e-07, "loss": 0.0034, "reward": 1.6780972480773926, "reward_std": 0.0748632550239563, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4280971586704254, "rewards/pad": 0.25, "step": 2502 }, { "completion_length": 302.0, "epoch": 0.7976418100701084, "grad_norm": 5.311611175537109, "kl": 0.07177734375, "learning_rate": 2.0235818992989166e-07, "loss": 0.0029, "reward": 1.7154650688171387, "reward_std": 0.058912493288517, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5904650688171387, "step": 2503 }, { "completion_length": 317.140625, "epoch": 0.7979604843849586, "grad_norm": 12.959879875183105, "kl": 0.0615234375, "learning_rate": 2.0203951561504144e-07, "loss": 0.0025, "reward": 1.583716630935669, "reward_std": 0.09927290678024292, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3493417799472809, "rewards/pad": 0.25, "step": 2504 }, { "completion_length": 240.96875, "epoch": 0.7982791586998088, "grad_norm": 8.32062816619873, "kl": 0.07373046875, "learning_rate": 2.0172084130019122e-07, "loss": 0.0029, "reward": 1.3935962915420532, "reward_std": 0.042469993233680725, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.39359626173973083, "step": 2505 }, { "completion_length": 228.125, "epoch": 0.798597833014659, "grad_norm": 22.151350021362305, "kl": 0.0859375, "learning_rate": 2.0140216698534097e-07, "loss": 0.0034, "reward": 1.4923195838928223, "reward_std": 0.03624539077281952, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4923197031021118, "step": 2506 }, { "completion_length": 317.390625, "epoch": 0.7989165073295093, "grad_norm": 11.759739875793457, "kl": 0.0693359375, "learning_rate": 2.0108349267049073e-07, "loss": 0.0028, "reward": 1.5153393745422363, "reward_std": 0.07655253261327744, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3903394043445587, "step": 2507 }, { "completion_length": 242.796875, "epoch": 0.7992351816443595, "grad_norm": 7.063753128051758, "kl": 0.09375, "learning_rate": 2.007648183556405e-07, "loss": 0.0037, "reward": 1.5463790893554688, "reward_std": 0.08136877417564392, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5463790893554688, "rewards/pad": 0.0, "step": 2508 }, { "completion_length": 146.03125, "epoch": 0.7995538559592097, "grad_norm": 8.412872314453125, "kl": 0.1064453125, "learning_rate": 2.004461440407903e-07, "loss": 0.0043, "reward": 1.742506742477417, "reward_std": 0.09771303087472916, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.664381742477417, "rewards/pad": 0.078125, "step": 2509 }, { "completion_length": 272.90625, "epoch": 0.7998725302740599, "grad_norm": 16.04229736328125, "kl": 0.080078125, "learning_rate": 2.0012746972594007e-07, "loss": 0.0032, "reward": 1.7979576587677002, "reward_std": 0.07358251512050629, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5479576587677002, "step": 2510 }, { "completion_length": 151.4375, "epoch": 0.8001912045889101, "grad_norm": 15.322516441345215, "kl": 0.1298828125, "learning_rate": 1.9980879541108985e-07, "loss": 0.0052, "reward": 1.6307207345962524, "reward_std": 0.08559076488018036, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6307207345962524, "step": 2511 }, { "completion_length": 361.640625, "epoch": 0.8005098789037604, "grad_norm": 8.897799491882324, "kl": 0.07470703125, "learning_rate": 1.9949012109623963e-07, "loss": 0.003, "reward": 1.5593959093093872, "reward_std": 0.05124068260192871, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5593959093093872, "step": 2512 }, { "completion_length": 280.03125, "epoch": 0.8008285532186106, "grad_norm": 10.985091209411621, "kl": 0.0830078125, "learning_rate": 1.991714467813894e-07, "loss": 0.0033, "reward": 1.8225293159484863, "reward_std": 0.1210012435913086, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5881543159484863, "step": 2513 }, { "completion_length": 292.1875, "epoch": 0.8011472275334608, "grad_norm": 7.801126956939697, "kl": 0.087890625, "learning_rate": 1.988527724665392e-07, "loss": 0.0035, "reward": 1.5416383743286133, "reward_std": 0.05828118696808815, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4166383743286133, "step": 2514 }, { "completion_length": 350.984375, "epoch": 0.801465901848311, "grad_norm": 5.292451858520508, "kl": 0.06005859375, "learning_rate": 1.9853409815168897e-07, "loss": 0.0024, "reward": 1.5692921876907349, "reward_std": 0.032198529690504074, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5692921280860901, "rewards/pad": 0.0, "step": 2515 }, { "completion_length": 286.109375, "epoch": 0.8017845761631612, "grad_norm": 9.00987720489502, "kl": 0.08251953125, "learning_rate": 1.9821542383683872e-07, "loss": 0.0033, "reward": 1.565173864364624, "reward_std": 0.07822804898023605, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4557989239692688, "rewards/pad": 0.109375, "step": 2516 }, { "completion_length": 305.640625, "epoch": 0.8021032504780115, "grad_norm": 8.103169441223145, "kl": 0.06787109375, "learning_rate": 1.978967495219885e-07, "loss": 0.0027, "reward": 1.5759283304214478, "reward_std": 0.06781023740768433, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.575928270816803, "step": 2517 }, { "completion_length": 220.609375, "epoch": 0.8024219247928617, "grad_norm": 10.552177429199219, "kl": 0.095703125, "learning_rate": 1.9757807520713829e-07, "loss": 0.0038, "reward": 1.570098876953125, "reward_std": 0.09314015507698059, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4450989067554474, "rewards/pad": 0.125, "step": 2518 }, { "completion_length": 256.75, "epoch": 0.8027405991077119, "grad_norm": 13.762317657470703, "kl": 0.06982421875, "learning_rate": 1.9725940089228807e-07, "loss": 0.0028, "reward": 1.7126691341400146, "reward_std": 0.12554529309272766, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4782941937446594, "rewards/pad": 0.234375, "step": 2519 }, { "completion_length": 219.484375, "epoch": 0.8030592734225621, "grad_norm": 10.376080513000488, "kl": 0.08544921875, "learning_rate": 1.9694072657743785e-07, "loss": 0.0034, "reward": 1.743795394897461, "reward_std": 0.07224259525537491, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4937954843044281, "step": 2520 }, { "completion_length": 316.265625, "epoch": 0.8033779477374123, "grad_norm": 7.719282150268555, "kl": 0.0703125, "learning_rate": 1.9662205226258763e-07, "loss": 0.0028, "reward": 1.6823227405548096, "reward_std": 0.19630685448646545, "rewards/pad": 0.15625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5416978597640991, "step": 2521 }, { "completion_length": 245.0625, "epoch": 0.8036966220522626, "grad_norm": 9.966045379638672, "kl": 0.08544921875, "learning_rate": 1.963033779477374e-07, "loss": 0.0034, "reward": 1.722424864768982, "reward_std": 0.04164808616042137, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5974248647689819, "step": 2522 }, { "completion_length": 404.546875, "epoch": 0.8040152963671128, "grad_norm": 9.246283531188965, "kl": 0.0625, "learning_rate": 1.959847036328872e-07, "loss": 0.0025, "reward": 1.5026819705963135, "reward_std": 0.13315097987651825, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4089319407939911, "step": 2523 }, { "completion_length": 249.3125, "epoch": 0.804333970681963, "grad_norm": 7.0430169105529785, "kl": 0.0791015625, "learning_rate": 1.9566602931803697e-07, "loss": 0.0032, "reward": 1.777950406074524, "reward_std": 0.15799814462661743, "rewards/pad": 0.34375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43420034646987915, "step": 2524 }, { "completion_length": 300.484375, "epoch": 0.8046526449968132, "grad_norm": 14.292428016662598, "kl": 0.078125, "learning_rate": 1.9534735500318675e-07, "loss": 0.0031, "reward": 1.4989312887191772, "reward_std": 0.05096810311079025, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49893131852149963, "rewards/pad": 0.0, "step": 2525 }, { "completion_length": 301.5625, "epoch": 0.8049713193116634, "grad_norm": 8.920188903808594, "kl": 0.083984375, "learning_rate": 1.950286806883365e-07, "loss": 0.0034, "reward": 1.684213399887085, "reward_std": 0.13281765580177307, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5748384594917297, "rewards/pad": 0.125, "step": 2526 }, { "completion_length": 248.40625, "epoch": 0.8052899936265137, "grad_norm": 21.75054931640625, "kl": 0.0703125, "learning_rate": 1.9471000637348628e-07, "loss": 0.0028, "reward": 1.7569079399108887, "reward_std": 0.12462820112705231, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5381580591201782, "rewards/pad": 0.25, "step": 2527 }, { "completion_length": 294.828125, "epoch": 0.8056086679413639, "grad_norm": 6.906178951263428, "kl": 0.0615234375, "learning_rate": 1.9439133205863606e-07, "loss": 0.0025, "reward": 1.5684860944747925, "reward_std": 0.10319699347019196, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4591110646724701, "step": 2528 }, { "completion_length": 127.1875, "epoch": 0.8059273422562141, "grad_norm": 53.47869110107422, "kl": 0.1376953125, "learning_rate": 1.9407265774378584e-07, "loss": 0.0055, "reward": 1.8714401721954346, "reward_std": 0.11145314574241638, "rewards/answer_reward": 0.375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.49644017219543457, "step": 2529 }, { "completion_length": 225.71875, "epoch": 0.8062460165710643, "grad_norm": 8.28803539276123, "kl": 0.09326171875, "learning_rate": 1.9375398342893563e-07, "loss": 0.0037, "reward": 1.4719030857086182, "reward_std": 0.07403188943862915, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.471902996301651, "step": 2530 }, { "completion_length": 255.265625, "epoch": 0.8065646908859146, "grad_norm": 15.238947868347168, "kl": 0.08544921875, "learning_rate": 1.934353091140854e-07, "loss": 0.0034, "reward": 1.6258214712142944, "reward_std": 0.06549122929573059, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6258213520050049, "step": 2531 }, { "completion_length": 240.4375, "epoch": 0.8068833652007649, "grad_norm": 16.703702926635742, "kl": 0.091796875, "learning_rate": 1.9311663479923519e-07, "loss": 0.0037, "reward": 1.4397114515304565, "reward_std": 0.13731428980827332, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.42408645153045654, "rewards/pad": 0.03125, "step": 2532 }, { "completion_length": 342.203125, "epoch": 0.8072020395156151, "grad_norm": 9.475166320800781, "kl": 0.0771484375, "learning_rate": 1.9279796048438497e-07, "loss": 0.0031, "reward": 1.584092617034912, "reward_std": 0.14641991257667542, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.6153426766395569, "step": 2533 }, { "completion_length": 293.265625, "epoch": 0.8075207138304653, "grad_norm": 10.645809173583984, "kl": 0.06689453125, "learning_rate": 1.9247928616953475e-07, "loss": 0.0027, "reward": 1.8401165008544922, "reward_std": 0.0870843380689621, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.715116560459137, "rewards/pad": 0.125, "step": 2534 }, { "completion_length": 313.296875, "epoch": 0.8078393881453155, "grad_norm": 4.8072428703308105, "kl": 0.146484375, "learning_rate": 1.921606118546845e-07, "loss": 0.0059, "reward": 1.4932180643081665, "reward_std": 0.06363866478204727, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3682180643081665, "step": 2535 }, { "completion_length": 225.609375, "epoch": 0.8081580624601657, "grad_norm": 17.09768295288086, "kl": 0.07958984375, "learning_rate": 1.9184193753983428e-07, "loss": 0.0032, "reward": 1.644900918006897, "reward_std": 0.14637455344200134, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.41052594780921936, "step": 2536 }, { "completion_length": 242.953125, "epoch": 0.808476736775016, "grad_norm": 85.25773620605469, "kl": 0.0830078125, "learning_rate": 1.9152326322498406e-07, "loss": 0.0033, "reward": 1.539243459701538, "reward_std": 0.05695287883281708, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5392435193061829, "rewards/pad": 0.0, "step": 2537 }, { "completion_length": 318.828125, "epoch": 0.8087954110898662, "grad_norm": 9.385004997253418, "kl": 0.07421875, "learning_rate": 1.9120458891013382e-07, "loss": 0.003, "reward": 1.5466983318328857, "reward_std": 0.030538583174347878, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5466983318328857, "step": 2538 }, { "completion_length": 362.859375, "epoch": 0.8091140854047164, "grad_norm": 17.053071975708008, "kl": 0.08349609375, "learning_rate": 1.908859145952836e-07, "loss": 0.0033, "reward": 1.459040641784668, "reward_std": 0.04085429757833481, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.45904070138931274, "step": 2539 }, { "completion_length": 199.28125, "epoch": 0.8094327597195666, "grad_norm": 11.204050064086914, "kl": 0.1015625, "learning_rate": 1.9056724028043338e-07, "loss": 0.0041, "reward": 1.6462476253509521, "reward_std": 0.08865442872047424, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5212478041648865, "step": 2540 }, { "completion_length": 103.6875, "epoch": 0.8097514340344169, "grad_norm": 17.3836612701416, "kl": 0.134765625, "learning_rate": 1.9024856596558316e-07, "loss": 0.0054, "reward": 1.7457025051116943, "reward_std": 0.08871598541736603, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6207026243209839, "rewards/pad": 0.125, "step": 2541 }, { "completion_length": 219.5, "epoch": 0.8100701083492671, "grad_norm": 12.529458999633789, "kl": 0.10009765625, "learning_rate": 1.8992989165073294e-07, "loss": 0.004, "reward": 1.6205840110778809, "reward_std": 0.054811395704746246, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49558398127555847, "rewards/pad": 0.125, "step": 2542 }, { "completion_length": 153.234375, "epoch": 0.8103887826641173, "grad_norm": 21.83299446105957, "kl": 0.0986328125, "learning_rate": 1.8961121733588272e-07, "loss": 0.0039, "reward": 1.8295905590057373, "reward_std": 0.14615780115127563, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5952155590057373, "step": 2543 }, { "completion_length": 434.453125, "epoch": 0.8107074569789675, "grad_norm": 13.690393447875977, "kl": 0.0498046875, "learning_rate": 1.892925430210325e-07, "loss": 0.002, "reward": 1.5930736064910889, "reward_std": 0.03718571364879608, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4680735766887665, "step": 2544 }, { "completion_length": 109.1875, "epoch": 0.8110261312938177, "grad_norm": 17.369874954223633, "kl": 0.234375, "learning_rate": 1.8897386870618225e-07, "loss": 0.0094, "reward": 1.766268253326416, "reward_std": 0.10231577605009079, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.641268253326416, "rewards/pad": 0.125, "step": 2545 }, { "completion_length": 246.703125, "epoch": 0.811344805608668, "grad_norm": 22.70635986328125, "kl": 0.09326171875, "learning_rate": 1.8865519439133203e-07, "loss": 0.0037, "reward": 1.5770329236984253, "reward_std": 0.06733634322881699, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5770329236984253, "rewards/pad": 0.0, "step": 2546 }, { "completion_length": 154.921875, "epoch": 0.8116634799235182, "grad_norm": 118.38398742675781, "kl": 0.10986328125, "learning_rate": 1.8833652007648181e-07, "loss": 0.0044, "reward": 1.5974708795547485, "reward_std": 0.06037134677171707, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5974709391593933, "rewards/pad": 0.0, "step": 2547 }, { "completion_length": 427.75, "epoch": 0.8119821542383684, "grad_norm": 7.559087753295898, "kl": 0.049072265625, "learning_rate": 1.880178457616316e-07, "loss": 0.002, "reward": 1.5248229503631592, "reward_std": 0.12492824345827103, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4154479503631592, "step": 2548 }, { "completion_length": 266.109375, "epoch": 0.8123008285532186, "grad_norm": 14.66143798828125, "kl": 0.0732421875, "learning_rate": 1.8769917144678138e-07, "loss": 0.0029, "reward": 1.530194640159607, "reward_std": 0.12328522652387619, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.40519458055496216, "rewards/pad": 0.125, "step": 2549 }, { "completion_length": 203.40625, "epoch": 0.8126195028680688, "grad_norm": 12.825209617614746, "kl": 0.11279296875, "learning_rate": 1.8738049713193116e-07, "loss": 0.0045, "reward": 1.687673568725586, "reward_std": 0.14001277089118958, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5782985687255859, "step": 2550 }, { "completion_length": 341.703125, "epoch": 0.812938177182919, "grad_norm": 5.511659622192383, "kl": 0.056396484375, "learning_rate": 1.8706182281708094e-07, "loss": 0.0023, "reward": 1.509248971939087, "reward_std": 0.05761463940143585, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5092489719390869, "step": 2551 }, { "completion_length": 297.109375, "epoch": 0.8132568514977693, "grad_norm": 15.541271209716797, "kl": 0.060791015625, "learning_rate": 1.8674314850223072e-07, "loss": 0.0024, "reward": 1.5715813636779785, "reward_std": 0.085521399974823, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3372064232826233, "rewards/pad": 0.25, "step": 2552 }, { "completion_length": 269.765625, "epoch": 0.8135755258126195, "grad_norm": 6.019388198852539, "kl": 0.08935546875, "learning_rate": 1.864244741873805e-07, "loss": 0.0036, "reward": 1.7263768911361694, "reward_std": 0.10301776975393295, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6013767719268799, "step": 2553 }, { "completion_length": 311.90625, "epoch": 0.8138942001274697, "grad_norm": 5.30941104888916, "kl": 0.0849609375, "learning_rate": 1.8610579987253028e-07, "loss": 0.0034, "reward": 1.5381710529327393, "reward_std": 0.16679885983467102, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.47567105293273926, "step": 2554 }, { "completion_length": 304.328125, "epoch": 0.8142128744423199, "grad_norm": 14.053765296936035, "kl": 0.068359375, "learning_rate": 1.8578712555768003e-07, "loss": 0.0027, "reward": 1.3786784410476685, "reward_std": 0.1604098379611969, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.33180350065231323, "rewards/pad": 0.0625, "step": 2555 }, { "completion_length": 320.921875, "epoch": 0.8145315487571702, "grad_norm": 7.8060479164123535, "kl": 0.0751953125, "learning_rate": 1.854684512428298e-07, "loss": 0.003, "reward": 1.6688108444213867, "reward_std": 0.1284911185503006, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4344358742237091, "rewards/pad": 0.25, "step": 2556 }, { "completion_length": 310.8125, "epoch": 0.8148502230720204, "grad_norm": 9.67390251159668, "kl": 0.0634765625, "learning_rate": 1.851497769279796e-07, "loss": 0.0025, "reward": 1.7895619869232178, "reward_std": 0.05394981428980827, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.414561927318573, "rewards/pad": 0.375, "step": 2557 }, { "completion_length": 285.828125, "epoch": 0.8151688973868706, "grad_norm": 5.931857109069824, "kl": 0.07763671875, "learning_rate": 1.8483110261312937e-07, "loss": 0.0031, "reward": 1.5367307662963867, "reward_std": 0.04691781848669052, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5367306470870972, "step": 2558 }, { "completion_length": 220.203125, "epoch": 0.8154875717017208, "grad_norm": 14.692937850952148, "kl": 0.08642578125, "learning_rate": 1.8451242829827915e-07, "loss": 0.0035, "reward": 1.648263931274414, "reward_std": 0.07860487699508667, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5232639312744141, "rewards/pad": 0.125, "step": 2559 }, { "completion_length": 394.859375, "epoch": 0.815806246016571, "grad_norm": 18.749855041503906, "kl": 0.06982421875, "learning_rate": 1.8419375398342893e-07, "loss": 0.0028, "reward": 1.6806821823120117, "reward_std": 0.055952079594135284, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5556822419166565, "step": 2560 }, { "completion_length": 409.359375, "epoch": 0.8161249203314213, "grad_norm": 6.769157409667969, "kl": 0.0556640625, "learning_rate": 1.8387507966857871e-07, "loss": 0.0022, "reward": 1.4579136371612549, "reward_std": 0.10316675901412964, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.47353869676589966, "rewards/pad": 0.0, "step": 2561 }, { "completion_length": 229.09375, "epoch": 0.8164435946462715, "grad_norm": 16.578529357910156, "kl": 0.07421875, "learning_rate": 1.835564053537285e-07, "loss": 0.003, "reward": 1.8228061199188232, "reward_std": 0.10866206139326096, "rewards/pad": 0.375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.46343109011650085, "step": 2562 }, { "completion_length": 221.390625, "epoch": 0.8167622689611217, "grad_norm": 6.562174320220947, "kl": 0.0908203125, "learning_rate": 1.8323773103887828e-07, "loss": 0.0036, "reward": 1.7422194480895996, "reward_std": 0.14775794744491577, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5078444480895996, "rewards/pad": 0.25, "step": 2563 }, { "completion_length": 348.828125, "epoch": 0.8170809432759719, "grad_norm": 11.428489685058594, "kl": 0.0556640625, "learning_rate": 1.8291905672402806e-07, "loss": 0.0022, "reward": 1.4955813884735107, "reward_std": 0.0692591667175293, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49558135867118835, "step": 2564 }, { "completion_length": 271.328125, "epoch": 0.8173996175908221, "grad_norm": 10.209710121154785, "kl": 0.08251953125, "learning_rate": 1.826003824091778e-07, "loss": 0.0033, "reward": 1.4857373237609863, "reward_std": 0.05850472301244736, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48573732376098633, "rewards/pad": 0.0, "step": 2565 }, { "completion_length": 197.640625, "epoch": 0.8177182919056724, "grad_norm": 8.506927490234375, "kl": 0.10400390625, "learning_rate": 1.822817080943276e-07, "loss": 0.0042, "reward": 1.485414743423462, "reward_std": 0.07199355959892273, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4854147434234619, "step": 2566 }, { "completion_length": 255.4375, "epoch": 0.8180369662205226, "grad_norm": 8.188014030456543, "kl": 0.0732421875, "learning_rate": 1.8196303377947737e-07, "loss": 0.0029, "reward": 1.5830761194229126, "reward_std": 0.1581452488899231, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3643261194229126, "step": 2567 }, { "completion_length": 287.8125, "epoch": 0.8183556405353728, "grad_norm": 9.338285446166992, "kl": 0.07421875, "learning_rate": 1.8164435946462715e-07, "loss": 0.003, "reward": 1.59792160987854, "reward_std": 0.07224056124687195, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.59792160987854, "step": 2568 }, { "completion_length": 261.0625, "epoch": 0.818674314850223, "grad_norm": 43.08826446533203, "kl": 0.083984375, "learning_rate": 1.813256851497769e-07, "loss": 0.0034, "reward": 1.6259100437164307, "reward_std": 0.15336614847183228, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.39153510332107544, "rewards/pad": 0.234375, "step": 2569 }, { "completion_length": 206.421875, "epoch": 0.8189929891650733, "grad_norm": 10.55008602142334, "kl": 0.0888671875, "learning_rate": 1.8100701083492669e-07, "loss": 0.0036, "reward": 1.8721909523010254, "reward_std": 0.0830574557185173, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.7471910119056702, "step": 2570 }, { "completion_length": 211.5, "epoch": 0.8193116634799236, "grad_norm": 13.250950813293457, "kl": 0.07958984375, "learning_rate": 1.8068833652007647e-07, "loss": 0.0032, "reward": 1.6348446607589722, "reward_std": 0.09269119054079056, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4004696011543274, "rewards/pad": 0.234375, "step": 2571 }, { "completion_length": 259.75, "epoch": 0.8196303377947738, "grad_norm": 9.012632369995117, "kl": 0.08837890625, "learning_rate": 1.8036966220522625e-07, "loss": 0.0035, "reward": 1.5878616571426392, "reward_std": 0.05733507499098778, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4628615975379944, "rewards/pad": 0.125, "step": 2572 }, { "completion_length": 174.53125, "epoch": 0.819949012109624, "grad_norm": 23.10243034362793, "kl": 0.08642578125, "learning_rate": 1.8005098789037603e-07, "loss": 0.0035, "reward": 1.8872909545898438, "reward_std": 0.09623947739601135, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.6372909545898438, "step": 2573 }, { "completion_length": 259.578125, "epoch": 0.8202676864244742, "grad_norm": 10.32063102722168, "kl": 0.1435546875, "learning_rate": 1.797323135755258e-07, "loss": 0.0057, "reward": 1.5099594593048096, "reward_std": 0.04081812873482704, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5099595785140991, "step": 2574 }, { "completion_length": 153.90625, "epoch": 0.8205863607393244, "grad_norm": 18.978233337402344, "kl": 0.10009765625, "learning_rate": 1.7941363926067556e-07, "loss": 0.004, "reward": 1.6833765506744385, "reward_std": 0.15838143229484558, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.44900164008140564, "step": 2575 }, { "completion_length": 164.03125, "epoch": 0.8209050350541747, "grad_norm": 23.3841495513916, "kl": 0.0869140625, "learning_rate": 1.7909496494582534e-07, "loss": 0.0035, "reward": 1.5571430921554565, "reward_std": 0.1657448410987854, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5258930325508118, "rewards/pad": 0.046875, "step": 2576 }, { "completion_length": 208.15625, "epoch": 0.8212237093690249, "grad_norm": 11.401373863220215, "kl": 0.07861328125, "learning_rate": 1.7877629063097512e-07, "loss": 0.0031, "reward": 1.849107265472412, "reward_std": 0.07179224491119385, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5991072058677673, "step": 2577 }, { "completion_length": 207.796875, "epoch": 0.8215423836838751, "grad_norm": 6.517545223236084, "kl": 0.07763671875, "learning_rate": 1.784576163161249e-07, "loss": 0.0031, "reward": 1.6891777515411377, "reward_std": 0.12289692461490631, "rewards/answer_reward": 0.375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3298027217388153, "step": 2578 }, { "completion_length": 400.546875, "epoch": 0.8218610579987253, "grad_norm": 3.41658353805542, "kl": 0.052490234375, "learning_rate": 1.7813894200127468e-07, "loss": 0.0021, "reward": 1.3222894668579102, "reward_std": 0.026268957182765007, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.1972896009683609, "step": 2579 }, { "completion_length": 336.28125, "epoch": 0.8221797323135756, "grad_norm": 6.9868974685668945, "kl": 0.05615234375, "learning_rate": 1.7782026768642447e-07, "loss": 0.0023, "reward": 1.4629323482513428, "reward_std": 0.1270006000995636, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.47855743765830994, "step": 2580 }, { "completion_length": 169.578125, "epoch": 0.8224984066284258, "grad_norm": 41.07691955566406, "kl": 0.115234375, "learning_rate": 1.7750159337157425e-07, "loss": 0.0046, "reward": 1.686937689781189, "reward_std": 0.17813950777053833, "rewards/answer_reward": 0.21875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.46818774938583374, "step": 2581 }, { "completion_length": 268.8125, "epoch": 0.822817080943276, "grad_norm": 7.311337947845459, "kl": 0.08544921875, "learning_rate": 1.7718291905672403e-07, "loss": 0.0034, "reward": 1.6053481101989746, "reward_std": 0.05105286091566086, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6053481698036194, "step": 2582 }, { "completion_length": 252.90625, "epoch": 0.8231357552581262, "grad_norm": 9.382682800292969, "kl": 0.1025390625, "learning_rate": 1.768642447418738e-07, "loss": 0.0041, "reward": 1.7015126943588257, "reward_std": 0.14050918817520142, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4827626943588257, "rewards/pad": 0.234375, "step": 2583 }, { "completion_length": 216.9375, "epoch": 0.8234544295729764, "grad_norm": 9.241425514221191, "kl": 0.1171875, "learning_rate": 1.7654557042702356e-07, "loss": 0.0047, "reward": 1.474588394165039, "reward_std": 0.0695609524846077, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47458839416503906, "rewards/pad": 0.0, "step": 2584 }, { "completion_length": 348.734375, "epoch": 0.8237731038878267, "grad_norm": 7.2932448387146, "kl": 0.0673828125, "learning_rate": 1.7622689611217334e-07, "loss": 0.0027, "reward": 1.5508739948272705, "reward_std": 0.0942344218492508, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5664990544319153, "rewards/pad": 0.0, "step": 2585 }, { "completion_length": 191.140625, "epoch": 0.8240917782026769, "grad_norm": 15.595159530639648, "kl": 0.08935546875, "learning_rate": 1.7590822179732312e-07, "loss": 0.0036, "reward": 1.4914007186889648, "reward_std": 0.0594581663608551, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.366400808095932, "rewards/pad": 0.125, "step": 2586 }, { "completion_length": 209.671875, "epoch": 0.8244104525175271, "grad_norm": 10.024636268615723, "kl": 0.11279296875, "learning_rate": 1.755895474824729e-07, "loss": 0.0045, "reward": 1.5031325817108154, "reward_std": 0.09866450726985931, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5187575221061707, "rewards/pad": 0.0, "step": 2587 }, { "completion_length": 204.75, "epoch": 0.8247291268323773, "grad_norm": 24.551767349243164, "kl": 0.10595703125, "learning_rate": 1.7527087316762268e-07, "loss": 0.0042, "reward": 1.5804860591888428, "reward_std": 0.10609886050224304, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5961111187934875, "step": 2588 }, { "completion_length": 212.84375, "epoch": 0.8250478011472275, "grad_norm": 13.566366195678711, "kl": 0.08935546875, "learning_rate": 1.7495219885277246e-07, "loss": 0.0036, "reward": 1.7298917770385742, "reward_std": 0.10028212517499924, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6048917174339294, "rewards/pad": 0.125, "step": 2589 }, { "completion_length": 300.640625, "epoch": 0.8253664754620778, "grad_norm": 10.322278022766113, "kl": 0.07568359375, "learning_rate": 1.7463352453792224e-07, "loss": 0.003, "reward": 1.5587718486785889, "reward_std": 0.16429835557937622, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.953125, "rewards/tracking_iou_reward": 0.48064684867858887, "step": 2590 }, { "completion_length": 256.109375, "epoch": 0.825685149776928, "grad_norm": 7.206241607666016, "kl": 0.08544921875, "learning_rate": 1.7431485022307202e-07, "loss": 0.0034, "reward": 1.5976800918579102, "reward_std": 0.09668730199337006, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4883049726486206, "step": 2591 }, { "completion_length": 214.484375, "epoch": 0.8260038240917782, "grad_norm": 10.769760131835938, "kl": 0.0849609375, "learning_rate": 1.739961759082218e-07, "loss": 0.0034, "reward": 1.6574702262878418, "reward_std": 0.06978029012680054, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5324702262878418, "rewards/pad": 0.125, "step": 2592 }, { "completion_length": 371.171875, "epoch": 0.8263224984066284, "grad_norm": 9.234620094299316, "kl": 0.05908203125, "learning_rate": 1.7367750159337159e-07, "loss": 0.0024, "reward": 1.3417534828186035, "reward_std": 0.08860547840595245, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.35737860202789307, "step": 2593 }, { "completion_length": 268.015625, "epoch": 0.8266411727214786, "grad_norm": 40.45941925048828, "kl": 0.0751953125, "learning_rate": 1.7335882727852134e-07, "loss": 0.003, "reward": 1.4371353387832642, "reward_std": 0.12356918305158615, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.218385249376297, "step": 2594 }, { "completion_length": 292.671875, "epoch": 0.8269598470363289, "grad_norm": 9.114712715148926, "kl": 0.08056640625, "learning_rate": 1.7304015296367112e-07, "loss": 0.0032, "reward": 1.538050889968872, "reward_std": 0.057188645005226135, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5380508303642273, "step": 2595 }, { "completion_length": 225.78125, "epoch": 0.8272785213511791, "grad_norm": 13.69244384765625, "kl": 0.07470703125, "learning_rate": 1.727214786488209e-07, "loss": 0.003, "reward": 1.7152938842773438, "reward_std": 0.05327215790748596, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.46529388427734375, "step": 2596 }, { "completion_length": 337.6875, "epoch": 0.8275971956660293, "grad_norm": 19.979782104492188, "kl": 0.0703125, "learning_rate": 1.7240280433397068e-07, "loss": 0.0028, "reward": 1.6229982376098633, "reward_std": 0.11573445796966553, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.38862332701683044, "rewards/pad": 0.25, "step": 2597 }, { "completion_length": 288.65625, "epoch": 0.8279158699808795, "grad_norm": 10.287363052368164, "kl": 0.0751953125, "learning_rate": 1.7208413001912046e-07, "loss": 0.003, "reward": 1.3860034942626953, "reward_std": 0.08018055558204651, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3860034942626953, "rewards/pad": 0.0, "step": 2598 }, { "completion_length": 314.4375, "epoch": 0.8282345442957297, "grad_norm": 31.766921997070312, "kl": 0.076171875, "learning_rate": 1.7176545570427024e-07, "loss": 0.003, "reward": 1.3634631633758545, "reward_std": 0.10952109098434448, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.37908822298049927, "step": 2599 }, { "completion_length": 271.296875, "epoch": 0.82855321861058, "grad_norm": 16.726682662963867, "kl": 0.07080078125, "learning_rate": 1.7144678138942e-07, "loss": 0.0028, "reward": 1.6758776903152466, "reward_std": 0.07480183988809586, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5508776903152466, "rewards/pad": 0.125, "step": 2600 }, { "completion_length": 241.3125, "epoch": 0.8288718929254302, "grad_norm": 14.826166152954102, "kl": 0.1044921875, "learning_rate": 1.7112810707456978e-07, "loss": 0.0042, "reward": 1.5708891153335571, "reward_std": 0.08528805524110794, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5708891153335571, "rewards/pad": 0.0, "step": 2601 }, { "completion_length": 371.375, "epoch": 0.8291905672402804, "grad_norm": 22.454978942871094, "kl": 0.0556640625, "learning_rate": 1.7080943275971956e-07, "loss": 0.0022, "reward": 1.581399917602539, "reward_std": 0.11798357963562012, "rewards/pad": 0.0625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5345250368118286, "step": 2602 }, { "completion_length": 356.171875, "epoch": 0.8295092415551306, "grad_norm": 24.175840377807617, "kl": 0.045654296875, "learning_rate": 1.7049075844486934e-07, "loss": 0.0018, "reward": 1.6505095958709717, "reward_std": 0.038452018052339554, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4005095958709717, "rewards/pad": 0.25, "step": 2603 }, { "completion_length": 161.25, "epoch": 0.8298279158699808, "grad_norm": 22.794755935668945, "kl": 0.11279296875, "learning_rate": 1.701720841300191e-07, "loss": 0.0045, "reward": 1.7162601947784424, "reward_std": 0.09409304708242416, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5912601947784424, "step": 2604 }, { "completion_length": 156.90625, "epoch": 0.8301465901848311, "grad_norm": 8.154170036315918, "kl": 0.1123046875, "learning_rate": 1.6985340981516887e-07, "loss": 0.0045, "reward": 1.560814380645752, "reward_std": 0.049747712910175323, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5608144402503967, "rewards/pad": 0.0, "step": 2605 }, { "completion_length": 282.671875, "epoch": 0.8304652644996813, "grad_norm": 12.08977222442627, "kl": 0.07275390625, "learning_rate": 1.6953473550031865e-07, "loss": 0.0029, "reward": 1.6754486560821533, "reward_std": 0.0713881179690361, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5660737156867981, "step": 2606 }, { "completion_length": 252.90625, "epoch": 0.8307839388145315, "grad_norm": 14.890015602111816, "kl": 0.07568359375, "learning_rate": 1.6921606118546843e-07, "loss": 0.003, "reward": 1.662458896636963, "reward_std": 0.07578548789024353, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5374589562416077, "step": 2607 }, { "completion_length": 208.03125, "epoch": 0.8311026131293817, "grad_norm": 9.151792526245117, "kl": 0.07666015625, "learning_rate": 1.6889738687061821e-07, "loss": 0.0031, "reward": 1.6875064373016357, "reward_std": 0.1356935203075409, "rewards/pad": 0.21875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.46875646710395813, "step": 2608 }, { "completion_length": 246.5, "epoch": 0.831421287444232, "grad_norm": 12.123498916625977, "kl": 0.09619140625, "learning_rate": 1.68578712555768e-07, "loss": 0.0039, "reward": 1.754729986190796, "reward_std": 0.10019632428884506, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5047299861907959, "rewards/pad": 0.25, "step": 2609 }, { "completion_length": 257.328125, "epoch": 0.8317399617590823, "grad_norm": 30.473678588867188, "kl": 0.07568359375, "learning_rate": 1.6826003824091777e-07, "loss": 0.003, "reward": 1.5858802795410156, "reward_std": 0.1852986067533493, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5077552199363708, "step": 2610 }, { "completion_length": 400.0, "epoch": 0.8320586360739325, "grad_norm": 6.983999252319336, "kl": 0.064453125, "learning_rate": 1.6794136392606755e-07, "loss": 0.0026, "reward": 1.3887776136398315, "reward_std": 0.09014880657196045, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.27940258383750916, "step": 2611 }, { "completion_length": 386.953125, "epoch": 0.8323773103887827, "grad_norm": 8.778849601745605, "kl": 0.049560546875, "learning_rate": 1.6762268961121734e-07, "loss": 0.002, "reward": 1.4016900062561035, "reward_std": 0.0967012494802475, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4173150956630707, "rewards/pad": 0.0, "step": 2612 }, { "completion_length": 115.78125, "epoch": 0.8326959847036329, "grad_norm": 12.407329559326172, "kl": 0.1259765625, "learning_rate": 1.6730401529636712e-07, "loss": 0.005, "reward": 1.786607027053833, "reward_std": 0.1045801043510437, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.661607027053833, "step": 2613 }, { "completion_length": 261.046875, "epoch": 0.8330146590184832, "grad_norm": 13.980108261108398, "kl": 0.091796875, "learning_rate": 1.6698534098151687e-07, "loss": 0.0037, "reward": 1.6479519605636597, "reward_std": 0.04881848394870758, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5229519605636597, "step": 2614 }, { "completion_length": 305.546875, "epoch": 0.8333333333333334, "grad_norm": 5.3638434410095215, "kl": 0.08837890625, "learning_rate": 1.6666666666666665e-07, "loss": 0.0035, "reward": 1.4231975078582764, "reward_std": 0.12304175645112991, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4544474482536316, "step": 2615 }, { "completion_length": 263.234375, "epoch": 0.8336520076481836, "grad_norm": 8.397769927978516, "kl": 0.080078125, "learning_rate": 1.6634799235181643e-07, "loss": 0.0032, "reward": 1.6945033073425293, "reward_std": 0.057246167212724686, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4445032775402069, "step": 2616 }, { "completion_length": 420.171875, "epoch": 0.8339706819630338, "grad_norm": 12.449578285217285, "kl": 0.06298828125, "learning_rate": 1.660293180369662e-07, "loss": 0.0025, "reward": 1.3212941884994507, "reward_std": 0.041054822504520416, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.32129421830177307, "rewards/pad": 0.0, "step": 2617 }, { "completion_length": 204.796875, "epoch": 0.834289356277884, "grad_norm": 19.821781158447266, "kl": 0.08447265625, "learning_rate": 1.65710643722116e-07, "loss": 0.0034, "reward": 1.7124440670013428, "reward_std": 0.049534693360328674, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.587444007396698, "rewards/pad": 0.125, "step": 2618 }, { "completion_length": 149.09375, "epoch": 0.8346080305927343, "grad_norm": 17.232561111450195, "kl": 0.115234375, "learning_rate": 1.6539196940726577e-07, "loss": 0.0046, "reward": 1.5556838512420654, "reward_std": 0.0869147777557373, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5556838512420654, "rewards/pad": 0.0, "step": 2619 }, { "completion_length": 304.625, "epoch": 0.8349267049075845, "grad_norm": 5.9082255363464355, "kl": 0.06298828125, "learning_rate": 1.6507329509241555e-07, "loss": 0.0025, "reward": 1.6162960529327393, "reward_std": 0.08056774735450745, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4912959933280945, "rewards/pad": 0.125, "step": 2620 }, { "completion_length": 224.484375, "epoch": 0.8352453792224347, "grad_norm": 11.629868507385254, "kl": 0.0849609375, "learning_rate": 1.6475462077756533e-07, "loss": 0.0034, "reward": 1.3327964544296265, "reward_std": 0.06474357098340988, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3327964246273041, "rewards/pad": 0.0, "step": 2621 }, { "completion_length": 165.796875, "epoch": 0.8355640535372849, "grad_norm": 139.20846557617188, "kl": 0.12353515625, "learning_rate": 1.6443594646271511e-07, "loss": 0.005, "reward": 1.564035415649414, "reward_std": 0.07771393656730652, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4390353560447693, "rewards/pad": 0.125, "step": 2622 }, { "completion_length": 273.6875, "epoch": 0.8358827278521351, "grad_norm": 10.315620422363281, "kl": 0.06787109375, "learning_rate": 1.641172721478649e-07, "loss": 0.0027, "reward": 1.5206451416015625, "reward_std": 0.07278362661600113, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4112701416015625, "step": 2623 }, { "completion_length": 221.265625, "epoch": 0.8362014021669854, "grad_norm": 13.258879661560059, "kl": 0.09423828125, "learning_rate": 1.6379859783301465e-07, "loss": 0.0038, "reward": 1.6685354709625244, "reward_std": 0.1319824755191803, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5904104709625244, "rewards/pad": 0.078125, "step": 2624 }, { "completion_length": 263.875, "epoch": 0.8365200764818356, "grad_norm": 12.78669261932373, "kl": 0.08251953125, "learning_rate": 1.6347992351816443e-07, "loss": 0.0033, "reward": 1.412626028060913, "reward_std": 0.04722040146589279, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4126260578632355, "rewards/pad": 0.0, "step": 2625 }, { "completion_length": 296.828125, "epoch": 0.8368387507966858, "grad_norm": 22.007631301879883, "kl": 0.0556640625, "learning_rate": 1.631612492033142e-07, "loss": 0.0022, "reward": 1.7572367191314697, "reward_std": 0.11621341109275818, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5228617191314697, "rewards/pad": 0.234375, "step": 2626 }, { "completion_length": 313.0, "epoch": 0.837157425111536, "grad_norm": 6.745031356811523, "kl": 0.080078125, "learning_rate": 1.62842574888464e-07, "loss": 0.0032, "reward": 1.3981493711471558, "reward_std": 0.10961896181106567, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4293993413448334, "step": 2627 }, { "completion_length": 347.609375, "epoch": 0.8374760994263862, "grad_norm": 9.913939476013184, "kl": 0.060791015625, "learning_rate": 1.6252390057361377e-07, "loss": 0.0024, "reward": 1.5316874980926514, "reward_std": 0.11328306794166565, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5473124980926514, "rewards/pad": 0.0, "step": 2628 }, { "completion_length": 157.21875, "epoch": 0.8377947737412365, "grad_norm": 20.527606964111328, "kl": 0.08251953125, "learning_rate": 1.6220522625876355e-07, "loss": 0.0033, "reward": 1.8070180416107178, "reward_std": 0.15403233468532562, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.572642982006073, "step": 2629 }, { "completion_length": 299.59375, "epoch": 0.8381134480560867, "grad_norm": 6.451737880706787, "kl": 0.06689453125, "learning_rate": 1.6188655194391333e-07, "loss": 0.0027, "reward": 1.6873538494110107, "reward_std": 0.13507935404777527, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5779789090156555, "step": 2630 }, { "completion_length": 289.46875, "epoch": 0.8384321223709369, "grad_norm": 17.178695678710938, "kl": 0.07958984375, "learning_rate": 1.6156787762906309e-07, "loss": 0.0032, "reward": 1.43893301486969, "reward_std": 0.057767391204833984, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43893301486968994, "step": 2631 }, { "completion_length": 264.78125, "epoch": 0.8387507966857871, "grad_norm": 20.748149871826172, "kl": 0.07080078125, "learning_rate": 1.6124920331421287e-07, "loss": 0.0028, "reward": 1.5110678672790527, "reward_std": 0.10878480970859528, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3860679566860199, "step": 2632 }, { "completion_length": 303.09375, "epoch": 0.8390694710006373, "grad_norm": 10.335489273071289, "kl": 0.06689453125, "learning_rate": 1.6093052899936262e-07, "loss": 0.0027, "reward": 1.610609531402588, "reward_std": 0.061860740184783936, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4856095314025879, "step": 2633 }, { "completion_length": 207.921875, "epoch": 0.8393881453154876, "grad_norm": 13.028353691101074, "kl": 0.0908203125, "learning_rate": 1.606118546845124e-07, "loss": 0.0036, "reward": 1.662575125694275, "reward_std": 0.0714992880821228, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5375750064849854, "rewards/pad": 0.125, "step": 2634 }, { "completion_length": 286.265625, "epoch": 0.8397068196303378, "grad_norm": 20.43387222290039, "kl": 0.078125, "learning_rate": 1.6029318036966218e-07, "loss": 0.0031, "reward": 1.6184622049331665, "reward_std": 0.08133666217327118, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5090871453285217, "step": 2635 }, { "completion_length": 292.96875, "epoch": 0.840025493945188, "grad_norm": 9.506821632385254, "kl": 0.1123046875, "learning_rate": 1.5997450605481196e-07, "loss": 0.0045, "reward": 1.5370888710021973, "reward_std": 0.1493566632270813, "rewards/answer_reward": 0.09375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.44333887100219727, "step": 2636 }, { "completion_length": 311.421875, "epoch": 0.8403441682600382, "grad_norm": 15.433694839477539, "kl": 0.07568359375, "learning_rate": 1.5965583173996174e-07, "loss": 0.003, "reward": 1.522940993309021, "reward_std": 0.1161201074719429, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.413565993309021, "rewards/pad": 0.125, "step": 2637 }, { "completion_length": 209.765625, "epoch": 0.8406628425748884, "grad_norm": 9.586259841918945, "kl": 0.0869140625, "learning_rate": 1.5933715742511152e-07, "loss": 0.0035, "reward": 1.6498695611953735, "reward_std": 0.14003899693489075, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47799453139305115, "step": 2638 }, { "completion_length": 244.203125, "epoch": 0.8409815168897387, "grad_norm": 29.076448440551758, "kl": 0.08203125, "learning_rate": 1.590184831102613e-07, "loss": 0.0033, "reward": 1.7706998586654663, "reward_std": 0.12016107887029648, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6144499182701111, "rewards/pad": 0.15625, "step": 2639 }, { "completion_length": 261.046875, "epoch": 0.8413001912045889, "grad_norm": 8.538639068603516, "kl": 0.07373046875, "learning_rate": 1.5869980879541108e-07, "loss": 0.003, "reward": 1.514484167098999, "reward_std": 0.05714438855648041, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.514484167098999, "rewards/pad": 0.0, "step": 2640 }, { "completion_length": 364.96875, "epoch": 0.8416188655194391, "grad_norm": 24.829381942749023, "kl": 0.072265625, "learning_rate": 1.5838113448056086e-07, "loss": 0.0029, "reward": 1.4145584106445312, "reward_std": 0.06661966443061829, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.41455844044685364, "step": 2641 }, { "completion_length": 212.734375, "epoch": 0.8419375398342893, "grad_norm": 9.248557090759277, "kl": 0.0966796875, "learning_rate": 1.5806246016571064e-07, "loss": 0.0039, "reward": 1.6410948038101196, "reward_std": 0.07060378789901733, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5160947442054749, "rewards/pad": 0.125, "step": 2642 }, { "completion_length": 242.828125, "epoch": 0.8422562141491395, "grad_norm": 10.468927383422852, "kl": 0.09912109375, "learning_rate": 1.577437858508604e-07, "loss": 0.004, "reward": 1.6543145179748535, "reward_std": 0.044029463082551956, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6543145179748535, "rewards/pad": 0.0, "step": 2643 }, { "completion_length": 217.28125, "epoch": 0.8425748884639898, "grad_norm": 7.1923065185546875, "kl": 0.1591796875, "learning_rate": 1.5742511153601018e-07, "loss": 0.0064, "reward": 1.661090612411499, "reward_std": 0.08890962600708008, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.536090612411499, "step": 2644 }, { "completion_length": 355.34375, "epoch": 0.84289356277884, "grad_norm": 19.381942749023438, "kl": 0.0751953125, "learning_rate": 1.5710643722115996e-07, "loss": 0.003, "reward": 1.3861980438232422, "reward_std": 0.1279515027999878, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.2768230736255646, "step": 2645 }, { "completion_length": 299.25, "epoch": 0.8432122370936902, "grad_norm": 10.264814376831055, "kl": 0.0830078125, "learning_rate": 1.5678776290630974e-07, "loss": 0.0033, "reward": 1.570202112197876, "reward_std": 0.11829538643360138, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.570202112197876, "step": 2646 }, { "completion_length": 249.5, "epoch": 0.8435309114085404, "grad_norm": 11.279900550842285, "kl": 0.08447265625, "learning_rate": 1.5646908859145952e-07, "loss": 0.0034, "reward": 1.544940710067749, "reward_std": 0.06372781842947006, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.419940710067749, "step": 2647 }, { "completion_length": 211.84375, "epoch": 0.8438495857233907, "grad_norm": 14.549507141113281, "kl": 0.08935546875, "learning_rate": 1.561504142766093e-07, "loss": 0.0036, "reward": 1.6167564392089844, "reward_std": 0.11819630861282349, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4917564392089844, "rewards/pad": 0.125, "step": 2648 }, { "completion_length": 214.484375, "epoch": 0.844168260038241, "grad_norm": 18.767784118652344, "kl": 0.107421875, "learning_rate": 1.5583173996175908e-07, "loss": 0.0043, "reward": 1.6396679878234863, "reward_std": 0.15843240916728973, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.6709179878234863, "step": 2649 }, { "completion_length": 352.203125, "epoch": 0.8444869343530912, "grad_norm": 7.323569297790527, "kl": 0.07080078125, "learning_rate": 1.5551306564690886e-07, "loss": 0.0028, "reward": 1.368403673171997, "reward_std": 0.05267880856990814, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3684036433696747, "rewards/pad": 0.0, "step": 2650 }, { "completion_length": 248.84375, "epoch": 0.8448056086679414, "grad_norm": 14.06733226776123, "kl": 0.09423828125, "learning_rate": 1.5519439133205864e-07, "loss": 0.0038, "reward": 1.4895764589309692, "reward_std": 0.19936680793762207, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.9375, "rewards/tracking_iou_reward": 0.5520764589309692, "step": 2651 }, { "completion_length": 146.21875, "epoch": 0.8451242829827916, "grad_norm": 14.021964073181152, "kl": 0.1103515625, "learning_rate": 1.5487571701720842e-07, "loss": 0.0044, "reward": 1.5428390502929688, "reward_std": 0.10221075266599655, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5428390502929688, "rewards/pad": 0.0, "step": 2652 }, { "completion_length": 262.390625, "epoch": 0.8454429572976419, "grad_norm": 5.666436195373535, "kl": 0.08203125, "learning_rate": 1.5455704270235818e-07, "loss": 0.0033, "reward": 1.78913414478302, "reward_std": 0.07610681653022766, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5391342043876648, "step": 2653 }, { "completion_length": 275.375, "epoch": 0.8457616316124921, "grad_norm": 8.450581550598145, "kl": 0.07861328125, "learning_rate": 1.5423836838750796e-07, "loss": 0.0031, "reward": 1.5056837797164917, "reward_std": 0.034122534096241, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5056837201118469, "rewards/pad": 0.0, "step": 2654 }, { "completion_length": 295.03125, "epoch": 0.8460803059273423, "grad_norm": 9.418177604675293, "kl": 0.0791015625, "learning_rate": 1.5391969407265774e-07, "loss": 0.0032, "reward": 1.5818204879760742, "reward_std": 0.09731556475162506, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4724454879760742, "step": 2655 }, { "completion_length": 194.40625, "epoch": 0.8463989802421925, "grad_norm": 36.22610092163086, "kl": 0.099609375, "learning_rate": 1.5360101975780752e-07, "loss": 0.004, "reward": 1.5101938247680664, "reward_std": 0.09336835891008377, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5101937651634216, "rewards/pad": 0.0, "step": 2656 }, { "completion_length": 334.34375, "epoch": 0.8467176545570427, "grad_norm": 18.423858642578125, "kl": 0.07275390625, "learning_rate": 1.532823454429573e-07, "loss": 0.0029, "reward": 1.5258831977844238, "reward_std": 0.13142189383506775, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5415083169937134, "step": 2657 }, { "completion_length": 164.921875, "epoch": 0.847036328871893, "grad_norm": 29.420068740844727, "kl": 0.11328125, "learning_rate": 1.5296367112810708e-07, "loss": 0.0045, "reward": 1.5939750671386719, "reward_std": 0.10682038962841034, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5939749479293823, "rewards/pad": 0.0, "step": 2658 }, { "completion_length": 157.109375, "epoch": 0.8473550031867432, "grad_norm": 44.709381103515625, "kl": 0.11376953125, "learning_rate": 1.5264499681325686e-07, "loss": 0.0046, "reward": 1.6389029026031494, "reward_std": 0.14254002273082733, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.513903021812439, "rewards/pad": 0.125, "step": 2659 }, { "completion_length": 393.3125, "epoch": 0.8476736775015934, "grad_norm": 3.54472017288208, "kl": 0.046630859375, "learning_rate": 1.5232632249840664e-07, "loss": 0.0019, "reward": 1.4698128700256348, "reward_std": 0.04882584884762764, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.46981289982795715, "step": 2660 }, { "completion_length": 279.828125, "epoch": 0.8479923518164436, "grad_norm": 6.6133832931518555, "kl": 0.09765625, "learning_rate": 1.5200764818355642e-07, "loss": 0.0039, "reward": 1.4522119760513306, "reward_std": 0.15472984313964844, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.4834619462490082, "rewards/pad": 0.0, "step": 2661 }, { "completion_length": 270.28125, "epoch": 0.8483110261312938, "grad_norm": 22.683170318603516, "kl": 0.06982421875, "learning_rate": 1.5168897386870618e-07, "loss": 0.0028, "reward": 1.789536714553833, "reward_std": 0.17844700813293457, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4614117741584778, "rewards/pad": 0.34375, "step": 2662 }, { "completion_length": 353.015625, "epoch": 0.848629700446144, "grad_norm": 4.951906681060791, "kl": 0.0634765625, "learning_rate": 1.5137029955385593e-07, "loss": 0.0025, "reward": 1.4932067394256592, "reward_std": 0.08742601424455643, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.383831650018692, "rewards/pad": 0.125, "step": 2663 }, { "completion_length": 313.75, "epoch": 0.8489483747609943, "grad_norm": 5.683970928192139, "kl": 0.07275390625, "learning_rate": 1.510516252390057e-07, "loss": 0.0029, "reward": 1.6920580863952637, "reward_std": 0.13436323404312134, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5826830863952637, "step": 2664 }, { "completion_length": 276.703125, "epoch": 0.8492670490758445, "grad_norm": 7.645228385925293, "kl": 0.10546875, "learning_rate": 1.507329509241555e-07, "loss": 0.0042, "reward": 1.4928104877471924, "reward_std": 0.06374173611402512, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4928104877471924, "rewards/pad": 0.0, "step": 2665 }, { "completion_length": 308.59375, "epoch": 0.8495857233906947, "grad_norm": 7.788649559020996, "kl": 0.0693359375, "learning_rate": 1.5041427660930527e-07, "loss": 0.0028, "reward": 1.4845178127288818, "reward_std": 0.09160245954990387, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5001428723335266, "rewards/pad": 0.0, "step": 2666 }, { "completion_length": 285.078125, "epoch": 0.8499043977055449, "grad_norm": 16.08645248413086, "kl": 0.0791015625, "learning_rate": 1.5009560229445505e-07, "loss": 0.0032, "reward": 1.4720804691314697, "reward_std": 0.11547581106424332, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4877054691314697, "rewards/pad": 0.0, "step": 2667 }, { "completion_length": 340.0, "epoch": 0.8502230720203952, "grad_norm": 7.84180212020874, "kl": 0.07177734375, "learning_rate": 1.4977692797960483e-07, "loss": 0.0029, "reward": 1.407630443572998, "reward_std": 0.1025407463312149, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4232555031776428, "step": 2668 }, { "completion_length": 273.625, "epoch": 0.8505417463352454, "grad_norm": 8.022253036499023, "kl": 0.091796875, "learning_rate": 1.494582536647546e-07, "loss": 0.0037, "reward": 1.5171573162078857, "reward_std": 0.16217833757400513, "rewards/answer_reward": 0.203125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.32965725660324097, "step": 2669 }, { "completion_length": 305.609375, "epoch": 0.8508604206500956, "grad_norm": 11.595643997192383, "kl": 0.06298828125, "learning_rate": 1.491395793499044e-07, "loss": 0.0025, "reward": 1.793088674545288, "reward_std": 0.1004258394241333, "rewards/pad": 0.203125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5899636745452881, "step": 2670 }, { "completion_length": 367.6875, "epoch": 0.8511790949649458, "grad_norm": 7.779849529266357, "kl": 0.05859375, "learning_rate": 1.4882090503505417e-07, "loss": 0.0023, "reward": 1.4648220539093018, "reward_std": 0.07983763515949249, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.44919702410697937, "step": 2671 }, { "completion_length": 235.9375, "epoch": 0.851497769279796, "grad_norm": 16.491291046142578, "kl": 0.08251953125, "learning_rate": 1.4850223072020395e-07, "loss": 0.0033, "reward": 1.4662169218063354, "reward_std": 0.047906674444675446, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4662169814109802, "step": 2672 }, { "completion_length": 248.765625, "epoch": 0.8518164435946463, "grad_norm": 8.279892921447754, "kl": 0.06884765625, "learning_rate": 1.481835564053537e-07, "loss": 0.0028, "reward": 1.8111933469772339, "reward_std": 0.07763614505529404, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5611932873725891, "step": 2673 }, { "completion_length": 332.0, "epoch": 0.8521351179094965, "grad_norm": 47.811439514160156, "kl": 0.07373046875, "learning_rate": 1.478648820905035e-07, "loss": 0.003, "reward": 1.4083024263381958, "reward_std": 0.03486569970846176, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.408302366733551, "rewards/pad": 0.0, "step": 2674 }, { "completion_length": 352.671875, "epoch": 0.8524537922243467, "grad_norm": 15.739791870117188, "kl": 0.048828125, "learning_rate": 1.4754620777565327e-07, "loss": 0.0019, "reward": 1.4655406475067139, "reward_std": 0.049866218119859695, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.34054064750671387, "step": 2675 }, { "completion_length": 206.34375, "epoch": 0.8527724665391969, "grad_norm": 9.068931579589844, "kl": 0.1337890625, "learning_rate": 1.4722753346080305e-07, "loss": 0.0054, "reward": 1.5625362396240234, "reward_std": 0.08124029636383057, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43753618001937866, "rewards/pad": 0.125, "step": 2676 }, { "completion_length": 277.78125, "epoch": 0.8530911408540471, "grad_norm": 26.652000427246094, "kl": 0.07373046875, "learning_rate": 1.4690885914595283e-07, "loss": 0.0029, "reward": 1.658882975578308, "reward_std": 0.03863655775785446, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6588829159736633, "rewards/pad": 0.0, "step": 2677 }, { "completion_length": 309.40625, "epoch": 0.8534098151688974, "grad_norm": 11.035650253295898, "kl": 0.0703125, "learning_rate": 1.465901848311026e-07, "loss": 0.0028, "reward": 1.4774425029754639, "reward_std": 0.1272467076778412, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.49306756258010864, "rewards/pad": 0.0, "step": 2678 }, { "completion_length": 173.75, "epoch": 0.8537284894837476, "grad_norm": 8.464993476867676, "kl": 0.0888671875, "learning_rate": 1.462715105162524e-07, "loss": 0.0036, "reward": 1.6125917434692383, "reward_std": 0.059951480478048325, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6125918030738831, "rewards/pad": 0.0, "step": 2679 }, { "completion_length": 269.875, "epoch": 0.8540471637985978, "grad_norm": 5.818058490753174, "kl": 0.09521484375, "learning_rate": 1.4595283620140217e-07, "loss": 0.0038, "reward": 1.5869768857955933, "reward_std": 0.1202688217163086, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.47760194540023804, "step": 2680 }, { "completion_length": 188.140625, "epoch": 0.854365838113448, "grad_norm": 15.608235359191895, "kl": 0.09912109375, "learning_rate": 1.4563416188655195e-07, "loss": 0.004, "reward": 1.6593985557556152, "reward_std": 0.06854577362537384, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6593986749649048, "rewards/pad": 0.0, "step": 2681 }, { "completion_length": 360.71875, "epoch": 0.8546845124282982, "grad_norm": 11.706225395202637, "kl": 0.0673828125, "learning_rate": 1.4531548757170173e-07, "loss": 0.0027, "reward": 1.5080907344818115, "reward_std": 0.11667753756046295, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.39871570467948914, "rewards/pad": 0.125, "step": 2682 }, { "completion_length": 194.328125, "epoch": 0.8550031867431485, "grad_norm": 16.0618953704834, "kl": 0.099609375, "learning_rate": 1.4499681325685149e-07, "loss": 0.004, "reward": 1.6319692134857178, "reward_std": 0.055502306669950485, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6319692730903625, "rewards/pad": 0.0, "step": 2683 }, { "completion_length": 188.96875, "epoch": 0.8553218610579987, "grad_norm": 38.90488815307617, "kl": 0.1162109375, "learning_rate": 1.4467813894200127e-07, "loss": 0.0047, "reward": 1.5885021686553955, "reward_std": 0.07763275504112244, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5885021686553955, "rewards/pad": 0.0, "step": 2684 }, { "completion_length": 279.625, "epoch": 0.8556405353728489, "grad_norm": 5.486372470855713, "kl": 0.06982421875, "learning_rate": 1.4435946462715105e-07, "loss": 0.0028, "reward": 1.6683847904205322, "reward_std": 0.060479871928691864, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5433847904205322, "step": 2685 }, { "completion_length": 198.859375, "epoch": 0.8559592096876991, "grad_norm": 15.562231063842773, "kl": 0.08837890625, "learning_rate": 1.4404079031230083e-07, "loss": 0.0035, "reward": 1.700699806213379, "reward_std": 0.08188517391681671, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5756998062133789, "rewards/pad": 0.125, "step": 2686 }, { "completion_length": 229.46875, "epoch": 0.8562778840025494, "grad_norm": 16.420684814453125, "kl": 0.07666015625, "learning_rate": 1.437221159974506e-07, "loss": 0.0031, "reward": 1.7302379608154297, "reward_std": 0.04194648563861847, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.48023805022239685, "step": 2687 }, { "completion_length": 264.03125, "epoch": 0.8565965583173997, "grad_norm": 9.722140312194824, "kl": 0.08447265625, "learning_rate": 1.434034416826004e-07, "loss": 0.0034, "reward": 1.574733853340149, "reward_std": 0.05277637392282486, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4497338533401489, "rewards/pad": 0.125, "step": 2688 }, { "completion_length": 195.390625, "epoch": 0.8569152326322499, "grad_norm": 14.187362670898438, "kl": 0.09326171875, "learning_rate": 1.4308476736775017e-07, "loss": 0.0037, "reward": 1.424652099609375, "reward_std": 0.07914784550666809, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4246521592140198, "step": 2689 }, { "completion_length": 247.71875, "epoch": 0.8572339069471001, "grad_norm": 5.567068099975586, "kl": 0.08837890625, "learning_rate": 1.4276609305289995e-07, "loss": 0.0035, "reward": 1.4163810014724731, "reward_std": 0.08065269887447357, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.41638097167015076, "step": 2690 }, { "completion_length": 315.296875, "epoch": 0.8575525812619503, "grad_norm": 11.564667701721191, "kl": 0.07373046875, "learning_rate": 1.4244741873804973e-07, "loss": 0.0029, "reward": 1.5152671337127686, "reward_std": 0.10225661844015121, "rewards/pad": 0.046875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4683920443058014, "step": 2691 }, { "completion_length": 254.421875, "epoch": 0.8578712555768006, "grad_norm": 21.23450469970703, "kl": 0.07177734375, "learning_rate": 1.4212874442319946e-07, "loss": 0.0029, "reward": 1.4381251335144043, "reward_std": 0.04972090572118759, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3131251335144043, "rewards/pad": 0.125, "step": 2692 }, { "completion_length": 297.359375, "epoch": 0.8581899298916508, "grad_norm": 9.087606430053711, "kl": 0.0693359375, "learning_rate": 1.4181007010834924e-07, "loss": 0.0028, "reward": 1.5673494338989258, "reward_std": 0.08237025141716003, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5829744338989258, "rewards/pad": 0.0, "step": 2693 }, { "completion_length": 240.21875, "epoch": 0.858508604206501, "grad_norm": 15.42308521270752, "kl": 0.09521484375, "learning_rate": 1.4149139579349902e-07, "loss": 0.0038, "reward": 1.4050732851028442, "reward_std": 0.132409930229187, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.42069828510284424, "step": 2694 }, { "completion_length": 306.671875, "epoch": 0.8588272785213512, "grad_norm": 13.6598482131958, "kl": 0.056884765625, "learning_rate": 1.411727214786488e-07, "loss": 0.0023, "reward": 1.866136074066162, "reward_std": 0.10433478653430939, "rewards/pad": 0.375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4911360442638397, "step": 2695 }, { "completion_length": 223.0, "epoch": 0.8591459528362014, "grad_norm": 11.171992301940918, "kl": 0.09130859375, "learning_rate": 1.4085404716379858e-07, "loss": 0.0037, "reward": 1.6910674571990967, "reward_std": 0.1258898377418518, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44106757640838623, "rewards/pad": 0.25, "step": 2696 }, { "completion_length": 208.546875, "epoch": 0.8594646271510517, "grad_norm": 19.124313354492188, "kl": 0.1171875, "learning_rate": 1.4053537284894836e-07, "loss": 0.0047, "reward": 1.6321409940719604, "reward_std": 0.1274343580007553, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5071409940719604, "rewards/pad": 0.125, "step": 2697 }, { "completion_length": 247.859375, "epoch": 0.8597833014659019, "grad_norm": 9.619743347167969, "kl": 0.07373046875, "learning_rate": 1.4021669853409814e-07, "loss": 0.0029, "reward": 1.7662237882614136, "reward_std": 0.11302575469017029, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5318487286567688, "step": 2698 }, { "completion_length": 196.3125, "epoch": 0.8601019757807521, "grad_norm": 12.168063163757324, "kl": 0.0927734375, "learning_rate": 1.3989802421924792e-07, "loss": 0.0037, "reward": 1.6692471504211426, "reward_std": 0.08357395231723785, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6692471504211426, "rewards/pad": 0.0, "step": 2699 }, { "completion_length": 271.71875, "epoch": 0.8604206500956023, "grad_norm": 18.2286376953125, "kl": 0.07421875, "learning_rate": 1.395793499043977e-07, "loss": 0.003, "reward": 1.4802391529083252, "reward_std": 0.0642646849155426, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4802391827106476, "rewards/pad": 0.0, "step": 2700 }, { "completion_length": 297.078125, "epoch": 0.8607393244104525, "grad_norm": 7.670638561248779, "kl": 0.07666015625, "learning_rate": 1.3926067558954748e-07, "loss": 0.0031, "reward": 1.6246676445007324, "reward_std": 0.056935422122478485, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49966752529144287, "step": 2701 }, { "completion_length": 272.28125, "epoch": 0.8610579987253028, "grad_norm": 11.62264347076416, "kl": 0.09375, "learning_rate": 1.3894200127469724e-07, "loss": 0.0038, "reward": 1.3839402198791504, "reward_std": 0.09366807341575623, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3839401602745056, "rewards/pad": 0.0, "step": 2702 }, { "completion_length": 249.40625, "epoch": 0.861376673040153, "grad_norm": 8.985459327697754, "kl": 0.0849609375, "learning_rate": 1.3862332695984702e-07, "loss": 0.0034, "reward": 1.6927094459533691, "reward_std": 0.08651595562696457, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5677094459533691, "step": 2703 }, { "completion_length": 152.25, "epoch": 0.8616953473550032, "grad_norm": 12.082322120666504, "kl": 0.11865234375, "learning_rate": 1.383046526449968e-07, "loss": 0.0048, "reward": 1.6535862684249878, "reward_std": 0.06323709338903427, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6535862684249878, "rewards/pad": 0.0, "step": 2704 }, { "completion_length": 253.703125, "epoch": 0.8620140216698534, "grad_norm": 9.971209526062012, "kl": 0.09619140625, "learning_rate": 1.3798597833014658e-07, "loss": 0.0039, "reward": 1.4282680749893188, "reward_std": 0.13955871760845184, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.44389307498931885, "rewards/pad": 0.0, "step": 2705 }, { "completion_length": 150.5625, "epoch": 0.8623326959847036, "grad_norm": 20.146631240844727, "kl": 0.1328125, "learning_rate": 1.3766730401529636e-07, "loss": 0.0053, "reward": 1.5105044841766357, "reward_std": 0.16675220429897308, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.541754424571991, "rewards/pad": 0.0, "step": 2706 }, { "completion_length": 256.875, "epoch": 0.8626513702995539, "grad_norm": 6.620942115783691, "kl": 0.09716796875, "learning_rate": 1.3734862970044614e-07, "loss": 0.0039, "reward": 1.4991254806518555, "reward_std": 0.05944891646504402, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49912554025650024, "step": 2707 }, { "completion_length": 217.640625, "epoch": 0.8629700446144041, "grad_norm": 10.063702583312988, "kl": 0.091796875, "learning_rate": 1.3702995538559592e-07, "loss": 0.0037, "reward": 1.7139660120010376, "reward_std": 0.15648072957992554, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.6045910716056824, "step": 2708 }, { "completion_length": 310.6875, "epoch": 0.8632887189292543, "grad_norm": 8.95224380493164, "kl": 0.06982421875, "learning_rate": 1.367112810707457e-07, "loss": 0.0028, "reward": 1.5477850437164307, "reward_std": 0.032181769609451294, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4227849841117859, "step": 2709 }, { "completion_length": 211.4375, "epoch": 0.8636073932441045, "grad_norm": 12.4854736328125, "kl": 0.091796875, "learning_rate": 1.3639260675589548e-07, "loss": 0.0037, "reward": 1.8467566967010498, "reward_std": 0.055249616503715515, "rewards/answer_reward": 0.375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4717566967010498, "step": 2710 }, { "completion_length": 288.5, "epoch": 0.8639260675589547, "grad_norm": 16.495664596557617, "kl": 0.06494140625, "learning_rate": 1.3607393244104526e-07, "loss": 0.0026, "reward": 1.6814583539962769, "reward_std": 0.08853200823068619, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5720832943916321, "step": 2711 }, { "completion_length": 163.34375, "epoch": 0.864244741873805, "grad_norm": 29.02391242980957, "kl": 0.1220703125, "learning_rate": 1.3575525812619501e-07, "loss": 0.0049, "reward": 1.8566462993621826, "reward_std": 0.19339735805988312, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6535212397575378, "rewards/pad": 0.203125, "step": 2712 }, { "completion_length": 259.15625, "epoch": 0.8645634161886552, "grad_norm": 5.97607421875, "kl": 0.0703125, "learning_rate": 1.354365838113448e-07, "loss": 0.0028, "reward": 1.30307137966156, "reward_std": 0.03802892565727234, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.30307137966156006, "rewards/pad": 0.0, "step": 2713 }, { "completion_length": 358.9375, "epoch": 0.8648820905035054, "grad_norm": 12.710834503173828, "kl": 0.0673828125, "learning_rate": 1.3511790949649458e-07, "loss": 0.0027, "reward": 1.3558748960494995, "reward_std": 0.10074446350336075, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3714999556541443, "rewards/pad": 0.0, "step": 2714 }, { "completion_length": 211.765625, "epoch": 0.8652007648183556, "grad_norm": 10.720243453979492, "kl": 0.0849609375, "learning_rate": 1.3479923518164436e-07, "loss": 0.0034, "reward": 1.6154134273529053, "reward_std": 0.05592378228902817, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.61541348695755, "step": 2715 }, { "completion_length": 263.359375, "epoch": 0.8655194391332058, "grad_norm": 10.494246482849121, "kl": 0.07421875, "learning_rate": 1.3448056086679414e-07, "loss": 0.003, "reward": 1.6904199123382568, "reward_std": 0.08571196347475052, "rewards/pad": 0.296875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3935449719429016, "step": 2716 }, { "completion_length": 412.921875, "epoch": 0.8658381134480561, "grad_norm": 6.365828514099121, "kl": 0.051513671875, "learning_rate": 1.3416188655194392e-07, "loss": 0.0021, "reward": 1.421118974685669, "reward_std": 0.04486546665430069, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.42111900448799133, "step": 2717 }, { "completion_length": 218.015625, "epoch": 0.8661567877629063, "grad_norm": 64.41060638427734, "kl": 0.09130859375, "learning_rate": 1.338432122370937e-07, "loss": 0.0037, "reward": 1.6570738554000854, "reward_std": 0.1489730179309845, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5945739150047302, "rewards/pad": 0.0625, "step": 2718 }, { "completion_length": 173.34375, "epoch": 0.8664754620777565, "grad_norm": 18.154401779174805, "kl": 0.0966796875, "learning_rate": 1.3352453792224348e-07, "loss": 0.0039, "reward": 1.8273169994354248, "reward_std": 0.11780412495136261, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5929418802261353, "step": 2719 }, { "completion_length": 192.921875, "epoch": 0.8667941363926067, "grad_norm": 13.782989501953125, "kl": 0.09130859375, "learning_rate": 1.3320586360739326e-07, "loss": 0.0036, "reward": 1.579949140548706, "reward_std": 0.125904381275177, "rewards/pad": 0.140625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4393240511417389, "step": 2720 }, { "completion_length": 370.375, "epoch": 0.8671128107074569, "grad_norm": 4.068702220916748, "kl": 0.06298828125, "learning_rate": 1.3288718929254304e-07, "loss": 0.0025, "reward": 1.5640076398849487, "reward_std": 0.12377434968948364, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.47025763988494873, "step": 2721 }, { "completion_length": 254.09375, "epoch": 0.8674314850223072, "grad_norm": 9.885491371154785, "kl": 0.0771484375, "learning_rate": 1.325685149776928e-07, "loss": 0.0031, "reward": 1.5922003984451294, "reward_std": 0.08162754029035568, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4672004282474518, "rewards/pad": 0.125, "step": 2722 }, { "completion_length": 368.609375, "epoch": 0.8677501593371574, "grad_norm": 9.013411521911621, "kl": 0.05859375, "learning_rate": 1.3224984066284255e-07, "loss": 0.0023, "reward": 1.5695838928222656, "reward_std": 0.0959939956665039, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49145886301994324, "step": 2723 }, { "completion_length": 289.75, "epoch": 0.8680688336520076, "grad_norm": 9.853470802307129, "kl": 0.07177734375, "learning_rate": 1.3193116634799233e-07, "loss": 0.0029, "reward": 1.5496944189071655, "reward_std": 0.1058008000254631, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.534069299697876, "rewards/pad": 0.015625, "step": 2724 }, { "completion_length": 205.625, "epoch": 0.8683875079668578, "grad_norm": 15.999686241149902, "kl": 0.111328125, "learning_rate": 1.316124920331421e-07, "loss": 0.0045, "reward": 1.5291621685028076, "reward_std": 0.11018738150596619, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5291622281074524, "step": 2725 }, { "completion_length": 263.171875, "epoch": 0.868706182281708, "grad_norm": 19.691360473632812, "kl": 0.0771484375, "learning_rate": 1.312938177182919e-07, "loss": 0.0031, "reward": 1.544965386390686, "reward_std": 0.033744215965270996, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.544965386390686, "rewards/pad": 0.0, "step": 2726 }, { "completion_length": 162.203125, "epoch": 0.8690248565965584, "grad_norm": 15.681041717529297, "kl": 0.11279296875, "learning_rate": 1.3097514340344167e-07, "loss": 0.0045, "reward": 1.5828311443328857, "reward_std": 0.11496913433074951, "rewards/answer_reward": 0.09375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4890812337398529, "step": 2727 }, { "completion_length": 246.6875, "epoch": 0.8693435309114086, "grad_norm": 7.064988613128662, "kl": 0.09033203125, "learning_rate": 1.3065646908859145e-07, "loss": 0.0036, "reward": 1.592658519744873, "reward_std": 0.057133033871650696, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.592658519744873, "step": 2728 }, { "completion_length": 263.90625, "epoch": 0.8696622052262588, "grad_norm": 13.716483116149902, "kl": 0.0751953125, "learning_rate": 1.3033779477374123e-07, "loss": 0.003, "reward": 1.7002770900726318, "reward_std": 0.09610147774219513, "rewards/answer_reward": 0.15625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5440271496772766, "step": 2729 }, { "completion_length": 204.828125, "epoch": 0.869980879541109, "grad_norm": 11.942748069763184, "kl": 0.1171875, "learning_rate": 1.30019120458891e-07, "loss": 0.0047, "reward": 1.5761370658874512, "reward_std": 0.09762246161699295, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45113712549209595, "rewards/pad": 0.125, "step": 2730 }, { "completion_length": 314.28125, "epoch": 0.8702995538559593, "grad_norm": 9.208845138549805, "kl": 0.0751953125, "learning_rate": 1.297004461440408e-07, "loss": 0.003, "reward": 1.6383988857269287, "reward_std": 0.11062052845954895, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5290238261222839, "step": 2731 }, { "completion_length": 168.3125, "epoch": 0.8706182281708095, "grad_norm": 8.693392753601074, "kl": 0.13671875, "learning_rate": 1.2938177182919055e-07, "loss": 0.0055, "reward": 1.375337839126587, "reward_std": 0.10295344889163971, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.39096277952194214, "rewards/pad": 0.0, "step": 2732 }, { "completion_length": 306.65625, "epoch": 0.8709369024856597, "grad_norm": 19.472307205200195, "kl": 0.09130859375, "learning_rate": 1.2906309751434033e-07, "loss": 0.0036, "reward": 1.4047869443893433, "reward_std": 0.04510766640305519, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4047868847846985, "step": 2733 }, { "completion_length": 227.859375, "epoch": 0.8712555768005099, "grad_norm": 7.83612585067749, "kl": 0.09765625, "learning_rate": 1.287444231994901e-07, "loss": 0.0039, "reward": 1.5649466514587402, "reward_std": 0.11199548840522766, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5649466514587402, "step": 2734 }, { "completion_length": 306.8125, "epoch": 0.8715742511153601, "grad_norm": 12.435422897338867, "kl": 0.09716796875, "learning_rate": 1.284257488846399e-07, "loss": 0.0039, "reward": 1.689566969871521, "reward_std": 0.0790565237402916, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.564566969871521, "step": 2735 }, { "completion_length": 341.9375, "epoch": 0.8718929254302104, "grad_norm": 4.984146595001221, "kl": 0.07470703125, "learning_rate": 1.2810707456978967e-07, "loss": 0.003, "reward": 1.5292251110076904, "reward_std": 0.06255487352609634, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5292251110076904, "step": 2736 }, { "completion_length": 245.65625, "epoch": 0.8722115997450606, "grad_norm": 20.56971549987793, "kl": 0.076171875, "learning_rate": 1.2778840025493945e-07, "loss": 0.003, "reward": 1.6221401691436768, "reward_std": 0.2308957576751709, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.481515109539032, "rewards/pad": 0.15625, "step": 2737 }, { "completion_length": 325.453125, "epoch": 0.8725302740599108, "grad_norm": 9.56176471710205, "kl": 0.0673828125, "learning_rate": 1.2746972594008923e-07, "loss": 0.0027, "reward": 1.4896824359893799, "reward_std": 0.07883624732494354, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3959324359893799, "step": 2738 }, { "completion_length": 275.0625, "epoch": 0.872848948374761, "grad_norm": 12.742481231689453, "kl": 0.064453125, "learning_rate": 1.27151051625239e-07, "loss": 0.0026, "reward": 1.5121251344680786, "reward_std": 0.12479326128959656, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5277501940727234, "step": 2739 }, { "completion_length": 194.65625, "epoch": 0.8731676226896112, "grad_norm": 23.468538284301758, "kl": 0.11279296875, "learning_rate": 1.268323773103888e-07, "loss": 0.0045, "reward": 1.4817605018615723, "reward_std": 0.06805586069822311, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.48176059126853943, "step": 2740 }, { "completion_length": 158.875, "epoch": 0.8734862970044615, "grad_norm": 14.54665756225586, "kl": 0.10400390625, "learning_rate": 1.2651370299553854e-07, "loss": 0.0042, "reward": 1.7623220682144165, "reward_std": 0.11608093976974487, "rewards/pad": 0.140625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6216970086097717, "step": 2741 }, { "completion_length": 313.140625, "epoch": 0.8738049713193117, "grad_norm": 16.616018295288086, "kl": 0.06591796875, "learning_rate": 1.2619502868068832e-07, "loss": 0.0026, "reward": 1.532200813293457, "reward_std": 0.04246199131011963, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.2822008430957794, "step": 2742 }, { "completion_length": 213.953125, "epoch": 0.8741236456341619, "grad_norm": 12.715255737304688, "kl": 0.0908203125, "learning_rate": 1.258763543658381e-07, "loss": 0.0036, "reward": 1.5940533876419067, "reward_std": 0.10762360692024231, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.46905332803726196, "step": 2743 }, { "completion_length": 204.671875, "epoch": 0.8744423199490121, "grad_norm": 10.193065643310547, "kl": 0.083984375, "learning_rate": 1.2555768005098789e-07, "loss": 0.0034, "reward": 1.8008592128753662, "reward_std": 0.06925635039806366, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6758592128753662, "rewards/pad": 0.125, "step": 2744 }, { "completion_length": 206.671875, "epoch": 0.8747609942638623, "grad_norm": 12.41603946685791, "kl": 0.1083984375, "learning_rate": 1.2523900573613767e-07, "loss": 0.0043, "reward": 1.6150926351547241, "reward_std": 0.11028784513473511, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6150925159454346, "rewards/pad": 0.0, "step": 2745 }, { "completion_length": 321.78125, "epoch": 0.8750796685787126, "grad_norm": 6.864892482757568, "kl": 0.087890625, "learning_rate": 1.2492033142128745e-07, "loss": 0.0035, "reward": 1.5100083351135254, "reward_std": 0.06830344349145889, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5100083351135254, "step": 2746 }, { "completion_length": 253.78125, "epoch": 0.8753983428935628, "grad_norm": 8.223762512207031, "kl": 0.08984375, "learning_rate": 1.2460165710643723e-07, "loss": 0.0036, "reward": 1.8030331134796143, "reward_std": 0.23443907499313354, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.5842832326889038, "rewards/pad": 0.25, "step": 2747 }, { "completion_length": 240.453125, "epoch": 0.875717017208413, "grad_norm": 359.0318298339844, "kl": 0.0859375, "learning_rate": 1.24282982791587e-07, "loss": 0.0034, "reward": 1.5661985874176025, "reward_std": 0.05305684357881546, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5661985874176025, "step": 2748 }, { "completion_length": 374.515625, "epoch": 0.8760356915232632, "grad_norm": 33.193660736083984, "kl": 0.0693359375, "learning_rate": 1.2396430847673676e-07, "loss": 0.0028, "reward": 1.477623701095581, "reward_std": 0.07416988909244537, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.36824867129325867, "step": 2749 }, { "completion_length": 200.328125, "epoch": 0.8763543658381134, "grad_norm": 27.331974029541016, "kl": 0.0947265625, "learning_rate": 1.2364563416188654e-07, "loss": 0.0038, "reward": 1.3313639163970947, "reward_std": 0.08686615526676178, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3313639760017395, "step": 2750 }, { "completion_length": 289.078125, "epoch": 0.8766730401529637, "grad_norm": 7.792123317718506, "kl": 0.064453125, "learning_rate": 1.2332695984703632e-07, "loss": 0.0026, "reward": 1.655777931213379, "reward_std": 0.13871659338474274, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5307778716087341, "step": 2751 }, { "completion_length": 150.4375, "epoch": 0.8769917144678139, "grad_norm": 33.16511154174805, "kl": 0.0908203125, "learning_rate": 1.230082855321861e-07, "loss": 0.0036, "reward": 1.5670239925384521, "reward_std": 0.11330273747444153, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5826489925384521, "rewards/pad": 0.0, "step": 2752 }, { "completion_length": 214.890625, "epoch": 0.8773103887826641, "grad_norm": 13.900517463684082, "kl": 0.0859375, "learning_rate": 1.2268961121733588e-07, "loss": 0.0034, "reward": 1.5166895389556885, "reward_std": 0.08768896758556366, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.39168962836265564, "step": 2753 }, { "completion_length": 251.96875, "epoch": 0.8776290630975143, "grad_norm": 7.415778636932373, "kl": 0.07421875, "learning_rate": 1.2237093690248564e-07, "loss": 0.003, "reward": 1.5468295812606812, "reward_std": 0.10374996066093445, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.45307961106300354, "step": 2754 }, { "completion_length": 208.3125, "epoch": 0.8779477374123645, "grad_norm": 14.48386287689209, "kl": 0.08740234375, "learning_rate": 1.2205226258763542e-07, "loss": 0.0035, "reward": 1.5509672164916992, "reward_std": 0.06657718122005463, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5509671568870544, "rewards/pad": 0.0, "step": 2755 }, { "completion_length": 146.421875, "epoch": 0.8782664117272148, "grad_norm": 11.69698715209961, "kl": 0.107421875, "learning_rate": 1.217335882727852e-07, "loss": 0.0043, "reward": 1.556875467300415, "reward_std": 0.057311661541461945, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5568753480911255, "rewards/pad": 0.0, "step": 2756 }, { "completion_length": 347.984375, "epoch": 0.878585086042065, "grad_norm": 8.806641578674316, "kl": 0.06494140625, "learning_rate": 1.2141491395793498e-07, "loss": 0.0026, "reward": 1.3751672506332397, "reward_std": 0.1003868505358696, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3907921612262726, "step": 2757 }, { "completion_length": 261.796875, "epoch": 0.8789037603569152, "grad_norm": 9.35265827178955, "kl": 0.0849609375, "learning_rate": 1.2109623964308476e-07, "loss": 0.0034, "reward": 1.5465914011001587, "reward_std": 0.09646390378475189, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5465914011001587, "rewards/pad": 0.0, "step": 2758 }, { "completion_length": 219.0625, "epoch": 0.8792224346717654, "grad_norm": 31.54623794555664, "kl": 0.0859375, "learning_rate": 1.2077756532823454e-07, "loss": 0.0034, "reward": 1.446937084197998, "reward_std": 0.04907142370939255, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.32193702459335327, "step": 2759 }, { "completion_length": 271.890625, "epoch": 0.8795411089866156, "grad_norm": 14.196030616760254, "kl": 0.10986328125, "learning_rate": 1.2045889101338432e-07, "loss": 0.0044, "reward": 1.5679893493652344, "reward_std": 0.24801567196846008, "rewards/format_reward_tg": 0.953125, "rewards/iou_timestamp_reward": 0.4429892897605896, "rewards/pad": 0.171875, "step": 2760 }, { "completion_length": 268.984375, "epoch": 0.8798597833014659, "grad_norm": 9.754602432250977, "kl": 0.0751953125, "learning_rate": 1.201402166985341e-07, "loss": 0.003, "reward": 1.6172590255737305, "reward_std": 0.11558352410793304, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49225905537605286, "step": 2761 }, { "completion_length": 296.71875, "epoch": 0.8801784576163161, "grad_norm": 6.027721405029297, "kl": 0.068359375, "learning_rate": 1.1982154238368388e-07, "loss": 0.0027, "reward": 1.869599461555481, "reward_std": 0.18542581796646118, "rewards/pad": 0.359375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.525849461555481, "step": 2762 }, { "completion_length": 143.796875, "epoch": 0.8804971319311663, "grad_norm": 22.146970748901367, "kl": 0.12353515625, "learning_rate": 1.1950286806883364e-07, "loss": 0.0049, "reward": 1.416458010673523, "reward_std": 0.07813596725463867, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4164580702781677, "rewards/pad": 0.0, "step": 2763 }, { "completion_length": 252.796875, "epoch": 0.8808158062460165, "grad_norm": 21.758710861206055, "kl": 0.095703125, "learning_rate": 1.1918419375398342e-07, "loss": 0.0038, "reward": 1.4576029777526855, "reward_std": 0.06436774134635925, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4576028883457184, "step": 2764 }, { "completion_length": 262.53125, "epoch": 0.8811344805608667, "grad_norm": 85.56441497802734, "kl": 0.080078125, "learning_rate": 1.188655194391332e-07, "loss": 0.0032, "reward": 1.5895549058914185, "reward_std": 0.10457515716552734, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.48017996549606323, "step": 2765 }, { "completion_length": 293.046875, "epoch": 0.8814531548757171, "grad_norm": 109.34986877441406, "kl": 0.07275390625, "learning_rate": 1.1854684512428298e-07, "loss": 0.0029, "reward": 1.5263152122497559, "reward_std": 0.03732115402817726, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.40131521224975586, "rewards/pad": 0.125, "step": 2766 }, { "completion_length": 173.46875, "epoch": 0.8817718291905673, "grad_norm": 14.634538650512695, "kl": 0.10595703125, "learning_rate": 1.1822817080943276e-07, "loss": 0.0042, "reward": 1.7670618295669556, "reward_std": 0.15223199129104614, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5639367699623108, "rewards/pad": 0.203125, "step": 2767 }, { "completion_length": 287.375, "epoch": 0.8820905035054175, "grad_norm": 9.117862701416016, "kl": 0.068359375, "learning_rate": 1.1790949649458252e-07, "loss": 0.0027, "reward": 1.5190690755844116, "reward_std": 0.04640193283557892, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.39406919479370117, "rewards/pad": 0.125, "step": 2768 }, { "completion_length": 289.34375, "epoch": 0.8824091778202677, "grad_norm": 11.593138694763184, "kl": 0.06396484375, "learning_rate": 1.175908221797323e-07, "loss": 0.0026, "reward": 1.5423412322998047, "reward_std": 0.1217973604798317, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4954662322998047, "rewards/pad": 0.046875, "step": 2769 }, { "completion_length": 298.53125, "epoch": 0.882727852135118, "grad_norm": 11.1407470703125, "kl": 0.06640625, "learning_rate": 1.1727214786488209e-07, "loss": 0.0027, "reward": 1.5723299980163574, "reward_std": 0.048630490899086, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5723298788070679, "rewards/pad": 0.0, "step": 2770 }, { "completion_length": 194.359375, "epoch": 0.8830465264499682, "grad_norm": 18.737730026245117, "kl": 0.087890625, "learning_rate": 1.1695347355003187e-07, "loss": 0.0035, "reward": 1.5446221828460693, "reward_std": 0.0422695018351078, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5446221828460693, "step": 2771 }, { "completion_length": 202.625, "epoch": 0.8833652007648184, "grad_norm": 16.50901222229004, "kl": 0.0859375, "learning_rate": 1.1663479923518165e-07, "loss": 0.0034, "reward": 1.4691429138183594, "reward_std": 0.17112360894680023, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3597680628299713, "rewards/pad": 0.109375, "step": 2772 }, { "completion_length": 232.125, "epoch": 0.8836838750796686, "grad_norm": 14.47046184539795, "kl": 0.119140625, "learning_rate": 1.1631612492033141e-07, "loss": 0.0048, "reward": 1.4901710748672485, "reward_std": 0.08002850413322449, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4901711344718933, "rewards/pad": 0.0, "step": 2773 }, { "completion_length": 195.40625, "epoch": 0.8840025493945188, "grad_norm": 68.46620178222656, "kl": 0.0888671875, "learning_rate": 1.159974506054812e-07, "loss": 0.0036, "reward": 1.6715497970581055, "reward_std": 0.15683436393737793, "rewards/answer_reward": 0.265625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.40592485666275024, "step": 2774 }, { "completion_length": 184.78125, "epoch": 0.884321223709369, "grad_norm": 20.505659103393555, "kl": 0.1201171875, "learning_rate": 1.1567877629063097e-07, "loss": 0.0048, "reward": 1.5303431749343872, "reward_std": 0.22911837697029114, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4053431451320648, "rewards/pad": 0.140625, "step": 2775 }, { "completion_length": 267.0625, "epoch": 0.8846398980242193, "grad_norm": 8.295364379882812, "kl": 0.126953125, "learning_rate": 1.1536010197578076e-07, "loss": 0.0051, "reward": 1.6413304805755615, "reward_std": 0.06875017285346985, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5163305401802063, "rewards/pad": 0.125, "step": 2776 }, { "completion_length": 422.984375, "epoch": 0.8849585723390695, "grad_norm": 5.138932228088379, "kl": 0.05224609375, "learning_rate": 1.1504142766093052e-07, "loss": 0.0021, "reward": 1.5415451526641846, "reward_std": 0.1087242066860199, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.43217024207115173, "step": 2777 }, { "completion_length": 175.25, "epoch": 0.8852772466539197, "grad_norm": 13.963083267211914, "kl": 0.08837890625, "learning_rate": 1.1472275334608029e-07, "loss": 0.0035, "reward": 1.8134742975234985, "reward_std": 0.08984021097421646, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5634742975234985, "rewards/pad": 0.25, "step": 2778 }, { "completion_length": 296.796875, "epoch": 0.8855959209687699, "grad_norm": 66.78641510009766, "kl": 0.119140625, "learning_rate": 1.1440407903123007e-07, "loss": 0.0047, "reward": 1.3360178470611572, "reward_std": 0.03126439452171326, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.33601781725883484, "step": 2779 }, { "completion_length": 348.859375, "epoch": 0.8859145952836202, "grad_norm": 20.583782196044922, "kl": 0.06982421875, "learning_rate": 1.1408540471637985e-07, "loss": 0.0028, "reward": 1.4972288608551025, "reward_std": 0.056556351482868195, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49722886085510254, "rewards/pad": 0.0, "step": 2780 }, { "completion_length": 311.40625, "epoch": 0.8862332695984704, "grad_norm": 13.552074432373047, "kl": 0.0712890625, "learning_rate": 1.1376673040152963e-07, "loss": 0.0028, "reward": 1.6806656122207642, "reward_std": 0.07932628691196442, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5556654930114746, "step": 2781 }, { "completion_length": 191.140625, "epoch": 0.8865519439133206, "grad_norm": 19.05376434326172, "kl": 0.091796875, "learning_rate": 1.1344805608667941e-07, "loss": 0.0037, "reward": 1.6291983127593994, "reward_std": 0.17766210436820984, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5510733127593994, "step": 2782 }, { "completion_length": 204.453125, "epoch": 0.8868706182281708, "grad_norm": 17.7001895904541, "kl": 0.10888671875, "learning_rate": 1.1312938177182918e-07, "loss": 0.0044, "reward": 1.477161169052124, "reward_std": 0.07692308723926544, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47716113924980164, "rewards/pad": 0.0, "step": 2783 }, { "completion_length": 157.640625, "epoch": 0.887189292543021, "grad_norm": 10.790077209472656, "kl": 0.11865234375, "learning_rate": 1.1281070745697896e-07, "loss": 0.0048, "reward": 1.555767297744751, "reward_std": 0.08283720910549164, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5557674169540405, "rewards/pad": 0.0, "step": 2784 }, { "completion_length": 249.6875, "epoch": 0.8875079668578713, "grad_norm": 23.696714401245117, "kl": 0.0986328125, "learning_rate": 1.1249203314212874e-07, "loss": 0.0039, "reward": 1.5565049648284912, "reward_std": 0.07266457378864288, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5565049052238464, "rewards/pad": 0.0, "step": 2785 }, { "completion_length": 243.3125, "epoch": 0.8878266411727215, "grad_norm": 12.905336380004883, "kl": 0.07763671875, "learning_rate": 1.1217335882727852e-07, "loss": 0.0031, "reward": 1.6837048530578613, "reward_std": 0.13713642954826355, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.6993297934532166, "step": 2786 }, { "completion_length": 209.0, "epoch": 0.8881453154875717, "grad_norm": 11.713064193725586, "kl": 0.08984375, "learning_rate": 1.118546845124283e-07, "loss": 0.0036, "reward": 1.5936157703399658, "reward_std": 0.16193625330924988, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.4998658001422882, "rewards/pad": 0.125, "step": 2787 }, { "completion_length": 206.03125, "epoch": 0.8884639898024219, "grad_norm": 20.389999389648438, "kl": 0.1005859375, "learning_rate": 1.1153601019757807e-07, "loss": 0.004, "reward": 1.4149174690246582, "reward_std": 0.1828393191099167, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.43054234981536865, "rewards/pad": 0.0, "step": 2788 }, { "completion_length": 279.6875, "epoch": 0.8887826641172721, "grad_norm": 10.964557647705078, "kl": 0.0859375, "learning_rate": 1.1121733588272785e-07, "loss": 0.0034, "reward": 1.7892601490020752, "reward_std": 0.09654922038316727, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6642600893974304, "rewards/pad": 0.125, "step": 2789 }, { "completion_length": 349.515625, "epoch": 0.8891013384321224, "grad_norm": 7.1359758377075195, "kl": 0.053466796875, "learning_rate": 1.1089866156787763e-07, "loss": 0.0021, "reward": 1.4936069250106812, "reward_std": 0.10209286212921143, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5092318654060364, "step": 2790 }, { "completion_length": 331.53125, "epoch": 0.8894200127469726, "grad_norm": 10.255035400390625, "kl": 0.05712890625, "learning_rate": 1.1057998725302741e-07, "loss": 0.0023, "reward": 1.6107176542282104, "reward_std": 0.04477081447839737, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48571765422821045, "rewards/pad": 0.125, "step": 2791 }, { "completion_length": 360.96875, "epoch": 0.8897386870618228, "grad_norm": 17.346084594726562, "kl": 0.06591796875, "learning_rate": 1.1026131293817719e-07, "loss": 0.0026, "reward": 1.4452977180480957, "reward_std": 0.13373248279094696, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4765477478504181, "step": 2792 }, { "completion_length": 179.078125, "epoch": 0.890057361376673, "grad_norm": 13.599401473999023, "kl": 0.0908203125, "learning_rate": 1.0994263862332694e-07, "loss": 0.0036, "reward": 1.6920461654663086, "reward_std": 0.09594403207302094, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5670461058616638, "rewards/pad": 0.125, "step": 2793 }, { "completion_length": 200.421875, "epoch": 0.8903760356915232, "grad_norm": 10.814448356628418, "kl": 0.09814453125, "learning_rate": 1.0962396430847672e-07, "loss": 0.0039, "reward": 1.4065415859222412, "reward_std": 0.052019260823726654, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.406541645526886, "rewards/pad": 0.0, "step": 2794 }, { "completion_length": 248.234375, "epoch": 0.8906947100063735, "grad_norm": 12.750199317932129, "kl": 0.06494140625, "learning_rate": 1.093052899936265e-07, "loss": 0.0026, "reward": 1.776698350906372, "reward_std": 0.1367223858833313, "rewards/pad": 0.328125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.44857338070869446, "step": 2795 }, { "completion_length": 257.71875, "epoch": 0.8910133843212237, "grad_norm": 7.891810417175293, "kl": 0.083984375, "learning_rate": 1.0898661567877629e-07, "loss": 0.0034, "reward": 1.8464381694793701, "reward_std": 0.08025862276554108, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5964381694793701, "rewards/pad": 0.25, "step": 2796 }, { "completion_length": 196.890625, "epoch": 0.8913320586360739, "grad_norm": 12.372171401977539, "kl": 0.0869140625, "learning_rate": 1.0866794136392607e-07, "loss": 0.0035, "reward": 1.5370149612426758, "reward_std": 0.0749216079711914, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5370149612426758, "rewards/pad": 0.0, "step": 2797 }, { "completion_length": 210.09375, "epoch": 0.8916507329509241, "grad_norm": 11.810192108154297, "kl": 0.0966796875, "learning_rate": 1.0834926704907583e-07, "loss": 0.0039, "reward": 1.3395678997039795, "reward_std": 0.1803453117609024, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.2614429295063019, "rewards/pad": 0.09375, "step": 2798 }, { "completion_length": 265.53125, "epoch": 0.8919694072657743, "grad_norm": 18.495361328125, "kl": 0.10693359375, "learning_rate": 1.0803059273422561e-07, "loss": 0.0043, "reward": 1.8013211488723755, "reward_std": 0.1095597967505455, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5669461488723755, "rewards/pad": 0.234375, "step": 2799 }, { "completion_length": 256.640625, "epoch": 0.8922880815806246, "grad_norm": 15.504286766052246, "kl": 0.078125, "learning_rate": 1.077119184193754e-07, "loss": 0.0031, "reward": 1.6308748722076416, "reward_std": 0.03704064339399338, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5058748126029968, "step": 2800 }, { "completion_length": 328.734375, "epoch": 0.8926067558954748, "grad_norm": 14.028239250183105, "kl": 0.06201171875, "learning_rate": 1.0739324410452518e-07, "loss": 0.0025, "reward": 1.3260129690170288, "reward_std": 0.0780874714255333, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.32601290941238403, "step": 2801 }, { "completion_length": 382.078125, "epoch": 0.892925430210325, "grad_norm": 42.63294219970703, "kl": 0.09814453125, "learning_rate": 1.0707456978967496e-07, "loss": 0.0039, "reward": 1.3472610712051392, "reward_std": 0.15402668714523315, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.37851107120513916, "step": 2802 }, { "completion_length": 197.203125, "epoch": 0.8932441045251752, "grad_norm": 11.023219108581543, "kl": 0.08935546875, "learning_rate": 1.0675589547482472e-07, "loss": 0.0036, "reward": 1.520681381225586, "reward_std": 0.10876883566379547, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5050563812255859, "rewards/pad": 0.015625, "step": 2803 }, { "completion_length": 319.359375, "epoch": 0.8935627788400254, "grad_norm": 143.46206665039062, "kl": 0.072265625, "learning_rate": 1.064372211599745e-07, "loss": 0.0029, "reward": 1.4527530670166016, "reward_std": 0.046053387224674225, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.32775312662124634, "step": 2804 }, { "completion_length": 249.546875, "epoch": 0.8938814531548758, "grad_norm": 30.37599754333496, "kl": 0.08203125, "learning_rate": 1.0611854684512428e-07, "loss": 0.0033, "reward": 1.5027915239334106, "reward_std": 0.06004762649536133, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5027914643287659, "rewards/pad": 0.0, "step": 2805 }, { "completion_length": 235.421875, "epoch": 0.894200127469726, "grad_norm": 8.845436096191406, "kl": 0.08447265625, "learning_rate": 1.0579987253027406e-07, "loss": 0.0034, "reward": 1.6691420078277588, "reward_std": 0.06246839463710785, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5441420674324036, "rewards/pad": 0.125, "step": 2806 }, { "completion_length": 250.03125, "epoch": 0.8945188017845762, "grad_norm": 5.992687225341797, "kl": 0.0849609375, "learning_rate": 1.0548119821542384e-07, "loss": 0.0034, "reward": 1.6259146928787231, "reward_std": 0.06487414240837097, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5009146928787231, "rewards/pad": 0.125, "step": 2807 }, { "completion_length": 307.234375, "epoch": 0.8948374760994264, "grad_norm": 9.169029235839844, "kl": 0.0869140625, "learning_rate": 1.051625239005736e-07, "loss": 0.0035, "reward": 1.4168381690979004, "reward_std": 0.1290677785873413, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4324631690979004, "rewards/pad": 0.0, "step": 2808 }, { "completion_length": 204.796875, "epoch": 0.8951561504142767, "grad_norm": 21.92626190185547, "kl": 0.0859375, "learning_rate": 1.0484384958572338e-07, "loss": 0.0034, "reward": 1.656879186630249, "reward_std": 0.12416574358940125, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5475040674209595, "rewards/pad": 0.125, "step": 2809 }, { "completion_length": 260.203125, "epoch": 0.8954748247291269, "grad_norm": 14.02978515625, "kl": 0.10693359375, "learning_rate": 1.0452517527087316e-07, "loss": 0.0043, "reward": 1.5584192276000977, "reward_std": 0.08877848088741302, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4490443170070648, "step": 2810 }, { "completion_length": 370.109375, "epoch": 0.8957934990439771, "grad_norm": 7.561759948730469, "kl": 0.0595703125, "learning_rate": 1.0420650095602294e-07, "loss": 0.0024, "reward": 1.6742053031921387, "reward_std": 0.03354213014245033, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5492053031921387, "step": 2811 }, { "completion_length": 317.0625, "epoch": 0.8961121733588273, "grad_norm": 54.87419128417969, "kl": 0.0712890625, "learning_rate": 1.0388782664117271e-07, "loss": 0.0029, "reward": 1.637012004852295, "reward_std": 0.16005094349384308, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5432620048522949, "step": 2812 }, { "completion_length": 160.796875, "epoch": 0.8964308476736775, "grad_norm": 6.998462200164795, "kl": 0.08544921875, "learning_rate": 1.0356915232632249e-07, "loss": 0.0034, "reward": 1.7436039447784424, "reward_std": 0.047648873180150986, "rewards/pad": 0.375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.36860400438308716, "step": 2813 }, { "completion_length": 266.59375, "epoch": 0.8967495219885278, "grad_norm": 9.871182441711426, "kl": 0.078125, "learning_rate": 1.0325047801147227e-07, "loss": 0.0031, "reward": 1.5711551904678345, "reward_std": 0.06365922838449478, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5711551904678345, "step": 2814 }, { "completion_length": 212.109375, "epoch": 0.897068196303378, "grad_norm": 9.782853126525879, "kl": 0.09619140625, "learning_rate": 1.0293180369662205e-07, "loss": 0.0038, "reward": 1.6640100479125977, "reward_std": 0.06419562548398972, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5390099883079529, "step": 2815 }, { "completion_length": 196.34375, "epoch": 0.8973868706182282, "grad_norm": 10.195688247680664, "kl": 0.087890625, "learning_rate": 1.0261312938177183e-07, "loss": 0.0035, "reward": 1.5186760425567627, "reward_std": 0.10513772070407867, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.40930113196372986, "rewards/pad": 0.125, "step": 2816 }, { "completion_length": 347.6875, "epoch": 0.8977055449330784, "grad_norm": 5.789995193481445, "kl": 0.0791015625, "learning_rate": 1.022944550669216e-07, "loss": 0.0032, "reward": 1.5678385496139526, "reward_std": 0.0854002982378006, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.44283854961395264, "step": 2817 }, { "completion_length": 267.203125, "epoch": 0.8980242192479286, "grad_norm": 23.427282333374023, "kl": 0.08154296875, "learning_rate": 1.0197578075207138e-07, "loss": 0.0033, "reward": 1.720076084136963, "reward_std": 0.14952677488327026, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5169510245323181, "rewards/pad": 0.21875, "step": 2818 }, { "completion_length": 244.140625, "epoch": 0.8983428935627789, "grad_norm": 9.095246315002441, "kl": 0.09130859375, "learning_rate": 1.0165710643722116e-07, "loss": 0.0037, "reward": 1.6292340755462646, "reward_std": 0.0824042558670044, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5042339563369751, "rewards/pad": 0.125, "step": 2819 }, { "completion_length": 236.90625, "epoch": 0.8986615678776291, "grad_norm": 21.445959091186523, "kl": 0.0888671875, "learning_rate": 1.0133843212237094e-07, "loss": 0.0035, "reward": 1.616790771484375, "reward_std": 0.16625714302062988, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.507415771484375, "step": 2820 }, { "completion_length": 226.953125, "epoch": 0.8989802421924793, "grad_norm": 9.607512474060059, "kl": 0.1025390625, "learning_rate": 1.0101975780752072e-07, "loss": 0.0041, "reward": 1.4582407474517822, "reward_std": 0.06423820555210114, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.33324068784713745, "rewards/pad": 0.125, "step": 2821 }, { "completion_length": 202.296875, "epoch": 0.8992989165073295, "grad_norm": 40.7913818359375, "kl": 0.10498046875, "learning_rate": 1.0070108349267049e-07, "loss": 0.0042, "reward": 1.6470917463302612, "reward_std": 0.1350969672203064, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5377167463302612, "rewards/pad": 0.125, "step": 2822 }, { "completion_length": 292.65625, "epoch": 0.8996175908221797, "grad_norm": 12.874184608459473, "kl": 0.06640625, "learning_rate": 1.0038240917782025e-07, "loss": 0.0027, "reward": 1.6831600666046143, "reward_std": 0.05511980876326561, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5581599473953247, "step": 2823 }, { "completion_length": 195.5625, "epoch": 0.89993626513703, "grad_norm": 15.63112735748291, "kl": 0.083984375, "learning_rate": 1.0006373486297003e-07, "loss": 0.0034, "reward": 1.7760124206542969, "reward_std": 0.060271888971328735, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6510124206542969, "rewards/pad": 0.125, "step": 2824 }, { "completion_length": 368.421875, "epoch": 0.9002549394518802, "grad_norm": 10.406145095825195, "kl": 0.07470703125, "learning_rate": 9.974506054811981e-08, "loss": 0.003, "reward": 1.6415812969207764, "reward_std": 0.1609392762184143, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5478312969207764, "rewards/pad": 0.109375, "step": 2825 }, { "completion_length": 194.78125, "epoch": 0.9005736137667304, "grad_norm": 23.362503051757812, "kl": 0.0966796875, "learning_rate": 9.94263862332696e-08, "loss": 0.0039, "reward": 1.7137928009033203, "reward_std": 0.09469905495643616, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.7137926816940308, "rewards/pad": 0.0, "step": 2826 }, { "completion_length": 155.046875, "epoch": 0.9008922880815806, "grad_norm": 36.5787467956543, "kl": 0.0986328125, "learning_rate": 9.910771191841936e-08, "loss": 0.0039, "reward": 1.466389536857605, "reward_std": 0.12822285294532776, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.482014536857605, "step": 2827 }, { "completion_length": 246.375, "epoch": 0.9012109623964308, "grad_norm": 30.5788516998291, "kl": 0.0908203125, "learning_rate": 9.878903760356914e-08, "loss": 0.0036, "reward": 1.5797438621520996, "reward_std": 0.13034319877624512, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4859938323497772, "rewards/pad": 0.109375, "step": 2828 }, { "completion_length": 249.953125, "epoch": 0.9015296367112811, "grad_norm": 7.016419410705566, "kl": 0.09521484375, "learning_rate": 9.847036328871892e-08, "loss": 0.0038, "reward": 1.6300158500671387, "reward_std": 0.09085611253976822, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5050158500671387, "rewards/pad": 0.125, "step": 2829 }, { "completion_length": 248.46875, "epoch": 0.9018483110261313, "grad_norm": 12.229598045349121, "kl": 0.1162109375, "learning_rate": 9.81516889738687e-08, "loss": 0.0046, "reward": 1.2450551986694336, "reward_std": 0.0686068907380104, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.2450551837682724, "step": 2830 }, { "completion_length": 105.421875, "epoch": 0.9021669853409815, "grad_norm": 17.319347381591797, "kl": 0.173828125, "learning_rate": 9.783301465901848e-08, "loss": 0.0069, "reward": 1.8770604133605957, "reward_std": 0.18187379837036133, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6426854729652405, "rewards/pad": 0.234375, "step": 2831 }, { "completion_length": 244.40625, "epoch": 0.9024856596558317, "grad_norm": 13.74586296081543, "kl": 0.076171875, "learning_rate": 9.751434034416825e-08, "loss": 0.003, "reward": 1.508291482925415, "reward_std": 0.08148925751447678, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.508291482925415, "rewards/pad": 0.0, "step": 2832 }, { "completion_length": 239.046875, "epoch": 0.9028043339706819, "grad_norm": 10.063822746276855, "kl": 0.08447265625, "learning_rate": 9.719566602931803e-08, "loss": 0.0034, "reward": 1.597080945968628, "reward_std": 0.056462403386831284, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5970809459686279, "step": 2833 }, { "completion_length": 278.8125, "epoch": 0.9031230082855322, "grad_norm": 4.26906681060791, "kl": 0.078125, "learning_rate": 9.687699171446781e-08, "loss": 0.0031, "reward": 1.5676623582839966, "reward_std": 0.07192361354827881, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.45828741788864136, "step": 2834 }, { "completion_length": 218.5, "epoch": 0.9034416826003824, "grad_norm": 18.245553970336914, "kl": 0.08203125, "learning_rate": 9.655831739961759e-08, "loss": 0.0033, "reward": 1.528798222541809, "reward_std": 0.043091028928756714, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4037981331348419, "rewards/pad": 0.125, "step": 2835 }, { "completion_length": 159.328125, "epoch": 0.9037603569152326, "grad_norm": 7.013727188110352, "kl": 0.103515625, "learning_rate": 9.623964308476737e-08, "loss": 0.0041, "reward": 1.5832340717315674, "reward_std": 0.12680181860923767, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.473859041929245, "step": 2836 }, { "completion_length": 237.671875, "epoch": 0.9040790312300828, "grad_norm": 20.34369468688965, "kl": 0.095703125, "learning_rate": 9.592096876991714e-08, "loss": 0.0038, "reward": 1.3584239482879639, "reward_std": 0.1278751641511917, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.37404897809028625, "step": 2837 }, { "completion_length": 289.9375, "epoch": 0.904397705544933, "grad_norm": 16.990022659301758, "kl": 0.0712890625, "learning_rate": 9.560229445506691e-08, "loss": 0.0029, "reward": 1.6058906316757202, "reward_std": 0.06301936507225037, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4808906316757202, "rewards/pad": 0.125, "step": 2838 }, { "completion_length": 319.953125, "epoch": 0.9047163798597833, "grad_norm": 8.548789978027344, "kl": 0.0791015625, "learning_rate": 9.528362014021669e-08, "loss": 0.0032, "reward": 1.5624927282333374, "reward_std": 0.11143027245998383, "rewards/pad": 0.140625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4218676686286926, "step": 2839 }, { "completion_length": 187.125, "epoch": 0.9050350541746335, "grad_norm": 10.730565071105957, "kl": 0.10205078125, "learning_rate": 9.496494582536647e-08, "loss": 0.0041, "reward": 1.679841160774231, "reward_std": 0.1068398505449295, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.570466160774231, "rewards/pad": 0.109375, "step": 2840 }, { "completion_length": 185.015625, "epoch": 0.9053537284894837, "grad_norm": 27.30592155456543, "kl": 0.07568359375, "learning_rate": 9.464627151051625e-08, "loss": 0.003, "reward": 1.6507123708724976, "reward_std": 0.18122225999832153, "rewards/answer_reward": 0.171875, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4944624602794647, "step": 2841 }, { "completion_length": 245.65625, "epoch": 0.9056724028043339, "grad_norm": 5.355472564697266, "kl": 0.0712890625, "learning_rate": 9.432759719566602e-08, "loss": 0.0029, "reward": 1.4526417255401611, "reward_std": 0.07515285909175873, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4370167553424835, "rewards/pad": 0.015625, "step": 2842 }, { "completion_length": 150.125, "epoch": 0.9059910771191841, "grad_norm": 186.78187561035156, "kl": 0.11279296875, "learning_rate": 9.40089228808158e-08, "loss": 0.0045, "reward": 1.5947678089141846, "reward_std": 0.07983440905809402, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5947678685188293, "rewards/pad": 0.0, "step": 2843 }, { "completion_length": 226.046875, "epoch": 0.9063097514340345, "grad_norm": 13.87447452545166, "kl": 0.07373046875, "learning_rate": 9.369024856596558e-08, "loss": 0.0029, "reward": 1.6612658500671387, "reward_std": 0.1668727546930313, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4112658202648163, "step": 2844 }, { "completion_length": 327.59375, "epoch": 0.9066284257488847, "grad_norm": 7.351214408874512, "kl": 0.060791015625, "learning_rate": 9.337157425111536e-08, "loss": 0.0024, "reward": 1.4562897682189941, "reward_std": 0.06675264239311218, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4562898874282837, "step": 2845 }, { "completion_length": 229.640625, "epoch": 0.9069471000637349, "grad_norm": 14.260663032531738, "kl": 0.08154296875, "learning_rate": 9.305289993626514e-08, "loss": 0.0033, "reward": 1.507227897644043, "reward_std": 0.04547261819243431, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.507227897644043, "rewards/pad": 0.0, "step": 2846 }, { "completion_length": 274.28125, "epoch": 0.9072657743785851, "grad_norm": 4.880247592926025, "kl": 0.0927734375, "learning_rate": 9.27342256214149e-08, "loss": 0.0037, "reward": 1.577436923980713, "reward_std": 0.18369559943675995, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.48368698358535767, "rewards/pad": 0.125, "step": 2847 }, { "completion_length": 301.828125, "epoch": 0.9075844486934354, "grad_norm": 7.844675064086914, "kl": 0.123046875, "learning_rate": 9.241555130656469e-08, "loss": 0.0049, "reward": 1.560899257659912, "reward_std": 0.06565877795219421, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5608993172645569, "step": 2848 }, { "completion_length": 224.046875, "epoch": 0.9079031230082856, "grad_norm": 15.078897476196289, "kl": 0.08349609375, "learning_rate": 9.209687699171447e-08, "loss": 0.0033, "reward": 1.7017287015914917, "reward_std": 0.09162631630897522, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4517287611961365, "step": 2849 }, { "completion_length": 221.53125, "epoch": 0.9082217973231358, "grad_norm": 14.18359661102295, "kl": 0.095703125, "learning_rate": 9.177820267686425e-08, "loss": 0.0038, "reward": 1.5834238529205322, "reward_std": 0.12328074872493744, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3646739721298218, "rewards/pad": 0.21875, "step": 2850 }, { "completion_length": 207.4375, "epoch": 0.908540471637986, "grad_norm": 16.44442367553711, "kl": 0.08740234375, "learning_rate": 9.145952836201403e-08, "loss": 0.0035, "reward": 1.647652268409729, "reward_std": 0.11559803783893585, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.538277268409729, "rewards/pad": 0.125, "step": 2851 }, { "completion_length": 332.234375, "epoch": 0.9088591459528362, "grad_norm": 70.49515533447266, "kl": 0.058837890625, "learning_rate": 9.11408540471638e-08, "loss": 0.0024, "reward": 1.549206018447876, "reward_std": 0.16914981603622437, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.4554559886455536, "rewards/pad": 0.125, "step": 2852 }, { "completion_length": 387.90625, "epoch": 0.9091778202676865, "grad_norm": 9.856443405151367, "kl": 0.053955078125, "learning_rate": 9.082217973231358e-08, "loss": 0.0022, "reward": 1.5135695934295654, "reward_std": 0.04079330712556839, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5135695934295654, "step": 2853 }, { "completion_length": 153.375, "epoch": 0.9094964945825367, "grad_norm": 22.22443962097168, "kl": 0.11083984375, "learning_rate": 9.050350541746334e-08, "loss": 0.0044, "reward": 1.5554393529891968, "reward_std": 0.03890087455511093, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.430439293384552, "rewards/pad": 0.125, "step": 2854 }, { "completion_length": 242.296875, "epoch": 0.9098151688973869, "grad_norm": 7.04885196685791, "kl": 0.08056640625, "learning_rate": 9.018483110261312e-08, "loss": 0.0032, "reward": 1.5216234922409058, "reward_std": 0.04502193629741669, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.39662352204322815, "step": 2855 }, { "completion_length": 310.09375, "epoch": 0.9101338432122371, "grad_norm": 7.436180591583252, "kl": 0.0703125, "learning_rate": 8.98661567877629e-08, "loss": 0.0028, "reward": 1.6724367141723633, "reward_std": 0.052072592079639435, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5474367737770081, "step": 2856 }, { "completion_length": 257.375, "epoch": 0.9104525175270873, "grad_norm": 13.737147331237793, "kl": 0.08642578125, "learning_rate": 8.954748247291267e-08, "loss": 0.0035, "reward": 1.5169252157211304, "reward_std": 0.13501378893852234, "rewards/pad": 0.046875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47005027532577515, "step": 2857 }, { "completion_length": 286.03125, "epoch": 0.9107711918419376, "grad_norm": 14.250129699707031, "kl": 0.08447265625, "learning_rate": 8.922880815806245e-08, "loss": 0.0034, "reward": 1.4254100322723389, "reward_std": 0.10079248249530792, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.42541009187698364, "step": 2858 }, { "completion_length": 226.6875, "epoch": 0.9110898661567878, "grad_norm": 16.881498336791992, "kl": 0.06396484375, "learning_rate": 8.891013384321223e-08, "loss": 0.0026, "reward": 1.9186750650405884, "reward_std": 0.10284863412380219, "rewards/answer_reward": 0.5, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.41867509484291077, "step": 2859 }, { "completion_length": 356.6875, "epoch": 0.911408540471638, "grad_norm": 7.63905143737793, "kl": 0.052490234375, "learning_rate": 8.859145952836201e-08, "loss": 0.0021, "reward": 1.4054787158966064, "reward_std": 0.11696632206439972, "rewards/pad": 0.09375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.31172871589660645, "step": 2860 }, { "completion_length": 305.28125, "epoch": 0.9117272147864882, "grad_norm": 23.72255516052246, "kl": 0.054443359375, "learning_rate": 8.827278521351178e-08, "loss": 0.0022, "reward": 1.6052396297454834, "reward_std": 0.13836698234081268, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.511489748954773, "step": 2861 }, { "completion_length": 257.546875, "epoch": 0.9120458891013384, "grad_norm": 9.212669372558594, "kl": 0.0791015625, "learning_rate": 8.795411089866156e-08, "loss": 0.0032, "reward": 1.6073306798934937, "reward_std": 0.08537907898426056, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.3729557394981384, "step": 2862 }, { "completion_length": 219.921875, "epoch": 0.9123645634161887, "grad_norm": 10.81699275970459, "kl": 0.08837890625, "learning_rate": 8.763543658381134e-08, "loss": 0.0035, "reward": 1.4224504232406616, "reward_std": 0.15301409363746643, "rewards/answer_reward": 0.015625, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4224504232406616, "step": 2863 }, { "completion_length": 266.21875, "epoch": 0.9126832377310389, "grad_norm": 11.685783386230469, "kl": 0.08203125, "learning_rate": 8.731676226896112e-08, "loss": 0.0033, "reward": 1.657499074935913, "reward_std": 0.07901878654956818, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4074990451335907, "step": 2864 }, { "completion_length": 201.703125, "epoch": 0.9130019120458891, "grad_norm": 109.4502944946289, "kl": 0.09130859375, "learning_rate": 8.69980879541109e-08, "loss": 0.0036, "reward": 1.5427440404891968, "reward_std": 0.11305812746286392, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5583691000938416, "rewards/pad": 0.0, "step": 2865 }, { "completion_length": 219.46875, "epoch": 0.9133205863607393, "grad_norm": 52.34642028808594, "kl": 0.10009765625, "learning_rate": 8.667941363926067e-08, "loss": 0.004, "reward": 1.4690560102462769, "reward_std": 0.08319102227687836, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.34405606985092163, "rewards/pad": 0.125, "step": 2866 }, { "completion_length": 231.09375, "epoch": 0.9136392606755895, "grad_norm": 9.66303539276123, "kl": 0.07470703125, "learning_rate": 8.636073932441045e-08, "loss": 0.003, "reward": 1.5846589803695679, "reward_std": 0.1445649415254593, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5221589803695679, "rewards/pad": 0.0625, "step": 2867 }, { "completion_length": 200.296875, "epoch": 0.9139579349904398, "grad_norm": 20.801359176635742, "kl": 0.1103515625, "learning_rate": 8.604206500956023e-08, "loss": 0.0044, "reward": 1.584848403930664, "reward_std": 0.11578701436519623, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5848484039306641, "rewards/pad": 0.0, "step": 2868 }, { "completion_length": 208.046875, "epoch": 0.91427660930529, "grad_norm": 16.8826847076416, "kl": 0.1025390625, "learning_rate": 8.572339069471e-08, "loss": 0.0041, "reward": 1.6622300148010254, "reward_std": 0.07939045131206512, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6622299551963806, "step": 2869 }, { "completion_length": 242.828125, "epoch": 0.9145952836201402, "grad_norm": 5.844204902648926, "kl": 0.07666015625, "learning_rate": 8.540471637985978e-08, "loss": 0.0031, "reward": 1.5444097518920898, "reward_std": 0.031050903722643852, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4194098711013794, "step": 2870 }, { "completion_length": 210.890625, "epoch": 0.9149139579349904, "grad_norm": 15.819625854492188, "kl": 0.10302734375, "learning_rate": 8.508604206500955e-08, "loss": 0.0041, "reward": 1.4890446662902832, "reward_std": 0.11497621238231659, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4734196066856384, "step": 2871 }, { "completion_length": 279.6875, "epoch": 0.9152326322498406, "grad_norm": 14.926518440246582, "kl": 0.0751953125, "learning_rate": 8.476736775015933e-08, "loss": 0.003, "reward": 1.5654652118682861, "reward_std": 0.058053113520145416, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4404652714729309, "step": 2872 }, { "completion_length": 294.046875, "epoch": 0.9155513065646909, "grad_norm": 8.7820463180542, "kl": 0.087890625, "learning_rate": 8.444869343530911e-08, "loss": 0.0035, "reward": 1.6744558811187744, "reward_std": 0.07011964917182922, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5494560599327087, "step": 2873 }, { "completion_length": 201.15625, "epoch": 0.9158699808795411, "grad_norm": 21.165136337280273, "kl": 0.087890625, "learning_rate": 8.413001912045889e-08, "loss": 0.0035, "reward": 1.5898141860961914, "reward_std": 0.08441634476184845, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5898141860961914, "step": 2874 }, { "completion_length": 215.3125, "epoch": 0.9161886551943913, "grad_norm": 14.786849021911621, "kl": 0.1259765625, "learning_rate": 8.381134480560867e-08, "loss": 0.005, "reward": 1.7169456481933594, "reward_std": 0.09836380928754807, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.48257070779800415, "step": 2875 }, { "completion_length": 261.90625, "epoch": 0.9165073295092415, "grad_norm": 8.064598083496094, "kl": 0.083984375, "learning_rate": 8.349267049075843e-08, "loss": 0.0034, "reward": 1.6613750457763672, "reward_std": 0.06930655241012573, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.41137492656707764, "rewards/pad": 0.25, "step": 2876 }, { "completion_length": 257.5, "epoch": 0.9168260038240917, "grad_norm": 15.72818374633789, "kl": 0.0859375, "learning_rate": 8.317399617590822e-08, "loss": 0.0034, "reward": 1.6899898052215576, "reward_std": 0.06479023396968842, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43998971581459045, "rewards/pad": 0.25, "step": 2877 }, { "completion_length": 305.796875, "epoch": 0.917144678138942, "grad_norm": 76.97195434570312, "kl": 0.1611328125, "learning_rate": 8.2855321861058e-08, "loss": 0.0065, "reward": 1.6203289031982422, "reward_std": 0.10637544095516205, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4953289031982422, "rewards/pad": 0.125, "step": 2878 }, { "completion_length": 283.390625, "epoch": 0.9174633524537922, "grad_norm": 7.364353656768799, "kl": 0.0771484375, "learning_rate": 8.253664754620778e-08, "loss": 0.0031, "reward": 1.577134132385254, "reward_std": 0.13365203142166138, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5927591323852539, "step": 2879 }, { "completion_length": 210.359375, "epoch": 0.9177820267686424, "grad_norm": 18.346982955932617, "kl": 0.09326171875, "learning_rate": 8.221797323135756e-08, "loss": 0.0037, "reward": 1.4383864402770996, "reward_std": 0.14986774325370789, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.40713638067245483, "rewards/pad": 0.03125, "step": 2880 }, { "completion_length": 205.109375, "epoch": 0.9181007010834926, "grad_norm": 11.897275924682617, "kl": 0.09521484375, "learning_rate": 8.189929891650732e-08, "loss": 0.0038, "reward": 1.6616731882095337, "reward_std": 0.06684476137161255, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4116731584072113, "step": 2881 }, { "completion_length": 189.671875, "epoch": 0.9184193753983428, "grad_norm": 9.623435974121094, "kl": 0.087890625, "learning_rate": 8.15806246016571e-08, "loss": 0.0035, "reward": 1.5173976421356201, "reward_std": 0.051018401980400085, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5173976421356201, "rewards/pad": 0.0, "step": 2882 }, { "completion_length": 277.890625, "epoch": 0.9187380497131931, "grad_norm": 7.127283096313477, "kl": 0.07177734375, "learning_rate": 8.126195028680689e-08, "loss": 0.0029, "reward": 1.5310813188552856, "reward_std": 0.09413205087184906, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5467062592506409, "step": 2883 }, { "completion_length": 280.34375, "epoch": 0.9190567240280434, "grad_norm": 6.712581157684326, "kl": 0.07470703125, "learning_rate": 8.094327597195667e-08, "loss": 0.003, "reward": 1.4587035179138184, "reward_std": 0.07780376821756363, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.34932857751846313, "rewards/pad": 0.125, "step": 2884 }, { "completion_length": 200.96875, "epoch": 0.9193753983428936, "grad_norm": 10.461310386657715, "kl": 0.099609375, "learning_rate": 8.062460165710643e-08, "loss": 0.004, "reward": 1.4560800790786743, "reward_std": 0.09152737259864807, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4560800790786743, "step": 2885 }, { "completion_length": 195.28125, "epoch": 0.9196940726577438, "grad_norm": 19.952253341674805, "kl": 0.2265625, "learning_rate": 8.03059273422562e-08, "loss": 0.0091, "reward": 1.5408905744552612, "reward_std": 0.06746616214513779, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5408905744552612, "rewards/pad": 0.0, "step": 2886 }, { "completion_length": 204.234375, "epoch": 0.920012746972594, "grad_norm": 27.799915313720703, "kl": 0.07373046875, "learning_rate": 7.998725302740598e-08, "loss": 0.0029, "reward": 1.5065892934799194, "reward_std": 0.12254871428012848, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.38158929347991943, "rewards/pad": 0.125, "step": 2887 }, { "completion_length": 259.484375, "epoch": 0.9203314212874443, "grad_norm": 17.611223220825195, "kl": 0.080078125, "learning_rate": 7.966857871255576e-08, "loss": 0.0032, "reward": 1.7014851570129395, "reward_std": 0.10515861958265305, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5764852166175842, "step": 2888 }, { "completion_length": 262.984375, "epoch": 0.9206500956022945, "grad_norm": 14.8787841796875, "kl": 0.07080078125, "learning_rate": 7.934990439770554e-08, "loss": 0.0028, "reward": 1.4765360355377197, "reward_std": 0.10343047976493835, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3515360355377197, "rewards/pad": 0.125, "step": 2889 }, { "completion_length": 216.078125, "epoch": 0.9209687699171447, "grad_norm": 11.560380935668945, "kl": 0.11474609375, "learning_rate": 7.903123008285532e-08, "loss": 0.0046, "reward": 1.5976759195327759, "reward_std": 0.1031763032078743, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47267594933509827, "rewards/pad": 0.125, "step": 2890 }, { "completion_length": 211.40625, "epoch": 0.9212874442319949, "grad_norm": 11.442146301269531, "kl": 0.1005859375, "learning_rate": 7.871255576800509e-08, "loss": 0.004, "reward": 1.5785408020019531, "reward_std": 0.11714388430118561, "rewards/pad": 0.140625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4379158020019531, "step": 2891 }, { "completion_length": 286.515625, "epoch": 0.9216061185468452, "grad_norm": 7.306763648986816, "kl": 0.06884765625, "learning_rate": 7.839388145315487e-08, "loss": 0.0028, "reward": 1.492227554321289, "reward_std": 0.0733942911028862, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4922274649143219, "step": 2892 }, { "completion_length": 232.046875, "epoch": 0.9219247928616954, "grad_norm": 10.7002534866333, "kl": 0.0869140625, "learning_rate": 7.807520713830465e-08, "loss": 0.0035, "reward": 1.589614748954773, "reward_std": 0.04720890522003174, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.589614748954773, "step": 2893 }, { "completion_length": 294.625, "epoch": 0.9222434671765456, "grad_norm": 8.909106254577637, "kl": 0.138671875, "learning_rate": 7.775653282345443e-08, "loss": 0.0055, "reward": 1.6108899116516113, "reward_std": 0.08002126216888428, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6108898520469666, "step": 2894 }, { "completion_length": 153.015625, "epoch": 0.9225621414913958, "grad_norm": 18.675016403198242, "kl": 0.1328125, "learning_rate": 7.743785850860421e-08, "loss": 0.0053, "reward": 1.551439881324768, "reward_std": 0.09808996319770813, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5514398813247681, "rewards/pad": 0.0, "step": 2895 }, { "completion_length": 278.375, "epoch": 0.922880815806246, "grad_norm": 39.86183547973633, "kl": 0.0673828125, "learning_rate": 7.711918419375398e-08, "loss": 0.0027, "reward": 1.4703035354614258, "reward_std": 0.1023271381855011, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4703035354614258, "step": 2896 }, { "completion_length": 189.25, "epoch": 0.9231994901210963, "grad_norm": 7.673598289489746, "kl": 0.09716796875, "learning_rate": 7.680050987890376e-08, "loss": 0.0039, "reward": 1.5865952968597412, "reward_std": 0.048405349254608154, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5865952372550964, "rewards/pad": 0.0, "step": 2897 }, { "completion_length": 304.890625, "epoch": 0.9235181644359465, "grad_norm": 6.57677698135376, "kl": 0.06494140625, "learning_rate": 7.648183556405354e-08, "loss": 0.0026, "reward": 1.470102071762085, "reward_std": 0.06978422403335571, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47010213136672974, "step": 2898 }, { "completion_length": 336.0, "epoch": 0.9238368387507967, "grad_norm": 10.75872802734375, "kl": 0.055908203125, "learning_rate": 7.616316124920332e-08, "loss": 0.0022, "reward": 1.6533763408660889, "reward_std": 0.05971824750304222, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5283763408660889, "step": 2899 }, { "completion_length": 232.671875, "epoch": 0.9241555130656469, "grad_norm": 6.692727565765381, "kl": 0.08154296875, "learning_rate": 7.584448693435309e-08, "loss": 0.0033, "reward": 1.5848937034606934, "reward_std": 0.0841706320643425, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45989376306533813, "rewards/pad": 0.125, "step": 2900 }, { "completion_length": 213.953125, "epoch": 0.9244741873804971, "grad_norm": 19.63039779663086, "kl": 0.07275390625, "learning_rate": 7.552581261950285e-08, "loss": 0.0029, "reward": 1.582329273223877, "reward_std": 0.09304428100585938, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4573292136192322, "step": 2901 }, { "completion_length": 204.390625, "epoch": 0.9247928616953474, "grad_norm": 29.540363311767578, "kl": 0.08056640625, "learning_rate": 7.520713830465264e-08, "loss": 0.0032, "reward": 1.8274729251861572, "reward_std": 0.15914708375930786, "rewards/answer_reward": 0.3125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.514972984790802, "step": 2902 }, { "completion_length": 256.71875, "epoch": 0.9251115360101976, "grad_norm": 14.957947731018066, "kl": 0.16796875, "learning_rate": 7.488846398980242e-08, "loss": 0.0067, "reward": 1.803660273551941, "reward_std": 0.05807485803961754, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5536603927612305, "step": 2903 }, { "completion_length": 162.78125, "epoch": 0.9254302103250478, "grad_norm": 39.77608871459961, "kl": 0.0927734375, "learning_rate": 7.45697896749522e-08, "loss": 0.0037, "reward": 1.5235381126403809, "reward_std": 0.0962752103805542, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.39853811264038086, "rewards/pad": 0.125, "step": 2904 }, { "completion_length": 211.46875, "epoch": 0.925748884639898, "grad_norm": 25.95490074157715, "kl": 0.09765625, "learning_rate": 7.425111536010198e-08, "loss": 0.0039, "reward": 1.7679393291473389, "reward_std": 0.11521582305431366, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.6585642099380493, "step": 2905 }, { "completion_length": 266.78125, "epoch": 0.9260675589547482, "grad_norm": 24.75307846069336, "kl": 0.349609375, "learning_rate": 7.393244104525174e-08, "loss": 0.014, "reward": 1.5795177221298218, "reward_std": 0.07660073786973953, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45451778173446655, "rewards/pad": 0.125, "step": 2906 }, { "completion_length": 165.796875, "epoch": 0.9263862332695985, "grad_norm": 9.585287094116211, "kl": 0.0986328125, "learning_rate": 7.361376673040152e-08, "loss": 0.0039, "reward": 1.718969464302063, "reward_std": 0.10125716775655746, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5939695239067078, "rewards/pad": 0.125, "step": 2907 }, { "completion_length": 253.953125, "epoch": 0.9267049075844487, "grad_norm": 15.626591682434082, "kl": 0.10009765625, "learning_rate": 7.32950924155513e-08, "loss": 0.004, "reward": 1.4436264038085938, "reward_std": 0.10515650361776352, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4592515230178833, "step": 2908 }, { "completion_length": 221.015625, "epoch": 0.9270235818992989, "grad_norm": 11.700095176696777, "kl": 0.08154296875, "learning_rate": 7.297641810070109e-08, "loss": 0.0033, "reward": 1.6605483293533325, "reward_std": 0.07712937146425247, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5355482697486877, "step": 2909 }, { "completion_length": 233.765625, "epoch": 0.9273422562141491, "grad_norm": 9.402727127075195, "kl": 0.08837890625, "learning_rate": 7.265774378585087e-08, "loss": 0.0035, "reward": 1.52135169506073, "reward_std": 0.08518072962760925, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.52135169506073, "rewards/pad": 0.0, "step": 2910 }, { "completion_length": 247.359375, "epoch": 0.9276609305289993, "grad_norm": 10.16604232788086, "kl": 0.10546875, "learning_rate": 7.233906947100063e-08, "loss": 0.0042, "reward": 1.5909231901168823, "reward_std": 0.0987737774848938, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5909231901168823, "step": 2911 }, { "completion_length": 157.125, "epoch": 0.9279796048438496, "grad_norm": 17.30365562438965, "kl": 0.09326171875, "learning_rate": 7.202039515615041e-08, "loss": 0.0037, "reward": 1.5729458332061768, "reward_std": 0.08636891841888428, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.44794589281082153, "step": 2912 }, { "completion_length": 282.265625, "epoch": 0.9282982791586998, "grad_norm": 16.395715713500977, "kl": 0.087890625, "learning_rate": 7.17017208413002e-08, "loss": 0.0035, "reward": 1.5452919006347656, "reward_std": 0.053568463772535324, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5452919602394104, "step": 2913 }, { "completion_length": 405.84375, "epoch": 0.92861695347355, "grad_norm": 7.0628485679626465, "kl": 0.048095703125, "learning_rate": 7.138304652644997e-08, "loss": 0.0019, "reward": 1.531229019165039, "reward_std": 0.04710269346833229, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5312290191650391, "step": 2914 }, { "completion_length": 205.46875, "epoch": 0.9289356277884002, "grad_norm": 8.386787414550781, "kl": 0.11083984375, "learning_rate": 7.106437221159973e-08, "loss": 0.0044, "reward": 1.684080958366394, "reward_std": 0.10352471470832825, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.574705958366394, "step": 2915 }, { "completion_length": 266.25, "epoch": 0.9292543021032504, "grad_norm": 17.049253463745117, "kl": 0.080078125, "learning_rate": 7.074569789674951e-08, "loss": 0.0032, "reward": 1.6391890048980713, "reward_std": 0.08314747363328934, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3891889750957489, "step": 2916 }, { "completion_length": 258.8125, "epoch": 0.9295729764181007, "grad_norm": 11.82989501953125, "kl": 0.08056640625, "learning_rate": 7.042702358189929e-08, "loss": 0.0032, "reward": 1.608884572982788, "reward_std": 0.12440791726112366, "rewards/pad": 0.15625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4526345431804657, "step": 2917 }, { "completion_length": 294.09375, "epoch": 0.9298916507329509, "grad_norm": 14.522891998291016, "kl": 0.08203125, "learning_rate": 7.010834926704907e-08, "loss": 0.0033, "reward": 1.634501338005066, "reward_std": 0.08189462870359421, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.6501263976097107, "rewards/pad": 0.0, "step": 2918 }, { "completion_length": 355.28125, "epoch": 0.9302103250478011, "grad_norm": 10.51163101196289, "kl": 0.0576171875, "learning_rate": 6.978967495219885e-08, "loss": 0.0023, "reward": 1.5610953569412231, "reward_std": 0.1257803738117218, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.46734535694122314, "step": 2919 }, { "completion_length": 245.546875, "epoch": 0.9305289993626513, "grad_norm": 14.03441047668457, "kl": 0.099609375, "learning_rate": 6.947100063734862e-08, "loss": 0.004, "reward": 1.5305681228637695, "reward_std": 0.06954821199178696, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5305681228637695, "rewards/pad": 0.0, "step": 2920 }, { "completion_length": 312.1875, "epoch": 0.9308476736775015, "grad_norm": 10.062077522277832, "kl": 0.0732421875, "learning_rate": 6.91523263224984e-08, "loss": 0.0029, "reward": 1.5083460807800293, "reward_std": 0.08206836134195328, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5239709615707397, "rewards/pad": 0.0, "step": 2921 }, { "completion_length": 298.671875, "epoch": 0.9311663479923518, "grad_norm": 7.746303081512451, "kl": 0.087890625, "learning_rate": 6.883365200764818e-08, "loss": 0.0035, "reward": 1.5460608005523682, "reward_std": 0.041064534336328506, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5460608005523682, "rewards/pad": 0.0, "step": 2922 }, { "completion_length": 217.46875, "epoch": 0.9314850223072021, "grad_norm": 8.386974334716797, "kl": 0.09130859375, "learning_rate": 6.851497769279796e-08, "loss": 0.0036, "reward": 1.5595431327819824, "reward_std": 0.07949882000684738, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.43454310297966003, "rewards/pad": 0.125, "step": 2923 }, { "completion_length": 346.375, "epoch": 0.9318036966220523, "grad_norm": 6.066806316375732, "kl": 0.076171875, "learning_rate": 6.819630337794774e-08, "loss": 0.003, "reward": 1.5769262313842773, "reward_std": 0.0969569981098175, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5925512909889221, "rewards/pad": 0.0, "step": 2924 }, { "completion_length": 312.3125, "epoch": 0.9321223709369025, "grad_norm": 7.170636177062988, "kl": 0.080078125, "learning_rate": 6.787762906309751e-08, "loss": 0.0032, "reward": 1.492858648300171, "reward_std": 0.12179407477378845, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5084837079048157, "step": 2925 }, { "completion_length": 197.78125, "epoch": 0.9324410452517528, "grad_norm": 18.992835998535156, "kl": 0.09423828125, "learning_rate": 6.755895474824729e-08, "loss": 0.0038, "reward": 1.4061367511749268, "reward_std": 0.09154429286718369, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.42176181077957153, "step": 2926 }, { "completion_length": 220.6875, "epoch": 0.932759719566603, "grad_norm": 23.679418563842773, "kl": 0.0888671875, "learning_rate": 6.724028043339707e-08, "loss": 0.0035, "reward": 1.6889541149139404, "reward_std": 0.09088733792304993, "rewards/answer_reward": 0.171875, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5170789957046509, "step": 2927 }, { "completion_length": 214.296875, "epoch": 0.9330783938814532, "grad_norm": 10.682872772216797, "kl": 0.076171875, "learning_rate": 6.692160611854685e-08, "loss": 0.003, "reward": 1.7833725214004517, "reward_std": 0.06558363139629364, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5333724617958069, "step": 2928 }, { "completion_length": 309.59375, "epoch": 0.9333970681963034, "grad_norm": 5.863774299621582, "kl": 0.064453125, "learning_rate": 6.660293180369663e-08, "loss": 0.0026, "reward": 1.6170300245285034, "reward_std": 0.06263009458780289, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4920300841331482, "step": 2929 }, { "completion_length": 307.515625, "epoch": 0.9337157425111536, "grad_norm": 34.34066390991211, "kl": 0.07373046875, "learning_rate": 6.62842574888464e-08, "loss": 0.0029, "reward": 1.4313735961914062, "reward_std": 0.05077674239873886, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43137362599372864, "step": 2930 }, { "completion_length": 214.859375, "epoch": 0.9340344168260039, "grad_norm": 19.838254928588867, "kl": 0.1279296875, "learning_rate": 6.596558317399616e-08, "loss": 0.0051, "reward": 1.5669782161712646, "reward_std": 0.11023472994565964, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.44197824597358704, "rewards/pad": 0.125, "step": 2931 }, { "completion_length": 262.6875, "epoch": 0.9343530911408541, "grad_norm": 6.702688217163086, "kl": 0.0703125, "learning_rate": 6.564690885914594e-08, "loss": 0.0028, "reward": 1.4523699283599854, "reward_std": 0.04695475473999977, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3273698091506958, "step": 2932 }, { "completion_length": 163.0625, "epoch": 0.9346717654557043, "grad_norm": 11.438345909118652, "kl": 0.0966796875, "learning_rate": 6.532823454429572e-08, "loss": 0.0039, "reward": 1.7000656127929688, "reward_std": 0.05705934017896652, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5750656127929688, "step": 2933 }, { "completion_length": 305.65625, "epoch": 0.9349904397705545, "grad_norm": 23.206321716308594, "kl": 0.072265625, "learning_rate": 6.50095602294455e-08, "loss": 0.0029, "reward": 1.7570136785507202, "reward_std": 0.07544252276420593, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5070135593414307, "step": 2934 }, { "completion_length": 339.765625, "epoch": 0.9353091140854047, "grad_norm": 13.489253997802734, "kl": 0.07568359375, "learning_rate": 6.469088591459527e-08, "loss": 0.003, "reward": 1.8512710332870483, "reward_std": 0.09367866814136505, "rewards/pad": 0.375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4762710928916931, "step": 2935 }, { "completion_length": 256.5, "epoch": 0.935627788400255, "grad_norm": 19.88876724243164, "kl": 0.08837890625, "learning_rate": 6.437221159974505e-08, "loss": 0.0035, "reward": 1.566627860069275, "reward_std": 0.04243381693959236, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5666278600692749, "step": 2936 }, { "completion_length": 277.828125, "epoch": 0.9359464627151052, "grad_norm": 12.794341087341309, "kl": 0.076171875, "learning_rate": 6.405353728489483e-08, "loss": 0.003, "reward": 1.5573514699935913, "reward_std": 0.14613482356071472, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4479764699935913, "step": 2937 }, { "completion_length": 164.578125, "epoch": 0.9362651370299554, "grad_norm": 10.677695274353027, "kl": 0.1064453125, "learning_rate": 6.373486297004461e-08, "loss": 0.0043, "reward": 1.8390767574310303, "reward_std": 0.07184389978647232, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5890767574310303, "rewards/pad": 0.25, "step": 2938 }, { "completion_length": 268.046875, "epoch": 0.9365838113448056, "grad_norm": 9.086060523986816, "kl": 0.08837890625, "learning_rate": 6.34161886551944e-08, "loss": 0.0035, "reward": 1.5117340087890625, "reward_std": 0.04868802800774574, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5117339491844177, "rewards/pad": 0.0, "step": 2939 }, { "completion_length": 349.109375, "epoch": 0.9369024856596558, "grad_norm": 16.004676818847656, "kl": 0.054931640625, "learning_rate": 6.309751434034416e-08, "loss": 0.0022, "reward": 1.4877666234970093, "reward_std": 0.10770290344953537, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3783915936946869, "rewards/pad": 0.125, "step": 2940 }, { "completion_length": 371.609375, "epoch": 0.9372211599745061, "grad_norm": 29.801067352294922, "kl": 0.055908203125, "learning_rate": 6.277884002549394e-08, "loss": 0.0022, "reward": 1.3484127521514893, "reward_std": 0.16493690013885498, "rewards/answer_reward": 0.015625, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3484126329421997, "step": 2941 }, { "completion_length": 183.03125, "epoch": 0.9375398342893563, "grad_norm": 15.473187446594238, "kl": 0.09375, "learning_rate": 6.246016571064372e-08, "loss": 0.0038, "reward": 1.7514030933380127, "reward_std": 0.13864727318286896, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5326529741287231, "rewards/pad": 0.21875, "step": 2942 }, { "completion_length": 221.15625, "epoch": 0.9378585086042065, "grad_norm": 35.03718948364258, "kl": 0.0888671875, "learning_rate": 6.21414913957935e-08, "loss": 0.0036, "reward": 1.5468283891677856, "reward_std": 0.1377798318862915, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5624533891677856, "rewards/pad": 0.0, "step": 2943 }, { "completion_length": 279.015625, "epoch": 0.9381771829190567, "grad_norm": 10.888705253601074, "kl": 0.068359375, "learning_rate": 6.182281708094327e-08, "loss": 0.0027, "reward": 1.8214476108551025, "reward_std": 0.06730944663286209, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5714476108551025, "step": 2944 }, { "completion_length": 263.1875, "epoch": 0.9384958572339069, "grad_norm": 6.947081089019775, "kl": 0.080078125, "learning_rate": 6.150414276609305e-08, "loss": 0.0032, "reward": 1.4168049097061157, "reward_std": 0.15775421261787415, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4480549395084381, "step": 2945 }, { "completion_length": 254.953125, "epoch": 0.9388145315487572, "grad_norm": 16.894763946533203, "kl": 0.087890625, "learning_rate": 6.118546845124282e-08, "loss": 0.0035, "reward": 1.4989348649978638, "reward_std": 0.12767580151557922, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.514559805393219, "step": 2946 }, { "completion_length": 293.296875, "epoch": 0.9391332058636074, "grad_norm": 18.079181671142578, "kl": 0.0732421875, "learning_rate": 6.08667941363926e-08, "loss": 0.0029, "reward": 1.4271761178970337, "reward_std": 0.029198169708251953, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4271760880947113, "step": 2947 }, { "completion_length": 303.421875, "epoch": 0.9394518801784576, "grad_norm": 9.24788761138916, "kl": 0.0771484375, "learning_rate": 6.054811982154238e-08, "loss": 0.0031, "reward": 1.5654431581497192, "reward_std": 0.07180614024400711, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5654430985450745, "step": 2948 }, { "completion_length": 252.984375, "epoch": 0.9397705544933078, "grad_norm": 12.085131645202637, "kl": 0.09375, "learning_rate": 6.022944550669216e-08, "loss": 0.0038, "reward": 1.5537967681884766, "reward_std": 0.06366712599992752, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5537967681884766, "step": 2949 }, { "completion_length": 309.9375, "epoch": 0.940089228808158, "grad_norm": 5.42282772064209, "kl": 0.06640625, "learning_rate": 5.991077119184194e-08, "loss": 0.0026, "reward": 1.6361989974975586, "reward_std": 0.04752662777900696, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3861989378929138, "step": 2950 }, { "completion_length": 304.859375, "epoch": 0.9404079031230083, "grad_norm": 21.39364242553711, "kl": 0.0908203125, "learning_rate": 5.959209687699171e-08, "loss": 0.0036, "reward": 1.4632328748703003, "reward_std": 0.1383098065853119, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4788578748703003, "step": 2951 }, { "completion_length": 266.921875, "epoch": 0.9407265774378585, "grad_norm": 17.54840850830078, "kl": 0.08203125, "learning_rate": 5.927342256214149e-08, "loss": 0.0033, "reward": 1.537185788154602, "reward_std": 0.10543005913496017, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4121857285499573, "rewards/pad": 0.125, "step": 2952 }, { "completion_length": 164.421875, "epoch": 0.9410452517527087, "grad_norm": 7.701272010803223, "kl": 0.095703125, "learning_rate": 5.895474824729126e-08, "loss": 0.0038, "reward": 1.5821001529693604, "reward_std": 0.09362144768238068, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45710012316703796, "rewards/pad": 0.125, "step": 2953 }, { "completion_length": 335.703125, "epoch": 0.9413639260675589, "grad_norm": 8.84483528137207, "kl": 0.06787109375, "learning_rate": 5.863607393244104e-08, "loss": 0.0027, "reward": 1.5164875984191895, "reward_std": 0.03204556554555893, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5164875984191895, "step": 2954 }, { "completion_length": 288.28125, "epoch": 0.9416826003824091, "grad_norm": 5.423102378845215, "kl": 0.06591796875, "learning_rate": 5.831739961759082e-08, "loss": 0.0026, "reward": 1.6633018255233765, "reward_std": 0.11971494555473328, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4758017361164093, "rewards/pad": 0.1875, "step": 2955 }, { "completion_length": 228.59375, "epoch": 0.9420012746972594, "grad_norm": 9.51950454711914, "kl": 0.08056640625, "learning_rate": 5.79987253027406e-08, "loss": 0.0032, "reward": 1.7245519161224365, "reward_std": 0.07877679169178009, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4745519757270813, "step": 2956 }, { "completion_length": 242.96875, "epoch": 0.9423199490121096, "grad_norm": 16.62901496887207, "kl": 0.07080078125, "learning_rate": 5.768005098789038e-08, "loss": 0.0028, "reward": 1.5734515190124512, "reward_std": 0.0714656189084053, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5734515190124512, "step": 2957 }, { "completion_length": 167.28125, "epoch": 0.9426386233269598, "grad_norm": 27.43533706665039, "kl": 0.10888671875, "learning_rate": 5.7361376673040145e-08, "loss": 0.0044, "reward": 1.538001537322998, "reward_std": 0.12343016266822815, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.538001537322998, "rewards/pad": 0.0, "step": 2958 }, { "completion_length": 211.296875, "epoch": 0.94295729764181, "grad_norm": 16.95347785949707, "kl": 0.08544921875, "learning_rate": 5.7042702358189925e-08, "loss": 0.0034, "reward": 1.5702875852584839, "reward_std": 0.056008193641901016, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4452875554561615, "rewards/pad": 0.125, "step": 2959 }, { "completion_length": 198.75, "epoch": 0.9432759719566602, "grad_norm": 8.831547737121582, "kl": 0.1005859375, "learning_rate": 5.6724028043339706e-08, "loss": 0.004, "reward": 1.3954182863235474, "reward_std": 0.09478885680437088, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3954182267189026, "rewards/pad": 0.0, "step": 2960 }, { "completion_length": 243.953125, "epoch": 0.9435946462715105, "grad_norm": 10.098033905029297, "kl": 0.07568359375, "learning_rate": 5.640535372848948e-08, "loss": 0.003, "reward": 1.6640007495880127, "reward_std": 0.05323734134435654, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5390008687973022, "step": 2961 }, { "completion_length": 207.359375, "epoch": 0.9439133205863608, "grad_norm": 21.087514877319336, "kl": 0.08447265625, "learning_rate": 5.608667941363926e-08, "loss": 0.0034, "reward": 1.7884836196899414, "reward_std": 0.20367951691150665, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5541085600852966, "rewards/pad": 0.234375, "step": 2962 }, { "completion_length": 246.984375, "epoch": 0.944231994901211, "grad_norm": 11.247840881347656, "kl": 0.08984375, "learning_rate": 5.5768005098789034e-08, "loss": 0.0036, "reward": 1.6184732913970947, "reward_std": 0.1049545407295227, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49347323179244995, "rewards/pad": 0.125, "step": 2963 }, { "completion_length": 370.34375, "epoch": 0.9445506692160612, "grad_norm": 8.138402938842773, "kl": 0.056884765625, "learning_rate": 5.5449330783938815e-08, "loss": 0.0023, "reward": 1.4136632680892944, "reward_std": 0.10494749248027802, "rewards/pad": 0.078125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.33553823828697205, "step": 2964 }, { "completion_length": 167.796875, "epoch": 0.9448693435309115, "grad_norm": 67.82130432128906, "kl": 0.11962890625, "learning_rate": 5.5130656469088595e-08, "loss": 0.0048, "reward": 1.7002036571502686, "reward_std": 0.17276358604431152, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.46582862734794617, "step": 2965 }, { "completion_length": 315.9375, "epoch": 0.9451880178457617, "grad_norm": 8.57858943939209, "kl": 0.087890625, "learning_rate": 5.481198215423836e-08, "loss": 0.0035, "reward": 1.7237110137939453, "reward_std": 0.06048474460840225, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4737110137939453, "rewards/pad": 0.25, "step": 2966 }, { "completion_length": 243.09375, "epoch": 0.9455066921606119, "grad_norm": 9.491878509521484, "kl": 0.08203125, "learning_rate": 5.449330783938814e-08, "loss": 0.0033, "reward": 1.6464054584503174, "reward_std": 0.12060653418302536, "rewards/pad": 0.109375, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5526554584503174, "step": 2967 }, { "completion_length": 252.5, "epoch": 0.9458253664754621, "grad_norm": 15.32401180267334, "kl": 0.072265625, "learning_rate": 5.417463352453792e-08, "loss": 0.0029, "reward": 1.6297492980957031, "reward_std": 0.11908349394798279, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.41099923849105835, "rewards/pad": 0.234375, "step": 2968 }, { "completion_length": 274.90625, "epoch": 0.9461440407903123, "grad_norm": 6.705809593200684, "kl": 0.078125, "learning_rate": 5.38559592096877e-08, "loss": 0.0031, "reward": 1.565004587173462, "reward_std": 0.0942590981721878, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.45562952756881714, "rewards/pad": 0.125, "step": 2969 }, { "completion_length": 210.984375, "epoch": 0.9464627151051626, "grad_norm": 19.868122100830078, "kl": 0.08984375, "learning_rate": 5.353728489483748e-08, "loss": 0.0036, "reward": 1.6228530406951904, "reward_std": 0.06939147412776947, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49785304069519043, "rewards/pad": 0.125, "step": 2970 }, { "completion_length": 212.421875, "epoch": 0.9467813894200128, "grad_norm": 11.730999946594238, "kl": 0.09228515625, "learning_rate": 5.321861057998725e-08, "loss": 0.0037, "reward": 1.6018955707550049, "reward_std": 0.05306608974933624, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.47689566016197205, "rewards/pad": 0.125, "step": 2971 }, { "completion_length": 216.578125, "epoch": 0.947100063734863, "grad_norm": 10.840819358825684, "kl": 0.10400390625, "learning_rate": 5.289993626513703e-08, "loss": 0.0042, "reward": 1.5596673488616943, "reward_std": 0.1178320050239563, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4659172594547272, "rewards/pad": 0.09375, "step": 2972 }, { "completion_length": 258.984375, "epoch": 0.9474187380497132, "grad_norm": 21.22210693359375, "kl": 0.07177734375, "learning_rate": 5.25812619502868e-08, "loss": 0.0029, "reward": 1.6138757467269897, "reward_std": 0.062143437564373016, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.48887574672698975, "step": 2973 }, { "completion_length": 399.984375, "epoch": 0.9477374123645634, "grad_norm": 11.115797996520996, "kl": 0.05224609375, "learning_rate": 5.226258763543658e-08, "loss": 0.0021, "reward": 1.3853999376296997, "reward_std": 0.14694945514202118, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.4166499674320221, "step": 2974 }, { "completion_length": 325.015625, "epoch": 0.9480560866794137, "grad_norm": 140.2318115234375, "kl": 0.061279296875, "learning_rate": 5.1943913320586354e-08, "loss": 0.0024, "reward": 1.497594952583313, "reward_std": 0.09182921797037125, "rewards/pad": 0.03125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.466344952583313, "step": 2975 }, { "completion_length": 112.921875, "epoch": 0.9483747609942639, "grad_norm": 14.602428436279297, "kl": 0.1201171875, "learning_rate": 5.1625239005736134e-08, "loss": 0.0048, "reward": 1.7921075820922852, "reward_std": 0.12751063704490662, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5577325820922852, "rewards/pad": 0.234375, "step": 2976 }, { "completion_length": 180.671875, "epoch": 0.9486934353091141, "grad_norm": 28.626598358154297, "kl": 0.0888671875, "learning_rate": 5.1306564690885915e-08, "loss": 0.0036, "reward": 1.3622570037841797, "reward_std": 0.0660131573677063, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.23725706338882446, "step": 2977 }, { "completion_length": 260.421875, "epoch": 0.9490121096239643, "grad_norm": 8.978132247924805, "kl": 0.125, "learning_rate": 5.098789037603569e-08, "loss": 0.005, "reward": 1.5364545583724976, "reward_std": 0.12177839875221252, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.552079439163208, "rewards/pad": 0.0, "step": 2978 }, { "completion_length": 203.125, "epoch": 0.9493307839388145, "grad_norm": 12.183414459228516, "kl": 0.10107421875, "learning_rate": 5.066921606118547e-08, "loss": 0.004, "reward": 1.5364055633544922, "reward_std": 0.11670367419719696, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5520305037498474, "rewards/pad": 0.0, "step": 2979 }, { "completion_length": 331.40625, "epoch": 0.9496494582536648, "grad_norm": 21.630046844482422, "kl": 0.056640625, "learning_rate": 5.035054174633524e-08, "loss": 0.0023, "reward": 1.5015981197357178, "reward_std": 0.14297401905059814, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4547232687473297, "rewards/pad": 0.0625, "step": 2980 }, { "completion_length": 311.828125, "epoch": 0.949968132568515, "grad_norm": 10.797045707702637, "kl": 0.09423828125, "learning_rate": 5.003186743148502e-08, "loss": 0.0038, "reward": 1.3974107503890991, "reward_std": 0.04455138370394707, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.27241069078445435, "step": 2981 }, { "completion_length": 265.359375, "epoch": 0.9502868068833652, "grad_norm": 12.558472633361816, "kl": 0.07763671875, "learning_rate": 4.97131931166348e-08, "loss": 0.0031, "reward": 1.7381936311721802, "reward_std": 0.13941644132137299, "rewards/pad": 0.25, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.5194436311721802, "step": 2982 }, { "completion_length": 150.96875, "epoch": 0.9506054811982154, "grad_norm": 36.841224670410156, "kl": 0.1220703125, "learning_rate": 4.939451880178457e-08, "loss": 0.0049, "reward": 1.7243318557739258, "reward_std": 0.10196399688720703, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.599331796169281, "rewards/pad": 0.125, "step": 2983 }, { "completion_length": 398.90625, "epoch": 0.9509241555130656, "grad_norm": 5.658376216888428, "kl": 0.047119140625, "learning_rate": 4.907584448693435e-08, "loss": 0.0019, "reward": 1.5583215951919556, "reward_std": 0.07376955449581146, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.44894659519195557, "step": 2984 }, { "completion_length": 262.96875, "epoch": 0.9512428298279159, "grad_norm": 14.046929359436035, "kl": 0.0712890625, "learning_rate": 4.8757170172084126e-08, "loss": 0.0029, "reward": 1.5000898838043213, "reward_std": 0.12163540720939636, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.3907149136066437, "rewards/pad": 0.125, "step": 2985 }, { "completion_length": 149.234375, "epoch": 0.9515615041427661, "grad_norm": 11.515385627746582, "kl": 0.1005859375, "learning_rate": 4.8438495857233906e-08, "loss": 0.004, "reward": 1.8945035934448242, "reward_std": 0.11986218392848969, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6445035934448242, "rewards/pad": 0.25, "step": 2986 }, { "completion_length": 211.65625, "epoch": 0.9518801784576163, "grad_norm": 63.49930953979492, "kl": 0.08837890625, "learning_rate": 4.811982154238369e-08, "loss": 0.0035, "reward": 1.7222895622253418, "reward_std": 0.07968949526548386, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5972896814346313, "rewards/pad": 0.125, "step": 2987 }, { "completion_length": 252.140625, "epoch": 0.9521988527724665, "grad_norm": 9.624911308288574, "kl": 0.0966796875, "learning_rate": 4.7801147227533454e-08, "loss": 0.0039, "reward": 1.764341950416565, "reward_std": 0.15120723843574524, "rewards/pad": 0.171875, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5924668908119202, "step": 2988 }, { "completion_length": 277.484375, "epoch": 0.9525175270873167, "grad_norm": 7.495218276977539, "kl": 0.07861328125, "learning_rate": 4.7482472912683235e-08, "loss": 0.0032, "reward": 1.6148617267608643, "reward_std": 0.08406472206115723, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.48986178636550903, "rewards/pad": 0.125, "step": 2989 }, { "completion_length": 398.421875, "epoch": 0.952836201402167, "grad_norm": 10.273070335388184, "kl": 0.060546875, "learning_rate": 4.716379859783301e-08, "loss": 0.0024, "reward": 1.4720619916915894, "reward_std": 0.03158481419086456, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.47206199169158936, "step": 2990 }, { "completion_length": 337.234375, "epoch": 0.9531548757170172, "grad_norm": 8.842098236083984, "kl": 0.0517578125, "learning_rate": 4.684512428298279e-08, "loss": 0.0021, "reward": 1.4910411834716797, "reward_std": 0.05693596601486206, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49104124307632446, "step": 2991 }, { "completion_length": 268.4375, "epoch": 0.9534735500318674, "grad_norm": 7.0008745193481445, "kl": 0.08837890625, "learning_rate": 4.652644996813257e-08, "loss": 0.0035, "reward": 1.5434587001800537, "reward_std": 0.05424448847770691, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5434587001800537, "step": 2992 }, { "completion_length": 112.6875, "epoch": 0.9537922243467176, "grad_norm": 19.789148330688477, "kl": 0.09619140625, "learning_rate": 4.6207775653282343e-08, "loss": 0.0038, "reward": 2.011953830718994, "reward_std": 0.14079678058624268, "rewards/answer_reward": 0.359375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.6525790095329285, "step": 2993 }, { "completion_length": 262.640625, "epoch": 0.9541108986615678, "grad_norm": 8.151646614074707, "kl": 0.08154296875, "learning_rate": 4.5889101338432124e-08, "loss": 0.0033, "reward": 1.683987021446228, "reward_std": 0.15515395998954773, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.574612021446228, "rewards/pad": 0.109375, "step": 2994 }, { "completion_length": 295.46875, "epoch": 0.9544295729764181, "grad_norm": 12.131200790405273, "kl": 0.083984375, "learning_rate": 4.55704270235819e-08, "loss": 0.0034, "reward": 1.648716926574707, "reward_std": 0.14547915756702423, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.539341926574707, "step": 2995 }, { "completion_length": 228.140625, "epoch": 0.9547482472912683, "grad_norm": 14.250265121459961, "kl": 0.07958984375, "learning_rate": 4.525175270873167e-08, "loss": 0.0032, "reward": 1.8075170516967773, "reward_std": 0.18245282769203186, "rewards/answer_reward": 0.4375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.37001708149909973, "step": 2996 }, { "completion_length": 321.1875, "epoch": 0.9550669216061185, "grad_norm": 5.624185085296631, "kl": 0.0712890625, "learning_rate": 4.493307839388145e-08, "loss": 0.0029, "reward": 1.5331188440322876, "reward_std": 0.06479167938232422, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.40811875462532043, "rewards/pad": 0.125, "step": 2997 }, { "completion_length": 195.109375, "epoch": 0.9553855959209687, "grad_norm": 14.505610466003418, "kl": 0.107421875, "learning_rate": 4.4614404079031226e-08, "loss": 0.0043, "reward": 1.5591657161712646, "reward_std": 0.06373967230319977, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5591658353805542, "step": 2998 }, { "completion_length": 223.515625, "epoch": 0.9557042702358189, "grad_norm": 13.931330680847168, "kl": 0.091796875, "learning_rate": 4.4295729764181007e-08, "loss": 0.0037, "reward": 1.6169328689575195, "reward_std": 0.04845960810780525, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6169329285621643, "rewards/pad": 0.0, "step": 2999 }, { "completion_length": 316.390625, "epoch": 0.9560229445506692, "grad_norm": 16.772754669189453, "kl": 0.0625, "learning_rate": 4.397705544933078e-08, "loss": 0.0025, "reward": 1.7339085340499878, "reward_std": 0.09912504255771637, "rewards/pad": 0.234375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4995335340499878, "step": 3000 }, { "completion_length": 300.765625, "epoch": 0.9563416188655195, "grad_norm": 7.473287582397461, "kl": 0.07958984375, "learning_rate": 4.365838113448056e-08, "loss": 0.0032, "reward": 1.4801604747772217, "reward_std": 0.04294013977050781, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4801604151725769, "rewards/pad": 0.0, "step": 3001 }, { "completion_length": 289.828125, "epoch": 0.9566602931803697, "grad_norm": 11.7194242477417, "kl": 0.06640625, "learning_rate": 4.3339706819630335e-08, "loss": 0.0027, "reward": 1.5888422727584839, "reward_std": 0.04296314716339111, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4638422429561615, "rewards/pad": 0.125, "step": 3002 }, { "completion_length": 325.671875, "epoch": 0.9569789674952199, "grad_norm": 28.08829116821289, "kl": 0.0673828125, "learning_rate": 4.3021032504780115e-08, "loss": 0.0027, "reward": 1.465879201889038, "reward_std": 0.059772029519081116, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.34087926149368286, "step": 3003 }, { "completion_length": 211.9375, "epoch": 0.9572976418100702, "grad_norm": 12.590349197387695, "kl": 0.1513671875, "learning_rate": 4.270235818992989e-08, "loss": 0.0061, "reward": 1.5774794816970825, "reward_std": 0.10665491223335266, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5774794816970825, "rewards/pad": 0.0, "step": 3004 }, { "completion_length": 187.5, "epoch": 0.9576163161249204, "grad_norm": 16.79472541809082, "kl": 0.12255859375, "learning_rate": 4.238368387507966e-08, "loss": 0.0049, "reward": 1.6055787801742554, "reward_std": 0.08426319062709808, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6055787801742554, "step": 3005 }, { "completion_length": 263.3125, "epoch": 0.9579349904397706, "grad_norm": 11.45641803741455, "kl": 0.0849609375, "learning_rate": 4.2065009560229444e-08, "loss": 0.0034, "reward": 1.622631311416626, "reward_std": 0.059184275567531586, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.497631311416626, "rewards/pad": 0.125, "step": 3006 }, { "completion_length": 196.921875, "epoch": 0.9582536647546208, "grad_norm": 5.744304180145264, "kl": 0.076171875, "learning_rate": 4.174633524537922e-08, "loss": 0.003, "reward": 1.3995598554611206, "reward_std": 0.15788444876670837, "rewards/format_reward_tg": 0.96875, "rewards/iou_timestamp_reward": 0.3058098256587982, "rewards/pad": 0.125, "step": 3007 }, { "completion_length": 203.890625, "epoch": 0.958572339069471, "grad_norm": 11.544098854064941, "kl": 0.08154296875, "learning_rate": 4.1427660930529e-08, "loss": 0.0033, "reward": 1.6876447200775146, "reward_std": 0.07707478106021881, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4376447796821594, "step": 3008 }, { "completion_length": 241.15625, "epoch": 0.9588910133843213, "grad_norm": 11.547026634216309, "kl": 0.08056640625, "learning_rate": 4.110898661567878e-08, "loss": 0.0032, "reward": 1.5496833324432373, "reward_std": 0.06832106411457062, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5496833920478821, "rewards/pad": 0.0, "step": 3009 }, { "completion_length": 247.8125, "epoch": 0.9592096876991715, "grad_norm": 10.342385292053223, "kl": 0.0751953125, "learning_rate": 4.079031230082855e-08, "loss": 0.003, "reward": 1.7064106464385986, "reward_std": 0.08886480331420898, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5814106464385986, "step": 3010 }, { "completion_length": 301.1875, "epoch": 0.9595283620140217, "grad_norm": 9.354042053222656, "kl": 0.09716796875, "learning_rate": 4.047163798597833e-08, "loss": 0.0039, "reward": 1.569082498550415, "reward_std": 0.09403829276561737, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.584707498550415, "rewards/pad": 0.0, "step": 3011 }, { "completion_length": 207.21875, "epoch": 0.9598470363288719, "grad_norm": 27.846111297607422, "kl": 0.07421875, "learning_rate": 4.01529636711281e-08, "loss": 0.003, "reward": 1.7822484970092773, "reward_std": 0.10999485850334167, "rewards/pad": 0.375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.40724843740463257, "step": 3012 }, { "completion_length": 151.546875, "epoch": 0.9601657106437221, "grad_norm": 13.443730354309082, "kl": 0.12060546875, "learning_rate": 3.983428935627788e-08, "loss": 0.0048, "reward": 1.720609188079834, "reward_std": 0.05341852456331253, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.7206092476844788, "step": 3013 }, { "completion_length": 299.65625, "epoch": 0.9604843849585724, "grad_norm": 14.898906707763672, "kl": 0.0625, "learning_rate": 3.951561504142766e-08, "loss": 0.0025, "reward": 1.4509539604187012, "reward_std": 0.11394208669662476, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.46657896041870117, "step": 3014 }, { "completion_length": 360.953125, "epoch": 0.9608030592734226, "grad_norm": 7.477005481719971, "kl": 0.07666015625, "learning_rate": 3.9196940726577435e-08, "loss": 0.0031, "reward": 1.4555785655975342, "reward_std": 0.11029116809368134, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4712035357952118, "rewards/pad": 0.0, "step": 3015 }, { "completion_length": 217.671875, "epoch": 0.9611217335882728, "grad_norm": 11.383493423461914, "kl": 0.0869140625, "learning_rate": 3.8878266411727215e-08, "loss": 0.0035, "reward": 1.6601845026016235, "reward_std": 0.1018948182463646, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6601845622062683, "rewards/pad": 0.0, "step": 3016 }, { "completion_length": 242.90625, "epoch": 0.961440407903123, "grad_norm": 10.623103141784668, "kl": 0.0810546875, "learning_rate": 3.855959209687699e-08, "loss": 0.0032, "reward": 1.6002237796783447, "reward_std": 0.07772623002529144, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6002237796783447, "step": 3017 }, { "completion_length": 327.765625, "epoch": 0.9617590822179732, "grad_norm": 14.112882614135742, "kl": 0.064453125, "learning_rate": 3.824091778202677e-08, "loss": 0.0026, "reward": 1.7028043270111084, "reward_std": 0.19873130321502686, "rewards/pad": 0.21875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4996793270111084, "step": 3018 }, { "completion_length": 407.71875, "epoch": 0.9620777565328235, "grad_norm": 21.201560974121094, "kl": 0.05224609375, "learning_rate": 3.7922243467176544e-08, "loss": 0.0021, "reward": 1.4655060768127441, "reward_std": 0.08513548970222473, "rewards/answer_reward": 0.109375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.35613104701042175, "step": 3019 }, { "completion_length": 226.421875, "epoch": 0.9623964308476737, "grad_norm": 5.293650150299072, "kl": 0.07861328125, "learning_rate": 3.760356915232632e-08, "loss": 0.0031, "reward": 1.4824628829956055, "reward_std": 0.056464605033397675, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.48246294260025024, "step": 3020 }, { "completion_length": 311.5, "epoch": 0.9627151051625239, "grad_norm": 13.04458999633789, "kl": 0.08251953125, "learning_rate": 3.72848948374761e-08, "loss": 0.0033, "reward": 1.4492220878601074, "reward_std": 0.04893079772591591, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4492220878601074, "rewards/pad": 0.0, "step": 3021 }, { "completion_length": 225.640625, "epoch": 0.9630337794773741, "grad_norm": 16.370359420776367, "kl": 0.1796875, "learning_rate": 3.696622052262587e-08, "loss": 0.0072, "reward": 1.5129005908966064, "reward_std": 0.10321009904146194, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.38790059089660645, "step": 3022 }, { "completion_length": 162.375, "epoch": 0.9633524537922243, "grad_norm": 17.733623504638672, "kl": 0.095703125, "learning_rate": 3.664754620777565e-08, "loss": 0.0038, "reward": 1.785008430480957, "reward_std": 0.07291992753744125, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5350083708763123, "rewards/pad": 0.25, "step": 3023 }, { "completion_length": 312.4375, "epoch": 0.9636711281070746, "grad_norm": 10.358829498291016, "kl": 0.06640625, "learning_rate": 3.632887189292543e-08, "loss": 0.0027, "reward": 1.5295699834823608, "reward_std": 0.1087612509727478, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.29519492387771606, "rewards/pad": 0.25, "step": 3024 }, { "completion_length": 313.203125, "epoch": 0.9639898024219248, "grad_norm": 10.354582786560059, "kl": 0.0712890625, "learning_rate": 3.601019757807521e-08, "loss": 0.0028, "reward": 1.6526461839675903, "reward_std": 0.04885178059339523, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5276461243629456, "step": 3025 }, { "completion_length": 216.9375, "epoch": 0.964308476736775, "grad_norm": 6.7993011474609375, "kl": 0.0947265625, "learning_rate": 3.569152326322499e-08, "loss": 0.0038, "reward": 1.5121433734893799, "reward_std": 0.15271823108196259, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4027683734893799, "rewards/pad": 0.125, "step": 3026 }, { "completion_length": 323.609375, "epoch": 0.9646271510516252, "grad_norm": 6.0091166496276855, "kl": 0.08935546875, "learning_rate": 3.5372848948374755e-08, "loss": 0.0036, "reward": 1.5815134048461914, "reward_std": 0.04523976147174835, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4565134644508362, "step": 3027 }, { "completion_length": 259.109375, "epoch": 0.9649458253664754, "grad_norm": 123.76586151123047, "kl": 0.0859375, "learning_rate": 3.5054174633524535e-08, "loss": 0.0034, "reward": 1.640549898147583, "reward_std": 0.06378833204507828, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.515549898147583, "step": 3028 }, { "completion_length": 305.84375, "epoch": 0.9652644996813257, "grad_norm": 7.624619483947754, "kl": 0.08984375, "learning_rate": 3.473550031867431e-08, "loss": 0.0036, "reward": 1.532264232635498, "reward_std": 0.15833765268325806, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4228892922401428, "step": 3029 }, { "completion_length": 350.484375, "epoch": 0.9655831739961759, "grad_norm": 10.998796463012695, "kl": 0.07666015625, "learning_rate": 3.441682600382409e-08, "loss": 0.0031, "reward": 1.5304185152053833, "reward_std": 0.02771463245153427, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4054185152053833, "rewards/pad": 0.125, "step": 3030 }, { "completion_length": 186.796875, "epoch": 0.9659018483110261, "grad_norm": 18.013072967529297, "kl": 0.095703125, "learning_rate": 3.409815168897387e-08, "loss": 0.0038, "reward": 1.7294105291366577, "reward_std": 0.1627049595117569, "rewards/answer_reward": 0.34375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3856606185436249, "step": 3031 }, { "completion_length": 245.1875, "epoch": 0.9662205226258763, "grad_norm": 21.26409912109375, "kl": 0.0986328125, "learning_rate": 3.3779477374123644e-08, "loss": 0.004, "reward": 1.552625298500061, "reward_std": 0.1314411163330078, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.45887523889541626, "rewards/pad": 0.109375, "step": 3032 }, { "completion_length": 314.453125, "epoch": 0.9665391969407265, "grad_norm": 10.736640930175781, "kl": 0.09033203125, "learning_rate": 3.3460803059273424e-08, "loss": 0.0036, "reward": 1.3965399265289307, "reward_std": 0.08710728585720062, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.41216492652893066, "rewards/pad": 0.0, "step": 3033 }, { "completion_length": 421.6875, "epoch": 0.9668578712555768, "grad_norm": 5.672484397888184, "kl": 0.06689453125, "learning_rate": 3.31421287444232e-08, "loss": 0.0027, "reward": 1.3580100536346436, "reward_std": 0.15528865158557892, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.3892601430416107, "step": 3034 }, { "completion_length": 214.609375, "epoch": 0.967176545570427, "grad_norm": 14.969710350036621, "kl": 0.1025390625, "learning_rate": 3.282345442957297e-08, "loss": 0.0041, "reward": 1.651845932006836, "reward_std": 0.11346770823001862, "rewards/answer_reward": 0.203125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.4643459916114807, "step": 3035 }, { "completion_length": 242.296875, "epoch": 0.9674952198852772, "grad_norm": 41.26520919799805, "kl": 0.08837890625, "learning_rate": 3.250478011472275e-08, "loss": 0.0035, "reward": 1.4151108264923096, "reward_std": 0.059655025601387024, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.41511082649230957, "step": 3036 }, { "completion_length": 254.078125, "epoch": 0.9678138942001274, "grad_norm": 10.369619369506836, "kl": 0.08544921875, "learning_rate": 3.2186105799872527e-08, "loss": 0.0034, "reward": 1.386507272720337, "reward_std": 0.05853161960840225, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.3865073323249817, "rewards/pad": 0.0, "step": 3037 }, { "completion_length": 318.75, "epoch": 0.9681325685149776, "grad_norm": 11.73049259185791, "kl": 0.08056640625, "learning_rate": 3.186743148502231e-08, "loss": 0.0032, "reward": 1.4719185829162598, "reward_std": 0.11228282004594803, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.48754358291625977, "step": 3038 }, { "completion_length": 196.453125, "epoch": 0.9684512428298279, "grad_norm": 31.66900634765625, "kl": 0.0966796875, "learning_rate": 3.154875717017208e-08, "loss": 0.0039, "reward": 1.5930705070495605, "reward_std": 0.08366744220256805, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5930704474449158, "step": 3039 }, { "completion_length": 270.4375, "epoch": 0.9687699171446782, "grad_norm": 6.654904365539551, "kl": 0.09375, "learning_rate": 3.123008285532186e-08, "loss": 0.0038, "reward": 1.749192237854004, "reward_std": 0.052496008574962616, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6241921782493591, "step": 3040 }, { "completion_length": 288.640625, "epoch": 0.9690885914595284, "grad_norm": 11.12962818145752, "kl": 0.0869140625, "learning_rate": 3.0911408540471635e-08, "loss": 0.0035, "reward": 1.713568925857544, "reward_std": 0.13127964735031128, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.604193925857544, "step": 3041 }, { "completion_length": 217.484375, "epoch": 0.9694072657743786, "grad_norm": 13.664684295654297, "kl": 0.103515625, "learning_rate": 3.059273422562141e-08, "loss": 0.0041, "reward": 1.5261034965515137, "reward_std": 0.07350903749465942, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5261036157608032, "step": 3042 }, { "completion_length": 209.25, "epoch": 0.9697259400892289, "grad_norm": 11.630436897277832, "kl": 0.08837890625, "learning_rate": 3.027405991077119e-08, "loss": 0.0035, "reward": 1.5359015464782715, "reward_std": 0.11306829750537872, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5202765464782715, "rewards/pad": 0.015625, "step": 3043 }, { "completion_length": 262.640625, "epoch": 0.9700446144040791, "grad_norm": 8.941901206970215, "kl": 0.087890625, "learning_rate": 2.995538559592097e-08, "loss": 0.0035, "reward": 1.4941205978393555, "reward_std": 0.07758435606956482, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.49412065744400024, "step": 3044 }, { "completion_length": 284.078125, "epoch": 0.9703632887189293, "grad_norm": 5.911899566650391, "kl": 0.06591796875, "learning_rate": 2.9636711281070744e-08, "loss": 0.0026, "reward": 1.6351535320281982, "reward_std": 0.05527539551258087, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5101535320281982, "step": 3045 }, { "completion_length": 166.40625, "epoch": 0.9706819630337795, "grad_norm": 15.447898864746094, "kl": 0.09765625, "learning_rate": 2.931803696622052e-08, "loss": 0.0039, "reward": 1.826016902923584, "reward_std": 0.10933683067560196, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5760170221328735, "rewards/pad": 0.25, "step": 3046 }, { "completion_length": 258.515625, "epoch": 0.9710006373486297, "grad_norm": 10.936978340148926, "kl": 0.06884765625, "learning_rate": 2.89993626513703e-08, "loss": 0.0028, "reward": 1.5791189670562744, "reward_std": 0.16111625730991364, "rewards/pad": 0.046875, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.5478689670562744, "step": 3047 }, { "completion_length": 157.703125, "epoch": 0.97131931166348, "grad_norm": 10.121866226196289, "kl": 0.12451171875, "learning_rate": 2.8680688336520072e-08, "loss": 0.005, "reward": 1.6469779014587402, "reward_std": 0.06688619405031204, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5219780206680298, "rewards/pad": 0.125, "step": 3048 }, { "completion_length": 265.21875, "epoch": 0.9716379859783302, "grad_norm": 12.310712814331055, "kl": 0.07666015625, "learning_rate": 2.8362014021669853e-08, "loss": 0.0031, "reward": 1.7152507305145264, "reward_std": 0.13572877645492554, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6058759093284607, "rewards/pad": 0.109375, "step": 3049 }, { "completion_length": 254.15625, "epoch": 0.9719566602931804, "grad_norm": 12.733508110046387, "kl": 0.087890625, "learning_rate": 2.804333970681963e-08, "loss": 0.0035, "reward": 1.4946439266204834, "reward_std": 0.07657530903816223, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4946439862251282, "rewards/pad": 0.0, "step": 3050 }, { "completion_length": 233.859375, "epoch": 0.9722753346080306, "grad_norm": 27.716503143310547, "kl": 0.09228515625, "learning_rate": 2.7724665391969407e-08, "loss": 0.0037, "reward": 1.657578945159912, "reward_std": 0.07279810309410095, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5325790643692017, "rewards/pad": 0.125, "step": 3051 }, { "completion_length": 167.03125, "epoch": 0.9725940089228808, "grad_norm": 10.394854545593262, "kl": 0.09716796875, "learning_rate": 2.740599107711918e-08, "loss": 0.0039, "reward": 1.4805113077163696, "reward_std": 0.06541478633880615, "rewards/answer_reward": 0.0, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.4805113673210144, "step": 3052 }, { "completion_length": 294.46875, "epoch": 0.9729126832377311, "grad_norm": 10.962387084960938, "kl": 0.07958984375, "learning_rate": 2.708731676226896e-08, "loss": 0.0032, "reward": 1.5387133359909058, "reward_std": 0.0850515067577362, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5387133359909058, "step": 3053 }, { "completion_length": 350.5, "epoch": 0.9732313575525813, "grad_norm": 21.669363021850586, "kl": 0.06591796875, "learning_rate": 2.676864244741874e-08, "loss": 0.0026, "reward": 1.494605302810669, "reward_std": 0.03655810281634331, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4946053624153137, "rewards/pad": 0.0, "step": 3054 }, { "completion_length": 250.296875, "epoch": 0.9735500318674315, "grad_norm": 26.71428871154785, "kl": 0.087890625, "learning_rate": 2.6449968132568516e-08, "loss": 0.0035, "reward": 1.5646378993988037, "reward_std": 0.06353458762168884, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5646378397941589, "step": 3055 }, { "completion_length": 207.828125, "epoch": 0.9738687061822817, "grad_norm": 7.8366780281066895, "kl": 0.07373046875, "learning_rate": 2.613129381771829e-08, "loss": 0.003, "reward": 1.8365015983581543, "reward_std": 0.10568691045045853, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5865016579627991, "step": 3056 }, { "completion_length": 367.109375, "epoch": 0.9741873804971319, "grad_norm": 13.387369155883789, "kl": 0.0654296875, "learning_rate": 2.5812619502868067e-08, "loss": 0.0026, "reward": 1.498044729232788, "reward_std": 0.09658186882734299, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.49804478883743286, "rewards/pad": 0.0, "step": 3057 }, { "completion_length": 297.328125, "epoch": 0.9745060548119822, "grad_norm": 8.86507511138916, "kl": 0.0859375, "learning_rate": 2.5493945188017844e-08, "loss": 0.0034, "reward": 1.4849259853363037, "reward_std": 0.0937286987900734, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3599260747432709, "step": 3058 }, { "completion_length": 233.90625, "epoch": 0.9748247291268324, "grad_norm": 14.907334327697754, "kl": 0.06787109375, "learning_rate": 2.517527087316762e-08, "loss": 0.0027, "reward": 1.9055280685424805, "reward_std": 0.12614959478378296, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5617780685424805, "rewards/pad": 0.34375, "step": 3059 }, { "completion_length": 259.359375, "epoch": 0.9751434034416826, "grad_norm": 7.794898509979248, "kl": 0.08203125, "learning_rate": 2.48565965583174e-08, "loss": 0.0033, "reward": 1.4615026712417603, "reward_std": 0.10705969482660294, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.47712767124176025, "step": 3060 }, { "completion_length": 283.703125, "epoch": 0.9754620777565328, "grad_norm": 5.05447244644165, "kl": 0.08740234375, "learning_rate": 2.4537922243467176e-08, "loss": 0.0035, "reward": 1.5107604265213013, "reward_std": 0.07834285497665405, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5107604265213013, "rewards/pad": 0.0, "step": 3061 }, { "completion_length": 251.140625, "epoch": 0.975780752071383, "grad_norm": 30.65591049194336, "kl": 0.08984375, "learning_rate": 2.4219247928616953e-08, "loss": 0.0036, "reward": 1.40012526512146, "reward_std": 0.03734934702515602, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4001252055168152, "step": 3062 }, { "completion_length": 236.359375, "epoch": 0.9760994263862333, "grad_norm": 5.879389762878418, "kl": 0.091796875, "learning_rate": 2.3900573613766727e-08, "loss": 0.0037, "reward": 1.4694182872772217, "reward_std": 0.136412113904953, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.36004331707954407, "step": 3063 }, { "completion_length": 393.828125, "epoch": 0.9764181007010835, "grad_norm": 5.189061164855957, "kl": 0.052734375, "learning_rate": 2.3581899298916504e-08, "loss": 0.0021, "reward": 1.5653045177459717, "reward_std": 0.03525523096323013, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5653046369552612, "step": 3064 }, { "completion_length": 206.09375, "epoch": 0.9767367750159337, "grad_norm": 17.348432540893555, "kl": 0.0869140625, "learning_rate": 2.3263224984066285e-08, "loss": 0.0035, "reward": 1.4528409242630005, "reward_std": 0.08727478981018066, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.46846598386764526, "rewards/pad": 0.0, "step": 3065 }, { "completion_length": 207.40625, "epoch": 0.9770554493307839, "grad_norm": 23.430965423583984, "kl": 0.09716796875, "learning_rate": 2.2944550669216062e-08, "loss": 0.0039, "reward": 1.7889633178710938, "reward_std": 0.06907358765602112, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.663963258266449, "rewards/pad": 0.125, "step": 3066 }, { "completion_length": 257.234375, "epoch": 0.9773741236456341, "grad_norm": 34.405094146728516, "kl": 0.0693359375, "learning_rate": 2.2625876354365836e-08, "loss": 0.0028, "reward": 1.8198498487472534, "reward_std": 0.0726766362786293, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5698498487472534, "step": 3067 }, { "completion_length": 270.6875, "epoch": 0.9776927979604844, "grad_norm": 14.171384811401367, "kl": 0.08740234375, "learning_rate": 2.2307202039515613e-08, "loss": 0.0035, "reward": 1.4805707931518555, "reward_std": 0.15138691663742065, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.41807082295417786, "rewards/pad": 0.078125, "step": 3068 }, { "completion_length": 296.125, "epoch": 0.9780114722753346, "grad_norm": 7.556200981140137, "kl": 0.07080078125, "learning_rate": 2.198852772466539e-08, "loss": 0.0028, "reward": 1.5037003755569458, "reward_std": 0.07935502380132675, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.503700315952301, "step": 3069 }, { "completion_length": 268.703125, "epoch": 0.9783301465901848, "grad_norm": 9.175628662109375, "kl": 0.08447265625, "learning_rate": 2.1669853409815167e-08, "loss": 0.0034, "reward": 1.480644941329956, "reward_std": 0.10172773152589798, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4962700605392456, "step": 3070 }, { "completion_length": 221.421875, "epoch": 0.978648820905035, "grad_norm": 12.232224464416504, "kl": 0.09228515625, "learning_rate": 2.1351179094964945e-08, "loss": 0.0037, "reward": 1.790949821472168, "reward_std": 0.10301777720451355, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5409497618675232, "rewards/pad": 0.25, "step": 3071 }, { "completion_length": 209.25, "epoch": 0.9789674952198852, "grad_norm": 19.196441650390625, "kl": 0.08984375, "learning_rate": 2.1032504780114722e-08, "loss": 0.0036, "reward": 1.5424752235412598, "reward_std": 0.11825200170278549, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5581002831459045, "rewards/pad": 0.0, "step": 3072 }, { "completion_length": 257.734375, "epoch": 0.9792861695347355, "grad_norm": 20.04810333251953, "kl": 0.0791015625, "learning_rate": 2.07138304652645e-08, "loss": 0.0032, "reward": 1.635162591934204, "reward_std": 0.05953432619571686, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5101625919342041, "step": 3073 }, { "completion_length": 264.4375, "epoch": 0.9796048438495857, "grad_norm": 10.390314102172852, "kl": 0.078125, "learning_rate": 2.0395156150414276e-08, "loss": 0.0031, "reward": 1.5732051134109497, "reward_std": 0.20447908341884613, "rewards/pad": 0.15625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.4325801134109497, "step": 3074 }, { "completion_length": 303.15625, "epoch": 0.9799235181644359, "grad_norm": 9.053024291992188, "kl": 0.07421875, "learning_rate": 2.007648183556405e-08, "loss": 0.003, "reward": 1.5527631044387817, "reward_std": 0.06574830412864685, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.552763044834137, "step": 3075 }, { "completion_length": 198.3125, "epoch": 0.9802421924792861, "grad_norm": 16.09470558166504, "kl": 0.09912109375, "learning_rate": 1.975780752071383e-08, "loss": 0.004, "reward": 1.5192358493804932, "reward_std": 0.08530691266059875, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5192359089851379, "rewards/pad": 0.0, "step": 3076 }, { "completion_length": 224.71875, "epoch": 0.9805608667941363, "grad_norm": 5.323574066162109, "kl": 0.07421875, "learning_rate": 1.9439133205863608e-08, "loss": 0.003, "reward": 1.782647967338562, "reward_std": 0.07638280838727951, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6576479077339172, "step": 3077 }, { "completion_length": 124.421875, "epoch": 0.9808795411089866, "grad_norm": 51.17085266113281, "kl": 0.11328125, "learning_rate": 1.9120458891013385e-08, "loss": 0.0045, "reward": 1.8514604568481445, "reward_std": 0.18483218550682068, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5077104568481445, "rewards/pad": 0.359375, "step": 3078 }, { "completion_length": 324.921875, "epoch": 0.9811982154238368, "grad_norm": 9.296708106994629, "kl": 0.060791015625, "learning_rate": 1.880178457616316e-08, "loss": 0.0024, "reward": 1.5254148244857788, "reward_std": 0.07078447937965393, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4004148840904236, "rewards/pad": 0.125, "step": 3079 }, { "completion_length": 314.078125, "epoch": 0.9815168897386871, "grad_norm": 10.421858787536621, "kl": 0.08349609375, "learning_rate": 1.8483110261312936e-08, "loss": 0.0033, "reward": 1.4044888019561768, "reward_std": 0.0950625091791153, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.42011380195617676, "rewards/pad": 0.0, "step": 3080 }, { "completion_length": 165.375, "epoch": 0.9818355640535373, "grad_norm": 13.181941986083984, "kl": 0.10986328125, "learning_rate": 1.8164435946462717e-08, "loss": 0.0044, "reward": 1.4772001504898071, "reward_std": 0.09890478849411011, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.32095012068748474, "rewards/pad": 0.15625, "step": 3081 }, { "completion_length": 172.53125, "epoch": 0.9821542383683876, "grad_norm": 27.367578506469727, "kl": 0.10888671875, "learning_rate": 1.7845761631612494e-08, "loss": 0.0044, "reward": 1.5015852451324463, "reward_std": 0.055210791528224945, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5015853643417358, "rewards/pad": 0.0, "step": 3082 }, { "completion_length": 237.671875, "epoch": 0.9824729126832378, "grad_norm": 8.945556640625, "kl": 0.083984375, "learning_rate": 1.7527087316762268e-08, "loss": 0.0034, "reward": 1.4884425401687622, "reward_std": 0.08792269229888916, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4884425401687622, "step": 3083 }, { "completion_length": 188.234375, "epoch": 0.982791586998088, "grad_norm": 8.917075157165527, "kl": 0.10107421875, "learning_rate": 1.7208413001912045e-08, "loss": 0.004, "reward": 1.3387261629104614, "reward_std": 0.056616295129060745, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3387261927127838, "step": 3084 }, { "completion_length": 215.625, "epoch": 0.9831102613129382, "grad_norm": 20.064624786376953, "kl": 0.1142578125, "learning_rate": 1.6889738687061822e-08, "loss": 0.0046, "reward": 1.5038783550262451, "reward_std": 0.1603861153125763, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5195032954216003, "rewards/pad": 0.0, "step": 3085 }, { "completion_length": 381.375, "epoch": 0.9834289356277884, "grad_norm": 15.217743873596191, "kl": 0.056640625, "learning_rate": 1.65710643722116e-08, "loss": 0.0023, "reward": 1.4081106185913086, "reward_std": 0.0775367021560669, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.40811046957969666, "rewards/pad": 0.0, "step": 3086 }, { "completion_length": 238.875, "epoch": 0.9837476099426387, "grad_norm": 6.312625885009766, "kl": 0.06591796875, "learning_rate": 1.6252390057361376e-08, "loss": 0.0026, "reward": 1.6966602802276611, "reward_std": 0.08635349571704865, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4466603398323059, "step": 3087 }, { "completion_length": 270.75, "epoch": 0.9840662842574889, "grad_norm": 31.017301559448242, "kl": 0.08203125, "learning_rate": 1.5933715742511154e-08, "loss": 0.0033, "reward": 1.5948489904403687, "reward_std": 0.10828594118356705, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.46984899044036865, "step": 3088 }, { "completion_length": 261.640625, "epoch": 0.9843849585723391, "grad_norm": 9.238334655761719, "kl": 0.07763671875, "learning_rate": 1.561504142766093e-08, "loss": 0.0031, "reward": 1.5603516101837158, "reward_std": 0.08074082434177399, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5603516101837158, "rewards/pad": 0.0, "step": 3089 }, { "completion_length": 253.40625, "epoch": 0.9847036328871893, "grad_norm": 10.491616249084473, "kl": 0.0869140625, "learning_rate": 1.5296367112810705e-08, "loss": 0.0035, "reward": 1.5297356843948364, "reward_std": 0.07021459937095642, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5297356843948364, "step": 3090 }, { "completion_length": 344.921875, "epoch": 0.9850223072020395, "grad_norm": 10.11392879486084, "kl": 0.107421875, "learning_rate": 1.4977692797960485e-08, "loss": 0.0043, "reward": 1.5372244119644165, "reward_std": 0.07054002583026886, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5372244119644165, "step": 3091 }, { "completion_length": 331.859375, "epoch": 0.9853409815168898, "grad_norm": 6.936378002166748, "kl": 0.048583984375, "learning_rate": 1.465901848311026e-08, "loss": 0.0019, "reward": 1.771822214126587, "reward_std": 0.057387061417102814, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5218220949172974, "step": 3092 }, { "completion_length": 309.125, "epoch": 0.98565965583174, "grad_norm": 9.3517484664917, "kl": 0.09375, "learning_rate": 1.4340344168260036e-08, "loss": 0.0037, "reward": 1.5548007488250732, "reward_std": 0.12453365325927734, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5704257488250732, "rewards/pad": 0.0, "step": 3093 }, { "completion_length": 255.140625, "epoch": 0.9859783301465902, "grad_norm": 20.259626388549805, "kl": 0.09716796875, "learning_rate": 1.4021669853409815e-08, "loss": 0.0039, "reward": 1.5752711296081543, "reward_std": 0.1396198868751526, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4815211296081543, "rewards/pad": 0.09375, "step": 3094 }, { "completion_length": 330.953125, "epoch": 0.9862970044614404, "grad_norm": 52.81623840332031, "kl": 0.054931640625, "learning_rate": 1.370299553855959e-08, "loss": 0.0022, "reward": 1.4935202598571777, "reward_std": 0.06092238426208496, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.36852020025253296, "step": 3095 }, { "completion_length": 441.265625, "epoch": 0.9866156787762906, "grad_norm": 8.006575584411621, "kl": 0.043701171875, "learning_rate": 1.338432122370937e-08, "loss": 0.0017, "reward": 1.448917031288147, "reward_std": 0.0884983167052269, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.43329206109046936, "step": 3096 }, { "completion_length": 317.15625, "epoch": 0.9869343530911409, "grad_norm": 10.468679428100586, "kl": 0.06640625, "learning_rate": 1.3065646908859145e-08, "loss": 0.0027, "reward": 1.6145520210266113, "reward_std": 0.04079999029636383, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6145520806312561, "step": 3097 }, { "completion_length": 233.34375, "epoch": 0.9872530274059911, "grad_norm": 11.847618103027344, "kl": 0.06640625, "learning_rate": 1.2746972594008922e-08, "loss": 0.0027, "reward": 1.839026689529419, "reward_std": 0.06752954423427582, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.7140265703201294, "rewards/pad": 0.125, "step": 3098 }, { "completion_length": 204.03125, "epoch": 0.9875717017208413, "grad_norm": 10.032548904418945, "kl": 0.083984375, "learning_rate": 1.24282982791587e-08, "loss": 0.0034, "reward": 1.5784478187561035, "reward_std": 0.09089922904968262, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4534478187561035, "step": 3099 }, { "completion_length": 178.796875, "epoch": 0.9878903760356915, "grad_norm": 27.579824447631836, "kl": 0.10009765625, "learning_rate": 1.2109623964308477e-08, "loss": 0.004, "reward": 1.7521710395812988, "reward_std": 0.11859433352947235, "rewards/answer_reward": 0.375, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.3927960693836212, "step": 3100 }, { "completion_length": 272.109375, "epoch": 0.9882090503505417, "grad_norm": 9.311613082885742, "kl": 0.09326171875, "learning_rate": 1.1790949649458252e-08, "loss": 0.0037, "reward": 1.4809329509735107, "reward_std": 0.15124361217021942, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.37155792117118835, "step": 3101 }, { "completion_length": 376.75, "epoch": 0.988527724665392, "grad_norm": 6.454984664916992, "kl": 0.06982421875, "learning_rate": 1.1472275334608031e-08, "loss": 0.0028, "reward": 1.4383108615875244, "reward_std": 0.12465780973434448, "rewards/pad": 0.0, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.45393580198287964, "step": 3102 }, { "completion_length": 228.46875, "epoch": 0.9888463989802422, "grad_norm": 4.873990058898926, "kl": 0.10009765625, "learning_rate": 1.1153601019757807e-08, "loss": 0.004, "reward": 1.5083374977111816, "reward_std": 0.057083502411842346, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.3833376169204712, "step": 3103 }, { "completion_length": 273.109375, "epoch": 0.9891650732950924, "grad_norm": 12.395194053649902, "kl": 0.08154296875, "learning_rate": 1.0834926704907584e-08, "loss": 0.0033, "reward": 1.5387074947357178, "reward_std": 0.042679332196712494, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5387074947357178, "step": 3104 }, { "completion_length": 422.8125, "epoch": 0.9894837476099426, "grad_norm": 8.831942558288574, "kl": 0.04541015625, "learning_rate": 1.0516252390057361e-08, "loss": 0.0018, "reward": 1.2390778064727783, "reward_std": 0.12053517252206802, "rewards/pad": 0.015625, "rewards/tracking_format_reward": 0.984375, "rewards/tracking_iou_reward": 0.23907773196697235, "step": 3105 }, { "completion_length": 231.875, "epoch": 0.9898024219247928, "grad_norm": 9.457585334777832, "kl": 0.09716796875, "learning_rate": 1.0197578075207138e-08, "loss": 0.0039, "reward": 1.7127227783203125, "reward_std": 0.0834982842206955, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4627227187156677, "step": 3106 }, { "completion_length": 317.328125, "epoch": 0.9901210962396431, "grad_norm": 13.605870246887207, "kl": 0.056884765625, "learning_rate": 9.878903760356915e-09, "loss": 0.0023, "reward": 1.7096130847930908, "reward_std": 0.13412657380104065, "rewards/answer_reward": 0.03125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.6783630847930908, "step": 3107 }, { "completion_length": 106.90625, "epoch": 0.9904397705544933, "grad_norm": 14.606616973876953, "kl": 0.125, "learning_rate": 9.560229445506692e-09, "loss": 0.005, "reward": 1.7832053899765015, "reward_std": 0.0934794694185257, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6582053899765015, "rewards/pad": 0.125, "step": 3108 }, { "completion_length": 177.0, "epoch": 0.9907584448693435, "grad_norm": 15.736312866210938, "kl": 0.10498046875, "learning_rate": 9.241555130656468e-09, "loss": 0.0042, "reward": 1.4562592506408691, "reward_std": 0.23588356375694275, "rewards/answer_reward": 0.0625, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3937593996524811, "step": 3109 }, { "completion_length": 204.671875, "epoch": 0.9910771191841937, "grad_norm": 7.896600723266602, "kl": 0.10498046875, "learning_rate": 8.922880815806247e-09, "loss": 0.0042, "reward": 1.458459734916687, "reward_std": 0.03806007280945778, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.458459734916687, "rewards/pad": 0.0, "step": 3110 }, { "completion_length": 256.171875, "epoch": 0.9913957934990439, "grad_norm": 15.68986988067627, "kl": 0.125, "learning_rate": 8.604206500956022e-09, "loss": 0.005, "reward": 1.4623175859451294, "reward_std": 0.0622483566403389, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.3373175263404846, "step": 3111 }, { "completion_length": 287.453125, "epoch": 0.9917144678138942, "grad_norm": 12.322236061096191, "kl": 0.0517578125, "learning_rate": 8.2855321861058e-09, "loss": 0.0021, "reward": 1.8157682418823242, "reward_std": 0.13324689865112305, "rewards/pad": 0.34375, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4720180630683899, "step": 3112 }, { "completion_length": 157.875, "epoch": 0.9920331421287444, "grad_norm": 33.36964416503906, "kl": 0.1181640625, "learning_rate": 7.966857871255577e-09, "loss": 0.0047, "reward": 1.681018590927124, "reward_std": 0.08750802278518677, "rewards/answer_reward": 0.125, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5560184717178345, "step": 3113 }, { "completion_length": 241.21875, "epoch": 0.9923518164435946, "grad_norm": 19.668622970581055, "kl": 0.083984375, "learning_rate": 7.648183556405352e-09, "loss": 0.0034, "reward": 1.6106711626052856, "reward_std": 0.06073055416345596, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6106711626052856, "rewards/pad": 0.0, "step": 3114 }, { "completion_length": 419.046875, "epoch": 0.9926704907584448, "grad_norm": 4.147368431091309, "kl": 0.060791015625, "learning_rate": 7.32950924155513e-09, "loss": 0.0024, "reward": 1.546280860900879, "reward_std": 0.06868711113929749, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5462808012962341, "step": 3115 }, { "completion_length": 225.765625, "epoch": 0.992989165073295, "grad_norm": 12.72802448272705, "kl": 0.09423828125, "learning_rate": 7.0108349267049075e-09, "loss": 0.0038, "reward": 1.6859400272369385, "reward_std": 0.1193547323346138, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.45156511664390564, "rewards/pad": 0.234375, "step": 3116 }, { "completion_length": 222.84375, "epoch": 0.9933078393881453, "grad_norm": 23.26289176940918, "kl": 0.0859375, "learning_rate": 6.692160611854685e-09, "loss": 0.0034, "reward": 1.8061788082122803, "reward_std": 0.09761648625135422, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.5561787486076355, "step": 3117 }, { "completion_length": 316.8125, "epoch": 0.9936265137029955, "grad_norm": 15.188164710998535, "kl": 0.07568359375, "learning_rate": 6.373486297004461e-09, "loss": 0.003, "reward": 1.4298902750015259, "reward_std": 0.03217019885778427, "rewards/pad": 0.0, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.4298902451992035, "step": 3118 }, { "completion_length": 168.53125, "epoch": 0.9939451880178458, "grad_norm": 22.4268741607666, "kl": 0.1201171875, "learning_rate": 6.054811982154238e-09, "loss": 0.0048, "reward": 1.6596311330795288, "reward_std": 0.15989093482494354, "rewards/answer_reward": 0.234375, "rewards/format_reward_gqa": 1.0, "rewards/iou_glue_reward": 0.42525607347488403, "step": 3119 }, { "completion_length": 156.65625, "epoch": 0.994263862332696, "grad_norm": 13.560260772705078, "kl": 0.09765625, "learning_rate": 5.7361376673040155e-09, "loss": 0.0039, "reward": 1.680901050567627, "reward_std": 0.1962885558605194, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.602776050567627, "rewards/pad": 0.078125, "step": 3120 }, { "completion_length": 408.453125, "epoch": 0.9945825366475463, "grad_norm": 3.027728796005249, "kl": 0.041015625, "learning_rate": 5.417463352453792e-09, "loss": 0.0016, "reward": 1.4375839233398438, "reward_std": 0.12024590373039246, "rewards/pad": 0.125, "rewards/tracking_format_reward": 0.96875, "rewards/tracking_iou_reward": 0.34383392333984375, "step": 3121 }, { "completion_length": 248.828125, "epoch": 0.9949012109623965, "grad_norm": 11.69973087310791, "kl": 0.08984375, "learning_rate": 5.098789037603569e-09, "loss": 0.0036, "reward": 1.7863315343856812, "reward_std": 0.10297290980815887, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.6613315343856812, "step": 3122 }, { "completion_length": 172.765625, "epoch": 0.9952198852772467, "grad_norm": 18.64386558532715, "kl": 0.12890625, "learning_rate": 4.780114722753346e-09, "loss": 0.0052, "reward": 1.7994670867919922, "reward_std": 0.09457459300756454, "rewards/answer_reward": 0.25, "rewards/format_reward_gqa": 0.984375, "rewards/iou_glue_reward": 0.5650919675827026, "step": 3123 }, { "completion_length": 233.65625, "epoch": 0.9955385595920969, "grad_norm": 21.83030128479004, "kl": 0.09228515625, "learning_rate": 4.4614404079031234e-09, "loss": 0.0037, "reward": 1.4759409427642822, "reward_std": 0.09514787793159485, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.49156588315963745, "rewards/pad": 0.0, "step": 3124 }, { "completion_length": 261.5625, "epoch": 0.9958572339069471, "grad_norm": 9.203996658325195, "kl": 0.08056640625, "learning_rate": 4.1427660930529e-09, "loss": 0.0032, "reward": 1.6252367496490479, "reward_std": 0.08587397634983063, "rewards/pad": 0.125, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.5002367496490479, "step": 3125 }, { "completion_length": 216.03125, "epoch": 0.9961759082217974, "grad_norm": 20.595712661743164, "kl": 0.1083984375, "learning_rate": 3.824091778202676e-09, "loss": 0.0043, "reward": 1.822039008140564, "reward_std": 0.12611322104930878, "rewards/pad": 0.25, "rewards/tracking_format_reward": 1.0, "rewards/tracking_iou_reward": 0.572039008140564, "step": 3126 }, { "completion_length": 269.921875, "epoch": 0.9964945825366476, "grad_norm": 11.260319709777832, "kl": 0.08984375, "learning_rate": 3.5054174633524538e-09, "loss": 0.0036, "reward": 1.6543762683868408, "reward_std": 0.18545392155647278, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5450013279914856, "rewards/pad": 0.125, "step": 3127 }, { "completion_length": 251.5, "epoch": 0.9968132568514978, "grad_norm": 13.050055503845215, "kl": 0.08349609375, "learning_rate": 3.1867431485022305e-09, "loss": 0.0033, "reward": 1.8711285591125488, "reward_std": 0.0955628752708435, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.6211286783218384, "rewards/pad": 0.25, "step": 3128 }, { "completion_length": 306.6875, "epoch": 0.997131931166348, "grad_norm": 27.774784088134766, "kl": 0.138671875, "learning_rate": 2.8680688336520077e-09, "loss": 0.0055, "reward": 1.4905447959899902, "reward_std": 0.11900576949119568, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.5061697959899902, "rewards/pad": 0.0, "step": 3129 }, { "completion_length": 336.984375, "epoch": 0.9974506054811982, "grad_norm": 11.642563819885254, "kl": 0.055419921875, "learning_rate": 2.5493945188017845e-09, "loss": 0.0022, "reward": 1.4699349403381348, "reward_std": 0.054827552288770676, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.34493494033813477, "rewards/pad": 0.125, "step": 3130 }, { "completion_length": 191.234375, "epoch": 0.9977692797960485, "grad_norm": 12.029446601867676, "kl": 0.0859375, "learning_rate": 2.2307202039515617e-09, "loss": 0.0034, "reward": 1.5740249156951904, "reward_std": 0.034564144909381866, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5740248560905457, "rewards/pad": 0.0, "step": 3131 }, { "completion_length": 246.734375, "epoch": 0.9980879541108987, "grad_norm": 20.317331314086914, "kl": 0.08447265625, "learning_rate": 1.912045889101338e-09, "loss": 0.0034, "reward": 1.579896092414856, "reward_std": 0.06583913415670395, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.579896092414856, "rewards/pad": 0.0, "step": 3132 }, { "completion_length": 256.1875, "epoch": 0.9984066284257489, "grad_norm": 22.188108444213867, "kl": 0.12060546875, "learning_rate": 1.5933715742511153e-09, "loss": 0.0048, "reward": 1.6689432859420776, "reward_std": 0.13535553216934204, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.5439432859420776, "rewards/pad": 0.125, "step": 3133 }, { "completion_length": 168.90625, "epoch": 0.9987253027405991, "grad_norm": 14.834891319274902, "kl": 0.08984375, "learning_rate": 1.2746972594008923e-09, "loss": 0.0036, "reward": 1.698204517364502, "reward_std": 0.10654359310865402, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.46382954716682434, "rewards/pad": 0.25, "step": 3134 }, { "completion_length": 354.34375, "epoch": 0.9990439770554493, "grad_norm": 5.06793212890625, "kl": 0.0546875, "learning_rate": 9.56022944550669e-10, "loss": 0.0022, "reward": 1.5813429355621338, "reward_std": 0.11093565076589584, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.4719679355621338, "rewards/pad": 0.125, "step": 3135 }, { "completion_length": 214.9375, "epoch": 0.9993626513702996, "grad_norm": 11.776244163513184, "kl": 0.11865234375, "learning_rate": 6.373486297004461e-10, "loss": 0.0047, "reward": 1.6603457927703857, "reward_std": 0.15173858404159546, "rewards/format_reward_tg": 0.984375, "rewards/iou_timestamp_reward": 0.550970733165741, "rewards/pad": 0.125, "step": 3136 }, { "completion_length": 160.828125, "epoch": 0.9996813256851498, "grad_norm": 10.365925788879395, "kl": 0.083984375, "learning_rate": 3.1867431485022307e-10, "loss": 0.0034, "reward": 1.7423681020736694, "reward_std": 0.10533448308706284, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.4923681616783142, "rewards/pad": 0.25, "step": 3137 }, { "completion_length": 148.1999969482422, "epoch": 1.0, "grad_norm": 60.47707748413086, "kl": 0.09228515625, "learning_rate": 0.0, "loss": 0.0033, "reward": 1.7849794626235962, "reward_std": 0.1385474056005478, "rewards/format_reward_tg": 1.0, "rewards/iou_timestamp_reward": 0.7849794030189514, "rewards/pad": 0.0, "step": 3138 } ], "logging_steps": 1.0, "max_steps": 3138, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }