diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -3,18 +3,18 @@ "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, - "global_step": 6141, + "global_step": 5898, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "learning_rate": 8.130081300813008e-10, - "logits/chosen": -2.9841558933258057, - "logits/rejected": -3.0006747245788574, - "logps/chosen": -434.3608093261719, - "logps/rejected": -530.4495849609375, + "learning_rate": 8.47457627118644e-10, + "logits/chosen": -2.827263116836548, + "logits/rejected": -2.783407211303711, + "logps/chosen": -255.93801879882812, + "logps/rejected": -201.33627319335938, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -23,9589 +23,9191 @@ "step": 1 }, { - "epoch": 0.0, - "learning_rate": 8.130081300813008e-09, - "logits/chosen": -2.9102089405059814, - "logits/rejected": -2.9600725173950195, - "logps/chosen": -316.8162536621094, - "logps/rejected": -279.8689270019531, - "loss": 0.6925, - "rewards/accuracies": 0.4583333432674408, - "rewards/chosen": -0.00048791884910315275, - "rewards/margins": 0.0006313243065960705, - "rewards/rejected": -0.0011192429810762405, + "epoch": 0.01, + "learning_rate": 8.474576271186441e-09, + "logits/chosen": -2.632491111755371, + "logits/rejected": -2.627588987350464, + "logps/chosen": -326.1221923828125, + "logps/rejected": -333.5044860839844, + "loss": 0.6956, + "rewards/accuracies": 0.4305555522441864, + "rewards/chosen": 0.0061164614744484425, + "rewards/margins": 0.002570565789937973, + "rewards/rejected": 0.0035458963830024004, "step": 10 }, { "epoch": 0.01, - "learning_rate": 1.6260162601626016e-08, - "logits/chosen": -2.956594467163086, - "logits/rejected": -2.945495367050171, - "logps/chosen": -344.69403076171875, - "logps/rejected": -268.44903564453125, - "loss": 0.6931, + "learning_rate": 1.6949152542372882e-08, + "logits/chosen": -2.646186351776123, + "logits/rejected": -2.6703312397003174, + "logps/chosen": -241.2902374267578, + "logps/rejected": -235.29745483398438, + "loss": 0.6904, "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.01039506308734417, - "rewards/margins": 0.007600178010761738, - "rewards/rejected": 0.0027948860079050064, + "rewards/chosen": 0.007977111265063286, + "rewards/margins": 0.009717261418700218, + "rewards/rejected": -0.0017401501536369324, "step": 20 }, { - "epoch": 0.01, - "learning_rate": 2.4390243902439023e-08, - "logits/chosen": -2.960836887359619, - "logits/rejected": -2.9482264518737793, - "logps/chosen": -305.5439453125, - "logps/rejected": -262.696044921875, - "loss": 0.6878, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.031020671129226685, - "rewards/margins": 0.007005402352660894, - "rewards/rejected": 0.024015270173549652, + "epoch": 0.02, + "learning_rate": 2.5423728813559323e-08, + "logits/chosen": -2.6961770057678223, + "logits/rejected": -2.6650378704071045, + "logps/chosen": -290.31256103515625, + "logps/rejected": -256.8085632324219, + "loss": 0.6935, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0055731819011271, + "rewards/margins": -0.004891841672360897, + "rewards/rejected": -0.000681340170558542, "step": 30 }, { "epoch": 0.02, - "learning_rate": 3.252032520325203e-08, - "logits/chosen": -2.9725985527038574, - "logits/rejected": -2.9855666160583496, - "logps/chosen": -267.8367614746094, - "logps/rejected": -254.37557983398438, - "loss": 0.6872, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.04555894061923027, - "rewards/margins": 0.016201425343751907, - "rewards/rejected": 0.02935752272605896, + "learning_rate": 3.3898305084745764e-08, + "logits/chosen": -2.662567377090454, + "logits/rejected": -2.6888091564178467, + "logps/chosen": -269.77191162109375, + "logps/rejected": -251.4422149658203, + "loss": 0.6902, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.005323584191501141, + "rewards/margins": 0.0068922750651836395, + "rewards/rejected": -0.0015686902916058898, "step": 40 }, { - "epoch": 0.02, - "learning_rate": 4.065040650406504e-08, - "logits/chosen": -2.926569938659668, - "logits/rejected": -2.955169200897217, - "logps/chosen": -276.6246643066406, - "logps/rejected": -253.388671875, - "loss": 0.6698, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.07494758069515228, - "rewards/margins": 0.038938138633966446, - "rewards/rejected": 0.03600943461060524, + "epoch": 0.03, + "learning_rate": 4.23728813559322e-08, + "logits/chosen": -2.696052074432373, + "logits/rejected": -2.8296093940734863, + "logps/chosen": -297.51080322265625, + "logps/rejected": -236.7479248046875, + "loss": 0.6903, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0028805125039070845, + "rewards/margins": 0.009964686818420887, + "rewards/rejected": -0.0070841750130057335, "step": 50 }, { "epoch": 0.03, - "learning_rate": 4.878048780487805e-08, - "logits/chosen": -2.956350326538086, - "logits/rejected": -2.9792628288269043, - "logps/chosen": -328.7430725097656, - "logps/rejected": -270.8122863769531, - "loss": 0.6593, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.15852577984333038, - "rewards/margins": 0.05970991775393486, - "rewards/rejected": 0.09881584346294403, + "learning_rate": 5.0847457627118645e-08, + "logits/chosen": -2.5665619373321533, + "logits/rejected": -2.6641879081726074, + "logps/chosen": -302.92889404296875, + "logps/rejected": -283.39117431640625, + "loss": 0.6888, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.007401665206998587, + "rewards/margins": 0.0012593126157298684, + "rewards/rejected": 0.0061423517763614655, "step": 60 }, { - "epoch": 0.03, - "learning_rate": 5.6910569105691055e-08, - "logits/chosen": -2.9546256065368652, - "logits/rejected": -2.9788239002227783, - "logps/chosen": -292.5069885253906, - "logps/rejected": -246.9122772216797, - "loss": 0.6474, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.2095698118209839, - "rewards/margins": 0.13486798107624054, - "rewards/rejected": 0.07470183074474335, + "epoch": 0.04, + "learning_rate": 5.932203389830508e-08, + "logits/chosen": -2.7289795875549316, + "logits/rejected": -2.6271157264709473, + "logps/chosen": -296.3174743652344, + "logps/rejected": -259.4359130859375, + "loss": 0.6807, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.014876957051455975, + "rewards/margins": 0.02076360210776329, + "rewards/rejected": -0.005886645056307316, "step": 70 }, { "epoch": 0.04, - "learning_rate": 6.504065040650406e-08, - "logits/chosen": -2.9138400554656982, - "logits/rejected": -2.9316182136535645, - "logps/chosen": -318.5199890136719, - "logps/rejected": -251.32357788085938, - "loss": 0.6223, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.293539822101593, - "rewards/margins": 0.24193021655082703, - "rewards/rejected": 0.051609598100185394, + "learning_rate": 6.779661016949153e-08, + "logits/chosen": -2.716602087020874, + "logits/rejected": -2.6984047889709473, + "logps/chosen": -277.34625244140625, + "logps/rejected": -250.19580078125, + "loss": 0.6768, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.022055678069591522, + "rewards/margins": 0.037467751652002335, + "rewards/rejected": -0.015412074513733387, "step": 80 }, { - "epoch": 0.04, - "learning_rate": 7.317073170731706e-08, - "logits/chosen": -2.9335989952087402, - "logits/rejected": -2.962618350982666, - "logps/chosen": -345.62408447265625, - "logps/rejected": -276.18389892578125, - "loss": 0.6181, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.4488205313682556, - "rewards/margins": 0.2681005597114563, - "rewards/rejected": 0.18071994185447693, + "epoch": 0.05, + "learning_rate": 7.627118644067796e-08, + "logits/chosen": -2.667224884033203, + "logits/rejected": -2.606696605682373, + "logps/chosen": -289.1363830566406, + "logps/rejected": -275.8477478027344, + "loss": 0.6707, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.026466140523552895, + "rewards/margins": 0.05512174963951111, + "rewards/rejected": -0.028655609115958214, "step": 90 }, { "epoch": 0.05, - "learning_rate": 8.130081300813008e-08, - "logits/chosen": -2.9538724422454834, - "logits/rejected": -2.9184556007385254, - "logps/chosen": -330.77752685546875, - "logps/rejected": -235.3378143310547, - "loss": 0.6163, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.4436494708061218, - "rewards/margins": 0.3342064321041107, - "rewards/rejected": 0.10944309085607529, + "learning_rate": 8.47457627118644e-08, + "logits/chosen": -2.6877145767211914, + "logits/rejected": -2.759593963623047, + "logps/chosen": -290.8085021972656, + "logps/rejected": -252.248779296875, + "loss": 0.6612, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.04527619853615761, + "rewards/margins": 0.062491558492183685, + "rewards/rejected": -0.017215365543961525, "step": 100 }, { "epoch": 0.05, - "eval_logits/chosen": -2.934441089630127, - "eval_logits/rejected": -2.926870346069336, - "eval_logps/chosen": -310.82000732421875, - "eval_logps/rejected": -268.5007019042969, - "eval_loss": 0.6102350950241089, - "eval_rewards/accuracies": 0.658088207244873, - "eval_rewards/chosen": 0.4041019380092621, - "eval_rewards/margins": 0.3008568286895752, - "eval_rewards/rejected": 0.1032450795173645, - "eval_runtime": 304.9772, - "eval_samples_per_second": 7.122, - "eval_steps_per_second": 0.446, + "eval_logits/chosen": -2.7410836219787598, + "eval_logits/rejected": -2.8351781368255615, + "eval_logps/chosen": -277.7868347167969, + "eval_logps/rejected": -258.4841613769531, + "eval_loss": 0.6640329360961914, + "eval_rewards/accuracies": 0.6564885377883911, + "eval_rewards/chosen": 0.04636840894818306, + "eval_rewards/margins": 0.08127209544181824, + "eval_rewards/rejected": -0.03490367904305458, + "eval_runtime": 297.8788, + "eval_samples_per_second": 7.003, + "eval_steps_per_second": 0.44, "step": 100 }, { - "epoch": 0.05, - "learning_rate": 8.943089430894309e-08, - "logits/chosen": -2.940415620803833, - "logits/rejected": -2.9090399742126465, - "logps/chosen": -339.867919921875, - "logps/rejected": -247.30172729492188, - "loss": 0.5816, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.5024594068527222, - "rewards/margins": 0.5129929184913635, - "rewards/rejected": -0.01053349394351244, + "epoch": 0.06, + "learning_rate": 9.322033898305084e-08, + "logits/chosen": -2.748792886734009, + "logits/rejected": -2.71765398979187, + "logps/chosen": -278.54345703125, + "logps/rejected": -263.8841247558594, + "loss": 0.6659, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.047357071191072464, + "rewards/margins": 0.08142384141683578, + "rewards/rejected": -0.03406677395105362, "step": 110 }, { "epoch": 0.06, - "learning_rate": 9.75609756097561e-08, - "logits/chosen": -2.8959217071533203, - "logits/rejected": -2.8908677101135254, - "logps/chosen": -332.07861328125, - "logps/rejected": -297.9265441894531, - "loss": 0.5811, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.4128102660179138, - "rewards/margins": 0.4367516040802002, - "rewards/rejected": -0.023941364139318466, + "learning_rate": 1.0169491525423729e-07, + "logits/chosen": -2.6274914741516113, + "logits/rejected": -2.6163206100463867, + "logps/chosen": -263.28070068359375, + "logps/rejected": -255.45358276367188, + "loss": 0.6585, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.06359101831912994, + "rewards/margins": 0.11870710551738739, + "rewards/rejected": -0.055116087198257446, "step": 120 }, { - "epoch": 0.06, - "learning_rate": 1.0569105691056911e-07, - "logits/chosen": -2.8852524757385254, - "logits/rejected": -2.913208246231079, - "logps/chosen": -345.4274597167969, - "logps/rejected": -233.2921142578125, - "loss": 0.6097, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.5413142442703247, - "rewards/margins": 0.5160594582557678, - "rewards/rejected": 0.025254884734749794, + "epoch": 0.07, + "learning_rate": 1.1016949152542372e-07, + "logits/chosen": -2.6633336544036865, + "logits/rejected": -2.539752721786499, + "logps/chosen": -280.6170654296875, + "logps/rejected": -276.40557861328125, + "loss": 0.6454, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.05809624120593071, + "rewards/margins": 0.1383211314678192, + "rewards/rejected": -0.0802248865365982, "step": 130 }, { "epoch": 0.07, - "learning_rate": 1.1382113821138211e-07, - "logits/chosen": -2.920401096343994, - "logits/rejected": -2.9328560829162598, - "logps/chosen": -328.6422424316406, - "logps/rejected": -268.7213439941406, - "loss": 0.573, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.5083936452865601, - "rewards/margins": 0.49243244528770447, - "rewards/rejected": 0.015961110591888428, + "learning_rate": 1.1864406779661017e-07, + "logits/chosen": -2.6815719604492188, + "logits/rejected": -2.706284523010254, + "logps/chosen": -256.632080078125, + "logps/rejected": -248.27810668945312, + "loss": 0.6457, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07591484487056732, + "rewards/margins": 0.13818106055259705, + "rewards/rejected": -0.06226622313261032, "step": 140 }, { - "epoch": 0.07, - "learning_rate": 1.219512195121951e-07, - "logits/chosen": -2.967921733856201, - "logits/rejected": -2.957890510559082, - "logps/chosen": -314.16119384765625, - "logps/rejected": -236.77090454101562, - "loss": 0.5708, + "epoch": 0.08, + "learning_rate": 1.271186440677966e-07, + "logits/chosen": -2.7584047317504883, + "logits/rejected": -2.631559133529663, + "logps/chosen": -275.9186706542969, + "logps/rejected": -289.1915283203125, + "loss": 0.6205, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.5136945247650146, - "rewards/margins": 0.4742175042629242, - "rewards/rejected": 0.039476923644542694, + "rewards/chosen": 0.11922351270914078, + "rewards/margins": 0.17620989680290222, + "rewards/rejected": -0.056986384093761444, "step": 150 }, { "epoch": 0.08, - "learning_rate": 1.3008130081300813e-07, - "logits/chosen": -2.928835868835449, - "logits/rejected": -2.9954142570495605, - "logps/chosen": -333.37200927734375, - "logps/rejected": -297.33282470703125, - "loss": 0.5685, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.4727661609649658, - "rewards/margins": 0.5160679817199707, - "rewards/rejected": -0.04330185800790787, + "learning_rate": 1.3559322033898305e-07, + "logits/chosen": -2.7076973915100098, + "logits/rejected": -2.5795204639434814, + "logps/chosen": -253.640625, + "logps/rejected": -254.59957885742188, + "loss": 0.6194, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.09049886465072632, + "rewards/margins": 0.210663840174675, + "rewards/rejected": -0.12016497552394867, "step": 160 }, { - "epoch": 0.08, - "learning_rate": 1.3821138211382114e-07, - "logits/chosen": -2.9452319145202637, - "logits/rejected": -2.9631733894348145, - "logps/chosen": -333.86248779296875, - "logps/rejected": -284.1908264160156, - "loss": 0.5688, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.5551854968070984, - "rewards/margins": 0.4876086115837097, - "rewards/rejected": 0.06757686287164688, + "epoch": 0.09, + "learning_rate": 1.440677966101695e-07, + "logits/chosen": -2.651660919189453, + "logits/rejected": -2.6708879470825195, + "logps/chosen": -245.13961791992188, + "logps/rejected": -252.0153045654297, + "loss": 0.6067, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.10699067264795303, + "rewards/margins": 0.25036734342575073, + "rewards/rejected": -0.1433766633272171, "step": 170 }, { "epoch": 0.09, - "learning_rate": 1.4634146341463413e-07, - "logits/chosen": -2.9429287910461426, - "logits/rejected": -2.936645746231079, - "logps/chosen": -347.42919921875, - "logps/rejected": -279.301513671875, - "loss": 0.5251, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.5249996781349182, - "rewards/margins": 0.7658360600471497, - "rewards/rejected": -0.24083638191223145, + "learning_rate": 1.5254237288135593e-07, + "logits/chosen": -2.642138957977295, + "logits/rejected": -2.730170726776123, + "logps/chosen": -299.3494567871094, + "logps/rejected": -285.53607177734375, + "loss": 0.6136, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.1772647351026535, + "rewards/margins": 0.32675617933273315, + "rewards/rejected": -0.14949145913124084, "step": 180 }, { - "epoch": 0.09, - "learning_rate": 1.5447154471544717e-07, - "logits/chosen": -2.993924617767334, - "logits/rejected": -2.998302698135376, - "logps/chosen": -331.9522399902344, - "logps/rejected": -275.6444396972656, - "loss": 0.5995, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.4336457848548889, - "rewards/margins": 0.4917446970939636, - "rewards/rejected": -0.058098919689655304, + "epoch": 0.1, + "learning_rate": 1.6101694915254234e-07, + "logits/chosen": -2.62565541267395, + "logits/rejected": -2.607515335083008, + "logps/chosen": -304.5907897949219, + "logps/rejected": -287.03076171875, + "loss": 0.6115, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.10709013044834137, + "rewards/margins": 0.19044987857341766, + "rewards/rejected": -0.08335976302623749, "step": 190 }, { "epoch": 0.1, - "learning_rate": 1.6260162601626016e-07, - "logits/chosen": -2.9468791484832764, - "logits/rejected": -2.933835506439209, - "logps/chosen": -312.1210021972656, - "logps/rejected": -304.53741455078125, - "loss": 0.5771, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.4262707829475403, - "rewards/margins": 0.5384191870689392, - "rewards/rejected": -0.11214841902256012, + "learning_rate": 1.694915254237288e-07, + "logits/chosen": -2.631618022918701, + "logits/rejected": -2.6601366996765137, + "logps/chosen": -293.8777770996094, + "logps/rejected": -290.7740173339844, + "loss": 0.5924, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08614929020404816, + "rewards/margins": 0.24201679229736328, + "rewards/rejected": -0.15586748719215393, "step": 200 }, { "epoch": 0.1, - "eval_logits/chosen": -2.910601854324341, - "eval_logits/rejected": -2.9031543731689453, - "eval_logps/chosen": -311.596435546875, - "eval_logps/rejected": -272.910888671875, - "eval_loss": 0.5581578612327576, - "eval_rewards/accuracies": 0.7242646813392639, - "eval_rewards/chosen": 0.3264617919921875, - "eval_rewards/margins": 0.6642390489578247, - "eval_rewards/rejected": -0.3377772271633148, - "eval_runtime": 304.346, - "eval_samples_per_second": 7.137, - "eval_steps_per_second": 0.447, + "eval_logits/chosen": -2.7265350818634033, + "eval_logits/rejected": -2.815322160720825, + "eval_logps/chosen": -277.4727783203125, + "eval_logps/rejected": -260.6597900390625, + "eval_loss": 0.6067986488342285, + "eval_rewards/accuracies": 0.6927480697631836, + "eval_rewards/chosen": 0.0777706429362297, + "eval_rewards/margins": 0.3302356004714966, + "eval_rewards/rejected": -0.2524649202823639, + "eval_runtime": 302.3774, + "eval_samples_per_second": 6.899, + "eval_steps_per_second": 0.433, "step": 200 }, { - "epoch": 0.1, - "learning_rate": 1.7073170731707317e-07, - "logits/chosen": -2.8615822792053223, - "logits/rejected": -2.8400533199310303, - "logps/chosen": -315.7840576171875, - "logps/rejected": -292.722900390625, - "loss": 0.5712, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.3762911260128021, - "rewards/margins": 0.7341277003288269, - "rewards/rejected": -0.3578365445137024, + "epoch": 0.11, + "learning_rate": 1.7796610169491524e-07, + "logits/chosen": -2.70915150642395, + "logits/rejected": -2.72874116897583, + "logps/chosen": -276.5365905761719, + "logps/rejected": -238.8749237060547, + "loss": 0.5655, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.17417728900909424, + "rewards/margins": 0.46528753638267517, + "rewards/rejected": -0.2911103069782257, "step": 210 }, { "epoch": 0.11, - "learning_rate": 1.7886178861788619e-07, - "logits/chosen": -2.9871878623962402, - "logits/rejected": -2.966097354888916, - "logps/chosen": -291.62664794921875, - "logps/rejected": -257.0137939453125, - "loss": 0.5876, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.181481271982193, - "rewards/margins": 0.4209473729133606, - "rewards/rejected": -0.23946604132652283, + "learning_rate": 1.8644067796610168e-07, + "logits/chosen": -2.62576961517334, + "logits/rejected": -2.6665549278259277, + "logps/chosen": -338.6006164550781, + "logps/rejected": -294.7860412597656, + "loss": 0.6044, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.22065432369709015, + "rewards/margins": 0.3981766104698181, + "rewards/rejected": -0.17752234637737274, "step": 220 }, { - "epoch": 0.11, - "learning_rate": 1.8699186991869917e-07, - "logits/chosen": -2.9458131790161133, - "logits/rejected": -2.935429096221924, - "logps/chosen": -259.55963134765625, - "logps/rejected": -221.6571502685547, - "loss": 0.5563, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.16037055850028992, - "rewards/margins": 0.55237877368927, - "rewards/rejected": -0.3920082151889801, + "epoch": 0.12, + "learning_rate": 1.9491525423728814e-07, + "logits/chosen": -2.6506543159484863, + "logits/rejected": -2.644029378890991, + "logps/chosen": -254.7323455810547, + "logps/rejected": -238.21957397460938, + "loss": 0.5994, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.010533164255321026, + "rewards/margins": 0.25938352942466736, + "rewards/rejected": -0.24885031580924988, "step": 230 }, { "epoch": 0.12, - "learning_rate": 1.951219512195122e-07, - "logits/chosen": -2.8259873390197754, - "logits/rejected": -2.8723251819610596, - "logps/chosen": -380.71588134765625, - "logps/rejected": -314.49444580078125, - "loss": 0.588, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.1817556917667389, - "rewards/margins": 0.5050875544548035, - "rewards/rejected": -0.3233318626880646, + "learning_rate": 2.0338983050847458e-07, + "logits/chosen": -2.635986328125, + "logits/rejected": -2.682281970977783, + "logps/chosen": -273.7716979980469, + "logps/rejected": -238.53280639648438, + "loss": 0.5808, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.12138669192790985, + "rewards/margins": 0.43423810601234436, + "rewards/rejected": -0.3128513991832733, "step": 240 }, { - "epoch": 0.12, - "learning_rate": 2.032520325203252e-07, - "logits/chosen": -2.8916354179382324, - "logits/rejected": -2.8859972953796387, - "logps/chosen": -393.36639404296875, - "logps/rejected": -320.122314453125, - "loss": 0.5763, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.6454311609268188, - "rewards/margins": 0.7746469378471375, - "rewards/rejected": -0.12921586632728577, + "epoch": 0.13, + "learning_rate": 2.11864406779661e-07, + "logits/chosen": -2.698411464691162, + "logits/rejected": -2.806183338165283, + "logps/chosen": -275.4363098144531, + "logps/rejected": -265.92254638671875, + "loss": 0.556, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.2729412019252777, + "rewards/margins": 0.37595731019973755, + "rewards/rejected": -0.10301606357097626, "step": 250 }, { "epoch": 0.13, - "learning_rate": 2.1138211382113822e-07, - "logits/chosen": -2.990467071533203, - "logits/rejected": -2.9586710929870605, - "logps/chosen": -263.37493896484375, - "logps/rejected": -231.34140014648438, - "loss": 0.5667, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.13536646962165833, - "rewards/margins": 0.6033450961112976, - "rewards/rejected": -0.4679786264896393, + "learning_rate": 2.2033898305084743e-07, + "logits/chosen": -2.689795970916748, + "logits/rejected": -2.773268222808838, + "logps/chosen": -255.66455078125, + "logps/rejected": -241.80563354492188, + "loss": 0.5563, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15760645270347595, + "rewards/margins": 0.4827534556388855, + "rewards/rejected": -0.32514697313308716, "step": 260 }, { - "epoch": 0.13, - "learning_rate": 2.195121951219512e-07, - "logits/chosen": -2.8897347450256348, - "logits/rejected": -2.906104564666748, - "logps/chosen": -362.23626708984375, - "logps/rejected": -270.362060546875, - "loss": 0.5539, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.37160390615463257, - "rewards/margins": 0.9915136098861694, - "rewards/rejected": -0.6199097633361816, + "epoch": 0.14, + "learning_rate": 2.288135593220339e-07, + "logits/chosen": -2.5930798053741455, + "logits/rejected": -2.6595733165740967, + "logps/chosen": -261.72235107421875, + "logps/rejected": -236.1577911376953, + "loss": 0.5588, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.06991849839687347, + "rewards/margins": 0.4791868329048157, + "rewards/rejected": -0.4092682898044586, "step": 270 }, { "epoch": 0.14, - "learning_rate": 2.2764227642276422e-07, - "logits/chosen": -2.93200421333313, - "logits/rejected": -2.8960890769958496, - "logps/chosen": -325.0174865722656, - "logps/rejected": -287.7325134277344, - "loss": 0.5852, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.36586564779281616, - "rewards/margins": 0.6894475221633911, - "rewards/rejected": -0.32358190417289734, + "learning_rate": 2.3728813559322033e-07, + "logits/chosen": -2.747859477996826, + "logits/rejected": -2.7453150749206543, + "logps/chosen": -288.51971435546875, + "logps/rejected": -240.22616577148438, + "loss": 0.593, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.09422459453344345, + "rewards/margins": 0.4610508978366852, + "rewards/rejected": -0.3668263256549835, "step": 280 }, { - "epoch": 0.14, - "learning_rate": 2.3577235772357723e-07, - "logits/chosen": -2.9999938011169434, - "logits/rejected": -2.9751625061035156, - "logps/chosen": -335.31219482421875, - "logps/rejected": -271.50665283203125, - "loss": 0.5613, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.42299097776412964, - "rewards/margins": 0.9359371066093445, - "rewards/rejected": -0.5129461288452148, + "epoch": 0.15, + "learning_rate": 2.457627118644068e-07, + "logits/chosen": -2.662792682647705, + "logits/rejected": -2.6617178916931152, + "logps/chosen": -287.103759765625, + "logps/rejected": -269.4622497558594, + "loss": 0.5615, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20917841792106628, + "rewards/margins": 0.6041997671127319, + "rewards/rejected": -0.39502137899398804, "step": 290 }, { "epoch": 0.15, - "learning_rate": 2.439024390243902e-07, - "logits/chosen": -2.9779090881347656, - "logits/rejected": -2.9833106994628906, - "logps/chosen": -267.40423583984375, - "logps/rejected": -242.1228790283203, - "loss": 0.586, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.1915021389722824, - "rewards/margins": 0.6383779644966125, - "rewards/rejected": -0.4468758702278137, + "learning_rate": 2.542372881355932e-07, + "logits/chosen": -2.665034770965576, + "logits/rejected": -2.646695613861084, + "logps/chosen": -285.3088684082031, + "logps/rejected": -260.4288635253906, + "loss": 0.5488, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2411040961742401, + "rewards/margins": 0.6841613054275513, + "rewards/rejected": -0.44305723905563354, "step": 300 }, { "epoch": 0.15, - "eval_logits/chosen": -2.9511284828186035, - "eval_logits/rejected": -2.9432692527770996, - "eval_logps/chosen": -313.122802734375, - "eval_logps/rejected": -275.2723388671875, - "eval_loss": 0.5411711931228638, - "eval_rewards/accuracies": 0.7058823704719543, - "eval_rewards/chosen": 0.17381814122200012, - "eval_rewards/margins": 0.7477374076843262, - "eval_rewards/rejected": -0.5739192366600037, - "eval_runtime": 304.4107, - "eval_samples_per_second": 7.135, - "eval_steps_per_second": 0.447, + "eval_logits/chosen": -2.7548134326934814, + "eval_logits/rejected": -2.836420774459839, + "eval_logps/chosen": -276.56298828125, + "eval_logps/rejected": -262.9786682128906, + "eval_loss": 0.5772386789321899, + "eval_rewards/accuracies": 0.7385495901107788, + "eval_rewards/chosen": 0.16875192523002625, + "eval_rewards/margins": 0.6531042456626892, + "eval_rewards/rejected": -0.4843522906303406, + "eval_runtime": 297.0708, + "eval_samples_per_second": 7.022, + "eval_steps_per_second": 0.441, "step": 300 }, { - "epoch": 0.15, - "learning_rate": 2.520325203252032e-07, - "logits/chosen": -3.003608226776123, - "logits/rejected": -3.0061421394348145, - "logps/chosen": -358.31854248046875, - "logps/rejected": -314.22430419921875, - "loss": 0.5729, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.37429937720298767, - "rewards/margins": 0.8113619685173035, - "rewards/rejected": -0.4370625615119934, + "epoch": 0.16, + "learning_rate": 2.6271186440677967e-07, + "logits/chosen": -2.689876079559326, + "logits/rejected": -2.621389627456665, + "logps/chosen": -270.97808837890625, + "logps/rejected": -233.0233612060547, + "loss": 0.6944, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.21461305022239685, + "rewards/margins": 0.012382006272673607, + "rewards/rejected": 0.2022310495376587, "step": 310 }, { "epoch": 0.16, - "learning_rate": 2.6016260162601625e-07, - "logits/chosen": -2.9804015159606934, - "logits/rejected": -3.0114943981170654, - "logps/chosen": -358.2615051269531, - "logps/rejected": -280.48944091796875, - "loss": 0.5675, + "learning_rate": 2.711864406779661e-07, + "logits/chosen": -2.5818138122558594, + "logits/rejected": -2.620954990386963, + "logps/chosen": -279.020751953125, + "logps/rejected": -258.8118896484375, + "loss": 0.536, "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.22216323018074036, - "rewards/margins": 0.768736720085144, - "rewards/rejected": -0.5465735197067261, + "rewards/chosen": 0.15615960955619812, + "rewards/margins": 0.626147449016571, + "rewards/rejected": -0.4699878692626953, "step": 320 }, { - "epoch": 0.16, - "learning_rate": 2.682926829268293e-07, - "logits/chosen": -2.9590070247650146, - "logits/rejected": -2.960099458694458, - "logps/chosen": -315.183349609375, - "logps/rejected": -229.0343475341797, - "loss": 0.5581, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.2696112394332886, - "rewards/margins": 0.7746988534927368, - "rewards/rejected": -0.5050877332687378, + "epoch": 0.17, + "learning_rate": 2.796610169491525e-07, + "logits/chosen": -2.713191509246826, + "logits/rejected": -2.813098430633545, + "logps/chosen": -328.0247802734375, + "logps/rejected": -286.2821044921875, + "loss": 0.5851, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.14517785608768463, + "rewards/margins": 0.6851416826248169, + "rewards/rejected": -0.5399638414382935, "step": 330 }, { "epoch": 0.17, - "learning_rate": 2.764227642276423e-07, - "logits/chosen": -2.9624826908111572, - "logits/rejected": -2.963923692703247, - "logps/chosen": -301.6102294921875, - "logps/rejected": -251.3206024169922, - "loss": 0.592, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.1166592612862587, - "rewards/margins": 0.5716711282730103, - "rewards/rejected": -0.45501193404197693, + "learning_rate": 2.88135593220339e-07, + "logits/chosen": -2.6544597148895264, + "logits/rejected": -2.637418270111084, + "logps/chosen": -254.28646850585938, + "logps/rejected": -286.66009521484375, + "loss": 0.643, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.24947181344032288, + "rewards/margins": 0.21678391098976135, + "rewards/rejected": 0.032687850296497345, "step": 340 }, { - "epoch": 0.17, - "learning_rate": 2.8455284552845527e-07, - "logits/chosen": -3.0076210498809814, - "logits/rejected": -3.03349232673645, - "logps/chosen": -293.0851135253906, - "logps/rejected": -301.65997314453125, - "loss": 0.5776, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.2251466065645218, - "rewards/margins": 0.5454055070877075, - "rewards/rejected": -0.32025885581970215, + "epoch": 0.18, + "learning_rate": 2.966101694915254e-07, + "logits/chosen": -2.679877758026123, + "logits/rejected": -2.781639575958252, + "logps/chosen": -301.66802978515625, + "logps/rejected": -253.37588500976562, + "loss": 0.5514, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.16999612748622894, + "rewards/margins": 0.7565746903419495, + "rewards/rejected": -0.5865784883499146, "step": 350 }, { "epoch": 0.18, - "learning_rate": 2.9268292682926825e-07, - "logits/chosen": -2.9896209239959717, - "logits/rejected": -2.9818477630615234, - "logps/chosen": -302.31671142578125, - "logps/rejected": -283.38983154296875, - "loss": 0.5991, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.1765313446521759, - "rewards/margins": 0.6645389199256897, - "rewards/rejected": -0.48800748586654663, + "learning_rate": 3.0508474576271186e-07, + "logits/chosen": -2.6983745098114014, + "logits/rejected": -2.752680540084839, + "logps/chosen": -272.7869567871094, + "logps/rejected": -261.7904357910156, + "loss": 0.5101, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.1574278473854065, + "rewards/margins": 0.6489614248275757, + "rewards/rejected": -0.4915335178375244, "step": 360 }, { - "epoch": 0.18, - "learning_rate": 3.008130081300813e-07, - "logits/chosen": -2.939302921295166, - "logits/rejected": -2.957505226135254, - "logps/chosen": -377.38043212890625, - "logps/rejected": -339.09564208984375, - "loss": 0.4911, + "epoch": 0.19, + "learning_rate": 3.135593220338983e-07, + "logits/chosen": -2.6260931491851807, + "logits/rejected": -2.671186685562134, + "logps/chosen": -264.13714599609375, + "logps/rejected": -273.489013671875, + "loss": 0.6183, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.4703899323940277, - "rewards/margins": 1.087965726852417, - "rewards/rejected": -0.6175758838653564, + "rewards/chosen": 0.14635959267616272, + "rewards/margins": 0.4912787079811096, + "rewards/rejected": -0.3449190557003021, "step": 370 }, { "epoch": 0.19, - "learning_rate": 3.0894308943089434e-07, - "logits/chosen": -2.8890323638916016, - "logits/rejected": -2.9313015937805176, - "logps/chosen": -287.6706848144531, - "logps/rejected": -247.0047607421875, - "loss": 0.688, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.09420545399188995, - "rewards/margins": 0.4968571662902832, - "rewards/rejected": -0.5910626649856567, + "learning_rate": 3.220338983050847e-07, + "logits/chosen": -2.669572114944458, + "logits/rejected": -2.6285717487335205, + "logps/chosen": -269.22406005859375, + "logps/rejected": -264.6671447753906, + "loss": 0.6076, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02564525045454502, + "rewards/margins": 0.6520860195159912, + "rewards/rejected": -0.6264407634735107, "step": 380 }, { - "epoch": 0.19, - "learning_rate": 3.170731707317073e-07, - "logits/chosen": -2.9488024711608887, - "logits/rejected": -2.9690146446228027, - "logps/chosen": -335.5271301269531, - "logps/rejected": -307.94793701171875, - "loss": 0.6535, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.3661281168460846, - "rewards/margins": 1.0428093671798706, - "rewards/rejected": -0.6766812205314636, + "epoch": 0.2, + "learning_rate": 3.3050847457627117e-07, + "logits/chosen": -2.5862457752227783, + "logits/rejected": -2.5745327472686768, + "logps/chosen": -283.8790588378906, + "logps/rejected": -280.4973449707031, + "loss": 0.5356, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.08901579678058624, + "rewards/margins": 0.765033483505249, + "rewards/rejected": -0.6760177612304688, "step": 390 }, { "epoch": 0.2, - "learning_rate": 3.252032520325203e-07, - "logits/chosen": -2.9907469749450684, - "logits/rejected": -2.9890072345733643, - "logps/chosen": -342.2677001953125, - "logps/rejected": -290.630615234375, - "loss": 0.5813, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.583183765411377, - "rewards/margins": 0.8398585319519043, - "rewards/rejected": -0.25667476654052734, + "learning_rate": 3.389830508474576e-07, + "logits/chosen": -2.601691722869873, + "logits/rejected": -2.6026904582977295, + "logps/chosen": -278.7247009277344, + "logps/rejected": -300.48211669921875, + "loss": 0.5144, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02959282696247101, + "rewards/margins": 0.6538323163986206, + "rewards/rejected": -0.624239444732666, "step": 400 }, { "epoch": 0.2, - "eval_logits/chosen": -2.946734666824341, - "eval_logits/rejected": -2.938981294631958, - "eval_logps/chosen": -308.435791015625, - "eval_logps/rejected": -273.5282897949219, - "eval_loss": 0.5902574062347412, - "eval_rewards/accuracies": 0.716911792755127, - "eval_rewards/chosen": 0.6425240635871887, - "eval_rewards/margins": 1.0420384407043457, - "eval_rewards/rejected": -0.399514377117157, - "eval_runtime": 304.3903, - "eval_samples_per_second": 7.136, - "eval_steps_per_second": 0.447, + "eval_logits/chosen": -2.707245111465454, + "eval_logits/rejected": -2.788961887359619, + "eval_logps/chosen": -277.64111328125, + "eval_logps/rejected": -265.73919677734375, + "eval_loss": 0.5635179281234741, + "eval_rewards/accuracies": 0.7347328066825867, + "eval_rewards/chosen": 0.06093722581863403, + "eval_rewards/margins": 0.821345865726471, + "eval_rewards/rejected": -0.7604085803031921, + "eval_runtime": 302.2004, + "eval_samples_per_second": 6.903, + "eval_steps_per_second": 0.433, "step": 400 }, { - "epoch": 0.2, - "learning_rate": 3.333333333333333e-07, - "logits/chosen": -2.9667162895202637, - "logits/rejected": -2.9437496662139893, - "logps/chosen": -372.40716552734375, - "logps/rejected": -285.8179931640625, - "loss": 0.6008, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 1.117506980895996, - "rewards/margins": 1.2093473672866821, - "rewards/rejected": -0.09184026718139648, + "epoch": 0.21, + "learning_rate": 3.4745762711864405e-07, + "logits/chosen": -2.732656478881836, + "logits/rejected": -2.776484727859497, + "logps/chosen": -272.28106689453125, + "logps/rejected": -281.3340148925781, + "loss": 0.6293, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03277791664004326, + "rewards/margins": 0.327867716550827, + "rewards/rejected": -0.3606456220149994, "step": 410 }, { "epoch": 0.21, - "learning_rate": 3.4146341463414634e-07, - "logits/chosen": -2.9850645065307617, - "logits/rejected": -2.9572761058807373, - "logps/chosen": -300.002197265625, - "logps/rejected": -216.95358276367188, - "loss": 0.5165, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.46451878547668457, - "rewards/margins": 0.8798694610595703, - "rewards/rejected": -0.4153507351875305, + "learning_rate": 3.559322033898305e-07, + "logits/chosen": -2.6254220008850098, + "logits/rejected": -2.658902645111084, + "logps/chosen": -308.8375549316406, + "logps/rejected": -248.82421875, + "loss": 0.505, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.012793347239494324, + "rewards/margins": 0.5741627812385559, + "rewards/rejected": -0.5613693594932556, "step": 420 }, { - "epoch": 0.21, - "learning_rate": 3.4959349593495933e-07, - "logits/chosen": -3.0136630535125732, - "logits/rejected": -3.01253080368042, - "logps/chosen": -305.3328857421875, - "logps/rejected": -250.8460693359375, - "loss": 0.6166, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.18597328662872314, - "rewards/margins": 0.9783879518508911, - "rewards/rejected": -0.7924147248268127, + "epoch": 0.22, + "learning_rate": 3.644067796610169e-07, + "logits/chosen": -2.6374409198760986, + "logits/rejected": -2.6688437461853027, + "logps/chosen": -308.5555419921875, + "logps/rejected": -277.16455078125, + "loss": 0.5239, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.011289214715361595, + "rewards/margins": 0.9072087407112122, + "rewards/rejected": -0.8959195017814636, "step": 430 }, { - "epoch": 0.21, - "learning_rate": 3.5772357723577237e-07, - "logits/chosen": -3.0371696949005127, - "logits/rejected": -3.0199389457702637, - "logps/chosen": -319.9723815917969, - "logps/rejected": -233.076904296875, - "loss": 0.6486, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.2505221962928772, - "rewards/margins": 1.085371732711792, - "rewards/rejected": -0.8348496556282043, + "epoch": 0.22, + "learning_rate": 3.7288135593220336e-07, + "logits/chosen": -2.6448655128479004, + "logits/rejected": -2.6344261169433594, + "logps/chosen": -272.939697265625, + "logps/rejected": -269.31878662109375, + "loss": 0.5084, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.14245155453681946, + "rewards/margins": 0.9787721633911133, + "rewards/rejected": -0.836320698261261, "step": 440 }, { - "epoch": 0.22, - "learning_rate": 3.6585365853658536e-07, - "logits/chosen": -3.085334062576294, - "logits/rejected": -3.0350608825683594, - "logps/chosen": -336.40185546875, - "logps/rejected": -245.24551391601562, - "loss": 0.5743, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.444705069065094, - "rewards/margins": 1.4889802932739258, - "rewards/rejected": -1.0442752838134766, + "epoch": 0.23, + "learning_rate": 3.813559322033898e-07, + "logits/chosen": -2.798731803894043, + "logits/rejected": -2.735830783843994, + "logps/chosen": -309.8362731933594, + "logps/rejected": -275.83062744140625, + "loss": 0.5688, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.0009150445694103837, + "rewards/margins": 0.4768896996974945, + "rewards/rejected": -0.4759747087955475, "step": 450 }, { - "epoch": 0.22, - "learning_rate": 3.7398373983739835e-07, - "logits/chosen": -3.053133964538574, - "logits/rejected": -3.0543315410614014, - "logps/chosen": -290.3265075683594, - "logps/rejected": -232.86215209960938, - "loss": 0.5622, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.32177478075027466, - "rewards/margins": 0.9794939756393433, - "rewards/rejected": -0.6577191352844238, + "epoch": 0.23, + "learning_rate": 3.898305084745763e-07, + "logits/chosen": -2.6862215995788574, + "logits/rejected": -2.603532552719116, + "logps/chosen": -251.1598663330078, + "logps/rejected": -270.99981689453125, + "loss": 0.4916, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07209299504756927, + "rewards/margins": 1.0653936862945557, + "rewards/rejected": -0.9933006167411804, "step": 460 }, { - "epoch": 0.23, - "learning_rate": 3.821138211382114e-07, - "logits/chosen": -3.014402389526367, - "logits/rejected": -3.028357982635498, - "logps/chosen": -314.81500244140625, - "logps/rejected": -234.4297332763672, - "loss": 0.6041, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.257953017950058, - "rewards/margins": 0.7365555167198181, - "rewards/rejected": -0.47860246896743774, + "epoch": 0.24, + "learning_rate": 3.9830508474576267e-07, + "logits/chosen": -2.7343802452087402, + "logits/rejected": -2.735741138458252, + "logps/chosen": -325.2496643066406, + "logps/rejected": -292.3957214355469, + "loss": 0.5757, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.38839584589004517, + "rewards/margins": 0.835391640663147, + "rewards/rejected": -0.4469958245754242, "step": 470 }, { - "epoch": 0.23, - "learning_rate": 3.902439024390244e-07, - "logits/chosen": -2.9896554946899414, - "logits/rejected": -3.035231828689575, - "logps/chosen": -290.2040710449219, - "logps/rejected": -290.58843994140625, - "loss": 0.5618, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.35373765230178833, - "rewards/margins": 1.0421311855316162, - "rewards/rejected": -0.6883936524391174, + "epoch": 0.24, + "learning_rate": 4.0677966101694916e-07, + "logits/chosen": -2.7187466621398926, + "logits/rejected": -2.738949775695801, + "logps/chosen": -290.6060485839844, + "logps/rejected": -292.77557373046875, + "loss": 0.5272, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.18653082847595215, + "rewards/margins": 0.8780761957168579, + "rewards/rejected": -0.691545307636261, "step": 480 }, { - "epoch": 0.24, - "learning_rate": 3.9837398373983736e-07, - "logits/chosen": -2.9962782859802246, - "logits/rejected": -2.9962570667266846, - "logps/chosen": -337.5731506347656, - "logps/rejected": -263.51129150390625, - "loss": 0.5345, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.3453401029109955, - "rewards/margins": 0.9873741865158081, - "rewards/rejected": -0.6420340538024902, + "epoch": 0.25, + "learning_rate": 4.152542372881356e-07, + "logits/chosen": -2.6602752208709717, + "logits/rejected": -2.580003261566162, + "logps/chosen": -338.55584716796875, + "logps/rejected": -300.44573974609375, + "loss": 0.5326, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.28633373975753784, + "rewards/margins": 1.0145152807235718, + "rewards/rejected": -0.7281816005706787, "step": 490 }, { - "epoch": 0.24, - "learning_rate": 4.065040650406504e-07, - "logits/chosen": -3.0211968421936035, - "logits/rejected": -3.0214200019836426, - "logps/chosen": -285.29351806640625, - "logps/rejected": -275.0813293457031, - "loss": 0.5532, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.20648851990699768, - "rewards/margins": 0.6384798288345337, - "rewards/rejected": -0.4319912791252136, + "epoch": 0.25, + "learning_rate": 4.23728813559322e-07, + "logits/chosen": -2.706733226776123, + "logits/rejected": -2.6993355751037598, + "logps/chosen": -300.39520263671875, + "logps/rejected": -249.0484619140625, + "loss": 0.5399, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.1879514753818512, + "rewards/margins": 0.7782592177391052, + "rewards/rejected": -0.5903077721595764, "step": 500 }, { - "epoch": 0.24, - "eval_logits/chosen": -2.987318277359009, - "eval_logits/rejected": -2.9778709411621094, - "eval_logps/chosen": -314.289306640625, - "eval_logps/rejected": -278.6020202636719, - "eval_loss": 0.590000569820404, - "eval_rewards/accuracies": 0.7132353186607361, - "eval_rewards/chosen": 0.05716845765709877, - "eval_rewards/margins": 0.964056670665741, - "eval_rewards/rejected": -0.9068882465362549, - "eval_runtime": 304.2955, - "eval_samples_per_second": 7.138, - "eval_steps_per_second": 0.447, + "epoch": 0.25, + "eval_logits/chosen": -2.756488084793091, + "eval_logits/rejected": -2.837228775024414, + "eval_logps/chosen": -277.9347229003906, + "eval_logps/rejected": -268.0408630371094, + "eval_loss": 0.5393199324607849, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": 0.031575243920087814, + "eval_rewards/margins": 1.0221443176269531, + "eval_rewards/rejected": -0.9905692338943481, + "eval_runtime": 296.8577, + "eval_samples_per_second": 7.027, + "eval_steps_per_second": 0.441, "step": 500 }, { - "epoch": 0.25, - "learning_rate": 4.146341463414634e-07, - "logits/chosen": -2.9822144508361816, - "logits/rejected": -3.041672468185425, - "logps/chosen": -390.3653564453125, - "logps/rejected": -316.1328125, - "loss": 0.543, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.6455942392349243, - "rewards/margins": 1.1132816076278687, - "rewards/rejected": -0.4676872789859772, + "epoch": 0.26, + "learning_rate": 4.322033898305085e-07, + "logits/chosen": -2.565380573272705, + "logits/rejected": -2.54228138923645, + "logps/chosen": -252.8906707763672, + "logps/rejected": -253.0413360595703, + "loss": 0.5438, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.370120108127594, + "rewards/margins": 0.541715681552887, + "rewards/rejected": -0.9118358492851257, "step": 510 }, { - "epoch": 0.25, - "learning_rate": 4.2276422764227643e-07, - "logits/chosen": -3.056018114089966, - "logits/rejected": -3.000232696533203, - "logps/chosen": -305.2054138183594, - "logps/rejected": -275.71624755859375, - "loss": 0.5752, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.15846194326877594, - "rewards/margins": 1.0086904764175415, - "rewards/rejected": -0.8502284288406372, + "epoch": 0.26, + "learning_rate": 4.4067796610169486e-07, + "logits/chosen": -2.683913469314575, + "logits/rejected": -2.61004900932312, + "logps/chosen": -247.6575164794922, + "logps/rejected": -258.51666259765625, + "loss": 0.5493, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.16846349835395813, + "rewards/margins": 0.47411054372787476, + "rewards/rejected": -0.6425740122795105, "step": 520 }, { - "epoch": 0.26, - "learning_rate": 4.308943089430894e-07, - "logits/chosen": -3.0811100006103516, - "logits/rejected": -3.072749137878418, - "logps/chosen": -336.23236083984375, - "logps/rejected": -250.52572631835938, - "loss": 0.507, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.2015364170074463, - "rewards/margins": 1.3479152917861938, - "rewards/rejected": -1.1463788747787476, + "epoch": 0.27, + "learning_rate": 4.4915254237288135e-07, + "logits/chosen": -2.7208123207092285, + "logits/rejected": -2.671483039855957, + "logps/chosen": -292.650390625, + "logps/rejected": -272.16558837890625, + "loss": 0.5554, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.020361602306365967, + "rewards/margins": 0.8109226226806641, + "rewards/rejected": -0.7905610203742981, "step": 530 }, { - "epoch": 0.26, - "learning_rate": 4.390243902439024e-07, - "logits/chosen": -3.073317766189575, - "logits/rejected": -3.068291664123535, - "logps/chosen": -299.08441162109375, - "logps/rejected": -242.8533172607422, - "loss": 0.6551, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.1226528137922287, - "rewards/margins": 0.9907039403915405, - "rewards/rejected": -0.8680510520935059, + "epoch": 0.27, + "learning_rate": 4.576271186440678e-07, + "logits/chosen": -2.70005202293396, + "logits/rejected": -2.7322866916656494, + "logps/chosen": -282.77288818359375, + "logps/rejected": -254.0004119873047, + "loss": 0.5894, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.16902032494544983, + "rewards/margins": 0.8821357488632202, + "rewards/rejected": -0.7131155133247375, "step": 540 }, { - "epoch": 0.27, - "learning_rate": 4.471544715447154e-07, - "logits/chosen": -2.974269390106201, - "logits/rejected": -2.963493824005127, - "logps/chosen": -358.9617614746094, - "logps/rejected": -275.00335693359375, - "loss": 0.5873, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.11294855177402496, - "rewards/margins": 0.9972805976867676, - "rewards/rejected": -0.8843320608139038, + "epoch": 0.28, + "learning_rate": 4.661016949152542e-07, + "logits/chosen": -2.5971081256866455, + "logits/rejected": -2.5216164588928223, + "logps/chosen": -305.2200012207031, + "logps/rejected": -298.790283203125, + "loss": 0.7142, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7713154554367065, + "rewards/margins": 0.3943839967250824, + "rewards/rejected": -1.1656994819641113, "step": 550 }, { - "epoch": 0.27, - "learning_rate": 4.5528455284552844e-07, - "logits/chosen": -2.9791064262390137, - "logits/rejected": -3.0353074073791504, - "logps/chosen": -327.1454162597656, - "logps/rejected": -310.22369384765625, - "loss": 0.5888, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.021872159093618393, - "rewards/margins": 1.5735278129577637, - "rewards/rejected": -1.5516555309295654, + "epoch": 0.28, + "learning_rate": 4.7457627118644066e-07, + "logits/chosen": -2.633862257003784, + "logits/rejected": -2.6472296714782715, + "logps/chosen": -293.66778564453125, + "logps/rejected": -248.5172576904297, + "loss": 0.621, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.04602837562561035, + "rewards/margins": 0.9052542448043823, + "rewards/rejected": -0.9512826800346375, "step": 560 }, { - "epoch": 0.28, - "learning_rate": 4.634146341463415e-07, - "logits/chosen": -2.982809543609619, - "logits/rejected": -3.021970272064209, - "logps/chosen": -314.0521240234375, - "logps/rejected": -291.07086181640625, - "loss": 0.5367, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.2853181064128876, - "rewards/margins": 1.3083168268203735, - "rewards/rejected": -1.0229986906051636, + "epoch": 0.29, + "learning_rate": 4.830508474576271e-07, + "logits/chosen": -2.578864574432373, + "logits/rejected": -2.7244362831115723, + "logps/chosen": -272.5432434082031, + "logps/rejected": -249.15902709960938, + "loss": 0.5396, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0642789900302887, + "rewards/margins": 0.9198991656303406, + "rewards/rejected": -0.9841780662536621, "step": 570 }, { - "epoch": 0.28, - "learning_rate": 4.7154471544715447e-07, - "logits/chosen": -3.023433208465576, - "logits/rejected": -3.064955711364746, - "logps/chosen": -306.31036376953125, - "logps/rejected": -249.7621612548828, - "loss": 0.6933, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.40745028853416443, - "rewards/margins": 1.0428441762924194, - "rewards/rejected": -0.6353938579559326, + "epoch": 0.3, + "learning_rate": 4.915254237288136e-07, + "logits/chosen": -2.6675069332122803, + "logits/rejected": -2.7055740356445312, + "logps/chosen": -309.1647033691406, + "logps/rejected": -292.64801025390625, + "loss": 0.6726, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.008704179897904396, + "rewards/margins": 0.8699982762336731, + "rewards/rejected": -0.8612940907478333, "step": 580 }, { - "epoch": 0.29, - "learning_rate": 4.796747967479675e-07, - "logits/chosen": -2.908500909805298, - "logits/rejected": -2.8887341022491455, - "logps/chosen": -352.4559631347656, - "logps/rejected": -229.1545867919922, - "loss": 0.9314, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2239396572113037, - "rewards/margins": -0.13709083199501038, - "rewards/rejected": -1.0868487358093262, + "epoch": 0.3, + "learning_rate": 5e-07, + "logits/chosen": -2.6913936138153076, + "logits/rejected": -2.5511505603790283, + "logps/chosen": -213.75790405273438, + "logps/rejected": -269.3441467285156, + "loss": 0.4877, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.07794573903083801, + "rewards/margins": 1.021460771560669, + "rewards/rejected": -0.9435150027275085, "step": 590 }, { - "epoch": 0.29, - "learning_rate": 4.878048780487804e-07, - "logits/chosen": -2.9681689739227295, - "logits/rejected": -3.0342695713043213, - "logps/chosen": -311.7699279785156, - "logps/rejected": -291.0084533691406, - "loss": 0.8947, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.07403398305177689, - "rewards/margins": 1.2035658359527588, - "rewards/rejected": -1.1295318603515625, + "epoch": 0.31, + "learning_rate": 4.99058025621703e-07, + "logits/chosen": -2.743154287338257, + "logits/rejected": -2.752743721008301, + "logps/chosen": -271.9192199707031, + "logps/rejected": -263.9624938964844, + "loss": 0.5776, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.21212521195411682, + "rewards/margins": 0.8748341798782349, + "rewards/rejected": -0.6627089977264404, "step": 600 }, { - "epoch": 0.29, - "eval_logits/chosen": -2.964571237564087, - "eval_logits/rejected": -2.945216178894043, - "eval_logps/chosen": -316.171875, - "eval_logps/rejected": -279.5130615234375, - "eval_loss": 0.6949825882911682, - "eval_rewards/accuracies": 0.7040441036224365, - "eval_rewards/chosen": -0.13108883798122406, - "eval_rewards/margins": 0.866904079914093, - "eval_rewards/rejected": -0.9979929327964783, - "eval_runtime": 304.2974, - "eval_samples_per_second": 7.138, - "eval_steps_per_second": 0.447, + "epoch": 0.31, + "eval_logits/chosen": -2.7569472789764404, + "eval_logits/rejected": -2.838770627975464, + "eval_logps/chosen": -277.82574462890625, + "eval_logps/rejected": -267.9344787597656, + "eval_loss": 0.5705651640892029, + "eval_rewards/accuracies": 0.7423664331436157, + "eval_rewards/chosen": 0.04247231036424637, + "eval_rewards/margins": 1.022408366203308, + "eval_rewards/rejected": -0.9799360632896423, + "eval_runtime": 301.417, + "eval_samples_per_second": 6.921, + "eval_steps_per_second": 0.435, "step": 600 }, { - "epoch": 0.3, - "learning_rate": 4.959349593495934e-07, - "logits/chosen": -2.95692777633667, - "logits/rejected": -2.9698498249053955, - "logps/chosen": -354.22198486328125, - "logps/rejected": -265.5474548339844, - "loss": 0.5566, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.260162353515625, - "rewards/margins": 0.9518325924873352, - "rewards/rejected": -0.691670298576355, + "epoch": 0.31, + "learning_rate": 4.981160512434062e-07, + "logits/chosen": -2.7087044715881348, + "logits/rejected": -2.7249045372009277, + "logps/chosen": -249.1181640625, + "logps/rejected": -284.8170471191406, + "loss": 0.5501, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.24952204525470734, + "rewards/margins": 1.0174764394760132, + "rewards/rejected": -0.7679542899131775, "step": 610 }, { - "epoch": 0.3, - "learning_rate": 4.995475931958016e-07, - "logits/chosen": -3.0378897190093994, - "logits/rejected": -3.0286765098571777, - "logps/chosen": -365.04278564453125, - "logps/rejected": -276.15667724609375, - "loss": 0.5523, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.3607785999774933, - "rewards/margins": 1.2360095977783203, - "rewards/rejected": -0.8752309679985046, + "epoch": 0.32, + "learning_rate": 4.971740768651092e-07, + "logits/chosen": -2.5978667736053467, + "logits/rejected": -2.5184450149536133, + "logps/chosen": -280.148193359375, + "logps/rejected": -278.3171691894531, + "loss": 0.5401, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.10301868617534637, + "rewards/margins": 0.9203518629074097, + "rewards/rejected": -0.8173332214355469, "step": 620 }, { - "epoch": 0.31, - "learning_rate": 4.98642779587405e-07, - "logits/chosen": -3.0086443424224854, - "logits/rejected": -3.033050060272217, - "logps/chosen": -285.6788635253906, - "logps/rejected": -288.28778076171875, - "loss": 0.5964, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.1263740360736847, - "rewards/margins": 1.1808249950408936, - "rewards/rejected": -1.0544511079788208, + "epoch": 0.32, + "learning_rate": 4.962321024868124e-07, + "logits/chosen": -2.585350275039673, + "logits/rejected": -2.6179752349853516, + "logps/chosen": -252.47607421875, + "logps/rejected": -227.0454864501953, + "loss": 0.4839, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.11939480155706406, + "rewards/margins": 0.9655391573905945, + "rewards/rejected": -0.8461443185806274, "step": 630 }, { - "epoch": 0.31, - "learning_rate": 4.977379659790083e-07, - "logits/chosen": -3.112023115158081, - "logits/rejected": -3.081568717956543, - "logps/chosen": -355.56597900390625, - "logps/rejected": -295.56585693359375, - "loss": 0.5871, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.31240302324295044, - "rewards/margins": 1.1467087268829346, - "rewards/rejected": -0.8343055844306946, + "epoch": 0.33, + "learning_rate": 4.952901281085154e-07, + "logits/chosen": -2.6374013423919678, + "logits/rejected": -2.614624500274658, + "logps/chosen": -250.5303192138672, + "logps/rejected": -274.0657653808594, + "loss": 0.6261, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0875898078083992, + "rewards/margins": 0.7766585350036621, + "rewards/rejected": -0.6890687346458435, "step": 640 }, { - "epoch": 0.32, - "learning_rate": 4.968331523706117e-07, - "logits/chosen": -3.079944610595703, - "logits/rejected": -3.067242383956909, - "logps/chosen": -344.10662841796875, - "logps/rejected": -291.33404541015625, - "loss": 0.9637, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.22513656318187714, - "rewards/margins": 1.366295576095581, - "rewards/rejected": -1.1411590576171875, + "epoch": 0.33, + "learning_rate": 4.943481537302186e-07, + "logits/chosen": -2.6180291175842285, + "logits/rejected": -2.727692127227783, + "logps/chosen": -236.42813110351562, + "logps/rejected": -239.7740020751953, + "loss": 0.5047, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.21162250638008118, + "rewards/margins": 0.9807373285293579, + "rewards/rejected": -0.7691147923469543, "step": 650 }, { - "epoch": 0.32, - "learning_rate": 4.95928338762215e-07, - "logits/chosen": -3.0787405967712402, - "logits/rejected": -3.158884286880493, - "logps/chosen": -313.45367431640625, - "logps/rejected": -263.2179870605469, - "loss": 0.6235, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.3742740750312805, - "rewards/margins": 0.9393173456192017, - "rewards/rejected": -0.5650432705879211, + "epoch": 0.34, + "learning_rate": 4.934061793519216e-07, + "logits/chosen": -2.655010938644409, + "logits/rejected": -2.6872143745422363, + "logps/chosen": -299.6675109863281, + "logps/rejected": -306.3320007324219, + "loss": 0.584, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.09119521826505661, + "rewards/margins": 0.5702224969863892, + "rewards/rejected": -0.6614176630973816, "step": 660 }, { - "epoch": 0.33, - "learning_rate": 4.950235251538182e-07, - "logits/chosen": -3.075279951095581, - "logits/rejected": -3.0069966316223145, - "logps/chosen": -292.4249572753906, - "logps/rejected": -259.87994384765625, - "loss": 0.8259, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.40961867570877075, - "rewards/margins": 1.5581610202789307, - "rewards/rejected": -1.1485425233840942, + "epoch": 0.34, + "learning_rate": 4.924642049736247e-07, + "logits/chosen": -2.6529808044433594, + "logits/rejected": -2.582202434539795, + "logps/chosen": -261.41754150390625, + "logps/rejected": -244.1402130126953, + "loss": 0.5348, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.11745382845401764, + "rewards/margins": 1.0263164043426514, + "rewards/rejected": -0.9088624715805054, "step": 670 }, { - "epoch": 0.33, - "learning_rate": 4.941187115454216e-07, - "logits/chosen": -3.135587215423584, - "logits/rejected": -3.100541114807129, - "logps/chosen": -387.0320739746094, - "logps/rejected": -307.3134460449219, - "loss": 0.5294, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.6613325476646423, - "rewards/margins": 1.5946954488754272, - "rewards/rejected": -0.9333627820014954, + "epoch": 0.35, + "learning_rate": 4.915222305953277e-07, + "logits/chosen": -2.6894426345825195, + "logits/rejected": -2.5927743911743164, + "logps/chosen": -272.3468933105469, + "logps/rejected": -259.9945068359375, + "loss": 0.5105, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.04899612069129944, + "rewards/margins": 1.1350120306015015, + "rewards/rejected": -1.0860159397125244, "step": 680 }, { - "epoch": 0.34, - "learning_rate": 4.932138979370249e-07, - "logits/chosen": -3.0810999870300293, - "logits/rejected": -3.0727949142456055, - "logps/chosen": -351.731689453125, - "logps/rejected": -247.8231658935547, - "loss": 0.5703, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.3237338066101074, - "rewards/margins": 1.4943344593048096, - "rewards/rejected": -1.1706006526947021, + "epoch": 0.35, + "learning_rate": 4.905802562170309e-07, + "logits/chosen": -2.5538647174835205, + "logits/rejected": -2.6060104370117188, + "logps/chosen": -294.0184020996094, + "logps/rejected": -293.9318542480469, + "loss": 0.6963, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.24619393050670624, + "rewards/margins": 0.5672358274459839, + "rewards/rejected": -0.8134298324584961, "step": 690 }, { - "epoch": 0.34, - "learning_rate": 4.923090843286283e-07, - "logits/chosen": -3.004992961883545, - "logits/rejected": -3.0005528926849365, - "logps/chosen": -377.6859436035156, - "logps/rejected": -289.76702880859375, - "loss": 0.6725, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.13520386815071106, - "rewards/margins": 0.5780080556869507, - "rewards/rejected": -0.44280415773391724, + "epoch": 0.36, + "learning_rate": 4.896382818387339e-07, + "logits/chosen": -2.6548666954040527, + "logits/rejected": -2.6277880668640137, + "logps/chosen": -268.58563232421875, + "logps/rejected": -226.7079315185547, + "loss": 0.5834, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06387074291706085, + "rewards/margins": 0.5434452295303345, + "rewards/rejected": -0.6073160171508789, "step": 700 }, { - "epoch": 0.34, - "eval_logits/chosen": -3.060330867767334, - "eval_logits/rejected": -3.0431385040283203, - "eval_logps/chosen": -310.595703125, - "eval_logps/rejected": -277.651611328125, - "eval_loss": 0.6230311393737793, - "eval_rewards/accuracies": 0.6930146813392639, - "eval_rewards/chosen": 0.4265329837799072, - "eval_rewards/margins": 1.238381028175354, - "eval_rewards/rejected": -0.8118480443954468, - "eval_runtime": 304.6483, - "eval_samples_per_second": 7.13, - "eval_steps_per_second": 0.446, + "epoch": 0.36, + "eval_logits/chosen": -2.6940970420837402, + "eval_logits/rejected": -2.7830452919006348, + "eval_logps/chosen": -277.7963562011719, + "eval_logps/rejected": -268.3512878417969, + "eval_loss": 0.5596233010292053, + "eval_rewards/accuracies": 0.7423664331436157, + "eval_rewards/chosen": 0.04541310295462608, + "eval_rewards/margins": 1.067026972770691, + "eval_rewards/rejected": -1.0216139554977417, + "eval_runtime": 297.2834, + "eval_samples_per_second": 7.017, + "eval_steps_per_second": 0.441, "step": 700 }, { - "epoch": 0.35, - "learning_rate": 4.914042707202316e-07, - "logits/chosen": -3.0114569664001465, - "logits/rejected": -3.0635268688201904, - "logps/chosen": -293.4792785644531, - "logps/rejected": -281.75726318359375, - "loss": 0.6315, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.4091672897338867, - "rewards/margins": 1.5117290019989014, - "rewards/rejected": -1.1025617122650146, + "epoch": 0.36, + "learning_rate": 4.886963074604371e-07, + "logits/chosen": -2.6284475326538086, + "logits/rejected": -2.547412872314453, + "logps/chosen": -270.00872802734375, + "logps/rejected": -266.3251953125, + "loss": 0.6424, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.025370970368385315, + "rewards/margins": 0.7029758095741272, + "rewards/rejected": -0.7283468842506409, "step": 710 }, { - "epoch": 0.35, - "learning_rate": 4.90499457111835e-07, - "logits/chosen": -3.1278040409088135, - "logits/rejected": -3.1145219802856445, - "logps/chosen": -330.97161865234375, - "logps/rejected": -270.5916442871094, - "loss": 0.6825, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.37046709656715393, - "rewards/margins": 1.2762682437896729, - "rewards/rejected": -0.905800998210907, + "epoch": 0.37, + "learning_rate": 4.877543330821401e-07, + "logits/chosen": -2.6589772701263428, + "logits/rejected": -2.6408326625823975, + "logps/chosen": -328.72137451171875, + "logps/rejected": -260.0521240234375, + "loss": 0.5091, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11194495111703873, + "rewards/margins": 0.8934643864631653, + "rewards/rejected": -1.0054093599319458, "step": 720 }, { - "epoch": 0.36, - "learning_rate": 4.895946435034383e-07, - "logits/chosen": -3.0648272037506104, - "logits/rejected": -3.0368688106536865, - "logps/chosen": -302.07940673828125, - "logps/rejected": -252.5207061767578, - "loss": 0.6246, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.11373963207006454, - "rewards/margins": 1.2480661869049072, - "rewards/rejected": -1.36180579662323, - "step": 730 + "epoch": 0.37, + "learning_rate": 4.868123587038433e-07, + "logits/chosen": -2.526742935180664, + "logits/rejected": -2.568509578704834, + "logps/chosen": -261.3857727050781, + "logps/rejected": -246.47189331054688, + "loss": 0.7511, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.04114454239606857, + "rewards/margins": 0.9449528455734253, + "rewards/rejected": -0.9038082957267761, + "step": 730 }, { - "epoch": 0.36, - "learning_rate": 4.886898298950416e-07, - "logits/chosen": -3.0961246490478516, - "logits/rejected": -3.0534000396728516, - "logps/chosen": -314.1170654296875, - "logps/rejected": -262.7082824707031, - "loss": 0.6299, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.42755192518234253, - "rewards/margins": 1.4352774620056152, - "rewards/rejected": -1.007725477218628, + "epoch": 0.38, + "learning_rate": 4.858703843255463e-07, + "logits/chosen": -2.4164764881134033, + "logits/rejected": -2.587960720062256, + "logps/chosen": -263.1320495605469, + "logps/rejected": -233.99642944335938, + "loss": 0.5897, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.08096225559711456, + "rewards/margins": 1.0214519500732422, + "rewards/rejected": -0.9404897689819336, "step": 740 }, { - "epoch": 0.37, - "learning_rate": 4.877850162866449e-07, - "logits/chosen": -3.077770709991455, - "logits/rejected": -3.091207981109619, - "logps/chosen": -360.36553955078125, - "logps/rejected": -277.3929748535156, - "loss": 0.6712, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.07998634874820709, - "rewards/margins": 1.0905625820159912, - "rewards/rejected": -1.0105763673782349, + "epoch": 0.38, + "learning_rate": 4.849284099472495e-07, + "logits/chosen": -2.3981845378875732, + "logits/rejected": -2.3214974403381348, + "logps/chosen": -257.8962707519531, + "logps/rejected": -246.64675903320312, + "loss": 0.4864, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08276768028736115, + "rewards/margins": 1.0210144519805908, + "rewards/rejected": -1.103782296180725, "step": 750 }, { - "epoch": 0.37, - "learning_rate": 4.868802026782482e-07, - "logits/chosen": -3.051208257675171, - "logits/rejected": -3.0030460357666016, - "logps/chosen": -287.0205383300781, - "logps/rejected": -224.2216033935547, - "loss": 0.7365, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.22932763397693634, - "rewards/margins": 1.0266369581222534, - "rewards/rejected": -1.2559645175933838, + "epoch": 0.39, + "learning_rate": 4.839864355689525e-07, + "logits/chosen": -2.486299514770508, + "logits/rejected": -2.3817036151885986, + "logps/chosen": -263.58502197265625, + "logps/rejected": -286.1875305175781, + "loss": 0.614, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5518718361854553, + "rewards/margins": 1.387749433517456, + "rewards/rejected": -0.8358775973320007, "step": 760 }, { - "epoch": 0.38, - "learning_rate": 4.859753890698516e-07, - "logits/chosen": -3.1167080402374268, - "logits/rejected": -3.0906012058258057, - "logps/chosen": -315.0155944824219, - "logps/rejected": -328.17828369140625, - "loss": 0.5877, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.005679169204086065, - "rewards/margins": 1.3168147802352905, - "rewards/rejected": -1.3224939107894897, + "epoch": 0.39, + "learning_rate": 4.830444611906556e-07, + "logits/chosen": -2.2975809574127197, + "logits/rejected": -2.5370969772338867, + "logps/chosen": -306.3849182128906, + "logps/rejected": -245.950927734375, + "loss": 0.6453, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12810799479484558, + "rewards/margins": 0.6762959361076355, + "rewards/rejected": -0.8044039011001587, "step": 770 }, { - "epoch": 0.38, - "learning_rate": 4.850705754614549e-07, - "logits/chosen": -3.0969173908233643, - "logits/rejected": -3.098081111907959, - "logps/chosen": -319.8554992675781, - "logps/rejected": -291.92578125, - "loss": 0.5668, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.18538102507591248, - "rewards/margins": 1.6001640558242798, - "rewards/rejected": -1.7855451107025146, + "epoch": 0.4, + "learning_rate": 4.821024868123586e-07, + "logits/chosen": -2.5201289653778076, + "logits/rejected": -2.517611026763916, + "logps/chosen": -254.85104370117188, + "logps/rejected": -244.4443817138672, + "loss": 0.5778, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0704607367515564, + "rewards/margins": 0.6457270383834839, + "rewards/rejected": -0.7161878347396851, "step": 780 }, { - "epoch": 0.39, - "learning_rate": 4.841657618530583e-07, - "logits/chosen": -3.1575210094451904, - "logits/rejected": -3.086078643798828, - "logps/chosen": -312.21575927734375, - "logps/rejected": -295.0933532714844, - "loss": 0.5432, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.31577587127685547, - "rewards/margins": 1.261184573173523, - "rewards/rejected": -0.945408821105957, + "epoch": 0.4, + "learning_rate": 4.811605124340618e-07, + "logits/chosen": -2.531914472579956, + "logits/rejected": -2.509597063064575, + "logps/chosen": -265.1935119628906, + "logps/rejected": -241.33633422851562, + "loss": 0.5434, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.4141203761100769, + "rewards/margins": 1.151557445526123, + "rewards/rejected": -0.7374370098114014, "step": 790 }, { - "epoch": 0.39, - "learning_rate": 4.832609482446615e-07, - "logits/chosen": -3.0612049102783203, - "logits/rejected": -3.0234055519104004, - "logps/chosen": -373.1920166015625, - "logps/rejected": -319.0777587890625, - "loss": 0.5614, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.8912686109542847, - "rewards/margins": 2.0416665077209473, - "rewards/rejected": -1.150397777557373, + "epoch": 0.41, + "learning_rate": 4.802185380557648e-07, + "logits/chosen": -2.439547538757324, + "logits/rejected": -2.4296250343322754, + "logps/chosen": -267.5282287597656, + "logps/rejected": -269.441650390625, + "loss": 0.5394, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.19664326310157776, + "rewards/margins": 1.12895929813385, + "rewards/rejected": -0.9323161244392395, "step": 800 }, { - "epoch": 0.39, - "eval_logits/chosen": -3.070061206817627, - "eval_logits/rejected": -3.0410265922546387, - "eval_logps/chosen": -311.7314147949219, - "eval_logps/rejected": -279.05657958984375, - "eval_loss": 0.6195098757743835, - "eval_rewards/accuracies": 0.7279411554336548, - "eval_rewards/chosen": 0.3129613995552063, - "eval_rewards/margins": 1.265307903289795, - "eval_rewards/rejected": -0.9523463249206543, - "eval_runtime": 304.4577, - "eval_samples_per_second": 7.134, - "eval_steps_per_second": 0.447, + "epoch": 0.41, + "eval_logits/chosen": -2.5479583740234375, + "eval_logits/rejected": -2.631294012069702, + "eval_logps/chosen": -277.4460144042969, + "eval_logps/rejected": -267.67138671875, + "eval_loss": 0.5357747077941895, + "eval_rewards/accuracies": 0.7480915784835815, + "eval_rewards/chosen": 0.08044610917568207, + "eval_rewards/margins": 1.034072995185852, + "eval_rewards/rejected": -0.953626811504364, + "eval_runtime": 302.149, + "eval_samples_per_second": 6.904, + "eval_steps_per_second": 0.434, "step": 800 }, { - "epoch": 0.4, - "learning_rate": 4.823561346362649e-07, - "logits/chosen": -3.059509038925171, - "logits/rejected": -3.0271408557891846, - "logps/chosen": -314.46014404296875, - "logps/rejected": -287.3227844238281, - "loss": 0.7774, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.14575880765914917, - "rewards/margins": 1.0279204845428467, - "rewards/rejected": -1.173679232597351, + "epoch": 0.41, + "learning_rate": 4.79276563677468e-07, + "logits/chosen": -2.4669220447540283, + "logits/rejected": -2.3798508644104004, + "logps/chosen": -313.71148681640625, + "logps/rejected": -296.85711669921875, + "loss": 0.5522, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.39447715878486633, + "rewards/margins": 1.3083837032318115, + "rewards/rejected": -0.9139065742492676, "step": 810 }, { - "epoch": 0.4, - "learning_rate": 4.814513210278682e-07, - "logits/chosen": -3.0621161460876465, - "logits/rejected": -3.0300240516662598, - "logps/chosen": -347.3387145996094, - "logps/rejected": -269.94842529296875, - "loss": 0.5674, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.5827974677085876, - "rewards/margins": 1.5296648740768433, - "rewards/rejected": -0.9468674659729004, + "epoch": 0.42, + "learning_rate": 4.78334589299171e-07, + "logits/chosen": -2.51173734664917, + "logits/rejected": -2.452078342437744, + "logps/chosen": -247.0750732421875, + "logps/rejected": -237.9070281982422, + "loss": 0.5305, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.09000038355588913, + "rewards/margins": 1.039417028427124, + "rewards/rejected": -1.1294173002243042, "step": 820 }, { - "epoch": 0.41, - "learning_rate": 4.805465074194716e-07, - "logits/chosen": -3.016064405441284, - "logits/rejected": -2.9610564708709717, - "logps/chosen": -319.84124755859375, - "logps/rejected": -251.6979522705078, - "loss": 0.6085, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.07994663715362549, - "rewards/margins": 1.03834867477417, - "rewards/rejected": -0.9584019780158997, + "epoch": 0.42, + "learning_rate": 4.773926149208742e-07, + "logits/chosen": -2.6431527137756348, + "logits/rejected": -2.5651488304138184, + "logps/chosen": -293.65728759765625, + "logps/rejected": -268.22406005859375, + "loss": 0.5439, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03514687344431877, + "rewards/margins": 0.8131545782089233, + "rewards/rejected": -0.7780076861381531, "step": 830 }, { - "epoch": 0.41, - "learning_rate": 4.796416938110749e-07, - "logits/chosen": -2.9860081672668457, - "logits/rejected": -2.981600284576416, - "logps/chosen": -354.3580627441406, - "logps/rejected": -271.50213623046875, - "loss": 0.5454, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.6970037817955017, - "rewards/margins": 1.5614551305770874, - "rewards/rejected": -0.8644511103630066, + "epoch": 0.43, + "learning_rate": 4.764506405425772e-07, + "logits/chosen": -2.467764139175415, + "logits/rejected": -2.464174509048462, + "logps/chosen": -257.2640686035156, + "logps/rejected": -247.9285125732422, + "loss": 0.6165, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.22154740989208221, + "rewards/margins": 0.8597299456596375, + "rewards/rejected": -1.0812774896621704, "step": 840 }, { - "epoch": 0.42, - "learning_rate": 4.787368802026782e-07, - "logits/chosen": -2.9741404056549072, - "logits/rejected": -2.9306490421295166, - "logps/chosen": -327.3341369628906, - "logps/rejected": -265.4449157714844, - "loss": 0.6103, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.3073541820049286, - "rewards/margins": 1.5458253622055054, - "rewards/rejected": -1.238471269607544, + "epoch": 0.43, + "learning_rate": 4.755086661642803e-07, + "logits/chosen": -2.502732276916504, + "logits/rejected": -2.6119589805603027, + "logps/chosen": -255.53225708007812, + "logps/rejected": -231.84963989257812, + "loss": 0.5659, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.15040960907936096, + "rewards/margins": 0.9278343915939331, + "rewards/rejected": -1.0782438516616821, "step": 850 }, { - "epoch": 0.42, - "learning_rate": 4.778320665942816e-07, - "logits/chosen": -2.990180492401123, - "logits/rejected": -2.971078395843506, - "logps/chosen": -329.2413024902344, - "logps/rejected": -247.0337677001953, - "loss": 0.6345, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.13333797454833984, - "rewards/margins": 1.2447474002838135, - "rewards/rejected": -1.1114094257354736, + "epoch": 0.44, + "learning_rate": 4.745666917859834e-07, + "logits/chosen": -2.5779833793640137, + "logits/rejected": -2.6192851066589355, + "logps/chosen": -281.83319091796875, + "logps/rejected": -291.05279541015625, + "loss": 0.6056, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6201379299163818, + "rewards/margins": 0.7899220585823059, + "rewards/rejected": -1.4100600481033325, "step": 860 }, { - "epoch": 0.43, - "learning_rate": 4.769272529858848e-07, - "logits/chosen": -2.941981077194214, - "logits/rejected": -2.9229941368103027, - "logps/chosen": -373.42071533203125, - "logps/rejected": -295.7408142089844, - "loss": 0.5682, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.15170541405677795, - "rewards/margins": 0.9506505131721497, - "rewards/rejected": -0.7989450097084045, + "epoch": 0.44, + "learning_rate": 4.736247174076865e-07, + "logits/chosen": -2.4486584663391113, + "logits/rejected": -2.5848748683929443, + "logps/chosen": -343.8517150878906, + "logps/rejected": -315.92327880859375, + "loss": 0.5849, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21137447655200958, + "rewards/margins": 0.9022495150566101, + "rewards/rejected": -1.113624095916748, "step": 870 }, { - "epoch": 0.43, - "learning_rate": 4.7602243937748823e-07, - "logits/chosen": -2.897479772567749, - "logits/rejected": -2.9384448528289795, - "logps/chosen": -367.583740234375, - "logps/rejected": -243.7733154296875, - "loss": 0.6656, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.09478510916233063, - "rewards/margins": 1.3639883995056152, - "rewards/rejected": -1.2692034244537354, + "epoch": 0.45, + "learning_rate": 4.726827430293896e-07, + "logits/chosen": -2.5461716651916504, + "logits/rejected": -2.6663496494293213, + "logps/chosen": -323.36395263671875, + "logps/rejected": -305.492431640625, + "loss": 0.4929, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.06274759024381638, + "rewards/margins": 0.9509458541870117, + "rewards/rejected": -0.8881982564926147, "step": 880 }, { - "epoch": 0.43, - "learning_rate": 4.751176257690915e-07, - "logits/chosen": -2.8539340496063232, - "logits/rejected": -2.869941473007202, - "logps/chosen": -354.42559814453125, - "logps/rejected": -281.339599609375, - "loss": 0.5512, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.23819153010845184, - "rewards/margins": 1.9900051355361938, - "rewards/rejected": -1.751813292503357, + "epoch": 0.45, + "learning_rate": 4.717407686510927e-07, + "logits/chosen": -2.5778603553771973, + "logits/rejected": -2.5960335731506348, + "logps/chosen": -270.0763854980469, + "logps/rejected": -253.62646484375, + "loss": 0.5639, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.30001965165138245, + "rewards/margins": 1.3010705709457397, + "rewards/rejected": -1.6010901927947998, "step": 890 }, { - "epoch": 0.44, - "learning_rate": 4.7421281216069485e-07, - "logits/chosen": -2.9847445487976074, - "logits/rejected": -2.9310717582702637, - "logps/chosen": -338.1303405761719, - "logps/rejected": -197.42662048339844, - "loss": 0.6018, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.17216387391090393, - "rewards/margins": 1.9233291149139404, - "rewards/rejected": -1.7511651515960693, + "epoch": 0.46, + "learning_rate": 4.707987942727958e-07, + "logits/chosen": -2.4405810832977295, + "logits/rejected": -2.4201414585113525, + "logps/chosen": -269.73748779296875, + "logps/rejected": -268.6572265625, + "loss": 0.5141, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06953033059835434, + "rewards/margins": 1.3686714172363281, + "rewards/rejected": -1.2991411685943604, "step": 900 }, { - "epoch": 0.44, - "eval_logits/chosen": -2.9212145805358887, - "eval_logits/rejected": -2.904815435409546, - "eval_logps/chosen": -312.8752136230469, - "eval_logps/rejected": -280.56695556640625, - "eval_loss": 0.5879461765289307, - "eval_rewards/accuracies": 0.7316176295280457, - "eval_rewards/chosen": 0.19857890903949738, - "eval_rewards/margins": 1.3019603490829468, - "eval_rewards/rejected": -1.1033812761306763, - "eval_runtime": 304.4066, - "eval_samples_per_second": 7.135, - "eval_steps_per_second": 0.447, + "epoch": 0.46, + "eval_logits/chosen": -2.5832109451293945, + "eval_logits/rejected": -2.666200876235962, + "eval_logps/chosen": -280.95458984375, + "eval_logps/rejected": -272.4443664550781, + "eval_loss": 0.541175127029419, + "eval_rewards/accuracies": 0.7442747950553894, + "eval_rewards/chosen": -0.2704112231731415, + "eval_rewards/margins": 1.1605098247528076, + "eval_rewards/rejected": -1.430921196937561, + "eval_runtime": 297.0452, + "eval_samples_per_second": 7.023, + "eval_steps_per_second": 0.441, "step": 900 }, { - "epoch": 0.44, - "learning_rate": 4.733079985522982e-07, - "logits/chosen": -2.97774600982666, - "logits/rejected": -2.9321646690368652, - "logps/chosen": -272.35797119140625, - "logps/rejected": -223.95706176757812, - "loss": 0.5876, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.09260288625955582, - "rewards/margins": 1.4640724658966064, - "rewards/rejected": -1.5566754341125488, + "epoch": 0.46, + "learning_rate": 4.698568198944988e-07, + "logits/chosen": -2.5255184173583984, + "logits/rejected": -2.573775053024292, + "logps/chosen": -300.562255859375, + "logps/rejected": -269.5960998535156, + "loss": 0.6796, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0005705863004550338, + "rewards/margins": 1.1774917840957642, + "rewards/rejected": -1.1769212484359741, "step": 910 }, { - "epoch": 0.45, - "learning_rate": 4.724031849439015e-07, - "logits/chosen": -3.012390613555908, - "logits/rejected": -2.9720027446746826, - "logps/chosen": -360.96478271484375, - "logps/rejected": -272.2576599121094, - "loss": 0.5219, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.6630409359931946, - "rewards/margins": 1.956491470336914, - "rewards/rejected": -1.2934505939483643, + "epoch": 0.47, + "learning_rate": 4.689148455162019e-07, + "logits/chosen": -2.4337317943573, + "logits/rejected": -2.431641101837158, + "logps/chosen": -230.990234375, + "logps/rejected": -234.8484649658203, + "loss": 0.5672, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.11962993443012238, + "rewards/margins": 1.2908594608306885, + "rewards/rejected": -1.4104894399642944, "step": 920 }, { - "epoch": 0.45, - "learning_rate": 4.7149837133550485e-07, - "logits/chosen": -3.046267032623291, - "logits/rejected": -3.033228635787964, - "logps/chosen": -318.56878662109375, - "logps/rejected": -238.29006958007812, - "loss": 0.6678, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.07137431204319, - "rewards/margins": 1.403503179550171, - "rewards/rejected": -1.4748774766921997, + "epoch": 0.47, + "learning_rate": 4.67972871137905e-07, + "logits/chosen": -2.48714017868042, + "logits/rejected": -2.444317579269409, + "logps/chosen": -283.33392333984375, + "logps/rejected": -266.0936584472656, + "loss": 0.5425, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.24451906979084015, + "rewards/margins": 0.9864568710327148, + "rewards/rejected": -1.230975866317749, "step": 930 }, { - "epoch": 0.46, - "learning_rate": 4.705935577271082e-07, - "logits/chosen": -2.9177355766296387, - "logits/rejected": -2.8707752227783203, - "logps/chosen": -333.5362243652344, - "logps/rejected": -332.41522216796875, - "loss": 0.6372, + "epoch": 0.48, + "learning_rate": 4.670308967596081e-07, + "logits/chosen": -2.458479166030884, + "logits/rejected": -2.4806995391845703, + "logps/chosen": -281.46820068359375, + "logps/rejected": -315.03167724609375, + "loss": 0.5216, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.3217393755912781, - "rewards/margins": 1.9765946865081787, - "rewards/rejected": -1.6548553705215454, + "rewards/chosen": -0.00037065744982101023, + "rewards/margins": 1.3867638111114502, + "rewards/rejected": -1.387134313583374, "step": 940 }, { - "epoch": 0.46, - "learning_rate": 4.696887441187115e-07, - "logits/chosen": -3.0128414630889893, - "logits/rejected": -2.9988887310028076, - "logps/chosen": -349.4322509765625, - "logps/rejected": -278.3971252441406, - "loss": 0.5903, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.4983639717102051, - "rewards/margins": 1.0931651592254639, - "rewards/rejected": -1.5915292501449585, + "epoch": 0.48, + "learning_rate": 4.660889223813112e-07, + "logits/chosen": -2.488022565841675, + "logits/rejected": -2.544905424118042, + "logps/chosen": -282.5947570800781, + "logps/rejected": -286.3911437988281, + "loss": 0.6466, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15313948690891266, + "rewards/margins": 0.5795355439186096, + "rewards/rejected": -0.7326749563217163, "step": 950 }, { - "epoch": 0.47, - "learning_rate": 4.6878393051031486e-07, - "logits/chosen": -2.9632811546325684, - "logits/rejected": -2.9890968799591064, - "logps/chosen": -329.6170959472656, - "logps/rejected": -313.6700439453125, - "loss": 0.6402, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.2982276380062103, - "rewards/margins": 0.9787166714668274, - "rewards/rejected": -1.2769442796707153, + "epoch": 0.49, + "learning_rate": 4.6514694800301427e-07, + "logits/chosen": -2.4881091117858887, + "logits/rejected": -2.4446139335632324, + "logps/chosen": -298.6130065917969, + "logps/rejected": -246.40298461914062, + "loss": 0.5778, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.10312096774578094, + "rewards/margins": 0.8071810007095337, + "rewards/rejected": -0.7040599584579468, "step": 960 }, { - "epoch": 0.47, - "learning_rate": 4.678791169019182e-07, - "logits/chosen": -2.8722774982452393, - "logits/rejected": -2.9132344722747803, - "logps/chosen": -331.83251953125, - "logps/rejected": -264.09210205078125, - "loss": 0.5922, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.055458225309848785, - "rewards/margins": 1.1451598405838013, - "rewards/rejected": -1.200618028640747, + "epoch": 0.49, + "learning_rate": 4.6420497362471737e-07, + "logits/chosen": -2.3994803428649902, + "logits/rejected": -2.4658679962158203, + "logps/chosen": -275.06878662109375, + "logps/rejected": -252.40475463867188, + "loss": 0.6466, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.259010374546051, + "rewards/margins": 0.378709077835083, + "rewards/rejected": -0.637719452381134, "step": 970 }, { - "epoch": 0.48, - "learning_rate": 4.669743032935215e-07, - "logits/chosen": -3.0100607872009277, - "logits/rejected": -3.011838436126709, - "logps/chosen": -288.69390869140625, - "logps/rejected": -325.84930419921875, - "loss": 0.516, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.22100801765918732, - "rewards/margins": 1.4370050430297852, - "rewards/rejected": -1.658013105392456, + "epoch": 0.5, + "learning_rate": 4.6326299924642047e-07, + "logits/chosen": -2.5499892234802246, + "logits/rejected": -2.589505672454834, + "logps/chosen": -272.2198486328125, + "logps/rejected": -281.07366943359375, + "loss": 0.5701, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08379928767681122, + "rewards/margins": 0.7602310180664062, + "rewards/rejected": -0.8440302610397339, "step": 980 }, { - "epoch": 0.48, - "learning_rate": 4.660694896851248e-07, - "logits/chosen": -3.0754151344299316, - "logits/rejected": -3.099276065826416, - "logps/chosen": -333.66632080078125, - "logps/rejected": -255.5065155029297, - "loss": 0.5377, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.16998998820781708, - "rewards/margins": 1.155113697052002, - "rewards/rejected": -0.9851238131523132, + "epoch": 0.5, + "learning_rate": 4.6232102486812357e-07, + "logits/chosen": -2.4911158084869385, + "logits/rejected": -2.5684397220611572, + "logps/chosen": -263.782958984375, + "logps/rejected": -277.875244140625, + "loss": 0.5929, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.34514790773391724, + "rewards/margins": 1.0390870571136475, + "rewards/rejected": -1.384235143661499, "step": 990 }, { - "epoch": 0.49, - "learning_rate": 4.6516467607672815e-07, - "logits/chosen": -3.1224780082702637, - "logits/rejected": -3.1299662590026855, - "logps/chosen": -297.83477783203125, - "logps/rejected": -274.7762756347656, - "loss": 0.5223, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.08044996112585068, - "rewards/margins": 0.9978864789009094, - "rewards/rejected": -1.078336477279663, + "epoch": 0.51, + "learning_rate": 4.613790504898267e-07, + "logits/chosen": -2.5921616554260254, + "logits/rejected": -2.600898265838623, + "logps/chosen": -298.99871826171875, + "logps/rejected": -312.77105712890625, + "loss": 0.51, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.06152200698852539, + "rewards/margins": 1.18497633934021, + "rewards/rejected": -1.1234543323516846, "step": 1000 }, { - "epoch": 0.49, - "eval_logits/chosen": -3.0239620208740234, - "eval_logits/rejected": -3.015218734741211, - "eval_logps/chosen": -314.8582763671875, - "eval_logps/rejected": -283.39801025390625, - "eval_loss": 0.5429871678352356, - "eval_rewards/accuracies": 0.7463235259056091, - "eval_rewards/chosen": 0.00027220213087275624, - "eval_rewards/margins": 1.3867586851119995, - "eval_rewards/rejected": -1.386486530303955, - "eval_runtime": 304.5498, - "eval_samples_per_second": 7.132, - "eval_steps_per_second": 0.447, + "epoch": 0.51, + "eval_logits/chosen": -2.621718168258667, + "eval_logits/rejected": -2.711763620376587, + "eval_logps/chosen": -280.32061767578125, + "eval_logps/rejected": -272.1781311035156, + "eval_loss": 0.534999668598175, + "eval_rewards/accuracies": 0.7366412281990051, + "eval_rewards/chosen": -0.20701460540294647, + "eval_rewards/margins": 1.197284460067749, + "eval_rewards/rejected": -1.404299259185791, + "eval_runtime": 301.9673, + "eval_samples_per_second": 6.908, + "eval_steps_per_second": 0.434, "step": 1000 }, { - "epoch": 0.49, - "learning_rate": 4.642598624683315e-07, - "logits/chosen": -3.1268115043640137, - "logits/rejected": -3.0537102222442627, - "logps/chosen": -312.20831298828125, - "logps/rejected": -232.6242218017578, - "loss": 0.6198, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.02092791348695755, - "rewards/margins": 1.3375914096832275, - "rewards/rejected": -1.3585193157196045, + "epoch": 0.51, + "learning_rate": 4.604370761115298e-07, + "logits/chosen": -2.5198278427124023, + "logits/rejected": -2.507664203643799, + "logps/chosen": -297.51336669921875, + "logps/rejected": -301.7564392089844, + "loss": 0.5467, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3291333317756653, + "rewards/margins": 1.3960895538330078, + "rewards/rejected": -1.7252228260040283, "step": 1010 }, { - "epoch": 0.5, - "learning_rate": 4.633550488599348e-07, - "logits/chosen": -3.073307514190674, - "logits/rejected": -3.0372867584228516, - "logps/chosen": -309.48297119140625, - "logps/rejected": -248.77993774414062, - "loss": 0.6195, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.34125393629074097, - "rewards/margins": 1.2326934337615967, - "rewards/rejected": -1.5739473104476929, + "epoch": 0.52, + "learning_rate": 4.594951017332329e-07, + "logits/chosen": -2.590463161468506, + "logits/rejected": -2.6638598442077637, + "logps/chosen": -300.2336120605469, + "logps/rejected": -285.6252746582031, + "loss": 0.6876, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8592126965522766, + "rewards/margins": 0.49618273973464966, + "rewards/rejected": -1.3553953170776367, "step": 1020 }, { - "epoch": 0.5, - "learning_rate": 4.6245023525153815e-07, - "logits/chosen": -3.074204206466675, - "logits/rejected": -3.0756869316101074, - "logps/chosen": -319.09637451171875, - "logps/rejected": -261.5218505859375, - "loss": 0.5698, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.04606182500720024, - "rewards/margins": 1.332489252090454, - "rewards/rejected": -1.3785510063171387, + "epoch": 0.52, + "learning_rate": 4.585531273549359e-07, + "logits/chosen": -2.437877893447876, + "logits/rejected": -2.4427528381347656, + "logps/chosen": -236.7029266357422, + "logps/rejected": -259.52545166015625, + "loss": 0.4876, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.11370401084423065, + "rewards/margins": 1.9164615869522095, + "rewards/rejected": -1.8027576208114624, "step": 1030 }, { - "epoch": 0.51, - "learning_rate": 4.615454216431415e-07, - "logits/chosen": -3.11297869682312, - "logits/rejected": -3.094440221786499, - "logps/chosen": -343.981201171875, - "logps/rejected": -295.459716796875, - "loss": 0.6582, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.3284061849117279, - "rewards/margins": 2.171921730041504, - "rewards/rejected": -1.8435159921646118, + "epoch": 0.53, + "learning_rate": 4.5761115297663897e-07, + "logits/chosen": -2.515664577484131, + "logits/rejected": -2.6834750175476074, + "logps/chosen": -266.6720886230469, + "logps/rejected": -255.0941925048828, + "loss": 0.6642, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.22268524765968323, + "rewards/margins": 1.3201810121536255, + "rewards/rejected": -1.5428663492202759, "step": 1040 }, { - "epoch": 0.51, - "learning_rate": 4.606406080347448e-07, - "logits/chosen": -3.0940699577331543, - "logits/rejected": -3.136464834213257, - "logps/chosen": -287.3896789550781, - "logps/rejected": -272.95526123046875, - "loss": 0.5424, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.17697378993034363, - "rewards/margins": 1.5270637273788452, - "rewards/rejected": -1.3500897884368896, + "epoch": 0.53, + "learning_rate": 4.5666917859834207e-07, + "logits/chosen": -2.5942764282226562, + "logits/rejected": -2.568809747695923, + "logps/chosen": -287.2005615234375, + "logps/rejected": -267.20391845703125, + "loss": 0.586, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.07011856138706207, + "rewards/margins": 0.9706190824508667, + "rewards/rejected": -1.0407376289367676, "step": 1050 }, { - "epoch": 0.52, - "learning_rate": 4.5973579442634816e-07, - "logits/chosen": -3.093822479248047, - "logits/rejected": -3.0921084880828857, - "logps/chosen": -321.46893310546875, - "logps/rejected": -269.60205078125, - "loss": 0.5598, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.3498702049255371, - "rewards/margins": 1.5090956687927246, - "rewards/rejected": -1.1592254638671875, + "epoch": 0.54, + "learning_rate": 4.5572720422004517e-07, + "logits/chosen": -2.588561534881592, + "logits/rejected": -2.6585402488708496, + "logps/chosen": -256.96282958984375, + "logps/rejected": -268.2518310546875, + "loss": 0.4639, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.16893135011196136, + "rewards/margins": 1.1751251220703125, + "rewards/rejected": -1.3440563678741455, "step": 1060 }, { - "epoch": 0.52, - "learning_rate": 4.5883098081795144e-07, - "logits/chosen": -3.062215566635132, - "logits/rejected": -3.0369791984558105, - "logps/chosen": -290.6756896972656, - "logps/rejected": -247.19265747070312, - "loss": 0.6714, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.008381253108382225, - "rewards/margins": 1.2689889669418335, - "rewards/rejected": -1.2773703336715698, + "epoch": 0.54, + "learning_rate": 4.5478522984174827e-07, + "logits/chosen": -2.582852840423584, + "logits/rejected": -2.6485960483551025, + "logps/chosen": -263.36187744140625, + "logps/rejected": -249.5991973876953, + "loss": 0.5816, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08605729043483734, + "rewards/margins": 0.928912341594696, + "rewards/rejected": -1.0149695873260498, "step": 1070 }, { - "epoch": 0.53, - "learning_rate": 4.579261672095548e-07, - "logits/chosen": -3.0601210594177246, - "logits/rejected": -3.046158790588379, - "logps/chosen": -300.5389099121094, - "logps/rejected": -262.9156188964844, - "loss": 0.5937, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.12064117193222046, - "rewards/margins": 1.1033830642700195, - "rewards/rejected": -0.9827417135238647, + "epoch": 0.55, + "learning_rate": 4.5384325546345137e-07, + "logits/chosen": -2.650263786315918, + "logits/rejected": -2.5726656913757324, + "logps/chosen": -331.74774169921875, + "logps/rejected": -295.6081237792969, + "loss": 0.4773, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.04656659811735153, + "rewards/margins": 1.4996758699417114, + "rewards/rejected": -1.4531093835830688, "step": 1080 }, { - "epoch": 0.53, - "learning_rate": 4.570213536011581e-07, - "logits/chosen": -2.910738706588745, - "logits/rejected": -2.8942646980285645, - "logps/chosen": -324.2835388183594, - "logps/rejected": -318.1414794921875, - "loss": 0.6592, + "epoch": 0.55, + "learning_rate": 4.5290128108515447e-07, + "logits/chosen": -2.63649582862854, + "logits/rejected": -2.678618907928467, + "logps/chosen": -272.94281005859375, + "logps/rejected": -319.1750183105469, + "loss": 0.6456, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.07634729146957397, - "rewards/margins": 1.1516107320785522, - "rewards/rejected": -1.075263500213623, + "rewards/chosen": -0.41687941551208496, + "rewards/margins": 0.6916464567184448, + "rewards/rejected": -1.1085259914398193, "step": 1090 }, { - "epoch": 0.54, - "learning_rate": 4.5611653999276145e-07, - "logits/chosen": -2.987218141555786, - "logits/rejected": -2.957432985305786, - "logps/chosen": -294.2276306152344, - "logps/rejected": -231.7030487060547, - "loss": 0.5991, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.1616038978099823, - "rewards/margins": 1.035509467124939, - "rewards/rejected": -1.1971133947372437, + "epoch": 0.56, + "learning_rate": 4.5195930670685757e-07, + "logits/chosen": -2.5882906913757324, + "logits/rejected": -2.614518642425537, + "logps/chosen": -250.8640594482422, + "logps/rejected": -250.9251708984375, + "loss": 0.5219, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27617964148521423, + "rewards/margins": 1.033569574356079, + "rewards/rejected": -1.3097491264343262, "step": 1100 }, { - "epoch": 0.54, - "eval_logits/chosen": -2.969435453414917, - "eval_logits/rejected": -2.969165563583374, - "eval_logps/chosen": -311.44122314453125, - "eval_logps/rejected": -281.18414306640625, - "eval_loss": 0.544279158115387, - "eval_rewards/accuracies": 0.7297794222831726, - "eval_rewards/chosen": 0.3419767916202545, - "eval_rewards/margins": 1.507075548171997, - "eval_rewards/rejected": -1.165098786354065, - "eval_runtime": 304.4829, - "eval_samples_per_second": 7.133, - "eval_steps_per_second": 0.447, + "epoch": 0.56, + "eval_logits/chosen": -2.6604645252227783, + "eval_logits/rejected": -2.7450878620147705, + "eval_logps/chosen": -279.9233093261719, + "eval_logps/rejected": -271.287109375, + "eval_loss": 0.5404812097549438, + "eval_rewards/accuracies": 0.7290076613426208, + "eval_rewards/chosen": -0.1672801822423935, + "eval_rewards/margins": 1.1479164361953735, + "eval_rewards/rejected": -1.3151966333389282, + "eval_runtime": 297.089, + "eval_samples_per_second": 7.021, + "eval_steps_per_second": 0.441, "step": 1100 }, { - "epoch": 0.54, - "learning_rate": 4.552117263843648e-07, - "logits/chosen": -2.9179558753967285, - "logits/rejected": -2.93473482131958, - "logps/chosen": -341.87872314453125, - "logps/rejected": -326.6344909667969, - "loss": 0.5756, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.2733408212661743, - "rewards/margins": 1.7402833700180054, - "rewards/rejected": -1.4669427871704102, + "epoch": 0.56, + "learning_rate": 4.5101733232856067e-07, + "logits/chosen": -2.510779619216919, + "logits/rejected": -2.5328545570373535, + "logps/chosen": -327.66082763671875, + "logps/rejected": -246.44662475585938, + "loss": 0.5302, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4910829961299896, + "rewards/margins": 0.9884698987007141, + "rewards/rejected": -1.479552984237671, "step": 1110 }, { - "epoch": 0.55, - "learning_rate": 4.543069127759681e-07, - "logits/chosen": -2.9620749950408936, - "logits/rejected": -2.979710578918457, - "logps/chosen": -282.0982360839844, - "logps/rejected": -250.32168579101562, - "loss": 0.6166, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.06953828036785126, - "rewards/margins": 1.3747735023498535, - "rewards/rejected": -1.3052351474761963, + "epoch": 0.57, + "learning_rate": 4.500753579502637e-07, + "logits/chosen": -2.5775036811828613, + "logits/rejected": -2.5860979557037354, + "logps/chosen": -298.1309509277344, + "logps/rejected": -257.0569763183594, + "loss": 0.5651, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1157577782869339, + "rewards/margins": 1.2654132843017578, + "rewards/rejected": -1.1496555805206299, "step": 1120 }, { - "epoch": 0.55, - "learning_rate": 4.5340209916757145e-07, - "logits/chosen": -3.0658388137817383, - "logits/rejected": -3.064866542816162, - "logps/chosen": -354.0462951660156, - "logps/rejected": -320.5733642578125, - "loss": 0.6413, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.06140047311782837, - "rewards/margins": 1.6267662048339844, - "rewards/rejected": -1.6881663799285889, + "epoch": 0.57, + "learning_rate": 4.491333835719668e-07, + "logits/chosen": -2.624541759490967, + "logits/rejected": -2.579864978790283, + "logps/chosen": -281.26715087890625, + "logps/rejected": -296.16571044921875, + "loss": 0.5049, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.3193248212337494, + "rewards/margins": 0.9984002113342285, + "rewards/rejected": -1.3177250623703003, "step": 1130 }, { - "epoch": 0.56, - "learning_rate": 4.524972855591748e-07, - "logits/chosen": -2.970015048980713, - "logits/rejected": -2.9394404888153076, - "logps/chosen": -316.2723388671875, - "logps/rejected": -294.2318420410156, - "loss": 0.6112, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.34384021162986755, - "rewards/margins": 1.0851294994354248, - "rewards/rejected": -1.4289697408676147, + "epoch": 0.58, + "learning_rate": 4.481914091936699e-07, + "logits/chosen": -2.517090320587158, + "logits/rejected": -2.5506300926208496, + "logps/chosen": -297.8021545410156, + "logps/rejected": -274.5700988769531, + "loss": 0.4954, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.06897372752428055, + "rewards/margins": 1.647580862045288, + "rewards/rejected": -1.7165546417236328, "step": 1140 }, { - "epoch": 0.56, - "learning_rate": 4.5159247195077807e-07, - "logits/chosen": -2.974771022796631, - "logits/rejected": -2.983407497406006, - "logps/chosen": -307.0450134277344, - "logps/rejected": -249.52542114257812, - "loss": 0.5777, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.40662628412246704, - "rewards/margins": 1.2882144451141357, - "rewards/rejected": -1.6948407888412476, + "epoch": 0.58, + "learning_rate": 4.47249434815373e-07, + "logits/chosen": -2.4685566425323486, + "logits/rejected": -2.4525017738342285, + "logps/chosen": -249.1833038330078, + "logps/rejected": -268.045166015625, + "loss": 0.6057, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7198296785354614, + "rewards/margins": 0.7819298505783081, + "rewards/rejected": -1.5017595291137695, "step": 1150 }, { - "epoch": 0.57, - "learning_rate": 4.506876583423814e-07, - "logits/chosen": -2.9282801151275635, - "logits/rejected": -2.9224345684051514, - "logps/chosen": -309.83099365234375, - "logps/rejected": -293.1893615722656, - "loss": 0.6208, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.024507436901330948, - "rewards/margins": 0.8606026768684387, - "rewards/rejected": -0.8851100206375122, + "epoch": 0.59, + "learning_rate": 4.4630746043707607e-07, + "logits/chosen": -2.4799113273620605, + "logits/rejected": -2.4306507110595703, + "logps/chosen": -287.40032958984375, + "logps/rejected": -291.71063232421875, + "loss": 0.5651, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2504750192165375, + "rewards/margins": 1.1714638471603394, + "rewards/rejected": -1.4219388961791992, "step": 1160 }, { - "epoch": 0.57, - "learning_rate": 4.4978284473398474e-07, - "logits/chosen": -2.923459768295288, - "logits/rejected": -2.942648410797119, - "logps/chosen": -326.2023010253906, - "logps/rejected": -257.015625, - "loss": 0.4604, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.15680450201034546, - "rewards/margins": 2.1175801753997803, - "rewards/rejected": -1.9607757329940796, + "epoch": 0.6, + "learning_rate": 4.4536548605877917e-07, + "logits/chosen": -2.5734705924987793, + "logits/rejected": -2.5858707427978516, + "logps/chosen": -294.67681884765625, + "logps/rejected": -243.31912231445312, + "loss": 0.4886, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.02277236059308052, + "rewards/margins": 1.20340096950531, + "rewards/rejected": -1.2261732816696167, "step": 1170 }, { - "epoch": 0.58, - "learning_rate": 4.488780311255881e-07, - "logits/chosen": -2.993777275085449, - "logits/rejected": -3.0089454650878906, - "logps/chosen": -310.4774475097656, - "logps/rejected": -310.0064392089844, - "loss": 0.5855, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.30246180295944214, - "rewards/margins": 1.6295474767684937, - "rewards/rejected": -1.9320093393325806, + "epoch": 0.6, + "learning_rate": 4.4442351168048227e-07, + "logits/chosen": -2.62070369720459, + "logits/rejected": -2.714472532272339, + "logps/chosen": -256.4486083984375, + "logps/rejected": -237.9633331298828, + "loss": 0.5416, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.06612993031740189, + "rewards/margins": 0.9648292660713196, + "rewards/rejected": -1.0309593677520752, "step": 1180 }, { - "epoch": 0.58, - "learning_rate": 4.479732175171914e-07, - "logits/chosen": -3.0185108184814453, - "logits/rejected": -2.986156940460205, - "logps/chosen": -357.2552490234375, - "logps/rejected": -282.03564453125, - "loss": 0.5475, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.00784972868859768, - "rewards/margins": 1.7248270511627197, - "rewards/rejected": -1.7326767444610596, + "epoch": 0.61, + "learning_rate": 4.4348153730218537e-07, + "logits/chosen": -2.557380437850952, + "logits/rejected": -2.5687503814697266, + "logps/chosen": -277.671142578125, + "logps/rejected": -247.2373504638672, + "loss": 0.5121, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04107171297073364, + "rewards/margins": 1.0434238910675049, + "rewards/rejected": -1.0844955444335938, "step": 1190 }, { - "epoch": 0.59, - "learning_rate": 4.4706840390879475e-07, - "logits/chosen": -3.1041131019592285, - "logits/rejected": -3.139683246612549, - "logps/chosen": -288.1397399902344, - "logps/rejected": -246.25350952148438, - "loss": 0.6307, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.218224436044693, - "rewards/margins": 1.3650285005569458, - "rewards/rejected": -1.146803855895996, + "epoch": 0.61, + "learning_rate": 4.425395629238884e-07, + "logits/chosen": -2.5712647438049316, + "logits/rejected": -2.5064244270324707, + "logps/chosen": -272.5246887207031, + "logps/rejected": -263.8275451660156, + "loss": 0.5391, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.15056011080741882, + "rewards/margins": 1.2076849937438965, + "rewards/rejected": -1.3582451343536377, "step": 1200 }, { - "epoch": 0.59, - "eval_logits/chosen": -3.017667055130005, - "eval_logits/rejected": -3.010847330093384, - "eval_logps/chosen": -312.1917724609375, - "eval_logps/rejected": -283.08905029296875, - "eval_loss": 0.5671548247337341, - "eval_rewards/accuracies": 0.71875, - "eval_rewards/chosen": 0.2669214904308319, - "eval_rewards/margins": 1.6225121021270752, - "eval_rewards/rejected": -1.355590581893921, - "eval_runtime": 304.6347, - "eval_samples_per_second": 7.13, - "eval_steps_per_second": 0.446, + "epoch": 0.61, + "eval_logits/chosen": -2.6691508293151855, + "eval_logits/rejected": -2.755162477493286, + "eval_logps/chosen": -280.7105712890625, + "eval_logps/rejected": -272.5871276855469, + "eval_loss": 0.5320115685462952, + "eval_rewards/accuracies": 0.7404580116271973, + "eval_rewards/chosen": -0.24601177871227264, + "eval_rewards/margins": 1.199183702468872, + "eval_rewards/rejected": -1.4451955556869507, + "eval_runtime": 301.4996, + "eval_samples_per_second": 6.919, + "eval_steps_per_second": 0.434, "step": 1200 }, { - "epoch": 0.59, - "learning_rate": 4.461635903003981e-07, - "logits/chosen": -3.0484681129455566, - "logits/rejected": -3.062183141708374, - "logps/chosen": -334.8283996582031, - "logps/rejected": -272.17059326171875, - "loss": 0.5744, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.22666248679161072, - "rewards/margins": 1.2986878156661987, - "rewards/rejected": -1.0720255374908447, + "epoch": 0.62, + "learning_rate": 4.415975885455915e-07, + "logits/chosen": -2.581174612045288, + "logits/rejected": -2.5149054527282715, + "logps/chosen": -277.2206115722656, + "logps/rejected": -276.4008483886719, + "loss": 0.8695, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.275297075510025, + "rewards/margins": 1.014151930809021, + "rewards/rejected": -1.2894489765167236, "step": 1210 }, { - "epoch": 0.6, - "learning_rate": 4.452587766920014e-07, - "logits/chosen": -3.0639710426330566, - "logits/rejected": -3.0934436321258545, - "logps/chosen": -306.6845397949219, - "logps/rejected": -276.6683654785156, - "loss": 0.7719, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.34325629472732544, - "rewards/margins": 1.5583308935165405, - "rewards/rejected": -1.2150747776031494, + "epoch": 0.62, + "learning_rate": 4.406556141672946e-07, + "logits/chosen": -2.702441930770874, + "logits/rejected": -2.6084256172180176, + "logps/chosen": -262.9638977050781, + "logps/rejected": -260.6354675292969, + "loss": 0.581, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.07331562787294388, + "rewards/margins": 1.4954464435577393, + "rewards/rejected": -1.4221307039260864, "step": 1220 }, { - "epoch": 0.6, - "learning_rate": 4.4435396308360475e-07, - "logits/chosen": -3.0114355087280273, - "logits/rejected": -3.0833487510681152, - "logps/chosen": -333.1882019042969, - "logps/rejected": -287.3609619140625, - "loss": 0.6112, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.2510836124420166, - "rewards/margins": 1.6031652688980103, - "rewards/rejected": -1.3520817756652832, + "epoch": 0.63, + "learning_rate": 4.397136397889977e-07, + "logits/chosen": -2.6779658794403076, + "logits/rejected": -2.844871759414673, + "logps/chosen": -309.64898681640625, + "logps/rejected": -302.99163818359375, + "loss": 0.5565, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01319421548396349, + "rewards/margins": 1.1484153270721436, + "rewards/rejected": -1.135221004486084, "step": 1230 }, { - "epoch": 0.61, - "learning_rate": 4.4344914947520804e-07, - "logits/chosen": -2.9526658058166504, - "logits/rejected": -2.97904372215271, - "logps/chosen": -369.76007080078125, - "logps/rejected": -276.10284423828125, - "loss": 0.5585, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.12309380620718002, - "rewards/margins": 1.4945197105407715, - "rewards/rejected": -1.6176135540008545, + "epoch": 0.63, + "learning_rate": 4.387716654107008e-07, + "logits/chosen": -2.7568278312683105, + "logits/rejected": -2.781827449798584, + "logps/chosen": -309.62939453125, + "logps/rejected": -271.3556823730469, + "loss": 0.5592, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.15527522563934326, + "rewards/margins": 0.7692685723304749, + "rewards/rejected": -0.9245438575744629, "step": 1240 }, { - "epoch": 0.61, - "learning_rate": 4.4254433586681137e-07, - "logits/chosen": -2.961038589477539, - "logits/rejected": -3.016075611114502, - "logps/chosen": -326.3849792480469, - "logps/rejected": -319.8125, - "loss": 0.6302, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.04704440012574196, - "rewards/margins": 1.256737232208252, - "rewards/rejected": -1.3037816286087036, + "epoch": 0.64, + "learning_rate": 4.378296910324039e-07, + "logits/chosen": -2.705575942993164, + "logits/rejected": -2.720196008682251, + "logps/chosen": -272.5322570800781, + "logps/rejected": -249.0066680908203, + "loss": 0.6204, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3543552756309509, + "rewards/margins": 0.8724683523178101, + "rewards/rejected": -1.2268235683441162, "step": 1250 }, { - "epoch": 0.62, - "learning_rate": 4.416395222584147e-07, - "logits/chosen": -2.9669456481933594, - "logits/rejected": -2.9792191982269287, - "logps/chosen": -339.0459289550781, - "logps/rejected": -272.414794921875, - "loss": 0.5964, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.284701943397522, - "rewards/margins": 1.6558120250701904, - "rewards/rejected": -1.3711103200912476, + "epoch": 0.64, + "learning_rate": 4.36887716654107e-07, + "logits/chosen": -2.6648824214935303, + "logits/rejected": -2.707021474838257, + "logps/chosen": -322.87200927734375, + "logps/rejected": -345.20843505859375, + "loss": 0.5247, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01667252741754055, + "rewards/margins": 1.1974635124206543, + "rewards/rejected": -1.1807911396026611, "step": 1260 }, { - "epoch": 0.62, - "learning_rate": 4.4073470865001804e-07, - "logits/chosen": -3.0379161834716797, - "logits/rejected": -3.010143995285034, - "logps/chosen": -352.57891845703125, - "logps/rejected": -262.7559509277344, - "loss": 0.5059, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.3200235664844513, - "rewards/margins": 1.8153388500213623, - "rewards/rejected": -1.4953153133392334, + "epoch": 0.65, + "learning_rate": 4.359457422758101e-07, + "logits/chosen": -2.589756488800049, + "logits/rejected": -2.487732172012329, + "logps/chosen": -232.86874389648438, + "logps/rejected": -240.175537109375, + "loss": 0.5913, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.03205676004290581, + "rewards/margins": 1.1439334154129028, + "rewards/rejected": -1.1759899854660034, "step": 1270 }, { - "epoch": 0.63, - "learning_rate": 4.398298950416214e-07, - "logits/chosen": -2.988732099533081, - "logits/rejected": -2.987147569656372, - "logps/chosen": -390.0386657714844, - "logps/rejected": -367.008056640625, - "loss": 0.6521, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.7418962121009827, - "rewards/margins": 1.6876013278961182, - "rewards/rejected": -0.9457048177719116, + "epoch": 0.65, + "learning_rate": 4.350037678975131e-07, + "logits/chosen": -2.6122286319732666, + "logits/rejected": -2.5596940517425537, + "logps/chosen": -276.717041015625, + "logps/rejected": -275.9134216308594, + "loss": 0.4956, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.1398329734802246, + "rewards/margins": 1.3841016292572021, + "rewards/rejected": -1.5239344835281372, "step": 1280 }, { - "epoch": 0.63, - "learning_rate": 4.389250814332247e-07, - "logits/chosen": -3.010056972503662, - "logits/rejected": -3.007854700088501, - "logps/chosen": -319.01971435546875, - "logps/rejected": -283.02655029296875, - "loss": 0.587, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.25877711176872253, - "rewards/margins": 1.088653564453125, - "rewards/rejected": -1.3474305868148804, + "epoch": 0.66, + "learning_rate": 4.340617935192162e-07, + "logits/chosen": -2.6020092964172363, + "logits/rejected": -2.5196774005889893, + "logps/chosen": -298.8046875, + "logps/rejected": -254.69033813476562, + "loss": 0.5306, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21991291642189026, + "rewards/margins": 1.1622774600982666, + "rewards/rejected": -1.382190465927124, "step": 1290 }, { - "epoch": 0.64, - "learning_rate": 4.3802026782482805e-07, - "logits/chosen": -3.019235849380493, - "logits/rejected": -3.0394012928009033, - "logps/chosen": -306.888427734375, - "logps/rejected": -236.05789184570312, - "loss": 0.5154, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.2386196106672287, - "rewards/margins": 1.852574348449707, - "rewards/rejected": -1.613954782485962, + "epoch": 0.66, + "learning_rate": 4.331198191409193e-07, + "logits/chosen": -2.506074905395508, + "logits/rejected": -2.544355869293213, + "logps/chosen": -321.9480285644531, + "logps/rejected": -288.6408386230469, + "loss": 0.536, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.04333578795194626, + "rewards/margins": 1.2949002981185913, + "rewards/rejected": -1.3382360935211182, "step": 1300 }, { - "epoch": 0.64, - "eval_logits/chosen": -2.999460220336914, - "eval_logits/rejected": -2.995955467224121, - "eval_logps/chosen": -312.5672912597656, - "eval_logps/rejected": -283.9920654296875, - "eval_loss": 0.539391815662384, - "eval_rewards/accuracies": 0.748161792755127, - "eval_rewards/chosen": 0.22937078773975372, - "eval_rewards/margins": 1.6752634048461914, - "eval_rewards/rejected": -1.445892572402954, - "eval_runtime": 304.4235, - "eval_samples_per_second": 7.135, - "eval_steps_per_second": 0.447, + "epoch": 0.66, + "eval_logits/chosen": -2.6125967502593994, + "eval_logits/rejected": -2.7006421089172363, + "eval_logps/chosen": -280.16973876953125, + "eval_logps/rejected": -271.69952392578125, + "eval_loss": 0.5501763224601746, + "eval_rewards/accuracies": 0.7270992398262024, + "eval_rewards/chosen": -0.19192270934581757, + "eval_rewards/margins": 1.1645148992538452, + "eval_rewards/rejected": -1.3564376831054688, + "eval_runtime": 296.9716, + "eval_samples_per_second": 7.024, + "eval_steps_per_second": 0.441, "step": 1300 }, { - "epoch": 0.64, - "learning_rate": 4.3711545421643144e-07, - "logits/chosen": -2.9812872409820557, - "logits/rejected": -3.003887414932251, - "logps/chosen": -318.5408935546875, - "logps/rejected": -248.9303436279297, - "loss": 0.5616, + "epoch": 0.67, + "learning_rate": 4.321778447626224e-07, + "logits/chosen": -2.5924477577209473, + "logits/rejected": -2.6131839752197266, + "logps/chosen": -270.43280029296875, + "logps/rejected": -248.0684051513672, + "loss": 0.5595, "rewards/accuracies": 0.75, - "rewards/chosen": 0.37384381890296936, - "rewards/margins": 1.289077877998352, - "rewards/rejected": -0.9152339696884155, + "rewards/chosen": -0.22290806472301483, + "rewards/margins": 0.9343156814575195, + "rewards/rejected": -1.1572238206863403, "step": 1310 }, { - "epoch": 0.64, - "learning_rate": 4.3621064060803477e-07, - "logits/chosen": -2.9410858154296875, - "logits/rejected": -2.9780640602111816, - "logps/chosen": -284.0178527832031, - "logps/rejected": -261.8435974121094, - "loss": 0.5779, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.24035921692848206, - "rewards/margins": 1.4292364120483398, - "rewards/rejected": -1.1888773441314697, + "epoch": 0.67, + "learning_rate": 4.312358703843255e-07, + "logits/chosen": -2.6144769191741943, + "logits/rejected": -2.6292853355407715, + "logps/chosen": -296.95001220703125, + "logps/rejected": -249.1085968017578, + "loss": 0.6089, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.16053441166877747, + "rewards/margins": 0.939122200012207, + "rewards/rejected": -0.7785876989364624, "step": 1320 }, { - "epoch": 0.65, - "learning_rate": 4.353058269996381e-07, - "logits/chosen": -2.9802401065826416, - "logits/rejected": -3.001110076904297, - "logps/chosen": -295.2203063964844, - "logps/rejected": -252.50112915039062, - "loss": 0.6135, + "epoch": 0.68, + "learning_rate": 4.302938960060286e-07, + "logits/chosen": -2.609095811843872, + "logits/rejected": -2.566784381866455, + "logps/chosen": -288.53204345703125, + "logps/rejected": -279.9819641113281, + "loss": 0.5419, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.4564266800880432, - "rewards/margins": 1.258934497833252, - "rewards/rejected": -0.802507758140564, + "rewards/chosen": -0.10803576558828354, + "rewards/margins": 1.2878508567810059, + "rewards/rejected": -1.3958865404129028, "step": 1330 }, { - "epoch": 0.65, - "learning_rate": 4.344010133912414e-07, - "logits/chosen": -2.96594500541687, - "logits/rejected": -2.9266459941864014, - "logps/chosen": -315.6517333984375, - "logps/rejected": -268.1950378417969, - "loss": 0.5223, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.24726061522960663, - "rewards/margins": 1.7142928838729858, - "rewards/rejected": -1.4670321941375732, + "epoch": 0.68, + "learning_rate": 4.293519216277317e-07, + "logits/chosen": -2.6211318969726562, + "logits/rejected": -2.663722276687622, + "logps/chosen": -275.7018127441406, + "logps/rejected": -259.57318115234375, + "loss": 0.607, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.027861136943101883, + "rewards/margins": 1.3817142248153687, + "rewards/rejected": -1.4095754623413086, "step": 1340 }, { - "epoch": 0.66, - "learning_rate": 4.334961997828447e-07, - "logits/chosen": -2.9347498416900635, - "logits/rejected": -2.899383068084717, - "logps/chosen": -321.0707092285156, - "logps/rejected": -255.6732940673828, - "loss": 0.5292, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.5466486215591431, - "rewards/margins": 1.5780394077301025, - "rewards/rejected": -1.0313907861709595, + "epoch": 0.69, + "learning_rate": 4.284099472494348e-07, + "logits/chosen": -2.6988394260406494, + "logits/rejected": -2.5893874168395996, + "logps/chosen": -281.22705078125, + "logps/rejected": -301.45855712890625, + "loss": 0.5618, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31579267978668213, + "rewards/margins": 0.5685089826583862, + "rewards/rejected": -0.8843017816543579, "step": 1350 }, { - "epoch": 0.66, - "learning_rate": 4.3259138617444806e-07, - "logits/chosen": -2.79194974899292, - "logits/rejected": -2.8127634525299072, - "logps/chosen": -392.15887451171875, - "logps/rejected": -244.8552703857422, - "loss": 0.6086, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.21766099333763123, - "rewards/margins": 1.2279932498931885, - "rewards/rejected": -1.4456541538238525, + "epoch": 0.69, + "learning_rate": 4.2746797287113787e-07, + "logits/chosen": -2.6224687099456787, + "logits/rejected": -2.569154739379883, + "logps/chosen": -249.50143432617188, + "logps/rejected": -281.0968017578125, + "loss": 0.5396, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27225735783576965, + "rewards/margins": 1.1059520244598389, + "rewards/rejected": -1.3782094717025757, "step": 1360 }, { - "epoch": 0.67, - "learning_rate": 4.316865725660514e-07, - "logits/chosen": -2.8631577491760254, - "logits/rejected": -2.917994260787964, - "logps/chosen": -327.74005126953125, - "logps/rejected": -259.500244140625, - "loss": 0.6246, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.5231900811195374, - "rewards/margins": 1.3296159505844116, - "rewards/rejected": -0.8064260482788086, + "epoch": 0.7, + "learning_rate": 4.2652599849284097e-07, + "logits/chosen": -2.63877534866333, + "logits/rejected": -2.773172616958618, + "logps/chosen": -262.63165283203125, + "logps/rejected": -245.67813110351562, + "loss": 0.5318, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.320833295583725, + "rewards/margins": 1.2154992818832397, + "rewards/rejected": -1.536332607269287, "step": 1370 }, { - "epoch": 0.67, - "learning_rate": 4.3078175895765473e-07, - "logits/chosen": -2.884788990020752, - "logits/rejected": -2.8382277488708496, - "logps/chosen": -307.4563903808594, - "logps/rejected": -190.59767150878906, - "loss": 0.6359, + "epoch": 0.7, + "learning_rate": 4.2558402411454407e-07, + "logits/chosen": -2.711285352706909, + "logits/rejected": -2.621014356613159, + "logps/chosen": -271.7641906738281, + "logps/rejected": -252.8822784423828, + "loss": 0.4981, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.03841951861977577, - "rewards/margins": 1.6107845306396484, - "rewards/rejected": -1.5723650455474854, + "rewards/chosen": -0.1611357033252716, + "rewards/margins": 1.2175296545028687, + "rewards/rejected": -1.3786654472351074, "step": 1380 }, { - "epoch": 0.68, - "learning_rate": 4.2987694534925807e-07, - "logits/chosen": -2.841109037399292, - "logits/rejected": -2.849846839904785, - "logps/chosen": -346.0831604003906, - "logps/rejected": -292.51788330078125, - "loss": 0.5849, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.5428182482719421, - "rewards/margins": 1.8946325778961182, - "rewards/rejected": -1.3518141508102417, + "epoch": 0.71, + "learning_rate": 4.2464204973624717e-07, + "logits/chosen": -2.6401703357696533, + "logits/rejected": -2.6722915172576904, + "logps/chosen": -327.8500061035156, + "logps/rejected": -291.4416809082031, + "loss": 0.5434, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.25624486804008484, + "rewards/margins": 1.0539482831954956, + "rewards/rejected": -1.3101933002471924, "step": 1390 }, { - "epoch": 0.68, - "learning_rate": 4.289721317408614e-07, - "logits/chosen": -2.8949296474456787, - "logits/rejected": -2.8792874813079834, - "logps/chosen": -378.65814208984375, - "logps/rejected": -276.74627685546875, - "loss": 0.6424, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.03393983095884323, - "rewards/margins": 1.6056735515594482, - "rewards/rejected": -1.639613389968872, + "epoch": 0.71, + "learning_rate": 4.2370007535795027e-07, + "logits/chosen": -2.5574371814727783, + "logits/rejected": -2.5931754112243652, + "logps/chosen": -305.538818359375, + "logps/rejected": -279.69171142578125, + "loss": 0.6544, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.21988415718078613, + "rewards/margins": 1.070007562637329, + "rewards/rejected": -1.2898919582366943, "step": 1400 }, { - "epoch": 0.68, - "eval_logits/chosen": -2.8846874237060547, - "eval_logits/rejected": -2.880082845687866, - "eval_logps/chosen": -313.10162353515625, - "eval_logps/rejected": -283.4009704589844, - "eval_loss": 0.5720194578170776, - "eval_rewards/accuracies": 0.7261029481887817, - "eval_rewards/chosen": 0.1759394109249115, - "eval_rewards/margins": 1.5627223253250122, - "eval_rewards/rejected": -1.3867828845977783, - "eval_runtime": 304.3759, - "eval_samples_per_second": 7.136, - "eval_steps_per_second": 0.447, + "epoch": 0.71, + "eval_logits/chosen": -2.604628324508667, + "eval_logits/rejected": -2.697035789489746, + "eval_logps/chosen": -282.0076904296875, + "eval_logps/rejected": -274.89263916015625, + "eval_loss": 0.5308603644371033, + "eval_rewards/accuracies": 0.7080152630805969, + "eval_rewards/chosen": -0.37571972608566284, + "eval_rewards/margins": 1.3000292778015137, + "eval_rewards/rejected": -1.6757489442825317, + "eval_runtime": 301.8954, + "eval_samples_per_second": 6.91, + "eval_steps_per_second": 0.434, "step": 1400 }, { - "epoch": 0.69, - "learning_rate": 4.2806731813246474e-07, - "logits/chosen": -2.886857032775879, - "logits/rejected": -2.8864784240722656, - "logps/chosen": -343.5644226074219, - "logps/rejected": -306.38836669921875, - "loss": 0.5457, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.12355151027441025, - "rewards/margins": 1.1762131452560425, - "rewards/rejected": -1.052661657333374, + "epoch": 0.72, + "learning_rate": 4.227581009796533e-07, + "logits/chosen": -2.462588310241699, + "logits/rejected": -2.500105381011963, + "logps/chosen": -288.41705322265625, + "logps/rejected": -258.5611877441406, + "loss": 0.5735, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.04483962059020996, + "rewards/margins": 2.0927834510803223, + "rewards/rejected": -2.137622833251953, "step": 1410 }, { - "epoch": 0.69, - "learning_rate": 4.2716250452406807e-07, - "logits/chosen": -2.956000804901123, - "logits/rejected": -2.948458671569824, - "logps/chosen": -320.3724365234375, - "logps/rejected": -233.8415985107422, - "loss": 0.4799, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.4409845471382141, - "rewards/margins": 1.7005996704101562, - "rewards/rejected": -1.259615182876587, + "epoch": 0.72, + "learning_rate": 4.218161266013564e-07, + "logits/chosen": -2.574207305908203, + "logits/rejected": -2.731383800506592, + "logps/chosen": -269.80035400390625, + "logps/rejected": -245.1697235107422, + "loss": 0.5215, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.26280277967453003, + "rewards/margins": 1.3922348022460938, + "rewards/rejected": -1.655037522315979, "step": 1420 }, { - "epoch": 0.7, - "learning_rate": 4.2625769091567135e-07, - "logits/chosen": -2.9371492862701416, - "logits/rejected": -2.9248015880584717, - "logps/chosen": -296.7073974609375, - "logps/rejected": -269.4610595703125, - "loss": 0.642, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.09296533465385437, - "rewards/margins": 1.039360761642456, - "rewards/rejected": -0.9463955163955688, + "epoch": 0.73, + "learning_rate": 4.208741522230595e-07, + "logits/chosen": -2.615461587905884, + "logits/rejected": -2.531139612197876, + "logps/chosen": -272.6134338378906, + "logps/rejected": -255.28408813476562, + "loss": 0.7724, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5035494565963745, + "rewards/margins": 0.5120665431022644, + "rewards/rejected": -1.0156160593032837, "step": 1430 }, { - "epoch": 0.7, - "learning_rate": 4.253528773072747e-07, - "logits/chosen": -2.9327235221862793, - "logits/rejected": -2.982673168182373, - "logps/chosen": -363.5020751953125, - "logps/rejected": -301.22296142578125, - "loss": 0.5751, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.0022306025493890047, - "rewards/margins": 1.887359380722046, - "rewards/rejected": -1.8851289749145508, + "epoch": 0.73, + "learning_rate": 4.1993217784476257e-07, + "logits/chosen": -2.714182138442993, + "logits/rejected": -2.6374399662017822, + "logps/chosen": -275.6734924316406, + "logps/rejected": -302.3877258300781, + "loss": 0.5738, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4428681433200836, + "rewards/margins": 1.1528266668319702, + "rewards/rejected": -1.5956947803497314, "step": 1440 }, { - "epoch": 0.71, - "learning_rate": 4.24448063698878e-07, - "logits/chosen": -2.883209466934204, - "logits/rejected": -2.906423568725586, - "logps/chosen": -302.32806396484375, - "logps/rejected": -251.7005157470703, - "loss": 0.7805, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.4701937139034271, - "rewards/margins": 0.7227769494056702, - "rewards/rejected": -1.192970633506775, + "epoch": 0.74, + "learning_rate": 4.1899020346646567e-07, + "logits/chosen": -2.605117082595825, + "logits/rejected": -2.594167947769165, + "logps/chosen": -280.275390625, + "logps/rejected": -249.83792114257812, + "loss": 0.6099, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.11132706701755524, + "rewards/margins": 1.3764369487762451, + "rewards/rejected": -1.265109896659851, "step": 1450 }, { - "epoch": 0.71, - "learning_rate": 4.2354325009048136e-07, - "logits/chosen": -2.848294734954834, - "logits/rejected": -2.916018486022949, - "logps/chosen": -317.5501708984375, - "logps/rejected": -250.91897583007812, - "loss": 0.6165, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.05562925338745117, - "rewards/margins": 1.6674282550811768, - "rewards/rejected": -1.6117990016937256, + "epoch": 0.74, + "learning_rate": 4.1804822908816877e-07, + "logits/chosen": -2.6344046592712402, + "logits/rejected": -2.5441765785217285, + "logps/chosen": -287.36163330078125, + "logps/rejected": -310.89288330078125, + "loss": 0.6445, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4139525294303894, + "rewards/margins": 0.49868473410606384, + "rewards/rejected": -0.9126373529434204, "step": 1460 }, { - "epoch": 0.72, - "learning_rate": 4.226384364820847e-07, - "logits/chosen": -2.854104518890381, - "logits/rejected": -2.8662452697753906, - "logps/chosen": -400.96136474609375, - "logps/rejected": -273.5776672363281, - "loss": 0.626, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.18100908398628235, - "rewards/margins": 1.0209696292877197, - "rewards/rejected": -1.2019786834716797, + "epoch": 0.75, + "learning_rate": 4.1710625470987187e-07, + "logits/chosen": -2.5128302574157715, + "logits/rejected": -2.5804567337036133, + "logps/chosen": -294.478515625, + "logps/rejected": -288.2181091308594, + "loss": 0.4808, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4093014597892761, + "rewards/margins": 1.3569328784942627, + "rewards/rejected": -1.7662343978881836, "step": 1470 }, { - "epoch": 0.72, - "learning_rate": 4.2173362287368803e-07, - "logits/chosen": -2.8714194297790527, - "logits/rejected": -2.852163791656494, - "logps/chosen": -337.67852783203125, - "logps/rejected": -279.61676025390625, - "loss": 0.5413, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.3702465891838074, - "rewards/margins": 1.3009843826293945, - "rewards/rejected": -0.9307376742362976, + "epoch": 0.75, + "learning_rate": 4.1616428033157497e-07, + "logits/chosen": -2.643336772918701, + "logits/rejected": -2.68320369720459, + "logps/chosen": -257.29913330078125, + "logps/rejected": -263.56597900390625, + "loss": 0.5644, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8031185269355774, + "rewards/margins": 1.1325935125350952, + "rewards/rejected": -1.9357120990753174, "step": 1480 }, { - "epoch": 0.73, - "learning_rate": 4.2082880926529137e-07, - "logits/chosen": -2.860583782196045, - "logits/rejected": -2.7854676246643066, - "logps/chosen": -308.6072692871094, - "logps/rejected": -284.7223205566406, - "loss": 0.6106, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.16729530692100525, - "rewards/margins": 0.5469957590103149, - "rewards/rejected": -0.7142910957336426, + "epoch": 0.76, + "learning_rate": 4.1522230595327807e-07, + "logits/chosen": -2.6495070457458496, + "logits/rejected": -2.5842490196228027, + "logps/chosen": -244.33584594726562, + "logps/rejected": -298.60284423828125, + "loss": 0.6466, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.39945897459983826, + "rewards/margins": 1.1669641733169556, + "rewards/rejected": -1.5664231777191162, "step": 1490 }, { - "epoch": 0.73, - "learning_rate": 4.199239956568947e-07, - "logits/chosen": -2.890968084335327, - "logits/rejected": -2.9133200645446777, - "logps/chosen": -303.5365905761719, - "logps/rejected": -259.62066650390625, - "loss": 0.587, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.37988632917404175, - "rewards/margins": 0.9699689149856567, - "rewards/rejected": -1.3498553037643433, + "epoch": 0.76, + "learning_rate": 4.1428033157498117e-07, + "logits/chosen": -2.4568417072296143, + "logits/rejected": -2.472951889038086, + "logps/chosen": -264.20794677734375, + "logps/rejected": -261.67279052734375, + "loss": 0.5697, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.658267617225647, + "rewards/margins": 0.9868196249008179, + "rewards/rejected": -1.6450872421264648, "step": 1500 }, { - "epoch": 0.73, - "eval_logits/chosen": -2.8401081562042236, - "eval_logits/rejected": -2.8348021507263184, - "eval_logps/chosen": -314.14898681640625, - "eval_logps/rejected": -284.8664245605469, - "eval_loss": 0.5491310358047485, - "eval_rewards/accuracies": 0.7224264740943909, - "eval_rewards/chosen": 0.0712040588259697, - "eval_rewards/margins": 1.6045317649841309, - "eval_rewards/rejected": -1.5333276987075806, - "eval_runtime": 304.6624, - "eval_samples_per_second": 7.129, - "eval_steps_per_second": 0.446, + "epoch": 0.76, + "eval_logits/chosen": -2.6730244159698486, + "eval_logits/rejected": -2.765580654144287, + "eval_logps/chosen": -280.74395751953125, + "eval_logps/rejected": -272.92584228515625, + "eval_loss": 0.5662475228309631, + "eval_rewards/accuracies": 0.7156488299369812, + "eval_rewards/chosen": -0.2493485063314438, + "eval_rewards/margins": 1.229722023010254, + "eval_rewards/rejected": -1.4790705442428589, + "eval_runtime": 297.049, + "eval_samples_per_second": 7.022, + "eval_steps_per_second": 0.441, "step": 1500 }, { - "epoch": 0.74, - "learning_rate": 4.1901918204849804e-07, - "logits/chosen": -2.8346734046936035, - "logits/rejected": -2.797400951385498, - "logps/chosen": -274.7719421386719, - "logps/rejected": -229.69393920898438, - "loss": 0.6251, + "epoch": 0.77, + "learning_rate": 4.1333835719668427e-07, + "logits/chosen": -2.618648052215576, + "logits/rejected": -2.575504779815674, + "logps/chosen": -279.4701232910156, + "logps/rejected": -356.17584228515625, + "loss": 0.5529, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.56284099817276, - "rewards/margins": 0.8828245997428894, - "rewards/rejected": -1.4456655979156494, + "rewards/chosen": -0.46867623925209045, + "rewards/margins": 0.8481055498123169, + "rewards/rejected": -1.316781759262085, "step": 1510 }, { - "epoch": 0.74, - "learning_rate": 4.181143684401013e-07, - "logits/chosen": -2.8528428077697754, - "logits/rejected": -2.8624656200408936, - "logps/chosen": -359.9120788574219, - "logps/rejected": -300.40521240234375, - "loss": 0.5445, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.10798443853855133, - "rewards/margins": 1.4090569019317627, - "rewards/rejected": -1.3010724782943726, + "epoch": 0.77, + "learning_rate": 4.1239638281838737e-07, + "logits/chosen": -2.637680768966675, + "logits/rejected": -2.670579433441162, + "logps/chosen": -286.6285400390625, + "logps/rejected": -271.7418518066406, + "loss": 0.4942, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.38739800453186035, + "rewards/margins": 1.265276551246643, + "rewards/rejected": -1.6526744365692139, "step": 1520 }, { - "epoch": 0.75, - "learning_rate": 4.1720955483170465e-07, - "logits/chosen": -2.8548240661621094, - "logits/rejected": -2.909045457839966, - "logps/chosen": -357.49273681640625, - "logps/rejected": -302.66204833984375, - "loss": 0.549, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.34968671202659607, - "rewards/margins": 1.0992151498794556, - "rewards/rejected": -1.448901891708374, + "epoch": 0.78, + "learning_rate": 4.1145440844009036e-07, + "logits/chosen": -2.6004691123962402, + "logits/rejected": -2.691800594329834, + "logps/chosen": -285.45989990234375, + "logps/rejected": -255.03042602539062, + "loss": 0.5876, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.542041540145874, + "rewards/margins": 1.1405549049377441, + "rewards/rejected": -1.6825965642929077, "step": 1530 }, { - "epoch": 0.75, - "learning_rate": 4.16304741223308e-07, - "logits/chosen": -2.838947296142578, - "logits/rejected": -2.9081108570098877, - "logps/chosen": -366.4072265625, - "logps/rejected": -284.40606689453125, - "loss": 0.5547, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.28602319955825806, - "rewards/margins": 2.277418613433838, - "rewards/rejected": -1.991395354270935, + "epoch": 0.78, + "learning_rate": 4.1051243406179347e-07, + "logits/chosen": -2.543489933013916, + "logits/rejected": -2.4293406009674072, + "logps/chosen": -289.1936340332031, + "logps/rejected": -281.3624572753906, + "loss": 0.527, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05935020372271538, + "rewards/margins": 1.4894200563430786, + "rewards/rejected": -1.4300696849822998, "step": 1540 }, { - "epoch": 0.76, - "learning_rate": 4.153999276149113e-07, - "logits/chosen": -2.9312262535095215, - "logits/rejected": -2.8750057220458984, - "logps/chosen": -323.4001159667969, - "logps/rejected": -218.4062042236328, - "loss": 0.5797, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.019425928592681885, - "rewards/margins": 1.6437393426895142, - "rewards/rejected": -1.6631653308868408, + "epoch": 0.79, + "learning_rate": 4.0957045968349657e-07, + "logits/chosen": -2.6180763244628906, + "logits/rejected": -2.5505731105804443, + "logps/chosen": -280.2388610839844, + "logps/rejected": -261.02252197265625, + "loss": 0.5261, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.565861165523529, + "rewards/margins": 0.8470433354377747, + "rewards/rejected": -1.4129045009613037, "step": 1550 }, { - "epoch": 0.76, - "learning_rate": 4.1449511400651466e-07, - "logits/chosen": -2.894601821899414, - "logits/rejected": -2.8909831047058105, - "logps/chosen": -326.6396484375, - "logps/rejected": -281.40631103515625, - "loss": 0.5788, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.019177544862031937, - "rewards/margins": 1.1157176494598389, - "rewards/rejected": -1.1348950862884521, + "epoch": 0.79, + "learning_rate": 4.0862848530519967e-07, + "logits/chosen": -2.5736451148986816, + "logits/rejected": -2.4275200366973877, + "logps/chosen": -265.53271484375, + "logps/rejected": -287.5812683105469, + "loss": 0.5627, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08069765567779541, + "rewards/margins": 1.4943009614944458, + "rewards/rejected": -1.5749986171722412, "step": 1560 }, { - "epoch": 0.77, - "learning_rate": 4.13590300398118e-07, - "logits/chosen": -2.879214286804199, - "logits/rejected": -2.907869338989258, - "logps/chosen": -279.28106689453125, - "logps/rejected": -284.89263916015625, - "loss": 0.5549, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.1043848991394043, - "rewards/margins": 1.6830644607543945, - "rewards/rejected": -1.5786796808242798, + "epoch": 0.8, + "learning_rate": 4.0768651092690277e-07, + "logits/chosen": -2.513634204864502, + "logits/rejected": -2.589459180831909, + "logps/chosen": -285.5072937011719, + "logps/rejected": -252.2686004638672, + "loss": 0.6622, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5319138765335083, + "rewards/margins": 0.9326924085617065, + "rewards/rejected": -1.4646062850952148, "step": 1570 }, { - "epoch": 0.77, - "learning_rate": 4.1268548678972133e-07, - "logits/chosen": -2.8034815788269043, - "logits/rejected": -2.794088125228882, - "logps/chosen": -410.8250427246094, - "logps/rejected": -301.45294189453125, - "loss": 0.5638, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.6233074069023132, - "rewards/margins": 1.5481083393096924, - "rewards/rejected": -0.9248008728027344, + "epoch": 0.8, + "learning_rate": 4.0674453654860587e-07, + "logits/chosen": -2.482975959777832, + "logits/rejected": -2.4376277923583984, + "logps/chosen": -229.789306640625, + "logps/rejected": -236.7023162841797, + "loss": 0.545, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.3413255214691162, + "rewards/margins": 1.2952824831008911, + "rewards/rejected": -1.6366078853607178, "step": 1580 }, { - "epoch": 0.78, - "learning_rate": 4.1178067318132467e-07, - "logits/chosen": -2.8749442100524902, - "logits/rejected": -2.8853962421417236, - "logps/chosen": -321.0901794433594, - "logps/rejected": -239.45028686523438, - "loss": 0.6025, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.3520318269729614, - "rewards/margins": 1.5631208419799805, - "rewards/rejected": -1.2110888957977295, + "epoch": 0.81, + "learning_rate": 4.0580256217030897e-07, + "logits/chosen": -2.4523279666900635, + "logits/rejected": -2.4195103645324707, + "logps/chosen": -303.5387268066406, + "logps/rejected": -284.21075439453125, + "loss": 0.5395, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.3342668116092682, + "rewards/margins": 0.9444997906684875, + "rewards/rejected": -1.278766393661499, "step": 1590 }, { - "epoch": 0.78, - "learning_rate": 4.1087585957292795e-07, - "logits/chosen": -2.846088409423828, - "logits/rejected": -2.8391013145446777, - "logps/chosen": -329.9521484375, - "logps/rejected": -295.00543212890625, - "loss": 0.5185, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.6595783233642578, - "rewards/margins": 1.8760493993759155, - "rewards/rejected": -1.2164714336395264, + "epoch": 0.81, + "learning_rate": 4.0486058779201207e-07, + "logits/chosen": -2.4088969230651855, + "logits/rejected": -2.5234007835388184, + "logps/chosen": -274.2557067871094, + "logps/rejected": -271.4913330078125, + "loss": 0.5538, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.473999559879303, + "rewards/margins": 1.0612614154815674, + "rewards/rejected": -1.5352609157562256, "step": 1600 }, { - "epoch": 0.78, - "eval_logits/chosen": -2.8520967960357666, - "eval_logits/rejected": -2.840663433074951, - "eval_logps/chosen": -309.3588562011719, - "eval_logps/rejected": -280.1464538574219, - "eval_loss": 0.5474982261657715, - "eval_rewards/accuracies": 0.7555146813392639, - "eval_rewards/chosen": 0.550216019153595, - "eval_rewards/margins": 1.6115472316741943, - "eval_rewards/rejected": -1.0613312721252441, - "eval_runtime": 304.6842, - "eval_samples_per_second": 7.129, - "eval_steps_per_second": 0.446, + "epoch": 0.81, + "eval_logits/chosen": -2.594578981399536, + "eval_logits/rejected": -2.6933910846710205, + "eval_logps/chosen": -282.90802001953125, + "eval_logps/rejected": -274.9263916015625, + "eval_loss": 0.5325531363487244, + "eval_rewards/accuracies": 0.7213740348815918, + "eval_rewards/chosen": -0.4657546877861023, + "eval_rewards/margins": 1.21336829662323, + "eval_rewards/rejected": -1.6791231632232666, + "eval_runtime": 302.0446, + "eval_samples_per_second": 6.906, + "eval_steps_per_second": 0.434, "step": 1600 }, { - "epoch": 0.79, - "learning_rate": 4.099710459645313e-07, - "logits/chosen": -2.844019889831543, - "logits/rejected": -2.857938289642334, - "logps/chosen": -325.49822998046875, - "logps/rejected": -302.9815979003906, - "loss": 0.6184, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.48010191321372986, - "rewards/margins": 1.4709116220474243, - "rewards/rejected": -0.9908096194267273, + "epoch": 0.82, + "learning_rate": 4.039186134137151e-07, + "logits/chosen": -2.477017641067505, + "logits/rejected": -2.512791156768799, + "logps/chosen": -364.48138427734375, + "logps/rejected": -329.79571533203125, + "loss": 0.5225, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.33335477113723755, + "rewards/margins": 1.133012056350708, + "rewards/rejected": -1.4663667678833008, "step": 1610 }, { - "epoch": 0.79, - "learning_rate": 4.090662323561346e-07, - "logits/chosen": -2.9024102687835693, - "logits/rejected": -2.882875680923462, - "logps/chosen": -273.99432373046875, - "logps/rejected": -251.88162231445312, - "loss": 0.5197, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.0770392119884491, - "rewards/margins": 1.6075156927108765, - "rewards/rejected": -1.530476450920105, + "epoch": 0.82, + "learning_rate": 4.029766390354182e-07, + "logits/chosen": -2.339780569076538, + "logits/rejected": -2.2996575832366943, + "logps/chosen": -254.8621368408203, + "logps/rejected": -305.05218505859375, + "loss": 0.5218, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.580379843711853, + "rewards/margins": 1.2120063304901123, + "rewards/rejected": -1.7923862934112549, "step": 1620 }, { - "epoch": 0.8, - "learning_rate": 4.0816141874773795e-07, - "logits/chosen": -2.841510057449341, - "logits/rejected": -2.8507213592529297, - "logps/chosen": -346.05523681640625, - "logps/rejected": -301.13543701171875, - "loss": 0.5696, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.14270959794521332, - "rewards/margins": 1.94040846824646, - "rewards/rejected": -1.797698974609375, + "epoch": 0.83, + "learning_rate": 4.020346646571213e-07, + "logits/chosen": -2.46311092376709, + "logits/rejected": -2.4910902976989746, + "logps/chosen": -253.5718231201172, + "logps/rejected": -241.31729125976562, + "loss": 0.4909, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.30813589692115784, + "rewards/margins": 0.8881072998046875, + "rewards/rejected": -1.196243166923523, "step": 1630 }, { - "epoch": 0.8, - "learning_rate": 4.072566051393413e-07, - "logits/chosen": -2.8012797832489014, - "logits/rejected": -2.801210641860962, - "logps/chosen": -259.74188232421875, - "logps/rejected": -231.1053009033203, - "loss": 0.6376, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.04968654364347458, - "rewards/margins": 1.5200209617614746, - "rewards/rejected": -1.5697072744369507, + "epoch": 0.83, + "learning_rate": 4.010926902788244e-07, + "logits/chosen": -2.437481641769409, + "logits/rejected": -2.487056016921997, + "logps/chosen": -298.3309326171875, + "logps/rejected": -307.5037536621094, + "loss": 0.6464, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.40473002195358276, + "rewards/margins": 1.2244799137115479, + "rewards/rejected": -1.6292098760604858, "step": 1640 }, { - "epoch": 0.81, - "learning_rate": 4.063517915309446e-07, - "logits/chosen": -2.9425883293151855, - "logits/rejected": -2.9121272563934326, - "logps/chosen": -333.64520263671875, - "logps/rejected": -285.2044982910156, - "loss": 0.6176, + "epoch": 0.84, + "learning_rate": 4.001507159005275e-07, + "logits/chosen": -2.4725682735443115, + "logits/rejected": -2.531104564666748, + "logps/chosen": -257.1470642089844, + "logps/rejected": -232.2982635498047, + "loss": 0.517, "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.053397201001644135, - "rewards/margins": 1.637304663658142, - "rewards/rejected": -1.5839073657989502, + "rewards/chosen": -0.5268018245697021, + "rewards/margins": 0.9982993006706238, + "rewards/rejected": -1.5251011848449707, "step": 1650 }, { - "epoch": 0.81, - "learning_rate": 4.0544697792254796e-07, - "logits/chosen": -2.8607685565948486, - "logits/rejected": -2.869405508041382, - "logps/chosen": -290.55303955078125, - "logps/rejected": -265.3207702636719, - "loss": 0.6386, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.1116705983877182, - "rewards/margins": 1.3795424699783325, - "rewards/rejected": -1.491213083267212, + "epoch": 0.84, + "learning_rate": 3.9920874152223057e-07, + "logits/chosen": -2.416590929031372, + "logits/rejected": -2.4863741397857666, + "logps/chosen": -302.4273986816406, + "logps/rejected": -274.00994873046875, + "loss": 0.6356, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.31293386220932007, + "rewards/margins": 1.3280731439590454, + "rewards/rejected": -1.6410068273544312, "step": 1660 }, { - "epoch": 0.82, - "learning_rate": 4.045421643141513e-07, - "logits/chosen": -2.8379321098327637, - "logits/rejected": -2.869563341140747, - "logps/chosen": -309.62274169921875, - "logps/rejected": -277.60260009765625, - "loss": 0.5811, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.055942945182323456, - "rewards/margins": 1.4701728820800781, - "rewards/rejected": -1.4142299890518188, + "epoch": 0.85, + "learning_rate": 3.9826676714393367e-07, + "logits/chosen": -2.34822940826416, + "logits/rejected": -2.3350391387939453, + "logps/chosen": -237.3314208984375, + "logps/rejected": -211.2093505859375, + "loss": 0.6246, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.46049946546554565, + "rewards/margins": 1.1196852922439575, + "rewards/rejected": -1.580184817314148, "step": 1670 }, { - "epoch": 0.82, - "learning_rate": 4.0363735070575463e-07, - "logits/chosen": -2.8128914833068848, - "logits/rejected": -2.8281283378601074, - "logps/chosen": -399.00274658203125, - "logps/rejected": -253.89614868164062, - "loss": 0.512, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.41654032468795776, - "rewards/margins": 1.7110408544540405, - "rewards/rejected": -1.2945002317428589, + "epoch": 0.85, + "learning_rate": 3.9732479276563677e-07, + "logits/chosen": -2.4428534507751465, + "logits/rejected": -2.5402212142944336, + "logps/chosen": -254.8546600341797, + "logps/rejected": -236.1012725830078, + "loss": 0.5941, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.17953288555145264, + "rewards/margins": 0.8878132700920105, + "rewards/rejected": -1.0673460960388184, "step": 1680 }, { - "epoch": 0.83, - "learning_rate": 4.027325370973579e-07, - "logits/chosen": -2.7304739952087402, - "logits/rejected": -2.7349672317504883, - "logps/chosen": -318.1178894042969, - "logps/rejected": -295.8105773925781, - "loss": 0.5742, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.05860323831439018, - "rewards/margins": 1.0191328525543213, - "rewards/rejected": -1.0777361392974854, + "epoch": 0.86, + "learning_rate": 3.963828183873398e-07, + "logits/chosen": -2.4770541191101074, + "logits/rejected": -2.5738840103149414, + "logps/chosen": -308.5177917480469, + "logps/rejected": -253.67648315429688, + "loss": 0.4828, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.23972062766551971, + "rewards/margins": 1.4527523517608643, + "rewards/rejected": -1.6924731731414795, "step": 1690 }, { - "epoch": 0.83, - "learning_rate": 4.0182772348896125e-07, - "logits/chosen": -2.796422243118286, - "logits/rejected": -2.81411075592041, - "logps/chosen": -262.18084716796875, - "logps/rejected": -278.8641052246094, - "loss": 0.6767, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.1600240170955658, - "rewards/margins": 1.0725984573364258, - "rewards/rejected": -1.2326226234436035, + "epoch": 0.86, + "learning_rate": 3.954408440090429e-07, + "logits/chosen": -2.4382712841033936, + "logits/rejected": -2.453115940093994, + "logps/chosen": -322.01043701171875, + "logps/rejected": -257.23638916015625, + "loss": 0.551, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4521487355232239, + "rewards/margins": 0.9814785718917847, + "rewards/rejected": -1.4336273670196533, "step": 1700 }, { - "epoch": 0.83, - "eval_logits/chosen": -2.8254613876342773, - "eval_logits/rejected": -2.8208842277526855, - "eval_logps/chosen": -312.29931640625, - "eval_logps/rejected": -283.6239013671875, - "eval_loss": 0.5470752716064453, - "eval_rewards/accuracies": 0.7573529481887817, - "eval_rewards/chosen": 0.2561688721179962, - "eval_rewards/margins": 1.6652470827102661, - "eval_rewards/rejected": -1.4090783596038818, - "eval_runtime": 304.2186, - "eval_samples_per_second": 7.14, - "eval_steps_per_second": 0.447, + "epoch": 0.86, + "eval_logits/chosen": -2.556736469268799, + "eval_logits/rejected": -2.653500556945801, + "eval_logps/chosen": -284.46728515625, + "eval_logps/rejected": -277.02783203125, + "eval_loss": 0.5257931351661682, + "eval_rewards/accuracies": 0.7137404680252075, + "eval_rewards/chosen": -0.6216804385185242, + "eval_rewards/margins": 1.26758873462677, + "eval_rewards/rejected": -1.8892688751220703, + "eval_runtime": 297.4077, + "eval_samples_per_second": 7.014, + "eval_steps_per_second": 0.44, "step": 1700 }, { - "epoch": 0.84, - "learning_rate": 4.009229098805646e-07, - "logits/chosen": -2.861166477203369, - "logits/rejected": -2.8744378089904785, - "logps/chosen": -314.24884033203125, - "logps/rejected": -284.6211242675781, - "loss": 0.5204, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.1735542118549347, - "rewards/margins": 1.6915044784545898, - "rewards/rejected": -1.5179502964019775, + "epoch": 0.87, + "learning_rate": 3.94498869630746e-07, + "logits/chosen": -2.4545798301696777, + "logits/rejected": -2.398289680480957, + "logps/chosen": -296.9984436035156, + "logps/rejected": -284.75982666015625, + "loss": 0.542, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5290297269821167, + "rewards/margins": 1.3974144458770752, + "rewards/rejected": -1.9264440536499023, "step": 1710 }, { - "epoch": 0.84, - "learning_rate": 4.000180962721679e-07, - "logits/chosen": -2.876804828643799, - "logits/rejected": -2.865417957305908, - "logps/chosen": -300.75665283203125, - "logps/rejected": -278.43621826171875, - "loss": 0.5588, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.3660758137702942, - "rewards/margins": 1.9399096965789795, - "rewards/rejected": -1.5738340616226196, + "epoch": 0.87, + "learning_rate": 3.935568952524491e-07, + "logits/chosen": -2.552342176437378, + "logits/rejected": -2.430562734603882, + "logps/chosen": -281.8885803222656, + "logps/rejected": -275.8494873046875, + "loss": 0.488, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.33996737003326416, + "rewards/margins": 1.4860076904296875, + "rewards/rejected": -1.8259750604629517, "step": 1720 }, { - "epoch": 0.85, - "learning_rate": 3.9911328266377125e-07, - "logits/chosen": -2.815491199493408, - "logits/rejected": -2.7964425086975098, - "logps/chosen": -273.8128967285156, - "logps/rejected": -256.08612060546875, - "loss": 0.6072, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.10774189233779907, - "rewards/margins": 1.4934362173080444, - "rewards/rejected": -1.3856942653656006, + "epoch": 0.88, + "learning_rate": 3.926149208741522e-07, + "logits/chosen": -2.451730251312256, + "logits/rejected": -2.521273136138916, + "logps/chosen": -295.59625244140625, + "logps/rejected": -254.90811157226562, + "loss": 0.5294, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6222850680351257, + "rewards/margins": 1.8649215698242188, + "rewards/rejected": -2.48720645904541, "step": 1730 }, { - "epoch": 0.85, - "learning_rate": 3.982084690553746e-07, - "logits/chosen": -2.8579351902008057, - "logits/rejected": -2.8635032176971436, - "logps/chosen": -344.3926086425781, - "logps/rejected": -295.6445007324219, - "loss": 0.6122, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.2743358314037323, - "rewards/margins": 1.348960518836975, - "rewards/rejected": -1.07462477684021, + "epoch": 0.89, + "learning_rate": 3.916729464958553e-07, + "logits/chosen": -2.4977612495422363, + "logits/rejected": -2.6330878734588623, + "logps/chosen": -350.9913330078125, + "logps/rejected": -323.6842346191406, + "loss": 0.5116, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.6430081129074097, + "rewards/margins": 1.4963001012802124, + "rewards/rejected": -2.139308452606201, "step": 1740 }, { - "epoch": 0.85, - "learning_rate": 3.973036554469779e-07, - "logits/chosen": -2.874558925628662, - "logits/rejected": -2.883847951889038, - "logps/chosen": -302.1440734863281, - "logps/rejected": -291.8547058105469, - "loss": 0.7937, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.7894821763038635, - "rewards/margins": 1.6262328624725342, - "rewards/rejected": -0.8367508053779602, + "epoch": 0.89, + "learning_rate": 3.907309721175584e-07, + "logits/chosen": -2.342928171157837, + "logits/rejected": -2.4169085025787354, + "logps/chosen": -309.93853759765625, + "logps/rejected": -251.88815307617188, + "loss": 0.6657, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6915287375450134, + "rewards/margins": 1.0208818912506104, + "rewards/rejected": -1.7124109268188477, "step": 1750 }, { - "epoch": 0.86, - "learning_rate": 3.9639884183858126e-07, - "logits/chosen": -2.79994535446167, - "logits/rejected": -2.775540590286255, - "logps/chosen": -339.4261779785156, - "logps/rejected": -309.87554931640625, - "loss": 0.4904, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.5550933480262756, - "rewards/margins": 2.3695578575134277, - "rewards/rejected": -1.8144643306732178, + "epoch": 0.9, + "learning_rate": 3.897889977392615e-07, + "logits/chosen": -2.515695810317993, + "logits/rejected": -2.5377676486968994, + "logps/chosen": -315.3117980957031, + "logps/rejected": -284.48065185546875, + "loss": 0.6156, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5466021299362183, + "rewards/margins": 1.2473156452178955, + "rewards/rejected": -1.7939178943634033, "step": 1760 }, { - "epoch": 0.86, - "learning_rate": 3.954940282301846e-07, - "logits/chosen": -2.8808398246765137, - "logits/rejected": -2.8615918159484863, - "logps/chosen": -332.28057861328125, - "logps/rejected": -278.11956787109375, - "loss": 0.676, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.377042680978775, - "rewards/margins": 1.6142187118530273, - "rewards/rejected": -1.2371760606765747, + "epoch": 0.9, + "learning_rate": 3.8884702336096456e-07, + "logits/chosen": -2.5526938438415527, + "logits/rejected": -2.5009982585906982, + "logps/chosen": -298.90252685546875, + "logps/rejected": -268.0218200683594, + "loss": 0.5509, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.27220678329467773, + "rewards/margins": 0.9303116798400879, + "rewards/rejected": -1.202518343925476, "step": 1770 }, { - "epoch": 0.87, - "learning_rate": 3.945892146217879e-07, - "logits/chosen": -2.860491991043091, - "logits/rejected": -2.844805955886841, - "logps/chosen": -279.9769592285156, - "logps/rejected": -244.27383422851562, - "loss": 0.5217, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.2582096755504608, - "rewards/margins": 1.5592120885849, - "rewards/rejected": -1.3010023832321167, + "epoch": 0.91, + "learning_rate": 3.879050489826676e-07, + "logits/chosen": -2.4461004734039307, + "logits/rejected": -2.4862380027770996, + "logps/chosen": -277.4447937011719, + "logps/rejected": -279.57794189453125, + "loss": 0.5103, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2458111047744751, + "rewards/margins": 1.5718352794647217, + "rewards/rejected": -1.8176462650299072, "step": 1780 }, { - "epoch": 0.87, - "learning_rate": 3.936844010133912e-07, - "logits/chosen": -2.8820483684539795, - "logits/rejected": -2.86503267288208, - "logps/chosen": -313.58038330078125, - "logps/rejected": -246.91195678710938, - "loss": 0.5704, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.1953628957271576, - "rewards/margins": 1.8687965869903564, - "rewards/rejected": -1.673433542251587, + "epoch": 0.91, + "learning_rate": 3.869630746043707e-07, + "logits/chosen": -2.477055549621582, + "logits/rejected": -2.5192625522613525, + "logps/chosen": -304.3697204589844, + "logps/rejected": -250.54501342773438, + "loss": 0.5216, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.6367429494857788, + "rewards/margins": 1.5008876323699951, + "rewards/rejected": -2.1376309394836426, "step": 1790 }, { - "epoch": 0.88, - "learning_rate": 3.9277958740499455e-07, - "logits/chosen": -2.8518600463867188, - "logits/rejected": -2.8444278240203857, - "logps/chosen": -313.44158935546875, - "logps/rejected": -298.85491943359375, - "loss": 0.48, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.273332417011261, - "rewards/margins": 1.5286543369293213, - "rewards/rejected": -1.255321979522705, + "epoch": 0.92, + "learning_rate": 3.860211002260738e-07, + "logits/chosen": -2.5067267417907715, + "logits/rejected": -2.437110662460327, + "logps/chosen": -306.4678039550781, + "logps/rejected": -322.9219055175781, + "loss": 0.5708, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3336077034473419, + "rewards/margins": 1.3294265270233154, + "rewards/rejected": -1.663034200668335, "step": 1800 }, { - "epoch": 0.88, - "eval_logits/chosen": -2.8411476612091064, - "eval_logits/rejected": -2.833300828933716, - "eval_logps/chosen": -311.8475036621094, - "eval_logps/rejected": -284.313720703125, - "eval_loss": 0.5309780836105347, - "eval_rewards/accuracies": 0.7555146813392639, - "eval_rewards/chosen": 0.3013475239276886, - "eval_rewards/margins": 1.7794023752212524, - "eval_rewards/rejected": -1.4780550003051758, - "eval_runtime": 304.4752, - "eval_samples_per_second": 7.134, - "eval_steps_per_second": 0.447, + "epoch": 0.92, + "eval_logits/chosen": -2.556394338607788, + "eval_logits/rejected": -2.627877950668335, + "eval_logps/chosen": -283.4186096191406, + "eval_logps/rejected": -277.09735107421875, + "eval_loss": 0.5639130473136902, + "eval_rewards/accuracies": 0.7213740348815918, + "eval_rewards/chosen": -0.516810953617096, + "eval_rewards/margins": 1.379412055015564, + "eval_rewards/rejected": -1.8962229490280151, + "eval_runtime": 301.7159, + "eval_samples_per_second": 6.914, + "eval_steps_per_second": 0.434, "step": 1800 }, { - "epoch": 0.88, - "learning_rate": 3.918747737965979e-07, - "logits/chosen": -2.804774045944214, - "logits/rejected": -2.792977809906006, - "logps/chosen": -327.1492614746094, - "logps/rejected": -272.28765869140625, - "loss": 0.5726, + "epoch": 0.92, + "learning_rate": 3.850791258477769e-07, + "logits/chosen": -2.4557442665100098, + "logits/rejected": -2.37328839302063, + "logps/chosen": -285.089111328125, + "logps/rejected": -283.9366760253906, + "loss": 0.6032, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.07695098221302032, - "rewards/margins": 1.5002232789993286, - "rewards/rejected": -1.4232723712921143, + "rewards/chosen": -0.6370270848274231, + "rewards/margins": 0.6979910135269165, + "rewards/rejected": -1.3350181579589844, "step": 1810 }, { - "epoch": 0.89, - "learning_rate": 3.909699601882012e-07, - "logits/chosen": -2.8636252880096436, - "logits/rejected": -2.841243267059326, - "logps/chosen": -347.08160400390625, - "logps/rejected": -295.09619140625, - "loss": 0.5352, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.18940113484859467, - "rewards/margins": 1.6042251586914062, - "rewards/rejected": -1.4148238897323608, + "epoch": 0.93, + "learning_rate": 3.8413715146948e-07, + "logits/chosen": -2.4120583534240723, + "logits/rejected": -2.535059690475464, + "logps/chosen": -274.90399169921875, + "logps/rejected": -256.09332275390625, + "loss": 0.5635, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22262752056121826, + "rewards/margins": 1.5661967992782593, + "rewards/rejected": -1.788824439048767, "step": 1820 }, { - "epoch": 0.89, - "learning_rate": 3.9006514657980455e-07, - "logits/chosen": -2.8435797691345215, - "logits/rejected": -2.845093250274658, - "logps/chosen": -366.5450439453125, - "logps/rejected": -258.64031982421875, - "loss": 0.5056, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.06992091238498688, - "rewards/margins": 1.694715142250061, - "rewards/rejected": -1.6247942447662354, + "epoch": 0.93, + "learning_rate": 3.831951770911831e-07, + "logits/chosen": -2.4969706535339355, + "logits/rejected": -2.5374817848205566, + "logps/chosen": -242.24667358398438, + "logps/rejected": -327.6692810058594, + "loss": 0.6341, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.2129790335893631, + "rewards/margins": 1.3899933099746704, + "rewards/rejected": -1.6029722690582275, "step": 1830 }, { - "epoch": 0.9, - "learning_rate": 3.891603329714079e-07, - "logits/chosen": -2.909370183944702, - "logits/rejected": -2.884232759475708, - "logps/chosen": -323.385498046875, - "logps/rejected": -219.72921752929688, - "loss": 0.5575, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.07724098861217499, - "rewards/margins": 1.82958984375, - "rewards/rejected": -1.7523488998413086, + "epoch": 0.94, + "learning_rate": 3.822532027128862e-07, + "logits/chosen": -2.522143840789795, + "logits/rejected": -2.6315150260925293, + "logps/chosen": -242.66592407226562, + "logps/rejected": -228.8377227783203, + "loss": 0.4791, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.440913587808609, + "rewards/margins": 1.237001657485962, + "rewards/rejected": -1.6779152154922485, "step": 1840 }, { - "epoch": 0.9, - "learning_rate": 3.882555193630112e-07, - "logits/chosen": -2.8684873580932617, - "logits/rejected": -2.8539910316467285, - "logps/chosen": -343.2019348144531, - "logps/rejected": -264.88458251953125, - "loss": 0.4884, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.10159387439489365, - "rewards/margins": 2.088768482208252, - "rewards/rejected": -1.9871746301651, + "epoch": 0.94, + "learning_rate": 3.8131122833458926e-07, + "logits/chosen": -2.5416033267974854, + "logits/rejected": -2.469377040863037, + "logps/chosen": -265.96160888671875, + "logps/rejected": -278.81463623046875, + "loss": 0.5192, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5825539827346802, + "rewards/margins": 1.0453699827194214, + "rewards/rejected": -1.6279242038726807, "step": 1850 }, { - "epoch": 0.91, - "learning_rate": 3.8735070575461456e-07, - "logits/chosen": -2.8793351650238037, - "logits/rejected": -2.870913028717041, - "logps/chosen": -358.84716796875, - "logps/rejected": -266.1048889160156, - "loss": 0.5213, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.23684076964855194, - "rewards/margins": 1.2525182962417603, - "rewards/rejected": -1.4893590211868286, + "epoch": 0.95, + "learning_rate": 3.8036925395629236e-07, + "logits/chosen": -2.512683391571045, + "logits/rejected": -2.4817757606506348, + "logps/chosen": -272.36614990234375, + "logps/rejected": -247.10476684570312, + "loss": 0.5046, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6125789880752563, + "rewards/margins": 0.970922589302063, + "rewards/rejected": -1.5835015773773193, "step": 1860 }, { - "epoch": 0.91, - "learning_rate": 3.8644589214621784e-07, - "logits/chosen": -2.793290376663208, - "logits/rejected": -2.834564685821533, - "logps/chosen": -292.1471252441406, - "logps/rejected": -259.4459228515625, - "loss": 0.5509, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.26219049096107483, - "rewards/margins": 1.8031482696533203, - "rewards/rejected": -1.5409576892852783, + "epoch": 0.95, + "learning_rate": 3.7942727957799546e-07, + "logits/chosen": -2.5154783725738525, + "logits/rejected": -2.5012450218200684, + "logps/chosen": -282.32952880859375, + "logps/rejected": -270.69427490234375, + "loss": 0.5542, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6044799089431763, + "rewards/margins": 1.4208053350448608, + "rewards/rejected": -2.025285243988037, "step": 1870 }, { - "epoch": 0.92, - "learning_rate": 3.855410785378212e-07, - "logits/chosen": -2.827361822128296, - "logits/rejected": -2.8536243438720703, - "logps/chosen": -287.46160888671875, - "logps/rejected": -239.70950317382812, - "loss": 0.6182, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.04908237233757973, - "rewards/margins": 1.6372028589248657, - "rewards/rejected": -1.5881205797195435, + "epoch": 0.96, + "learning_rate": 3.7848530519969856e-07, + "logits/chosen": -2.4879260063171387, + "logits/rejected": -2.457428455352783, + "logps/chosen": -272.4909973144531, + "logps/rejected": -241.22433471679688, + "loss": 0.636, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6453158855438232, + "rewards/margins": 1.0200417041778564, + "rewards/rejected": -1.6653575897216797, "step": 1880 }, { - "epoch": 0.92, - "learning_rate": 3.846362649294245e-07, - "logits/chosen": -2.8506338596343994, - "logits/rejected": -2.8581371307373047, - "logps/chosen": -266.163330078125, - "logps/rejected": -233.5065155029297, - "loss": 0.5449, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.36866846680641174, - "rewards/margins": 1.8890787363052368, - "rewards/rejected": -1.5204102993011475, + "epoch": 0.96, + "learning_rate": 3.7754333082140166e-07, + "logits/chosen": -2.5128307342529297, + "logits/rejected": -2.5684497356414795, + "logps/chosen": -336.8165588378906, + "logps/rejected": -291.71453857421875, + "loss": 0.5249, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2176159918308258, + "rewards/margins": 1.229262351989746, + "rewards/rejected": -1.4468783140182495, "step": 1890 }, { - "epoch": 0.93, - "learning_rate": 3.8373145132102785e-07, - "logits/chosen": -2.8329875469207764, - "logits/rejected": -2.8117282390594482, - "logps/chosen": -311.8463439941406, - "logps/rejected": -287.82696533203125, - "loss": 0.5443, + "epoch": 0.97, + "learning_rate": 3.7660135644310477e-07, + "logits/chosen": -2.528883695602417, + "logits/rejected": -2.5390408039093018, + "logps/chosen": -289.835205078125, + "logps/rejected": -272.76654052734375, + "loss": 0.5344, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.16949217021465302, - "rewards/margins": 1.7446651458740234, - "rewards/rejected": -1.9141572713851929, + "rewards/chosen": -0.27965977787971497, + "rewards/margins": 1.2389864921569824, + "rewards/rejected": -1.518646240234375, "step": 1900 }, { - "epoch": 0.93, - "eval_logits/chosen": -2.852118730545044, - "eval_logits/rejected": -2.84346342086792, - "eval_logps/chosen": -311.0306701660156, - "eval_logps/rejected": -284.84002685546875, - "eval_loss": 0.5322908163070679, - "eval_rewards/accuracies": 0.7702205777168274, - "eval_rewards/chosen": 0.38303330540657043, - "eval_rewards/margins": 1.9137194156646729, - "eval_rewards/rejected": -1.5306861400604248, - "eval_runtime": 304.3687, - "eval_samples_per_second": 7.136, - "eval_steps_per_second": 0.447, + "epoch": 0.97, + "eval_logits/chosen": -2.599806547164917, + "eval_logits/rejected": -2.6680245399475098, + "eval_logps/chosen": -282.03875732421875, + "eval_logps/rejected": -276.2930908203125, + "eval_loss": 0.5602597594261169, + "eval_rewards/accuracies": 0.7270992398262024, + "eval_rewards/chosen": -0.37883004546165466, + "eval_rewards/margins": 1.436964750289917, + "eval_rewards/rejected": -1.8157949447631836, + "eval_runtime": 297.1212, + "eval_samples_per_second": 7.021, + "eval_steps_per_second": 0.441, "step": 1900 }, { - "epoch": 0.93, - "learning_rate": 3.828266377126312e-07, - "logits/chosen": -2.8308136463165283, - "logits/rejected": -2.815786123275757, - "logps/chosen": -314.32916259765625, - "logps/rejected": -268.0843505859375, - "loss": 0.5881, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.2804369330406189, - "rewards/margins": 2.1297898292541504, - "rewards/rejected": -1.8493530750274658, + "epoch": 0.97, + "learning_rate": 3.756593820648078e-07, + "logits/chosen": -2.472036361694336, + "logits/rejected": -2.496267318725586, + "logps/chosen": -292.5258483886719, + "logps/rejected": -290.6319580078125, + "loss": 0.5371, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.3897474408149719, + "rewards/margins": 1.1748746633529663, + "rewards/rejected": -1.564622163772583, "step": 1910 }, { - "epoch": 0.94, - "learning_rate": 3.819218241042345e-07, - "logits/chosen": -2.8731918334960938, - "logits/rejected": -2.868706464767456, - "logps/chosen": -298.0368347167969, - "logps/rejected": -260.85943603515625, - "loss": 0.53, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.05784381553530693, - "rewards/margins": 1.4109447002410889, - "rewards/rejected": -1.4687883853912354, + "epoch": 0.98, + "learning_rate": 3.747174076865109e-07, + "logits/chosen": -2.4090096950531006, + "logits/rejected": -2.475609064102173, + "logps/chosen": -296.208740234375, + "logps/rejected": -296.7442626953125, + "loss": 0.6168, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4877321720123291, + "rewards/margins": 1.1195850372314453, + "rewards/rejected": -1.6073172092437744, "step": 1920 }, { - "epoch": 0.94, - "learning_rate": 3.8101701049583785e-07, - "logits/chosen": -2.868292808532715, - "logits/rejected": -2.8513400554656982, - "logps/chosen": -320.32427978515625, - "logps/rejected": -317.34735107421875, - "loss": 0.5216, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.22917433083057404, - "rewards/margins": 2.1150217056274414, - "rewards/rejected": -2.344196081161499, + "epoch": 0.98, + "learning_rate": 3.7377543330821396e-07, + "logits/chosen": -2.420328140258789, + "logits/rejected": -2.475395679473877, + "logps/chosen": -310.14166259765625, + "logps/rejected": -292.9002685546875, + "loss": 0.5162, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.709976077079773, + "rewards/margins": 1.334611415863037, + "rewards/rejected": -2.0445871353149414, "step": 1930 }, { - "epoch": 0.95, - "learning_rate": 3.801121968874412e-07, - "logits/chosen": -2.9521877765655518, - "logits/rejected": -2.9392201900482178, - "logps/chosen": -319.16693115234375, - "logps/rejected": -331.2082824707031, - "loss": 0.4392, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.118543341755867, - "rewards/margins": 2.0723860263824463, - "rewards/rejected": -2.190929412841797, + "epoch": 0.99, + "learning_rate": 3.7283345892991706e-07, + "logits/chosen": -2.4980602264404297, + "logits/rejected": -2.55902361869812, + "logps/chosen": -336.90155029296875, + "logps/rejected": -322.50677490234375, + "loss": 0.5316, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.526197075843811, + "rewards/margins": 1.3391772508621216, + "rewards/rejected": -1.865374207496643, "step": 1940 }, { - "epoch": 0.95, - "learning_rate": 3.792073832790445e-07, - "logits/chosen": -2.966381549835205, - "logits/rejected": -2.925827980041504, - "logps/chosen": -326.33807373046875, - "logps/rejected": -239.7138671875, - "loss": 0.5256, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.09393619000911713, - "rewards/margins": 1.773887038230896, - "rewards/rejected": -1.8678230047225952, + "epoch": 0.99, + "learning_rate": 3.7189148455162016e-07, + "logits/chosen": -2.44006609916687, + "logits/rejected": -2.403160333633423, + "logps/chosen": -248.7161865234375, + "logps/rejected": -256.5013732910156, + "loss": 0.5521, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4066084027290344, + "rewards/margins": 1.3497653007507324, + "rewards/rejected": -1.756373643875122, "step": 1950 }, { - "epoch": 0.96, - "learning_rate": 3.783025696706478e-07, - "logits/chosen": -2.855555772781372, - "logits/rejected": -2.881895065307617, - "logps/chosen": -340.6606750488281, - "logps/rejected": -269.9922790527344, - "loss": 0.4386, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.4827759861946106, - "rewards/margins": 1.521819829940796, - "rewards/rejected": -2.0045957565307617, + "epoch": 1.0, + "learning_rate": 3.7094951017332326e-07, + "logits/chosen": -2.5976829528808594, + "logits/rejected": -2.587454319000244, + "logps/chosen": -296.61785888671875, + "logps/rejected": -235.3370819091797, + "loss": 0.5499, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2806735038757324, + "rewards/margins": 1.0782190561294556, + "rewards/rejected": -1.358892560005188, "step": 1960 }, { - "epoch": 0.96, - "learning_rate": 3.7739775606225114e-07, - "logits/chosen": -2.9031760692596436, - "logits/rejected": -2.868932008743286, - "logps/chosen": -319.40472412109375, - "logps/rejected": -235.11178588867188, - "loss": 0.57, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.03887303173542023, - "rewards/margins": 1.5843039751052856, - "rewards/rejected": -1.6231769323349, + "epoch": 1.0, + "learning_rate": 3.7000753579502636e-07, + "logits/chosen": -2.4377546310424805, + "logits/rejected": -2.4840946197509766, + "logps/chosen": -256.7413330078125, + "logps/rejected": -288.8653869628906, + "loss": 0.417, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.8871258497238159, + "rewards/margins": 3.1716504096984863, + "rewards/rejected": -2.284524440765381, "step": 1970 }, { - "epoch": 0.97, - "learning_rate": 3.764929424538545e-07, - "logits/chosen": -2.8314356803894043, - "logits/rejected": -2.8535282611846924, - "logps/chosen": -283.8772888183594, - "logps/rejected": -274.3182373046875, - "loss": 0.5516, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.09604203701019287, - "rewards/margins": 2.0541794300079346, - "rewards/rejected": -2.150221586227417, + "epoch": 1.01, + "learning_rate": 3.6906556141672946e-07, + "logits/chosen": -2.5575382709503174, + "logits/rejected": -2.507671356201172, + "logps/chosen": -298.3926086425781, + "logps/rejected": -276.83197021484375, + "loss": 0.104, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7850697040557861, + "rewards/margins": 4.537423610687256, + "rewards/rejected": -2.7523539066314697, "step": 1980 }, { - "epoch": 0.97, - "learning_rate": 3.755881288454578e-07, - "logits/chosen": -2.892510175704956, - "logits/rejected": -2.893159866333008, - "logps/chosen": -340.0617370605469, - "logps/rejected": -313.38348388671875, - "loss": 0.5077, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.5523239970207214, - "rewards/margins": 1.5753610134124756, - "rewards/rejected": -2.1276845932006836, + "epoch": 1.01, + "learning_rate": 3.6812358703843256e-07, + "logits/chosen": -2.5167181491851807, + "logits/rejected": -2.4053378105163574, + "logps/chosen": -250.5689239501953, + "logps/rejected": -310.2710876464844, + "loss": 0.102, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.1603522300720215, + "rewards/margins": 4.360447883605957, + "rewards/rejected": -3.2000954151153564, "step": 1990 }, { - "epoch": 0.98, - "learning_rate": 3.7468331523706115e-07, - "logits/chosen": -2.789858341217041, - "logits/rejected": -2.8142800331115723, - "logps/chosen": -322.43731689453125, - "logps/rejected": -272.51214599609375, - "loss": 0.5506, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -0.04578378051519394, - "rewards/margins": 2.161404848098755, - "rewards/rejected": -2.207188367843628, + "epoch": 1.02, + "learning_rate": 3.6718161266013566e-07, + "logits/chosen": -2.483384609222412, + "logits/rejected": -2.6710855960845947, + "logps/chosen": -245.9284210205078, + "logps/rejected": -308.3492431640625, + "loss": 0.0925, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.2592953443527222, + "rewards/margins": 5.212327480316162, + "rewards/rejected": -3.9530320167541504, "step": 2000 }, { - "epoch": 0.98, - "eval_logits/chosen": -2.849094867706299, - "eval_logits/rejected": -2.8470375537872314, - "eval_logps/chosen": -316.5567321777344, - "eval_logps/rejected": -290.4926452636719, - "eval_loss": 0.5221505761146545, - "eval_rewards/accuracies": 0.7536764740943909, - "eval_rewards/chosen": -0.16957171261310577, - "eval_rewards/margins": 1.926379919052124, - "eval_rewards/rejected": -2.095951557159424, - "eval_runtime": 304.4163, - "eval_samples_per_second": 7.135, - "eval_steps_per_second": 0.447, + "epoch": 1.02, + "eval_logits/chosen": -2.5824897289276123, + "eval_logits/rejected": -2.6520135402679443, + "eval_logps/chosen": -282.8787841796875, + "eval_logps/rejected": -279.4119567871094, + "eval_loss": 0.5587373375892639, + "eval_rewards/accuracies": 0.7404580116271973, + "eval_rewards/chosen": -0.46283194422721863, + "eval_rewards/margins": 1.664847493171692, + "eval_rewards/rejected": -2.1276795864105225, + "eval_runtime": 297.407, + "eval_samples_per_second": 7.014, + "eval_steps_per_second": 0.44, "step": 2000 }, { - "epoch": 0.98, - "learning_rate": 3.737785016286645e-07, - "logits/chosen": -2.841965675354004, - "logits/rejected": -2.8245298862457275, - "logps/chosen": -292.35235595703125, - "logps/rejected": -272.5868225097656, - "loss": 0.646, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.4552840292453766, - "rewards/margins": 1.566802740097046, - "rewards/rejected": -2.0220868587493896, + "epoch": 1.02, + "learning_rate": 3.662396382818387e-07, + "logits/chosen": -2.466040849685669, + "logits/rejected": -2.4850330352783203, + "logps/chosen": -285.1557312011719, + "logps/rejected": -314.3226623535156, + "loss": 0.1259, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.797186017036438, + "rewards/margins": 4.633586883544922, + "rewards/rejected": -3.8364005088806152, "step": 2010 }, { - "epoch": 0.99, - "learning_rate": 3.728736880202678e-07, - "logits/chosen": -2.915170431137085, - "logits/rejected": -2.8954920768737793, - "logps/chosen": -362.6797790527344, - "logps/rejected": -269.60479736328125, - "loss": 0.5034, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.08798433095216751, - "rewards/margins": 2.127814292907715, - "rewards/rejected": -2.039830207824707, + "epoch": 1.03, + "learning_rate": 3.652976639035418e-07, + "logits/chosen": -2.433439254760742, + "logits/rejected": -2.4428954124450684, + "logps/chosen": -354.7630920410156, + "logps/rejected": -337.88360595703125, + "loss": 0.0888, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.6825096607208252, + "rewards/margins": 6.562380313873291, + "rewards/rejected": -4.879870891571045, "step": 2020 }, { - "epoch": 0.99, - "learning_rate": 3.7196887441187115e-07, - "logits/chosen": -2.878261089324951, - "logits/rejected": -2.865908145904541, - "logps/chosen": -271.31756591796875, - "logps/rejected": -227.75411987304688, - "loss": 0.6319, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.16957125067710876, - "rewards/margins": 1.5267400741577148, - "rewards/rejected": -1.696311593055725, + "epoch": 1.03, + "learning_rate": 3.6435568952524486e-07, + "logits/chosen": -2.388587474822998, + "logits/rejected": -2.4829463958740234, + "logps/chosen": -251.0550537109375, + "logps/rejected": -301.1805419921875, + "loss": 0.0645, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6248880624771118, + "rewards/margins": 6.260207176208496, + "rewards/rejected": -4.635318279266357, "step": 2030 }, { - "epoch": 1.0, - "learning_rate": 3.710640608034745e-07, - "logits/chosen": -2.86537504196167, - "logits/rejected": -2.87638783454895, - "logps/chosen": -305.21771240234375, - "logps/rejected": -254.0093536376953, - "loss": 0.5943, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.04048401117324829, - "rewards/margins": 1.65225350856781, - "rewards/rejected": -1.692737340927124, + "epoch": 1.04, + "learning_rate": 3.6341371514694796e-07, + "logits/chosen": -2.5493760108947754, + "logits/rejected": -2.528698682785034, + "logps/chosen": -235.96932983398438, + "logps/rejected": -289.5627746582031, + "loss": 0.0544, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5340107679367065, + "rewards/margins": 4.596531391143799, + "rewards/rejected": -4.0625200271606445, "step": 2040 }, { - "epoch": 1.0, - "learning_rate": 3.7015924719507777e-07, - "logits/chosen": -2.901395320892334, - "logits/rejected": -2.908327579498291, - "logps/chosen": -363.0475769042969, - "logps/rejected": -320.16754150390625, - "loss": 0.4642, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.9380462765693665, - "rewards/margins": 3.346102476119995, - "rewards/rejected": -2.4080562591552734, + "epoch": 1.04, + "learning_rate": 3.6247174076865106e-07, + "logits/chosen": -2.3305134773254395, + "logits/rejected": -2.384181499481201, + "logps/chosen": -265.8026428222656, + "logps/rejected": -303.68646240234375, + "loss": 0.1032, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6562206745147705, + "rewards/margins": 5.506114959716797, + "rewards/rejected": -3.8498940467834473, "step": 2050 }, { - "epoch": 1.01, - "learning_rate": 3.692544335866811e-07, - "logits/chosen": -2.903226613998413, - "logits/rejected": -2.8930602073669434, - "logps/chosen": -291.3240661621094, - "logps/rejected": -291.161376953125, - "loss": 0.1076, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.838857889175415, - "rewards/margins": 5.7334794998168945, - "rewards/rejected": -3.8946216106414795, + "epoch": 1.05, + "learning_rate": 3.6152976639035416e-07, + "logits/chosen": -2.49692440032959, + "logits/rejected": -2.456477403640747, + "logps/chosen": -240.740966796875, + "logps/rejected": -269.8097229003906, + "loss": 0.1098, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8934639096260071, + "rewards/margins": 4.806546688079834, + "rewards/rejected": -3.9130825996398926, "step": 2060 }, { - "epoch": 1.01, - "learning_rate": 3.6834961997828444e-07, - "logits/chosen": -2.845395565032959, - "logits/rejected": -2.8509373664855957, - "logps/chosen": -344.9109191894531, - "logps/rejected": -392.5731201171875, - "loss": 0.0635, - "rewards/accuracies": 1.0, - "rewards/chosen": 2.0180304050445557, - "rewards/margins": 6.360495090484619, - "rewards/rejected": -4.342464447021484, + "epoch": 1.05, + "learning_rate": 3.6058779201205726e-07, + "logits/chosen": -2.4669885635375977, + "logits/rejected": -2.393951892852783, + "logps/chosen": -281.805419921875, + "logps/rejected": -328.47344970703125, + "loss": 0.0874, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.6799099445343018, + "rewards/margins": 6.671219825744629, + "rewards/rejected": -4.9913105964660645, "step": 2070 }, { - "epoch": 1.02, - "learning_rate": 3.674448063698878e-07, - "logits/chosen": -2.792494535446167, - "logits/rejected": -2.7958524227142334, - "logps/chosen": -310.73590087890625, - "logps/rejected": -303.03125, - "loss": 0.0947, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.3329970836639404, - "rewards/margins": 5.806687355041504, - "rewards/rejected": -4.473690986633301, + "epoch": 1.06, + "learning_rate": 3.5964581763376036e-07, + "logits/chosen": -2.4482600688934326, + "logits/rejected": -2.403702974319458, + "logps/chosen": -224.82803344726562, + "logps/rejected": -269.99334716796875, + "loss": 0.0894, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.2562118768692017, + "rewards/margins": 5.40142822265625, + "rewards/rejected": -4.145216464996338, "step": 2080 }, { - "epoch": 1.02, - "learning_rate": 3.665399927614911e-07, - "logits/chosen": -2.8203608989715576, - "logits/rejected": -2.8439743518829346, - "logps/chosen": -314.5364990234375, - "logps/rejected": -286.63214111328125, - "loss": 0.1005, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.270831823348999, - "rewards/margins": 5.476805686950684, - "rewards/rejected": -4.205974578857422, + "epoch": 1.06, + "learning_rate": 3.587038432554634e-07, + "logits/chosen": -2.332878828048706, + "logits/rejected": -2.235696315765381, + "logps/chosen": -278.15740966796875, + "logps/rejected": -327.5746154785156, + "loss": 0.0823, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4940156936645508, + "rewards/margins": 5.760683536529541, + "rewards/rejected": -4.26666784286499, "step": 2090 }, { - "epoch": 1.03, - "learning_rate": 3.6563517915309445e-07, - "logits/chosen": -2.8499157428741455, - "logits/rejected": -2.8754732608795166, - "logps/chosen": -334.3821105957031, - "logps/rejected": -330.5299377441406, - "loss": 0.1276, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.4577178955078125, - "rewards/margins": 5.256899833679199, - "rewards/rejected": -3.799182415008545, + "epoch": 1.07, + "learning_rate": 3.577618688771665e-07, + "logits/chosen": -2.2535722255706787, + "logits/rejected": -2.3259873390197754, + "logps/chosen": -257.2765197753906, + "logps/rejected": -300.7965087890625, + "loss": 0.112, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.2972657680511475, + "rewards/margins": 5.322561740875244, + "rewards/rejected": -4.025295257568359, "step": 2100 }, { - "epoch": 1.03, - "eval_logits/chosen": -2.844696521759033, - "eval_logits/rejected": -2.8374319076538086, - "eval_logps/chosen": -315.4477844238281, - "eval_logps/rejected": -292.6927795410156, - "eval_loss": 0.5308383107185364, - "eval_rewards/accuracies": 0.7536764740943909, - "eval_rewards/chosen": -0.05867999419569969, - "eval_rewards/margins": 2.257284164428711, - "eval_rewards/rejected": -2.315964460372925, - "eval_runtime": 304.4504, - "eval_samples_per_second": 7.134, - "eval_steps_per_second": 0.447, + "epoch": 1.07, + "eval_logits/chosen": -2.50936222076416, + "eval_logits/rejected": -2.5722339153289795, + "eval_logps/chosen": -285.0382995605469, + "eval_logps/rejected": -284.0433349609375, + "eval_loss": 0.5730677843093872, + "eval_rewards/accuracies": 0.7480915784835815, + "eval_rewards/chosen": -0.6787797808647156, + "eval_rewards/margins": 1.9120395183563232, + "eval_rewards/rejected": -2.5908188819885254, + "eval_runtime": 302.0143, + "eval_samples_per_second": 6.907, + "eval_steps_per_second": 0.434, "step": 2100 }, { - "epoch": 1.03, - "learning_rate": 3.647303655446978e-07, - "logits/chosen": -2.8706328868865967, - "logits/rejected": -2.8571372032165527, - "logps/chosen": -350.6748962402344, - "logps/rejected": -348.13372802734375, - "loss": 0.1149, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.3321328163146973, - "rewards/margins": 6.214634895324707, - "rewards/rejected": -4.882501602172852, + "epoch": 1.07, + "learning_rate": 3.568198944988696e-07, + "logits/chosen": -2.372765064239502, + "logits/rejected": -2.356163501739502, + "logps/chosen": -244.1463623046875, + "logps/rejected": -309.8556823730469, + "loss": 0.0817, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.721108078956604, + "rewards/margins": 5.2660932540893555, + "rewards/rejected": -4.544985294342041, "step": 2110 }, { - "epoch": 1.04, - "learning_rate": 3.638255519363011e-07, - "logits/chosen": -2.869586229324341, - "logits/rejected": -2.8794963359832764, - "logps/chosen": -343.129150390625, - "logps/rejected": -354.57281494140625, - "loss": 0.0827, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 2.1121039390563965, - "rewards/margins": 6.70623779296875, - "rewards/rejected": -4.594133377075195, + "epoch": 1.08, + "learning_rate": 3.558779201205727e-07, + "logits/chosen": -2.3378894329071045, + "logits/rejected": -2.524498462677002, + "logps/chosen": -239.94595336914062, + "logps/rejected": -288.84918212890625, + "loss": 0.0831, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3765751123428345, + "rewards/margins": 6.299831867218018, + "rewards/rejected": -4.9232563972473145, "step": 2120 }, { - "epoch": 1.04, - "learning_rate": 3.6292073832790445e-07, - "logits/chosen": -2.8766379356384277, - "logits/rejected": -2.88783597946167, - "logps/chosen": -333.79254150390625, - "logps/rejected": -280.58673095703125, - "loss": 0.098, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.8204046487808228, - "rewards/margins": 5.815884113311768, - "rewards/rejected": -3.9954800605773926, + "epoch": 1.08, + "learning_rate": 3.549359457422758e-07, + "logits/chosen": -2.316340923309326, + "logits/rejected": -2.3115782737731934, + "logps/chosen": -269.43890380859375, + "logps/rejected": -307.49481201171875, + "loss": 0.0837, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.7511188983917236, + "rewards/margins": 5.8343915939331055, + "rewards/rejected": -4.083272933959961, "step": 2130 }, { - "epoch": 1.05, - "learning_rate": 3.6201592471950774e-07, - "logits/chosen": -2.9168217182159424, - "logits/rejected": -2.8879613876342773, - "logps/chosen": -334.43511962890625, - "logps/rejected": -276.46722412109375, - "loss": 0.1024, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.22162926197052, - "rewards/margins": 5.542742729187012, - "rewards/rejected": -4.321113586425781, + "epoch": 1.09, + "learning_rate": 3.539939713639789e-07, + "logits/chosen": -2.3972020149230957, + "logits/rejected": -2.362435817718506, + "logps/chosen": -242.6447296142578, + "logps/rejected": -309.8039245605469, + "loss": 0.0857, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9248175621032715, + "rewards/margins": 5.774969577789307, + "rewards/rejected": -4.850151538848877, "step": 2140 }, { - "epoch": 1.05, - "learning_rate": 3.6111111111111107e-07, - "logits/chosen": -2.860757827758789, - "logits/rejected": -2.8546247482299805, - "logps/chosen": -292.5542297363281, - "logps/rejected": -360.7430725097656, - "loss": 0.1022, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.7083183526992798, - "rewards/margins": 6.2340168952941895, - "rewards/rejected": -4.525698661804199, + "epoch": 1.09, + "learning_rate": 3.53051996985682e-07, + "logits/chosen": -2.3665599822998047, + "logits/rejected": -2.4754738807678223, + "logps/chosen": -251.0576171875, + "logps/rejected": -287.749267578125, + "loss": 0.0704, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6899253129959106, + "rewards/margins": 5.328171253204346, + "rewards/rejected": -4.638245582580566, "step": 2150 }, { - "epoch": 1.06, - "learning_rate": 3.602062975027144e-07, - "logits/chosen": -2.856623411178589, - "logits/rejected": -2.8416271209716797, - "logps/chosen": -316.9689025878906, - "logps/rejected": -282.3172607421875, - "loss": 0.1381, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.1407171487808228, - "rewards/margins": 5.505871772766113, - "rewards/rejected": -4.3651533126831055, + "epoch": 1.1, + "learning_rate": 3.5211002260738506e-07, + "logits/chosen": -2.524543046951294, + "logits/rejected": -2.5408785343170166, + "logps/chosen": -281.8375244140625, + "logps/rejected": -346.6162414550781, + "loss": 0.0891, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.8872039914131165, + "rewards/margins": 6.403290748596191, + "rewards/rejected": -5.516086578369141, "step": 2160 }, { - "epoch": 1.06, - "learning_rate": 3.5930148389431774e-07, - "logits/chosen": -2.8782546520233154, - "logits/rejected": -2.8700966835021973, - "logps/chosen": -294.6328125, - "logps/rejected": -254.8290557861328, - "loss": 0.1215, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.9199129343032837, - "rewards/margins": 5.449519634246826, - "rewards/rejected": -3.529606342315674, + "epoch": 1.1, + "learning_rate": 3.511680482290881e-07, + "logits/chosen": -2.4190125465393066, + "logits/rejected": -2.5307183265686035, + "logps/chosen": -280.59991455078125, + "logps/rejected": -276.5904846191406, + "loss": 0.0887, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.7298386096954346, + "rewards/margins": 4.927010536193848, + "rewards/rejected": -4.197172164916992, "step": 2170 }, { - "epoch": 1.06, - "learning_rate": 3.583966702859211e-07, - "logits/chosen": -2.8176722526550293, - "logits/rejected": -2.8288466930389404, - "logps/chosen": -276.1488952636719, - "logps/rejected": -284.7532958984375, - "loss": 0.1299, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.1107614040374756, - "rewards/margins": 5.063755512237549, - "rewards/rejected": -3.952993392944336, + "epoch": 1.11, + "learning_rate": 3.502260738507912e-07, + "logits/chosen": -2.5079550743103027, + "logits/rejected": -2.544100046157837, + "logps/chosen": -240.7677001953125, + "logps/rejected": -281.4288024902344, + "loss": 0.0963, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6959851980209351, + "rewards/margins": 4.724411487579346, + "rewards/rejected": -4.028426170349121, "step": 2180 }, { - "epoch": 1.07, - "learning_rate": 3.574918566775244e-07, - "logits/chosen": -2.8044848442077637, - "logits/rejected": -2.8305211067199707, - "logps/chosen": -377.8848876953125, - "logps/rejected": -337.43402099609375, - "loss": 0.1026, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.5271100997924805, - "rewards/margins": 6.236946105957031, - "rewards/rejected": -4.709836483001709, + "epoch": 1.11, + "learning_rate": 3.492840994724943e-07, + "logits/chosen": -2.4321839809417725, + "logits/rejected": -2.495434284210205, + "logps/chosen": -269.27874755859375, + "logps/rejected": -292.94769287109375, + "loss": 0.0849, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8701983690261841, + "rewards/margins": 4.984339714050293, + "rewards/rejected": -4.11414098739624, "step": 2190 }, { - "epoch": 1.07, - "learning_rate": 3.5658704306912775e-07, - "logits/chosen": -2.8567607402801514, - "logits/rejected": -2.858031988143921, - "logps/chosen": -374.4449462890625, - "logps/rejected": -334.5423889160156, - "loss": 0.1082, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.7140470743179321, - "rewards/margins": 6.78305721282959, - "rewards/rejected": -5.0690107345581055, + "epoch": 1.12, + "learning_rate": 3.483421250941974e-07, + "logits/chosen": -2.420361280441284, + "logits/rejected": -2.4040703773498535, + "logps/chosen": -263.71343994140625, + "logps/rejected": -303.37493896484375, + "loss": 0.0539, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6098963618278503, + "rewards/margins": 5.93268346786499, + "rewards/rejected": -5.322787284851074, "step": 2200 }, { - "epoch": 1.07, - "eval_logits/chosen": -2.850436210632324, - "eval_logits/rejected": -2.84183931350708, - "eval_logps/chosen": -316.6463317871094, - "eval_logps/rejected": -294.66900634765625, - "eval_loss": 0.5517958998680115, - "eval_rewards/accuracies": 0.7647058963775635, - "eval_rewards/chosen": -0.17853271961212158, - "eval_rewards/margins": 2.335050582885742, - "eval_rewards/rejected": -2.5135834217071533, - "eval_runtime": 304.4832, - "eval_samples_per_second": 7.133, - "eval_steps_per_second": 0.447, + "epoch": 1.12, + "eval_logits/chosen": -2.5303497314453125, + "eval_logits/rejected": -2.5936505794525146, + "eval_logps/chosen": -289.07073974609375, + "eval_logps/rejected": -287.4447937011719, + "eval_loss": 0.5868561267852783, + "eval_rewards/accuracies": 0.7366412281990051, + "eval_rewards/chosen": -1.0820242166519165, + "eval_rewards/margins": 1.8489437103271484, + "eval_rewards/rejected": -2.9309680461883545, + "eval_runtime": 297.2334, + "eval_samples_per_second": 7.018, + "eval_steps_per_second": 0.441, "step": 2200 }, { - "epoch": 1.08, - "learning_rate": 3.556822294607311e-07, - "logits/chosen": -2.899690628051758, - "logits/rejected": -2.8651416301727295, - "logps/chosen": -331.7864685058594, - "logps/rejected": -303.08349609375, - "loss": 0.1495, - "rewards/accuracies": 1.0, - "rewards/chosen": 2.0187602043151855, - "rewards/margins": 6.140603065490723, - "rewards/rejected": -4.121843338012695, + "epoch": 1.12, + "learning_rate": 3.474001507159005e-07, + "logits/chosen": -2.474795341491699, + "logits/rejected": -2.452157497406006, + "logps/chosen": -250.7163848876953, + "logps/rejected": -301.6701354980469, + "loss": 0.0699, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8442606925964355, + "rewards/margins": 5.76595401763916, + "rewards/rejected": -4.921692848205566, "step": 2210 }, { - "epoch": 1.08, - "learning_rate": 3.547774158523344e-07, - "logits/chosen": -2.8387465476989746, - "logits/rejected": -2.8396172523498535, - "logps/chosen": -341.380615234375, - "logps/rejected": -307.65509033203125, - "loss": 0.0981, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.9305038452148438, - "rewards/margins": 7.133988380432129, - "rewards/rejected": -5.203484535217285, + "epoch": 1.13, + "learning_rate": 3.464581763376036e-07, + "logits/chosen": -2.435446262359619, + "logits/rejected": -2.3850350379943848, + "logps/chosen": -297.05499267578125, + "logps/rejected": -325.5220947265625, + "loss": 0.0517, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8659135103225708, + "rewards/margins": 6.299160957336426, + "rewards/rejected": -5.4332475662231445, "step": 2220 }, { - "epoch": 1.09, - "learning_rate": 3.538726022439377e-07, - "logits/chosen": -2.880861759185791, - "logits/rejected": -2.9054319858551025, - "logps/chosen": -315.35821533203125, - "logps/rejected": -344.88800048828125, - "loss": 0.0737, + "epoch": 1.13, + "learning_rate": 3.455162019593067e-07, + "logits/chosen": -2.297966480255127, + "logits/rejected": -2.2471892833709717, + "logps/chosen": -314.8829650878906, + "logps/rejected": -365.14385986328125, + "loss": 0.1462, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.302333116531372, - "rewards/margins": 7.454858303070068, - "rewards/rejected": -5.152525424957275, + "rewards/chosen": 0.6976747512817383, + "rewards/margins": 6.606612205505371, + "rewards/rejected": -5.908937931060791, "step": 2230 }, { - "epoch": 1.09, - "learning_rate": 3.5296778863554104e-07, - "logits/chosen": -2.8208775520324707, - "logits/rejected": -2.837143898010254, - "logps/chosen": -302.15020751953125, - "logps/rejected": -340.7874450683594, - "loss": 0.1, + "epoch": 1.14, + "learning_rate": 3.445742275810098e-07, + "logits/chosen": -2.43511700630188, + "logits/rejected": -2.417332410812378, + "logps/chosen": -296.8207092285156, + "logps/rejected": -336.8685607910156, + "loss": 0.0711, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 2.393465280532837, - "rewards/margins": 7.227072715759277, - "rewards/rejected": -4.8336076736450195, + "rewards/chosen": 1.1867338418960571, + "rewards/margins": 7.082969665527344, + "rewards/rejected": -5.896236419677734, "step": 2240 }, { - "epoch": 1.1, - "learning_rate": 3.5206297502714437e-07, - "logits/chosen": -2.7523281574249268, - "logits/rejected": -2.7220346927642822, - "logps/chosen": -317.53912353515625, - "logps/rejected": -290.631591796875, - "loss": 0.0705, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.3900606632232666, - "rewards/margins": 6.789495944976807, - "rewards/rejected": -5.399435520172119, + "epoch": 1.14, + "learning_rate": 3.4363225320271286e-07, + "logits/chosen": -2.4127955436706543, + "logits/rejected": -2.417238235473633, + "logps/chosen": -281.3343200683594, + "logps/rejected": -308.0960693359375, + "loss": 0.1002, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.9532784223556519, + "rewards/margins": 5.915719985961914, + "rewards/rejected": -4.962441444396973, "step": 2250 }, { - "epoch": 1.1, - "learning_rate": 3.511581614187477e-07, - "logits/chosen": -2.836195707321167, - "logits/rejected": -2.880373477935791, - "logps/chosen": -326.37164306640625, - "logps/rejected": -358.9941711425781, - "loss": 0.1046, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.4366786479949951, - "rewards/margins": 6.438986301422119, - "rewards/rejected": -5.002306938171387, + "epoch": 1.15, + "learning_rate": 3.4269027882441596e-07, + "logits/chosen": -2.3315882682800293, + "logits/rejected": -2.4039595127105713, + "logps/chosen": -268.7440490722656, + "logps/rejected": -299.9024658203125, + "loss": 0.0782, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6864622831344604, + "rewards/margins": 5.834595203399658, + "rewards/rejected": -5.148133277893066, "step": 2260 }, { - "epoch": 1.11, - "learning_rate": 3.5025334781035104e-07, - "logits/chosen": -2.697096109390259, - "logits/rejected": -2.6917271614074707, - "logps/chosen": -313.08880615234375, - "logps/rejected": -313.1694641113281, - "loss": 0.0696, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.5296317338943481, - "rewards/margins": 6.273630142211914, - "rewards/rejected": -4.7439985275268555, + "epoch": 1.15, + "learning_rate": 3.4174830444611906e-07, + "logits/chosen": -2.315544366836548, + "logits/rejected": -2.394622802734375, + "logps/chosen": -326.4775695800781, + "logps/rejected": -344.4993896484375, + "loss": 0.0859, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2968065738677979, + "rewards/margins": 5.967825412750244, + "rewards/rejected": -4.671019077301025, "step": 2270 }, { - "epoch": 1.11, - "learning_rate": 3.493485342019544e-07, - "logits/chosen": -2.731708288192749, - "logits/rejected": -2.7655131816864014, - "logps/chosen": -313.3270568847656, - "logps/rejected": -336.34423828125, - "loss": 0.1033, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.6183414459228516, - "rewards/margins": 7.281188011169434, - "rewards/rejected": -5.66284704208374, + "epoch": 1.16, + "learning_rate": 3.4080633006782216e-07, + "logits/chosen": -2.3206300735473633, + "logits/rejected": -2.3000760078430176, + "logps/chosen": -260.9918212890625, + "logps/rejected": -287.2997131347656, + "loss": 0.099, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5387351512908936, + "rewards/margins": 5.4082255363464355, + "rewards/rejected": -4.869490146636963, "step": 2280 }, { - "epoch": 1.12, - "learning_rate": 3.484437205935577e-07, - "logits/chosen": -2.8379099369049072, - "logits/rejected": -2.8352649211883545, - "logps/chosen": -318.93310546875, - "logps/rejected": -365.71258544921875, - "loss": 0.0983, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.085160493850708, - "rewards/margins": 6.662481784820557, - "rewards/rejected": -5.5773210525512695, + "epoch": 1.16, + "learning_rate": 3.398643556895252e-07, + "logits/chosen": -2.3872666358947754, + "logits/rejected": -2.346640110015869, + "logps/chosen": -244.3435821533203, + "logps/rejected": -296.1825256347656, + "loss": 0.0937, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.8320892453193665, + "rewards/margins": 5.071323871612549, + "rewards/rejected": -4.2392354011535645, "step": 2290 }, { - "epoch": 1.12, - "learning_rate": 3.4753890698516105e-07, - "logits/chosen": -2.7247507572174072, - "logits/rejected": -2.766514301300049, - "logps/chosen": -323.1333312988281, - "logps/rejected": -338.3423156738281, - "loss": 0.1003, + "epoch": 1.17, + "learning_rate": 3.389223813112283e-07, + "logits/chosen": -2.3533055782318115, + "logits/rejected": -2.340881824493408, + "logps/chosen": -234.72256469726562, + "logps/rejected": -282.35626220703125, + "loss": 0.0811, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.9766772985458374, - "rewards/margins": 7.123917579650879, - "rewards/rejected": -5.147240161895752, + "rewards/chosen": 0.8372732400894165, + "rewards/margins": 5.791101932525635, + "rewards/rejected": -4.953829288482666, "step": 2300 }, { - "epoch": 1.12, - "eval_logits/chosen": -2.8382694721221924, - "eval_logits/rejected": -2.824812650680542, - "eval_logps/chosen": -317.0865478515625, - "eval_logps/rejected": -297.229736328125, - "eval_loss": 0.5922148823738098, - "eval_rewards/accuracies": 0.7610294222831726, - "eval_rewards/chosen": -0.22255387902259827, - "eval_rewards/margins": 2.5471067428588867, - "eval_rewards/rejected": -2.769660711288452, - "eval_runtime": 303.9593, - "eval_samples_per_second": 7.146, - "eval_steps_per_second": 0.447, + "epoch": 1.17, + "eval_logits/chosen": -2.4560341835021973, + "eval_logits/rejected": -2.513721466064453, + "eval_logps/chosen": -286.5822448730469, + "eval_logps/rejected": -285.3392028808594, + "eval_loss": 0.6306248307228088, + "eval_rewards/accuracies": 0.7423664331436157, + "eval_rewards/chosen": -0.8331778645515442, + "eval_rewards/margins": 1.88722825050354, + "eval_rewards/rejected": -2.7204058170318604, + "eval_runtime": 301.8255, + "eval_samples_per_second": 6.911, + "eval_steps_per_second": 0.434, "step": 2300 }, { - "epoch": 1.13, - "learning_rate": 3.466340933767644e-07, - "logits/chosen": -2.8811392784118652, - "logits/rejected": -2.8737263679504395, - "logps/chosen": -362.6997985839844, - "logps/rejected": -377.6747131347656, - "loss": 0.1215, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.6323480606079102, - "rewards/margins": 6.082724094390869, - "rewards/rejected": -4.450376033782959, + "epoch": 1.17, + "learning_rate": 3.379804069329314e-07, + "logits/chosen": -2.408510208129883, + "logits/rejected": -2.355539560317993, + "logps/chosen": -269.3663024902344, + "logps/rejected": -276.4700012207031, + "loss": 0.1084, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.1696228981018066, + "rewards/margins": 5.296755313873291, + "rewards/rejected": -4.127132892608643, "step": 2310 }, { - "epoch": 1.13, - "learning_rate": 3.4572927976836767e-07, - "logits/chosen": -2.8972878456115723, - "logits/rejected": -2.910501003265381, - "logps/chosen": -275.22662353515625, - "logps/rejected": -308.2709655761719, - "loss": 0.1331, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.7100965976715088, - "rewards/margins": 5.187808990478516, - "rewards/rejected": -4.477712631225586, + "epoch": 1.18, + "learning_rate": 3.370384325546345e-07, + "logits/chosen": -2.3480758666992188, + "logits/rejected": -2.3206279277801514, + "logps/chosen": -286.1962890625, + "logps/rejected": -328.54376220703125, + "loss": 0.1032, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.8524169921875, + "rewards/margins": 6.0059003829956055, + "rewards/rejected": -5.153482913970947, "step": 2320 }, { - "epoch": 1.14, - "learning_rate": 3.44824466159971e-07, - "logits/chosen": -2.8696389198303223, - "logits/rejected": -2.8878276348114014, - "logps/chosen": -351.02215576171875, - "logps/rejected": -319.7217712402344, - "loss": 0.0737, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 2.3024234771728516, - "rewards/margins": 7.3213701248168945, - "rewards/rejected": -5.018946170806885, + "epoch": 1.19, + "learning_rate": 3.3609645817633756e-07, + "logits/chosen": -2.3404994010925293, + "logits/rejected": -2.3466391563415527, + "logps/chosen": -283.11956787109375, + "logps/rejected": -303.4478454589844, + "loss": 0.0713, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.290867418050766, + "rewards/margins": 6.1293816566467285, + "rewards/rejected": -5.8385138511657715, "step": 2330 }, { - "epoch": 1.14, - "learning_rate": 3.4391965255157434e-07, - "logits/chosen": -2.836214065551758, - "logits/rejected": -2.8280227184295654, - "logps/chosen": -320.8536682128906, - "logps/rejected": -325.0138244628906, - "loss": 0.097, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.840990662574768, - "rewards/margins": 7.01099157333374, - "rewards/rejected": -5.17000150680542, + "epoch": 1.19, + "learning_rate": 3.3515448379804066e-07, + "logits/chosen": -2.335618257522583, + "logits/rejected": -2.350062131881714, + "logps/chosen": -250.2252960205078, + "logps/rejected": -288.460693359375, + "loss": 0.1828, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.27890732884407043, + "rewards/margins": 5.905325889587402, + "rewards/rejected": -5.6264190673828125, "step": 2340 }, { - "epoch": 1.15, - "learning_rate": 3.4301483894317767e-07, - "logits/chosen": -2.814479351043701, - "logits/rejected": -2.8396215438842773, - "logps/chosen": -311.86773681640625, - "logps/rejected": -374.49346923828125, - "loss": 0.1503, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.171228051185608, - "rewards/margins": 6.561440467834473, - "rewards/rejected": -5.390212059020996, + "epoch": 1.2, + "learning_rate": 3.3421250941974376e-07, + "logits/chosen": -2.405531406402588, + "logits/rejected": -2.370527505874634, + "logps/chosen": -326.8871765136719, + "logps/rejected": -346.4276428222656, + "loss": 0.0724, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5522273778915405, + "rewards/margins": 6.337648868560791, + "rewards/rejected": -5.785420894622803, "step": 2350 }, { - "epoch": 1.15, - "learning_rate": 3.42110025334781e-07, - "logits/chosen": -2.7774786949157715, - "logits/rejected": -2.7970921993255615, - "logps/chosen": -335.1246337890625, - "logps/rejected": -321.70135498046875, - "loss": 0.2064, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0966137647628784, - "rewards/margins": 6.218404293060303, - "rewards/rejected": -5.121790885925293, + "epoch": 1.2, + "learning_rate": 3.3327053504144686e-07, + "logits/chosen": -2.3920373916625977, + "logits/rejected": -2.3272738456726074, + "logps/chosen": -280.7931823730469, + "logps/rejected": -281.2308654785156, + "loss": 0.132, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.21835920214653015, + "rewards/margins": 5.349105358123779, + "rewards/rejected": -5.567465305328369, "step": 2360 }, { - "epoch": 1.16, - "learning_rate": 3.4120521172638434e-07, - "logits/chosen": -2.7383646965026855, - "logits/rejected": -2.7241227626800537, - "logps/chosen": -329.1031494140625, - "logps/rejected": -347.4546203613281, - "loss": 0.1037, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.6879613399505615, - "rewards/margins": 7.167945861816406, - "rewards/rejected": -5.479984760284424, + "epoch": 1.21, + "learning_rate": 3.3232856066314996e-07, + "logits/chosen": -2.323270797729492, + "logits/rejected": -2.3886377811431885, + "logps/chosen": -273.2583923339844, + "logps/rejected": -344.60296630859375, + "loss": 0.1121, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7613003253936768, + "rewards/margins": 7.667241096496582, + "rewards/rejected": -6.905940055847168, "step": 2370 }, { - "epoch": 1.16, - "learning_rate": 3.403003981179877e-07, - "logits/chosen": -2.6839699745178223, - "logits/rejected": -2.6567845344543457, - "logps/chosen": -298.90618896484375, - "logps/rejected": -353.8127136230469, - "loss": 0.1721, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.2552112340927124, - "rewards/margins": 7.768686771392822, - "rewards/rejected": -6.5134758949279785, + "epoch": 1.21, + "learning_rate": 3.3138658628485306e-07, + "logits/chosen": -2.459217071533203, + "logits/rejected": -2.5143425464630127, + "logps/chosen": -271.41412353515625, + "logps/rejected": -303.4784851074219, + "loss": 0.092, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6888279914855957, + "rewards/margins": 6.053578853607178, + "rewards/rejected": -5.364750862121582, "step": 2380 }, { - "epoch": 1.17, - "learning_rate": 3.39395584509591e-07, - "logits/chosen": -2.7605209350585938, - "logits/rejected": -2.788830041885376, - "logps/chosen": -302.555908203125, - "logps/rejected": -291.688232421875, - "loss": 0.1217, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.404108464717865, - "rewards/margins": 5.433274745941162, - "rewards/rejected": -5.029166221618652, + "epoch": 1.22, + "learning_rate": 3.3044461190655616e-07, + "logits/chosen": -2.5100953578948975, + "logits/rejected": -2.5121243000030518, + "logps/chosen": -308.7528381347656, + "logps/rejected": -325.7645263671875, + "loss": 0.0796, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9787341952323914, + "rewards/margins": 5.852758884429932, + "rewards/rejected": -4.874024391174316, "step": 2390 }, { - "epoch": 1.17, - "learning_rate": 3.3849077090119435e-07, - "logits/chosen": -2.8283538818359375, - "logits/rejected": -2.8548035621643066, - "logps/chosen": -345.3590393066406, - "logps/rejected": -364.5956726074219, - "loss": 0.129, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.6108449697494507, - "rewards/margins": 7.524580478668213, - "rewards/rejected": -5.913735389709473, + "epoch": 1.22, + "learning_rate": 3.2950263752825926e-07, + "logits/chosen": -2.4334263801574707, + "logits/rejected": -2.417992115020752, + "logps/chosen": -254.85116577148438, + "logps/rejected": -290.29803466796875, + "loss": 0.0877, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.1692960113286972, + "rewards/margins": 5.9460835456848145, + "rewards/rejected": -5.776787757873535, "step": 2400 }, { - "epoch": 1.17, - "eval_logits/chosen": -2.84185528755188, - "eval_logits/rejected": -2.839022397994995, - "eval_logps/chosen": -316.1250915527344, - "eval_logps/rejected": -296.9400939941406, - "eval_loss": 0.5731387734413147, - "eval_rewards/accuracies": 0.7665441036224365, - "eval_rewards/chosen": -0.12640845775604248, - "eval_rewards/margins": 2.6142845153808594, - "eval_rewards/rejected": -2.740692615509033, - "eval_runtime": 304.3451, - "eval_samples_per_second": 7.137, - "eval_steps_per_second": 0.447, + "epoch": 1.22, + "eval_logits/chosen": -2.5290729999542236, + "eval_logits/rejected": -2.5924720764160156, + "eval_logps/chosen": -291.32537841796875, + "eval_logps/rejected": -291.75762939453125, + "eval_loss": 0.5962603688240051, + "eval_rewards/accuracies": 0.7480915784835815, + "eval_rewards/chosen": -1.307490348815918, + "eval_rewards/margins": 2.0547597408294678, + "eval_rewards/rejected": -3.3622498512268066, + "eval_runtime": 296.9185, + "eval_samples_per_second": 7.025, + "eval_steps_per_second": 0.441, "step": 2400 }, { - "epoch": 1.18, - "learning_rate": 3.3758595729279763e-07, - "logits/chosen": -2.781043529510498, - "logits/rejected": -2.7447774410247803, - "logps/chosen": -271.3154602050781, - "logps/rejected": -298.2485656738281, - "loss": 0.1216, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.1118022203445435, - "rewards/margins": 6.4859724044799805, - "rewards/rejected": -5.374170303344727, + "epoch": 1.23, + "learning_rate": 3.2856066314996225e-07, + "logits/chosen": -2.4625680446624756, + "logits/rejected": -2.4047350883483887, + "logps/chosen": -313.5804138183594, + "logps/rejected": -386.77752685546875, + "loss": 0.0949, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6074233055114746, + "rewards/margins": 6.5688276290893555, + "rewards/rejected": -5.961404800415039, "step": 2410 }, { - "epoch": 1.18, - "learning_rate": 3.3668114368440097e-07, - "logits/chosen": -2.889075994491577, - "logits/rejected": -2.880417585372925, - "logps/chosen": -293.88641357421875, - "logps/rejected": -346.7308044433594, - "loss": 0.169, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.203275442123413, - "rewards/margins": 5.923949241638184, - "rewards/rejected": -4.720673561096191, + "epoch": 1.23, + "learning_rate": 3.2761868877166536e-07, + "logits/chosen": -2.4965758323669434, + "logits/rejected": -2.4851412773132324, + "logps/chosen": -283.6171875, + "logps/rejected": -300.8470458984375, + "loss": 0.1552, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.637147843837738, + "rewards/margins": 6.918099403381348, + "rewards/rejected": -6.280951499938965, "step": 2420 }, { - "epoch": 1.19, - "learning_rate": 3.357763300760043e-07, - "logits/chosen": -2.847072124481201, - "logits/rejected": -2.845881223678589, - "logps/chosen": -304.4673156738281, - "logps/rejected": -266.9676818847656, - "loss": 0.0863, + "epoch": 1.24, + "learning_rate": 3.2667671439336846e-07, + "logits/chosen": -2.3009049892425537, + "logits/rejected": -2.3287174701690674, + "logps/chosen": -280.84259033203125, + "logps/rejected": -308.1491394042969, + "loss": 0.1894, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.0863990783691406, - "rewards/margins": 5.875105857849121, - "rewards/rejected": -4.7887067794799805, + "rewards/chosen": 0.1745898723602295, + "rewards/margins": 5.908175468444824, + "rewards/rejected": -5.733586311340332, "step": 2430 }, { - "epoch": 1.19, - "learning_rate": 3.3487151646760764e-07, - "logits/chosen": -2.8331151008605957, - "logits/rejected": -2.8649067878723145, - "logps/chosen": -285.6146240234375, - "logps/rejected": -302.97662353515625, - "loss": 0.1076, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.8372305631637573, - "rewards/margins": 6.089526653289795, - "rewards/rejected": -5.2522969245910645, + "epoch": 1.24, + "learning_rate": 3.2573474001507156e-07, + "logits/chosen": -2.3898582458496094, + "logits/rejected": -2.408456325531006, + "logps/chosen": -274.39056396484375, + "logps/rejected": -301.08990478515625, + "loss": 0.061, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1737585067749023, + "rewards/margins": 6.217486381530762, + "rewards/rejected": -5.043726921081543, "step": 2440 }, { - "epoch": 1.2, - "learning_rate": 3.3396670285921097e-07, - "logits/chosen": -2.9046714305877686, - "logits/rejected": -2.9108693599700928, - "logps/chosen": -324.17291259765625, - "logps/rejected": -325.807861328125, - "loss": 0.1083, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.5144315958023071, - "rewards/margins": 7.1735382080078125, - "rewards/rejected": -5.659106254577637, + "epoch": 1.25, + "learning_rate": 3.2479276563677466e-07, + "logits/chosen": -2.416377544403076, + "logits/rejected": -2.39912748336792, + "logps/chosen": -294.30181884765625, + "logps/rejected": -346.01507568359375, + "loss": 0.0731, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.171212911605835, + "rewards/margins": 6.255845069885254, + "rewards/rejected": -5.084632873535156, "step": 2450 }, { - "epoch": 1.2, - "learning_rate": 3.330618892508143e-07, - "logits/chosen": -2.8689370155334473, - "logits/rejected": -2.9102261066436768, - "logps/chosen": -326.22052001953125, - "logps/rejected": -302.9759216308594, - "loss": 0.0726, + "epoch": 1.25, + "learning_rate": 3.2385079125847776e-07, + "logits/chosen": -2.516638994216919, + "logits/rejected": -2.461843729019165, + "logps/chosen": -273.90057373046875, + "logps/rejected": -311.27276611328125, + "loss": 0.0697, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.1967500448226929, - "rewards/margins": 6.433012962341309, - "rewards/rejected": -5.236263751983643, + "rewards/chosen": 0.6051326394081116, + "rewards/margins": 6.512951850891113, + "rewards/rejected": -5.907818794250488, "step": 2460 }, { - "epoch": 1.21, - "learning_rate": 3.3215707564241764e-07, - "logits/chosen": -2.9117558002471924, - "logits/rejected": -2.912367343902588, - "logps/chosen": -318.78997802734375, - "logps/rejected": -310.45819091796875, - "loss": 0.0751, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.6119754314422607, - "rewards/margins": 5.645215034484863, - "rewards/rejected": -5.033239364624023, + "epoch": 1.26, + "learning_rate": 3.2290881688018086e-07, + "logits/chosen": -2.438994884490967, + "logits/rejected": -2.5107030868530273, + "logps/chosen": -266.1100158691406, + "logps/rejected": -337.4862365722656, + "loss": 0.0749, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5303374528884888, + "rewards/margins": 6.261448860168457, + "rewards/rejected": -5.731112003326416, "step": 2470 }, { - "epoch": 1.21, - "learning_rate": 3.31252262034021e-07, - "logits/chosen": -2.938986301422119, - "logits/rejected": -2.923339366912842, - "logps/chosen": -268.93524169921875, - "logps/rejected": -297.3427429199219, - "loss": 0.115, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.31539708375930786, - "rewards/margins": 5.869711875915527, - "rewards/rejected": -5.554314613342285, + "epoch": 1.26, + "learning_rate": 3.2196684250188396e-07, + "logits/chosen": -2.5541601181030273, + "logits/rejected": -2.3836662769317627, + "logps/chosen": -336.1649475097656, + "logps/rejected": -354.42413330078125, + "loss": 0.0768, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.7871938347816467, + "rewards/margins": 7.078111171722412, + "rewards/rejected": -6.290916442871094, "step": 2480 }, { - "epoch": 1.22, - "learning_rate": 3.303474484256243e-07, - "logits/chosen": -2.895840883255005, - "logits/rejected": -2.9173085689544678, - "logps/chosen": -326.73382568359375, - "logps/rejected": -321.5074462890625, - "loss": 0.1034, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.987014651298523, - "rewards/margins": 5.568175792694092, - "rewards/rejected": -4.581160545349121, + "epoch": 1.27, + "learning_rate": 3.2102486812358706e-07, + "logits/chosen": -2.3908188343048096, + "logits/rejected": -2.292724609375, + "logps/chosen": -282.8141784667969, + "logps/rejected": -298.9007263183594, + "loss": 0.0983, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6560268402099609, + "rewards/margins": 5.793659210205078, + "rewards/rejected": -5.137632846832275, "step": 2490 }, { - "epoch": 1.22, - "learning_rate": 3.294426348172276e-07, - "logits/chosen": -2.895078659057617, - "logits/rejected": -2.8941431045532227, - "logps/chosen": -303.32684326171875, - "logps/rejected": -343.2083435058594, - "loss": 0.1052, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.5622583627700806, - "rewards/margins": 7.011788368225098, - "rewards/rejected": -5.44952917098999, + "epoch": 1.27, + "learning_rate": 3.200828937452901e-07, + "logits/chosen": -2.3237905502319336, + "logits/rejected": -2.2244579792022705, + "logps/chosen": -226.01779174804688, + "logps/rejected": -285.91009521484375, + "loss": 0.1114, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.20410139858722687, + "rewards/margins": 5.630406856536865, + "rewards/rejected": -5.426304817199707, "step": 2500 }, { - "epoch": 1.22, - "eval_logits/chosen": -2.8835036754608154, - "eval_logits/rejected": -2.8815555572509766, - "eval_logps/chosen": -315.8643493652344, - "eval_logps/rejected": -297.4045715332031, - "eval_loss": 0.5871793627738953, - "eval_rewards/accuracies": 0.7683823704719543, - "eval_rewards/chosen": -0.10033486038446426, - "eval_rewards/margins": 2.6868107318878174, - "eval_rewards/rejected": -2.7871456146240234, - "eval_runtime": 304.0122, - "eval_samples_per_second": 7.144, - "eval_steps_per_second": 0.447, + "epoch": 1.27, + "eval_logits/chosen": -2.4141666889190674, + "eval_logits/rejected": -2.479241132736206, + "eval_logps/chosen": -291.859375, + "eval_logps/rejected": -293.6587219238281, + "eval_loss": 0.6126044392585754, + "eval_rewards/accuracies": 0.7461832165718079, + "eval_rewards/chosen": -1.360889196395874, + "eval_rewards/margins": 2.191472291946411, + "eval_rewards/rejected": -3.552361249923706, + "eval_runtime": 302.1294, + "eval_samples_per_second": 6.904, + "eval_steps_per_second": 0.434, "step": 2500 }, { - "epoch": 1.23, - "learning_rate": 3.2853782120883093e-07, - "logits/chosen": -2.901777982711792, - "logits/rejected": -2.888019561767578, - "logps/chosen": -297.0653076171875, - "logps/rejected": -321.1457214355469, - "loss": 0.0763, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.261099338531494, - "rewards/margins": 8.412405014038086, - "rewards/rejected": -6.151305198669434, + "epoch": 1.28, + "learning_rate": 3.191409193669932e-07, + "logits/chosen": -2.2465217113494873, + "logits/rejected": -2.2498700618743896, + "logps/chosen": -249.7085418701172, + "logps/rejected": -300.81463623046875, + "loss": 0.0749, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.22908727824687958, + "rewards/margins": 5.943365097045898, + "rewards/rejected": -5.714277744293213, "step": 2510 }, { - "epoch": 1.23, - "learning_rate": 3.2763300760043427e-07, - "logits/chosen": -2.8046631813049316, - "logits/rejected": -2.7767045497894287, - "logps/chosen": -380.04461669921875, - "logps/rejected": -387.30474853515625, - "loss": 0.0969, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.5187785625457764, - "rewards/margins": 7.271962642669678, - "rewards/rejected": -5.7531843185424805, + "epoch": 1.28, + "learning_rate": 3.181989449886963e-07, + "logits/chosen": -2.2804088592529297, + "logits/rejected": -2.2391762733459473, + "logps/chosen": -248.0771942138672, + "logps/rejected": -318.37176513671875, + "loss": 0.0681, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.41089898347854614, + "rewards/margins": 6.202281475067139, + "rewards/rejected": -5.791383266448975, "step": 2520 }, { - "epoch": 1.24, - "learning_rate": 3.267281939920376e-07, - "logits/chosen": -2.90790057182312, - "logits/rejected": -2.8698112964630127, - "logps/chosen": -328.44903564453125, - "logps/rejected": -308.59771728515625, - "loss": 0.0664, + "epoch": 1.29, + "learning_rate": 3.172569706103994e-07, + "logits/chosen": -2.38960599899292, + "logits/rejected": -2.3537020683288574, + "logps/chosen": -231.6435089111328, + "logps/rejected": -331.68170166015625, + "loss": 0.123, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.6165921688079834, - "rewards/margins": 7.895709991455078, - "rewards/rejected": -6.279118537902832, + "rewards/chosen": 0.6632026433944702, + "rewards/margins": 7.348013877868652, + "rewards/rejected": -6.684811592102051, "step": 2530 }, { - "epoch": 1.24, - "learning_rate": 3.2582338038364094e-07, - "logits/chosen": -2.912496566772461, - "logits/rejected": -2.889523983001709, - "logps/chosen": -315.60540771484375, - "logps/rejected": -268.34100341796875, - "loss": 0.0804, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.45944374799728394, - "rewards/margins": 5.7416839599609375, - "rewards/rejected": -5.28223991394043, - "step": 2540 + "epoch": 1.29, + "learning_rate": 3.1631499623210246e-07, + "logits/chosen": -2.399754047393799, + "logits/rejected": -2.367912769317627, + "logps/chosen": -252.1080780029297, + "logps/rejected": -317.89190673828125, + "loss": 0.0879, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6604760885238647, + "rewards/margins": 6.377139568328857, + "rewards/rejected": -5.716663360595703, + "step": 2540 }, { - "epoch": 1.25, - "learning_rate": 3.2491856677524427e-07, - "logits/chosen": -2.9169907569885254, - "logits/rejected": -2.9441370964050293, - "logps/chosen": -289.6059265136719, - "logps/rejected": -311.40185546875, - "loss": 0.0669, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.302586317062378, - "rewards/margins": 6.7128496170043945, - "rewards/rejected": -5.410263538360596, + "epoch": 1.3, + "learning_rate": 3.1537302185380556e-07, + "logits/chosen": -2.3488802909851074, + "logits/rejected": -2.3999979496002197, + "logps/chosen": -242.5727996826172, + "logps/rejected": -307.24493408203125, + "loss": 0.1015, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.9912754893302917, + "rewards/margins": 6.489681243896484, + "rewards/rejected": -5.498406410217285, "step": 2550 }, { - "epoch": 1.25, - "learning_rate": 3.240137531668476e-07, - "logits/chosen": -2.9144740104675293, - "logits/rejected": -2.921952724456787, - "logps/chosen": -373.2574157714844, - "logps/rejected": -315.42144775390625, - "loss": 0.083, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.8909327983856201, - "rewards/margins": 7.606240272521973, - "rewards/rejected": -5.715307712554932, + "epoch": 1.3, + "learning_rate": 3.1443104747550866e-07, + "logits/chosen": -2.273897171020508, + "logits/rejected": -2.38022780418396, + "logps/chosen": -313.0953063964844, + "logps/rejected": -299.621826171875, + "loss": 0.1084, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5919345021247864, + "rewards/margins": 5.737548828125, + "rewards/rejected": -5.145614147186279, "step": 2560 }, { - "epoch": 1.26, - "learning_rate": 3.2310893955845094e-07, - "logits/chosen": -2.8963513374328613, - "logits/rejected": -2.8949122428894043, - "logps/chosen": -335.890380859375, - "logps/rejected": -369.06427001953125, - "loss": 0.1568, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.2558376789093018, - "rewards/margins": 7.26554012298584, - "rewards/rejected": -6.009702682495117, + "epoch": 1.31, + "learning_rate": 3.1348907309721176e-07, + "logits/chosen": -2.4264285564422607, + "logits/rejected": -2.3853797912597656, + "logps/chosen": -290.6047058105469, + "logps/rejected": -311.0415954589844, + "loss": 0.0666, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7939327955245972, + "rewards/margins": 6.2428998947143555, + "rewards/rejected": -5.448967456817627, "step": 2570 }, { - "epoch": 1.26, - "learning_rate": 3.222041259500543e-07, - "logits/chosen": -2.9233856201171875, - "logits/rejected": -2.911083698272705, - "logps/chosen": -332.73358154296875, - "logps/rejected": -306.0821533203125, - "loss": 0.1637, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0451687574386597, - "rewards/margins": 6.040487289428711, - "rewards/rejected": -4.995318412780762, + "epoch": 1.31, + "learning_rate": 3.125470987189148e-07, + "logits/chosen": -2.335716962814331, + "logits/rejected": -2.367015838623047, + "logps/chosen": -275.623779296875, + "logps/rejected": -326.81488037109375, + "loss": 0.0736, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8744140863418579, + "rewards/margins": 6.54227352142334, + "rewards/rejected": -5.6678595542907715, "step": 2580 }, { - "epoch": 1.27, - "learning_rate": 3.2129931234165756e-07, - "logits/chosen": -2.9295690059661865, - "logits/rejected": -2.9309005737304688, - "logps/chosen": -329.8963317871094, - "logps/rejected": -295.49383544921875, - "loss": 0.065, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9920492172241211, - "rewards/margins": 6.354214668273926, - "rewards/rejected": -5.362165927886963, + "epoch": 1.32, + "learning_rate": 3.116051243406179e-07, + "logits/chosen": -2.27899169921875, + "logits/rejected": -2.2389168739318848, + "logps/chosen": -277.07440185546875, + "logps/rejected": -301.26190185546875, + "loss": 0.1244, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.7585972547531128, + "rewards/margins": 5.7357892990112305, + "rewards/rejected": -4.977191925048828, "step": 2590 }, { - "epoch": 1.27, - "learning_rate": 3.203944987332609e-07, - "logits/chosen": -2.8989548683166504, - "logits/rejected": -2.8977577686309814, - "logps/chosen": -328.84271240234375, - "logps/rejected": -351.22613525390625, - "loss": 0.0895, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.9989855289459229, - "rewards/margins": 7.407504081726074, - "rewards/rejected": -5.408519268035889, + "epoch": 1.32, + "learning_rate": 3.10663149962321e-07, + "logits/chosen": -2.448560953140259, + "logits/rejected": -2.4991815090179443, + "logps/chosen": -326.71539306640625, + "logps/rejected": -349.9276428222656, + "loss": 0.0864, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5657342672348022, + "rewards/margins": 6.299088478088379, + "rewards/rejected": -5.733354091644287, "step": 2600 }, { - "epoch": 1.27, - "eval_logits/chosen": -2.850917339324951, - "eval_logits/rejected": -2.8467910289764404, - "eval_logps/chosen": -315.93634033203125, - "eval_logps/rejected": -298.3055725097656, - "eval_loss": 0.6245527267456055, - "eval_rewards/accuracies": 0.7610294222831726, - "eval_rewards/chosen": -0.10753544420003891, - "eval_rewards/margins": 2.769707679748535, - "eval_rewards/rejected": -2.8772430419921875, - "eval_runtime": 304.202, - "eval_samples_per_second": 7.14, - "eval_steps_per_second": 0.447, + "epoch": 1.32, + "eval_logits/chosen": -2.50575590133667, + "eval_logits/rejected": -2.570965051651001, + "eval_logps/chosen": -294.3439636230469, + "eval_logps/rejected": -295.719482421875, + "eval_loss": 0.6456525921821594, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -1.609349012374878, + "eval_rewards/margins": 2.149090051651001, + "eval_rewards/rejected": -3.758439302444458, + "eval_runtime": 296.9418, + "eval_samples_per_second": 7.025, + "eval_steps_per_second": 0.441, "step": 2600 }, { - "epoch": 1.28, - "learning_rate": 3.1948968512486423e-07, - "logits/chosen": -2.8952720165252686, - "logits/rejected": -2.875281572341919, - "logps/chosen": -275.56573486328125, - "logps/rejected": -310.2125244140625, - "loss": 0.1375, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.353104829788208, - "rewards/margins": 6.848490238189697, - "rewards/rejected": -5.495386123657227, + "epoch": 1.33, + "learning_rate": 3.097211755840241e-07, + "logits/chosen": -2.336880683898926, + "logits/rejected": -2.3799967765808105, + "logps/chosen": -235.99459838867188, + "logps/rejected": -279.66180419921875, + "loss": 0.1127, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5167864561080933, + "rewards/margins": 5.976006507873535, + "rewards/rejected": -5.459219932556152, "step": 2610 }, { - "epoch": 1.28, - "learning_rate": 3.1858487151646757e-07, - "logits/chosen": -2.827455759048462, - "logits/rejected": -2.8452229499816895, - "logps/chosen": -309.0766906738281, - "logps/rejected": -300.1092224121094, - "loss": 0.2083, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.026174783706665, - "rewards/margins": 7.955998420715332, - "rewards/rejected": -6.929821968078613, + "epoch": 1.33, + "learning_rate": 3.087792012057272e-07, + "logits/chosen": -2.328923463821411, + "logits/rejected": -2.3436508178710938, + "logps/chosen": -272.0519714355469, + "logps/rejected": -316.40594482421875, + "loss": 0.097, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3497155904769897, + "rewards/margins": 7.2396721839904785, + "rewards/rejected": -5.889956474304199, "step": 2620 }, { - "epoch": 1.28, - "learning_rate": 3.176800579080709e-07, - "logits/chosen": -2.8380398750305176, - "logits/rejected": -2.8762106895446777, - "logps/chosen": -300.206787109375, - "logps/rejected": -313.2204284667969, - "loss": 0.149, + "epoch": 1.34, + "learning_rate": 3.078372268274303e-07, + "logits/chosen": -2.39420747756958, + "logits/rejected": -2.4377219676971436, + "logps/chosen": -301.8684997558594, + "logps/rejected": -345.6015625, + "loss": 0.0943, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.5612722635269165, - "rewards/margins": 5.91481351852417, - "rewards/rejected": -5.353541374206543, + "rewards/chosen": 0.38849225640296936, + "rewards/margins": 6.270806789398193, + "rewards/rejected": -5.882315158843994, "step": 2630 }, { - "epoch": 1.29, - "learning_rate": 3.1677524429967424e-07, - "logits/chosen": -2.807554244995117, - "logits/rejected": -2.8306655883789062, - "logps/chosen": -259.348876953125, - "logps/rejected": -291.9242248535156, - "loss": 0.1226, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9537801742553711, - "rewards/margins": 6.7636613845825195, - "rewards/rejected": -5.809881210327148, + "epoch": 1.34, + "learning_rate": 3.068952524491334e-07, + "logits/chosen": -2.357003927230835, + "logits/rejected": -2.4049508571624756, + "logps/chosen": -240.2396697998047, + "logps/rejected": -322.50738525390625, + "loss": 0.0832, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9055463671684265, + "rewards/margins": 6.93111515045166, + "rewards/rejected": -6.025568962097168, "step": 2640 }, { - "epoch": 1.29, - "learning_rate": 3.1587043069127757e-07, - "logits/chosen": -2.8576576709747314, - "logits/rejected": -2.859753131866455, - "logps/chosen": -283.2281494140625, - "logps/rejected": -319.9939880371094, - "loss": 0.1141, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.9901631474494934, - "rewards/margins": 6.837104797363281, - "rewards/rejected": -5.8469414710998535, + "epoch": 1.35, + "learning_rate": 3.059532780708365e-07, + "logits/chosen": -2.3569881916046143, + "logits/rejected": -2.4609577655792236, + "logps/chosen": -298.69207763671875, + "logps/rejected": -278.62408447265625, + "loss": 0.071, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.22584030032157898, + "rewards/margins": 5.36661434173584, + "rewards/rejected": -5.140774250030518, "step": 2650 }, { - "epoch": 1.3, - "learning_rate": 3.149656170828809e-07, - "logits/chosen": -2.887336254119873, - "logits/rejected": -2.893540859222412, - "logps/chosen": -274.8492736816406, - "logps/rejected": -297.57122802734375, - "loss": 0.0491, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.1435272693634033, - "rewards/margins": 6.856845855712891, - "rewards/rejected": -5.713318347930908, + "epoch": 1.35, + "learning_rate": 3.050113036925395e-07, + "logits/chosen": -2.3827061653137207, + "logits/rejected": -2.4103548526763916, + "logps/chosen": -255.54251098632812, + "logps/rejected": -301.31158447265625, + "loss": 0.0674, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.49262189865112305, + "rewards/margins": 6.265991687774658, + "rewards/rejected": -5.773369789123535, "step": 2660 }, { - "epoch": 1.3, - "learning_rate": 3.140608034744842e-07, - "logits/chosen": -2.8348751068115234, - "logits/rejected": -2.824328899383545, - "logps/chosen": -290.5682067871094, - "logps/rejected": -304.4690246582031, - "loss": 0.0738, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9573003649711609, - "rewards/margins": 6.705183506011963, - "rewards/rejected": -5.747883319854736, + "epoch": 1.36, + "learning_rate": 3.040693293142426e-07, + "logits/chosen": -2.323338031768799, + "logits/rejected": -2.4888432025909424, + "logps/chosen": -287.3337707519531, + "logps/rejected": -295.5697326660156, + "loss": 0.1157, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.3787381052970886, + "rewards/margins": 6.640289306640625, + "rewards/rejected": -6.261551856994629, "step": 2670 }, { - "epoch": 1.31, - "learning_rate": 3.131559898660875e-07, - "logits/chosen": -2.8353915214538574, - "logits/rejected": -2.8606410026550293, - "logps/chosen": -314.75628662109375, - "logps/rejected": -356.0235900878906, - "loss": 0.1309, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.32307273149490356, - "rewards/margins": 6.097552299499512, - "rewards/rejected": -5.774479866027832, + "epoch": 1.36, + "learning_rate": 3.031273549359457e-07, + "logits/chosen": -2.3468449115753174, + "logits/rejected": -2.3720457553863525, + "logps/chosen": -280.97100830078125, + "logps/rejected": -349.592529296875, + "loss": 0.0749, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01323468703776598, + "rewards/margins": 6.698792934417725, + "rewards/rejected": -6.685558319091797, "step": 2680 }, { - "epoch": 1.31, - "learning_rate": 3.122511762576909e-07, - "logits/chosen": -2.7965564727783203, - "logits/rejected": -2.812563896179199, - "logps/chosen": -297.37286376953125, - "logps/rejected": -270.93585205078125, - "loss": 0.059, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.20585763454437256, - "rewards/margins": 6.628664493560791, - "rewards/rejected": -6.422807216644287, + "epoch": 1.37, + "learning_rate": 3.021853805576488e-07, + "logits/chosen": -2.4187724590301514, + "logits/rejected": -2.3718185424804688, + "logps/chosen": -288.25787353515625, + "logps/rejected": -358.4739990234375, + "loss": 0.0975, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.8575443029403687, + "rewards/margins": 7.281981468200684, + "rewards/rejected": -6.424437046051025, "step": 2690 }, { - "epoch": 1.32, - "learning_rate": 3.1134636264929425e-07, - "logits/chosen": -2.8390262126922607, - "logits/rejected": -2.866852283477783, - "logps/chosen": -310.4302062988281, - "logps/rejected": -347.6809387207031, - "loss": 0.0801, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.3635275661945343, - "rewards/margins": 6.365360736846924, - "rewards/rejected": -6.001832485198975, + "epoch": 1.37, + "learning_rate": 3.012434061793519e-07, + "logits/chosen": -2.3994593620300293, + "logits/rejected": -2.3024771213531494, + "logps/chosen": -243.65792846679688, + "logps/rejected": -296.03070068359375, + "loss": 0.0708, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.17152036726474762, + "rewards/margins": 5.766635894775391, + "rewards/rejected": -5.595116138458252, "step": 2700 }, { - "epoch": 1.32, - "eval_logits/chosen": -2.7980470657348633, - "eval_logits/rejected": -2.799748659133911, - "eval_logps/chosen": -323.3645935058594, - "eval_logps/rejected": -307.0060729980469, - "eval_loss": 0.5784503817558289, - "eval_rewards/accuracies": 0.7720588445663452, - "eval_rewards/chosen": -0.8503587245941162, - "eval_rewards/margins": 2.8969340324401855, - "eval_rewards/rejected": -3.7472925186157227, - "eval_runtime": 303.5968, - "eval_samples_per_second": 7.154, - "eval_steps_per_second": 0.448, + "epoch": 1.37, + "eval_logits/chosen": -2.468432664871216, + "eval_logits/rejected": -2.539393186569214, + "eval_logps/chosen": -296.3444519042969, + "eval_logps/rejected": -295.1769104003906, + "eval_loss": 0.6079808473587036, + "eval_rewards/accuracies": 0.7461832165718079, + "eval_rewards/chosen": -1.8093987703323364, + "eval_rewards/margins": 1.894778847694397, + "eval_rewards/rejected": -3.7041778564453125, + "eval_runtime": 301.6905, + "eval_samples_per_second": 6.914, + "eval_steps_per_second": 0.434, "step": 2700 }, { - "epoch": 1.32, - "learning_rate": 3.104415490408976e-07, - "logits/chosen": -2.8265998363494873, - "logits/rejected": -2.8251121044158936, - "logps/chosen": -348.6389465332031, - "logps/rejected": -333.89886474609375, - "loss": 0.1002, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8468540906906128, - "rewards/margins": 7.429998874664307, - "rewards/rejected": -6.5831451416015625, + "epoch": 1.38, + "learning_rate": 3.00301431801055e-07, + "logits/chosen": -2.330199718475342, + "logits/rejected": -2.2474539279937744, + "logps/chosen": -263.4881286621094, + "logps/rejected": -278.0344543457031, + "loss": 0.0757, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5858259797096252, + "rewards/margins": 6.23793363571167, + "rewards/rejected": -5.652107238769531, "step": 2710 }, { - "epoch": 1.33, - "learning_rate": 3.095367354325009e-07, - "logits/chosen": -2.865467071533203, - "logits/rejected": -2.8583080768585205, - "logps/chosen": -263.1181335449219, - "logps/rejected": -237.5290069580078, - "loss": 0.0853, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.03143047168850899, - "rewards/margins": 5.5778326988220215, - "rewards/rejected": -5.546402931213379, + "epoch": 1.38, + "learning_rate": 2.993594574227581e-07, + "logits/chosen": -2.25746488571167, + "logits/rejected": -2.329878568649292, + "logps/chosen": -248.5938262939453, + "logps/rejected": -295.58740234375, + "loss": 0.0946, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2743186950683594, + "rewards/margins": 5.52974271774292, + "rewards/rejected": -5.255423545837402, "step": 2720 }, { - "epoch": 1.33, - "learning_rate": 3.0863192182410425e-07, - "logits/chosen": -2.8304696083068848, - "logits/rejected": -2.8667330741882324, - "logps/chosen": -318.785888671875, - "logps/rejected": -324.71942138671875, - "loss": 0.0613, + "epoch": 1.39, + "learning_rate": 2.984174830444612e-07, + "logits/chosen": -2.3174257278442383, + "logits/rejected": -2.2660281658172607, + "logps/chosen": -276.1614074707031, + "logps/rejected": -335.32891845703125, + "loss": 0.0772, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8431186676025391, - "rewards/margins": 7.226942539215088, - "rewards/rejected": -6.383824348449707, + "rewards/chosen": -0.044588230550289154, + "rewards/margins": 6.649122714996338, + "rewards/rejected": -6.6937103271484375, "step": 2730 }, { - "epoch": 1.34, - "learning_rate": 3.077271082157076e-07, - "logits/chosen": -2.80513596534729, - "logits/rejected": -2.8130974769592285, - "logps/chosen": -329.828369140625, - "logps/rejected": -322.0484924316406, - "loss": 0.0948, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7383403778076172, - "rewards/margins": 7.414438724517822, - "rewards/rejected": -6.676097869873047, + "epoch": 1.39, + "learning_rate": 2.9747550866616425e-07, + "logits/chosen": -2.442142963409424, + "logits/rejected": -2.360114574432373, + "logps/chosen": -272.2823486328125, + "logps/rejected": -308.30877685546875, + "loss": 0.1091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7997811436653137, + "rewards/margins": 6.003107070922852, + "rewards/rejected": -5.2033257484436035, "step": 2740 }, { - "epoch": 1.34, - "learning_rate": 3.068222946073109e-07, - "logits/chosen": -2.8749823570251465, - "logits/rejected": -2.8934693336486816, - "logps/chosen": -309.43292236328125, - "logps/rejected": -345.50714111328125, - "loss": 0.1105, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.6662381887435913, - "rewards/margins": 6.77623987197876, - "rewards/rejected": -6.110001564025879, + "epoch": 1.4, + "learning_rate": 2.9653353428786735e-07, + "logits/chosen": -2.4135284423828125, + "logits/rejected": -2.41511869430542, + "logps/chosen": -234.8014373779297, + "logps/rejected": -294.651123046875, + "loss": 0.158, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.1786421835422516, + "rewards/margins": 5.884564399719238, + "rewards/rejected": -5.705922603607178, "step": 2750 }, { - "epoch": 1.35, - "learning_rate": 3.0591748099891426e-07, - "logits/chosen": -2.816394805908203, - "logits/rejected": -2.8290398120880127, - "logps/chosen": -303.6565246582031, - "logps/rejected": -333.07354736328125, - "loss": 0.1906, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.818158745765686, - "rewards/margins": 6.884306907653809, - "rewards/rejected": -6.066147804260254, + "epoch": 1.4, + "learning_rate": 2.9559155990957045e-07, + "logits/chosen": -2.3652257919311523, + "logits/rejected": -2.398829936981201, + "logps/chosen": -292.03717041015625, + "logps/rejected": -300.37603759765625, + "loss": 0.1072, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.36263585090637207, + "rewards/margins": 6.317052841186523, + "rewards/rejected": -5.954416751861572, "step": 2760 }, { - "epoch": 1.35, - "learning_rate": 3.0501266739051754e-07, - "logits/chosen": -2.832143783569336, - "logits/rejected": -2.85274076461792, - "logps/chosen": -363.9978942871094, - "logps/rejected": -303.37469482421875, - "loss": 0.0899, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.084667682647705, - "rewards/margins": 6.729748725891113, - "rewards/rejected": -5.645081043243408, + "epoch": 1.41, + "learning_rate": 2.9464958553127355e-07, + "logits/chosen": -2.26711106300354, + "logits/rejected": -2.275381565093994, + "logps/chosen": -294.2849426269531, + "logps/rejected": -328.8524169921875, + "loss": 0.0606, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6599899530410767, + "rewards/margins": 6.481578826904297, + "rewards/rejected": -5.82158899307251, "step": 2770 }, { - "epoch": 1.36, - "learning_rate": 3.041078537821209e-07, - "logits/chosen": -2.8558402061462402, - "logits/rejected": -2.8523974418640137, - "logps/chosen": -347.8671875, - "logps/rejected": -364.4112548828125, - "loss": 0.0805, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.4566287994384766, - "rewards/margins": 7.299247741699219, - "rewards/rejected": -5.842617988586426, + "epoch": 1.41, + "learning_rate": 2.9370761115297666e-07, + "logits/chosen": -2.4011237621307373, + "logits/rejected": -2.46509051322937, + "logps/chosen": -288.33978271484375, + "logps/rejected": -335.15277099609375, + "loss": 0.0992, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3577355742454529, + "rewards/margins": 6.564070224761963, + "rewards/rejected": -6.206334590911865, "step": 2780 }, { - "epoch": 1.36, - "learning_rate": 3.032030401737242e-07, - "logits/chosen": -2.779650926589966, - "logits/rejected": -2.788412570953369, - "logps/chosen": -272.4227294921875, - "logps/rejected": -333.17181396484375, - "loss": 0.0755, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.098186731338501, - "rewards/margins": 6.324115753173828, - "rewards/rejected": -5.225928783416748, + "epoch": 1.42, + "learning_rate": 2.927656367746797e-07, + "logits/chosen": -2.4565563201904297, + "logits/rejected": -2.4363064765930176, + "logps/chosen": -284.31683349609375, + "logps/rejected": -353.6046447753906, + "loss": 0.0643, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1482659578323364, + "rewards/margins": 7.725053310394287, + "rewards/rejected": -6.57678747177124, "step": 2790 }, { - "epoch": 1.37, - "learning_rate": 3.0229822656532755e-07, - "logits/chosen": -2.7770309448242188, - "logits/rejected": -2.7935116291046143, - "logps/chosen": -311.76129150390625, - "logps/rejected": -362.6114196777344, - "loss": 0.1012, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.1266541481018066, - "rewards/margins": 6.636162757873535, - "rewards/rejected": -5.5095086097717285, + "epoch": 1.42, + "learning_rate": 2.918236623963828e-07, + "logits/chosen": -2.337040424346924, + "logits/rejected": -2.4189975261688232, + "logps/chosen": -324.59759521484375, + "logps/rejected": -350.65301513671875, + "loss": 0.0794, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5803574323654175, + "rewards/margins": 6.807870388031006, + "rewards/rejected": -6.227513313293457, "step": 2800 }, { - "epoch": 1.37, - "eval_logits/chosen": -2.7899482250213623, - "eval_logits/rejected": -2.794990301132202, - "eval_logps/chosen": -319.5309143066406, - "eval_logps/rejected": -303.9475402832031, - "eval_loss": 0.5710012316703796, - "eval_rewards/accuracies": 0.7849264740943909, - "eval_rewards/chosen": -0.46699053049087524, - "eval_rewards/margins": 2.97445011138916, - "eval_rewards/rejected": -3.4414405822753906, - "eval_runtime": 303.4793, - "eval_samples_per_second": 7.157, - "eval_steps_per_second": 0.448, + "epoch": 1.42, + "eval_logits/chosen": -2.4662692546844482, + "eval_logits/rejected": -2.5368869304656982, + "eval_logps/chosen": -295.93536376953125, + "eval_logps/rejected": -296.73797607421875, + "eval_loss": 0.6009625792503357, + "eval_rewards/accuracies": 0.7538167834281921, + "eval_rewards/chosen": -1.7684876918792725, + "eval_rewards/margins": 2.0917961597442627, + "eval_rewards/rejected": -3.8602840900421143, + "eval_runtime": 296.7378, + "eval_samples_per_second": 7.03, + "eval_steps_per_second": 0.441, "step": 2800 }, { - "epoch": 1.37, - "learning_rate": 3.013934129569309e-07, - "logits/chosen": -2.774418592453003, - "logits/rejected": -2.742736339569092, - "logps/chosen": -293.4513244628906, - "logps/rejected": -298.5943298339844, - "loss": 0.0904, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.8109585046768188, - "rewards/margins": 6.035546779632568, - "rewards/rejected": -5.224588394165039, + "epoch": 1.43, + "learning_rate": 2.908816880180859e-07, + "logits/chosen": -2.37556791305542, + "logits/rejected": -2.3833582401275635, + "logps/chosen": -273.7535400390625, + "logps/rejected": -272.9826354980469, + "loss": 0.07, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.3809967041015625, + "rewards/margins": 5.994932651519775, + "rewards/rejected": -5.613936424255371, "step": 2810 }, { - "epoch": 1.38, - "learning_rate": 3.004885993485342e-07, - "logits/chosen": -2.705143690109253, - "logits/rejected": -2.748145580291748, - "logps/chosen": -310.96337890625, - "logps/rejected": -301.6844177246094, - "loss": 0.0764, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0652358531951904, - "rewards/margins": 6.334219932556152, - "rewards/rejected": -5.268984794616699, + "epoch": 1.43, + "learning_rate": 2.8993971363978895e-07, + "logits/chosen": -2.3508405685424805, + "logits/rejected": -2.31650710105896, + "logps/chosen": -276.4288635253906, + "logps/rejected": -333.42022705078125, + "loss": 0.1003, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6772788763046265, + "rewards/margins": 6.1997246742248535, + "rewards/rejected": -5.522445201873779, "step": 2820 }, { - "epoch": 1.38, - "learning_rate": 2.9958378574013755e-07, - "logits/chosen": -2.842708110809326, - "logits/rejected": -2.854308843612671, - "logps/chosen": -296.58734130859375, - "logps/rejected": -330.907470703125, - "loss": 0.1693, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.812210738658905, - "rewards/margins": 6.4347076416015625, - "rewards/rejected": -5.622496128082275, + "epoch": 1.44, + "learning_rate": 2.8899773926149205e-07, + "logits/chosen": -2.4621999263763428, + "logits/rejected": -2.437527656555176, + "logps/chosen": -288.25042724609375, + "logps/rejected": -349.9354553222656, + "loss": 0.0507, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2092394828796387, + "rewards/margins": 7.094609260559082, + "rewards/rejected": -5.885369300842285, "step": 2830 }, { - "epoch": 1.39, - "learning_rate": 2.986789721317409e-07, - "logits/chosen": -2.867182970046997, - "logits/rejected": -2.8954789638519287, - "logps/chosen": -294.0032958984375, - "logps/rejected": -283.11962890625, - "loss": 0.0951, + "epoch": 1.44, + "learning_rate": 2.8805576488319515e-07, + "logits/chosen": -2.376804828643799, + "logits/rejected": -2.3722846508026123, + "logps/chosen": -294.90625, + "logps/rejected": -318.5020446777344, + "loss": 0.0903, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.1737993955612183, - "rewards/margins": 6.133855819702148, - "rewards/rejected": -4.960055828094482, + "rewards/chosen": 0.9208866953849792, + "rewards/margins": 5.965109825134277, + "rewards/rejected": -5.044223785400391, "step": 2840 }, { - "epoch": 1.39, - "learning_rate": 2.977741585233442e-07, - "logits/chosen": -2.867978811264038, - "logits/rejected": -2.872471809387207, - "logps/chosen": -285.687255859375, - "logps/rejected": -393.1041259765625, - "loss": 0.1003, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.4830867052078247, - "rewards/margins": 7.980967044830322, - "rewards/rejected": -6.4978790283203125, + "epoch": 1.45, + "learning_rate": 2.8711379050489825e-07, + "logits/chosen": -2.377434253692627, + "logits/rejected": -2.3900840282440186, + "logps/chosen": -296.39239501953125, + "logps/rejected": -353.85516357421875, + "loss": 0.0987, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.18252840638160706, + "rewards/margins": 5.856029510498047, + "rewards/rejected": -5.673501491546631, "step": 2850 }, { - "epoch": 1.4, - "learning_rate": 2.968693449149475e-07, - "logits/chosen": -2.8083853721618652, - "logits/rejected": -2.8247978687286377, - "logps/chosen": -296.3735046386719, - "logps/rejected": -336.5116271972656, - "loss": 0.1187, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.6419092416763306, - "rewards/margins": 6.954775333404541, - "rewards/rejected": -5.3128662109375, + "epoch": 1.45, + "learning_rate": 2.8617181612660135e-07, + "logits/chosen": -2.3156490325927734, + "logits/rejected": -2.3677258491516113, + "logps/chosen": -287.42572021484375, + "logps/rejected": -316.38824462890625, + "loss": 0.0966, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.88239586353302, + "rewards/margins": 6.383681774139404, + "rewards/rejected": -5.501286506652832, "step": 2860 }, { - "epoch": 1.4, - "learning_rate": 2.9596453130655084e-07, - "logits/chosen": -2.8297431468963623, - "logits/rejected": -2.883183002471924, - "logps/chosen": -283.1943359375, - "logps/rejected": -324.32537841796875, - "loss": 0.0863, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.1296570301055908, - "rewards/margins": 6.565255641937256, - "rewards/rejected": -5.435599327087402, + "epoch": 1.46, + "learning_rate": 2.8522984174830445e-07, + "logits/chosen": -2.3602707386016846, + "logits/rejected": -2.4136736392974854, + "logps/chosen": -227.7016143798828, + "logps/rejected": -287.86175537109375, + "loss": 0.0952, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7701293230056763, + "rewards/margins": 6.855650901794434, + "rewards/rejected": -6.085522174835205, "step": 2870 }, { - "epoch": 1.41, - "learning_rate": 2.950597176981542e-07, - "logits/chosen": -2.801255464553833, - "logits/rejected": -2.822458028793335, - "logps/chosen": -305.63238525390625, - "logps/rejected": -276.85479736328125, - "loss": 0.1189, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.1281497478485107, - "rewards/margins": 6.085671424865723, - "rewards/rejected": -4.957521438598633, + "epoch": 1.46, + "learning_rate": 2.8428786737000755e-07, + "logits/chosen": -2.4034171104431152, + "logits/rejected": -2.5166029930114746, + "logps/chosen": -307.9529724121094, + "logps/rejected": -370.57269287109375, + "loss": 0.063, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6714164018630981, + "rewards/margins": 7.327467918395996, + "rewards/rejected": -6.6560516357421875, "step": 2880 }, { - "epoch": 1.41, - "learning_rate": 2.941549040897575e-07, - "logits/chosen": -2.8852226734161377, - "logits/rejected": -2.8858284950256348, - "logps/chosen": -309.46405029296875, - "logps/rejected": -324.0066223144531, - "loss": 0.1081, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3502447605133057, - "rewards/margins": 6.813738822937012, - "rewards/rejected": -5.463494300842285, + "epoch": 1.47, + "learning_rate": 2.8334589299171065e-07, + "logits/chosen": -2.331711769104004, + "logits/rejected": -2.382993459701538, + "logps/chosen": -295.94573974609375, + "logps/rejected": -307.9302673339844, + "loss": 0.0708, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8609134554862976, + "rewards/margins": 6.539481163024902, + "rewards/rejected": -5.678567409515381, "step": 2890 }, { - "epoch": 1.42, - "learning_rate": 2.9325009048136085e-07, - "logits/chosen": -2.7491602897644043, - "logits/rejected": -2.7655651569366455, - "logps/chosen": -320.40289306640625, - "logps/rejected": -359.29852294921875, - "loss": 0.0899, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.7242467403411865, - "rewards/margins": 7.11987829208374, - "rewards/rejected": -6.395631790161133, + "epoch": 1.48, + "learning_rate": 2.824039186134137e-07, + "logits/chosen": -2.412370204925537, + "logits/rejected": -2.394695997238159, + "logps/chosen": -280.772705078125, + "logps/rejected": -312.4391174316406, + "loss": 0.1009, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.9061541557312012, + "rewards/margins": 6.6510467529296875, + "rewards/rejected": -5.744892120361328, "step": 2900 }, { - "epoch": 1.42, - "eval_logits/chosen": -2.821112632751465, - "eval_logits/rejected": -2.820706367492676, - "eval_logps/chosen": -323.8685607910156, - "eval_logps/rejected": -308.16162109375, - "eval_loss": 0.5645406246185303, - "eval_rewards/accuracies": 0.7702205777168274, - "eval_rewards/chosen": -0.9007564187049866, - "eval_rewards/margins": 2.962090492248535, - "eval_rewards/rejected": -3.862847328186035, - "eval_runtime": 303.5053, - "eval_samples_per_second": 7.156, - "eval_steps_per_second": 0.448, + "epoch": 1.48, + "eval_logits/chosen": -2.4072649478912354, + "eval_logits/rejected": -2.483442544937134, + "eval_logps/chosen": -294.30072021484375, + "eval_logps/rejected": -294.0972900390625, + "eval_loss": 0.6101788282394409, + "eval_rewards/accuracies": 0.7347328066825867, + "eval_rewards/chosen": -1.6050245761871338, + "eval_rewards/margins": 1.9911925792694092, + "eval_rewards/rejected": -3.596216917037964, + "eval_runtime": 301.8264, + "eval_samples_per_second": 6.911, + "eval_steps_per_second": 0.434, "step": 2900 }, { - "epoch": 1.42, - "learning_rate": 2.923452768729642e-07, - "logits/chosen": -2.7951157093048096, - "logits/rejected": -2.8064324855804443, - "logps/chosen": -295.3318176269531, - "logps/rejected": -347.5905456542969, - "loss": 0.0467, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.402244210243225, - "rewards/margins": 7.6175689697265625, - "rewards/rejected": -6.215324878692627, + "epoch": 1.48, + "learning_rate": 2.8146194423511675e-07, + "logits/chosen": -2.2668776512145996, + "logits/rejected": -2.1470532417297363, + "logps/chosen": -253.3878936767578, + "logps/rejected": -266.178955078125, + "loss": 0.1025, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.08239737898111343, + "rewards/margins": 5.62557315826416, + "rewards/rejected": -5.543176174163818, "step": 2910 }, { - "epoch": 1.43, - "learning_rate": 2.914404632645675e-07, - "logits/chosen": -2.7112128734588623, - "logits/rejected": -2.7309176921844482, - "logps/chosen": -350.05218505859375, - "logps/rejected": -300.18670654296875, - "loss": 0.0805, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9612985849380493, - "rewards/margins": 6.709507942199707, - "rewards/rejected": -5.748209476470947, + "epoch": 1.49, + "learning_rate": 2.8051996985681985e-07, + "logits/chosen": -2.3062376976013184, + "logits/rejected": -2.282536029815674, + "logps/chosen": -260.6484375, + "logps/rejected": -307.1168518066406, + "loss": 0.0821, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.140464186668396, + "rewards/margins": 7.348119258880615, + "rewards/rejected": -6.207655429840088, "step": 2920 }, { - "epoch": 1.43, - "learning_rate": 2.9053564965617085e-07, - "logits/chosen": -2.8636326789855957, - "logits/rejected": -2.8802738189697266, - "logps/chosen": -294.0220642089844, - "logps/rejected": -340.5633544921875, - "loss": 0.1106, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.14403903484344482, - "rewards/margins": 6.803243160247803, - "rewards/rejected": -6.659204006195068, + "epoch": 1.49, + "learning_rate": 2.7957799547852295e-07, + "logits/chosen": -2.347249984741211, + "logits/rejected": -2.361231803894043, + "logps/chosen": -270.4031066894531, + "logps/rejected": -315.46820068359375, + "loss": 0.0929, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.9538537859916687, + "rewards/margins": 6.171472072601318, + "rewards/rejected": -5.217618465423584, "step": 2930 }, { - "epoch": 1.44, - "learning_rate": 2.896308360477742e-07, - "logits/chosen": -2.897508382797241, - "logits/rejected": -2.8970837593078613, - "logps/chosen": -289.94189453125, - "logps/rejected": -328.33099365234375, - "loss": 0.0637, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.09252788126468658, - "rewards/margins": 6.529477119445801, - "rewards/rejected": -6.436949253082275, + "epoch": 1.5, + "learning_rate": 2.7863602110022605e-07, + "logits/chosen": -2.223649501800537, + "logits/rejected": -2.401780128479004, + "logps/chosen": -278.6842346191406, + "logps/rejected": -321.9062194824219, + "loss": 0.0678, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5670070648193359, + "rewards/margins": 6.330507278442383, + "rewards/rejected": -5.763500213623047, "step": 2940 }, { - "epoch": 1.44, - "learning_rate": 2.8872602243937747e-07, - "logits/chosen": -2.840087413787842, - "logits/rejected": -2.8241333961486816, - "logps/chosen": -290.3756103515625, - "logps/rejected": -346.1082458496094, - "loss": 0.1018, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7968460917472839, - "rewards/margins": 7.29601526260376, - "rewards/rejected": -6.499168395996094, + "epoch": 1.5, + "learning_rate": 2.7769404672192915e-07, + "logits/chosen": -2.369929552078247, + "logits/rejected": -2.3821492195129395, + "logps/chosen": -314.00030517578125, + "logps/rejected": -328.1274719238281, + "loss": 0.1037, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.6795863509178162, + "rewards/margins": 5.990577697753906, + "rewards/rejected": -5.310991287231445, "step": 2950 }, { - "epoch": 1.45, - "learning_rate": 2.878212088309808e-07, - "logits/chosen": -2.8064072132110596, - "logits/rejected": -2.839463233947754, - "logps/chosen": -302.9584045410156, - "logps/rejected": -377.13421630859375, - "loss": 0.0628, + "epoch": 1.51, + "learning_rate": 2.7675207234363225e-07, + "logits/chosen": -2.370741128921509, + "logits/rejected": -2.3149704933166504, + "logps/chosen": -315.950927734375, + "logps/rejected": -310.5325622558594, + "loss": 0.1237, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.0208656787872314, - "rewards/margins": 7.858162879943848, - "rewards/rejected": -6.837296962738037, + "rewards/chosen": 0.6832824349403381, + "rewards/margins": 6.224267482757568, + "rewards/rejected": -5.540985107421875, "step": 2960 }, { - "epoch": 1.45, - "learning_rate": 2.8691639522258414e-07, - "logits/chosen": -2.816514492034912, - "logits/rejected": -2.8173654079437256, - "logps/chosen": -321.41754150390625, - "logps/rejected": -363.37969970703125, - "loss": 0.1439, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.17609742283821106, - "rewards/margins": 6.283827781677246, - "rewards/rejected": -6.107730865478516, + "epoch": 1.51, + "learning_rate": 2.7581009796533535e-07, + "logits/chosen": -2.3546643257141113, + "logits/rejected": -2.3952019214630127, + "logps/chosen": -286.61553955078125, + "logps/rejected": -338.02801513671875, + "loss": 0.0919, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4628432989120483, + "rewards/margins": 7.785639762878418, + "rewards/rejected": -6.32279634475708, "step": 2970 }, { - "epoch": 1.46, - "learning_rate": 2.860115816141875e-07, - "logits/chosen": -2.8800814151763916, - "logits/rejected": -2.8853232860565186, - "logps/chosen": -266.3177185058594, - "logps/rejected": -284.6675109863281, - "loss": 0.1006, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.45333433151245117, - "rewards/margins": 5.221524238586426, - "rewards/rejected": -4.768189430236816, + "epoch": 1.52, + "learning_rate": 2.748681235870384e-07, + "logits/chosen": -2.380227565765381, + "logits/rejected": -2.429625988006592, + "logps/chosen": -289.5753173828125, + "logps/rejected": -339.2673645019531, + "loss": 0.101, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.3829438090324402, + "rewards/margins": 7.5160417556762695, + "rewards/rejected": -7.1330976486206055, "step": 2980 }, { - "epoch": 1.46, - "learning_rate": 2.851067680057908e-07, - "logits/chosen": -2.717090129852295, - "logits/rejected": -2.7554845809936523, - "logps/chosen": -289.6501159667969, - "logps/rejected": -278.08306884765625, - "loss": 0.1773, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.1516077071428299, - "rewards/margins": 5.626119613647461, - "rewards/rejected": -5.474511623382568, + "epoch": 1.52, + "learning_rate": 2.739261492087415e-07, + "logits/chosen": -2.327038526535034, + "logits/rejected": -2.2786455154418945, + "logps/chosen": -259.84698486328125, + "logps/rejected": -318.90228271484375, + "loss": 0.0979, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.09783251583576202, + "rewards/margins": 5.981132507324219, + "rewards/rejected": -6.078965187072754, "step": 2990 }, { - "epoch": 1.47, - "learning_rate": 2.8420195439739415e-07, - "logits/chosen": -2.863051652908325, - "logits/rejected": -2.8845012187957764, - "logps/chosen": -278.0904541015625, - "logps/rejected": -277.69866943359375, - "loss": 0.1434, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.6807212233543396, - "rewards/margins": 5.829002380371094, - "rewards/rejected": -5.148281097412109, + "epoch": 1.53, + "learning_rate": 2.729841748304446e-07, + "logits/chosen": -2.2745556831359863, + "logits/rejected": -2.400892734527588, + "logps/chosen": -275.0760192871094, + "logps/rejected": -331.4722595214844, + "loss": 0.083, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.23459462821483612, + "rewards/margins": 6.325497627258301, + "rewards/rejected": -6.090902805328369, "step": 3000 }, { - "epoch": 1.47, - "eval_logits/chosen": -2.8758153915405273, - "eval_logits/rejected": -2.8740193843841553, - "eval_logps/chosen": -321.7423095703125, - "eval_logps/rejected": -302.65899658203125, - "eval_loss": 0.5709971785545349, - "eval_rewards/accuracies": 0.7702205777168274, - "eval_rewards/chosen": -0.688130795955658, - "eval_rewards/margins": 2.6244583129882812, - "eval_rewards/rejected": -3.312589406967163, - "eval_runtime": 303.5414, - "eval_samples_per_second": 7.156, - "eval_steps_per_second": 0.448, + "epoch": 1.53, + "eval_logits/chosen": -2.452087163925171, + "eval_logits/rejected": -2.5306129455566406, + "eval_logps/chosen": -294.6455078125, + "eval_logps/rejected": -294.8183898925781, + "eval_loss": 0.6125035881996155, + "eval_rewards/accuracies": 0.7423664331436157, + "eval_rewards/chosen": -1.6395032405853271, + "eval_rewards/margins": 2.028822660446167, + "eval_rewards/rejected": -3.668325424194336, + "eval_runtime": 297.2135, + "eval_samples_per_second": 7.019, + "eval_steps_per_second": 0.441, "step": 3000 }, { - "epoch": 1.47, - "learning_rate": 2.832971407889975e-07, - "logits/chosen": -2.91792893409729, - "logits/rejected": -2.9293243885040283, - "logps/chosen": -355.6678466796875, - "logps/rejected": -300.4123229980469, - "loss": 0.1303, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.3436975479125977, - "rewards/margins": 6.888288974761963, - "rewards/rejected": -5.544590473175049, + "epoch": 1.53, + "learning_rate": 2.720422004521477e-07, + "logits/chosen": -2.395345687866211, + "logits/rejected": -2.413201332092285, + "logps/chosen": -297.8402099609375, + "logps/rejected": -331.8768615722656, + "loss": 0.0756, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8696461915969849, + "rewards/margins": 6.27551794052124, + "rewards/rejected": -5.405871868133545, "step": 3010 }, { - "epoch": 1.48, - "learning_rate": 2.823923271806008e-07, - "logits/chosen": -2.8839268684387207, - "logits/rejected": -2.885683298110962, - "logps/chosen": -294.4696350097656, - "logps/rejected": -331.1778564453125, - "loss": 0.0898, + "epoch": 1.54, + "learning_rate": 2.711002260738508e-07, + "logits/chosen": -2.4581711292266846, + "logits/rejected": -2.4832825660705566, + "logps/chosen": -280.2608947753906, + "logps/rejected": -315.7003173828125, + "loss": 0.0761, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.2089078426361084, - "rewards/margins": 6.403166770935059, - "rewards/rejected": -5.194258689880371, + "rewards/chosen": 1.1012874841690063, + "rewards/margins": 6.882163047790527, + "rewards/rejected": -5.780875205993652, "step": 3020 }, { - "epoch": 1.48, - "learning_rate": 2.8148751357220415e-07, - "logits/chosen": -2.840822696685791, - "logits/rejected": -2.8854775428771973, - "logps/chosen": -344.8243408203125, - "logps/rejected": -331.1698913574219, - "loss": 0.1487, + "epoch": 1.54, + "learning_rate": 2.701582516955539e-07, + "logits/chosen": -2.3183887004852295, + "logits/rejected": -2.3755974769592285, + "logps/chosen": -273.87481689453125, + "logps/rejected": -323.4425354003906, + "loss": 0.081, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2432057857513428, - "rewards/margins": 5.621723651885986, - "rewards/rejected": -4.378518104553223, + "rewards/chosen": 0.9814306497573853, + "rewards/margins": 6.785541534423828, + "rewards/rejected": -5.804111003875732, "step": 3030 }, { - "epoch": 1.49, - "learning_rate": 2.8058269996380744e-07, - "logits/chosen": -2.7500689029693604, - "logits/rejected": -2.7774271965026855, - "logps/chosen": -345.95068359375, - "logps/rejected": -281.6057434082031, - "loss": 0.0817, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.7723453044891357, - "rewards/margins": 7.820898532867432, - "rewards/rejected": -6.048552989959717, + "epoch": 1.55, + "learning_rate": 2.6921627731725695e-07, + "logits/chosen": -2.3994407653808594, + "logits/rejected": -2.3673055171966553, + "logps/chosen": -288.03717041015625, + "logps/rejected": -322.63824462890625, + "loss": 0.0837, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.7873355150222778, + "rewards/margins": 6.48495626449585, + "rewards/rejected": -5.6976213455200195, "step": 3040 }, { - "epoch": 1.49, - "learning_rate": 2.7967788635541077e-07, - "logits/chosen": -2.75341534614563, - "logits/rejected": -2.7814879417419434, - "logps/chosen": -316.38275146484375, - "logps/rejected": -317.58624267578125, - "loss": 0.1494, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.8149929046630859, - "rewards/margins": 7.353440284729004, - "rewards/rejected": -6.53844690322876, + "epoch": 1.55, + "learning_rate": 2.6827430293896005e-07, + "logits/chosen": -2.278913736343384, + "logits/rejected": -2.270564079284668, + "logps/chosen": -297.72705078125, + "logps/rejected": -291.41119384765625, + "loss": 0.0998, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.32713982462882996, + "rewards/margins": 5.756745338439941, + "rewards/rejected": -5.429605007171631, "step": 3050 }, { - "epoch": 1.49, - "learning_rate": 2.787730727470141e-07, - "logits/chosen": -2.768599271774292, - "logits/rejected": -2.7462639808654785, - "logps/chosen": -352.63482666015625, - "logps/rejected": -315.82037353515625, - "loss": 0.0877, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.4773457646369934, - "rewards/margins": 6.272608280181885, - "rewards/rejected": -5.795262336730957, + "epoch": 1.56, + "learning_rate": 2.673323285606631e-07, + "logits/chosen": -2.3376736640930176, + "logits/rejected": -2.242246389389038, + "logps/chosen": -211.8117218017578, + "logps/rejected": -277.4327087402344, + "loss": 0.1517, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6408033967018127, + "rewards/margins": 5.57335090637207, + "rewards/rejected": -4.932547092437744, "step": 3060 }, { - "epoch": 1.5, - "learning_rate": 2.7786825913861744e-07, - "logits/chosen": -2.840207099914551, - "logits/rejected": -2.8674092292785645, - "logps/chosen": -306.32049560546875, - "logps/rejected": -375.2984924316406, - "loss": 0.1285, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.2844650149345398, - "rewards/margins": 6.729122161865234, - "rewards/rejected": -6.444657325744629, + "epoch": 1.56, + "learning_rate": 2.663903541823662e-07, + "logits/chosen": -2.285000801086426, + "logits/rejected": -2.2973408699035645, + "logps/chosen": -324.43658447265625, + "logps/rejected": -333.1507263183594, + "loss": 0.0702, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8930590748786926, + "rewards/margins": 6.265158653259277, + "rewards/rejected": -5.372099876403809, "step": 3070 }, { - "epoch": 1.5, - "learning_rate": 2.769634455302208e-07, - "logits/chosen": -2.9240241050720215, - "logits/rejected": -2.915964126586914, - "logps/chosen": -335.8831787109375, - "logps/rejected": -288.6899719238281, - "loss": 0.1498, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.4704078733921051, - "rewards/margins": 6.695226192474365, - "rewards/rejected": -6.224818229675293, + "epoch": 1.57, + "learning_rate": 2.654483798040693e-07, + "logits/chosen": -2.310891628265381, + "logits/rejected": -2.262753963470459, + "logps/chosen": -254.897705078125, + "logps/rejected": -348.63897705078125, + "loss": 0.1367, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6549896597862244, + "rewards/margins": 6.947043418884277, + "rewards/rejected": -6.292054176330566, "step": 3080 }, { - "epoch": 1.51, - "learning_rate": 2.760586319218241e-07, - "logits/chosen": -2.8168272972106934, - "logits/rejected": -2.8107285499572754, - "logps/chosen": -308.25640869140625, - "logps/rejected": -355.07073974609375, - "loss": 0.0876, + "epoch": 1.57, + "learning_rate": 2.645064054257724e-07, + "logits/chosen": -2.2977728843688965, + "logits/rejected": -2.272675037384033, + "logps/chosen": -337.22393798828125, + "logps/rejected": -307.8287353515625, + "loss": 0.0783, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.5022165179252625, - "rewards/margins": 7.053918361663818, - "rewards/rejected": -6.55170202255249, + "rewards/chosen": 0.5678704977035522, + "rewards/margins": 6.053879737854004, + "rewards/rejected": -5.486009120941162, "step": 3090 }, { - "epoch": 1.51, - "learning_rate": 2.7515381831342745e-07, - "logits/chosen": -2.8010165691375732, - "logits/rejected": -2.8165507316589355, - "logps/chosen": -297.3454895019531, - "logps/rejected": -327.2693176269531, - "loss": 0.1172, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.01988457515835762, - "rewards/margins": 6.278430938720703, - "rewards/rejected": -6.298314571380615, + "epoch": 1.58, + "learning_rate": 2.635644310474755e-07, + "logits/chosen": -2.270324230194092, + "logits/rejected": -2.3138184547424316, + "logps/chosen": -277.0635070800781, + "logps/rejected": -384.27130126953125, + "loss": 0.0871, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.7171944379806519, + "rewards/margins": 7.3045196533203125, + "rewards/rejected": -6.587324619293213, "step": 3100 }, { - "epoch": 1.51, - "eval_logits/chosen": -2.8470873832702637, - "eval_logits/rejected": -2.8496205806732178, - "eval_logps/chosen": -325.89813232421875, - "eval_logps/rejected": -306.9757385253906, - "eval_loss": 0.5433418154716492, - "eval_rewards/accuracies": 0.7849264740943909, - "eval_rewards/chosen": -1.1037135124206543, - "eval_rewards/margins": 2.640545129776001, - "eval_rewards/rejected": -3.7442586421966553, - "eval_runtime": 303.2389, - "eval_samples_per_second": 7.163, - "eval_steps_per_second": 0.448, + "epoch": 1.58, + "eval_logits/chosen": -2.4278504848480225, + "eval_logits/rejected": -2.5031864643096924, + "eval_logps/chosen": -295.6979064941406, + "eval_logps/rejected": -296.3849792480469, + "eval_loss": 0.6392149329185486, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -1.7447423934936523, + "eval_rewards/margins": 2.080242872238159, + "eval_rewards/rejected": -3.8249852657318115, + "eval_runtime": 302.4911, + "eval_samples_per_second": 6.896, + "eval_steps_per_second": 0.433, "step": 3100 }, { - "epoch": 1.52, - "learning_rate": 2.742490047050308e-07, - "logits/chosen": -2.846949815750122, - "logits/rejected": -2.8920021057128906, - "logps/chosen": -234.6263885498047, - "logps/rejected": -275.60748291015625, - "loss": 0.1248, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.217097207903862, - "rewards/margins": 5.636806488037109, - "rewards/rejected": -5.4197096824646, + "epoch": 1.58, + "learning_rate": 2.626224566691786e-07, + "logits/chosen": -2.338169574737549, + "logits/rejected": -2.2731688022613525, + "logps/chosen": -271.0912170410156, + "logps/rejected": -359.5860290527344, + "loss": 0.075, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.45801377296447754, + "rewards/margins": 6.8046746253967285, + "rewards/rejected": -6.346660614013672, "step": 3110 }, { - "epoch": 1.52, - "learning_rate": 2.7334419109663407e-07, - "logits/chosen": -2.825108051300049, - "logits/rejected": -2.8783891201019287, - "logps/chosen": -337.71661376953125, - "logps/rejected": -336.48052978515625, - "loss": 0.0879, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.0719419717788696, - "rewards/margins": 6.54522180557251, - "rewards/rejected": -5.4732794761657715, + "epoch": 1.59, + "learning_rate": 2.616804822908817e-07, + "logits/chosen": -2.268383741378784, + "logits/rejected": -2.2524795532226562, + "logps/chosen": -249.7799530029297, + "logps/rejected": -285.2999572753906, + "loss": 0.0824, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.010730976238846779, + "rewards/margins": 5.435656547546387, + "rewards/rejected": -5.446387767791748, "step": 3120 }, { - "epoch": 1.53, - "learning_rate": 2.724393774882374e-07, - "logits/chosen": -2.8448872566223145, - "logits/rejected": -2.871584892272949, - "logps/chosen": -301.98468017578125, - "logps/rejected": -326.4334411621094, - "loss": 0.0937, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.7110536098480225, - "rewards/margins": 7.033717155456543, - "rewards/rejected": -6.322664737701416, + "epoch": 1.59, + "learning_rate": 2.607385079125848e-07, + "logits/chosen": -2.304624080657959, + "logits/rejected": -2.3206493854522705, + "logps/chosen": -249.6020965576172, + "logps/rejected": -293.7482604980469, + "loss": 0.0697, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7361634373664856, + "rewards/margins": 6.254228591918945, + "rewards/rejected": -5.518064975738525, "step": 3130 }, { - "epoch": 1.53, - "learning_rate": 2.7153456387984074e-07, - "logits/chosen": -2.8696374893188477, - "logits/rejected": -2.854140281677246, - "logps/chosen": -302.44622802734375, - "logps/rejected": -331.2200012207031, - "loss": 0.0787, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.9861844778060913, - "rewards/margins": 5.889410972595215, - "rewards/rejected": -4.90322732925415, + "epoch": 1.6, + "learning_rate": 2.597965335342879e-07, + "logits/chosen": -2.422750949859619, + "logits/rejected": -2.4973697662353516, + "logps/chosen": -302.3764343261719, + "logps/rejected": -321.0247497558594, + "loss": 0.0583, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8100934028625488, + "rewards/margins": 7.106518745422363, + "rewards/rejected": -6.296424865722656, "step": 3140 }, { - "epoch": 1.54, - "learning_rate": 2.7062975027144407e-07, - "logits/chosen": -2.894418239593506, - "logits/rejected": -2.9059743881225586, - "logps/chosen": -293.79046630859375, - "logps/rejected": -282.673828125, - "loss": 0.1165, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.676483690738678, - "rewards/margins": 5.742424488067627, - "rewards/rejected": -5.065940856933594, + "epoch": 1.6, + "learning_rate": 2.5885455915599095e-07, + "logits/chosen": -2.3840737342834473, + "logits/rejected": -2.5003018379211426, + "logps/chosen": -304.78143310546875, + "logps/rejected": -324.4736022949219, + "loss": 0.1689, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0771976709365845, + "rewards/margins": 6.414263725280762, + "rewards/rejected": -5.337065696716309, "step": 3150 }, { - "epoch": 1.54, - "learning_rate": 2.697249366630474e-07, - "logits/chosen": -2.8392021656036377, - "logits/rejected": -2.871311664581299, - "logps/chosen": -318.05841064453125, - "logps/rejected": -330.6488037109375, - "loss": 0.0839, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9116417765617371, - "rewards/margins": 6.07071590423584, - "rewards/rejected": -5.159074306488037, + "epoch": 1.61, + "learning_rate": 2.57912584777694e-07, + "logits/chosen": -2.5240299701690674, + "logits/rejected": -2.528115749359131, + "logps/chosen": -292.08929443359375, + "logps/rejected": -291.10101318359375, + "loss": 0.0777, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.09714029729366302, + "rewards/margins": 5.437426567077637, + "rewards/rejected": -5.340286731719971, "step": 3160 }, { - "epoch": 1.55, - "learning_rate": 2.6882012305465074e-07, - "logits/chosen": -2.898746967315674, - "logits/rejected": -2.926267147064209, - "logps/chosen": -309.84490966796875, - "logps/rejected": -279.7029113769531, - "loss": 0.1225, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.41145652532577515, - "rewards/margins": 5.633301258087158, - "rewards/rejected": -5.221844673156738, + "epoch": 1.61, + "learning_rate": 2.569706103993971e-07, + "logits/chosen": -2.579284191131592, + "logits/rejected": -2.504931926727295, + "logps/chosen": -318.3880310058594, + "logps/rejected": -317.40771484375, + "loss": 0.0932, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.8843173980712891, + "rewards/margins": 6.076625823974609, + "rewards/rejected": -5.192307949066162, "step": 3170 }, { - "epoch": 1.55, - "learning_rate": 2.679153094462541e-07, - "logits/chosen": -2.8898568153381348, - "logits/rejected": -2.900376558303833, - "logps/chosen": -324.8934020996094, - "logps/rejected": -330.2880554199219, - "loss": 0.086, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.687757670879364, - "rewards/margins": 6.64199161529541, - "rewards/rejected": -5.954233169555664, + "epoch": 1.62, + "learning_rate": 2.560286360211002e-07, + "logits/chosen": -2.4689183235168457, + "logits/rejected": -2.5415992736816406, + "logps/chosen": -231.2115936279297, + "logps/rejected": -298.3565979003906, + "loss": 0.1103, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.31711095571517944, + "rewards/margins": 6.669442653656006, + "rewards/rejected": -6.35233211517334, "step": 3180 }, { - "epoch": 1.56, - "learning_rate": 2.670104958378574e-07, - "logits/chosen": -2.8804478645324707, - "logits/rejected": -2.8809070587158203, - "logps/chosen": -308.6686096191406, - "logps/rejected": -298.6070556640625, - "loss": 0.0952, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.4444310665130615, - "rewards/margins": 6.013411521911621, - "rewards/rejected": -5.5689802169799805, + "epoch": 1.62, + "learning_rate": 2.550866616428033e-07, + "logits/chosen": -2.434196949005127, + "logits/rejected": -2.555429220199585, + "logps/chosen": -269.8191223144531, + "logps/rejected": -347.5003967285156, + "loss": 0.1087, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.26513025164604187, + "rewards/margins": 6.933467864990234, + "rewards/rejected": -6.668337345123291, "step": 3190 }, { - "epoch": 1.56, - "learning_rate": 2.6610568222946075e-07, - "logits/chosen": -2.8997981548309326, - "logits/rejected": -2.8945791721343994, - "logps/chosen": -259.23675537109375, - "logps/rejected": -303.37750244140625, - "loss": 0.0997, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.0836852565407753, - "rewards/margins": 5.9700398445129395, - "rewards/rejected": -5.886354923248291, + "epoch": 1.63, + "learning_rate": 2.541446872645064e-07, + "logits/chosen": -2.322817802429199, + "logits/rejected": -2.36637020111084, + "logps/chosen": -289.0831604003906, + "logps/rejected": -283.831787109375, + "loss": 0.1168, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.49825185537338257, + "rewards/margins": 6.191859245300293, + "rewards/rejected": -5.693608283996582, "step": 3200 }, { - "epoch": 1.56, - "eval_logits/chosen": -2.8260486125946045, - "eval_logits/rejected": -2.820084571838379, - "eval_logps/chosen": -323.536376953125, - "eval_logps/rejected": -306.5312194824219, - "eval_loss": 0.5483145117759705, - "eval_rewards/accuracies": 0.7757353186607361, - "eval_rewards/chosen": -0.8675331473350525, - "eval_rewards/margins": 2.832273006439209, - "eval_rewards/rejected": -3.6998064517974854, - "eval_runtime": 303.5135, - "eval_samples_per_second": 7.156, - "eval_steps_per_second": 0.448, + "epoch": 1.63, + "eval_logits/chosen": -2.4606454372406006, + "eval_logits/rejected": -2.5371811389923096, + "eval_logps/chosen": -294.4764099121094, + "eval_logps/rejected": -293.7373962402344, + "eval_loss": 0.5972779989242554, + "eval_rewards/accuracies": 0.7442747950553894, + "eval_rewards/chosen": -1.622592568397522, + "eval_rewards/margins": 1.9376325607299805, + "eval_rewards/rejected": -3.560225248336792, + "eval_runtime": 297.2703, + "eval_samples_per_second": 7.017, + "eval_steps_per_second": 0.441, "step": 3200 }, { - "epoch": 1.57, - "learning_rate": 2.6520086862106403e-07, - "logits/chosen": -2.8775906562805176, - "logits/rejected": -2.9047904014587402, - "logps/chosen": -325.9310607910156, - "logps/rejected": -300.457763671875, - "loss": 0.0816, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.3448231220245361, - "rewards/margins": 6.864799499511719, - "rewards/rejected": -5.5199761390686035, + "epoch": 1.63, + "learning_rate": 2.532027128862095e-07, + "logits/chosen": -2.296696186065674, + "logits/rejected": -2.2465457916259766, + "logps/chosen": -268.1493835449219, + "logps/rejected": -312.7187194824219, + "loss": 0.0762, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5331908464431763, + "rewards/margins": 6.631085395812988, + "rewards/rejected": -6.097894191741943, "step": 3210 }, { - "epoch": 1.57, - "learning_rate": 2.6429605501266737e-07, - "logits/chosen": -2.862781286239624, - "logits/rejected": -2.848540782928467, - "logps/chosen": -345.00396728515625, - "logps/rejected": -371.2835998535156, - "loss": 0.0672, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.1944535970687866, - "rewards/margins": 7.275880336761475, - "rewards/rejected": -6.081427097320557, + "epoch": 1.64, + "learning_rate": 2.5226073850791255e-07, + "logits/chosen": -2.3496594429016113, + "logits/rejected": -2.3881731033325195, + "logps/chosen": -289.9051513671875, + "logps/rejected": -349.10455322265625, + "loss": 0.0827, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.41470688581466675, + "rewards/margins": 6.188368320465088, + "rewards/rejected": -5.773660659790039, "step": 3220 }, { - "epoch": 1.58, - "learning_rate": 2.633912414042707e-07, - "logits/chosen": -2.8544182777404785, - "logits/rejected": -2.838639497756958, - "logps/chosen": -332.526123046875, - "logps/rejected": -317.78045654296875, - "loss": 0.0992, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.5498558282852173, - "rewards/margins": 6.975174903869629, - "rewards/rejected": -6.425318717956543, + "epoch": 1.64, + "learning_rate": 2.5131876412961565e-07, + "logits/chosen": -2.4228880405426025, + "logits/rejected": -2.3895740509033203, + "logps/chosen": -256.2016296386719, + "logps/rejected": -296.44708251953125, + "loss": 0.0787, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.17232027649879456, + "rewards/margins": 5.873257160186768, + "rewards/rejected": -5.700936794281006, "step": 3230 }, { - "epoch": 1.58, - "learning_rate": 2.6248642779587404e-07, - "logits/chosen": -2.845302104949951, - "logits/rejected": -2.838576555252075, - "logps/chosen": -318.8706970214844, - "logps/rejected": -349.5872497558594, - "loss": 0.0902, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5205594897270203, - "rewards/margins": 6.665504455566406, - "rewards/rejected": -6.14494514465332, + "epoch": 1.65, + "learning_rate": 2.5037678975131875e-07, + "logits/chosen": -2.368595600128174, + "logits/rejected": -2.3985543251037598, + "logps/chosen": -286.7682189941406, + "logps/rejected": -321.578125, + "loss": 0.1001, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.45928245782852173, + "rewards/margins": 7.228116512298584, + "rewards/rejected": -6.768834114074707, "step": 3240 }, { - "epoch": 1.59, - "learning_rate": 2.6158161418747737e-07, - "logits/chosen": -2.8233635425567627, - "logits/rejected": -2.8411450386047363, - "logps/chosen": -353.85711669921875, - "logps/rejected": -311.1195068359375, - "loss": 0.1122, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9118728637695312, - "rewards/margins": 6.6601057052612305, - "rewards/rejected": -5.748231887817383, + "epoch": 1.65, + "learning_rate": 2.4943481537302185e-07, + "logits/chosen": -2.3476357460021973, + "logits/rejected": -2.454817056655884, + "logps/chosen": -273.3967590332031, + "logps/rejected": -285.05596923828125, + "loss": 0.1003, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.4910641610622406, + "rewards/margins": 5.733672618865967, + "rewards/rejected": -5.242608547210693, "step": 3250 }, { - "epoch": 1.59, - "learning_rate": 2.606768005790807e-07, - "logits/chosen": -2.8316612243652344, - "logits/rejected": -2.8859477043151855, - "logps/chosen": -392.78570556640625, - "logps/rejected": -351.75628662109375, - "loss": 0.106, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.38626447319984436, - "rewards/margins": 6.8547163009643555, - "rewards/rejected": -6.468452453613281, + "epoch": 1.66, + "learning_rate": 2.484928409947249e-07, + "logits/chosen": -2.3722920417785645, + "logits/rejected": -2.3269031047821045, + "logps/chosen": -285.6039733886719, + "logps/rejected": -324.99139404296875, + "loss": 0.0677, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0653159618377686, + "rewards/margins": 7.1584906578063965, + "rewards/rejected": -6.093174934387207, "step": 3260 }, { - "epoch": 1.6, - "learning_rate": 2.5977198697068404e-07, - "logits/chosen": -2.9022767543792725, - "logits/rejected": -2.8828883171081543, - "logps/chosen": -370.77899169921875, - "logps/rejected": -328.6186218261719, - "loss": 0.1423, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.2938607931137085, - "rewards/margins": 6.228670597076416, - "rewards/rejected": -4.934809684753418, + "epoch": 1.66, + "learning_rate": 2.47550866616428e-07, + "logits/chosen": -2.284839153289795, + "logits/rejected": -2.3606677055358887, + "logps/chosen": -289.9405822753906, + "logps/rejected": -329.0007019042969, + "loss": 0.1531, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04473470523953438, + "rewards/margins": 5.93240213394165, + "rewards/rejected": -5.977137088775635, "step": 3270 }, { - "epoch": 1.6, - "learning_rate": 2.588671733622874e-07, - "logits/chosen": -2.9139137268066406, - "logits/rejected": -2.886974811553955, - "logps/chosen": -303.1072998046875, - "logps/rejected": -292.7776794433594, - "loss": 0.1177, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.7752214670181274, - "rewards/margins": 7.429418087005615, - "rewards/rejected": -5.654197692871094, + "epoch": 1.67, + "learning_rate": 2.466088922381311e-07, + "logits/chosen": -2.3747289180755615, + "logits/rejected": -2.3340859413146973, + "logps/chosen": -296.7153015136719, + "logps/rejected": -337.93951416015625, + "loss": 0.1289, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3914230763912201, + "rewards/margins": 6.5019402503967285, + "rewards/rejected": -6.110517501831055, "step": 3280 }, { - "epoch": 1.61, - "learning_rate": 2.579623597538907e-07, - "logits/chosen": -2.865157127380371, - "logits/rejected": -2.8995003700256348, - "logps/chosen": -329.9259948730469, - "logps/rejected": -332.6876525878906, - "loss": 0.1406, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.4454152584075928, - "rewards/margins": 7.6604204177856445, - "rewards/rejected": -6.215004920959473, + "epoch": 1.67, + "learning_rate": 2.456669178598342e-07, + "logits/chosen": -2.3325932025909424, + "logits/rejected": -2.383507251739502, + "logps/chosen": -251.0063018798828, + "logps/rejected": -311.6315612792969, + "loss": 0.1102, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5094572901725769, + "rewards/margins": 6.646310329437256, + "rewards/rejected": -6.136853218078613, "step": 3290 }, { - "epoch": 1.61, - "learning_rate": 2.57057546145494e-07, - "logits/chosen": -2.881693124771118, - "logits/rejected": -2.8624751567840576, - "logps/chosen": -333.00604248046875, - "logps/rejected": -330.82501220703125, - "loss": 0.0793, + "epoch": 1.68, + "learning_rate": 2.447249434815373e-07, + "logits/chosen": -2.4116387367248535, + "logits/rejected": -2.454158067703247, + "logps/chosen": -355.3347473144531, + "logps/rejected": -343.6013488769531, + "loss": 0.0699, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 2.1855154037475586, - "rewards/margins": 7.756080627441406, - "rewards/rejected": -5.570566177368164, + "rewards/chosen": 0.9019377827644348, + "rewards/margins": 6.828858852386475, + "rewards/rejected": -5.926921367645264, "step": 3300 }, { - "epoch": 1.61, - "eval_logits/chosen": -2.863645315170288, - "eval_logits/rejected": -2.8552770614624023, - "eval_logps/chosen": -318.41302490234375, - "eval_logps/rejected": -301.8674011230469, - "eval_loss": 0.5520920753479004, - "eval_rewards/accuracies": 0.7886029481887817, - "eval_rewards/chosen": -0.35519903898239136, - "eval_rewards/margins": 2.878228187561035, - "eval_rewards/rejected": -3.233427047729492, - "eval_runtime": 303.6066, - "eval_samples_per_second": 7.154, - "eval_steps_per_second": 0.448, + "epoch": 1.68, + "eval_logits/chosen": -2.452702522277832, + "eval_logits/rejected": -2.5287041664123535, + "eval_logps/chosen": -294.6330871582031, + "eval_logps/rejected": -293.4993591308594, + "eval_loss": 0.5815873146057129, + "eval_rewards/accuracies": 0.7423664331436157, + "eval_rewards/chosen": -1.6382594108581543, + "eval_rewards/margins": 1.89816153049469, + "eval_rewards/rejected": -3.536421298980713, + "eval_runtime": 302.0151, + "eval_samples_per_second": 6.907, + "eval_steps_per_second": 0.434, "step": 3300 }, { - "epoch": 1.62, - "learning_rate": 2.5615273253709733e-07, - "logits/chosen": -2.8036551475524902, - "logits/rejected": -2.82938551902771, - "logps/chosen": -298.8094787597656, - "logps/rejected": -381.90087890625, - "loss": 0.1509, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.318374752998352, - "rewards/margins": 7.55935525894165, - "rewards/rejected": -6.240981101989746, + "epoch": 1.68, + "learning_rate": 2.437829691032404e-07, + "logits/chosen": -2.336447238922119, + "logits/rejected": -2.3389265537261963, + "logps/chosen": -306.2047424316406, + "logps/rejected": -323.0862731933594, + "loss": 0.076, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6071743965148926, + "rewards/margins": 6.342054843902588, + "rewards/rejected": -5.734879970550537, "step": 3310 }, { - "epoch": 1.62, - "learning_rate": 2.5524791892870067e-07, - "logits/chosen": -2.836329460144043, - "logits/rejected": -2.8463408946990967, - "logps/chosen": -322.0050048828125, - "logps/rejected": -332.37884521484375, - "loss": 0.1102, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.900260329246521, - "rewards/margins": 7.130246639251709, - "rewards/rejected": -5.22998571395874, + "epoch": 1.69, + "learning_rate": 2.4284099472494345e-07, + "logits/chosen": -2.3741564750671387, + "logits/rejected": -2.2548439502716064, + "logps/chosen": -272.26300048828125, + "logps/rejected": -310.686279296875, + "loss": 0.0688, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2735249698162079, + "rewards/margins": 6.651821136474609, + "rewards/rejected": -6.378296375274658, "step": 3320 }, { - "epoch": 1.63, - "learning_rate": 2.54343105320304e-07, - "logits/chosen": -2.8922011852264404, - "logits/rejected": -2.9095520973205566, - "logps/chosen": -290.95697021484375, - "logps/rejected": -326.2243957519531, - "loss": 0.1057, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.1625845432281494, - "rewards/margins": 6.746781826019287, - "rewards/rejected": -5.584197044372559, + "epoch": 1.69, + "learning_rate": 2.4189902034664655e-07, + "logits/chosen": -2.2759578227996826, + "logits/rejected": -2.294002056121826, + "logps/chosen": -269.53619384765625, + "logps/rejected": -372.09063720703125, + "loss": 0.0551, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.05612024664878845, + "rewards/margins": 6.168383598327637, + "rewards/rejected": -6.224503517150879, "step": 3330 }, { - "epoch": 1.63, - "learning_rate": 2.5343829171190734e-07, - "logits/chosen": -2.9187679290771484, - "logits/rejected": -2.9071030616760254, - "logps/chosen": -336.9691162109375, - "logps/rejected": -345.3871765136719, - "loss": 0.0738, + "epoch": 1.7, + "learning_rate": 2.4095704596834965e-07, + "logits/chosen": -2.2575266361236572, + "logits/rejected": -2.26688814163208, + "logps/chosen": -275.0549621582031, + "logps/rejected": -283.65008544921875, + "loss": 0.0856, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 2.2011337280273438, - "rewards/margins": 8.318750381469727, - "rewards/rejected": -6.117617130279541, + "rewards/chosen": 0.3581641614437103, + "rewards/margins": 5.8348259925842285, + "rewards/rejected": -5.476661682128906, "step": 3340 }, { - "epoch": 1.64, - "learning_rate": 2.5253347810351067e-07, - "logits/chosen": -2.929417371749878, - "logits/rejected": -2.9116897583007812, - "logps/chosen": -347.529296875, - "logps/rejected": -301.0887451171875, - "loss": 0.0907, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.0278264284133911, - "rewards/margins": 6.519707679748535, - "rewards/rejected": -5.491880893707275, + "epoch": 1.7, + "learning_rate": 2.4001507159005275e-07, + "logits/chosen": -2.3836681842803955, + "logits/rejected": -2.337185859680176, + "logps/chosen": -279.23419189453125, + "logps/rejected": -305.5158996582031, + "loss": 0.0836, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.38139820098876953, + "rewards/margins": 5.845247268676758, + "rewards/rejected": -5.463849067687988, "step": 3350 }, { - "epoch": 1.64, - "learning_rate": 2.51628664495114e-07, - "logits/chosen": -2.887197494506836, - "logits/rejected": -2.9221837520599365, - "logps/chosen": -294.3488464355469, - "logps/rejected": -345.73046875, - "loss": 0.0985, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.7108898162841797, - "rewards/margins": 7.802008628845215, - "rewards/rejected": -6.091117858886719, + "epoch": 1.71, + "learning_rate": 2.3907309721175585e-07, + "logits/chosen": -2.3031442165374756, + "logits/rejected": -2.3276925086975098, + "logps/chosen": -249.84371948242188, + "logps/rejected": -288.9521179199219, + "loss": 0.0464, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.8048938512802124, + "rewards/margins": 6.748453617095947, + "rewards/rejected": -5.943559169769287, "step": 3360 }, { - "epoch": 1.65, - "learning_rate": 2.5072385088671734e-07, - "logits/chosen": -2.8824501037597656, - "logits/rejected": -2.8959083557128906, - "logps/chosen": -277.9067077636719, - "logps/rejected": -320.12103271484375, - "loss": 0.0586, + "epoch": 1.71, + "learning_rate": 2.3813112283345892e-07, + "logits/chosen": -2.280756711959839, + "logits/rejected": -2.305971622467041, + "logps/chosen": -272.00714111328125, + "logps/rejected": -331.1332702636719, + "loss": 0.065, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.8824982643127441, - "rewards/margins": 6.805181980133057, - "rewards/rejected": -5.922682762145996, + "rewards/chosen": 0.5015135407447815, + "rewards/margins": 6.879171848297119, + "rewards/rejected": -6.377658367156982, "step": 3370 }, { - "epoch": 1.65, - "learning_rate": 2.498190372783207e-07, - "logits/chosen": -2.806278705596924, - "logits/rejected": -2.8165547847747803, - "logps/chosen": -292.66705322265625, - "logps/rejected": -267.31158447265625, - "loss": 0.0935, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.2365620136260986, - "rewards/margins": 6.15826940536499, - "rewards/rejected": -4.921708106994629, + "epoch": 1.72, + "learning_rate": 2.37189148455162e-07, + "logits/chosen": -2.3661789894104004, + "logits/rejected": -2.3738746643066406, + "logps/chosen": -302.61920166015625, + "logps/rejected": -334.0452575683594, + "loss": 0.1174, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5483524203300476, + "rewards/margins": 6.481667995452881, + "rewards/rejected": -5.933314800262451, "step": 3380 }, { - "epoch": 1.66, - "learning_rate": 2.4891422366992396e-07, - "logits/chosen": -2.9510490894317627, - "logits/rejected": -2.9049758911132812, - "logps/chosen": -325.75567626953125, - "logps/rejected": -291.4295959472656, - "loss": 0.0788, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9036425352096558, - "rewards/margins": 6.364354133605957, - "rewards/rejected": -5.460711479187012, + "epoch": 1.72, + "learning_rate": 2.362471740768651e-07, + "logits/chosen": -2.316767454147339, + "logits/rejected": -2.392373561859131, + "logps/chosen": -311.609130859375, + "logps/rejected": -357.13775634765625, + "loss": 0.0771, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3898351192474365, + "rewards/margins": 7.740750789642334, + "rewards/rejected": -6.350916385650635, "step": 3390 }, { - "epoch": 1.66, - "learning_rate": 2.480094100615273e-07, - "logits/chosen": -2.8406660556793213, - "logits/rejected": -2.8309485912323, - "logps/chosen": -315.2688293457031, - "logps/rejected": -325.52203369140625, - "loss": 0.0706, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.563887357711792, - "rewards/margins": 6.5746331214904785, - "rewards/rejected": -5.010745048522949, + "epoch": 1.73, + "learning_rate": 2.353051996985682e-07, + "logits/chosen": -2.132401943206787, + "logits/rejected": -2.1744465827941895, + "logps/chosen": -254.2714080810547, + "logps/rejected": -341.75396728515625, + "loss": 0.1082, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.30060726404190063, + "rewards/margins": 6.229058265686035, + "rewards/rejected": -6.529665946960449, "step": 3400 }, { - "epoch": 1.66, - "eval_logits/chosen": -2.876291275024414, - "eval_logits/rejected": -2.864589214324951, - "eval_logps/chosen": -319.48651123046875, - "eval_logps/rejected": -302.1758728027344, - "eval_loss": 0.5405582785606384, - "eval_rewards/accuracies": 0.7702205777168274, - "eval_rewards/chosen": -0.46254563331604004, - "eval_rewards/margins": 2.801729202270508, - "eval_rewards/rejected": -3.264274835586548, - "eval_runtime": 303.5871, - "eval_samples_per_second": 7.154, - "eval_steps_per_second": 0.448, + "epoch": 1.73, + "eval_logits/chosen": -2.4441940784454346, + "eval_logits/rejected": -2.5178475379943848, + "eval_logps/chosen": -296.3059387207031, + "eval_logps/rejected": -296.1109313964844, + "eval_loss": 0.589521586894989, + "eval_rewards/accuracies": 0.7423664331436157, + "eval_rewards/chosen": -1.805547833442688, + "eval_rewards/margins": 1.9920285940170288, + "eval_rewards/rejected": -3.7975761890411377, + "eval_runtime": 297.1424, + "eval_samples_per_second": 7.02, + "eval_steps_per_second": 0.441, "step": 3400 }, { - "epoch": 1.67, - "learning_rate": 2.4710459645313063e-07, - "logits/chosen": -2.934293270111084, - "logits/rejected": -2.9486098289489746, - "logps/chosen": -307.47821044921875, - "logps/rejected": -341.99346923828125, - "loss": 0.1007, + "epoch": 1.73, + "learning_rate": 2.3436322532027127e-07, + "logits/chosen": -2.3531718254089355, + "logits/rejected": -2.444936752319336, + "logps/chosen": -279.48956298828125, + "logps/rejected": -336.4264831542969, + "loss": 0.0955, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.5342551469802856, - "rewards/margins": 6.987419128417969, - "rewards/rejected": -5.453164577484131, + "rewards/chosen": 1.3256587982177734, + "rewards/margins": 7.036476135253906, + "rewards/rejected": -5.710817337036133, "step": 3410 }, { - "epoch": 1.67, - "learning_rate": 2.4619978284473397e-07, - "logits/chosen": -2.950535297393799, - "logits/rejected": -2.9616503715515137, - "logps/chosen": -346.59637451171875, - "logps/rejected": -303.1344299316406, - "loss": 0.1916, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.2854617834091187, - "rewards/margins": 6.439335823059082, - "rewards/rejected": -5.153873920440674, + "epoch": 1.74, + "learning_rate": 2.3342125094197437e-07, + "logits/chosen": -2.337709665298462, + "logits/rejected": -2.3244237899780273, + "logps/chosen": -279.7310485839844, + "logps/rejected": -314.3177490234375, + "loss": 0.085, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8951374292373657, + "rewards/margins": 6.653724670410156, + "rewards/rejected": -5.758587837219238, "step": 3420 }, { - "epoch": 1.68, - "learning_rate": 2.452949692363373e-07, - "logits/chosen": -2.9781761169433594, - "logits/rejected": -2.9379866123199463, - "logps/chosen": -309.08233642578125, - "logps/rejected": -308.34918212890625, - "loss": 0.0915, + "epoch": 1.74, + "learning_rate": 2.3247927656367747e-07, + "logits/chosen": -2.2383086681365967, + "logits/rejected": -2.250387668609619, + "logps/chosen": -247.69833374023438, + "logps/rejected": -316.20166015625, + "loss": 0.1076, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9941552877426147, - "rewards/margins": 6.908686637878418, - "rewards/rejected": -5.914531707763672, + "rewards/chosen": 0.32292360067367554, + "rewards/margins": 7.693238735198975, + "rewards/rejected": -7.370314598083496, "step": 3430 }, { - "epoch": 1.68, - "learning_rate": 2.4439015562794064e-07, - "logits/chosen": -2.909832715988159, - "logits/rejected": -2.943786144256592, - "logps/chosen": -317.84185791015625, - "logps/rejected": -319.4661560058594, - "loss": 0.1819, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9553133845329285, - "rewards/margins": 6.973750114440918, - "rewards/rejected": -6.018437385559082, + "epoch": 1.75, + "learning_rate": 2.3153730218538055e-07, + "logits/chosen": -2.2730133533477783, + "logits/rejected": -2.4274871349334717, + "logps/chosen": -250.9065399169922, + "logps/rejected": -288.06298828125, + "loss": 0.0637, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.45260730385780334, + "rewards/margins": 6.174647331237793, + "rewards/rejected": -5.722040176391602, "step": 3440 }, { - "epoch": 1.69, - "learning_rate": 2.4348534201954397e-07, - "logits/chosen": -2.841625928878784, - "logits/rejected": -2.897307872772217, - "logps/chosen": -276.1376037597656, - "logps/rejected": -350.06884765625, - "loss": 0.0993, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.0146503448486328, - "rewards/margins": 7.049843788146973, - "rewards/rejected": -6.035193920135498, + "epoch": 1.75, + "learning_rate": 2.3059532780708362e-07, + "logits/chosen": -2.339019298553467, + "logits/rejected": -2.3908653259277344, + "logps/chosen": -242.775146484375, + "logps/rejected": -296.8477478027344, + "loss": 0.1049, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3753826320171356, + "rewards/margins": 5.82875919342041, + "rewards/rejected": -5.453376293182373, "step": 3450 }, { - "epoch": 1.69, - "learning_rate": 2.425805284111473e-07, - "logits/chosen": -2.921602487564087, - "logits/rejected": -2.9265694618225098, - "logps/chosen": -354.541015625, - "logps/rejected": -311.3784484863281, - "loss": 0.1099, + "epoch": 1.76, + "learning_rate": 2.2965335342878672e-07, + "logits/chosen": -2.258469820022583, + "logits/rejected": -2.311418056488037, + "logps/chosen": -268.7459411621094, + "logps/rejected": -304.2618713378906, + "loss": 0.0832, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.9630530476570129, - "rewards/margins": 6.6176018714904785, - "rewards/rejected": -5.654547691345215, + "rewards/chosen": 0.8613001108169556, + "rewards/margins": 6.3985466957092285, + "rewards/rejected": -5.537246227264404, "step": 3460 }, { - "epoch": 1.7, - "learning_rate": 2.4167571480275064e-07, - "logits/chosen": -2.867741107940674, - "logits/rejected": -2.8722500801086426, - "logps/chosen": -359.26568603515625, - "logps/rejected": -374.5494689941406, - "loss": 0.1454, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 1.2132556438446045, - "rewards/margins": 5.878228664398193, - "rewards/rejected": -4.66497278213501, + "epoch": 1.77, + "learning_rate": 2.2871137905048982e-07, + "logits/chosen": -2.3698153495788574, + "logits/rejected": -2.3288581371307373, + "logps/chosen": -242.6855926513672, + "logps/rejected": -338.298583984375, + "loss": 0.0687, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5517158508300781, + "rewards/margins": 7.039074897766113, + "rewards/rejected": -6.487359046936035, "step": 3470 }, { - "epoch": 1.7, - "learning_rate": 2.407709011943539e-07, - "logits/chosen": -2.900252103805542, - "logits/rejected": -2.926717758178711, - "logps/chosen": -307.59759521484375, - "logps/rejected": -356.13287353515625, - "loss": 0.0932, + "epoch": 1.77, + "learning_rate": 2.2776940467219292e-07, + "logits/chosen": -2.382579803466797, + "logits/rejected": -2.2819111347198486, + "logps/chosen": -254.0598907470703, + "logps/rejected": -349.50286865234375, + "loss": 0.0918, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2558295726776123, - "rewards/margins": 7.0370917320251465, - "rewards/rejected": -5.781262397766113, + "rewards/chosen": 0.44244199991226196, + "rewards/margins": 6.988653659820557, + "rewards/rejected": -6.546212196350098, "step": 3480 }, { - "epoch": 1.7, - "learning_rate": 2.3986608758595726e-07, - "logits/chosen": -2.9129486083984375, - "logits/rejected": -2.8953404426574707, - "logps/chosen": -289.01861572265625, - "logps/rejected": -334.9922180175781, - "loss": 0.0702, + "epoch": 1.78, + "learning_rate": 2.26827430293896e-07, + "logits/chosen": -2.319734811782837, + "logits/rejected": -2.2849202156066895, + "logps/chosen": -251.9346923828125, + "logps/rejected": -286.87060546875, + "loss": 0.0785, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2145061492919922, - "rewards/margins": 6.741036891937256, - "rewards/rejected": -5.526531219482422, + "rewards/chosen": -0.08254555612802505, + "rewards/margins": 5.771401882171631, + "rewards/rejected": -5.853947639465332, "step": 3490 }, { - "epoch": 1.71, - "learning_rate": 2.389612739775606e-07, - "logits/chosen": -2.8515162467956543, - "logits/rejected": -2.9067862033843994, - "logps/chosen": -342.7030029296875, - "logps/rejected": -332.34381103515625, - "loss": 0.115, + "epoch": 1.78, + "learning_rate": 2.258854559155991e-07, + "logits/chosen": -2.3515708446502686, + "logits/rejected": -2.3444418907165527, + "logps/chosen": -279.8646545410156, + "logps/rejected": -294.0340576171875, + "loss": 0.09, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5411795377731323, - "rewards/margins": 6.527838706970215, - "rewards/rejected": -5.986659526824951, + "rewards/chosen": 0.4335947036743164, + "rewards/margins": 6.248974800109863, + "rewards/rejected": -5.815380096435547, "step": 3500 }, { - "epoch": 1.71, - "eval_logits/chosen": -2.890667200088501, - "eval_logits/rejected": -2.8837928771972656, - "eval_logps/chosen": -322.91961669921875, - "eval_logps/rejected": -305.196044921875, - "eval_loss": 0.5673595666885376, - "eval_rewards/accuracies": 0.7738970518112183, - "eval_rewards/chosen": -0.8058595061302185, - "eval_rewards/margins": 2.760430097579956, - "eval_rewards/rejected": -3.5662899017333984, - "eval_runtime": 303.2921, - "eval_samples_per_second": 7.161, - "eval_steps_per_second": 0.448, + "epoch": 1.78, + "eval_logits/chosen": -2.4561455249786377, + "eval_logits/rejected": -2.5260605812072754, + "eval_logps/chosen": -296.7054748535156, + "eval_logps/rejected": -298.36944580078125, + "eval_loss": 0.6231197714805603, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -1.84550142288208, + "eval_rewards/margins": 2.177931547164917, + "eval_rewards/rejected": -4.023432731628418, + "eval_runtime": 301.7345, + "eval_samples_per_second": 6.913, + "eval_steps_per_second": 0.434, "step": 3500 }, { - "epoch": 1.71, - "learning_rate": 2.3805646036916393e-07, - "logits/chosen": -2.8759562969207764, - "logits/rejected": -2.8387200832366943, - "logps/chosen": -382.8741455078125, - "logps/rejected": -356.2127990722656, - "loss": 0.1186, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 1.0191415548324585, - "rewards/margins": 6.327065467834473, - "rewards/rejected": -5.307923316955566, + "epoch": 1.79, + "learning_rate": 2.2494348153730217e-07, + "logits/chosen": -2.338193416595459, + "logits/rejected": -2.479274034500122, + "logps/chosen": -269.9765930175781, + "logps/rejected": -317.2516174316406, + "loss": 0.084, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.20136389136314392, + "rewards/margins": 6.459589958190918, + "rewards/rejected": -6.25822639465332, "step": 3510 }, { - "epoch": 1.72, - "learning_rate": 2.3715164676076727e-07, - "logits/chosen": -2.8882782459259033, - "logits/rejected": -2.9117918014526367, - "logps/chosen": -346.82159423828125, - "logps/rejected": -314.1947326660156, - "loss": 0.1037, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.1670362949371338, - "rewards/margins": 6.593735694885254, - "rewards/rejected": -5.426699638366699, + "epoch": 1.79, + "learning_rate": 2.2400150715900527e-07, + "logits/chosen": -2.3768019676208496, + "logits/rejected": -2.3087477684020996, + "logps/chosen": -295.57440185546875, + "logps/rejected": -335.53912353515625, + "loss": 0.1071, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3411738872528076, + "rewards/margins": 6.7873358726501465, + "rewards/rejected": -6.44616174697876, "step": 3520 }, { - "epoch": 1.72, - "learning_rate": 2.362468331523706e-07, - "logits/chosen": -2.9359171390533447, - "logits/rejected": -2.929292678833008, - "logps/chosen": -291.4672546386719, - "logps/rejected": -289.69378662109375, - "loss": 0.1114, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.8928520083427429, - "rewards/margins": 5.889705657958984, - "rewards/rejected": -4.996853828430176, + "epoch": 1.8, + "learning_rate": 2.2305953278070835e-07, + "logits/chosen": -2.3922080993652344, + "logits/rejected": -2.4363255500793457, + "logps/chosen": -294.6719055175781, + "logps/rejected": -299.4778137207031, + "loss": 0.107, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5923565626144409, + "rewards/margins": 6.195247650146484, + "rewards/rejected": -5.602891445159912, "step": 3530 }, { - "epoch": 1.73, - "learning_rate": 2.3534201954397394e-07, - "logits/chosen": -2.913856029510498, - "logits/rejected": -2.936469554901123, - "logps/chosen": -261.1668701171875, - "logps/rejected": -285.6080627441406, - "loss": 0.0972, + "epoch": 1.8, + "learning_rate": 2.2211755840241145e-07, + "logits/chosen": -2.4313762187957764, + "logits/rejected": -2.4436609745025635, + "logps/chosen": -240.0015411376953, + "logps/rejected": -298.2782287597656, + "loss": 0.1136, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.7923986315727234, - "rewards/margins": 6.449975490570068, - "rewards/rejected": -5.6575775146484375, + "rewards/chosen": 0.5145145654678345, + "rewards/margins": 6.5318708419799805, + "rewards/rejected": -6.017355918884277, "step": 3540 }, { - "epoch": 1.73, - "learning_rate": 2.3443720593557724e-07, - "logits/chosen": -2.886479616165161, - "logits/rejected": -2.876434564590454, - "logps/chosen": -307.9219665527344, - "logps/rejected": -356.7371826171875, - "loss": 0.0987, + "epoch": 1.81, + "learning_rate": 2.2117558402411455e-07, + "logits/chosen": -2.2902588844299316, + "logits/rejected": -2.2561826705932617, + "logps/chosen": -231.10086059570312, + "logps/rejected": -242.2857666015625, + "loss": 0.0971, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.05591990798711777, - "rewards/margins": 6.301457405090332, - "rewards/rejected": -6.245537757873535, + "rewards/chosen": 0.47640013694763184, + "rewards/margins": 5.48819637298584, + "rewards/rejected": -5.011795997619629, "step": 3550 }, { - "epoch": 1.74, - "learning_rate": 2.3353239232718058e-07, - "logits/chosen": -2.8340017795562744, - "logits/rejected": -2.821384906768799, - "logps/chosen": -317.1324768066406, - "logps/rejected": -351.94512939453125, - "loss": 0.1251, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.387749433517456, - "rewards/margins": 7.482069492340088, - "rewards/rejected": -6.094319820404053, + "epoch": 1.81, + "learning_rate": 2.2023360964581765e-07, + "logits/chosen": -2.2816267013549805, + "logits/rejected": -2.3149495124816895, + "logps/chosen": -265.31292724609375, + "logps/rejected": -365.41943359375, + "loss": 0.096, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.2561672925949097, + "rewards/margins": 7.3695244789123535, + "rewards/rejected": -6.113357067108154, "step": 3560 }, { - "epoch": 1.74, - "learning_rate": 2.3262757871878392e-07, - "logits/chosen": -2.9561607837677, - "logits/rejected": -2.9507992267608643, - "logps/chosen": -255.468994140625, - "logps/rejected": -294.02557373046875, - "loss": 0.1198, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.48449230194091797, - "rewards/margins": 6.382293701171875, - "rewards/rejected": -5.897801876068115, + "epoch": 1.82, + "learning_rate": 2.192916352675207e-07, + "logits/chosen": -2.2925052642822266, + "logits/rejected": -2.317241668701172, + "logps/chosen": -256.3150634765625, + "logps/rejected": -273.255126953125, + "loss": 0.0838, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.44810542464256287, + "rewards/margins": 5.637542724609375, + "rewards/rejected": -5.189436912536621, "step": 3570 }, { - "epoch": 1.75, - "learning_rate": 2.3172276511038725e-07, - "logits/chosen": -2.9182674884796143, - "logits/rejected": -2.8960814476013184, - "logps/chosen": -296.08062744140625, - "logps/rejected": -311.75152587890625, - "loss": 0.1059, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5791698098182678, - "rewards/margins": 6.129942893981934, - "rewards/rejected": -5.550773620605469, + "epoch": 1.82, + "learning_rate": 2.183496608892238e-07, + "logits/chosen": -2.2493538856506348, + "logits/rejected": -2.228045701980591, + "logps/chosen": -290.98834228515625, + "logps/rejected": -318.1434326171875, + "loss": 0.1392, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.17248782515525818, + "rewards/margins": 5.7656755447387695, + "rewards/rejected": -5.5931878089904785, "step": 3580 }, { - "epoch": 1.75, - "learning_rate": 2.3081795150199056e-07, - "logits/chosen": -2.915517807006836, - "logits/rejected": -2.9154298305511475, - "logps/chosen": -300.8294982910156, - "logps/rejected": -354.2265625, - "loss": 0.0614, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.6250432133674622, - "rewards/margins": 6.578111171722412, - "rewards/rejected": -5.953067779541016, + "epoch": 1.83, + "learning_rate": 2.174076865109269e-07, + "logits/chosen": -2.4821906089782715, + "logits/rejected": -2.487905979156494, + "logps/chosen": -254.1220245361328, + "logps/rejected": -355.9189453125, + "loss": 0.0894, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6711649298667908, + "rewards/margins": 6.761671543121338, + "rewards/rejected": -6.090506553649902, "step": 3590 }, { - "epoch": 1.76, - "learning_rate": 2.299131378935939e-07, - "logits/chosen": -2.9170968532562256, - "logits/rejected": -2.906830310821533, - "logps/chosen": -256.9389343261719, - "logps/rejected": -342.1560974121094, - "loss": 0.1311, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.7688789367675781, - "rewards/margins": 6.357014179229736, - "rewards/rejected": -5.588134288787842, + "epoch": 1.83, + "learning_rate": 2.1646571213263e-07, + "logits/chosen": -2.421340227127075, + "logits/rejected": -2.477186679840088, + "logps/chosen": -300.8204345703125, + "logps/rejected": -312.5556335449219, + "loss": 0.1238, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7146443128585815, + "rewards/margins": 6.508008003234863, + "rewards/rejected": -5.793363571166992, "step": 3600 }, { - "epoch": 1.76, - "eval_logits/chosen": -2.9536280632019043, - "eval_logits/rejected": -2.9513378143310547, - "eval_logps/chosen": -323.1350402832031, - "eval_logps/rejected": -306.6445007324219, - "eval_loss": 0.5627171397209167, - "eval_rewards/accuracies": 0.7720588445663452, - "eval_rewards/chosen": -0.8274054527282715, - "eval_rewards/margins": 2.88373064994812, - "eval_rewards/rejected": -3.7111363410949707, - "eval_runtime": 303.5059, - "eval_samples_per_second": 7.156, - "eval_steps_per_second": 0.448, + "epoch": 1.83, + "eval_logits/chosen": -2.551158905029297, + "eval_logits/rejected": -2.6294496059417725, + "eval_logps/chosen": -295.0212707519531, + "eval_logps/rejected": -294.1321105957031, + "eval_loss": 0.6046690940856934, + "eval_rewards/accuracies": 0.7423664331436157, + "eval_rewards/chosen": -1.677080750465393, + "eval_rewards/margins": 1.922616720199585, + "eval_rewards/rejected": -3.5996978282928467, + "eval_runtime": 296.8842, + "eval_samples_per_second": 7.026, + "eval_steps_per_second": 0.441, "step": 3600 }, { - "epoch": 1.76, - "learning_rate": 2.2900832428519723e-07, - "logits/chosen": -2.930382490158081, - "logits/rejected": -2.953674793243408, - "logps/chosen": -339.317138671875, - "logps/rejected": -411.460693359375, - "loss": 0.1345, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.1304901838302612, - "rewards/margins": 7.993877410888672, - "rewards/rejected": -6.863387107849121, + "epoch": 1.84, + "learning_rate": 2.1552373775433307e-07, + "logits/chosen": -2.4084277153015137, + "logits/rejected": -2.4354751110076904, + "logps/chosen": -290.44696044921875, + "logps/rejected": -286.05389404296875, + "loss": 0.0909, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.7692252993583679, + "rewards/margins": 6.157434940338135, + "rewards/rejected": -5.388209342956543, "step": 3610 }, { - "epoch": 1.77, - "learning_rate": 2.2810351067680057e-07, - "logits/chosen": -2.8837013244628906, - "logits/rejected": -2.838395357131958, - "logps/chosen": -323.78607177734375, - "logps/rejected": -352.07623291015625, - "loss": 0.1541, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.293809711933136, - "rewards/margins": 5.587378978729248, - "rewards/rejected": -5.293569087982178, + "epoch": 1.84, + "learning_rate": 2.1458176337603617e-07, + "logits/chosen": -2.442023992538452, + "logits/rejected": -2.503213882446289, + "logps/chosen": -295.61212158203125, + "logps/rejected": -324.33453369140625, + "loss": 0.1087, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.38564035296440125, + "rewards/margins": 6.0786871910095215, + "rewards/rejected": -5.693046569824219, "step": 3620 }, { - "epoch": 1.77, - "learning_rate": 2.271986970684039e-07, - "logits/chosen": -2.9727985858917236, - "logits/rejected": -2.9664175510406494, - "logps/chosen": -283.4669189453125, - "logps/rejected": -328.41571044921875, - "loss": 0.0926, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.6970881819725037, - "rewards/margins": 6.231173515319824, - "rewards/rejected": -5.534085273742676, + "epoch": 1.85, + "learning_rate": 2.1363978899773924e-07, + "logits/chosen": -2.4094722270965576, + "logits/rejected": -2.527698040008545, + "logps/chosen": -244.3059844970703, + "logps/rejected": -309.1972961425781, + "loss": 0.1041, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.954838752746582, + "rewards/margins": 6.438208103179932, + "rewards/rejected": -5.483368873596191, "step": 3630 }, { - "epoch": 1.78, - "learning_rate": 2.262938834600072e-07, - "logits/chosen": -2.9621691703796387, - "logits/rejected": -2.9963552951812744, - "logps/chosen": -324.6197204589844, - "logps/rejected": -359.4258117675781, - "loss": 0.0759, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.0718951225280762, - "rewards/margins": 7.1518235206604, - "rewards/rejected": -6.079928398132324, + "epoch": 1.85, + "learning_rate": 2.1269781461944234e-07, + "logits/chosen": -2.5033748149871826, + "logits/rejected": -2.5151901245117188, + "logps/chosen": -286.1416931152344, + "logps/rejected": -314.78759765625, + "loss": 0.0851, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8734874725341797, + "rewards/margins": 6.574765205383301, + "rewards/rejected": -5.701278209686279, "step": 3640 }, { - "epoch": 1.78, - "learning_rate": 2.2538906985161054e-07, - "logits/chosen": -2.890007495880127, - "logits/rejected": -2.920297145843506, - "logps/chosen": -342.0650939941406, - "logps/rejected": -316.7069091796875, - "loss": 0.066, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.8388646841049194, - "rewards/margins": 6.7674360275268555, - "rewards/rejected": -5.928570747375488, + "epoch": 1.86, + "learning_rate": 2.1175584024114542e-07, + "logits/chosen": -2.485938310623169, + "logits/rejected": -2.4653992652893066, + "logps/chosen": -336.11871337890625, + "logps/rejected": -345.1873474121094, + "loss": 0.0875, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.446183443069458, + "rewards/margins": 7.834715843200684, + "rewards/rejected": -6.388533115386963, "step": 3650 }, { - "epoch": 1.79, - "learning_rate": 2.2448425624321388e-07, - "logits/chosen": -2.90877366065979, - "logits/rejected": -2.8847005367279053, - "logps/chosen": -271.696044921875, - "logps/rejected": -303.04412841796875, - "loss": 0.0737, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9462534189224243, - "rewards/margins": 6.50563907623291, - "rewards/rejected": -5.559384822845459, + "epoch": 1.86, + "learning_rate": 2.1081386586284852e-07, + "logits/chosen": -2.4055051803588867, + "logits/rejected": -2.4212534427642822, + "logps/chosen": -295.2558288574219, + "logps/rejected": -342.17047119140625, + "loss": 0.0596, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8791106939315796, + "rewards/margins": 6.56097412109375, + "rewards/rejected": -5.681862831115723, "step": 3660 }, { - "epoch": 1.79, - "learning_rate": 2.2357944263481722e-07, - "logits/chosen": -2.895263671875, - "logits/rejected": -2.8995864391326904, - "logps/chosen": -308.7177734375, - "logps/rejected": -338.81414794921875, - "loss": 0.1374, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.7501927614212036, - "rewards/margins": 6.151031494140625, - "rewards/rejected": -5.400838851928711, + "epoch": 1.87, + "learning_rate": 2.0987189148455162e-07, + "logits/chosen": -2.468773365020752, + "logits/rejected": -2.427738904953003, + "logps/chosen": -285.6302795410156, + "logps/rejected": -299.8651428222656, + "loss": 0.0816, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.422785222530365, + "rewards/margins": 6.147997856140137, + "rewards/rejected": -5.725213050842285, "step": 3670 }, { - "epoch": 1.8, - "learning_rate": 2.2267462902642052e-07, - "logits/chosen": -2.9261951446533203, - "logits/rejected": -2.924281358718872, - "logps/chosen": -345.0509948730469, - "logps/rejected": -309.61090087890625, - "loss": 0.1251, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.9833319783210754, - "rewards/margins": 6.599654197692871, - "rewards/rejected": -5.6163225173950195, + "epoch": 1.87, + "learning_rate": 2.0892991710625472e-07, + "logits/chosen": -2.520200490951538, + "logits/rejected": -2.3786044120788574, + "logps/chosen": -254.3336944580078, + "logps/rejected": -342.19378662109375, + "loss": 0.0871, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.026506716385483742, + "rewards/margins": 6.259054660797119, + "rewards/rejected": -6.285561561584473, "step": 3680 }, { - "epoch": 1.8, - "learning_rate": 2.2176981541802386e-07, - "logits/chosen": -3.0083887577056885, - "logits/rejected": -2.9783711433410645, - "logps/chosen": -305.8582458496094, - "logps/rejected": -253.05801391601562, - "loss": 0.16, + "epoch": 1.88, + "learning_rate": 2.0798794272795777e-07, + "logits/chosen": -2.5233798027038574, + "logits/rejected": -2.5141570568084717, + "logps/chosen": -269.6754150390625, + "logps/rejected": -339.6807556152344, + "loss": 0.1134, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.9826359748840332, - "rewards/margins": 6.130540370941162, - "rewards/rejected": -5.147904872894287, + "rewards/chosen": 0.6938865780830383, + "rewards/margins": 6.421724796295166, + "rewards/rejected": -5.72783899307251, "step": 3690 }, { - "epoch": 1.81, - "learning_rate": 2.208650018096272e-07, - "logits/chosen": -2.9770560264587402, - "logits/rejected": -2.9949138164520264, - "logps/chosen": -294.5087890625, - "logps/rejected": -293.2074279785156, - "loss": 0.1318, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.659691333770752, - "rewards/margins": 7.31722354888916, - "rewards/rejected": -6.657530784606934, + "epoch": 1.88, + "learning_rate": 2.0704596834966087e-07, + "logits/chosen": -2.522254705429077, + "logits/rejected": -2.3955483436584473, + "logps/chosen": -284.70574951171875, + "logps/rejected": -297.5660400390625, + "loss": 0.0847, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5291184186935425, + "rewards/margins": 6.2721452713012695, + "rewards/rejected": -5.743027210235596, "step": 3700 }, { - "epoch": 1.81, - "eval_logits/chosen": -2.9519388675689697, - "eval_logits/rejected": -2.9495904445648193, - "eval_logps/chosen": -323.081787109375, - "eval_logps/rejected": -306.67999267578125, - "eval_loss": 0.5680773258209229, - "eval_rewards/accuracies": 0.7757353186607361, - "eval_rewards/chosen": -0.8220744132995605, - "eval_rewards/margins": 2.8926098346710205, - "eval_rewards/rejected": -3.714684009552002, - "eval_runtime": 303.7569, - "eval_samples_per_second": 7.15, - "eval_steps_per_second": 0.448, + "epoch": 1.88, + "eval_logits/chosen": -2.547114849090576, + "eval_logits/rejected": -2.6224122047424316, + "eval_logps/chosen": -294.97576904296875, + "eval_logps/rejected": -293.8779296875, + "eval_loss": 0.5898069143295288, + "eval_rewards/accuracies": 0.7347328066825867, + "eval_rewards/chosen": -1.672528862953186, + "eval_rewards/margins": 1.9017502069473267, + "eval_rewards/rejected": -3.5742790699005127, + "eval_runtime": 301.5335, + "eval_samples_per_second": 6.918, + "eval_steps_per_second": 0.434, "step": 3700 }, { - "epoch": 1.81, - "learning_rate": 2.1996018820123053e-07, - "logits/chosen": -2.9349725246429443, - "logits/rejected": -2.951141357421875, - "logps/chosen": -302.53204345703125, - "logps/rejected": -315.42108154296875, - "loss": 0.115, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.0496349334716797, - "rewards/margins": 6.860198020935059, - "rewards/rejected": -5.810563564300537, + "epoch": 1.89, + "learning_rate": 2.0610399397136397e-07, + "logits/chosen": -2.357499361038208, + "logits/rejected": -2.3887948989868164, + "logps/chosen": -238.7329559326172, + "logps/rejected": -300.47772216796875, + "loss": 0.2105, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.1059715747833252, + "rewards/margins": 7.015279293060303, + "rewards/rejected": -5.909307479858398, "step": 3710 }, { - "epoch": 1.82, - "learning_rate": 2.1905537459283387e-07, - "logits/chosen": -2.9476025104522705, - "logits/rejected": -2.967245101928711, - "logps/chosen": -326.92425537109375, - "logps/rejected": -334.18585205078125, - "loss": 0.0859, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.826195478439331, - "rewards/margins": 7.695529937744141, - "rewards/rejected": -5.8693342208862305, + "epoch": 1.89, + "learning_rate": 2.0516201959306707e-07, + "logits/chosen": -2.3938887119293213, + "logits/rejected": -2.5061392784118652, + "logps/chosen": -270.1607666015625, + "logps/rejected": -275.0926513671875, + "loss": 0.0896, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.56185382604599, + "rewards/margins": 5.6083197593688965, + "rewards/rejected": -5.046465873718262, "step": 3720 }, { - "epoch": 1.82, - "learning_rate": 2.181505609844372e-07, - "logits/chosen": -2.9699959754943848, - "logits/rejected": -2.9592862129211426, - "logps/chosen": -331.2188415527344, - "logps/rejected": -258.76361083984375, - "loss": 0.1024, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.5807109475135803, - "rewards/margins": 6.25778341293335, - "rewards/rejected": -5.677073001861572, + "epoch": 1.9, + "learning_rate": 2.0422004521477014e-07, + "logits/chosen": -2.4314119815826416, + "logits/rejected": -2.4555587768554688, + "logps/chosen": -279.8658142089844, + "logps/rejected": -353.25482177734375, + "loss": 0.0931, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4166037440299988, + "rewards/margins": 7.007256507873535, + "rewards/rejected": -6.590653419494629, "step": 3730 }, { - "epoch": 1.83, - "learning_rate": 2.1724574737604054e-07, - "logits/chosen": -2.9005179405212402, - "logits/rejected": -2.8999667167663574, - "logps/chosen": -308.59844970703125, - "logps/rejected": -393.07318115234375, - "loss": 0.0972, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0348241329193115, - "rewards/margins": 7.786675930023193, - "rewards/rejected": -6.751851558685303, - "step": 3740 + "epoch": 1.9, + "learning_rate": 2.0327807083647324e-07, + "logits/chosen": -2.5305721759796143, + "logits/rejected": -2.468280792236328, + "logps/chosen": -278.43310546875, + "logps/rejected": -296.37237548828125, + "loss": 0.12, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6041958332061768, + "rewards/margins": 5.6044135093688965, + "rewards/rejected": -5.000216960906982, + "step": 3740 }, { - "epoch": 1.83, - "learning_rate": 2.1634093376764387e-07, - "logits/chosen": -2.9890828132629395, - "logits/rejected": -2.9888575077056885, - "logps/chosen": -366.74017333984375, - "logps/rejected": -324.6880187988281, - "loss": 0.0947, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.2270302772521973, - "rewards/margins": 7.239813804626465, - "rewards/rejected": -6.012783050537109, + "epoch": 1.91, + "learning_rate": 2.0233609645817634e-07, + "logits/chosen": -2.36225962638855, + "logits/rejected": -2.416337490081787, + "logps/chosen": -305.2579650878906, + "logps/rejected": -331.9045104980469, + "loss": 0.1075, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.41681164503097534, + "rewards/margins": 6.1702985763549805, + "rewards/rejected": -5.7534871101379395, "step": 3750 }, { - "epoch": 1.84, - "learning_rate": 2.154361201592472e-07, - "logits/chosen": -2.993152141571045, - "logits/rejected": -2.9691381454467773, - "logps/chosen": -341.57208251953125, - "logps/rejected": -314.56585693359375, - "loss": 0.0806, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3404189348220825, - "rewards/margins": 6.949911594390869, - "rewards/rejected": -5.609493255615234, + "epoch": 1.91, + "learning_rate": 2.0139412207987942e-07, + "logits/chosen": -2.4988961219787598, + "logits/rejected": -2.494710922241211, + "logps/chosen": -274.8974609375, + "logps/rejected": -294.8260192871094, + "loss": 0.0894, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.7716017961502075, + "rewards/margins": 6.274075508117676, + "rewards/rejected": -5.502473831176758, "step": 3760 }, { - "epoch": 1.84, - "learning_rate": 2.1453130655085052e-07, - "logits/chosen": -2.8522610664367676, - "logits/rejected": -2.8946385383605957, - "logps/chosen": -323.80181884765625, - "logps/rejected": -283.73065185546875, - "loss": 0.1047, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.2719176113605499, - "rewards/margins": 5.973901271820068, - "rewards/rejected": -5.701983451843262, + "epoch": 1.92, + "learning_rate": 2.004521477015825e-07, + "logits/chosen": -2.497544765472412, + "logits/rejected": -2.4463839530944824, + "logps/chosen": -240.38961791992188, + "logps/rejected": -274.1627502441406, + "loss": 0.0975, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.06918475031852722, + "rewards/margins": 5.60644006729126, + "rewards/rejected": -5.53725528717041, "step": 3770 }, { - "epoch": 1.85, - "learning_rate": 2.1362649294245385e-07, - "logits/chosen": -2.910778045654297, - "logits/rejected": -2.887686252593994, - "logps/chosen": -357.68865966796875, - "logps/rejected": -373.08697509765625, - "loss": 0.1218, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.6899020075798035, - "rewards/margins": 7.9256134033203125, - "rewards/rejected": -7.235711574554443, + "epoch": 1.92, + "learning_rate": 1.995101733232856e-07, + "logits/chosen": -2.553527355194092, + "logits/rejected": -2.420929431915283, + "logps/chosen": -272.9808349609375, + "logps/rejected": -331.07733154296875, + "loss": 0.0665, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6573607325553894, + "rewards/margins": 6.504881858825684, + "rewards/rejected": -5.8475213050842285, "step": 3780 }, { - "epoch": 1.85, - "learning_rate": 2.1272167933405719e-07, - "logits/chosen": -2.9140501022338867, - "logits/rejected": -2.9234848022460938, - "logps/chosen": -338.2618103027344, - "logps/rejected": -295.20379638671875, - "loss": 0.1514, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.7664359211921692, - "rewards/margins": 6.264840126037598, - "rewards/rejected": -5.498404502868652, + "epoch": 1.93, + "learning_rate": 1.985681989449887e-07, + "logits/chosen": -2.4012131690979004, + "logits/rejected": -2.5193140506744385, + "logps/chosen": -280.072998046875, + "logps/rejected": -303.76251220703125, + "loss": 0.1033, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.3876453936100006, + "rewards/margins": 5.856767177581787, + "rewards/rejected": -5.469121932983398, "step": 3790 }, { - "epoch": 1.86, - "learning_rate": 2.1181686572566052e-07, - "logits/chosen": -3.002788782119751, - "logits/rejected": -2.9906973838806152, - "logps/chosen": -355.0797119140625, - "logps/rejected": -359.6622619628906, - "loss": 0.0986, + "epoch": 1.93, + "learning_rate": 1.976262245666918e-07, + "logits/chosen": -2.353722095489502, + "logits/rejected": -2.453313112258911, + "logps/chosen": -280.9483642578125, + "logps/rejected": -288.4091796875, + "loss": 0.0908, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.9103447198867798, - "rewards/margins": 6.739536285400391, - "rewards/rejected": -5.8291916847229, + "rewards/chosen": 0.2342231273651123, + "rewards/margins": 5.581732749938965, + "rewards/rejected": -5.347509384155273, "step": 3800 }, { - "epoch": 1.86, - "eval_logits/chosen": -2.9521079063415527, - "eval_logits/rejected": -2.953190326690674, - "eval_logps/chosen": -324.5731506347656, - "eval_logps/rejected": -306.8705139160156, - "eval_loss": 0.5458743572235107, - "eval_rewards/accuracies": 0.7867646813392639, - "eval_rewards/chosen": -0.9712130427360535, - "eval_rewards/margins": 2.762523651123047, - "eval_rewards/rejected": -3.733736515045166, - "eval_runtime": 303.4268, - "eval_samples_per_second": 7.158, - "eval_steps_per_second": 0.448, + "epoch": 1.93, + "eval_logits/chosen": -2.5046539306640625, + "eval_logits/rejected": -2.577751636505127, + "eval_logps/chosen": -294.3269348144531, + "eval_logps/rejected": -293.51580810546875, + "eval_loss": 0.5816638469696045, + "eval_rewards/accuracies": 0.7366412281990051, + "eval_rewards/chosen": -1.6076483726501465, + "eval_rewards/margins": 1.9304182529449463, + "eval_rewards/rejected": -3.5380663871765137, + "eval_runtime": 296.9875, + "eval_samples_per_second": 7.024, + "eval_steps_per_second": 0.441, "step": 3800 }, { - "epoch": 1.86, - "learning_rate": 2.1091205211726386e-07, - "logits/chosen": -2.953983783721924, - "logits/rejected": -2.964948892593384, - "logps/chosen": -312.2245178222656, - "logps/rejected": -359.750732421875, - "loss": 0.0518, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9394378662109375, - "rewards/margins": 7.120744228363037, - "rewards/rejected": -6.1813063621521, + "epoch": 1.94, + "learning_rate": 1.9668425018839487e-07, + "logits/chosen": -2.355513572692871, + "logits/rejected": -2.2735846042633057, + "logps/chosen": -281.0897521972656, + "logps/rejected": -296.72900390625, + "loss": 0.0724, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6503603458404541, + "rewards/margins": 6.108792304992676, + "rewards/rejected": -5.458431720733643, "step": 3810 }, { - "epoch": 1.87, - "learning_rate": 2.1000723850886717e-07, - "logits/chosen": -2.942121744155884, - "logits/rejected": -2.99276065826416, - "logps/chosen": -330.1883544921875, - "logps/rejected": -335.9338073730469, - "loss": 0.131, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.174506425857544, - "rewards/margins": 7.370104789733887, - "rewards/rejected": -6.195598125457764, + "epoch": 1.94, + "learning_rate": 1.9574227581009794e-07, + "logits/chosen": -2.418468952178955, + "logits/rejected": -2.371502161026001, + "logps/chosen": -311.9105224609375, + "logps/rejected": -345.49322509765625, + "loss": 0.1222, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8610822558403015, + "rewards/margins": 6.491988182067871, + "rewards/rejected": -5.630906105041504, "step": 3820 }, { - "epoch": 1.87, - "learning_rate": 2.091024249004705e-07, - "logits/chosen": -2.9848029613494873, - "logits/rejected": -2.9782183170318604, - "logps/chosen": -299.5281677246094, - "logps/rejected": -269.7864990234375, - "loss": 0.1103, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.20068922638893127, - "rewards/margins": 5.587775707244873, - "rewards/rejected": -5.387087345123291, + "epoch": 1.95, + "learning_rate": 1.9480030143180104e-07, + "logits/chosen": -2.348468542098999, + "logits/rejected": -2.3723227977752686, + "logps/chosen": -286.27313232421875, + "logps/rejected": -353.67633056640625, + "loss": 0.0726, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.8641120791435242, + "rewards/margins": 7.633820533752441, + "rewards/rejected": -6.769708156585693, "step": 3830 }, { - "epoch": 1.88, - "learning_rate": 2.0819761129207384e-07, - "logits/chosen": -2.903519630432129, - "logits/rejected": -2.9105443954467773, - "logps/chosen": -275.0736083984375, - "logps/rejected": -291.53387451171875, - "loss": 0.1071, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.474869966506958, - "rewards/margins": 5.91092586517334, - "rewards/rejected": -5.436055660247803, + "epoch": 1.95, + "learning_rate": 1.9385832705350414e-07, + "logits/chosen": -2.211182117462158, + "logits/rejected": -2.160393238067627, + "logps/chosen": -223.9073028564453, + "logps/rejected": -285.9527893066406, + "loss": 0.0743, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.1326831430196762, + "rewards/margins": 5.998318672180176, + "rewards/rejected": -5.865635395050049, "step": 3840 }, { - "epoch": 1.88, - "learning_rate": 2.0729279768367717e-07, - "logits/chosen": -2.93672513961792, - "logits/rejected": -2.9664242267608643, - "logps/chosen": -323.4449157714844, - "logps/rejected": -381.16644287109375, - "loss": 0.1079, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 1.3401843309402466, - "rewards/margins": 7.721823215484619, - "rewards/rejected": -6.3816399574279785, + "epoch": 1.96, + "learning_rate": 1.9291635267520722e-07, + "logits/chosen": -2.421313524246216, + "logits/rejected": -2.4438533782958984, + "logps/chosen": -280.8946838378906, + "logps/rejected": -321.70123291015625, + "loss": 0.1117, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4331938624382019, + "rewards/margins": 6.441449165344238, + "rewards/rejected": -6.008255481719971, "step": 3850 }, { - "epoch": 1.89, - "learning_rate": 2.0638798407528048e-07, - "logits/chosen": -2.895134210586548, - "logits/rejected": -2.8745009899139404, - "logps/chosen": -345.90087890625, - "logps/rejected": -327.84259033203125, - "loss": 0.0946, + "epoch": 1.96, + "learning_rate": 1.9197437829691032e-07, + "logits/chosen": -2.4228568077087402, + "logits/rejected": -2.3654749393463135, + "logps/chosen": -312.3072814941406, + "logps/rejected": -329.73846435546875, + "loss": 0.0721, "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.3021621704101562, - "rewards/margins": 6.0390238761901855, - "rewards/rejected": -4.736861705780029, + "rewards/chosen": 0.6582003831863403, + "rewards/margins": 6.057830810546875, + "rewards/rejected": -5.399630546569824, "step": 3860 }, { - "epoch": 1.89, - "learning_rate": 2.0548317046688382e-07, - "logits/chosen": -2.981945753097534, - "logits/rejected": -2.967672824859619, - "logps/chosen": -323.56329345703125, - "logps/rejected": -358.1614685058594, - "loss": 0.0917, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.5952293872833252, - "rewards/margins": 7.667240142822266, - "rewards/rejected": -6.072011470794678, + "epoch": 1.97, + "learning_rate": 1.9103240391861342e-07, + "logits/chosen": -2.3000106811523438, + "logits/rejected": -2.3441321849823, + "logps/chosen": -257.6878967285156, + "logps/rejected": -272.4525451660156, + "loss": 0.0796, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.8951619863510132, + "rewards/margins": 6.290835380554199, + "rewards/rejected": -5.395674705505371, "step": 3870 }, { - "epoch": 1.9, - "learning_rate": 2.0457835685848715e-07, - "logits/chosen": -2.9356064796447754, - "logits/rejected": -2.9427523612976074, - "logps/chosen": -317.0868835449219, - "logps/rejected": -304.80706787109375, - "loss": 0.1372, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.0302358865737915, - "rewards/margins": 5.979710578918457, - "rewards/rejected": -4.949473857879639, + "epoch": 1.97, + "learning_rate": 1.900904295403165e-07, + "logits/chosen": -2.3968539237976074, + "logits/rejected": -2.3876609802246094, + "logps/chosen": -273.7939453125, + "logps/rejected": -331.85369873046875, + "loss": 0.0905, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0945357084274292, + "rewards/margins": 7.351518154144287, + "rewards/rejected": -6.256982803344727, "step": 3880 }, { - "epoch": 1.9, - "learning_rate": 2.0367354325009049e-07, - "logits/chosen": -2.840512752532959, - "logits/rejected": -2.8991363048553467, - "logps/chosen": -385.388916015625, - "logps/rejected": -357.7392578125, - "loss": 0.1071, + "epoch": 1.98, + "learning_rate": 1.8914845516201957e-07, + "logits/chosen": -2.3795459270477295, + "logits/rejected": -2.3961644172668457, + "logps/chosen": -221.3052215576172, + "logps/rejected": -369.16656494140625, + "loss": 0.0657, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.072593331336975, - "rewards/margins": 6.844846248626709, - "rewards/rejected": -5.772252559661865, + "rewards/chosen": 0.41156476736068726, + "rewards/margins": 6.591817378997803, + "rewards/rejected": -6.180253028869629, "step": 3890 }, { - "epoch": 1.91, - "learning_rate": 2.0276872964169382e-07, - "logits/chosen": -2.881774425506592, - "logits/rejected": -2.8937647342681885, - "logps/chosen": -234.49746704101562, - "logps/rejected": -327.7915954589844, - "loss": 0.1091, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.11194837093353271, - "rewards/margins": 6.389140605926514, - "rewards/rejected": -6.277192115783691, + "epoch": 1.98, + "learning_rate": 1.8820648078372267e-07, + "logits/chosen": -2.4188742637634277, + "logits/rejected": -2.39491605758667, + "logps/chosen": -258.48797607421875, + "logps/rejected": -302.8425598144531, + "loss": 0.0666, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.1906200349330902, + "rewards/margins": 6.002566337585449, + "rewards/rejected": -5.811945915222168, "step": 3900 }, { - "epoch": 1.91, - "eval_logits/chosen": -2.9082632064819336, - "eval_logits/rejected": -2.912630081176758, - "eval_logps/chosen": -322.1650695800781, - "eval_logps/rejected": -305.1989440917969, - "eval_loss": 0.5303793549537659, - "eval_rewards/accuracies": 0.7977941036224365, - "eval_rewards/chosen": -0.7304062843322754, - "eval_rewards/margins": 2.836174964904785, - "eval_rewards/rejected": -3.5665814876556396, - "eval_runtime": 303.6128, - "eval_samples_per_second": 7.154, - "eval_steps_per_second": 0.448, + "epoch": 1.98, + "eval_logits/chosen": -2.5061278343200684, + "eval_logits/rejected": -2.578442335128784, + "eval_logps/chosen": -295.20037841796875, + "eval_logps/rejected": -295.57177734375, + "eval_loss": 0.6063258647918701, + "eval_rewards/accuracies": 0.7309160232543945, + "eval_rewards/chosen": -1.6949888467788696, + "eval_rewards/margins": 2.0486738681793213, + "eval_rewards/rejected": -3.7436630725860596, + "eval_runtime": 301.8399, + "eval_samples_per_second": 6.911, + "eval_steps_per_second": 0.434, "step": 3900 }, { - "epoch": 1.91, - "learning_rate": 2.0186391603329713e-07, - "logits/chosen": -2.934346914291382, - "logits/rejected": -2.9375510215759277, - "logps/chosen": -289.3299865722656, - "logps/rejected": -258.72515869140625, - "loss": 0.1329, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.46928709745407104, - "rewards/margins": 5.787825107574463, - "rewards/rejected": -5.318537712097168, + "epoch": 1.99, + "learning_rate": 1.8726450640542577e-07, + "logits/chosen": -2.4050233364105225, + "logits/rejected": -2.385024309158325, + "logps/chosen": -294.2882385253906, + "logps/rejected": -355.4291687011719, + "loss": 0.0876, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5518203377723694, + "rewards/margins": 7.472037315368652, + "rewards/rejected": -6.920217990875244, "step": 3910 }, { - "epoch": 1.91, - "learning_rate": 2.0095910242490047e-07, - "logits/chosen": -2.961609125137329, - "logits/rejected": -2.986272096633911, - "logps/chosen": -305.36212158203125, - "logps/rejected": -337.9978332519531, - "loss": 0.0808, + "epoch": 1.99, + "learning_rate": 1.8632253202712887e-07, + "logits/chosen": -2.4185214042663574, + "logits/rejected": -2.4424355030059814, + "logps/chosen": -267.9097900390625, + "logps/rejected": -344.55609130859375, + "loss": 0.0646, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.1160730123519897, - "rewards/margins": 6.782060146331787, - "rewards/rejected": -5.665987491607666, + "rewards/chosen": 0.565269947052002, + "rewards/margins": 7.364255428314209, + "rewards/rejected": -6.798984527587891, "step": 3920 }, { - "epoch": 1.92, - "learning_rate": 2.000542888165038e-07, - "logits/chosen": -2.9191956520080566, - "logits/rejected": -2.91412615776062, - "logps/chosen": -297.4899597167969, - "logps/rejected": -337.471923828125, - "loss": 0.0859, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.0673969984054565, - "rewards/margins": 6.581826210021973, - "rewards/rejected": -5.514429092407227, + "epoch": 2.0, + "learning_rate": 1.8538055764883194e-07, + "logits/chosen": -2.4195945262908936, + "logits/rejected": -2.467193126678467, + "logps/chosen": -264.4360656738281, + "logps/rejected": -274.2948913574219, + "loss": 0.1307, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0061029670760035515, + "rewards/margins": 5.298325061798096, + "rewards/rejected": -5.292221546173096, "step": 3930 }, { - "epoch": 1.92, - "learning_rate": 1.9914947520810714e-07, - "logits/chosen": -2.961738109588623, - "logits/rejected": -2.945741653442383, - "logps/chosen": -261.1332702636719, - "logps/rejected": -257.2488098144531, - "loss": 0.1177, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.6958726644515991, - "rewards/margins": 6.150496006011963, - "rewards/rejected": -5.454623222351074, + "epoch": 2.0, + "learning_rate": 1.8443858327053502e-07, + "logits/chosen": -2.435084819793701, + "logits/rejected": -2.4718165397644043, + "logps/chosen": -276.6685791015625, + "logps/rejected": -309.0755310058594, + "loss": 0.0263, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3204138278961182, + "rewards/margins": 7.539528846740723, + "rewards/rejected": -6.219114780426025, "step": 3940 }, { - "epoch": 1.93, - "learning_rate": 1.9824466159971044e-07, - "logits/chosen": -2.929811954498291, - "logits/rejected": -2.932736873626709, - "logps/chosen": -323.19061279296875, - "logps/rejected": -338.2663269042969, - "loss": 0.1039, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.8938018679618835, - "rewards/margins": 5.906724452972412, - "rewards/rejected": -5.012922763824463, + "epoch": 2.01, + "learning_rate": 1.8349660889223812e-07, + "logits/chosen": -2.4513020515441895, + "logits/rejected": -2.452085018157959, + "logps/chosen": -250.68972778320312, + "logps/rejected": -308.9100036621094, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8725830316543579, + "rewards/margins": 7.932334899902344, + "rewards/rejected": -7.059751987457275, "step": 3950 }, { - "epoch": 1.93, - "learning_rate": 1.9733984799131378e-07, - "logits/chosen": -2.9369266033172607, - "logits/rejected": -2.965855121612549, - "logps/chosen": -289.3251953125, - "logps/rejected": -285.4294128417969, - "loss": 0.1028, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.7660928964614868, - "rewards/margins": 5.472012519836426, - "rewards/rejected": -4.705918788909912, + "epoch": 2.01, + "learning_rate": 1.8255463451394122e-07, + "logits/chosen": -2.415900945663452, + "logits/rejected": -2.5540549755096436, + "logps/chosen": -271.8116149902344, + "logps/rejected": -332.2603454589844, + "loss": 0.0216, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4665905833244324, + "rewards/margins": 7.362532138824463, + "rewards/rejected": -6.89594030380249, "step": 3960 }, { - "epoch": 1.94, - "learning_rate": 1.9643503438291712e-07, - "logits/chosen": -2.8646328449249268, - "logits/rejected": -2.9229960441589355, - "logps/chosen": -308.924072265625, - "logps/rejected": -326.26263427734375, - "loss": 0.0814, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.1412322521209717, - "rewards/margins": 6.8028740882873535, - "rewards/rejected": -5.661641597747803, + "epoch": 2.02, + "learning_rate": 1.816126601356443e-07, + "logits/chosen": -2.5276520252227783, + "logits/rejected": -2.542480945587158, + "logps/chosen": -298.09222412109375, + "logps/rejected": -355.290283203125, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3401168584823608, + "rewards/margins": 8.322530746459961, + "rewards/rejected": -6.982413291931152, "step": 3970 }, { - "epoch": 1.94, - "learning_rate": 1.9553022077452045e-07, - "logits/chosen": -3.0290329456329346, - "logits/rejected": -3.0302562713623047, - "logps/chosen": -298.660400390625, - "logps/rejected": -282.76898193359375, - "loss": 0.0679, + "epoch": 2.02, + "learning_rate": 1.806706857573474e-07, + "logits/chosen": -2.3913216590881348, + "logits/rejected": -2.5120017528533936, + "logps/chosen": -229.17861938476562, + "logps/rejected": -295.5799865722656, + "loss": 0.0133, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.6106700897216797, - "rewards/margins": 7.4019646644592285, - "rewards/rejected": -5.791295051574707, + "rewards/chosen": 0.28036922216415405, + "rewards/margins": 6.928774833679199, + "rewards/rejected": -6.648406028747559, "step": 3980 }, { - "epoch": 1.95, - "learning_rate": 1.9462540716612379e-07, - "logits/chosen": -2.9572644233703613, - "logits/rejected": -2.942161798477173, - "logps/chosen": -311.3888244628906, - "logps/rejected": -318.38275146484375, - "loss": 0.1088, + "epoch": 2.03, + "learning_rate": 1.797287113790505e-07, + "logits/chosen": -2.4164083003997803, + "logits/rejected": -2.4683470726013184, + "logps/chosen": -250.1259307861328, + "logps/rejected": -309.98895263671875, + "loss": 0.0261, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.8238946199417114, - "rewards/margins": 6.569869041442871, - "rewards/rejected": -5.745975017547607, + "rewards/chosen": 0.3368276357650757, + "rewards/margins": 7.374268531799316, + "rewards/rejected": -7.037441253662109, "step": 3990 }, { - "epoch": 1.95, - "learning_rate": 1.937205935577271e-07, - "logits/chosen": -2.9775755405426025, - "logits/rejected": -2.9991657733917236, - "logps/chosen": -341.7480163574219, - "logps/rejected": -330.06024169921875, - "loss": 0.0919, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.4755096435546875, - "rewards/margins": 7.638768196105957, - "rewards/rejected": -6.1632585525512695, + "epoch": 2.03, + "learning_rate": 1.787867370007536e-07, + "logits/chosen": -2.391810894012451, + "logits/rejected": -2.4127678871154785, + "logps/chosen": -282.1521301269531, + "logps/rejected": -336.7127685546875, + "loss": 0.0173, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7647604942321777, + "rewards/margins": 8.000178337097168, + "rewards/rejected": -7.23541784286499, "step": 4000 }, { - "epoch": 1.95, - "eval_logits/chosen": -2.95937442779541, - "eval_logits/rejected": -2.9640583992004395, - "eval_logps/chosen": -321.9328918457031, - "eval_logps/rejected": -305.72076416015625, - "eval_loss": 0.5359004735946655, - "eval_rewards/accuracies": 0.7941176295280457, - "eval_rewards/chosen": -0.7071900963783264, - "eval_rewards/margins": 2.9115729331970215, - "eval_rewards/rejected": -3.618762731552124, - "eval_runtime": 303.3129, - "eval_samples_per_second": 7.161, - "eval_steps_per_second": 0.448, + "epoch": 2.03, + "eval_logits/chosen": -2.5494742393493652, + "eval_logits/rejected": -2.619675636291504, + "eval_logps/chosen": -299.4777526855469, + "eval_logps/rejected": -301.586181640625, + "eval_loss": 0.6212854385375977, + "eval_rewards/accuracies": 0.7309160232543945, + "eval_rewards/chosen": -2.122727632522583, + "eval_rewards/margins": 2.22237491607666, + "eval_rewards/rejected": -4.345102787017822, + "eval_runtime": 301.8414, + "eval_samples_per_second": 6.911, + "eval_steps_per_second": 0.434, "step": 4000 }, { - "epoch": 1.96, - "learning_rate": 1.9281577994933043e-07, - "logits/chosen": -2.956735134124756, - "logits/rejected": -2.9809699058532715, - "logps/chosen": -281.5861511230469, - "logps/rejected": -279.2427978515625, - "loss": 0.0859, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.39633315801620483, - "rewards/margins": 5.939892768859863, - "rewards/rejected": -5.543560028076172, + "epoch": 2.04, + "learning_rate": 1.7784476262245664e-07, + "logits/chosen": -2.4948716163635254, + "logits/rejected": -2.453728437423706, + "logps/chosen": -284.3287353515625, + "logps/rejected": -372.3181457519531, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.407473087310791, + "rewards/margins": 9.088857650756836, + "rewards/rejected": -8.681384086608887, "step": 4010 }, { - "epoch": 1.96, - "learning_rate": 1.9191096634093377e-07, - "logits/chosen": -2.9376158714294434, - "logits/rejected": -2.965857744216919, - "logps/chosen": -292.43780517578125, - "logps/rejected": -342.1912841796875, - "loss": 0.1582, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.8393204808235168, - "rewards/margins": 6.295124530792236, - "rewards/rejected": -5.455803871154785, + "epoch": 2.04, + "learning_rate": 1.7690278824415974e-07, + "logits/chosen": -2.4881949424743652, + "logits/rejected": -2.5378105640411377, + "logps/chosen": -302.18536376953125, + "logps/rejected": -335.93182373046875, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3019936084747314, + "rewards/margins": 8.437501907348633, + "rewards/rejected": -7.1355085372924805, "step": 4020 }, { - "epoch": 1.97, - "learning_rate": 1.910061527325371e-07, - "logits/chosen": -2.98507022857666, - "logits/rejected": -2.983705997467041, - "logps/chosen": -341.12591552734375, - "logps/rejected": -323.22882080078125, - "loss": 0.0911, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.0092313289642334, - "rewards/margins": 6.526862144470215, - "rewards/rejected": -5.517631530761719, + "epoch": 2.05, + "learning_rate": 1.7596081386586284e-07, + "logits/chosen": -2.519768238067627, + "logits/rejected": -2.4485363960266113, + "logps/chosen": -333.52679443359375, + "logps/rejected": -379.7672424316406, + "loss": 0.0113, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.38338369131088257, + "rewards/margins": 7.786715030670166, + "rewards/rejected": -7.403331756591797, "step": 4030 }, { - "epoch": 1.97, - "learning_rate": 1.901013391241404e-07, - "logits/chosen": -2.894787073135376, - "logits/rejected": -2.923898696899414, - "logps/chosen": -322.0625915527344, - "logps/rejected": -353.07513427734375, - "loss": 0.0716, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.821063220500946, - "rewards/margins": 7.539980888366699, - "rewards/rejected": -6.718916893005371, + "epoch": 2.05, + "learning_rate": 1.7501883948756594e-07, + "logits/chosen": -2.4867990016937256, + "logits/rejected": -2.350109577178955, + "logps/chosen": -261.5180969238281, + "logps/rejected": -335.66876220703125, + "loss": 0.0195, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4131608009338379, + "rewards/margins": 8.117756843566895, + "rewards/rejected": -7.704596042633057, "step": 4040 }, { - "epoch": 1.98, - "learning_rate": 1.8919652551574374e-07, - "logits/chosen": -2.8412909507751465, - "logits/rejected": -2.84688663482666, - "logps/chosen": -264.5270690917969, - "logps/rejected": -309.29443359375, - "loss": 0.0926, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.3431406021118164, - "rewards/margins": 6.311507225036621, - "rewards/rejected": -5.9683661460876465, + "epoch": 2.06, + "learning_rate": 1.7407686510926901e-07, + "logits/chosen": -2.53373384475708, + "logits/rejected": -2.392754077911377, + "logps/chosen": -321.0093688964844, + "logps/rejected": -367.75335693359375, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7985485196113586, + "rewards/margins": 8.87114429473877, + "rewards/rejected": -8.072595596313477, "step": 4050 }, { - "epoch": 1.98, - "learning_rate": 1.8829171190734708e-07, - "logits/chosen": -2.961050271987915, - "logits/rejected": -2.9201204776763916, - "logps/chosen": -279.8636169433594, - "logps/rejected": -285.9794616699219, - "loss": 0.0614, + "epoch": 2.07, + "learning_rate": 1.7313489073097212e-07, + "logits/chosen": -2.4697489738464355, + "logits/rejected": -2.491966724395752, + "logps/chosen": -282.41363525390625, + "logps/rejected": -304.3522033691406, + "loss": 0.0162, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.6294604539871216, - "rewards/margins": 6.655107021331787, - "rewards/rejected": -6.025646686553955, + "rewards/chosen": 0.5781731605529785, + "rewards/margins": 8.477033615112305, + "rewards/rejected": -7.89885950088501, "step": 4060 }, { - "epoch": 1.99, - "learning_rate": 1.8738689829895042e-07, - "logits/chosen": -2.919400691986084, - "logits/rejected": -2.9234824180603027, - "logps/chosen": -311.05426025390625, - "logps/rejected": -336.472412109375, - "loss": 0.0624, + "epoch": 2.07, + "learning_rate": 1.721929163526752e-07, + "logits/chosen": -2.4429774284362793, + "logits/rejected": -2.391498327255249, + "logps/chosen": -351.14971923828125, + "logps/rejected": -382.9489440917969, + "loss": 0.0092, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.6969212293624878, - "rewards/margins": 6.726694583892822, - "rewards/rejected": -6.029772758483887, + "rewards/chosen": 0.589583158493042, + "rewards/margins": 8.448725700378418, + "rewards/rejected": -7.859143257141113, "step": 4070 }, { - "epoch": 1.99, - "learning_rate": 1.8648208469055372e-07, - "logits/chosen": -2.9427881240844727, - "logits/rejected": -2.9178051948547363, - "logps/chosen": -326.56524658203125, - "logps/rejected": -288.01312255859375, - "loss": 0.095, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.45500653982162476, - "rewards/margins": 6.449684143066406, - "rewards/rejected": -5.994677543640137, + "epoch": 2.08, + "learning_rate": 1.712509419743783e-07, + "logits/chosen": -2.4819400310516357, + "logits/rejected": -2.4833054542541504, + "logps/chosen": -271.9066467285156, + "logps/rejected": -388.4400329589844, + "loss": 0.0136, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5519894957542419, + "rewards/margins": 9.0414457321167, + "rewards/rejected": -8.489457130432129, "step": 4080 }, { - "epoch": 2.0, - "learning_rate": 1.8557727108215706e-07, - "logits/chosen": -2.916647434234619, - "logits/rejected": -2.9380955696105957, - "logps/chosen": -376.963134765625, - "logps/rejected": -357.92718505859375, - "loss": 0.0996, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2593491077423096, - "rewards/margins": 7.356746673583984, - "rewards/rejected": -6.097397804260254, + "epoch": 2.08, + "learning_rate": 1.7030896759608136e-07, + "logits/chosen": -2.5111820697784424, + "logits/rejected": -2.470893383026123, + "logps/chosen": -298.361572265625, + "logps/rejected": -385.54010009765625, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7081271409988403, + "rewards/margins": 9.882740020751953, + "rewards/rejected": -9.174612045288086, "step": 4090 }, { - "epoch": 2.0, - "learning_rate": 1.846724574737604e-07, - "logits/chosen": -2.949510097503662, - "logits/rejected": -2.9994475841522217, - "logps/chosen": -269.92694091796875, - "logps/rejected": -324.97344970703125, - "loss": 0.0389, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.6557193994522095, - "rewards/margins": 6.664454460144043, - "rewards/rejected": -6.008735179901123, + "epoch": 2.09, + "learning_rate": 1.6936699321778446e-07, + "logits/chosen": -2.463622808456421, + "logits/rejected": -2.392191171646118, + "logps/chosen": -251.0826873779297, + "logps/rejected": -327.29534912109375, + "loss": 0.0213, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8025670051574707, + "rewards/margins": 8.546449661254883, + "rewards/rejected": -7.743882656097412, "step": 4100 }, { - "epoch": 2.0, - "eval_logits/chosen": -2.9238290786743164, - "eval_logits/rejected": -2.92825984954834, - "eval_logps/chosen": -323.2430419921875, - "eval_logps/rejected": -308.35577392578125, - "eval_loss": 0.5430710911750793, - "eval_rewards/accuracies": 0.7959558963775635, - "eval_rewards/chosen": -0.8382065892219543, - "eval_rewards/margins": 3.044058084487915, - "eval_rewards/rejected": -3.8822646141052246, - "eval_runtime": 303.1081, - "eval_samples_per_second": 7.166, - "eval_steps_per_second": 0.449, + "epoch": 2.09, + "eval_logits/chosen": -2.5334906578063965, + "eval_logits/rejected": -2.602919578552246, + "eval_logps/chosen": -302.7117004394531, + "eval_logps/rejected": -307.35565185546875, + "eval_loss": 0.6529473662376404, + "eval_rewards/accuracies": 0.7366412281990051, + "eval_rewards/chosen": -2.4461238384246826, + "eval_rewards/margins": 2.4759316444396973, + "eval_rewards/rejected": -4.922055244445801, + "eval_runtime": 296.9611, + "eval_samples_per_second": 7.024, + "eval_steps_per_second": 0.441, "step": 4100 }, { - "epoch": 2.01, - "learning_rate": 1.8376764386536373e-07, - "logits/chosen": -2.9906725883483887, - "logits/rejected": -2.9862935543060303, - "logps/chosen": -371.25396728515625, - "logps/rejected": -353.2129211425781, - "loss": 0.0483, - "rewards/accuracies": 1.0, - "rewards/chosen": 2.076956272125244, - "rewards/margins": 9.489618301391602, - "rewards/rejected": -7.412663459777832, + "epoch": 2.09, + "learning_rate": 1.6842501883948756e-07, + "logits/chosen": -2.461470365524292, + "logits/rejected": -2.4746007919311523, + "logps/chosen": -293.7532043457031, + "logps/rejected": -337.87152099609375, + "loss": 0.0123, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.2249172031879425, + "rewards/margins": 8.05424976348877, + "rewards/rejected": -7.829331874847412, "step": 4110 }, { - "epoch": 2.01, - "learning_rate": 1.8286283025696707e-07, - "logits/chosen": -2.936913013458252, - "logits/rejected": -2.9665207862854004, - "logps/chosen": -326.1781311035156, - "logps/rejected": -338.53485107421875, - "loss": 0.0187, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 2.4228596687316895, - "rewards/margins": 8.462333679199219, - "rewards/rejected": -6.039474010467529, + "epoch": 2.1, + "learning_rate": 1.6748304446119067e-07, + "logits/chosen": -2.445439577102661, + "logits/rejected": -2.4860050678253174, + "logps/chosen": -294.78460693359375, + "logps/rejected": -345.5479736328125, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4239388704299927, + "rewards/margins": 9.9138765335083, + "rewards/rejected": -8.489936828613281, "step": 4120 }, { - "epoch": 2.02, - "learning_rate": 1.8195801664857037e-07, - "logits/chosen": -2.9248874187469482, - "logits/rejected": -2.97477650642395, - "logps/chosen": -290.7440185546875, - "logps/rejected": -339.4092712402344, - "loss": 0.0132, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.443786382675171, - "rewards/margins": 9.173762321472168, - "rewards/rejected": -7.729975700378418, + "epoch": 2.1, + "learning_rate": 1.665410700828937e-07, + "logits/chosen": -2.5708627700805664, + "logits/rejected": -2.4310455322265625, + "logps/chosen": -300.51318359375, + "logps/rejected": -346.686767578125, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4349859356880188, + "rewards/margins": 8.615959167480469, + "rewards/rejected": -8.1809720993042, "step": 4130 }, { - "epoch": 2.02, - "learning_rate": 1.810532030401737e-07, - "logits/chosen": -2.946485996246338, - "logits/rejected": -2.9688897132873535, - "logps/chosen": -300.78631591796875, - "logps/rejected": -330.9696350097656, - "loss": 0.0247, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.6105268001556396, - "rewards/margins": 8.403840065002441, - "rewards/rejected": -6.793313026428223, + "epoch": 2.11, + "learning_rate": 1.655990957045968e-07, + "logits/chosen": -2.5158300399780273, + "logits/rejected": -2.5776991844177246, + "logps/chosen": -325.2979431152344, + "logps/rejected": -375.5154113769531, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5380330681800842, + "rewards/margins": 8.873096466064453, + "rewards/rejected": -8.335062980651855, "step": 4140 }, { - "epoch": 2.03, - "learning_rate": 1.8014838943177704e-07, - "logits/chosen": -2.9448509216308594, - "logits/rejected": -2.9592087268829346, - "logps/chosen": -268.7191162109375, - "logps/rejected": -323.5093078613281, - "loss": 0.0107, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.0064860582351685, - "rewards/margins": 8.544146537780762, - "rewards/rejected": -7.537660121917725, + "epoch": 2.11, + "learning_rate": 1.6465712132629991e-07, + "logits/chosen": -2.3604540824890137, + "logits/rejected": -2.3653512001037598, + "logps/chosen": -237.39260864257812, + "logps/rejected": -334.53955078125, + "loss": 0.0227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6499460935592651, + "rewards/margins": 8.810885429382324, + "rewards/rejected": -9.460832595825195, "step": 4150 }, { - "epoch": 2.03, - "learning_rate": 1.7924357582338038e-07, - "logits/chosen": -2.972289562225342, - "logits/rejected": -3.001542806625366, - "logps/chosen": -389.1747131347656, - "logps/rejected": -400.4701232910156, - "loss": 0.0079, - "rewards/accuracies": 1.0, - "rewards/chosen": 2.761798858642578, - "rewards/margins": 10.334131240844727, - "rewards/rejected": -7.572334289550781, + "epoch": 2.12, + "learning_rate": 1.6371514694800301e-07, + "logits/chosen": -2.449936628341675, + "logits/rejected": -2.3841679096221924, + "logps/chosen": -249.3923797607422, + "logps/rejected": -348.3072814941406, + "loss": 0.0198, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.338760644197464, + "rewards/margins": 9.519830703735352, + "rewards/rejected": -9.181070327758789, "step": 4160 }, { - "epoch": 2.04, - "learning_rate": 1.783387622149837e-07, - "logits/chosen": -2.8570685386657715, - "logits/rejected": -2.8960671424865723, - "logps/chosen": -362.25982666015625, - "logps/rejected": -379.66082763671875, - "loss": 0.0112, + "epoch": 2.12, + "learning_rate": 1.6277317256970611e-07, + "logits/chosen": -2.406186103820801, + "logits/rejected": -2.4443275928497314, + "logps/chosen": -297.19384765625, + "logps/rejected": -386.3888854980469, + "loss": 0.0182, "rewards/accuracies": 1.0, - "rewards/chosen": 2.0383682250976562, - "rewards/margins": 10.73603343963623, - "rewards/rejected": -8.697667121887207, + "rewards/chosen": 0.3307214677333832, + "rewards/margins": 9.84343147277832, + "rewards/rejected": -9.512711524963379, "step": 4170 }, { - "epoch": 2.04, - "learning_rate": 1.7743394860658702e-07, - "logits/chosen": -2.891801357269287, - "logits/rejected": -2.8973212242126465, - "logps/chosen": -304.717529296875, - "logps/rejected": -341.28948974609375, - "loss": 0.0245, + "epoch": 2.13, + "learning_rate": 1.618311981914092e-07, + "logits/chosen": -2.3944525718688965, + "logits/rejected": -2.2698841094970703, + "logps/chosen": -240.02999877929688, + "logps/rejected": -300.5351257324219, + "loss": 0.0111, "rewards/accuracies": 1.0, - "rewards/chosen": 1.632785439491272, - "rewards/margins": 8.525206565856934, - "rewards/rejected": -6.892421722412109, + "rewards/chosen": -0.23821866512298584, + "rewards/margins": 8.546000480651855, + "rewards/rejected": -8.784219741821289, "step": 4180 }, { - "epoch": 2.05, - "learning_rate": 1.7652913499819036e-07, - "logits/chosen": -2.98625111579895, - "logits/rejected": -2.972463369369507, - "logps/chosen": -331.9447021484375, - "logps/rejected": -338.6233825683594, - "loss": 0.0125, + "epoch": 2.13, + "learning_rate": 1.6088922381311226e-07, + "logits/chosen": -2.34932541847229, + "logits/rejected": -2.371035099029541, + "logps/chosen": -276.40679931640625, + "logps/rejected": -384.0252685546875, + "loss": 0.016, "rewards/accuracies": 1.0, - "rewards/chosen": 0.9732617139816284, - "rewards/margins": 8.312514305114746, - "rewards/rejected": -7.339252471923828, + "rewards/chosen": 0.10160058736801147, + "rewards/margins": 8.838811874389648, + "rewards/rejected": -8.737211227416992, "step": 4190 }, { - "epoch": 2.05, - "learning_rate": 1.756243213897937e-07, - "logits/chosen": -2.918269157409668, - "logits/rejected": -2.9248013496398926, - "logps/chosen": -275.7173156738281, - "logps/rejected": -334.4678649902344, - "loss": 0.0056, + "epoch": 2.14, + "learning_rate": 1.5994724943481536e-07, + "logits/chosen": -2.461995840072632, + "logits/rejected": -2.3588435649871826, + "logps/chosen": -251.30361938476562, + "logps/rejected": -327.7369079589844, + "loss": 0.0149, "rewards/accuracies": 1.0, - "rewards/chosen": 0.9555188417434692, - "rewards/margins": 9.142569541931152, - "rewards/rejected": -8.187049865722656, + "rewards/chosen": -0.21784713864326477, + "rewards/margins": 9.507335662841797, + "rewards/rejected": -9.72518253326416, "step": 4200 }, { - "epoch": 2.05, - "eval_logits/chosen": -2.906447410583496, - "eval_logits/rejected": -2.9076926708221436, - "eval_logps/chosen": -328.0263671875, - "eval_logps/rejected": -317.897216796875, - "eval_loss": 0.5895195603370667, - "eval_rewards/accuracies": 0.7996323704719543, - "eval_rewards/chosen": -1.3165363073349, - "eval_rewards/margins": 3.5198771953582764, - "eval_rewards/rejected": -4.836413383483887, - "eval_runtime": 303.3801, - "eval_samples_per_second": 7.159, - "eval_steps_per_second": 0.448, + "epoch": 2.14, + "eval_logits/chosen": -2.527193307876587, + "eval_logits/rejected": -2.5937540531158447, + "eval_logps/chosen": -308.90386962890625, + "eval_logps/rejected": -315.9820861816406, + "eval_loss": 0.6933820843696594, + "eval_rewards/accuracies": 0.7347328066825867, + "eval_rewards/chosen": -3.065340280532837, + "eval_rewards/margins": 2.7193539142608643, + "eval_rewards/rejected": -5.784693241119385, + "eval_runtime": 301.827, + "eval_samples_per_second": 6.911, + "eval_steps_per_second": 0.434, "step": 4200 }, { - "epoch": 2.06, - "learning_rate": 1.7471950778139703e-07, - "logits/chosen": -2.9494941234588623, - "logits/rejected": -2.9647154808044434, - "logps/chosen": -255.7998504638672, - "logps/rejected": -328.80853271484375, - "loss": 0.0152, + "epoch": 2.14, + "learning_rate": 1.5900527505651846e-07, + "logits/chosen": -2.406585216522217, + "logits/rejected": -2.409055471420288, + "logps/chosen": -314.01141357421875, + "logps/rejected": -328.4725646972656, + "loss": 0.003, "rewards/accuracies": 1.0, - "rewards/chosen": 0.9671787023544312, - "rewards/margins": 8.419604301452637, - "rewards/rejected": -7.452425479888916, + "rewards/chosen": 0.4073919355869293, + "rewards/margins": 9.213891983032227, + "rewards/rejected": -8.806499481201172, "step": 4210 }, { - "epoch": 2.06, - "learning_rate": 1.7381469417300034e-07, - "logits/chosen": -2.940955400466919, - "logits/rejected": -2.9565961360931396, - "logps/chosen": -379.8911437988281, - "logps/rejected": -354.20416259765625, - "loss": 0.014, + "epoch": 2.15, + "learning_rate": 1.5806330067822154e-07, + "logits/chosen": -2.4329631328582764, + "logits/rejected": -2.4906868934631348, + "logps/chosen": -292.4327392578125, + "logps/rejected": -388.5157775878906, + "loss": 0.0141, "rewards/accuracies": 1.0, - "rewards/chosen": 1.207024335861206, - "rewards/margins": 9.23960018157959, - "rewards/rejected": -8.032575607299805, + "rewards/chosen": -0.3115505576133728, + "rewards/margins": 9.646425247192383, + "rewards/rejected": -9.957975387573242, "step": 4220 }, { - "epoch": 2.07, - "learning_rate": 1.7290988056460367e-07, - "logits/chosen": -2.888720750808716, - "logits/rejected": -2.91988205909729, - "logps/chosen": -317.1791687011719, - "logps/rejected": -378.646484375, - "loss": 0.0124, + "epoch": 2.15, + "learning_rate": 1.5712132629992464e-07, + "logits/chosen": -2.4410560131073, + "logits/rejected": -2.573024272918701, + "logps/chosen": -300.14385986328125, + "logps/rejected": -341.1786193847656, + "loss": 0.0214, "rewards/accuracies": 1.0, - "rewards/chosen": 0.7943362593650818, - "rewards/margins": 8.83814811706543, - "rewards/rejected": -8.043811798095703, + "rewards/chosen": 0.273735910654068, + "rewards/margins": 8.2180814743042, + "rewards/rejected": -7.944344997406006, "step": 4230 }, { - "epoch": 2.07, - "learning_rate": 1.72005066956207e-07, - "logits/chosen": -2.902064800262451, - "logits/rejected": -2.945105791091919, - "logps/chosen": -280.6346740722656, - "logps/rejected": -314.17041015625, - "loss": 0.0169, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.8052576184272766, - "rewards/margins": 8.470544815063477, - "rewards/rejected": -7.665287017822266, + "epoch": 2.16, + "learning_rate": 1.5617935192162774e-07, + "logits/chosen": -2.3949074745178223, + "logits/rejected": -2.323303699493408, + "logps/chosen": -277.2351379394531, + "logps/rejected": -318.48736572265625, + "loss": 0.0064, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.28990697860717773, + "rewards/margins": 9.171941757202148, + "rewards/rejected": -9.461848258972168, "step": 4240 }, { - "epoch": 2.08, - "learning_rate": 1.7110025334781034e-07, - "logits/chosen": -2.9120936393737793, - "logits/rejected": -2.9272561073303223, - "logps/chosen": -391.3676452636719, - "logps/rejected": -326.0396728515625, - "loss": 0.0183, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.773400068283081, - "rewards/margins": 9.867328643798828, - "rewards/rejected": -8.093927383422852, + "epoch": 2.16, + "learning_rate": 1.5523737754333084e-07, + "logits/chosen": -2.290977954864502, + "logits/rejected": -2.3106536865234375, + "logps/chosen": -242.9626922607422, + "logps/rejected": -332.17529296875, + "loss": 0.018, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.07354114949703217, + "rewards/margins": 8.418808937072754, + "rewards/rejected": -8.492349624633789, "step": 4250 }, { - "epoch": 2.08, - "learning_rate": 1.7019543973941365e-07, - "logits/chosen": -2.8478567600250244, - "logits/rejected": -2.844156265258789, - "logps/chosen": -355.01580810546875, - "logps/rejected": -371.62982177734375, - "loss": 0.0106, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.3708417415618896, - "rewards/margins": 9.757500648498535, - "rewards/rejected": -8.386659622192383, + "epoch": 2.17, + "learning_rate": 1.5429540316503389e-07, + "logits/chosen": -2.3914854526519775, + "logits/rejected": -2.319920778274536, + "logps/chosen": -255.0960235595703, + "logps/rejected": -335.22113037109375, + "loss": 0.0113, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.04593334347009659, + "rewards/margins": 8.81833267211914, + "rewards/rejected": -8.772398948669434, "step": 4260 }, { - "epoch": 2.09, - "learning_rate": 1.69290626131017e-07, - "logits/chosen": -2.898806095123291, - "logits/rejected": -2.9081108570098877, - "logps/chosen": -326.9820251464844, - "logps/rejected": -345.529296875, - "loss": 0.0183, + "epoch": 2.17, + "learning_rate": 1.53353428786737e-07, + "logits/chosen": -2.3034889698028564, + "logits/rejected": -2.378272533416748, + "logps/chosen": -281.86004638671875, + "logps/rejected": -361.8553771972656, + "loss": 0.0157, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.1003632545471191, - "rewards/margins": 9.331439971923828, - "rewards/rejected": -8.23107624053955, + "rewards/chosen": 1.1320751905441284, + "rewards/margins": 9.508333206176758, + "rewards/rejected": -8.376259803771973, "step": 4270 }, { - "epoch": 2.09, - "learning_rate": 1.6838581252262032e-07, - "logits/chosen": -2.9381659030914307, - "logits/rejected": -2.9744925498962402, - "logps/chosen": -363.24224853515625, - "logps/rejected": -358.1921081542969, - "loss": 0.0143, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8045371770858765, - "rewards/margins": 8.981616020202637, - "rewards/rejected": -8.177080154418945, + "epoch": 2.18, + "learning_rate": 1.524114544084401e-07, + "logits/chosen": -2.262380361557007, + "logits/rejected": -2.337212085723877, + "logps/chosen": -284.8341979980469, + "logps/rejected": -381.337158203125, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03602912276983261, + "rewards/margins": 9.237323760986328, + "rewards/rejected": -9.27335262298584, "step": 4280 }, { - "epoch": 2.1, - "learning_rate": 1.6748099891422366e-07, - "logits/chosen": -2.9395833015441895, - "logits/rejected": -2.9578940868377686, - "logps/chosen": -297.2131652832031, - "logps/rejected": -357.81439208984375, - "loss": 0.0184, + "epoch": 2.18, + "learning_rate": 1.514694800301432e-07, + "logits/chosen": -2.2815897464752197, + "logits/rejected": -2.350703716278076, + "logps/chosen": -268.76922607421875, + "logps/rejected": -363.57623291015625, + "loss": 0.0138, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.6641481518745422, - "rewards/margins": 8.694684982299805, - "rewards/rejected": -8.030536651611328, + "rewards/chosen": -0.15438330173492432, + "rewards/margins": 9.449551582336426, + "rewards/rejected": -9.603934288024902, "step": 4290 }, { - "epoch": 2.1, - "learning_rate": 1.66576185305827e-07, - "logits/chosen": -2.956148624420166, - "logits/rejected": -2.9515504837036133, - "logps/chosen": -287.3852844238281, - "logps/rejected": -354.4857482910156, - "loss": 0.0192, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.4165951609611511, - "rewards/margins": 8.837174415588379, - "rewards/rejected": -8.42057991027832, + "epoch": 2.19, + "learning_rate": 1.5052750565184626e-07, + "logits/chosen": -2.2737479209899902, + "logits/rejected": -2.2863266468048096, + "logps/chosen": -278.785888671875, + "logps/rejected": -407.51702880859375, + "loss": 0.0084, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.15904609858989716, + "rewards/margins": 9.742883682250977, + "rewards/rejected": -9.90192985534668, "step": 4300 }, { - "epoch": 2.1, - "eval_logits/chosen": -2.9052090644836426, - "eval_logits/rejected": -2.904784679412842, - "eval_logps/chosen": -332.2442321777344, - "eval_logps/rejected": -324.2669677734375, - "eval_loss": 0.6121014952659607, - "eval_rewards/accuracies": 0.7867646813392639, - "eval_rewards/chosen": -1.738324761390686, - "eval_rewards/margins": 3.7350597381591797, - "eval_rewards/rejected": -5.473384857177734, - "eval_runtime": 302.8805, - "eval_samples_per_second": 7.171, - "eval_steps_per_second": 0.449, + "epoch": 2.19, + "eval_logits/chosen": -2.440368890762329, + "eval_logits/rejected": -2.508787155151367, + "eval_logps/chosen": -310.0954895019531, + "eval_logps/rejected": -318.322998046875, + "eval_loss": 0.7083070874214172, + "eval_rewards/accuracies": 0.7404580116271973, + "eval_rewards/chosen": -3.1845004558563232, + "eval_rewards/margins": 2.8342840671539307, + "eval_rewards/rejected": -6.018784523010254, + "eval_runtime": 296.8234, + "eval_samples_per_second": 7.028, + "eval_steps_per_second": 0.441, "step": 4300 }, { - "epoch": 2.11, - "learning_rate": 1.656713716974303e-07, - "logits/chosen": -2.893310308456421, - "logits/rejected": -2.8932833671569824, - "logps/chosen": -304.52252197265625, - "logps/rejected": -367.1518859863281, - "loss": 0.0138, + "epoch": 2.19, + "learning_rate": 1.4958553127354936e-07, + "logits/chosen": -2.3321611881256104, + "logits/rejected": -2.3306918144226074, + "logps/chosen": -286.4504699707031, + "logps/rejected": -348.179931640625, + "loss": 0.008, "rewards/accuracies": 1.0, - "rewards/chosen": 0.10967813432216644, - "rewards/margins": 8.908355712890625, - "rewards/rejected": -8.798677444458008, + "rewards/chosen": -0.29880261421203613, + "rewards/margins": 9.495977401733398, + "rewards/rejected": -9.794778823852539, "step": 4310 }, { - "epoch": 2.11, - "learning_rate": 1.6476655808903364e-07, - "logits/chosen": -2.9863438606262207, - "logits/rejected": -2.997415065765381, - "logps/chosen": -269.8865661621094, - "logps/rejected": -308.7164001464844, - "loss": 0.0215, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.13215677440166473, - "rewards/margins": 8.89323616027832, - "rewards/rejected": -9.025392532348633, + "epoch": 2.2, + "learning_rate": 1.4864355689525244e-07, + "logits/chosen": -2.307884454727173, + "logits/rejected": -2.153669595718384, + "logps/chosen": -231.07846069335938, + "logps/rejected": -346.56982421875, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09392056614160538, + "rewards/margins": 9.005593299865723, + "rewards/rejected": -8.911672592163086, "step": 4320 }, { - "epoch": 2.12, - "learning_rate": 1.6386174448063697e-07, - "logits/chosen": -3.0144283771514893, - "logits/rejected": -2.9987194538116455, - "logps/chosen": -319.78997802734375, - "logps/rejected": -359.3088073730469, - "loss": 0.0109, + "epoch": 2.2, + "learning_rate": 1.4770158251695554e-07, + "logits/chosen": -2.318849802017212, + "logits/rejected": -2.2451350688934326, + "logps/chosen": -252.5889434814453, + "logps/rejected": -347.39190673828125, + "loss": 0.0171, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9401049613952637, - "rewards/margins": 9.604791641235352, - "rewards/rejected": -8.66468620300293, + "rewards/chosen": -0.5418094992637634, + "rewards/margins": 8.427831649780273, + "rewards/rejected": -8.969642639160156, "step": 4330 }, { - "epoch": 2.12, - "learning_rate": 1.629569308722403e-07, - "logits/chosen": -2.935408115386963, - "logits/rejected": -2.9562172889709473, - "logps/chosen": -312.78887939453125, - "logps/rejected": -382.107177734375, - "loss": 0.0129, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.339860737323761, - "rewards/margins": 8.899724960327148, - "rewards/rejected": -8.55986499786377, + "epoch": 2.21, + "learning_rate": 1.467596081386586e-07, + "logits/chosen": -2.3386223316192627, + "logits/rejected": -2.319746494293213, + "logps/chosen": -249.5520477294922, + "logps/rejected": -333.26812744140625, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6450480222702026, + "rewards/margins": 8.832240104675293, + "rewards/rejected": -9.477288246154785, "step": 4340 }, { - "epoch": 2.13, - "learning_rate": 1.6205211726384362e-07, - "logits/chosen": -2.950565814971924, - "logits/rejected": -2.969822883605957, - "logps/chosen": -374.9684753417969, - "logps/rejected": -407.42132568359375, - "loss": 0.0132, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.4509483277797699, - "rewards/margins": 9.827601432800293, - "rewards/rejected": -9.376653671264648, + "epoch": 2.21, + "learning_rate": 1.458176337603617e-07, + "logits/chosen": -2.4239277839660645, + "logits/rejected": -2.302274227142334, + "logps/chosen": -265.55548095703125, + "logps/rejected": -356.56787109375, + "loss": 0.0209, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.4280431866645813, + "rewards/margins": 9.597391128540039, + "rewards/rejected": -10.025433540344238, "step": 4350 }, { - "epoch": 2.13, - "learning_rate": 1.6114730365544695e-07, - "logits/chosen": -2.9137938022613525, - "logits/rejected": -2.957098960876465, - "logps/chosen": -331.5849304199219, - "logps/rejected": -406.20135498046875, - "loss": 0.0103, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.4979720115661621, - "rewards/margins": 10.062200546264648, - "rewards/rejected": -9.564227104187012, + "epoch": 2.22, + "learning_rate": 1.448756593820648e-07, + "logits/chosen": -2.408919334411621, + "logits/rejected": -2.2459819316864014, + "logps/chosen": -306.56903076171875, + "logps/rejected": -341.38494873046875, + "loss": 0.0251, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.19567224383354187, + "rewards/margins": 8.941240310668945, + "rewards/rejected": -9.136914253234863, "step": 4360 }, { - "epoch": 2.13, - "learning_rate": 1.602424900470503e-07, - "logits/chosen": -2.941483736038208, - "logits/rejected": -2.977083683013916, - "logps/chosen": -326.5963134765625, - "logps/rejected": -395.2918395996094, - "loss": 0.0158, + "epoch": 2.22, + "learning_rate": 1.439336850037679e-07, + "logits/chosen": -2.3950576782226562, + "logits/rejected": -2.3486268520355225, + "logps/chosen": -286.4931945800781, + "logps/rejected": -394.418212890625, + "loss": 0.0072, "rewards/accuracies": 1.0, - "rewards/chosen": 0.3507850468158722, - "rewards/margins": 9.47231674194336, - "rewards/rejected": -9.121530532836914, + "rewards/chosen": -0.07271413505077362, + "rewards/margins": 10.440177917480469, + "rewards/rejected": -10.51289176940918, "step": 4370 }, { - "epoch": 2.14, - "learning_rate": 1.5933767643865362e-07, - "logits/chosen": -2.944478988647461, - "logits/rejected": -2.9454710483551025, - "logps/chosen": -313.2370300292969, - "logps/rejected": -389.37939453125, - "loss": 0.0056, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6074202060699463, - "rewards/margins": 11.472959518432617, - "rewards/rejected": -10.86553955078125, + "epoch": 2.23, + "learning_rate": 1.4299171062547096e-07, + "logits/chosen": -2.3615708351135254, + "logits/rejected": -2.3377792835235596, + "logps/chosen": -295.9791564941406, + "logps/rejected": -297.25128173828125, + "loss": 0.008, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.30061447620391846, + "rewards/margins": 8.242612838745117, + "rewards/rejected": -8.54322624206543, "step": 4380 }, { - "epoch": 2.14, - "learning_rate": 1.5843286283025696e-07, - "logits/chosen": -2.921966791152954, - "logits/rejected": -2.928133249282837, - "logps/chosen": -342.3746032714844, - "logps/rejected": -403.03826904296875, - "loss": 0.018, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.2890813946723938, - "rewards/margins": 9.765953063964844, - "rewards/rejected": -9.4768705368042, + "epoch": 2.23, + "learning_rate": 1.4204973624717406e-07, + "logits/chosen": -2.4010937213897705, + "logits/rejected": -2.436049699783325, + "logps/chosen": -280.08074951171875, + "logps/rejected": -376.03857421875, + "loss": 0.0128, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.8056174516677856, + "rewards/margins": 8.615859985351562, + "rewards/rejected": -9.421477317810059, "step": 4390 }, { - "epoch": 2.15, - "learning_rate": 1.5752804922186027e-07, - "logits/chosen": -2.9025821685791016, - "logits/rejected": -2.920318365097046, - "logps/chosen": -397.28070068359375, - "logps/rejected": -407.007568359375, - "loss": 0.0241, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8674532175064087, - "rewards/margins": 10.650280952453613, - "rewards/rejected": -9.782827377319336, + "epoch": 2.24, + "learning_rate": 1.4110776186887716e-07, + "logits/chosen": -2.3178069591522217, + "logits/rejected": -2.3701210021972656, + "logps/chosen": -306.66387939453125, + "logps/rejected": -313.7434997558594, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5981841683387756, + "rewards/margins": 8.42170238494873, + "rewards/rejected": -9.01988697052002, "step": 4400 }, { - "epoch": 2.15, - "eval_logits/chosen": -2.888969659805298, - "eval_logits/rejected": -2.8891761302948, - "eval_logps/chosen": -334.5132141113281, - "eval_logps/rejected": -327.95294189453125, - "eval_loss": 0.628560483455658, - "eval_rewards/accuracies": 0.78125, - "eval_rewards/chosen": -1.9652206897735596, - "eval_rewards/margins": 3.8767623901367188, - "eval_rewards/rejected": -5.841982841491699, - "eval_runtime": 303.3549, - "eval_samples_per_second": 7.16, - "eval_steps_per_second": 0.448, + "epoch": 2.24, + "eval_logits/chosen": -2.4479472637176514, + "eval_logits/rejected": -2.5108604431152344, + "eval_logps/chosen": -312.2333679199219, + "eval_logps/rejected": -320.94183349609375, + "eval_loss": 0.7192836403846741, + "eval_rewards/accuracies": 0.7404580116271973, + "eval_rewards/chosen": -3.3982906341552734, + "eval_rewards/margins": 2.882378101348877, + "eval_rewards/rejected": -6.280669212341309, + "eval_runtime": 301.8883, + "eval_samples_per_second": 6.91, + "eval_steps_per_second": 0.434, "step": 4400 }, { - "epoch": 2.15, - "learning_rate": 1.566232356134636e-07, - "logits/chosen": -2.899958610534668, - "logits/rejected": -2.9074761867523193, - "logps/chosen": -321.62158203125, - "logps/rejected": -367.1690979003906, - "loss": 0.0138, + "epoch": 2.24, + "learning_rate": 1.4016578749058026e-07, + "logits/chosen": -2.3137736320495605, + "logits/rejected": -2.457977771759033, + "logps/chosen": -281.3287658691406, + "logps/rejected": -382.7276916503906, + "loss": 0.0067, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.30756908655166626, - "rewards/margins": 10.017231941223145, - "rewards/rejected": -9.709661483764648, + "rewards/chosen": -0.436063289642334, + "rewards/margins": 10.180338859558105, + "rewards/rejected": -10.616401672363281, "step": 4410 }, { - "epoch": 2.16, - "learning_rate": 1.5571842200506696e-07, - "logits/chosen": -2.8032474517822266, - "logits/rejected": -2.809086322784424, - "logps/chosen": -296.623291015625, - "logps/rejected": -329.2464294433594, - "loss": 0.0229, + "epoch": 2.25, + "learning_rate": 1.3922381311228334e-07, + "logits/chosen": -2.4059033393859863, + "logits/rejected": -2.3955695629119873, + "logps/chosen": -285.6089172363281, + "logps/rejected": -304.55816650390625, + "loss": 0.0149, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.032607562839984894, - "rewards/margins": 8.938989639282227, - "rewards/rejected": -8.906383514404297, + "rewards/chosen": -0.42298993468284607, + "rewards/margins": 8.393533706665039, + "rewards/rejected": -8.816523551940918, "step": 4420 }, { - "epoch": 2.16, - "learning_rate": 1.548136083966703e-07, - "logits/chosen": -2.9380667209625244, - "logits/rejected": -2.952648639678955, - "logps/chosen": -301.16302490234375, - "logps/rejected": -349.0029602050781, - "loss": 0.0061, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.2858569025993347, - "rewards/margins": 10.177873611450195, - "rewards/rejected": -10.463730812072754, + "epoch": 2.25, + "learning_rate": 1.3828183873398644e-07, + "logits/chosen": -2.356703042984009, + "logits/rejected": -2.3774828910827637, + "logps/chosen": -259.9477233886719, + "logps/rejected": -343.700927734375, + "loss": 0.0127, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.021397648379206657, + "rewards/margins": 9.302734375, + "rewards/rejected": -9.324131965637207, "step": 4430 }, { - "epoch": 2.17, - "learning_rate": 1.539087947882736e-07, - "logits/chosen": -2.909888982772827, - "logits/rejected": -2.902141571044922, - "logps/chosen": -343.8612976074219, - "logps/rejected": -375.90081787109375, - "loss": 0.045, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.47374147176742554, - "rewards/margins": 8.933242797851562, - "rewards/rejected": -8.459501266479492, + "epoch": 2.26, + "learning_rate": 1.373398643556895e-07, + "logits/chosen": -2.3701252937316895, + "logits/rejected": -2.2263360023498535, + "logps/chosen": -260.7659912109375, + "logps/rejected": -352.81146240234375, + "loss": 0.0144, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3572626113891602, + "rewards/margins": 8.490302085876465, + "rewards/rejected": -9.847564697265625, "step": 4440 }, { - "epoch": 2.17, - "learning_rate": 1.5300398117987694e-07, - "logits/chosen": -2.9289047718048096, - "logits/rejected": -2.8917577266693115, - "logps/chosen": -279.4588317871094, - "logps/rejected": -359.11883544921875, - "loss": 0.0296, + "epoch": 2.26, + "learning_rate": 1.363978899773926e-07, + "logits/chosen": -2.2707505226135254, + "logits/rejected": -2.312922477722168, + "logps/chosen": -230.23788452148438, + "logps/rejected": -319.2444152832031, + "loss": 0.0147, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.29869765043258667, - "rewards/margins": 9.640039443969727, - "rewards/rejected": -9.938738822937012, + "rewards/chosen": -0.7442948818206787, + "rewards/margins": 9.182458877563477, + "rewards/rejected": -9.92675495147705, "step": 4450 }, { - "epoch": 2.18, - "learning_rate": 1.5209916757148028e-07, - "logits/chosen": -2.9893269538879395, - "logits/rejected": -2.973662853240967, - "logps/chosen": -338.5328369140625, - "logps/rejected": -378.2593688964844, - "loss": 0.0135, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6207408905029297, - "rewards/margins": 9.914700508117676, - "rewards/rejected": -9.29395866394043, + "epoch": 2.27, + "learning_rate": 1.3545591559909568e-07, + "logits/chosen": -2.3916873931884766, + "logits/rejected": -2.3660058975219727, + "logps/chosen": -301.7834777832031, + "logps/rejected": -381.32659912109375, + "loss": 0.0333, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.15467780828475952, + "rewards/margins": 9.97493839263916, + "rewards/rejected": -9.820260047912598, "step": 4460 }, { - "epoch": 2.18, - "learning_rate": 1.5119435396308361e-07, - "logits/chosen": -2.9281349182128906, - "logits/rejected": -2.932262420654297, - "logps/chosen": -349.5906066894531, - "logps/rejected": -409.53955078125, - "loss": 0.0103, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.2635531425476074, - "rewards/margins": 10.38292121887207, - "rewards/rejected": -9.119367599487305, + "epoch": 2.27, + "learning_rate": 1.3451394122079879e-07, + "logits/chosen": -2.2422537803649902, + "logits/rejected": -2.285167694091797, + "logps/chosen": -299.6961975097656, + "logps/rejected": -340.37255859375, + "loss": 0.0195, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5292568802833557, + "rewards/margins": 8.242840766906738, + "rewards/rejected": -8.77209758758545, "step": 4470 }, { - "epoch": 2.19, - "learning_rate": 1.5028954035468695e-07, - "logits/chosen": -2.8525052070617676, - "logits/rejected": -2.8899142742156982, - "logps/chosen": -366.3213806152344, - "logps/rejected": -380.4151611328125, - "loss": 0.068, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7513121366500854, - "rewards/margins": 9.868281364440918, - "rewards/rejected": -9.11697006225586, + "epoch": 2.28, + "learning_rate": 1.3357196684250189e-07, + "logits/chosen": -2.368232250213623, + "logits/rejected": -2.367140293121338, + "logps/chosen": -251.2327880859375, + "logps/rejected": -353.19757080078125, + "loss": 0.0081, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.07832960784435272, + "rewards/margins": 9.432154655456543, + "rewards/rejected": -9.353825569152832, "step": 4480 }, { - "epoch": 2.19, - "learning_rate": 1.4938472674629026e-07, - "logits/chosen": -2.8318910598754883, - "logits/rejected": -2.886634588241577, - "logps/chosen": -313.30108642578125, - "logps/rejected": -397.8101806640625, - "loss": 0.0167, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.06532680988311768, - "rewards/margins": 9.928974151611328, - "rewards/rejected": -9.8636474609375, + "epoch": 2.28, + "learning_rate": 1.3262999246420499e-07, + "logits/chosen": -2.3377442359924316, + "logits/rejected": -2.3477752208709717, + "logps/chosen": -290.51300048828125, + "logps/rejected": -340.4662170410156, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.2261306792497635, + "rewards/margins": 9.160150527954102, + "rewards/rejected": -8.934020042419434, "step": 4490 }, { - "epoch": 2.2, - "learning_rate": 1.484799131378936e-07, - "logits/chosen": -2.8818166255950928, - "logits/rejected": -2.9010729789733887, - "logps/chosen": -275.638427734375, - "logps/rejected": -346.5643615722656, - "loss": 0.0091, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.8022880554199219, - "rewards/margins": 10.661927223205566, - "rewards/rejected": -9.859638214111328, + "epoch": 2.29, + "learning_rate": 1.3168801808590806e-07, + "logits/chosen": -2.2825725078582764, + "logits/rejected": -2.302478313446045, + "logps/chosen": -274.8885803222656, + "logps/rejected": -333.46563720703125, + "loss": 0.0116, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.43997201323509216, + "rewards/margins": 8.661720275878906, + "rewards/rejected": -9.101692199707031, "step": 4500 }, { - "epoch": 2.2, - "eval_logits/chosen": -2.845259428024292, - "eval_logits/rejected": -2.8419833183288574, - "eval_logps/chosen": -336.50592041015625, - "eval_logps/rejected": -331.6121826171875, - "eval_loss": 0.6282519698143005, - "eval_rewards/accuracies": 0.7849264740943909, - "eval_rewards/chosen": -2.1644909381866455, - "eval_rewards/margins": 4.043414115905762, - "eval_rewards/rejected": -6.207904815673828, - "eval_runtime": 303.3072, - "eval_samples_per_second": 7.161, - "eval_steps_per_second": 0.448, + "epoch": 2.29, + "eval_logits/chosen": -2.4131815433502197, + "eval_logits/rejected": -2.4787368774414062, + "eval_logps/chosen": -311.6757507324219, + "eval_logps/rejected": -320.0794677734375, + "eval_loss": 0.7127760648727417, + "eval_rewards/accuracies": 0.7461832165718079, + "eval_rewards/chosen": -3.34252667427063, + "eval_rewards/margins": 2.8519062995910645, + "eval_rewards/rejected": -6.194432735443115, + "eval_runtime": 297.542, + "eval_samples_per_second": 7.011, + "eval_steps_per_second": 0.44, "step": 4500 }, { - "epoch": 2.2, - "learning_rate": 1.4757509952949693e-07, - "logits/chosen": -2.8793752193450928, - "logits/rejected": -2.886204242706299, - "logps/chosen": -360.40264892578125, - "logps/rejected": -372.06292724609375, - "loss": 0.0067, + "epoch": 2.29, + "learning_rate": 1.3074604370761113e-07, + "logits/chosen": -2.287743091583252, + "logits/rejected": -2.327247142791748, + "logps/chosen": -287.9313049316406, + "logps/rejected": -366.33245849609375, + "loss": 0.0086, "rewards/accuracies": 1.0, - "rewards/chosen": 1.444625735282898, - "rewards/margins": 9.87576961517334, - "rewards/rejected": -8.431144714355469, + "rewards/chosen": -0.5075663328170776, + "rewards/margins": 8.495210647583008, + "rewards/rejected": -9.002776145935059, "step": 4510 }, { - "epoch": 2.21, - "learning_rate": 1.4667028592110026e-07, - "logits/chosen": -2.822070598602295, - "logits/rejected": -2.8870255947113037, - "logps/chosen": -297.08160400390625, - "logps/rejected": -357.68939208984375, - "loss": 0.0098, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5277560949325562, - "rewards/margins": 9.454710006713867, - "rewards/rejected": -8.92695426940918, + "epoch": 2.3, + "learning_rate": 1.2980406932931423e-07, + "logits/chosen": -2.205138683319092, + "logits/rejected": -2.3599820137023926, + "logps/chosen": -263.99774169921875, + "logps/rejected": -360.87890625, + "loss": 0.0128, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2749487161636353, + "rewards/margins": 8.532350540161133, + "rewards/rejected": -9.80729866027832, "step": 4520 }, { - "epoch": 2.21, - "learning_rate": 1.4576547231270357e-07, - "logits/chosen": -2.8347156047821045, - "logits/rejected": -2.8576712608337402, - "logps/chosen": -290.74676513671875, - "logps/rejected": -335.58001708984375, - "loss": 0.0173, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.13412940502166748, - "rewards/margins": 8.751775741577148, - "rewards/rejected": -8.885905265808105, + "epoch": 2.3, + "learning_rate": 1.2886209495101734e-07, + "logits/chosen": -2.2959253787994385, + "logits/rejected": -2.1113479137420654, + "logps/chosen": -279.8739318847656, + "logps/rejected": -370.27825927734375, + "loss": 0.0333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19222494959831238, + "rewards/margins": 9.703863143920898, + "rewards/rejected": -9.896089553833008, "step": 4530 }, { - "epoch": 2.22, - "learning_rate": 1.448606587043069e-07, - "logits/chosen": -2.841374635696411, - "logits/rejected": -2.846250057220459, - "logps/chosen": -281.97137451171875, - "logps/rejected": -362.99908447265625, - "loss": 0.0189, + "epoch": 2.31, + "learning_rate": 1.279201205727204e-07, + "logits/chosen": -2.155374526977539, + "logits/rejected": -2.2465829849243164, + "logps/chosen": -243.92562866210938, + "logps/rejected": -288.19195556640625, + "loss": 0.0115, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.23653677105903625, - "rewards/margins": 9.329381942749023, - "rewards/rejected": -9.092846870422363, + "rewards/chosen": -0.7421571016311646, + "rewards/margins": 7.734604835510254, + "rewards/rejected": -8.476762771606445, "step": 4540 }, { - "epoch": 2.22, - "learning_rate": 1.4395584509591024e-07, - "logits/chosen": -2.8757505416870117, - "logits/rejected": -2.899780750274658, - "logps/chosen": -267.17706298828125, - "logps/rejected": -317.1231994628906, - "loss": 0.0291, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.21638791263103485, - "rewards/margins": 8.885756492614746, - "rewards/rejected": -8.669368743896484, + "epoch": 2.31, + "learning_rate": 1.269781461944235e-07, + "logits/chosen": -2.194904327392578, + "logits/rejected": -2.2567036151885986, + "logps/chosen": -240.79861450195312, + "logps/rejected": -324.8790283203125, + "loss": 0.0296, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.22452616691589355, + "rewards/margins": 8.310542106628418, + "rewards/rejected": -8.53506851196289, "step": 4550 }, { - "epoch": 2.23, - "learning_rate": 1.4305103148751358e-07, - "logits/chosen": -2.833974838256836, - "logits/rejected": -2.841844081878662, - "logps/chosen": -317.38494873046875, - "logps/rejected": -386.83441162109375, - "loss": 0.0304, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.2666037976741791, - "rewards/margins": 9.033517837524414, - "rewards/rejected": -9.300122261047363, + "epoch": 2.32, + "learning_rate": 1.260361718161266e-07, + "logits/chosen": -2.3649609088897705, + "logits/rejected": -2.355762004852295, + "logps/chosen": -291.94244384765625, + "logps/rejected": -322.20025634765625, + "loss": 0.0271, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.1271209418773651, + "rewards/margins": 8.356206893920898, + "rewards/rejected": -8.229085922241211, "step": 4560 }, { - "epoch": 2.23, - "learning_rate": 1.4214621787911691e-07, - "logits/chosen": -2.8718085289001465, - "logits/rejected": -2.8632290363311768, - "logps/chosen": -345.99639892578125, - "logps/rejected": -356.02130126953125, - "loss": 0.0063, + "epoch": 2.32, + "learning_rate": 1.2509419743782968e-07, + "logits/chosen": -2.241570472717285, + "logits/rejected": -2.2535672187805176, + "logps/chosen": -311.3485412597656, + "logps/rejected": -336.29437255859375, + "loss": 0.0085, "rewards/accuracies": 1.0, - "rewards/chosen": 0.8531624674797058, - "rewards/margins": 9.622444152832031, - "rewards/rejected": -8.769281387329102, + "rewards/chosen": 0.06302668154239655, + "rewards/margins": 8.881575584411621, + "rewards/rejected": -8.818550109863281, "step": 4570 }, { - "epoch": 2.24, - "learning_rate": 1.4124140427072022e-07, - "logits/chosen": -2.8380565643310547, - "logits/rejected": -2.826584815979004, - "logps/chosen": -325.34222412109375, - "logps/rejected": -351.6784362792969, - "loss": 0.0173, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.45839446783065796, - "rewards/margins": 9.521805763244629, - "rewards/rejected": -9.063410758972168, + "epoch": 2.33, + "learning_rate": 1.2415222305953278e-07, + "logits/chosen": -2.225210428237915, + "logits/rejected": -2.2509846687316895, + "logps/chosen": -288.7023010253906, + "logps/rejected": -362.2086181640625, + "loss": 0.0433, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.6437242031097412, + "rewards/margins": 9.244821548461914, + "rewards/rejected": -9.888544082641602, "step": 4580 }, { - "epoch": 2.24, - "learning_rate": 1.4033659066232356e-07, - "logits/chosen": -2.772315740585327, - "logits/rejected": -2.739224910736084, - "logps/chosen": -263.865234375, - "logps/rejected": -331.3865051269531, - "loss": 0.01, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.13032841682434082, - "rewards/margins": 9.29617977142334, - "rewards/rejected": -9.426507949829102, + "epoch": 2.33, + "learning_rate": 1.2321024868123586e-07, + "logits/chosen": -2.152923822402954, + "logits/rejected": -2.3054864406585693, + "logps/chosen": -254.2602081298828, + "logps/rejected": -339.173095703125, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7262587547302246, + "rewards/margins": 9.192373275756836, + "rewards/rejected": -9.918632507324219, "step": 4590 }, { - "epoch": 2.25, - "learning_rate": 1.394317770539269e-07, - "logits/chosen": -2.824910879135132, - "logits/rejected": -2.8294365406036377, - "logps/chosen": -392.77923583984375, - "logps/rejected": -348.9220275878906, - "loss": 0.0493, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.1558719426393509, - "rewards/margins": 9.632447242736816, - "rewards/rejected": -9.476574897766113, + "epoch": 2.34, + "learning_rate": 1.2226827430293896e-07, + "logits/chosen": -2.3011200428009033, + "logits/rejected": -2.2208499908447266, + "logps/chosen": -275.1327209472656, + "logps/rejected": -350.51617431640625, + "loss": 0.0077, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.2207508087158203, + "rewards/margins": 9.613337516784668, + "rewards/rejected": -9.834087371826172, "step": 4600 }, { - "epoch": 2.25, - "eval_logits/chosen": -2.8273589611053467, - "eval_logits/rejected": -2.822690963745117, - "eval_logps/chosen": -337.8106994628906, - "eval_logps/rejected": -333.7635192871094, - "eval_loss": 0.6470258235931396, - "eval_rewards/accuracies": 0.7775735259056091, - "eval_rewards/chosen": -2.29496693611145, - "eval_rewards/margins": 4.128070831298828, - "eval_rewards/rejected": -6.423038482666016, - "eval_runtime": 303.1192, - "eval_samples_per_second": 7.165, - "eval_steps_per_second": 0.449, + "epoch": 2.34, + "eval_logits/chosen": -2.377899169921875, + "eval_logits/rejected": -2.4448626041412354, + "eval_logps/chosen": -310.5561828613281, + "eval_logps/rejected": -319.6102294921875, + "eval_loss": 0.7218549847602844, + "eval_rewards/accuracies": 0.7480915784835815, + "eval_rewards/chosen": -3.2305691242218018, + "eval_rewards/margins": 2.916940212249756, + "eval_rewards/rejected": -6.147509574890137, + "eval_runtime": 301.7525, + "eval_samples_per_second": 6.913, + "eval_steps_per_second": 0.434, "step": 4600 }, { - "epoch": 2.25, - "learning_rate": 1.3852696344553023e-07, - "logits/chosen": -2.8787920475006104, - "logits/rejected": -2.8925797939300537, - "logps/chosen": -268.21502685546875, - "logps/rejected": -324.3523864746094, - "loss": 0.0181, + "epoch": 2.34, + "learning_rate": 1.2132629992464206e-07, + "logits/chosen": -2.305997848510742, + "logits/rejected": -2.269470691680908, + "logps/chosen": -296.7271728515625, + "logps/rejected": -344.4403991699219, + "loss": 0.0099, "rewards/accuracies": 1.0, - "rewards/chosen": -0.17401398718357086, - "rewards/margins": 9.012751579284668, - "rewards/rejected": -9.186766624450684, + "rewards/chosen": 0.29369282722473145, + "rewards/margins": 9.259397506713867, + "rewards/rejected": -8.965703964233398, "step": 4610 }, { - "epoch": 2.26, - "learning_rate": 1.3762214983713354e-07, - "logits/chosen": -2.9004530906677246, - "logits/rejected": -2.9169278144836426, - "logps/chosen": -318.0021667480469, - "logps/rejected": -382.9727478027344, - "loss": 0.0329, + "epoch": 2.35, + "learning_rate": 1.2038432554634513e-07, + "logits/chosen": -2.270097255706787, + "logits/rejected": -2.275355577468872, + "logps/chosen": -305.75482177734375, + "logps/rejected": -370.3743591308594, + "loss": 0.017, "rewards/accuracies": 1.0, - "rewards/chosen": 0.3710877299308777, - "rewards/margins": 9.797540664672852, - "rewards/rejected": -9.426451683044434, + "rewards/chosen": -0.736997663974762, + "rewards/margins": 8.412660598754883, + "rewards/rejected": -9.149658203125, "step": 4620 }, { - "epoch": 2.26, - "learning_rate": 1.3671733622873687e-07, - "logits/chosen": -2.8712270259857178, - "logits/rejected": -2.924455404281616, - "logps/chosen": -292.1301574707031, - "logps/rejected": -372.7878723144531, - "loss": 0.0126, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9493836164474487, - "rewards/margins": 10.169816017150879, - "rewards/rejected": -9.220433235168457, + "epoch": 2.36, + "learning_rate": 1.1944235116804823e-07, + "logits/chosen": -2.1765987873077393, + "logits/rejected": -2.2477710247039795, + "logps/chosen": -234.5129852294922, + "logps/rejected": -297.01812744140625, + "loss": 0.0089, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9503382444381714, + "rewards/margins": 8.9337158203125, + "rewards/rejected": -9.884054183959961, "step": 4630 }, { - "epoch": 2.27, - "learning_rate": 1.358125226203402e-07, - "logits/chosen": -2.838878631591797, - "logits/rejected": -2.8224871158599854, - "logps/chosen": -336.27984619140625, - "logps/rejected": -372.06744384765625, - "loss": 0.0088, + "epoch": 2.36, + "learning_rate": 1.1850037678975132e-07, + "logits/chosen": -2.2448458671569824, + "logits/rejected": -2.1628146171569824, + "logps/chosen": -257.5953674316406, + "logps/rejected": -362.9079284667969, + "loss": 0.0075, "rewards/accuracies": 1.0, - "rewards/chosen": 0.08983304351568222, - "rewards/margins": 9.376413345336914, - "rewards/rejected": -9.286581039428711, + "rewards/chosen": -0.7563697099685669, + "rewards/margins": 10.294295310974121, + "rewards/rejected": -11.050664901733398, "step": 4640 }, { - "epoch": 2.27, - "learning_rate": 1.3490770901194354e-07, - "logits/chosen": -2.8875160217285156, - "logits/rejected": -2.8903188705444336, - "logps/chosen": -379.08746337890625, - "logps/rejected": -392.19952392578125, - "loss": 0.0145, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.42943936586380005, - "rewards/margins": 10.64572811126709, - "rewards/rejected": -10.216289520263672, + "epoch": 2.37, + "learning_rate": 1.175584024114544e-07, + "logits/chosen": -2.159095048904419, + "logits/rejected": -2.044412612915039, + "logps/chosen": -258.42095947265625, + "logps/rejected": -369.79595947265625, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45193544030189514, + "rewards/margins": 10.678693771362305, + "rewards/rejected": -11.13062858581543, "step": 4650 }, { - "epoch": 2.28, - "learning_rate": 1.3400289540354688e-07, - "logits/chosen": -2.8476173877716064, - "logits/rejected": -2.839707374572754, - "logps/chosen": -293.92919921875, - "logps/rejected": -381.62188720703125, - "loss": 0.0156, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8184321522712708, - "rewards/margins": 10.19401741027832, - "rewards/rejected": -9.375585556030273, + "epoch": 2.37, + "learning_rate": 1.166164280331575e-07, + "logits/chosen": -2.229854106903076, + "logits/rejected": -2.2149221897125244, + "logps/chosen": -253.66671752929688, + "logps/rejected": -366.19329833984375, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015896644443273544, + "rewards/margins": 10.538129806518555, + "rewards/rejected": -10.55402660369873, "step": 4660 }, { - "epoch": 2.28, - "learning_rate": 1.330980817951502e-07, - "logits/chosen": -2.854004144668579, - "logits/rejected": -2.8539743423461914, - "logps/chosen": -283.37518310546875, - "logps/rejected": -344.537841796875, - "loss": 0.0103, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8604044914245605, - "rewards/margins": 9.976469039916992, - "rewards/rejected": -9.11606502532959, + "epoch": 2.38, + "learning_rate": 1.1567445365486058e-07, + "logits/chosen": -2.252213478088379, + "logits/rejected": -2.1887106895446777, + "logps/chosen": -289.4987487792969, + "logps/rejected": -357.27313232421875, + "loss": 0.0265, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.4829696714878082, + "rewards/margins": 9.562679290771484, + "rewards/rejected": -10.045648574829102, "step": 4670 }, { - "epoch": 2.29, - "learning_rate": 1.3219326818675352e-07, - "logits/chosen": -2.837240695953369, - "logits/rejected": -2.841235637664795, - "logps/chosen": -325.87823486328125, - "logps/rejected": -347.0536804199219, - "loss": 0.0053, + "epoch": 2.38, + "learning_rate": 1.1473247927656367e-07, + "logits/chosen": -2.224421262741089, + "logits/rejected": -2.2251248359680176, + "logps/chosen": -284.118896484375, + "logps/rejected": -349.9617919921875, + "loss": 0.0229, "rewards/accuracies": 1.0, - "rewards/chosen": 0.37723517417907715, - "rewards/margins": 8.633864402770996, - "rewards/rejected": -8.256629943847656, + "rewards/chosen": 0.6989560127258301, + "rewards/margins": 10.372453689575195, + "rewards/rejected": -9.673498153686523, "step": 4680 }, { - "epoch": 2.29, - "learning_rate": 1.3128845457835686e-07, - "logits/chosen": -2.8758559226989746, - "logits/rejected": -2.876891613006592, - "logps/chosen": -303.10357666015625, - "logps/rejected": -330.4309387207031, - "loss": 0.0105, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9764778017997742, - "rewards/margins": 10.443432807922363, - "rewards/rejected": -9.46695327758789, + "epoch": 2.39, + "learning_rate": 1.1379050489826676e-07, + "logits/chosen": -2.155917167663574, + "logits/rejected": -2.2038064002990723, + "logps/chosen": -300.345458984375, + "logps/rejected": -325.0493469238281, + "loss": 0.0141, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.3622466027736664, + "rewards/margins": 8.947761535644531, + "rewards/rejected": -9.310009002685547, "step": 4690 }, { - "epoch": 2.3, - "learning_rate": 1.303836409699602e-07, - "logits/chosen": -2.865769624710083, - "logits/rejected": -2.8170485496520996, - "logps/chosen": -333.3045349121094, - "logps/rejected": -277.3068542480469, - "loss": 0.0155, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.17634974420070648, - "rewards/margins": 8.171499252319336, - "rewards/rejected": -8.347848892211914, + "epoch": 2.39, + "learning_rate": 1.1284853051996986e-07, + "logits/chosen": -2.1967692375183105, + "logits/rejected": -2.1822071075439453, + "logps/chosen": -286.33306884765625, + "logps/rejected": -323.97869873046875, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10714861005544662, + "rewards/margins": 9.238419532775879, + "rewards/rejected": -9.131271362304688, "step": 4700 }, { - "epoch": 2.3, - "eval_logits/chosen": -2.845940589904785, - "eval_logits/rejected": -2.840355157852173, - "eval_logps/chosen": -335.2705078125, - "eval_logps/rejected": -330.3618469238281, - "eval_loss": 0.6361545324325562, - "eval_rewards/accuracies": 0.7849264740943909, - "eval_rewards/chosen": -2.0409507751464844, - "eval_rewards/margins": 4.0419206619262695, - "eval_rewards/rejected": -6.082871437072754, - "eval_runtime": 302.7325, - "eval_samples_per_second": 7.175, - "eval_steps_per_second": 0.449, + "epoch": 2.39, + "eval_logits/chosen": -2.31742000579834, + "eval_logits/rejected": -2.3860576152801514, + "eval_logps/chosen": -313.7193603515625, + "eval_logps/rejected": -323.3455810546875, + "eval_loss": 0.7450771331787109, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -3.546886444091797, + "eval_rewards/margins": 2.974155902862549, + "eval_rewards/rejected": -6.521042346954346, + "eval_runtime": 297.4748, + "eval_samples_per_second": 7.012, + "eval_steps_per_second": 0.44, "step": 4700 }, { - "epoch": 2.3, - "learning_rate": 1.294788273615635e-07, - "logits/chosen": -2.8640780448913574, - "logits/rejected": -2.8650619983673096, - "logps/chosen": -271.83380126953125, - "logps/rejected": -310.97784423828125, - "loss": 0.0095, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.39653053879737854, - "rewards/margins": 9.251490592956543, - "rewards/rejected": -8.854959487915039, + "epoch": 2.4, + "learning_rate": 1.1190655614167293e-07, + "logits/chosen": -2.294706344604492, + "logits/rejected": -2.340592861175537, + "logps/chosen": -260.90826416015625, + "logps/rejected": -362.89837646484375, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9304442405700684, + "rewards/margins": 8.845898628234863, + "rewards/rejected": -9.776342391967773, "step": 4710 }, { - "epoch": 2.31, - "learning_rate": 1.2857401375316684e-07, - "logits/chosen": -2.852733850479126, - "logits/rejected": -2.8313260078430176, - "logps/chosen": -355.8695373535156, - "logps/rejected": -351.5999755859375, - "loss": 0.0389, + "epoch": 2.4, + "learning_rate": 1.1096458176337603e-07, + "logits/chosen": -2.218076467514038, + "logits/rejected": -2.146393060684204, + "logps/chosen": -288.2243347167969, + "logps/rejected": -399.74151611328125, + "loss": 0.0059, "rewards/accuracies": 1.0, - "rewards/chosen": 0.47908416390419006, - "rewards/margins": 9.31514835357666, - "rewards/rejected": -8.836065292358398, + "rewards/chosen": -0.29963019490242004, + "rewards/margins": 9.852890014648438, + "rewards/rejected": -10.152520179748535, "step": 4720 }, { - "epoch": 2.31, - "learning_rate": 1.2766920014477017e-07, - "logits/chosen": -2.920712471008301, - "logits/rejected": -2.929276943206787, - "logps/chosen": -322.2744445800781, - "logps/rejected": -392.18536376953125, - "loss": 0.0107, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.3544652462005615, - "rewards/margins": 11.739262580871582, - "rewards/rejected": -10.384796142578125, + "epoch": 2.41, + "learning_rate": 1.1002260738507912e-07, + "logits/chosen": -2.213052272796631, + "logits/rejected": -2.272000551223755, + "logps/chosen": -258.693603515625, + "logps/rejected": -314.71307373046875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.362382173538208, + "rewards/margins": 9.149022102355957, + "rewards/rejected": -9.511404037475586, "step": 4730 }, { - "epoch": 2.32, - "learning_rate": 1.267643865363735e-07, - "logits/chosen": -2.861130475997925, - "logits/rejected": -2.8687164783477783, - "logps/chosen": -314.52203369140625, - "logps/rejected": -377.7655029296875, - "loss": 0.0098, + "epoch": 2.41, + "learning_rate": 1.0908063300678221e-07, + "logits/chosen": -2.230384349822998, + "logits/rejected": -2.141641855239868, + "logps/chosen": -285.4044494628906, + "logps/rejected": -359.29266357421875, + "loss": 0.0135, "rewards/accuracies": 1.0, - "rewards/chosen": 0.2879761755466461, - "rewards/margins": 8.935873031616211, - "rewards/rejected": -8.647897720336914, + "rewards/chosen": -1.2375614643096924, + "rewards/margins": 8.541139602661133, + "rewards/rejected": -9.77869987487793, "step": 4740 }, { - "epoch": 2.32, - "learning_rate": 1.2585957292797684e-07, - "logits/chosen": -2.8522419929504395, - "logits/rejected": -2.8753440380096436, - "logps/chosen": -304.6292419433594, - "logps/rejected": -354.695556640625, - "loss": 0.0188, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.6933711767196655, - "rewards/margins": 10.05359935760498, - "rewards/rejected": -9.360227584838867, + "epoch": 2.42, + "learning_rate": 1.081386586284853e-07, + "logits/chosen": -2.3513834476470947, + "logits/rejected": -2.3596320152282715, + "logps/chosen": -266.305908203125, + "logps/rejected": -380.3756103515625, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33188745379447937, + "rewards/margins": 9.921003341674805, + "rewards/rejected": -10.252891540527344, "step": 4750 }, { - "epoch": 2.33, - "learning_rate": 1.2495475931958015e-07, - "logits/chosen": -2.8773133754730225, - "logits/rejected": -2.900674819946289, - "logps/chosen": -310.9174499511719, - "logps/rejected": -365.1013488769531, - "loss": 0.0229, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5974355936050415, - "rewards/margins": 10.505298614501953, - "rewards/rejected": -9.907862663269043, + "epoch": 2.42, + "learning_rate": 1.071966842501884e-07, + "logits/chosen": -2.2266974449157715, + "logits/rejected": -2.2696118354797363, + "logps/chosen": -298.6575622558594, + "logps/rejected": -355.7513427734375, + "loss": 0.0111, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5612926483154297, + "rewards/margins": 9.986989974975586, + "rewards/rejected": -10.548282623291016, "step": 4760 }, { - "epoch": 2.33, - "learning_rate": 1.240499457111835e-07, - "logits/chosen": -2.8433022499084473, - "logits/rejected": -2.8883471488952637, - "logps/chosen": -349.7984924316406, - "logps/rejected": -370.6795959472656, - "loss": 0.0073, + "epoch": 2.43, + "learning_rate": 1.0625470987189147e-07, + "logits/chosen": -2.29338002204895, + "logits/rejected": -2.3379619121551514, + "logps/chosen": -266.6180114746094, + "logps/rejected": -322.62994384765625, + "loss": 0.0148, "rewards/accuracies": 1.0, - "rewards/chosen": 0.5922546982765198, - "rewards/margins": 9.98127555847168, - "rewards/rejected": -9.389020919799805, + "rewards/chosen": -0.5768706202507019, + "rewards/margins": 9.321121215820312, + "rewards/rejected": -9.897993087768555, "step": 4770 }, { - "epoch": 2.34, - "learning_rate": 1.2314513210278682e-07, - "logits/chosen": -2.7814230918884277, - "logits/rejected": -2.8174662590026855, - "logps/chosen": -336.8526916503906, - "logps/rejected": -393.45050048828125, - "loss": 0.0109, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.3478777408599854, - "rewards/margins": 10.414642333984375, - "rewards/rejected": -9.066762924194336, + "epoch": 2.43, + "learning_rate": 1.0531273549359457e-07, + "logits/chosen": -2.1704368591308594, + "logits/rejected": -2.236093044281006, + "logps/chosen": -266.098388671875, + "logps/rejected": -318.8455505371094, + "loss": 0.0137, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.9614885449409485, + "rewards/margins": 9.26085090637207, + "rewards/rejected": -10.222338676452637, "step": 4780 }, { - "epoch": 2.34, - "learning_rate": 1.2224031849439016e-07, - "logits/chosen": -2.8909308910369873, - "logits/rejected": -2.8900511264801025, - "logps/chosen": -357.632568359375, - "logps/rejected": -374.77984619140625, - "loss": 0.0142, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9155074954032898, - "rewards/margins": 9.64457893371582, - "rewards/rejected": -8.729072570800781, + "epoch": 2.44, + "learning_rate": 1.0437076111529766e-07, + "logits/chosen": -2.2239723205566406, + "logits/rejected": -2.147284984588623, + "logps/chosen": -307.8612976074219, + "logps/rejected": -340.2486572265625, + "loss": 0.0284, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.18168917298316956, + "rewards/margins": 9.09406566619873, + "rewards/rejected": -9.275754928588867, "step": 4790 }, { - "epoch": 2.34, - "learning_rate": 1.2133550488599347e-07, - "logits/chosen": -2.854688882827759, - "logits/rejected": -2.8638267517089844, - "logps/chosen": -348.04693603515625, - "logps/rejected": -336.7369689941406, - "loss": 0.0061, + "epoch": 2.44, + "learning_rate": 1.0342878673700074e-07, + "logits/chosen": -2.236088991165161, + "logits/rejected": -2.2426741123199463, + "logps/chosen": -297.51702880859375, + "logps/rejected": -378.7740173339844, + "loss": 0.0112, "rewards/accuracies": 1.0, - "rewards/chosen": -0.07602433115243912, - "rewards/margins": 9.697509765625, - "rewards/rejected": -9.773534774780273, + "rewards/chosen": -0.6450713872909546, + "rewards/margins": 9.456080436706543, + "rewards/rejected": -10.101151466369629, "step": 4800 }, { - "epoch": 2.34, - "eval_logits/chosen": -2.833055019378662, - "eval_logits/rejected": -2.8279426097869873, - "eval_logps/chosen": -336.2384948730469, - "eval_logps/rejected": -331.8154602050781, - "eval_loss": 0.6448474526405334, - "eval_rewards/accuracies": 0.7867646813392639, - "eval_rewards/chosen": -2.1377487182617188, - "eval_rewards/margins": 4.090484142303467, - "eval_rewards/rejected": -6.2282328605651855, - "eval_runtime": 303.3647, - "eval_samples_per_second": 7.16, - "eval_steps_per_second": 0.448, + "epoch": 2.44, + "eval_logits/chosen": -2.3242154121398926, + "eval_logits/rejected": -2.393857955932617, + "eval_logps/chosen": -313.05194091796875, + "eval_logps/rejected": -322.5316467285156, + "eval_loss": 0.7546879649162292, + "eval_rewards/accuracies": 0.7423664331436157, + "eval_rewards/chosen": -3.4801478385925293, + "eval_rewards/margins": 2.959501266479492, + "eval_rewards/rejected": -6.439650058746338, + "eval_runtime": 301.9744, + "eval_samples_per_second": 6.908, + "eval_steps_per_second": 0.434, "step": 4800 }, { - "epoch": 2.35, - "learning_rate": 1.204306912775968e-07, - "logits/chosen": -2.823441743850708, - "logits/rejected": -2.7979588508605957, - "logps/chosen": -383.358154296875, - "logps/rejected": -407.3774108886719, - "loss": 0.0075, + "epoch": 2.45, + "learning_rate": 1.0248681235870383e-07, + "logits/chosen": -2.2089924812316895, + "logits/rejected": -2.2755725383758545, + "logps/chosen": -270.680908203125, + "logps/rejected": -341.1857604980469, + "loss": 0.0045, "rewards/accuracies": 1.0, - "rewards/chosen": 0.8536338806152344, - "rewards/margins": 10.711227416992188, - "rewards/rejected": -9.857593536376953, + "rewards/chosen": -0.691789448261261, + "rewards/margins": 8.840271949768066, + "rewards/rejected": -9.532060623168945, "step": 4810 }, { - "epoch": 2.35, - "learning_rate": 1.1952587766920014e-07, - "logits/chosen": -2.875776529312134, - "logits/rejected": -2.833350658416748, - "logps/chosen": -274.1474304199219, - "logps/rejected": -296.96600341796875, - "loss": 0.0174, + "epoch": 2.45, + "learning_rate": 1.0154483798040693e-07, + "logits/chosen": -2.3454391956329346, + "logits/rejected": -2.2591652870178223, + "logps/chosen": -321.9779968261719, + "logps/rejected": -342.6640930175781, + "loss": 0.0223, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.11717446893453598, - "rewards/margins": 8.582273483276367, - "rewards/rejected": -8.699448585510254, + "rewards/chosen": -0.39104217290878296, + "rewards/margins": 9.368048667907715, + "rewards/rejected": -9.759092330932617, "step": 4820 }, { - "epoch": 2.36, - "learning_rate": 1.1862106406080346e-07, - "logits/chosen": -2.8261425495147705, - "logits/rejected": -2.8836371898651123, - "logps/chosen": -251.71182250976562, - "logps/rejected": -333.8467712402344, - "loss": 0.007, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3092392086982727, - "rewards/margins": 9.181299209594727, - "rewards/rejected": -9.490538597106934, + "epoch": 2.46, + "learning_rate": 1.0060286360211002e-07, + "logits/chosen": -2.259584665298462, + "logits/rejected": -2.161665439605713, + "logps/chosen": -286.1450500488281, + "logps/rejected": -370.9566650390625, + "loss": 0.0087, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.1616705358028412, + "rewards/margins": 9.993936538696289, + "rewards/rejected": -10.155607223510742, "step": 4830 }, { - "epoch": 2.36, - "learning_rate": 1.177162504524068e-07, - "logits/chosen": -2.874056577682495, - "logits/rejected": -2.863772392272949, - "logps/chosen": -320.1214294433594, - "logps/rejected": -389.7304382324219, - "loss": 0.011, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.47916120290756226, - "rewards/margins": 10.58686637878418, - "rewards/rejected": -10.107705116271973, + "epoch": 2.46, + "learning_rate": 9.96608892238131e-08, + "logits/chosen": -2.105440378189087, + "logits/rejected": -2.151834487915039, + "logps/chosen": -306.2788391113281, + "logps/rejected": -394.65850830078125, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5894113779067993, + "rewards/margins": 9.950549125671387, + "rewards/rejected": -10.539959907531738, "step": 4840 }, { - "epoch": 2.37, - "learning_rate": 1.1681143684401013e-07, - "logits/chosen": -2.846298933029175, - "logits/rejected": -2.8311657905578613, - "logps/chosen": -319.12823486328125, - "logps/rejected": -302.36932373046875, - "loss": 0.0174, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.2882283329963684, - "rewards/margins": 9.603487968444824, - "rewards/rejected": -9.31525993347168, + "epoch": 2.47, + "learning_rate": 9.87189148455162e-08, + "logits/chosen": -2.1507091522216797, + "logits/rejected": -2.1006345748901367, + "logps/chosen": -300.9240417480469, + "logps/rejected": -418.56134033203125, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.125248670578003, + "rewards/margins": 9.699620246887207, + "rewards/rejected": -10.824868202209473, "step": 4850 }, { - "epoch": 2.37, - "learning_rate": 1.1590662323561345e-07, - "logits/chosen": -2.835400342941284, - "logits/rejected": -2.833388328552246, - "logps/chosen": -315.0025634765625, - "logps/rejected": -364.9981384277344, - "loss": 0.0128, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.3068752586841583, - "rewards/margins": 9.366029739379883, - "rewards/rejected": -9.05915355682373, + "epoch": 2.47, + "learning_rate": 9.77769404672193e-08, + "logits/chosen": -2.087984561920166, + "logits/rejected": -2.0346813201904297, + "logps/chosen": -281.7013854980469, + "logps/rejected": -378.5484619140625, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6053246855735779, + "rewards/margins": 9.81657886505127, + "rewards/rejected": -10.421903610229492, "step": 4860 }, { - "epoch": 2.38, - "learning_rate": 1.1500180962721679e-07, - "logits/chosen": -2.930753231048584, - "logits/rejected": -2.9529433250427246, - "logps/chosen": -291.2989807128906, - "logps/rejected": -362.6711730957031, - "loss": 0.0253, + "epoch": 2.48, + "learning_rate": 9.683496608892237e-08, + "logits/chosen": -2.2168502807617188, + "logits/rejected": -2.277942419052124, + "logps/chosen": -280.87860107421875, + "logps/rejected": -340.9623107910156, + "loss": 0.0093, "rewards/accuracies": 1.0, - "rewards/chosen": 1.1055619716644287, - "rewards/margins": 9.6641845703125, - "rewards/rejected": -8.558622360229492, + "rewards/chosen": -0.9028668403625488, + "rewards/margins": 9.776205062866211, + "rewards/rejected": -10.679072380065918, "step": 4870 }, { - "epoch": 2.38, - "learning_rate": 1.1409699601882011e-07, - "logits/chosen": -2.9215879440307617, - "logits/rejected": -2.9042065143585205, - "logps/chosen": -270.09173583984375, - "logps/rejected": -296.0253601074219, - "loss": 0.0228, + "epoch": 2.48, + "learning_rate": 9.589299171062547e-08, + "logits/chosen": -2.1402547359466553, + "logits/rejected": -2.1672096252441406, + "logps/chosen": -326.22125244140625, + "logps/rejected": -379.2210998535156, + "loss": 0.0066, "rewards/accuracies": 1.0, - "rewards/chosen": -0.889201283454895, - "rewards/margins": 7.966219425201416, - "rewards/rejected": -8.85542106628418, + "rewards/chosen": -0.9656316637992859, + "rewards/margins": 9.55207633972168, + "rewards/rejected": -10.517707824707031, "step": 4880 }, { - "epoch": 2.39, - "learning_rate": 1.1319218241042345e-07, - "logits/chosen": -2.939760684967041, - "logits/rejected": -2.942763566970825, - "logps/chosen": -302.49322509765625, - "logps/rejected": -319.2682800292969, - "loss": 0.0069, + "epoch": 2.49, + "learning_rate": 9.495101733232856e-08, + "logits/chosen": -2.2046916484832764, + "logits/rejected": -2.1590428352355957, + "logps/chosen": -264.8122253417969, + "logps/rejected": -351.7208557128906, + "loss": 0.0159, "rewards/accuracies": 1.0, - "rewards/chosen": -0.21954341232776642, - "rewards/margins": 8.987138748168945, - "rewards/rejected": -9.206682205200195, + "rewards/chosen": -0.2652990221977234, + "rewards/margins": 10.268514633178711, + "rewards/rejected": -10.5338134765625, "step": 4890 }, { - "epoch": 2.39, - "learning_rate": 1.1228736880202677e-07, - "logits/chosen": -2.894141912460327, - "logits/rejected": -2.8808584213256836, - "logps/chosen": -322.8137512207031, - "logps/rejected": -331.19866943359375, - "loss": 0.0166, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.43441978096961975, - "rewards/margins": 8.915560722351074, - "rewards/rejected": -8.481142044067383, + "epoch": 2.49, + "learning_rate": 9.400904295403164e-08, + "logits/chosen": -2.1020796298980713, + "logits/rejected": -2.1482295989990234, + "logps/chosen": -239.36373901367188, + "logps/rejected": -300.88494873046875, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.654992938041687, + "rewards/margins": 9.085423469543457, + "rewards/rejected": -9.740415573120117, "step": 4900 }, { - "epoch": 2.39, - "eval_logits/chosen": -2.8704357147216797, - "eval_logits/rejected": -2.867837905883789, - "eval_logps/chosen": -334.728271484375, - "eval_logps/rejected": -329.568603515625, - "eval_loss": 0.6480796933174133, - "eval_rewards/accuracies": 0.7867646813392639, - "eval_rewards/chosen": -1.9867266416549683, - "eval_rewards/margins": 4.016818523406982, - "eval_rewards/rejected": -6.00354528427124, - "eval_runtime": 303.2527, - "eval_samples_per_second": 7.162, - "eval_steps_per_second": 0.448, + "epoch": 2.49, + "eval_logits/chosen": -2.283444404602051, + "eval_logits/rejected": -2.3523569107055664, + "eval_logps/chosen": -316.84600830078125, + "eval_logps/rejected": -326.6253356933594, + "eval_loss": 0.7691048383712769, + "eval_rewards/accuracies": 0.7442747950553894, + "eval_rewards/chosen": -3.8595566749572754, + "eval_rewards/margins": 2.989461660385132, + "eval_rewards/rejected": -6.84901762008667, + "eval_runtime": 297.4323, + "eval_samples_per_second": 7.013, + "eval_steps_per_second": 0.44, "step": 4900 }, { - "epoch": 2.4, - "learning_rate": 1.113825551936301e-07, - "logits/chosen": -2.9400696754455566, - "logits/rejected": -2.9435558319091797, - "logps/chosen": -342.7354736328125, - "logps/rejected": -325.6578674316406, - "loss": 0.0069, + "epoch": 2.5, + "learning_rate": 9.306706857573473e-08, + "logits/chosen": -2.2702176570892334, + "logits/rejected": -2.197672128677368, + "logps/chosen": -317.44647216796875, + "logps/rejected": -370.15740966796875, + "loss": 0.0087, "rewards/accuracies": 1.0, - "rewards/chosen": 1.2436249256134033, - "rewards/margins": 9.826736450195312, - "rewards/rejected": -8.583111763000488, + "rewards/chosen": -0.3453849256038666, + "rewards/margins": 8.828009605407715, + "rewards/rejected": -9.173395156860352, "step": 4910 }, { - "epoch": 2.4, - "learning_rate": 1.1047774158523343e-07, - "logits/chosen": -2.8447513580322266, - "logits/rejected": -2.8743577003479004, - "logps/chosen": -328.96014404296875, - "logps/rejected": -338.12640380859375, - "loss": 0.0138, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.7150461673736572, - "rewards/margins": 9.25133228302002, - "rewards/rejected": -8.536287307739258, + "epoch": 2.5, + "learning_rate": 9.212509419743783e-08, + "logits/chosen": -2.0067317485809326, + "logits/rejected": -2.053682565689087, + "logps/chosen": -284.0643005371094, + "logps/rejected": -405.646484375, + "loss": 0.0426, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2845280170440674, + "rewards/margins": 10.26252555847168, + "rewards/rejected": -11.547052383422852, "step": 4920 }, { - "epoch": 2.41, - "learning_rate": 1.0957292797683676e-07, - "logits/chosen": -2.849860668182373, - "logits/rejected": -2.881740093231201, - "logps/chosen": -297.6890563964844, - "logps/rejected": -348.7637939453125, - "loss": 0.0108, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.2231244146823883, - "rewards/margins": 8.894864082336426, - "rewards/rejected": -9.117988586425781, + "epoch": 2.51, + "learning_rate": 9.11831198191409e-08, + "logits/chosen": -2.069619655609131, + "logits/rejected": -2.014024257659912, + "logps/chosen": -285.60797119140625, + "logps/rejected": -365.84307861328125, + "loss": 0.0115, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.7954369783401489, + "rewards/margins": 9.284969329833984, + "rewards/rejected": -10.08040714263916, "step": 4930 }, { - "epoch": 2.41, - "learning_rate": 1.0866811436844011e-07, - "logits/chosen": -2.7739806175231934, - "logits/rejected": -2.815706491470337, - "logps/chosen": -326.9042053222656, - "logps/rejected": -368.6151428222656, - "loss": 0.0084, + "epoch": 2.51, + "learning_rate": 9.0241145440844e-08, + "logits/chosen": -2.1276021003723145, + "logits/rejected": -2.140718460083008, + "logps/chosen": -307.3009033203125, + "logps/rejected": -401.25823974609375, + "loss": 0.0081, "rewards/accuracies": 1.0, - "rewards/chosen": 0.9392449259757996, - "rewards/margins": 10.57219409942627, - "rewards/rejected": -9.632949829101562, + "rewards/chosen": -0.05557692050933838, + "rewards/margins": 10.433725357055664, + "rewards/rejected": -10.489302635192871, "step": 4940 }, { - "epoch": 2.42, - "learning_rate": 1.0776330076004343e-07, - "logits/chosen": -2.8920652866363525, - "logits/rejected": -2.898991584777832, - "logps/chosen": -298.92315673828125, - "logps/rejected": -355.5097961425781, - "loss": 0.0172, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.14010891318321228, - "rewards/margins": 9.62451457977295, - "rewards/rejected": -9.484406471252441, + "epoch": 2.52, + "learning_rate": 8.929917106254709e-08, + "logits/chosen": -2.2412877082824707, + "logits/rejected": -2.149836301803589, + "logps/chosen": -265.6690368652344, + "logps/rejected": -375.5118713378906, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8982291221618652, + "rewards/margins": 9.667852401733398, + "rewards/rejected": -10.566082954406738, "step": 4950 }, { - "epoch": 2.42, - "learning_rate": 1.0685848715164677e-07, - "logits/chosen": -2.9169883728027344, - "logits/rejected": -2.8903183937072754, - "logps/chosen": -286.23931884765625, - "logps/rejected": -327.681640625, - "loss": 0.0066, + "epoch": 2.52, + "learning_rate": 8.835719668425018e-08, + "logits/chosen": -2.1044938564300537, + "logits/rejected": -2.225058078765869, + "logps/chosen": -294.85675048828125, + "logps/rejected": -398.77008056640625, + "loss": 0.0082, "rewards/accuracies": 1.0, - "rewards/chosen": -0.4391227662563324, - "rewards/margins": 8.96290397644043, - "rewards/rejected": -9.402026176452637, + "rewards/chosen": -1.0629851818084717, + "rewards/margins": 10.607720375061035, + "rewards/rejected": -11.670705795288086, "step": 4960 }, { - "epoch": 2.43, - "learning_rate": 1.0595367354325009e-07, - "logits/chosen": -2.910407543182373, - "logits/rejected": -2.9346327781677246, - "logps/chosen": -343.13568115234375, - "logps/rejected": -383.9322814941406, - "loss": 0.007, + "epoch": 2.53, + "learning_rate": 8.741522230595327e-08, + "logits/chosen": -2.183194398880005, + "logits/rejected": -2.2053921222686768, + "logps/chosen": -323.3440246582031, + "logps/rejected": -361.22796630859375, + "loss": 0.0148, "rewards/accuracies": 1.0, - "rewards/chosen": 0.19358623027801514, - "rewards/margins": 10.111834526062012, - "rewards/rejected": -9.918249130249023, + "rewards/chosen": -0.964472770690918, + "rewards/margins": 9.647565841674805, + "rewards/rejected": -10.61203670501709, "step": 4970 }, { - "epoch": 2.43, - "learning_rate": 1.0504885993485342e-07, - "logits/chosen": -2.7780022621154785, - "logits/rejected": -2.7677063941955566, - "logps/chosen": -341.8428955078125, - "logps/rejected": -366.1546630859375, - "loss": 0.0248, + "epoch": 2.53, + "learning_rate": 8.647324792765637e-08, + "logits/chosen": -2.1607978343963623, + "logits/rejected": -2.1534907817840576, + "logps/chosen": -293.7584228515625, + "logps/rejected": -361.35565185546875, + "loss": 0.0127, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.4313580393791199, - "rewards/margins": 10.43498706817627, - "rewards/rejected": -10.003628730773926, + "rewards/chosen": -0.08604813367128372, + "rewards/margins": 8.945721626281738, + "rewards/rejected": -9.031770706176758, "step": 4980 }, { - "epoch": 2.44, - "learning_rate": 1.0414404632645675e-07, - "logits/chosen": -2.777137041091919, - "logits/rejected": -2.7877514362335205, - "logps/chosen": -273.2378845214844, - "logps/rejected": -347.6225280761719, - "loss": 0.0111, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.09691391885280609, - "rewards/margins": 10.915949821472168, - "rewards/rejected": -10.819036483764648, + "epoch": 2.54, + "learning_rate": 8.553127354935944e-08, + "logits/chosen": -2.252793073654175, + "logits/rejected": -2.175422191619873, + "logps/chosen": -258.53887939453125, + "logps/rejected": -349.27581787109375, + "loss": 0.0114, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.41624441742897034, + "rewards/margins": 9.724603652954102, + "rewards/rejected": -10.140849113464355, "step": 4990 }, { - "epoch": 2.44, - "learning_rate": 1.0323923271806008e-07, - "logits/chosen": -2.7671542167663574, - "logits/rejected": -2.768014430999756, - "logps/chosen": -305.2103576660156, - "logps/rejected": -382.4863586425781, - "loss": 0.0122, + "epoch": 2.54, + "learning_rate": 8.458929917106254e-08, + "logits/chosen": -2.075585126876831, + "logits/rejected": -2.1678786277770996, + "logps/chosen": -291.48297119140625, + "logps/rejected": -383.5638732910156, + "loss": 0.0118, "rewards/accuracies": 1.0, - "rewards/chosen": 1.0794408321380615, - "rewards/margins": 11.213907241821289, - "rewards/rejected": -10.134466171264648, + "rewards/chosen": -0.1401732861995697, + "rewards/margins": 11.019479751586914, + "rewards/rejected": -11.159653663635254, "step": 5000 }, { - "epoch": 2.44, - "eval_logits/chosen": -2.8250341415405273, - "eval_logits/rejected": -2.8242409229278564, - "eval_logps/chosen": -338.51153564453125, - "eval_logps/rejected": -335.54217529296875, - "eval_loss": 0.6697083711624146, - "eval_rewards/accuracies": 0.7757353186607361, - "eval_rewards/chosen": -2.3650553226470947, - "eval_rewards/margins": 4.235851287841797, - "eval_rewards/rejected": -6.600906848907471, - "eval_runtime": 302.9711, - "eval_samples_per_second": 7.169, - "eval_steps_per_second": 0.449, + "epoch": 2.54, + "eval_logits/chosen": -2.2657597064971924, + "eval_logits/rejected": -2.3346636295318604, + "eval_logps/chosen": -317.1129150390625, + "eval_logps/rejected": -326.8659362792969, + "eval_loss": 0.7716654539108276, + "eval_rewards/accuracies": 0.7461832165718079, + "eval_rewards/chosen": -3.8862454891204834, + "eval_rewards/margins": 2.986832857131958, + "eval_rewards/rejected": -6.873078346252441, + "eval_runtime": 301.4436, + "eval_samples_per_second": 6.92, + "eval_steps_per_second": 0.435, "step": 5000 }, { - "epoch": 2.45, - "learning_rate": 1.023344191096634e-07, - "logits/chosen": -2.850271224975586, - "logits/rejected": -2.865354299545288, - "logps/chosen": -360.3381042480469, - "logps/rejected": -395.33648681640625, - "loss": 0.0156, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.5653321743011475, - "rewards/margins": 9.982986450195312, - "rewards/rejected": -9.417654037475586, + "epoch": 2.55, + "learning_rate": 8.364732479276564e-08, + "logits/chosen": -2.0859687328338623, + "logits/rejected": -2.1620824337005615, + "logps/chosen": -300.7864685058594, + "logps/rejected": -342.90472412109375, + "loss": 0.0132, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.5269465446472168, + "rewards/margins": 9.316871643066406, + "rewards/rejected": -9.843819618225098, "step": 5010 }, { - "epoch": 2.45, - "learning_rate": 1.0142960550126674e-07, - "logits/chosen": -2.7926459312438965, - "logits/rejected": -2.817817449569702, - "logps/chosen": -337.81671142578125, - "logps/rejected": -395.8128662109375, - "loss": 0.0103, + "epoch": 2.55, + "learning_rate": 8.270535041446872e-08, + "logits/chosen": -2.2324938774108887, + "logits/rejected": -2.171308755874634, + "logps/chosen": -308.4803771972656, + "logps/rejected": -374.90447998046875, + "loss": 0.0178, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.659866213798523, - "rewards/margins": 12.181129455566406, - "rewards/rejected": -11.521262168884277, + "rewards/chosen": 0.11912157386541367, + "rewards/margins": 9.949522018432617, + "rewards/rejected": -9.830400466918945, "step": 5020 }, { - "epoch": 2.46, - "learning_rate": 1.0052479189287007e-07, - "logits/chosen": -2.8177309036254883, - "logits/rejected": -2.8066370487213135, - "logps/chosen": -311.06842041015625, - "logps/rejected": -343.4490051269531, - "loss": 0.0066, + "epoch": 2.56, + "learning_rate": 8.176337603617182e-08, + "logits/chosen": -2.222607374191284, + "logits/rejected": -2.1958372592926025, + "logps/chosen": -328.6293640136719, + "logps/rejected": -375.71160888671875, + "loss": 0.0127, "rewards/accuracies": 1.0, - "rewards/chosen": 0.3473469913005829, - "rewards/margins": 10.180368423461914, - "rewards/rejected": -9.833020210266113, + "rewards/chosen": -0.37949132919311523, + "rewards/margins": 9.425061225891113, + "rewards/rejected": -9.80455207824707, "step": 5030 }, { - "epoch": 2.46, - "learning_rate": 9.96199782844734e-08, - "logits/chosen": -2.885071277618408, - "logits/rejected": -2.8668925762176514, - "logps/chosen": -294.9935607910156, - "logps/rejected": -360.81341552734375, - "loss": 0.018, + "epoch": 2.56, + "learning_rate": 8.08214016578749e-08, + "logits/chosen": -2.2369132041931152, + "logits/rejected": -2.2004668712615967, + "logps/chosen": -289.24847412109375, + "logps/rejected": -363.97882080078125, + "loss": 0.0104, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.47163575887680054, - "rewards/margins": 8.59495735168457, - "rewards/rejected": -9.066594123840332, + "rewards/chosen": -0.8536527752876282, + "rewards/margins": 8.470075607299805, + "rewards/rejected": -9.323729515075684, "step": 5040 }, { - "epoch": 2.47, - "learning_rate": 9.871516467607673e-08, - "logits/chosen": -2.80708909034729, - "logits/rejected": -2.8104634284973145, - "logps/chosen": -362.1169128417969, - "logps/rejected": -378.2699279785156, - "loss": 0.0137, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.7892178893089294, - "rewards/margins": 9.493849754333496, - "rewards/rejected": -8.704632759094238, + "epoch": 2.57, + "learning_rate": 7.987942727957798e-08, + "logits/chosen": -2.146327495574951, + "logits/rejected": -2.2037487030029297, + "logps/chosen": -307.6786804199219, + "logps/rejected": -360.6956481933594, + "loss": 0.0179, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.42890268564224243, + "rewards/margins": 9.714688301086426, + "rewards/rejected": -10.143590927124023, "step": 5050 }, { - "epoch": 2.47, - "learning_rate": 9.781035106768005e-08, - "logits/chosen": -2.814383029937744, - "logits/rejected": -2.86273193359375, - "logps/chosen": -341.49652099609375, - "logps/rejected": -385.01580810546875, - "loss": 0.0186, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.0586259365081787, - "rewards/margins": 9.467718124389648, - "rewards/rejected": -8.409092903137207, + "epoch": 2.57, + "learning_rate": 7.893745290128108e-08, + "logits/chosen": -2.2009823322296143, + "logits/rejected": -2.1575918197631836, + "logps/chosen": -268.5896911621094, + "logps/rejected": -395.19207763671875, + "loss": 0.0113, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.4908009469509125, + "rewards/margins": 9.655282974243164, + "rewards/rejected": -10.146084785461426, "step": 5060 }, { - "epoch": 2.48, - "learning_rate": 9.690553745928339e-08, - "logits/chosen": -2.857347011566162, - "logits/rejected": -2.8487515449523926, - "logps/chosen": -284.46295166015625, - "logps/rejected": -337.2325439453125, - "loss": 0.0122, + "epoch": 2.58, + "learning_rate": 7.799547852298418e-08, + "logits/chosen": -2.2042064666748047, + "logits/rejected": -2.186056613922119, + "logps/chosen": -319.55474853515625, + "logps/rejected": -393.94366455078125, + "loss": 0.0099, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.021925926208496, - "rewards/margins": 9.107851028442383, - "rewards/rejected": -8.08592414855957, + "rewards/chosen": -0.20358404517173767, + "rewards/margins": 10.148531913757324, + "rewards/rejected": -10.352116584777832, "step": 5070 }, { - "epoch": 2.48, - "learning_rate": 9.600072385088671e-08, - "logits/chosen": -2.793250322341919, - "logits/rejected": -2.7879016399383545, - "logps/chosen": -342.508544921875, - "logps/rejected": -343.8742980957031, - "loss": 0.0144, + "epoch": 2.58, + "learning_rate": 7.705350414468727e-08, + "logits/chosen": -2.129512071609497, + "logits/rejected": -2.191772937774658, + "logps/chosen": -279.49920654296875, + "logps/rejected": -366.9100646972656, + "loss": 0.0104, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.6626511216163635, - "rewards/margins": 10.360193252563477, - "rewards/rejected": -9.697541236877441, + "rewards/chosen": 0.13818223774433136, + "rewards/margins": 9.699727058410645, + "rewards/rejected": -9.561545372009277, "step": 5080 }, { - "epoch": 2.49, - "learning_rate": 9.509591024249005e-08, - "logits/chosen": -2.855344295501709, - "logits/rejected": -2.8611862659454346, - "logps/chosen": -412.68426513671875, - "logps/rejected": -397.288330078125, - "loss": 0.0148, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.9815244674682617, - "rewards/margins": 9.991876602172852, - "rewards/rejected": -9.010351181030273, + "epoch": 2.59, + "learning_rate": 7.611152976639035e-08, + "logits/chosen": -2.196213960647583, + "logits/rejected": -2.2114243507385254, + "logps/chosen": -271.9547424316406, + "logps/rejected": -360.7081604003906, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4612036347389221, + "rewards/margins": 10.259782791137695, + "rewards/rejected": -10.720987319946289, "step": 5090 }, { - "epoch": 2.49, - "learning_rate": 9.419109663409337e-08, - "logits/chosen": -2.799515724182129, - "logits/rejected": -2.828319549560547, - "logps/chosen": -315.8230285644531, - "logps/rejected": -362.23602294921875, - "loss": 0.0105, + "epoch": 2.59, + "learning_rate": 7.516955538809344e-08, + "logits/chosen": -2.1923937797546387, + "logits/rejected": -2.1321029663085938, + "logps/chosen": -259.404296875, + "logps/rejected": -338.395751953125, + "loss": 0.014, "rewards/accuracies": 1.0, - "rewards/chosen": 0.6228026151657104, - "rewards/margins": 10.558664321899414, - "rewards/rejected": -9.935861587524414, + "rewards/chosen": 0.023987997323274612, + "rewards/margins": 9.157641410827637, + "rewards/rejected": -9.13365364074707, "step": 5100 }, { - "epoch": 2.49, - "eval_logits/chosen": -2.8055360317230225, - "eval_logits/rejected": -2.8009698390960693, - "eval_logps/chosen": -336.57080078125, - "eval_logps/rejected": -333.0202941894531, - "eval_loss": 0.6559944152832031, - "eval_rewards/accuracies": 0.78125, - "eval_rewards/chosen": -2.1709818840026855, - "eval_rewards/margins": 4.177731513977051, - "eval_rewards/rejected": -6.348714351654053, - "eval_runtime": 303.3082, - "eval_samples_per_second": 7.161, - "eval_steps_per_second": 0.448, + "epoch": 2.59, + "eval_logits/chosen": -2.278256893157959, + "eval_logits/rejected": -2.3512167930603027, + "eval_logps/chosen": -314.220458984375, + "eval_logps/rejected": -324.13348388671875, + "eval_loss": 0.7685028910636902, + "eval_rewards/accuracies": 0.7480915784835815, + "eval_rewards/chosen": -3.5969960689544678, + "eval_rewards/margins": 3.0028398036956787, + "eval_rewards/rejected": -6.599836349487305, + "eval_runtime": 296.8967, + "eval_samples_per_second": 7.026, + "eval_steps_per_second": 0.441, "step": 5100 }, { - "epoch": 2.5, - "learning_rate": 9.32862830256967e-08, - "logits/chosen": -2.8963656425476074, - "logits/rejected": -2.8815619945526123, - "logps/chosen": -279.34405517578125, - "logps/rejected": -334.063232421875, - "loss": 0.0173, + "epoch": 2.6, + "learning_rate": 7.422758100979654e-08, + "logits/chosen": -2.2003679275512695, + "logits/rejected": -2.1358842849731445, + "logps/chosen": -292.3018798828125, + "logps/rejected": -338.69610595703125, + "loss": 0.0154, "rewards/accuracies": 1.0, - "rewards/chosen": 0.2388395369052887, - "rewards/margins": 9.919466972351074, - "rewards/rejected": -9.68062686920166, + "rewards/chosen": -0.16416962444782257, + "rewards/margins": 10.127435684204102, + "rewards/rejected": -10.291604995727539, "step": 5110 }, { - "epoch": 2.5, - "learning_rate": 9.238146941730003e-08, - "logits/chosen": -2.805250883102417, - "logits/rejected": -2.8472931385040283, - "logps/chosen": -331.8280334472656, - "logps/rejected": -426.10687255859375, - "loss": 0.0273, + "epoch": 2.6, + "learning_rate": 7.328560663149962e-08, + "logits/chosen": -2.1990761756896973, + "logits/rejected": -2.248767375946045, + "logps/chosen": -293.7164306640625, + "logps/rejected": -401.2142028808594, + "loss": 0.0106, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.6450344324111938, - "rewards/margins": 10.676031112670898, - "rewards/rejected": -10.030996322631836, + "rewards/chosen": -0.5500536561012268, + "rewards/margins": 9.808300018310547, + "rewards/rejected": -10.358353614807129, "step": 5120 }, { - "epoch": 2.51, - "learning_rate": 9.147665580890336e-08, - "logits/chosen": -2.854604721069336, - "logits/rejected": -2.8739752769470215, - "logps/chosen": -336.33746337890625, - "logps/rejected": -378.35357666015625, - "loss": 0.0167, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6183292865753174, - "rewards/margins": 10.173185348510742, - "rewards/rejected": -9.55485725402832, + "epoch": 2.61, + "learning_rate": 7.234363225320272e-08, + "logits/chosen": -2.156942844390869, + "logits/rejected": -2.1898016929626465, + "logps/chosen": -281.28411865234375, + "logps/rejected": -315.0388488769531, + "loss": 0.0425, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9101797342300415, + "rewards/margins": 8.848825454711914, + "rewards/rejected": -9.75900650024414, "step": 5130 }, { - "epoch": 2.51, - "learning_rate": 9.05718422005067e-08, - "logits/chosen": -2.8186161518096924, - "logits/rejected": -2.8525023460388184, - "logps/chosen": -322.26129150390625, - "logps/rejected": -367.9151916503906, - "loss": 0.0121, + "epoch": 2.61, + "learning_rate": 7.14016578749058e-08, + "logits/chosen": -2.1186816692352295, + "logits/rejected": -2.2431023120880127, + "logps/chosen": -267.6355285644531, + "logps/rejected": -338.63470458984375, + "loss": 0.0103, "rewards/accuracies": 1.0, - "rewards/chosen": -0.6210575103759766, - "rewards/margins": 9.436447143554688, - "rewards/rejected": -10.05750560760498, + "rewards/chosen": -0.910683810710907, + "rewards/margins": 8.7294921875, + "rewards/rejected": -9.64017391204834, "step": 5140 }, { - "epoch": 2.52, - "learning_rate": 8.966702859211002e-08, - "logits/chosen": -2.8646483421325684, - "logits/rejected": -2.8717331886291504, - "logps/chosen": -347.71087646484375, - "logps/rejected": -378.00799560546875, - "loss": 0.0061, + "epoch": 2.62, + "learning_rate": 7.045968349660889e-08, + "logits/chosen": -2.1849942207336426, + "logits/rejected": -2.195441484451294, + "logps/chosen": -300.42578125, + "logps/rejected": -402.8392333984375, + "loss": 0.0031, "rewards/accuracies": 1.0, - "rewards/chosen": 0.9491745233535767, - "rewards/margins": 11.066826820373535, - "rewards/rejected": -10.117650985717773, + "rewards/chosen": -0.7194881439208984, + "rewards/margins": 10.500357627868652, + "rewards/rejected": -11.219846725463867, "step": 5150 }, { - "epoch": 2.52, - "learning_rate": 8.876221498371335e-08, - "logits/chosen": -2.7470672130584717, - "logits/rejected": -2.788851499557495, - "logps/chosen": -298.0244445800781, - "logps/rejected": -374.87823486328125, - "loss": 0.0075, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.574886679649353, - "rewards/margins": 9.890555381774902, - "rewards/rejected": -10.465442657470703, + "epoch": 2.62, + "learning_rate": 6.951770911831198e-08, + "logits/chosen": -2.2179243564605713, + "logits/rejected": -2.1924123764038086, + "logps/chosen": -289.130859375, + "logps/rejected": -348.76812744140625, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5753608345985413, + "rewards/margins": 10.490455627441406, + "rewards/rejected": -9.915095329284668, "step": 5160 }, { - "epoch": 2.53, - "learning_rate": 8.785740137531668e-08, - "logits/chosen": -2.838918447494507, - "logits/rejected": -2.8014559745788574, - "logps/chosen": -386.287353515625, - "logps/rejected": -353.0703125, - "loss": 0.006, + "epoch": 2.63, + "learning_rate": 6.857573474001508e-08, + "logits/chosen": -2.194222927093506, + "logits/rejected": -2.055819034576416, + "logps/chosen": -280.11883544921875, + "logps/rejected": -410.5008850097656, + "loss": 0.0211, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.752550482749939, - "rewards/margins": 8.793248176574707, - "rewards/rejected": -9.545797348022461, + "rewards/chosen": -0.2896275520324707, + "rewards/margins": 10.427669525146484, + "rewards/rejected": -10.717297554016113, "step": 5170 }, { - "epoch": 2.53, - "learning_rate": 8.695258776692001e-08, - "logits/chosen": -2.8226521015167236, - "logits/rejected": -2.8333888053894043, - "logps/chosen": -309.34002685546875, - "logps/rejected": -360.4137268066406, - "loss": 0.0095, + "epoch": 2.63, + "learning_rate": 6.763376036171815e-08, + "logits/chosen": -2.2144243717193604, + "logits/rejected": -2.179957866668701, + "logps/chosen": -303.1216735839844, + "logps/rejected": -402.1628723144531, + "loss": 0.0067, "rewards/accuracies": 1.0, - "rewards/chosen": 1.1314066648483276, - "rewards/margins": 10.61998462677002, - "rewards/rejected": -9.488576889038086, + "rewards/chosen": -0.1424558013677597, + "rewards/margins": 10.109817504882812, + "rewards/rejected": -10.25227165222168, "step": 5180 }, { - "epoch": 2.54, - "learning_rate": 8.604777415852333e-08, - "logits/chosen": -2.7401010990142822, - "logits/rejected": -2.752509355545044, - "logps/chosen": -365.42694091796875, - "logps/rejected": -345.9256591796875, - "loss": 0.0166, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.3071354329586029, - "rewards/margins": 10.067877769470215, - "rewards/rejected": -9.760744094848633, + "epoch": 2.64, + "learning_rate": 6.669178598342125e-08, + "logits/chosen": -2.199436664581299, + "logits/rejected": -2.155738353729248, + "logps/chosen": -249.98184204101562, + "logps/rejected": -326.74127197265625, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0048552751541138, + "rewards/margins": 8.819491386413574, + "rewards/rejected": -9.824346542358398, "step": 5190 }, { - "epoch": 2.54, - "learning_rate": 8.514296055012667e-08, - "logits/chosen": -2.8173704147338867, - "logits/rejected": -2.8347525596618652, - "logps/chosen": -346.5697021484375, - "logps/rejected": -429.35528564453125, - "loss": 0.0135, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.6558005809783936, - "rewards/margins": 12.177390098571777, - "rewards/rejected": -10.521589279174805, + "epoch": 2.64, + "learning_rate": 6.574981160512434e-08, + "logits/chosen": -2.217965602874756, + "logits/rejected": -2.1558725833892822, + "logps/chosen": -290.10260009765625, + "logps/rejected": -364.9586486816406, + "loss": 0.0208, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.88923579454422, + "rewards/margins": 8.772710800170898, + "rewards/rejected": -9.661947250366211, "step": 5200 }, { - "epoch": 2.54, - "eval_logits/chosen": -2.7913036346435547, - "eval_logits/rejected": -2.785522699356079, - "eval_logps/chosen": -336.3448181152344, - "eval_logps/rejected": -332.9100036621094, - "eval_loss": 0.6586364507675171, - "eval_rewards/accuracies": 0.779411792755127, - "eval_rewards/chosen": -2.1483800411224365, - "eval_rewards/margins": 4.1893086433410645, - "eval_rewards/rejected": -6.337688446044922, - "eval_runtime": 302.9968, - "eval_samples_per_second": 7.168, - "eval_steps_per_second": 0.449, + "epoch": 2.64, + "eval_logits/chosen": -2.31428599357605, + "eval_logits/rejected": -2.3874757289886475, + "eval_logps/chosen": -317.27935791015625, + "eval_logps/rejected": -327.0299377441406, + "eval_loss": 0.7740700244903564, + "eval_rewards/accuracies": 0.7442747950553894, + "eval_rewards/chosen": -3.9028892517089844, + "eval_rewards/margins": 2.9865951538085938, + "eval_rewards/rejected": -6.889484405517578, + "eval_runtime": 301.8194, + "eval_samples_per_second": 6.911, + "eval_steps_per_second": 0.434, "step": 5200 }, { - "epoch": 2.55, - "learning_rate": 8.423814694172999e-08, - "logits/chosen": -2.8111283779144287, - "logits/rejected": -2.819978952407837, - "logps/chosen": -287.37078857421875, - "logps/rejected": -316.6888732910156, - "loss": 0.0122, + "epoch": 2.65, + "learning_rate": 6.480783722682743e-08, + "logits/chosen": -2.21779203414917, + "logits/rejected": -2.2501723766326904, + "logps/chosen": -277.32733154296875, + "logps/rejected": -344.00689697265625, + "loss": 0.0166, "rewards/accuracies": 1.0, - "rewards/chosen": 0.0014335453743115067, - "rewards/margins": 10.253029823303223, - "rewards/rejected": -10.25159740447998, + "rewards/chosen": -0.8932816386222839, + "rewards/margins": 8.774493217468262, + "rewards/rejected": -9.66777515411377, "step": 5210 }, { - "epoch": 2.55, - "learning_rate": 8.333333333333333e-08, - "logits/chosen": -2.8055851459503174, - "logits/rejected": -2.790299654006958, - "logps/chosen": -296.43194580078125, - "logps/rejected": -345.30535888671875, - "loss": 0.0115, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.019373249262571335, - "rewards/margins": 9.082399368286133, - "rewards/rejected": -9.063026428222656, + "epoch": 2.66, + "learning_rate": 6.386586284853051e-08, + "logits/chosen": -2.2522101402282715, + "logits/rejected": -2.258768081665039, + "logps/chosen": -285.1710205078125, + "logps/rejected": -361.4646911621094, + "loss": 0.0112, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.7185908555984497, + "rewards/margins": 10.354574203491211, + "rewards/rejected": -11.073163986206055, "step": 5220 }, { - "epoch": 2.55, - "learning_rate": 8.242851972493666e-08, - "logits/chosen": -2.785881996154785, - "logits/rejected": -2.8347012996673584, - "logps/chosen": -328.89801025390625, - "logps/rejected": -361.3002014160156, - "loss": 0.0106, + "epoch": 2.66, + "learning_rate": 6.292388847023362e-08, + "logits/chosen": -2.171731472015381, + "logits/rejected": -2.1915600299835205, + "logps/chosen": -300.89727783203125, + "logps/rejected": -344.1522216796875, + "loss": 0.0089, "rewards/accuracies": 1.0, - "rewards/chosen": -0.06292136013507843, - "rewards/margins": 9.512456893920898, - "rewards/rejected": -9.575380325317383, + "rewards/chosen": -0.9044286608695984, + "rewards/margins": 8.943483352661133, + "rewards/rejected": -9.847909927368164, "step": 5230 }, { - "epoch": 2.56, - "learning_rate": 8.152370611653998e-08, - "logits/chosen": -2.8407678604125977, - "logits/rejected": -2.879194736480713, - "logps/chosen": -331.65057373046875, - "logps/rejected": -376.3146667480469, - "loss": 0.0094, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.4312402307987213, - "rewards/margins": 10.663827896118164, - "rewards/rejected": -10.232587814331055, + "epoch": 2.67, + "learning_rate": 6.19819140919367e-08, + "logits/chosen": -2.1025567054748535, + "logits/rejected": -2.0875024795532227, + "logps/chosen": -278.44757080078125, + "logps/rejected": -355.54962158203125, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6099380850791931, + "rewards/margins": 8.724331855773926, + "rewards/rejected": -9.334268569946289, "step": 5240 }, { - "epoch": 2.56, - "learning_rate": 8.061889250814332e-08, - "logits/chosen": -2.7495293617248535, - "logits/rejected": -2.760474681854248, - "logps/chosen": -362.3902893066406, - "logps/rejected": -462.63702392578125, - "loss": 0.0077, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9277762174606323, - "rewards/margins": 11.222733497619629, - "rewards/rejected": -10.294957160949707, + "epoch": 2.67, + "learning_rate": 6.103993971363979e-08, + "logits/chosen": -2.236751079559326, + "logits/rejected": -2.3104147911071777, + "logps/chosen": -258.1316833496094, + "logps/rejected": -380.6584167480469, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4985658526420593, + "rewards/margins": 9.4805326461792, + "rewards/rejected": -9.979098320007324, "step": 5250 }, { - "epoch": 2.57, - "learning_rate": 7.971407889974664e-08, - "logits/chosen": -2.8319344520568848, - "logits/rejected": -2.8433635234832764, - "logps/chosen": -283.2074890136719, - "logps/rejected": -368.69403076171875, - "loss": 0.0085, + "epoch": 2.68, + "learning_rate": 6.009796533534288e-08, + "logits/chosen": -2.1440768241882324, + "logits/rejected": -2.2688815593719482, + "logps/chosen": -283.58758544921875, + "logps/rejected": -368.4014892578125, + "loss": 0.0178, "rewards/accuracies": 1.0, - "rewards/chosen": -0.239939883351326, - "rewards/margins": 9.756782531738281, - "rewards/rejected": -9.996722221374512, + "rewards/chosen": -0.7808700203895569, + "rewards/margins": 10.05615234375, + "rewards/rejected": -10.837023735046387, "step": 5260 }, { - "epoch": 2.57, - "learning_rate": 7.880926529134998e-08, - "logits/chosen": -2.8156118392944336, - "logits/rejected": -2.825955867767334, - "logps/chosen": -297.4966735839844, - "logps/rejected": -334.0571594238281, - "loss": 0.0273, + "epoch": 2.68, + "learning_rate": 5.9155990957045964e-08, + "logits/chosen": -2.1298089027404785, + "logits/rejected": -2.155709981918335, + "logps/chosen": -315.98492431640625, + "logps/rejected": -381.3930969238281, + "loss": 0.0806, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.628752589225769, - "rewards/margins": 10.155919075012207, - "rewards/rejected": -9.527166366577148, + "rewards/chosen": -0.07813958823680878, + "rewards/margins": 9.502008438110352, + "rewards/rejected": -9.580145835876465, "step": 5270 }, { - "epoch": 2.58, - "learning_rate": 7.790445168295331e-08, - "logits/chosen": -2.7296481132507324, - "logits/rejected": -2.731184720993042, - "logps/chosen": -341.293701171875, - "logps/rejected": -373.89801025390625, - "loss": 0.0088, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.3213523626327515, - "rewards/margins": 10.019091606140137, - "rewards/rejected": -8.69774055480957, + "epoch": 2.69, + "learning_rate": 5.821401657874905e-08, + "logits/chosen": -2.322612762451172, + "logits/rejected": -2.144012928009033, + "logps/chosen": -264.17950439453125, + "logps/rejected": -371.1512756347656, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7406362295150757, + "rewards/margins": 9.892260551452637, + "rewards/rejected": -10.63289737701416, "step": 5280 }, { - "epoch": 2.58, - "learning_rate": 7.699963807455665e-08, - "logits/chosen": -2.7940499782562256, - "logits/rejected": -2.814470052719116, - "logps/chosen": -358.9014587402344, - "logps/rejected": -310.25750732421875, - "loss": 0.0143, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.28456392884254456, - "rewards/margins": 9.059488296508789, - "rewards/rejected": -8.774924278259277, + "epoch": 2.69, + "learning_rate": 5.7272042200452145e-08, + "logits/chosen": -2.262551784515381, + "logits/rejected": -2.162778377532959, + "logps/chosen": -235.9697265625, + "logps/rejected": -357.1461181640625, + "loss": 0.0243, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.732375979423523, + "rewards/margins": 9.819782257080078, + "rewards/rejected": -10.552157402038574, "step": 5290 }, { - "epoch": 2.59, - "learning_rate": 7.609482446615997e-08, - "logits/chosen": -2.8209762573242188, - "logits/rejected": -2.812096357345581, - "logps/chosen": -274.8023681640625, - "logps/rejected": -343.10235595703125, - "loss": 0.0134, + "epoch": 2.7, + "learning_rate": 5.633006782215523e-08, + "logits/chosen": -2.3629612922668457, + "logits/rejected": -2.335319995880127, + "logps/chosen": -256.4373474121094, + "logps/rejected": -355.1130065917969, + "loss": 0.0076, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9901191592216492, - "rewards/margins": 10.597132682800293, - "rewards/rejected": -9.607013702392578, + "rewards/chosen": -0.7695462703704834, + "rewards/margins": 8.623259544372559, + "rewards/rejected": -9.392806053161621, "step": 5300 }, { - "epoch": 2.59, - "eval_logits/chosen": -2.7950918674468994, - "eval_logits/rejected": -2.791633129119873, - "eval_logps/chosen": -336.3733215332031, - "eval_logps/rejected": -332.46990966796875, - "eval_loss": 0.6481867432594299, - "eval_rewards/accuracies": 0.7757353186607361, - "eval_rewards/chosen": -2.1512327194213867, - "eval_rewards/margins": 4.142441272735596, - "eval_rewards/rejected": -6.293674468994141, - "eval_runtime": 303.1057, - "eval_samples_per_second": 7.166, - "eval_steps_per_second": 0.449, + "epoch": 2.7, + "eval_logits/chosen": -2.3592212200164795, + "eval_logits/rejected": -2.433117151260376, + "eval_logps/chosen": -314.4091796875, + "eval_logps/rejected": -323.9353332519531, + "eval_loss": 0.7599921822547913, + "eval_rewards/accuracies": 0.7423664331436157, + "eval_rewards/chosen": -3.615870475769043, + "eval_rewards/margins": 2.9641480445861816, + "eval_rewards/rejected": -6.580018520355225, + "eval_runtime": 296.8645, + "eval_samples_per_second": 7.027, + "eval_steps_per_second": 0.441, "step": 5300 }, { - "epoch": 2.59, - "learning_rate": 7.51900108577633e-08, - "logits/chosen": -2.8262100219726562, - "logits/rejected": -2.810464859008789, - "logps/chosen": -346.8700866699219, - "logps/rejected": -356.64569091796875, - "loss": 0.0071, + "epoch": 2.7, + "learning_rate": 5.538809344385832e-08, + "logits/chosen": -2.4587504863739014, + "logits/rejected": -2.3089499473571777, + "logps/chosen": -298.1319580078125, + "logps/rejected": -371.02703857421875, + "loss": 0.0098, "rewards/accuracies": 1.0, - "rewards/chosen": 0.6102066040039062, - "rewards/margins": 10.711640357971191, - "rewards/rejected": -10.101434707641602, + "rewards/chosen": -0.1819322109222412, + "rewards/margins": 9.596673965454102, + "rewards/rejected": -9.778606414794922, "step": 5310 }, { - "epoch": 2.6, - "learning_rate": 7.428519724936664e-08, - "logits/chosen": -2.836167812347412, - "logits/rejected": -2.839168071746826, - "logps/chosen": -256.46929931640625, - "logps/rejected": -305.74603271484375, - "loss": 0.0239, + "epoch": 2.71, + "learning_rate": 5.4446119065561414e-08, + "logits/chosen": -2.204026460647583, + "logits/rejected": -2.2400882244110107, + "logps/chosen": -268.72308349609375, + "logps/rejected": -335.87261962890625, + "loss": 0.0086, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.18273834884166718, - "rewards/margins": 9.44776725769043, - "rewards/rejected": -9.26502799987793, + "rewards/chosen": -0.2533165216445923, + "rewards/margins": 9.080385208129883, + "rewards/rejected": -9.333700180053711, "step": 5320 }, { - "epoch": 2.6, - "learning_rate": 7.338038364096996e-08, - "logits/chosen": -2.855609893798828, - "logits/rejected": -2.882725954055786, - "logps/chosen": -299.3067321777344, - "logps/rejected": -361.46478271484375, - "loss": 0.0094, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.3655107617378235, - "rewards/margins": 9.926068305969238, - "rewards/rejected": -9.56055736541748, + "epoch": 2.71, + "learning_rate": 5.35041446872645e-08, + "logits/chosen": -2.340484142303467, + "logits/rejected": -2.1539273262023926, + "logps/chosen": -260.55938720703125, + "logps/rejected": -390.09027099609375, + "loss": 0.0163, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.2474168837070465, + "rewards/margins": 10.559521675109863, + "rewards/rejected": -10.806938171386719, "step": 5330 }, { - "epoch": 2.61, - "learning_rate": 7.24755700325733e-08, - "logits/chosen": -2.7605299949645996, - "logits/rejected": -2.783886194229126, - "logps/chosen": -336.10272216796875, - "logps/rejected": -380.04779052734375, - "loss": 0.0219, + "epoch": 2.72, + "learning_rate": 5.256217030896759e-08, + "logits/chosen": -2.0718276500701904, + "logits/rejected": -2.1071338653564453, + "logps/chosen": -266.92828369140625, + "logps/rejected": -395.7525939941406, + "loss": 0.012, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.20739421248435974, - "rewards/margins": 10.576163291931152, - "rewards/rejected": -10.368769645690918, + "rewards/chosen": -0.5647242665290833, + "rewards/margins": 10.194375991821289, + "rewards/rejected": -10.759099960327148, "step": 5340 }, { - "epoch": 2.61, - "learning_rate": 7.157075642417662e-08, - "logits/chosen": -2.791677474975586, - "logits/rejected": -2.8254895210266113, - "logps/chosen": -405.6573486328125, - "logps/rejected": -370.52459716796875, - "loss": 0.012, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.4442583322525024, - "rewards/margins": 11.328303337097168, - "rewards/rejected": -9.884044647216797, + "epoch": 2.72, + "learning_rate": 5.162019593067068e-08, + "logits/chosen": -2.312995433807373, + "logits/rejected": -2.388847827911377, + "logps/chosen": -292.45361328125, + "logps/rejected": -374.68096923828125, + "loss": 0.0086, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.17983612418174744, + "rewards/margins": 10.592061996459961, + "rewards/rejected": -10.412225723266602, "step": 5350 }, { - "epoch": 2.62, - "learning_rate": 7.066594281577995e-08, - "logits/chosen": -2.8818321228027344, - "logits/rejected": -2.9152278900146484, - "logps/chosen": -347.34991455078125, - "logps/rejected": -377.57989501953125, - "loss": 0.0102, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.12087879329919815, - "rewards/margins": 10.046680450439453, - "rewards/rejected": -9.925801277160645, + "epoch": 2.73, + "learning_rate": 5.067822155237377e-08, + "logits/chosen": -2.236600637435913, + "logits/rejected": -2.2955946922302246, + "logps/chosen": -262.1943664550781, + "logps/rejected": -310.1500549316406, + "loss": 0.0231, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.8180893063545227, + "rewards/margins": 8.33509349822998, + "rewards/rejected": -9.15318489074707, "step": 5360 }, { - "epoch": 2.62, - "learning_rate": 6.976112920738328e-08, - "logits/chosen": -2.8282628059387207, - "logits/rejected": -2.8411741256713867, - "logps/chosen": -321.87530517578125, - "logps/rejected": -398.439453125, - "loss": 0.0323, + "epoch": 2.73, + "learning_rate": 4.973624717407686e-08, + "logits/chosen": -2.3075735569000244, + "logits/rejected": -2.3322246074676514, + "logps/chosen": -238.5531768798828, + "logps/rejected": -295.55279541015625, + "loss": 0.0072, "rewards/accuracies": 1.0, - "rewards/chosen": 0.0035088597796857357, - "rewards/margins": 10.454134941101074, - "rewards/rejected": -10.450626373291016, + "rewards/chosen": -0.7571172714233398, + "rewards/margins": 8.686990737915039, + "rewards/rejected": -9.444108963012695, "step": 5370 }, { - "epoch": 2.63, - "learning_rate": 6.885631559898661e-08, - "logits/chosen": -2.7643587589263916, - "logits/rejected": -2.785660743713379, - "logps/chosen": -260.66827392578125, - "logps/rejected": -389.4213562011719, - "loss": 0.0081, + "epoch": 2.74, + "learning_rate": 4.879427279577995e-08, + "logits/chosen": -2.305372953414917, + "logits/rejected": -2.2694144248962402, + "logps/chosen": -237.29641723632812, + "logps/rejected": -319.9894104003906, + "loss": 0.0129, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.12342226505279541, - "rewards/margins": 10.744087219238281, - "rewards/rejected": -10.620664596557617, + "rewards/chosen": -0.9666566848754883, + "rewards/margins": 8.84016227722168, + "rewards/rejected": -9.806818008422852, "step": 5380 }, { - "epoch": 2.63, - "learning_rate": 6.795150199058993e-08, - "logits/chosen": -2.7761893272399902, - "logits/rejected": -2.799177408218384, - "logps/chosen": -319.4676208496094, - "logps/rejected": -364.3558654785156, - "loss": 0.009, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.1065781116485596, - "rewards/margins": 10.896021842956543, - "rewards/rejected": -9.789443016052246, + "epoch": 2.74, + "learning_rate": 4.785229841748304e-08, + "logits/chosen": -2.3525428771972656, + "logits/rejected": -2.3090224266052246, + "logps/chosen": -306.0455627441406, + "logps/rejected": -409.92120361328125, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3530218005180359, + "rewards/margins": 10.933697700500488, + "rewards/rejected": -10.58067512512207, "step": 5390 }, { - "epoch": 2.64, - "learning_rate": 6.704668838219327e-08, - "logits/chosen": -2.7726569175720215, - "logits/rejected": -2.775095224380493, - "logps/chosen": -294.89215087890625, - "logps/rejected": -404.05224609375, - "loss": 0.0121, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.1279352605342865, - "rewards/margins": 10.296116828918457, - "rewards/rejected": -10.168180465698242, + "epoch": 2.75, + "learning_rate": 4.691032403918613e-08, + "logits/chosen": -2.281357526779175, + "logits/rejected": -2.237678050994873, + "logps/chosen": -251.3409881591797, + "logps/rejected": -310.4827575683594, + "loss": 0.0146, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.8187946081161499, + "rewards/margins": 8.709222793579102, + "rewards/rejected": -9.528017044067383, "step": 5400 }, { - "epoch": 2.64, - "eval_logits/chosen": -2.7987520694732666, - "eval_logits/rejected": -2.7944884300231934, - "eval_logps/chosen": -335.19342041015625, - "eval_logps/rejected": -331.9374694824219, - "eval_loss": 0.6488088965415955, - "eval_rewards/accuracies": 0.7775735259056091, - "eval_rewards/chosen": -2.0332438945770264, - "eval_rewards/margins": 4.207189559936523, - "eval_rewards/rejected": -6.240434169769287, - "eval_runtime": 303.1368, - "eval_samples_per_second": 7.165, - "eval_steps_per_second": 0.449, + "epoch": 2.75, + "eval_logits/chosen": -2.3750522136688232, + "eval_logits/rejected": -2.447535753250122, + "eval_logps/chosen": -315.9074401855469, + "eval_logps/rejected": -326.6905212402344, + "eval_loss": 0.7768276333808899, + "eval_rewards/accuracies": 0.7423664331436157, + "eval_rewards/chosen": -3.765695333480835, + "eval_rewards/margins": 3.089841365814209, + "eval_rewards/rejected": -6.855537414550781, + "eval_runtime": 301.6538, + "eval_samples_per_second": 6.915, + "eval_steps_per_second": 0.434, "step": 5400 }, { - "epoch": 2.64, - "learning_rate": 6.61418747737966e-08, - "logits/chosen": -2.8054347038269043, - "logits/rejected": -2.811387538909912, - "logps/chosen": -322.5823059082031, - "logps/rejected": -376.0274963378906, - "loss": 0.0202, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.678113579750061, - "rewards/margins": 9.744855880737305, - "rewards/rejected": -9.066740036010742, + "epoch": 2.75, + "learning_rate": 4.596834966088922e-08, + "logits/chosen": -2.2270708084106445, + "logits/rejected": -2.240201473236084, + "logps/chosen": -259.4315490722656, + "logps/rejected": -358.59698486328125, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9431527256965637, + "rewards/margins": 9.088873863220215, + "rewards/rejected": -10.032026290893555, "step": 5410 }, { - "epoch": 2.65, - "learning_rate": 6.523706116539993e-08, - "logits/chosen": -2.7633614540100098, - "logits/rejected": -2.7820792198181152, - "logps/chosen": -312.8914794921875, - "logps/rejected": -361.0492858886719, - "loss": 0.0142, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.697822093963623, - "rewards/margins": 10.055113792419434, - "rewards/rejected": -9.357292175292969, + "epoch": 2.76, + "learning_rate": 4.5026375282592306e-08, + "logits/chosen": -2.4417529106140137, + "logits/rejected": -2.3347702026367188, + "logps/chosen": -303.779296875, + "logps/rejected": -370.93328857421875, + "loss": 0.009, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.6730049848556519, + "rewards/margins": 9.935312271118164, + "rewards/rejected": -10.608317375183105, "step": 5420 }, { - "epoch": 2.65, - "learning_rate": 6.433224755700326e-08, - "logits/chosen": -2.7979183197021484, - "logits/rejected": -2.7626900672912598, - "logps/chosen": -266.14178466796875, - "logps/rejected": -288.9662170410156, - "loss": 0.0455, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.3691931664943695, - "rewards/margins": 9.630014419555664, - "rewards/rejected": -9.260821342468262, + "epoch": 2.76, + "learning_rate": 4.40844009042954e-08, + "logits/chosen": -2.205430507659912, + "logits/rejected": -2.3156282901763916, + "logps/chosen": -245.5878448486328, + "logps/rejected": -319.97259521484375, + "loss": 0.0176, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.7904682159423828, + "rewards/margins": 9.259397506713867, + "rewards/rejected": -10.049864768981934, "step": 5430 }, { - "epoch": 2.66, - "learning_rate": 6.342743394860658e-08, - "logits/chosen": -2.7854111194610596, - "logits/rejected": -2.7940335273742676, - "logps/chosen": -341.5016784667969, - "logps/rejected": -360.7286682128906, - "loss": 0.0106, + "epoch": 2.77, + "learning_rate": 4.314242652599849e-08, + "logits/chosen": -2.3045692443847656, + "logits/rejected": -2.176016330718994, + "logps/chosen": -308.97247314453125, + "logps/rejected": -334.92950439453125, + "loss": 0.0055, "rewards/accuracies": 1.0, - "rewards/chosen": 1.7187831401824951, - "rewards/margins": 11.579765319824219, - "rewards/rejected": -9.860981941223145, + "rewards/chosen": -0.09693922102451324, + "rewards/margins": 9.563849449157715, + "rewards/rejected": -9.660788536071777, "step": 5440 }, { - "epoch": 2.66, - "learning_rate": 6.252262034020992e-08, - "logits/chosen": -2.776249647140503, - "logits/rejected": -2.796550989151001, - "logps/chosen": -338.69390869140625, - "logps/rejected": -362.85003662109375, - "loss": 0.0168, + "epoch": 2.77, + "learning_rate": 4.2200452147701575e-08, + "logits/chosen": -2.167794704437256, + "logits/rejected": -2.215398073196411, + "logps/chosen": -295.13031005859375, + "logps/rejected": -371.29888916015625, + "loss": 0.016, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8232637643814087, - "rewards/margins": 10.688947677612305, - "rewards/rejected": -9.865682601928711, + "rewards/chosen": -0.7187148928642273, + "rewards/margins": 9.774347305297852, + "rewards/rejected": -10.493062973022461, "step": 5450 }, { - "epoch": 2.67, - "learning_rate": 6.161780673181324e-08, - "logits/chosen": -2.8220622539520264, - "logits/rejected": -2.812959671020508, - "logps/chosen": -255.7144012451172, - "logps/rejected": -312.28118896484375, - "loss": 0.0103, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2427293062210083, - "rewards/margins": 11.159791946411133, - "rewards/rejected": -9.917062759399414, + "epoch": 2.78, + "learning_rate": 4.1258477769404675e-08, + "logits/chosen": -2.213601589202881, + "logits/rejected": -2.3222367763519287, + "logps/chosen": -304.4975280761719, + "logps/rejected": -384.44464111328125, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.047443341463804245, + "rewards/margins": 11.246149063110352, + "rewards/rejected": -11.198705673217773, "step": 5460 }, { - "epoch": 2.67, - "learning_rate": 6.071299312341658e-08, - "logits/chosen": -2.799553155899048, - "logits/rejected": -2.831979274749756, - "logps/chosen": -339.5647277832031, - "logps/rejected": -419.84967041015625, - "loss": 0.0159, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5822495818138123, - "rewards/margins": 10.838584899902344, - "rewards/rejected": -10.256335258483887, + "epoch": 2.78, + "learning_rate": 4.031650339110776e-08, + "logits/chosen": -2.243669033050537, + "logits/rejected": -2.140501022338867, + "logps/chosen": -241.9927520751953, + "logps/rejected": -313.94708251953125, + "loss": 0.0211, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.9616853594779968, + "rewards/margins": 8.771138191223145, + "rewards/rejected": -9.732824325561523, "step": 5470 }, { - "epoch": 2.68, - "learning_rate": 5.98081795150199e-08, - "logits/chosen": -2.866814136505127, - "logits/rejected": -2.870924711227417, - "logps/chosen": -335.62396240234375, - "logps/rejected": -351.0826110839844, - "loss": 0.0097, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.400399774312973, - "rewards/margins": 9.191247940063477, - "rewards/rejected": -8.790847778320312, + "epoch": 2.79, + "learning_rate": 3.9374529012810856e-08, + "logits/chosen": -2.2058587074279785, + "logits/rejected": -2.1941728591918945, + "logps/chosen": -274.68304443359375, + "logps/rejected": -333.4448547363281, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1749818325042725, + "rewards/margins": 9.428367614746094, + "rewards/rejected": -10.603350639343262, "step": 5480 }, { - "epoch": 2.68, - "learning_rate": 5.890336590662323e-08, - "logits/chosen": -2.84086537361145, - "logits/rejected": -2.8465240001678467, - "logps/chosen": -271.5531005859375, - "logps/rejected": -331.49761962890625, - "loss": 0.0104, + "epoch": 2.79, + "learning_rate": 3.8432554634513943e-08, + "logits/chosen": -2.2903943061828613, + "logits/rejected": -2.271043300628662, + "logps/chosen": -298.74688720703125, + "logps/rejected": -383.7245178222656, + "loss": 0.021, "rewards/accuracies": 1.0, - "rewards/chosen": -0.6313773393630981, - "rewards/margins": 9.69230842590332, - "rewards/rejected": -10.323686599731445, + "rewards/chosen": -0.09447214752435684, + "rewards/margins": 10.52599811553955, + "rewards/rejected": -10.620469093322754, "step": 5490 }, { - "epoch": 2.69, - "learning_rate": 5.799855229822656e-08, - "logits/chosen": -2.849621534347534, - "logits/rejected": -2.840254068374634, - "logps/chosen": -275.830322265625, - "logps/rejected": -288.81439208984375, - "loss": 0.0201, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.040472228080034256, - "rewards/margins": 9.257044792175293, - "rewards/rejected": -9.297517776489258, + "epoch": 2.8, + "learning_rate": 3.749058025621703e-08, + "logits/chosen": -2.282285690307617, + "logits/rejected": -2.341499090194702, + "logps/chosen": -267.6167297363281, + "logps/rejected": -344.1416320800781, + "loss": 0.0161, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.8351944088935852, + "rewards/margins": 9.350050926208496, + "rewards/rejected": -10.1852445602417, "step": 5500 }, { - "epoch": 2.69, - "eval_logits/chosen": -2.803006172180176, - "eval_logits/rejected": -2.799884557723999, - "eval_logps/chosen": -336.4127502441406, - "eval_logps/rejected": -333.1279602050781, - "eval_loss": 0.6544587016105652, - "eval_rewards/accuracies": 0.7738970518112183, - "eval_rewards/chosen": -2.155174970626831, - "eval_rewards/margins": 4.204307556152344, - "eval_rewards/rejected": -6.359483242034912, - "eval_runtime": 303.1821, - "eval_samples_per_second": 7.164, - "eval_steps_per_second": 0.449, + "epoch": 2.8, + "eval_logits/chosen": -2.3620362281799316, + "eval_logits/rejected": -2.433206558227539, + "eval_logps/chosen": -317.4207763671875, + "eval_logps/rejected": -328.7700500488281, + "eval_loss": 0.7901568412780762, + "eval_rewards/accuracies": 0.7480915784835815, + "eval_rewards/chosen": -3.91702938079834, + "eval_rewards/margins": 3.1464624404907227, + "eval_rewards/rejected": -7.0634918212890625, + "eval_runtime": 297.0888, + "eval_samples_per_second": 7.021, + "eval_steps_per_second": 0.441, "step": 5500 }, { - "epoch": 2.69, - "learning_rate": 5.709373868982989e-08, - "logits/chosen": -2.847914695739746, - "logits/rejected": -2.842100143432617, - "logps/chosen": -322.5870056152344, - "logps/rejected": -336.5581970214844, - "loss": 0.0103, + "epoch": 2.8, + "learning_rate": 3.6548605877920125e-08, + "logits/chosen": -2.2243800163269043, + "logits/rejected": -2.2160937786102295, + "logps/chosen": -292.40545654296875, + "logps/rejected": -356.87054443359375, + "loss": 0.0155, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.33272266387939453, - "rewards/margins": 9.429616928100586, - "rewards/rejected": -9.096895217895508, + "rewards/chosen": -0.35705074667930603, + "rewards/margins": 10.191495895385742, + "rewards/rejected": -10.548547744750977, "step": 5510 }, { - "epoch": 2.7, - "learning_rate": 5.618892508143322e-08, - "logits/chosen": -2.797506809234619, - "logits/rejected": -2.7999701499938965, - "logps/chosen": -282.1653747558594, - "logps/rejected": -360.48211669921875, - "loss": 0.0126, + "epoch": 2.81, + "learning_rate": 3.560663149962321e-08, + "logits/chosen": -2.180793046951294, + "logits/rejected": -2.1247339248657227, + "logps/chosen": -276.05218505859375, + "logps/rejected": -372.5242614746094, + "loss": 0.0102, "rewards/accuracies": 1.0, - "rewards/chosen": -0.25770294666290283, - "rewards/margins": 9.75844669342041, - "rewards/rejected": -10.016149520874023, + "rewards/chosen": -0.866191029548645, + "rewards/margins": 9.306673049926758, + "rewards/rejected": -10.172863006591797, "step": 5520 }, { - "epoch": 2.7, - "learning_rate": 5.528411147303655e-08, - "logits/chosen": -2.775840997695923, - "logits/rejected": -2.8263487815856934, - "logps/chosen": -365.1943054199219, - "logps/rejected": -357.63348388671875, - "loss": 0.0079, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.03896459937095642, - "rewards/margins": 9.972452163696289, - "rewards/rejected": -9.933488845825195, + "epoch": 2.81, + "learning_rate": 3.46646571213263e-08, + "logits/chosen": -2.189734935760498, + "logits/rejected": -2.2070634365081787, + "logps/chosen": -317.32305908203125, + "logps/rejected": -377.46575927734375, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.114651083946228, + "rewards/margins": 9.642301559448242, + "rewards/rejected": -10.756954193115234, "step": 5530 }, { - "epoch": 2.71, - "learning_rate": 5.437929786463988e-08, - "logits/chosen": -2.804297685623169, - "logits/rejected": -2.8283231258392334, - "logps/chosen": -307.0234069824219, - "logps/rejected": -403.87835693359375, - "loss": 0.0167, + "epoch": 2.82, + "learning_rate": 3.372268274302939e-08, + "logits/chosen": -2.2536730766296387, + "logits/rejected": -2.286621570587158, + "logps/chosen": -284.72882080078125, + "logps/rejected": -376.8185119628906, + "loss": 0.0071, "rewards/accuracies": 1.0, - "rewards/chosen": -0.12525925040245056, - "rewards/margins": 10.878571510314941, - "rewards/rejected": -11.00383186340332, + "rewards/chosen": -0.5705639123916626, + "rewards/margins": 10.194475173950195, + "rewards/rejected": -10.765039443969727, "step": 5540 }, { - "epoch": 2.71, - "learning_rate": 5.347448425624321e-08, - "logits/chosen": -2.8549203872680664, - "logits/rejected": -2.8703112602233887, - "logps/chosen": -311.4620666503906, - "logps/rejected": -363.9953918457031, - "loss": 0.0092, + "epoch": 2.82, + "learning_rate": 3.278070836473248e-08, + "logits/chosen": -2.196479320526123, + "logits/rejected": -2.2336013317108154, + "logps/chosen": -239.5353546142578, + "logps/rejected": -358.18426513671875, + "loss": 0.0159, "rewards/accuracies": 1.0, - "rewards/chosen": -0.9108712077140808, - "rewards/margins": 9.696334838867188, - "rewards/rejected": -10.607206344604492, + "rewards/chosen": -0.5057913064956665, + "rewards/margins": 9.998066902160645, + "rewards/rejected": -10.50385856628418, "step": 5550 }, { - "epoch": 2.72, - "learning_rate": 5.2569670647846546e-08, - "logits/chosen": -2.7483763694763184, - "logits/rejected": -2.751288890838623, - "logps/chosen": -436.1394958496094, - "logps/rejected": -402.9700622558594, - "loss": 0.0159, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.8061929941177368, - "rewards/margins": 11.283575057983398, - "rewards/rejected": -9.477383613586426, + "epoch": 2.83, + "learning_rate": 3.183873398643557e-08, + "logits/chosen": -2.253528118133545, + "logits/rejected": -2.2258002758026123, + "logps/chosen": -307.4124755859375, + "logps/rejected": -409.0328369140625, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3856652081012726, + "rewards/margins": 10.914469718933105, + "rewards/rejected": -11.300134658813477, "step": 5560 }, { - "epoch": 2.72, - "learning_rate": 5.1664857039449875e-08, - "logits/chosen": -2.870236873626709, - "logits/rejected": -2.8603241443634033, - "logps/chosen": -300.8960266113281, - "logps/rejected": -347.0853271484375, - "loss": 0.0073, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.2610408663749695, - "rewards/margins": 9.532546997070312, - "rewards/rejected": -9.271507263183594, + "epoch": 2.83, + "learning_rate": 3.089675960813866e-08, + "logits/chosen": -2.326773166656494, + "logits/rejected": -2.3797287940979004, + "logps/chosen": -290.23492431640625, + "logps/rejected": -325.58428955078125, + "loss": 0.0089, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.4582270085811615, + "rewards/margins": 9.405486106872559, + "rewards/rejected": -9.863713264465332, "step": 5570 }, { - "epoch": 2.73, - "learning_rate": 5.0760043431053204e-08, - "logits/chosen": -2.8325443267822266, - "logits/rejected": -2.8421969413757324, - "logps/chosen": -315.81414794921875, - "logps/rejected": -347.36016845703125, - "loss": 0.0254, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.9280185699462891, - "rewards/margins": 9.7285737991333, - "rewards/rejected": -10.656591415405273, + "epoch": 2.84, + "learning_rate": 2.995478522984175e-08, + "logits/chosen": -2.2719123363494873, + "logits/rejected": -2.116914987564087, + "logps/chosen": -295.5330810546875, + "logps/rejected": -360.0628967285156, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7894667387008667, + "rewards/margins": 9.751416206359863, + "rewards/rejected": -10.54088306427002, "step": 5580 }, { - "epoch": 2.73, - "learning_rate": 4.985522982265653e-08, - "logits/chosen": -2.864389181137085, - "logits/rejected": -2.8822696208953857, - "logps/chosen": -381.22772216796875, - "logps/rejected": -429.4710998535156, - "loss": 0.0131, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.771894097328186, - "rewards/margins": 10.640081405639648, - "rewards/rejected": -9.868186950683594, + "epoch": 2.84, + "learning_rate": 2.9012810851544836e-08, + "logits/chosen": -2.2664589881896973, + "logits/rejected": -2.322875499725342, + "logps/chosen": -259.9700012207031, + "logps/rejected": -339.7001647949219, + "loss": 0.0156, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.6178351640701294, + "rewards/margins": 9.064801216125488, + "rewards/rejected": -9.682635307312012, "step": 5590 }, { - "epoch": 2.74, - "learning_rate": 4.895041621425986e-08, - "logits/chosen": -2.8606672286987305, - "logits/rejected": -2.893714427947998, - "logps/chosen": -350.6031799316406, - "logps/rejected": -359.85723876953125, - "loss": 0.0067, + "epoch": 2.85, + "learning_rate": 2.8070836473247926e-08, + "logits/chosen": -2.291879177093506, + "logits/rejected": -2.1940712928771973, + "logps/chosen": -242.53451538085938, + "logps/rejected": -346.9772033691406, + "loss": 0.0056, "rewards/accuracies": 1.0, - "rewards/chosen": 1.3215224742889404, - "rewards/margins": 10.639204025268555, - "rewards/rejected": -9.317682266235352, + "rewards/chosen": -0.4267337918281555, + "rewards/margins": 10.6320161819458, + "rewards/rejected": -11.058749198913574, "step": 5600 }, { - "epoch": 2.74, - "eval_logits/chosen": -2.8095755577087402, - "eval_logits/rejected": -2.8061280250549316, - "eval_logps/chosen": -338.3343505859375, - "eval_logps/rejected": -335.3709411621094, - "eval_loss": 0.6635177135467529, - "eval_rewards/accuracies": 0.7757353186607361, - "eval_rewards/chosen": -2.3473329544067383, - "eval_rewards/margins": 4.23644495010376, - "eval_rewards/rejected": -6.58377742767334, - "eval_runtime": 302.8222, - "eval_samples_per_second": 7.173, - "eval_steps_per_second": 0.449, + "epoch": 2.85, + "eval_logits/chosen": -2.3599491119384766, + "eval_logits/rejected": -2.4313082695007324, + "eval_logps/chosen": -317.76324462890625, + "eval_logps/rejected": -328.8216552734375, + "eval_loss": 0.782730758190155, + "eval_rewards/accuracies": 0.7423664331436157, + "eval_rewards/chosen": -3.951274871826172, + "eval_rewards/margins": 3.117375373840332, + "eval_rewards/rejected": -7.068650245666504, + "eval_runtime": 301.8036, + "eval_samples_per_second": 6.912, + "eval_steps_per_second": 0.434, "step": 5600 }, { - "epoch": 2.74, - "learning_rate": 4.804560260586319e-08, - "logits/chosen": -2.756701707839966, - "logits/rejected": -2.77923583984375, - "logps/chosen": -283.5895080566406, - "logps/rejected": -399.92291259765625, - "loss": 0.023, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.2037104368209839, - "rewards/margins": 11.615274429321289, - "rewards/rejected": -11.411565780639648, + "epoch": 2.85, + "learning_rate": 2.7128862094951014e-08, + "logits/chosen": -2.20296573638916, + "logits/rejected": -2.3706116676330566, + "logps/chosen": -271.8478088378906, + "logps/rejected": -355.76934814453125, + "loss": 0.0257, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.4428713917732239, + "rewards/margins": 9.450614929199219, + "rewards/rejected": -9.893485069274902, "step": 5610 }, { - "epoch": 2.75, - "learning_rate": 4.714078899746652e-08, - "logits/chosen": -2.8051834106445312, - "logits/rejected": -2.7867562770843506, - "logps/chosen": -327.09759521484375, - "logps/rejected": -339.6051025390625, - "loss": 0.0457, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.08774641156196594, - "rewards/margins": 9.319291114807129, - "rewards/rejected": -9.407037734985352, + "epoch": 2.86, + "learning_rate": 2.6186887716654104e-08, + "logits/chosen": -2.257016181945801, + "logits/rejected": -2.2679696083068848, + "logps/chosen": -274.9690856933594, + "logps/rejected": -358.05609130859375, + "loss": 0.0189, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.6042969822883606, + "rewards/margins": 9.57944107055664, + "rewards/rejected": -10.183736801147461, "step": 5620 }, { - "epoch": 2.75, - "learning_rate": 4.623597538906985e-08, - "logits/chosen": -2.8442134857177734, - "logits/rejected": -2.869610548019409, - "logps/chosen": -318.1136169433594, - "logps/rejected": -350.0159912109375, - "loss": 0.0082, + "epoch": 2.86, + "learning_rate": 2.5244913338357195e-08, + "logits/chosen": -2.234752893447876, + "logits/rejected": -2.2068352699279785, + "logps/chosen": -286.51171875, + "logps/rejected": -344.405517578125, + "loss": 0.0123, "rewards/accuracies": 1.0, - "rewards/chosen": 0.030749324709177017, - "rewards/margins": 9.704257011413574, - "rewards/rejected": -9.673507690429688, + "rewards/chosen": -0.32843226194381714, + "rewards/margins": 9.54783821105957, + "rewards/rejected": -9.876269340515137, "step": 5630 }, { - "epoch": 2.76, - "learning_rate": 4.5331161780673176e-08, - "logits/chosen": -2.83524489402771, - "logits/rejected": -2.8693079948425293, - "logps/chosen": -318.2431335449219, - "logps/rejected": -433.58172607421875, - "loss": 0.0128, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.4792313575744629, - "rewards/margins": 11.19707202911377, - "rewards/rejected": -10.717839241027832, + "epoch": 2.87, + "learning_rate": 2.4302938960060285e-08, + "logits/chosen": -2.1969046592712402, + "logits/rejected": -2.1973648071289062, + "logps/chosen": -300.8368835449219, + "logps/rejected": -396.38775634765625, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2708401381969452, + "rewards/margins": 9.721776008605957, + "rewards/rejected": -9.992616653442383, "step": 5640 }, { - "epoch": 2.76, - "learning_rate": 4.442634817227651e-08, - "logits/chosen": -2.778451919555664, - "logits/rejected": -2.7945749759674072, - "logps/chosen": -432.8408203125, - "logps/rejected": -408.1561279296875, - "loss": 0.0168, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.20977607369422913, - "rewards/margins": 9.997663497924805, - "rewards/rejected": -9.78788948059082, + "epoch": 2.87, + "learning_rate": 2.3360964581763373e-08, + "logits/chosen": -2.2528560161590576, + "logits/rejected": -2.3048038482666016, + "logps/chosen": -299.9004211425781, + "logps/rejected": -379.5865783691406, + "loss": 0.0146, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.9191215634346008, + "rewards/margins": 9.807453155517578, + "rewards/rejected": -10.72657585144043, "step": 5650 }, { - "epoch": 2.77, - "learning_rate": 4.352153456387984e-08, - "logits/chosen": -2.795010805130005, - "logits/rejected": -2.7964773178100586, - "logps/chosen": -302.0987243652344, - "logps/rejected": -362.7620544433594, - "loss": 0.0112, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.18427368998527527, - "rewards/margins": 10.081182479858398, - "rewards/rejected": -9.896909713745117, - "step": 5660 - }, - { - "epoch": 2.77, - "learning_rate": 4.261672095548317e-08, - "logits/chosen": -2.806636095046997, - "logits/rejected": -2.780097723007202, - "logps/chosen": -296.74658203125, - "logps/rejected": -376.2073059082031, - "loss": 0.0086, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.11439491808414459, - "rewards/margins": 10.658102035522461, - "rewards/rejected": -10.543708801269531, - "step": 5670 - }, - { - "epoch": 2.77, - "learning_rate": 4.17119073470865e-08, - "logits/chosen": -2.7787232398986816, - "logits/rejected": -2.7881293296813965, - "logps/chosen": -326.5613098144531, - "logps/rejected": -340.31182861328125, - "loss": 0.0128, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.28507500886917114, - "rewards/margins": 9.609560012817383, - "rewards/rejected": -9.324484825134277, - "step": 5680 - }, - { - "epoch": 2.78, - "learning_rate": 4.0807093738689826e-08, - "logits/chosen": -2.8198859691619873, - "logits/rejected": -2.8190059661865234, - "logps/chosen": -338.02001953125, - "logps/rejected": -383.8334045410156, - "loss": 0.0135, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7389591336250305, - "rewards/margins": 10.881532669067383, - "rewards/rejected": -10.142572402954102, - "step": 5690 - }, - { - "epoch": 2.78, - "learning_rate": 3.9902280130293154e-08, - "logits/chosen": -2.8719403743743896, - "logits/rejected": -2.8731768131256104, - "logps/chosen": -349.32489013671875, - "logps/rejected": -325.7822570800781, - "loss": 0.0061, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.18898944556713104, - "rewards/margins": 9.255106925964355, - "rewards/rejected": -9.066117286682129, - "step": 5700 - }, - { - "epoch": 2.78, - "eval_logits/chosen": -2.8147506713867188, - "eval_logits/rejected": -2.8122363090515137, - "eval_logps/chosen": -336.9588928222656, - "eval_logps/rejected": -333.6155700683594, - "eval_loss": 0.6531806588172913, - "eval_rewards/accuracies": 0.78125, - "eval_rewards/chosen": -2.2097904682159424, - "eval_rewards/margins": 4.198454856872559, - "eval_rewards/rejected": -6.40824556350708, - "eval_runtime": 303.0654, - "eval_samples_per_second": 7.167, - "eval_steps_per_second": 0.449, - "step": 5700 - }, - { - "epoch": 2.79, - "learning_rate": 3.899746652189649e-08, - "logits/chosen": -2.8471293449401855, - "logits/rejected": -2.868624448776245, - "logps/chosen": -330.0867919921875, - "logps/rejected": -342.5035705566406, - "loss": 0.0102, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.2724529802799225, - "rewards/margins": 9.562896728515625, - "rewards/rejected": -9.29044246673584, - "step": 5710 - }, - { - "epoch": 2.79, - "learning_rate": 3.809265291349982e-08, - "logits/chosen": -2.8190321922302246, - "logits/rejected": -2.7897706031799316, - "logps/chosen": -332.739990234375, - "logps/rejected": -435.59429931640625, - "loss": 0.0095, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.5346451997756958, - "rewards/margins": 11.004878044128418, - "rewards/rejected": -10.470232963562012, - "step": 5720 - }, - { - "epoch": 2.8, - "learning_rate": 3.718783930510315e-08, - "logits/chosen": -2.864314317703247, - "logits/rejected": -2.852177143096924, - "logps/chosen": -314.70977783203125, - "logps/rejected": -346.8056945800781, - "loss": 0.0209, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.020382285118103, - "rewards/margins": 10.600324630737305, - "rewards/rejected": -9.57994270324707, - "step": 5730 - }, - { - "epoch": 2.8, - "learning_rate": 3.6283025696706476e-08, - "logits/chosen": -2.8252153396606445, - "logits/rejected": -2.8308401107788086, - "logps/chosen": -348.7105407714844, - "logps/rejected": -376.7633361816406, - "loss": 0.0143, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.2877705991268158, - "rewards/margins": 9.78177261352539, - "rewards/rejected": -9.494003295898438, - "step": 5740 - }, - { - "epoch": 2.81, - "learning_rate": 3.537821208830981e-08, - "logits/chosen": -2.76407790184021, - "logits/rejected": -2.774273633956909, - "logps/chosen": -296.7833251953125, - "logps/rejected": -378.09368896484375, - "loss": 0.0101, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.6109660863876343, - "rewards/margins": 10.392324447631836, - "rewards/rejected": -9.781356811523438, - "step": 5750 - }, - { - "epoch": 2.81, - "learning_rate": 3.447339847991314e-08, - "logits/chosen": -2.845273494720459, - "logits/rejected": -2.816277027130127, - "logps/chosen": -332.2270202636719, - "logps/rejected": -376.72222900390625, - "loss": 0.0061, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.2786130905151367, - "rewards/margins": 9.380584716796875, - "rewards/rejected": -9.659197807312012, - "step": 5760 - }, - { - "epoch": 2.82, - "learning_rate": 3.356858487151647e-08, - "logits/chosen": -2.786465883255005, - "logits/rejected": -2.7717623710632324, - "logps/chosen": -277.74676513671875, - "logps/rejected": -348.41448974609375, - "loss": 0.0071, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.39390698075294495, - "rewards/margins": 9.836775779724121, - "rewards/rejected": -10.230684280395508, - "step": 5770 - }, - { - "epoch": 2.82, - "learning_rate": 3.26637712631198e-08, - "logits/chosen": -2.7894585132598877, - "logits/rejected": -2.7859456539154053, - "logps/chosen": -289.73211669921875, - "logps/rejected": -381.6850891113281, - "loss": 0.0142, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.09570641815662384, - "rewards/margins": 10.305830001831055, - "rewards/rejected": -10.210124015808105, - "step": 5780 - }, - { - "epoch": 2.83, - "learning_rate": 3.1758957654723126e-08, - "logits/chosen": -2.867675542831421, - "logits/rejected": -2.8739678859710693, - "logps/chosen": -320.26922607421875, - "logps/rejected": -405.62054443359375, - "loss": 0.0101, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.255063533782959, - "rewards/margins": 11.12975025177002, - "rewards/rejected": -9.874685287475586, - "step": 5790 - }, - { - "epoch": 2.83, - "learning_rate": 3.0854144046326454e-08, - "logits/chosen": -2.809892177581787, - "logits/rejected": -2.8058743476867676, - "logps/chosen": -271.73394775390625, - "logps/rejected": -355.0030212402344, - "loss": 0.0094, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.29362156987190247, - "rewards/margins": 9.553832054138184, - "rewards/rejected": -9.847452163696289, - "step": 5800 - }, - { - "epoch": 2.83, - "eval_logits/chosen": -2.8063340187072754, - "eval_logits/rejected": -2.8032078742980957, - "eval_logps/chosen": -337.4847412109375, - "eval_logps/rejected": -334.0425720214844, - "eval_loss": 0.6483454704284668, - "eval_rewards/accuracies": 0.779411792755127, - "eval_rewards/chosen": -2.262376308441162, - "eval_rewards/margins": 4.188565254211426, - "eval_rewards/rejected": -6.45094108581543, - "eval_runtime": 302.8057, - "eval_samples_per_second": 7.173, - "eval_steps_per_second": 0.449, - "step": 5800 - }, - { - "epoch": 2.84, - "learning_rate": 2.994933043792978e-08, - "logits/chosen": -2.7310791015625, - "logits/rejected": -2.750136137008667, - "logps/chosen": -384.9732666015625, - "logps/rejected": -358.75103759765625, - "loss": 0.0102, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.6269290447235107, - "rewards/margins": 9.520708084106445, - "rewards/rejected": -8.893778800964355, - "step": 5810 - }, - { - "epoch": 2.84, - "learning_rate": 2.9044516829533115e-08, - "logits/chosen": -2.824110507965088, - "logits/rejected": -2.832254648208618, - "logps/chosen": -365.55718994140625, - "logps/rejected": -322.91949462890625, - "loss": 0.024, + "epoch": 2.88, + "learning_rate": 2.2418990203466463e-08, + "logits/chosen": -2.2575693130493164, + "logits/rejected": -2.1442599296569824, + "logps/chosen": -276.9645690917969, + "logps/rejected": -345.7447814941406, + "loss": 0.0174, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.19977617263793945, - "rewards/margins": 8.714005470275879, - "rewards/rejected": -8.514229774475098, - "step": 5820 - }, - { - "epoch": 2.85, - "learning_rate": 2.8139703221136444e-08, - "logits/chosen": -2.840216636657715, - "logits/rejected": -2.8146824836730957, - "logps/chosen": -344.5521240234375, - "logps/rejected": -352.198486328125, - "loss": 0.0335, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.1767922341823578, - "rewards/margins": 10.290531158447266, - "rewards/rejected": -10.113738059997559, - "step": 5830 + "rewards/chosen": -0.8034262657165527, + "rewards/margins": 9.3135404586792, + "rewards/rejected": -10.11696720123291, + "step": 5660 }, { - "epoch": 2.85, - "learning_rate": 2.7234889612739776e-08, - "logits/chosen": -2.839446544647217, - "logits/rejected": -2.8432693481445312, - "logps/chosen": -321.9909362792969, - "logps/rejected": -337.4820861816406, - "loss": 0.0072, + "epoch": 2.88, + "learning_rate": 2.1477015825169554e-08, + "logits/chosen": -2.3129067420959473, + "logits/rejected": -2.185722827911377, + "logps/chosen": -285.9835510253906, + "logps/rejected": -367.640380859375, + "loss": 0.0281, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9593833684921265, - "rewards/margins": 11.449156761169434, - "rewards/rejected": -10.489773750305176, - "step": 5840 - }, - { - "epoch": 2.86, - "learning_rate": 2.6330076004343104e-08, - "logits/chosen": -2.8397727012634277, - "logits/rejected": -2.850811243057251, - "logps/chosen": -398.1559753417969, - "logps/rejected": -384.4075622558594, - "loss": 0.005, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7598093748092651, - "rewards/margins": 10.206931114196777, - "rewards/rejected": -9.447120666503906, - "step": 5850 - }, - { - "epoch": 2.86, - "learning_rate": 2.5425262395946433e-08, - "logits/chosen": -2.848437786102295, - "logits/rejected": -2.858236789703369, - "logps/chosen": -352.17510986328125, - "logps/rejected": -346.6148986816406, - "loss": 0.0117, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.1418819427490234, - "rewards/margins": 9.778833389282227, - "rewards/rejected": -8.636951446533203, - "step": 5860 - }, - { - "epoch": 2.87, - "learning_rate": 2.4520448787549765e-08, - "logits/chosen": -2.8752691745758057, - "logits/rejected": -2.8807597160339355, - "logps/chosen": -328.90899658203125, - "logps/rejected": -364.1043395996094, - "loss": 0.0142, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -0.18656238913536072, - "rewards/margins": 9.28158187866211, - "rewards/rejected": -9.468144416809082, - "step": 5870 + "rewards/chosen": -0.37922757863998413, + "rewards/margins": 10.11483383178711, + "rewards/rejected": -10.494062423706055, + "step": 5670 }, { - "epoch": 2.87, - "learning_rate": 2.3615635179153094e-08, - "logits/chosen": -2.7788753509521484, - "logits/rejected": -2.803744077682495, - "logps/chosen": -323.4560241699219, - "logps/rejected": -346.2277526855469, - "loss": 0.0329, + "epoch": 2.89, + "learning_rate": 2.053504144687264e-08, + "logits/chosen": -2.2857284545898438, + "logits/rejected": -2.3002820014953613, + "logps/chosen": -289.5118103027344, + "logps/rejected": -368.2225646972656, + "loss": 0.0082, "rewards/accuracies": 1.0, - "rewards/chosen": 0.6372056603431702, - "rewards/margins": 9.732200622558594, - "rewards/rejected": -9.09499454498291, - "step": 5880 + "rewards/chosen": -0.49899688363075256, + "rewards/margins": 10.350085258483887, + "rewards/rejected": -10.84908390045166, + "step": 5680 }, { - "epoch": 2.88, - "learning_rate": 2.2710821570756422e-08, - "logits/chosen": -2.803210735321045, - "logits/rejected": -2.839939594268799, - "logps/chosen": -367.9231872558594, - "logps/rejected": -374.4939880371094, + "epoch": 2.89, + "learning_rate": 1.9593067068575735e-08, + "logits/chosen": -2.275081157684326, + "logits/rejected": -2.1859753131866455, + "logps/chosen": -254.21267700195312, + "logps/rejected": -303.00335693359375, "loss": 0.014, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6905131936073303, - "rewards/margins": 10.809957504272461, - "rewards/rejected": -10.119444847106934, - "step": 5890 - }, - { - "epoch": 2.88, - "learning_rate": 2.180600796235975e-08, - "logits/chosen": -2.81392765045166, - "logits/rejected": -2.794023036956787, - "logps/chosen": -311.6784362792969, - "logps/rejected": -357.15960693359375, - "loss": 0.0128, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.29847970604896545, - "rewards/margins": 10.090739250183105, - "rewards/rejected": -9.792258262634277, - "step": 5900 - }, - { - "epoch": 2.88, - "eval_logits/chosen": -2.7943384647369385, - "eval_logits/rejected": -2.790273427963257, - "eval_logps/chosen": -338.63714599609375, - "eval_logps/rejected": -335.0156555175781, - "eval_loss": 0.6510014533996582, - "eval_rewards/accuracies": 0.783088207244873, - "eval_rewards/chosen": -2.377617835998535, - "eval_rewards/margins": 4.170632839202881, - "eval_rewards/rejected": -6.548251628875732, - "eval_runtime": 303.1241, - "eval_samples_per_second": 7.165, - "eval_steps_per_second": 0.449, - "step": 5900 - }, - { - "epoch": 2.89, - "learning_rate": 2.0901194353963083e-08, - "logits/chosen": -2.840933084487915, - "logits/rejected": -2.8567490577697754, - "logps/chosen": -306.9349670410156, - "logps/rejected": -388.85882568359375, - "loss": 0.0128, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.061295557767152786, - "rewards/margins": 10.640839576721191, - "rewards/rejected": -10.70213508605957, - "step": 5910 + "rewards/chosen": -0.5944699048995972, + "rewards/margins": 8.801831245422363, + "rewards/rejected": -9.396299362182617, + "step": 5690 }, { - "epoch": 2.89, - "learning_rate": 1.9996380745566412e-08, - "logits/chosen": -2.8274033069610596, - "logits/rejected": -2.8213577270507812, - "logps/chosen": -305.7401123046875, - "logps/rejected": -321.970458984375, - "loss": 0.0158, + "epoch": 2.9, + "learning_rate": 1.8651092690278825e-08, + "logits/chosen": -2.1943910121917725, + "logits/rejected": -2.2771873474121094, + "logps/chosen": -303.30877685546875, + "logps/rejected": -356.7920227050781, + "loss": 0.0083, "rewards/accuracies": 1.0, - "rewards/chosen": 0.6986391544342041, - "rewards/margins": 11.060417175292969, - "rewards/rejected": -10.361778259277344, - "step": 5920 + "rewards/chosen": -0.005204296205192804, + "rewards/margins": 10.570098876953125, + "rewards/rejected": -10.575301170349121, + "step": 5700 }, { "epoch": 2.9, - "learning_rate": 1.9091567137169744e-08, - "logits/chosen": -2.8204190731048584, - "logits/rejected": -2.8155109882354736, - "logps/chosen": -330.60186767578125, - "logps/rejected": -300.04840087890625, - "loss": 0.0109, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.22704121470451355, - "rewards/margins": 8.84349536895752, - "rewards/rejected": -8.616454124450684, - "step": 5930 + "eval_logits/chosen": -2.359806537628174, + "eval_logits/rejected": -2.4324193000793457, + "eval_logps/chosen": -317.05596923828125, + "eval_logps/rejected": -327.8432312011719, + "eval_loss": 0.7741116285324097, + "eval_rewards/accuracies": 0.7442747950553894, + "eval_rewards/chosen": -3.8805482387542725, + "eval_rewards/margins": 3.090261220932007, + "eval_rewards/rejected": -6.970809459686279, + "eval_runtime": 296.9644, + "eval_samples_per_second": 7.024, + "eval_steps_per_second": 0.441, + "step": 5700 }, { "epoch": 2.9, - "learning_rate": 1.8186753528773072e-08, - "logits/chosen": -2.799633502960205, - "logits/rejected": -2.7954487800598145, - "logps/chosen": -327.3326416015625, - "logps/rejected": -369.9510803222656, - "loss": 0.0075, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.024764299392700195, - "rewards/margins": 10.835735321044922, - "rewards/rejected": -10.8109712600708, - "step": 5940 + "learning_rate": 1.7709118311981916e-08, + "logits/chosen": -2.2069544792175293, + "logits/rejected": -2.2963550090789795, + "logps/chosen": -285.5211486816406, + "logps/rejected": -383.9078674316406, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3077685832977295, + "rewards/margins": 10.186443328857422, + "rewards/rejected": -11.49421215057373, + "step": 5710 }, { "epoch": 2.91, - "learning_rate": 1.72819399203764e-08, - "logits/chosen": -2.7677059173583984, - "logits/rejected": -2.7824387550354004, - "logps/chosen": -330.8522033691406, - "logps/rejected": -326.2854919433594, - "loss": 0.0076, + "learning_rate": 1.6767143933685003e-08, + "logits/chosen": -2.2039952278137207, + "logits/rejected": -2.217965841293335, + "logps/chosen": -280.9574890136719, + "logps/rejected": -343.5159912109375, + "loss": 0.0162, "rewards/accuracies": 1.0, - "rewards/chosen": 0.3487926423549652, - "rewards/margins": 10.338022232055664, - "rewards/rejected": -9.98923110961914, - "step": 5950 + "rewards/chosen": -0.5334898829460144, + "rewards/margins": 9.367307662963867, + "rewards/rejected": -9.900795936584473, + "step": 5720 }, { "epoch": 2.91, - "learning_rate": 1.6377126311979733e-08, - "logits/chosen": -2.794630527496338, - "logits/rejected": -2.772195816040039, - "logps/chosen": -244.35543823242188, - "logps/rejected": -292.8790588378906, - "loss": 0.0094, + "learning_rate": 1.5825169555388094e-08, + "logits/chosen": -2.276930332183838, + "logits/rejected": -2.289088249206543, + "logps/chosen": -318.0512390136719, + "logps/rejected": -363.48199462890625, + "loss": 0.011, "rewards/accuracies": 1.0, - "rewards/chosen": -0.9236311912536621, - "rewards/margins": 8.738210678100586, - "rewards/rejected": -9.661842346191406, - "step": 5960 + "rewards/chosen": -0.5276788473129272, + "rewards/margins": 9.740002632141113, + "rewards/rejected": -10.267681121826172, + "step": 5730 }, { "epoch": 2.92, - "learning_rate": 1.5472312703583062e-08, - "logits/chosen": -2.868705987930298, - "logits/rejected": -2.8628172874450684, - "logps/chosen": -310.0809631347656, - "logps/rejected": -373.93353271484375, - "loss": 0.0097, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.24606695771217346, - "rewards/margins": 10.380061149597168, - "rewards/rejected": -10.133995056152344, - "step": 5970 + "learning_rate": 1.4883195177091183e-08, + "logits/chosen": -2.290330410003662, + "logits/rejected": -2.239116907119751, + "logps/chosen": -305.8013916015625, + "logps/rejected": -340.6756896972656, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6274837851524353, + "rewards/margins": 9.25733757019043, + "rewards/rejected": -9.884819984436035, + "step": 5740 }, { "epoch": 2.92, - "learning_rate": 1.456749909518639e-08, - "logits/chosen": -2.7835230827331543, - "logits/rejected": -2.796848773956299, - "logps/chosen": -321.2089538574219, - "logps/rejected": -383.9186706542969, - "loss": 0.0145, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8094062805175781, - "rewards/margins": 11.442567825317383, - "rewards/rejected": -10.633160591125488, - "step": 5980 - }, - { - "epoch": 2.93, - "learning_rate": 1.366268548678972e-08, - "logits/chosen": -2.8277807235717773, - "logits/rejected": -2.8257572650909424, - "logps/chosen": -303.5525207519531, - "logps/rejected": -344.08807373046875, - "loss": 0.0086, + "learning_rate": 1.3941220798794272e-08, + "logits/chosen": -2.2608349323272705, + "logits/rejected": -2.174391984939575, + "logps/chosen": -250.30111694335938, + "logps/rejected": -341.62518310546875, + "loss": 0.0112, "rewards/accuracies": 1.0, - "rewards/chosen": 0.5277048349380493, - "rewards/margins": 11.100028038024902, - "rewards/rejected": -10.5723237991333, - "step": 5990 + "rewards/chosen": -0.7482810616493225, + "rewards/margins": 9.675050735473633, + "rewards/rejected": -10.423332214355469, + "step": 5750 }, { "epoch": 2.93, - "learning_rate": 1.2757871878393051e-08, - "logits/chosen": -2.7910408973693848, - "logits/rejected": -2.8172109127044678, - "logps/chosen": -313.90765380859375, - "logps/rejected": -374.1523742675781, - "loss": 0.0061, + "learning_rate": 1.299924642049736e-08, + "logits/chosen": -2.2983784675598145, + "logits/rejected": -2.2714810371398926, + "logps/chosen": -266.9122009277344, + "logps/rejected": -360.6929931640625, + "loss": 0.0163, "rewards/accuracies": 1.0, - "rewards/chosen": 1.0078392028808594, - "rewards/margins": 11.071653366088867, - "rewards/rejected": -10.063814163208008, - "step": 6000 + "rewards/chosen": -0.6003319621086121, + "rewards/margins": 9.648820877075195, + "rewards/rejected": -10.249155044555664, + "step": 5760 }, { "epoch": 2.93, - "eval_logits/chosen": -2.78775954246521, - "eval_logits/rejected": -2.782463312149048, - "eval_logps/chosen": -339.28759765625, - "eval_logps/rejected": -336.06964111328125, - "eval_loss": 0.654687225818634, - "eval_rewards/accuracies": 0.78125, - "eval_rewards/chosen": -2.442659616470337, - "eval_rewards/margins": 4.210994720458984, - "eval_rewards/rejected": -6.653655052185059, - "eval_runtime": 303.0006, - "eval_samples_per_second": 7.168, - "eval_steps_per_second": 0.449, - "step": 6000 + "learning_rate": 1.2057272042200451e-08, + "logits/chosen": -2.3055496215820312, + "logits/rejected": -2.239773750305176, + "logps/chosen": -259.98968505859375, + "logps/rejected": -362.4081115722656, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32147616147994995, + "rewards/margins": 10.13626480102539, + "rewards/rejected": -10.457742691040039, + "step": 5770 }, { "epoch": 2.94, - "learning_rate": 1.185305826999638e-08, - "logits/chosen": -2.8637053966522217, - "logits/rejected": -2.8752946853637695, - "logps/chosen": -338.73297119140625, - "logps/rejected": -391.7603759765625, - "loss": 0.0076, + "learning_rate": 1.111529766390354e-08, + "logits/chosen": -2.3338782787323, + "logits/rejected": -2.3960072994232178, + "logps/chosen": -288.9483337402344, + "logps/rejected": -317.8904724121094, + "loss": 0.0064, "rewards/accuracies": 1.0, - "rewards/chosen": 0.9966065287590027, - "rewards/margins": 12.010201454162598, - "rewards/rejected": -11.013593673706055, - "step": 6010 + "rewards/chosen": -0.20854076743125916, + "rewards/margins": 9.097440719604492, + "rewards/rejected": -9.305981636047363, + "step": 5780 }, { - "epoch": 2.94, - "learning_rate": 1.094824466159971e-08, - "logits/chosen": -2.762047290802002, - "logits/rejected": -2.7815585136413574, - "logps/chosen": -321.5288391113281, - "logps/rejected": -367.6561279296875, - "loss": 0.0074, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.27254194021224976, - "rewards/margins": 10.39825439453125, - "rewards/rejected": -10.670794486999512, - "step": 6020 + "epoch": 2.95, + "learning_rate": 1.0173323285606632e-08, + "logits/chosen": -2.225872755050659, + "logits/rejected": -2.1147029399871826, + "logps/chosen": -270.2624816894531, + "logps/rejected": -389.66741943359375, + "loss": 0.0197, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.602431058883667, + "rewards/margins": 9.71196460723877, + "rewards/rejected": -11.314395904541016, + "step": 5790 }, { "epoch": 2.95, - "learning_rate": 1.0043431053203039e-08, - "logits/chosen": -2.8213751316070557, - "logits/rejected": -2.8210928440093994, - "logps/chosen": -303.3100891113281, - "logps/rejected": -431.57232666015625, - "loss": 0.0071, + "learning_rate": 9.231348907309721e-09, + "logits/chosen": -2.22969913482666, + "logits/rejected": -2.1869258880615234, + "logps/chosen": -280.09869384765625, + "logps/rejected": -333.8819885253906, + "loss": 0.0243, "rewards/accuracies": 1.0, - "rewards/chosen": 0.29716354608535767, - "rewards/margins": 12.473363876342773, - "rewards/rejected": -12.176200866699219, - "step": 6030 + "rewards/chosen": -0.891821026802063, + "rewards/margins": 8.560599327087402, + "rewards/rejected": -9.452421188354492, + "step": 5800 }, { "epoch": 2.95, - "learning_rate": 9.13861744480637e-09, - "logits/chosen": -2.753966808319092, - "logits/rejected": -2.7236969470977783, - "logps/chosen": -334.3569641113281, - "logps/rejected": -377.1719055175781, - "loss": 0.0093, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.19726666808128357, - "rewards/margins": 10.590192794799805, - "rewards/rejected": -10.392925262451172, - "step": 6040 + "eval_logits/chosen": -2.3620197772979736, + "eval_logits/rejected": -2.4354655742645264, + "eval_logps/chosen": -316.4267578125, + "eval_logps/rejected": -327.04864501953125, + "eval_loss": 0.7656615376472473, + "eval_rewards/accuracies": 0.7404580116271973, + "eval_rewards/chosen": -3.8176305294036865, + "eval_rewards/margins": 3.0737171173095703, + "eval_rewards/rejected": -6.891347408294678, + "eval_runtime": 301.7518, + "eval_samples_per_second": 6.913, + "eval_steps_per_second": 0.434, + "step": 5800 }, { "epoch": 2.96, - "learning_rate": 8.2338038364097e-09, - "logits/chosen": -2.7817087173461914, - "logits/rejected": -2.8001036643981934, - "logps/chosen": -347.7796936035156, - "logps/rejected": -399.48272705078125, - "loss": 0.012, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.5988336801528931, - "rewards/margins": 9.878965377807617, - "rewards/rejected": -9.280133247375488, - "step": 6050 + "learning_rate": 8.289374529012812e-09, + "logits/chosen": -2.155146598815918, + "logits/rejected": -2.232573986053467, + "logps/chosen": -253.884765625, + "logps/rejected": -316.1662292480469, + "loss": 0.0089, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.4325670599937439, + "rewards/margins": 9.85391902923584, + "rewards/rejected": -10.28648567199707, + "step": 5810 }, { "epoch": 2.96, - "learning_rate": 7.328990228013029e-09, - "logits/chosen": -2.804426908493042, - "logits/rejected": -2.833937644958496, - "logps/chosen": -355.1780700683594, - "logps/rejected": -434.5801696777344, - "loss": 0.0196, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.13217031955719, - "rewards/margins": 12.14919376373291, - "rewards/rejected": -11.017023086547852, - "step": 6060 + "learning_rate": 7.3474001507159e-09, + "logits/chosen": -2.3524250984191895, + "logits/rejected": -2.2284231185913086, + "logps/chosen": -325.3186340332031, + "logps/rejected": -358.0306701660156, + "loss": 0.0103, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.22089111804962158, + "rewards/margins": 9.495137214660645, + "rewards/rejected": -9.716028213500977, + "step": 5820 }, { "epoch": 2.97, - "learning_rate": 6.424176619616359e-09, - "logits/chosen": -2.8069968223571777, - "logits/rejected": -2.8338141441345215, - "logps/chosen": -363.4900817871094, - "logps/rejected": -372.24560546875, - "loss": 0.0169, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.15832024812698364, - "rewards/margins": 10.235078811645508, - "rewards/rejected": -10.076757431030273, - "step": 6070 + "learning_rate": 6.4054257724189895e-09, + "logits/chosen": -2.2638158798217773, + "logits/rejected": -2.269150972366333, + "logps/chosen": -270.5865478515625, + "logps/rejected": -370.8131408691406, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4692980647087097, + "rewards/margins": 9.660306930541992, + "rewards/rejected": -10.129603385925293, + "step": 5830 }, { "epoch": 2.97, - "learning_rate": 5.519363011219688e-09, - "logits/chosen": -2.8311402797698975, - "logits/rejected": -2.81882381439209, - "logps/chosen": -363.3807678222656, - "logps/rejected": -374.721923828125, - "loss": 0.017, + "learning_rate": 5.463451394122079e-09, + "logits/chosen": -2.2793831825256348, + "logits/rejected": -2.255838632583618, + "logps/chosen": -314.7529296875, + "logps/rejected": -368.70123291015625, + "loss": 0.0251, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.4465043544769287, - "rewards/margins": 10.296545028686523, - "rewards/rejected": -9.850040435791016, - "step": 6080 + "rewards/chosen": -0.5236554741859436, + "rewards/margins": 9.78014850616455, + "rewards/rejected": -10.303804397583008, + "step": 5840 }, { "epoch": 2.98, - "learning_rate": 4.614549402823018e-09, - "logits/chosen": -2.8037521839141846, - "logits/rejected": -2.8082339763641357, - "logps/chosen": -284.8935546875, - "logps/rejected": -370.9079895019531, - "loss": 0.0119, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.20338019728660583, - "rewards/margins": 10.172778129577637, - "rewards/rejected": -10.376158714294434, - "step": 6090 + "learning_rate": 4.52147701582517e-09, + "logits/chosen": -2.2641196250915527, + "logits/rejected": -2.307553768157959, + "logps/chosen": -280.8160400390625, + "logps/rejected": -338.44342041015625, + "loss": 0.0151, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.047176718711853, + "rewards/margins": 9.032100677490234, + "rewards/rejected": -10.079277038574219, + "step": 5850 }, { "epoch": 2.98, - "learning_rate": 3.709735794426348e-09, - "logits/chosen": -2.7907533645629883, - "logits/rejected": -2.810549259185791, - "logps/chosen": -373.81964111328125, - "logps/rejected": -389.025390625, - "loss": 0.0082, + "learning_rate": 3.579502637528259e-09, + "logits/chosen": -2.2263641357421875, + "logits/rejected": -2.079395294189453, + "logps/chosen": -266.8037109375, + "logps/rejected": -345.18890380859375, + "loss": 0.0102, "rewards/accuracies": 1.0, - "rewards/chosen": 0.43906983733177185, - "rewards/margins": 10.52045726776123, - "rewards/rejected": -10.081387519836426, - "step": 6100 - }, - { - "epoch": 2.98, - "eval_logits/chosen": -2.78605055809021, - "eval_logits/rejected": -2.780413866043091, - "eval_logps/chosen": -339.6953430175781, - "eval_logps/rejected": -336.6893310546875, - "eval_loss": 0.656951904296875, - "eval_rewards/accuracies": 0.78125, - "eval_rewards/chosen": -2.4834330081939697, - "eval_rewards/margins": 4.232187747955322, - "eval_rewards/rejected": -6.715620040893555, - "eval_runtime": 303.3821, - "eval_samples_per_second": 7.159, - "eval_steps_per_second": 0.448, - "step": 6100 - }, - { - "epoch": 2.98, - "learning_rate": 2.8049221860296777e-09, - "logits/chosen": -2.824645757675171, - "logits/rejected": -2.8512518405914307, - "logps/chosen": -313.879150390625, - "logps/rejected": -356.1576843261719, - "loss": 0.014, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.3202458918094635, - "rewards/margins": 10.065518379211426, - "rewards/rejected": -9.745271682739258, - "step": 6110 + "rewards/chosen": -1.0268042087554932, + "rewards/margins": 9.61182689666748, + "rewards/rejected": -10.638631820678711, + "step": 5860 }, { "epoch": 2.99, - "learning_rate": 1.9001085776330076e-09, - "logits/chosen": -2.739125967025757, - "logits/rejected": -2.7443509101867676, - "logps/chosen": -355.3984375, - "logps/rejected": -336.5526428222656, - "loss": 0.0106, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.02041384018957615, - "rewards/margins": 9.25210189819336, - "rewards/rejected": -9.272515296936035, - "step": 6120 + "learning_rate": 2.6375282592313484e-09, + "logits/chosen": -2.2066586017608643, + "logits/rejected": -2.21642804145813, + "logps/chosen": -331.57293701171875, + "logps/rejected": -358.3147277832031, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.113747239112854, + "rewards/margins": 11.221342086791992, + "rewards/rejected": -11.335088729858398, + "step": 5870 }, { "epoch": 2.99, - "learning_rate": 9.952949692363373e-10, - "logits/chosen": -2.789135456085205, - "logits/rejected": -2.8342463970184326, - "logps/chosen": -261.383544921875, - "logps/rejected": -335.1572265625, - "loss": 0.017, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.17767223715782166, - "rewards/margins": 9.392723083496094, - "rewards/rejected": -9.570394515991211, - "step": 6130 + "learning_rate": 1.6955538809344383e-09, + "logits/chosen": -2.4038608074188232, + "logits/rejected": -2.319794178009033, + "logps/chosen": -329.29656982421875, + "logps/rejected": -378.1693115234375, + "loss": 0.0087, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.025945544242858887, + "rewards/margins": 9.82371997833252, + "rewards/rejected": -9.849664688110352, + "step": 5880 }, { "epoch": 3.0, - "learning_rate": 9.048136083966702e-11, - "logits/chosen": -2.816842555999756, - "logits/rejected": -2.8055644035339355, - "logps/chosen": -350.9559631347656, - "logps/rejected": -372.9457092285156, - "loss": 0.0134, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.4857265055179596, - "rewards/margins": 10.015380859375, - "rewards/rejected": -9.529653549194336, - "step": 6140 + "learning_rate": 7.535795026375282e-10, + "logits/chosen": -2.3141093254089355, + "logits/rejected": -2.212184190750122, + "logps/chosen": -255.28628540039062, + "logps/rejected": -306.7519226074219, + "loss": 0.0159, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.7222930192947388, + "rewards/margins": 8.908611297607422, + "rewards/rejected": -9.630905151367188, + "step": 5890 }, { "epoch": 3.0, - "step": 6141, + "step": 5898, "total_flos": 0.0, - "train_loss": 0.23823042969992683, - "train_runtime": 80075.9645, - "train_samples_per_second": 2.454, + "train_loss": 0.22745071912974513, + "train_runtime": 76961.4992, + "train_samples_per_second": 2.452, "train_steps_per_second": 0.077 } ], "logging_steps": 10, - "max_steps": 6141, + "max_steps": 5898, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0,