{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1459, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006856359273225917, "grad_norm": 0.6368917591184686, "kl": 0.0, "learning_rate": 0.0, "logits/chosen": -90308608.0, "logits/rejected": -104988672.0, "logps/chosen": -250.75, "logps/rejected": -287.0, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0013712718546451835, "grad_norm": 0.631866291977487, "kl": 0.0, "learning_rate": 4e-08, "logits/chosen": -98000245.84126984, "logits/rejected": -101824795.56923077, "logps/chosen": -242.53968253968253, "logps/rejected": -307.2, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": -2.1538461538461537, "rewards/rejected": 2.1538461538461537, "step": 2 }, { "epoch": 0.002056907781967775, "grad_norm": 0.7353768896340452, "kl": 0.123046875, "learning_rate": 8e-08, "logits/chosen": -90876586.66666667, "logits/rejected": -91411154.8235294, "logps/chosen": -234.4, "logps/rejected": -363.7647058823529, "loss": 0.5004, "rewards/chosen": -0.0017578125, "rewards/margins": -0.009202306410845589, "rewards/rejected": 0.007444493910845588, "step": 3 }, { "epoch": 0.002742543709290367, "grad_norm": 0.7678725202907154, "kl": 0.01171875, "learning_rate": 1.2e-07, "logits/chosen": -111339706.18181819, "logits/rejected": -87403883.35483871, "logps/chosen": -379.1515151515151, "logps/rejected": -294.7096774193548, "loss": 0.4994, "rewards/chosen": 1.178650595925071, "rewards/margins": 1.1697827783847485, "rewards/rejected": 0.00886781754032258, "step": 4 }, { "epoch": 0.0034281796366129585, "grad_norm": 0.6896166565741775, "kl": 0.0, "learning_rate": 1.6e-07, "logits/chosen": -95108677.1891892, "logits/rejected": -120625076.14814815, "logps/chosen": -325.8378378378378, "logps/rejected": -375.7037037037037, "loss": 0.4983, "rewards/chosen": -0.0003398173564189189, "rewards/margins": 0.009502666729229229, "rewards/rejected": -0.009842484085648149, "step": 5 }, { "epoch": 0.00411381556393555, "grad_norm": 0.7389329278950366, "kl": 0.0693359375, "learning_rate": 2e-07, "logits/chosen": -114923929.6, "logits/rejected": -111272417.88235295, "logps/chosen": -298.4, "logps/rejected": -278.11764705882354, "loss": 0.5023, "rewards/chosen": -0.018825276692708334, "rewards/margins": -0.028529866536458334, "rewards/rejected": 0.00970458984375, "step": 6 }, { "epoch": 0.0047994514912581415, "grad_norm": 0.6035464354468132, "kl": 0.109375, "learning_rate": 2.4e-07, "logits/chosen": -112864907.63636364, "logits/rejected": -81856578.06451613, "logps/chosen": -305.45454545454544, "logps/rejected": -302.7096774193548, "loss": 0.5042, "rewards/chosen": -0.000732421875, "rewards/margins": -0.03547127016129032, "rewards/rejected": 0.03473884828629032, "step": 7 }, { "epoch": 0.005485087418580734, "grad_norm": 0.8115666340801385, "kl": 0.173828125, "learning_rate": 2.8e-07, "logits/chosen": -122738580.21052632, "logits/rejected": -107634111.09859155, "logps/chosen": -426.6666666666667, "logps/rejected": -295.2112676056338, "loss": 0.5002, "rewards/chosen": 1.8806760018331963, "rewards/margins": 1.88261881103214, "rewards/rejected": -0.0019428091989436619, "step": 8 }, { "epoch": 0.0061707233459033254, "grad_norm": 0.6604393826821955, "kl": 0.0546875, "learning_rate": 3.2e-07, "logits/chosen": -113783939.28205128, "logits/rejected": -108380815.36, "logps/chosen": -256.8205128205128, "logps/rejected": -365.76, "loss": 0.4994, "rewards/chosen": 0.005015837840544872, "rewards/margins": 0.009005095653044873, "rewards/rejected": -0.0039892578125, "step": 9 }, { "epoch": 0.006856359273225917, "grad_norm": 0.7014823634189542, "kl": 0.169921875, "learning_rate": 3.6e-07, "logits/chosen": -94827742.60869566, "logits/rejected": -71445347.79661018, "logps/chosen": -360.1159420289855, "logps/rejected": -286.10169491525426, "loss": 0.4985, "rewards/chosen": 0.009181810461956522, "rewards/margins": 0.010605274445007369, "rewards/rejected": -0.0014234639830508474, "step": 10 }, { "epoch": 0.0075419952005485085, "grad_norm": 0.7712198508103478, "kl": 0.05859375, "learning_rate": 4e-07, "logits/chosen": -94123025.3559322, "logits/rejected": -82184043.5942029, "logps/chosen": -249.4915254237288, "logps/rejected": -362.6666666666667, "loss": 0.4969, "rewards/chosen": 0.012066340042372881, "rewards/margins": 0.022504236328604765, "rewards/rejected": -0.010437896286231884, "step": 11 }, { "epoch": 0.0082276311278711, "grad_norm": 0.6199801812718058, "kl": 0.01171875, "learning_rate": 4.3999999999999997e-07, "logits/chosen": -90943162.92063493, "logits/rejected": -80595164.55384615, "logps/chosen": -232.88888888888889, "logps/rejected": -236.30769230769232, "loss": 0.4968, "rewards/chosen": 0.020988343253968252, "rewards/margins": 0.023944322821275944, "rewards/rejected": -0.002955979567307692, "step": 12 }, { "epoch": 0.008913267055193692, "grad_norm": 0.6028559467475199, "kl": 0.041015625, "learning_rate": 4.8e-07, "logits/chosen": -81759390.64788732, "logits/rejected": -96395407.71929824, "logps/chosen": -247.66197183098592, "logps/rejected": -304.280701754386, "loss": 0.5003, "rewards/chosen": -0.004666180677816901, "rewards/margins": -0.17036828106351626, "rewards/rejected": 0.16570210038569935, "step": 13 }, { "epoch": 0.009598902982516283, "grad_norm": 0.7000032708402567, "kl": 0.189453125, "learning_rate": 5.2e-07, "logits/chosen": -98596537.50724638, "logits/rejected": -77914528.54237288, "logps/chosen": -285.2173913043478, "logps/rejected": -317.5593220338983, "loss": 0.5028, "rewards/chosen": 0.0002919072690217391, "rewards/margins": -0.021076604383520634, "rewards/rejected": 0.021368511652542374, "step": 14 }, { "epoch": 0.010284538909838875, "grad_norm": 0.7253536034311295, "kl": 0.189453125, "learning_rate": 5.6e-07, "logits/chosen": -80988197.23636363, "logits/rejected": -95492236.2739726, "logps/chosen": -250.76363636363635, "logps/rejected": -309.47945205479454, "loss": 0.5015, "rewards/chosen": -0.011328125, "rewards/margins": -0.01642497859589041, "rewards/rejected": 0.005096853595890411, "step": 15 }, { "epoch": 0.010970174837161468, "grad_norm": 0.6853722509492658, "kl": 0.0234375, "learning_rate": 6e-07, "logits/chosen": -79225742.22222222, "logits/rejected": -77433304.61538461, "logps/chosen": -261.58730158730157, "logps/rejected": -309.16923076923075, "loss": 0.4996, "rewards/chosen": -0.012997581845238096, "rewards/margins": -0.001004643143315019, "rewards/rejected": -0.011992938701923077, "step": 16 }, { "epoch": 0.011655810764484058, "grad_norm": 0.8768474923753069, "kl": 0.115234375, "learning_rate": 6.4e-07, "logits/chosen": -120350845.3877551, "logits/rejected": -98645782.68354431, "logps/chosen": -408.81632653061223, "logps/rejected": -358.4810126582278, "loss": 0.4986, "rewards/chosen": 2.3091961607641105, "rewards/margins": 2.323214159181832, "rewards/rejected": -0.014017998417721519, "step": 17 }, { "epoch": 0.012341446691806651, "grad_norm": 0.6720013239153537, "kl": 0.09375, "learning_rate": 6.800000000000001e-07, "logits/chosen": -93554306.16949153, "logits/rejected": -101514314.20289855, "logps/chosen": -245.96610169491527, "logps/rejected": -282.6666666666667, "loss": 0.5006, "rewards/chosen": -0.011677370233050847, "rewards/margins": -0.24462362789340525, "rewards/rejected": 0.2329462576603544, "step": 18 }, { "epoch": 0.013027082619129242, "grad_norm": 0.6002673254594435, "kl": 0.0, "learning_rate": 7.2e-07, "logits/chosen": -94755825.57746479, "logits/rejected": -81825720.14035088, "logps/chosen": -294.0845070422535, "logps/rejected": -296.140350877193, "loss": 0.4963, "rewards/chosen": 0.010195450044014084, "rewards/margins": 0.03087458929840005, "rewards/rejected": -0.020679139254385966, "step": 19 }, { "epoch": 0.013712718546451834, "grad_norm": 0.7368798943827564, "kl": 0.0078125, "learning_rate": 7.599999999999999e-07, "logits/chosen": -88486284.38709678, "logits/rejected": -93037288.72727273, "logps/chosen": -252.90322580645162, "logps/rejected": -294.06060606060606, "loss": 0.4978, "rewards/chosen": 0.0035538211945564517, "rewards/margins": 0.015968002065768573, "rewards/rejected": -0.012414180871212122, "step": 20 }, { "epoch": 0.014398354473774426, "grad_norm": 0.7578373516127234, "kl": 0.0625, "learning_rate": 8e-07, "logits/chosen": -98449635.55555555, "logits/rejected": -104483108.57142857, "logps/chosen": -323.3333333333333, "logps/rejected": -399.0, "loss": 0.505, "rewards/chosen": 2.0150633917914496, "rewards/margins": 1.747806569886586, "rewards/rejected": 0.2672568219048636, "step": 21 }, { "epoch": 0.015083990401097017, "grad_norm": 0.6611780950725193, "kl": 0.064453125, "learning_rate": 8.399999999999999e-07, "logits/chosen": -98133398.34920634, "logits/rejected": -102341017.6, "logps/chosen": -250.15873015873015, "logps/rejected": -330.83076923076925, "loss": 0.4998, "rewards/chosen": -0.005867125496031746, "rewards/margins": -0.0008490966498778995, "rewards/rejected": -0.0050180288461538465, "step": 22 }, { "epoch": 0.01576962632841961, "grad_norm": 0.7623180149414707, "kl": 0.16796875, "learning_rate": 8.799999999999999e-07, "logits/chosen": -101403467.29411764, "logits/rejected": -111775478.02597402, "logps/chosen": -220.54901960784315, "logps/rejected": -323.7402597402597, "loss": 0.4969, "rewards/chosen": 7.347671807981005, "rewards/margins": 7.344184084766719, "rewards/rejected": 0.0034877232142857145, "step": 23 }, { "epoch": 0.0164552622557422, "grad_norm": 0.6435813167349861, "kl": 0.052734375, "learning_rate": 9.2e-07, "logits/chosen": -88334584.24242425, "logits/rejected": -91598187.35483871, "logps/chosen": -346.1818181818182, "logps/rejected": -268.38709677419354, "loss": 0.4995, "rewards/chosen": -0.016575668797348484, "rewards/margins": -0.4469061913961073, "rewards/rejected": 0.4303305225987588, "step": 24 }, { "epoch": 0.017140898183064794, "grad_norm": 0.7657760745924768, "kl": 0.037109375, "learning_rate": 9.6e-07, "logits/chosen": -99754530.13333334, "logits/rejected": -82220694.58823529, "logps/chosen": -316.8, "logps/rejected": -336.47058823529414, "loss": 0.4995, "rewards/chosen": 0.0016764322916666666, "rewards/margins": 0.004110657935049019, "rewards/rejected": -0.002434225643382353, "step": 25 }, { "epoch": 0.017826534110387385, "grad_norm": 0.7675382389332193, "kl": 0.095703125, "learning_rate": 1e-06, "logits/chosen": -118403139.14754099, "logits/rejected": -97908827.70149253, "logps/chosen": -419.672131147541, "logps/rejected": -305.43283582089555, "loss": 0.497, "rewards/chosen": 0.0016969774590163934, "rewards/margins": 0.021525204487934303, "rewards/rejected": -0.01982822702891791, "step": 26 }, { "epoch": 0.018512170037709975, "grad_norm": 0.6421141068836281, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103763433.73913044, "logits/rejected": -86374226.44067797, "logps/chosen": -300.9855072463768, "logps/rejected": -285.2881355932203, "loss": 0.5004, "rewards/chosen": -0.017111073369565216, "rewards/margins": -0.006807511399226232, "rewards/rejected": -0.010303561970338984, "step": 27 }, { "epoch": 0.019197805965032566, "grad_norm": 0.6999402727853057, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79874137.04347827, "logits/rejected": -96468992.0, "logps/chosen": -249.97101449275362, "logps/rejected": -362.3050847457627, "loss": 0.4986, "rewards/chosen": 0.0009624094202898551, "rewards/margins": 0.01043630706329833, "rewards/rejected": -0.009473897643008475, "step": 28 }, { "epoch": 0.01988344189235516, "grad_norm": 0.661780393636588, "kl": 0.015625, "learning_rate": 1e-06, "logits/chosen": -115681610.32258065, "logits/rejected": -91384987.15151516, "logps/chosen": -285.03225806451616, "logps/rejected": -296.24242424242425, "loss": 0.4989, "rewards/chosen": 2.0292165202479207, "rewards/margins": 1.822165662125991, "rewards/rejected": 0.20705085812192975, "step": 29 }, { "epoch": 0.02056907781967775, "grad_norm": 0.641319809473504, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104794998.4477612, "logits/rejected": -82304621.1147541, "logps/chosen": -266.5074626865672, "logps/rejected": -310.8196721311475, "loss": 0.4938, "rewards/chosen": 0.022484622784514924, "rewards/margins": 0.05002048344025263, "rewards/rejected": -0.027535860655737706, "step": 30 }, { "epoch": 0.02125471374700034, "grad_norm": 0.7293202682310518, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -70464307.2, "logits/rejected": -102390362.35294117, "logps/chosen": -249.86666666666667, "logps/rejected": -328.94117647058823, "loss": 0.4961, "rewards/chosen": 2.1040828704833983, "rewards/margins": 2.1358211517333983, "rewards/rejected": -0.03173828125, "step": 31 }, { "epoch": 0.021940349674322936, "grad_norm": 0.7677018778182698, "kl": 0.05859375, "learning_rate": 1e-06, "logits/chosen": -90237454.62857144, "logits/rejected": -77160730.48275863, "logps/chosen": -368.9142857142857, "logps/rejected": -328.2758620689655, "loss": 0.5003, "rewards/chosen": -0.031138392857142858, "rewards/margins": 0.001593288177339898, "rewards/rejected": -0.032731681034482756, "step": 32 }, { "epoch": 0.022625985601645526, "grad_norm": 0.6433607404320141, "kl": 0.09375, "learning_rate": 1e-06, "logits/chosen": -95204043.17460318, "logits/rejected": -84208718.76923077, "logps/chosen": -269.7142857142857, "logps/rejected": -262.15384615384613, "loss": 0.4966, "rewards/chosen": 1.6255318777901786, "rewards/margins": 1.6454537527901787, "rewards/rejected": -0.019921875, "step": 33 }, { "epoch": 0.023311621528968117, "grad_norm": 0.6915341088059899, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94706168.57971014, "logits/rejected": -77061449.76271187, "logps/chosen": -371.0144927536232, "logps/rejected": -243.79661016949152, "loss": 0.4976, "rewards/chosen": -0.0030597189198369565, "rewards/margins": 0.02107709914160372, "rewards/rejected": -0.024136818061440676, "step": 34 }, { "epoch": 0.02399725745629071, "grad_norm": 0.797991642335091, "kl": 0.01171875, "learning_rate": 1e-06, "logits/chosen": -90825746.61818182, "logits/rejected": -100663296.0, "logps/chosen": -279.41818181818184, "logps/rejected": -299.3972602739726, "loss": 0.496, "rewards/chosen": -0.0001220703125, "rewards/margins": 0.028679834653253425, "rewards/rejected": -0.028801904965753425, "step": 35 }, { "epoch": 0.024682893383613302, "grad_norm": 0.8587500477516397, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97307852.8, "logits/rejected": -107786955.17460318, "logps/chosen": -329.6, "logps/rejected": -285.968253968254, "loss": 0.495, "rewards/chosen": 0.017142427884615386, "rewards/margins": 0.03941735347985348, "rewards/rejected": -0.022274925595238096, "step": 36 }, { "epoch": 0.025368529310935892, "grad_norm": 0.8712246490447852, "kl": 0.1171875, "learning_rate": 1e-06, "logits/chosen": -78959437.20634921, "logits/rejected": -87241523.2, "logps/chosen": -285.968253968254, "logps/rejected": -326.4, "loss": 0.4945, "rewards/chosen": 0.002139136904761905, "rewards/margins": -1.9394060854510073, "rewards/rejected": 1.9415452223557692, "step": 37 }, { "epoch": 0.026054165238258483, "grad_norm": 0.68426585908767, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80290962.28571428, "logits/rejected": -92274688.0, "logps/chosen": -234.85714285714286, "logps/rejected": -287.55555555555554, "loss": 0.4957, "rewards/chosen": -0.009591238839285714, "rewards/margins": -0.6125226853385805, "rewards/rejected": 0.6029314464992948, "step": 38 }, { "epoch": 0.026739801165581077, "grad_norm": 0.7896158382140843, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85644981.67741935, "logits/rejected": -91257887.03030303, "logps/chosen": -282.83870967741933, "logps/rejected": -305.93939393939394, "loss": 0.4909, "rewards/chosen": 0.016948084677419355, "rewards/margins": 0.07341115285923754, "rewards/rejected": -0.056463068181818184, "step": 39 }, { "epoch": 0.027425437092903668, "grad_norm": 0.6803686013900151, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98216618.66666667, "logits/rejected": -81480523.29411764, "logps/chosen": -294.4, "logps/rejected": -292.47058823529414, "loss": 0.4956, "rewards/chosen": -0.008131917317708333, "rewards/margins": -0.41430509043674846, "rewards/rejected": 0.4061731731190401, "step": 40 }, { "epoch": 0.02811107302022626, "grad_norm": 0.8271810356294474, "kl": 0.013671875, "learning_rate": 1e-06, "logits/chosen": -68735964.68965517, "logits/rejected": -99105411.65714286, "logps/chosen": -291.58620689655174, "logps/rejected": -329.14285714285717, "loss": 0.4915, "rewards/chosen": 0.003232758620689655, "rewards/margins": 0.06244034790640394, "rewards/rejected": -0.059207589285714284, "step": 41 }, { "epoch": 0.028796708947548853, "grad_norm": 0.7110093291143293, "kl": 0.02734375, "learning_rate": 1e-06, "logits/chosen": -92405760.0, "logits/rejected": -85065728.0, "logps/chosen": -329.25, "logps/rejected": -297.0, "loss": 0.4948, "rewards/chosen": 0.007080078125, "rewards/margins": 0.044139862060546875, "rewards/rejected": -0.037059783935546875, "step": 42 }, { "epoch": 0.029482344874871443, "grad_norm": 0.7741545664038055, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86535114.10526316, "logits/rejected": -89084653.97183098, "logps/chosen": -198.73684210526315, "logps/rejected": -317.7464788732394, "loss": 0.4967, "rewards/chosen": -0.01278954221491228, "rewards/margins": 0.021954075742834197, "rewards/rejected": -0.03474361795774648, "step": 43 }, { "epoch": 0.030167980802194034, "grad_norm": 0.6507786343768094, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101392740.17391305, "logits/rejected": -86018776.94915254, "logps/chosen": -265.9710144927536, "logps/rejected": -293.96610169491527, "loss": 0.4979, "rewards/chosen": -0.01005134029664855, "rewards/margins": 0.02235367244831967, "rewards/rejected": -0.03240501274496822, "step": 44 }, { "epoch": 0.030853616729516628, "grad_norm": 0.655903284054315, "kl": 0.0517578125, "learning_rate": 1e-06, "logits/chosen": -79818876.12121212, "logits/rejected": -94304189.93548387, "logps/chosen": -246.54545454545453, "logps/rejected": -318.4516129032258, "loss": 0.4917, "rewards/chosen": 0.004668264678030303, "rewards/margins": -0.28356650754270435, "rewards/rejected": 0.2882347722207346, "step": 45 }, { "epoch": 0.03153925265683922, "grad_norm": 0.6256539536196621, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90780826.30136986, "logits/rejected": -99443134.83636364, "logps/chosen": -289.972602739726, "logps/rejected": -283.2, "loss": 0.499, "rewards/chosen": -0.012916042380136987, "rewards/margins": 0.01068125875622665, "rewards/rejected": -0.023597301136363637, "step": 46 }, { "epoch": 0.03222488858416181, "grad_norm": 0.7075479128558411, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101425896.72727273, "logits/rejected": -103639898.83870968, "logps/chosen": -275.8787878787879, "logps/rejected": -276.38709677419354, "loss": 0.4952, "rewards/chosen": 1.8093401590983074, "rewards/margins": 1.8575382437757266, "rewards/rejected": -0.04819808467741935, "step": 47 }, { "epoch": 0.0329105245114844, "grad_norm": 0.7592879567668842, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97178797.29230769, "logits/rejected": -67774626.53968254, "logps/chosen": -289.4769230769231, "logps/rejected": -283.1746031746032, "loss": 0.492, "rewards/chosen": 0.006242487980769231, "rewards/margins": 0.06558028559981685, "rewards/rejected": -0.059337797619047616, "step": 48 }, { "epoch": 0.03359616043880699, "grad_norm": 0.6924695620843894, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81714029.71428572, "logits/rejected": -76779064.8888889, "logps/chosen": -291.14285714285717, "logps/rejected": -273.77777777777777, "loss": 0.4935, "rewards/chosen": 0.0025384085518973215, "rewards/margins": 0.047080509246341766, "rewards/rejected": -0.04454210069444445, "step": 49 }, { "epoch": 0.03428179636612959, "grad_norm": 0.7369293411017099, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81924228.12903225, "logits/rejected": -101552996.84848484, "logps/chosen": -277.93548387096774, "logps/rejected": -310.3030303030303, "loss": 0.4898, "rewards/chosen": 0.012931577620967742, "rewards/margins": 0.08075828216642228, "rewards/rejected": -0.06782670454545454, "step": 50 }, { "epoch": 0.03496743229345218, "grad_norm": 0.829418572138442, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98689505.88235295, "logits/rejected": -82796650.3896104, "logps/chosen": -306.8235294117647, "logps/rejected": -310.02597402597405, "loss": 0.4887, "rewards/chosen": 2.475302453134574, "rewards/margins": 2.548050018069639, "rewards/rejected": -0.07274756493506493, "step": 51 }, { "epoch": 0.03565306822077477, "grad_norm": 0.6144951904088404, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92031539.94202898, "logits/rejected": -97677520.27118644, "logps/chosen": -274.7826086956522, "logps/rejected": -251.11864406779662, "loss": 0.4948, "rewards/chosen": -0.00271385303442029, "rewards/margins": 0.04505494993168141, "rewards/rejected": -0.0477688029661017, "step": 52 }, { "epoch": 0.03633870414809736, "grad_norm": 0.7118957098343458, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101745697.03225806, "logits/rejected": -97358692.84848484, "logps/chosen": -294.7096774193548, "logps/rejected": -283.6363636363636, "loss": 0.4901, "rewards/chosen": 2.378050281155494, "rewards/margins": 2.4404910955494334, "rewards/rejected": -0.06244081439393939, "step": 53 }, { "epoch": 0.03702434007541995, "grad_norm": 0.7097286211935591, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96101070.59649123, "logits/rejected": -75851920.22535211, "logps/chosen": -319.1578947368421, "logps/rejected": -271.5492957746479, "loss": 0.4931, "rewards/chosen": 0.0007881030701754386, "rewards/margins": 0.0508816329997529, "rewards/rejected": -0.05009352992957746, "step": 54 }, { "epoch": 0.03770997600274254, "grad_norm": 0.6866081490800326, "kl": 0.001953125, "learning_rate": 1e-06, "logits/chosen": -92016576.98461539, "logits/rejected": -104524718.73015873, "logps/chosen": -247.63076923076923, "logps/rejected": -284.95238095238096, "loss": 0.4966, "rewards/chosen": 1.5991685133713942, "rewards/margins": 1.644400408212664, "rewards/rejected": -0.04523189484126984, "step": 55 }, { "epoch": 0.03839561193006513, "grad_norm": 0.7808303592198413, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84394480.48484848, "logits/rejected": -95318940.90322581, "logps/chosen": -247.27272727272728, "logps/rejected": -321.03225806451616, "loss": 0.4954, "rewards/chosen": 1.5948023940577651, "rewards/margins": 1.6358967740980876, "rewards/rejected": -0.04109438004032258, "step": 56 }, { "epoch": 0.03908124785738773, "grad_norm": 0.9512200221860261, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103381826.37037037, "logits/rejected": -111319095.35135135, "logps/chosen": -286.51851851851853, "logps/rejected": -359.7837837837838, "loss": 0.4883, "rewards/chosen": -0.014359085648148149, "rewards/margins": 0.07949648867617617, "rewards/rejected": -0.09385557432432433, "step": 57 }, { "epoch": 0.03976688378471032, "grad_norm": 0.6576891340554938, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95242964.67692308, "logits/rejected": -86549130.15873016, "logps/chosen": -304.24615384615385, "logps/rejected": -256.76190476190476, "loss": 0.4892, "rewards/chosen": 0.023016826923076925, "rewards/margins": 0.09078716422466422, "rewards/rejected": -0.0677703373015873, "step": 58 }, { "epoch": 0.04045251971203291, "grad_norm": 0.6722996609681352, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85388517.25373134, "logits/rejected": -92824760.6557377, "logps/chosen": -242.62686567164178, "logps/rejected": -300.0655737704918, "loss": 0.4895, "rewards/chosen": -0.011900944496268656, "rewards/margins": 0.09094229320864937, "rewards/rejected": -0.10284323770491803, "step": 59 }, { "epoch": 0.0411381556393555, "grad_norm": 0.6849206837221353, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84531357.53846154, "logits/rejected": -88546417.77777778, "logps/chosen": -255.5076923076923, "logps/rejected": -265.14285714285717, "loss": 0.487, "rewards/chosen": 0.014933894230769231, "rewards/margins": 0.10700978708791209, "rewards/rejected": -0.09207589285714286, "step": 60 }, { "epoch": 0.04182379156667809, "grad_norm": 0.6961219836731656, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91947008.0, "logits/rejected": -103022592.0, "logps/chosen": -297.5, "logps/rejected": -315.5, "loss": 0.4869, "rewards/chosen": 1.71661376953125, "rewards/margins": 1.78179931640625, "rewards/rejected": -0.065185546875, "step": 61 }, { "epoch": 0.04250942749400068, "grad_norm": 0.6861706547355378, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109295052.05797102, "logits/rejected": -80473764.88135593, "logps/chosen": -340.8695652173913, "logps/rejected": -244.88135593220338, "loss": 0.487, "rewards/chosen": 0.010289288949275362, "rewards/margins": 0.1141690558984279, "rewards/rejected": -0.10387976694915255, "step": 62 }, { "epoch": 0.04319506342132328, "grad_norm": 0.6883426763475975, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80871424.0, "logits/rejected": -120324096.0, "logps/chosen": -260.75, "logps/rejected": -259.0, "loss": 0.4875, "rewards/chosen": 2.19260835647583, "rewards/margins": 2.29447603225708, "rewards/rejected": -0.10186767578125, "step": 63 }, { "epoch": 0.04388069934864587, "grad_norm": 0.5807225436007579, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100788499.10447761, "logits/rejected": -78729148.85245901, "logps/chosen": -242.86567164179104, "logps/rejected": -274.62295081967216, "loss": 0.4946, "rewards/chosen": -0.019359987173507464, "rewards/margins": 0.047302410367476136, "rewards/rejected": -0.0666623975409836, "step": 64 }, { "epoch": 0.04456633527596846, "grad_norm": 0.7883265400654669, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88080384.0, "logits/rejected": -94671433.14285715, "logps/chosen": -261.4153846153846, "logps/rejected": -374.3492063492063, "loss": 0.4815, "rewards/chosen": 0.0032076322115384614, "rewards/margins": 0.14947499332264957, "rewards/rejected": -0.1462673611111111, "step": 65 }, { "epoch": 0.04525197120329105, "grad_norm": 0.7984409446848395, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103639898.83870968, "logits/rejected": -123922618.18181819, "logps/chosen": -333.93548387096774, "logps/rejected": -347.1515151515151, "loss": 0.4839, "rewards/chosen": 2.233031242124496, "rewards/margins": 2.350100370912375, "rewards/rejected": -0.11706912878787878, "step": 66 }, { "epoch": 0.04593760713061364, "grad_norm": 0.6934945098993982, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105775104.0, "logits/rejected": -75366400.0, "logps/chosen": -335.75, "logps/rejected": -254.375, "loss": 0.4886, "rewards/chosen": -0.0078125, "rewards/margins": 0.08935546875, "rewards/rejected": -0.09716796875, "step": 67 }, { "epoch": 0.046623243057936234, "grad_norm": 0.868229751136968, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94648822.33962265, "logits/rejected": -93505017.17333333, "logps/chosen": -230.33962264150944, "logps/rejected": -356.6933333333333, "loss": 0.4821, "rewards/chosen": -0.0029688421285377358, "rewards/margins": 0.12328115787146227, "rewards/rejected": -0.12625, "step": 68 }, { "epoch": 0.047308878985258825, "grad_norm": 0.7508341892100799, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93418589.0909091, "logits/rejected": -90380486.19354838, "logps/chosen": -298.6666666666667, "logps/rejected": -363.61290322580646, "loss": 0.488, "rewards/chosen": -0.010135535037878788, "rewards/margins": 0.10415377947825025, "rewards/rejected": -0.11428931451612903, "step": 69 }, { "epoch": 0.04799451491258142, "grad_norm": 0.7693043400301872, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90850207.39622642, "logits/rejected": -96972308.48, "logps/chosen": -259.0188679245283, "logps/rejected": -276.05333333333334, "loss": 0.4854, "rewards/chosen": -0.00908848024764151, "rewards/margins": 0.09872401975235849, "rewards/rejected": -0.1078125, "step": 70 }, { "epoch": 0.04868015083990401, "grad_norm": 0.7486392872452337, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91518338.09836066, "logits/rejected": -99912077.37313433, "logps/chosen": -332.0655737704918, "logps/rejected": -278.44776119402985, "loss": 0.4825, "rewards/chosen": -0.0023293417008196722, "rewards/margins": -0.08744632594008478, "rewards/rejected": 0.0851169842392651, "step": 71 }, { "epoch": 0.049365786767226603, "grad_norm": 0.8167345067840552, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74908797.75438596, "logits/rejected": -98064009.0140845, "logps/chosen": -285.1929824561403, "logps/rejected": -345.6901408450704, "loss": 0.4882, "rewards/chosen": 2.234217727393435, "rewards/margins": 2.30315478725259, "rewards/rejected": -0.06893705985915492, "step": 72 }, { "epoch": 0.050051422694549194, "grad_norm": 0.7112042436722547, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87724934.50847457, "logits/rejected": -96408204.98550725, "logps/chosen": -226.4406779661017, "logps/rejected": -273.6231884057971, "loss": 0.4813, "rewards/chosen": 2.50881544210143, "rewards/margins": 2.408838087028966, "rewards/rejected": 0.09997735507246377, "step": 73 }, { "epoch": 0.050737058621871785, "grad_norm": 0.8547708985055993, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96019602.28571428, "logits/rejected": -122683392.0, "logps/chosen": -300.85714285714283, "logps/rejected": -356.0, "loss": 0.4792, "rewards/chosen": 0.03602818080357143, "rewards/margins": 0.1597260974702381, "rewards/rejected": -0.12369791666666667, "step": 74 }, { "epoch": 0.051422694549194375, "grad_norm": 0.7735020960948638, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -118372579.55555555, "logits/rejected": -95372020.18461539, "logps/chosen": -245.33333333333334, "logps/rejected": -321.4769230769231, "loss": 0.4811, "rewards/chosen": -0.010371132502480158, "rewards/margins": 0.14852309826675061, "rewards/rejected": -0.15889423076923076, "step": 75 }, { "epoch": 0.052108330476516966, "grad_norm": 0.7219654717942161, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96613623.1724138, "logits/rejected": -83047219.2, "logps/chosen": -295.17241379310343, "logps/rejected": -285.25714285714287, "loss": 0.4843, "rewards/chosen": -0.019480738146551723, "rewards/margins": 0.11377819042487684, "rewards/rejected": -0.13325892857142857, "step": 76 }, { "epoch": 0.052793966403839564, "grad_norm": 0.8174473804846929, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91279429.42372881, "logits/rejected": -89235337.27536231, "logps/chosen": -272.9491525423729, "logps/rejected": -323.71014492753625, "loss": 0.4735, "rewards/chosen": 2.1473856780488614, "rewards/margins": 2.338961765005383, "rewards/rejected": -0.19157608695652173, "step": 77 }, { "epoch": 0.053479602331162154, "grad_norm": 0.8281849638848513, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103165161.54385965, "logits/rejected": -81050494.1971831, "logps/chosen": -264.140350877193, "logps/rejected": -311.4366197183099, "loss": 0.4789, "rewards/chosen": -0.013894599780701754, "rewards/margins": -1.4632540100263554, "rewards/rejected": 1.4493594102456535, "step": 78 }, { "epoch": 0.054165238258484745, "grad_norm": 0.7229515266679017, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78426827.17460318, "logits/rejected": -82789108.18461539, "logps/chosen": -194.03174603174602, "logps/rejected": -310.15384615384613, "loss": 0.4767, "rewards/chosen": -0.0021081349206349205, "rewards/margins": 0.18659378815628816, "rewards/rejected": -0.18870192307692307, "step": 79 }, { "epoch": 0.054850874185807336, "grad_norm": 0.7825669131738466, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88429909.33333333, "logits/rejected": -89353654.85714285, "logps/chosen": -279.1111111111111, "logps/rejected": -387.7142857142857, "loss": 0.478, "rewards/chosen": -0.008192274305555556, "rewards/margins": 0.20581442212301587, "rewards/rejected": -0.21400669642857142, "step": 80 }, { "epoch": 0.055536510113129926, "grad_norm": 0.7868371812097547, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84986225.31147541, "logits/rejected": -96531593.5522388, "logps/chosen": -263.8688524590164, "logps/rejected": -327.64179104477614, "loss": 0.4793, "rewards/chosen": 1.9469250538310066, "rewards/margins": 2.1337254269653347, "rewards/rejected": -0.18680037313432835, "step": 81 }, { "epoch": 0.05622214604045252, "grad_norm": 0.8501417103884645, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92655988.36363636, "logits/rejected": -110810805.67741935, "logps/chosen": -299.8787878787879, "logps/rejected": -333.93548387096774, "loss": 0.4757, "rewards/chosen": -0.008594859730113636, "rewards/margins": 0.20486280156020895, "rewards/rejected": -0.2134576612903226, "step": 82 }, { "epoch": 0.056907781967775115, "grad_norm": 0.6203138371018657, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86758890.9589041, "logits/rejected": -143979017.3090909, "logps/chosen": -191.56164383561645, "logps/rejected": -292.94545454545454, "loss": 0.486, "rewards/chosen": -0.005424604023972603, "rewards/margins": 0.13534244143057286, "rewards/rejected": -0.14076704545454546, "step": 83 }, { "epoch": 0.057593417895097705, "grad_norm": 0.6315844362762683, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94584594.55072464, "logits/rejected": -76421640.6779661, "logps/chosen": -294.2608695652174, "logps/rejected": -293.1525423728813, "loss": 0.4815, "rewards/chosen": 0.003369317538496377, "rewards/margins": 0.11578986838595401, "rewards/rejected": -0.11242055084745763, "step": 84 }, { "epoch": 0.058279053822420296, "grad_norm": 0.7300509252048198, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83177183.54929577, "logits/rejected": -133481885.19298245, "logps/chosen": -218.81690140845072, "logps/rejected": -353.12280701754383, "loss": 0.4738, "rewards/chosen": -0.004112566021126761, "rewards/margins": -1.3146800537712637, "rewards/rejected": 1.310567487750137, "step": 85 }, { "epoch": 0.058964689749742887, "grad_norm": 0.7640087386168839, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95354880.0, "logits/rejected": -90832896.0, "logps/chosen": -326.5, "logps/rejected": -314.0, "loss": 0.4713, "rewards/chosen": 0.011796951293945312, "rewards/margins": 0.2361621856689453, "rewards/rejected": -0.224365234375, "step": 86 }, { "epoch": 0.05965032567706548, "grad_norm": 0.8796805344690053, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85758537.14285715, "logits/rejected": -129790407.1111111, "logps/chosen": -250.42857142857142, "logps/rejected": -386.44444444444446, "loss": 0.4744, "rewards/chosen": -0.025844029017857144, "rewards/margins": 0.17771499875992064, "rewards/rejected": -0.2035590277777778, "step": 87 }, { "epoch": 0.06033596160438807, "grad_norm": 0.6617770346805462, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84853996.3076923, "logits/rejected": -87680926.47619048, "logps/chosen": -258.46153846153845, "logps/rejected": -248.88888888888889, "loss": 0.4785, "rewards/chosen": -0.024489182692307692, "rewards/margins": 0.1806199442918193, "rewards/rejected": -0.20510912698412698, "step": 88 }, { "epoch": 0.06102159753171066, "grad_norm": 0.9391220609389903, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101886634.66666667, "logits/rejected": -85668659.2, "logps/chosen": -308.3333333333333, "logps/rejected": -273.6, "loss": 0.472, "rewards/chosen": 2.5936320622762046, "rewards/margins": 2.7928508122762046, "rewards/rejected": -0.19921875, "step": 89 }, { "epoch": 0.061707233459033256, "grad_norm": 0.843449176225446, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85758537.14285715, "logits/rejected": -91575637.33333333, "logps/chosen": -250.0, "logps/rejected": -353.3333333333333, "loss": 0.4738, "rewards/chosen": -0.004560198102678571, "rewards/margins": 0.18738858661954363, "rewards/rejected": -0.1919487847222222, "step": 90 }, { "epoch": 0.06239286938635585, "grad_norm": 0.7338258548241371, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101044596.36363636, "logits/rejected": -91598187.35483871, "logps/chosen": -269.8181818181818, "logps/rejected": -334.96774193548384, "loss": 0.4684, "rewards/chosen": 1.3831009142326587, "rewards/margins": 1.6467097852004007, "rewards/rejected": -0.26360887096774194, "step": 91 }, { "epoch": 0.06307850531367844, "grad_norm": 0.6889364141029541, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91742077.96825397, "logits/rejected": -76917082.58461538, "logps/chosen": -252.95238095238096, "logps/rejected": -363.81538461538463, "loss": 0.4723, "rewards/chosen": -0.026142423115079364, "rewards/margins": 0.24453065380799754, "rewards/rejected": -0.2706730769230769, "step": 92 }, { "epoch": 0.06376414124100103, "grad_norm": 0.7676380769874468, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100510775.85454546, "logits/rejected": -99858908.93150684, "logps/chosen": -333.6727272727273, "logps/rejected": -284.93150684931504, "loss": 0.4693, "rewards/chosen": -0.020001775568181817, "rewards/margins": 0.21736979977428394, "rewards/rejected": -0.23737157534246575, "step": 93 }, { "epoch": 0.06444977716832362, "grad_norm": 0.7535927036602672, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101763441.31147541, "logits/rejected": -98409640.11940299, "logps/chosen": -281.44262295081967, "logps/rejected": -328.1194029850746, "loss": 0.4734, "rewards/chosen": -0.006783907530737705, "rewards/margins": 0.2059026596334414, "rewards/rejected": -0.2126865671641791, "step": 94 }, { "epoch": 0.06513541309564622, "grad_norm": 0.7178241892952157, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90022191.4074074, "logits/rejected": -74533915.67567568, "logps/chosen": -241.62962962962962, "logps/rejected": -276.5405405405405, "loss": 0.4691, "rewards/chosen": -0.021430121527777776, "rewards/margins": -0.12023508512937987, "rewards/rejected": 0.0988049636016021, "step": 95 }, { "epoch": 0.0658210490229688, "grad_norm": 0.9898317648606647, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82393192.13559322, "logits/rejected": -89235337.27536231, "logps/chosen": -185.22033898305085, "logps/rejected": -257.6231884057971, "loss": 0.4708, "rewards/chosen": -0.024149231991525424, "rewards/margins": 0.22064243467514122, "rewards/rejected": -0.24479166666666666, "step": 96 }, { "epoch": 0.0665066849502914, "grad_norm": 0.6044923494947586, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98962902.48648648, "logits/rejected": -75031438.22222222, "logps/chosen": -236.75675675675674, "logps/rejected": -279.1111111111111, "loss": 0.4772, "rewards/chosen": -0.0076211465371621625, "rewards/margins": 0.22501774235172672, "rewards/rejected": -0.2326388888888889, "step": 97 }, { "epoch": 0.06719232087761398, "grad_norm": 0.7852528226610802, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97431619.14754099, "logits/rejected": -101351913.07462686, "logps/chosen": -292.1967213114754, "logps/rejected": -335.04477611940297, "loss": 0.4687, "rewards/chosen": -0.001328765368852459, "rewards/margins": 9640639.043447355, "rewards/rejected": -9640639.04477612, "step": 98 }, { "epoch": 0.06787795680493658, "grad_norm": 0.6875483497730634, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84401169.96491228, "logits/rejected": -97355112.56338029, "logps/chosen": -240.8421052631579, "logps/rejected": -255.09859154929578, "loss": 0.4704, "rewards/chosen": -0.026153029057017545, "rewards/margins": 0.2152642244641092, "rewards/rejected": -0.24141725352112675, "step": 99 }, { "epoch": 0.06856359273225918, "grad_norm": 0.6758294596205584, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89923335.75757575, "logits/rejected": -88689234.58064516, "logps/chosen": -265.45454545454544, "logps/rejected": -287.741935483871, "loss": 0.4748, "rewards/chosen": -0.017504142992424244, "rewards/margins": 0.21787892152370478, "rewards/rejected": -0.23538306451612903, "step": 100 }, { "epoch": 0.06924922865958176, "grad_norm": 0.7230697189267881, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -110083290.22950819, "logits/rejected": -98785249.43283582, "logps/chosen": -277.24590163934425, "logps/rejected": -307.1044776119403, "loss": 0.4702, "rewards/chosen": -0.037621670081967214, "rewards/margins": 0.24176265827624177, "rewards/rejected": -0.279384328358209, "step": 101 }, { "epoch": 0.06993486458690436, "grad_norm": 0.679737661174997, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87411726.8405797, "logits/rejected": -98601688.94915254, "logps/chosen": -318.3768115942029, "logps/rejected": -332.7457627118644, "loss": 0.4704, "rewards/chosen": -0.013459578804347826, "rewards/margins": 0.2717628788227708, "rewards/rejected": -0.2852224576271186, "step": 102 }, { "epoch": 0.07062050051422694, "grad_norm": 0.7018938363609475, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99228402.5263158, "logits/rejected": -85095975.38461539, "logps/chosen": -334.7368421052632, "logps/rejected": -307.6923076923077, "loss": 0.4744, "rewards/chosen": -0.02012232730263158, "rewards/margins": 0.2710435380819838, "rewards/rejected": -0.29116586538461536, "step": 103 }, { "epoch": 0.07130613644154954, "grad_norm": 0.6686052341750127, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103606667.22807017, "logits/rejected": -71716690.92957747, "logps/chosen": -249.68421052631578, "logps/rejected": -251.71830985915494, "loss": 0.4664, "rewards/chosen": 2.1000746342173793, "rewards/margins": 2.3628387187244213, "rewards/rejected": -0.26276408450704225, "step": 104 }, { "epoch": 0.07199177236887212, "grad_norm": 0.8052434714752893, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93787387.80327868, "logits/rejected": -117941324.41791044, "logps/chosen": -300.0655737704918, "logps/rejected": -358.6865671641791, "loss": 0.4593, "rewards/chosen": -0.012150998975409836, "rewards/margins": 0.33183220997981405, "rewards/rejected": -0.3439832089552239, "step": 105 }, { "epoch": 0.07267740829619472, "grad_norm": 0.808484816040175, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82099617.18518518, "logits/rejected": -114493163.24324325, "logps/chosen": -273.6296296296296, "logps/rejected": -360.64864864864865, "loss": 0.4533, "rewards/chosen": -0.011501736111111112, "rewards/margins": 0.12300291720095341, "rewards/rejected": -0.13450465331206451, "step": 106 }, { "epoch": 0.07336304422351732, "grad_norm": 0.6828788021194093, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80466109.04615384, "logits/rejected": -94538280.63492064, "logps/chosen": -297.3538461538462, "logps/rejected": -308.3174603174603, "loss": 0.4661, "rewards/chosen": 0.015354567307692307, "rewards/margins": 0.2928843292124542, "rewards/rejected": -0.2775297619047619, "step": 107 }, { "epoch": 0.0740486801508399, "grad_norm": 0.7803470502514472, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100313770.66666667, "logits/rejected": -108620137.41176471, "logps/chosen": -275.2, "logps/rejected": -306.3529411764706, "loss": 0.4528, "rewards/chosen": 2.122554143269857, "rewards/margins": 2.173335393269857, "rewards/rejected": -0.05078125, "step": 108 }, { "epoch": 0.0747343160781625, "grad_norm": 0.6787856531004154, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -108534798.02739726, "logits/rejected": -91893387.63636364, "logps/chosen": -305.09589041095893, "logps/rejected": -353.74545454545455, "loss": 0.4613, "rewards/chosen": -0.0021136558219178084, "rewards/margins": 0.3910681623599004, "rewards/rejected": -0.3931818181818182, "step": 109 }, { "epoch": 0.07541995200548508, "grad_norm": 0.7758775950467238, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85816791.36507936, "logits/rejected": -94984853.66153847, "logps/chosen": -238.47619047619048, "logps/rejected": -328.8615384615385, "loss": 0.4513, "rewards/chosen": 0.0038093687996031745, "rewards/margins": 0.21775167649191085, "rewards/rejected": -0.21394230769230768, "step": 110 }, { "epoch": 0.07610558793280768, "grad_norm": 0.8352510560819688, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97178797.29230769, "logits/rejected": -92407840.50793651, "logps/chosen": -308.18461538461537, "logps/rejected": -333.2063492063492, "loss": 0.4606, "rewards/chosen": -0.03060772235576923, "rewards/margins": 0.34092005542200854, "rewards/rejected": -0.3715277777777778, "step": 111 }, { "epoch": 0.07679122386013026, "grad_norm": 0.7408030396332568, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89888273.65517241, "logits/rejected": -91555664.45714286, "logps/chosen": -284.13793103448273, "logps/rejected": -293.25714285714287, "loss": 0.4547, "rewards/chosen": 2.4140111988988417, "rewards/margins": 2.7747254846131275, "rewards/rejected": -0.3607142857142857, "step": 112 }, { "epoch": 0.07747685978745286, "grad_norm": 0.7171819538565236, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100303784.22857143, "logits/rejected": -118597561.37931034, "logps/chosen": -324.34285714285716, "logps/rejected": -284.9655172413793, "loss": 0.463, "rewards/chosen": 1.7037269592285156, "rewards/margins": 2.0490933385388606, "rewards/rejected": -0.3453663793103448, "step": 113 }, { "epoch": 0.07816249571477546, "grad_norm": 0.7747366781412802, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85923313.37142856, "logits/rejected": -107967170.20689656, "logps/chosen": -297.6, "logps/rejected": -332.6896551724138, "loss": 0.4488, "rewards/chosen": -0.006919642857142857, "rewards/margins": 0.49254156403940885, "rewards/rejected": -0.4994612068965517, "step": 114 }, { "epoch": 0.07884813164209804, "grad_norm": 0.7846969553453681, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96833714.08695652, "logits/rejected": -67464313.49152543, "logps/chosen": -307.94202898550725, "logps/rejected": -340.8813559322034, "loss": 0.4554, "rewards/chosen": -0.0026466259057971015, "rewards/margins": 0.40625167917894867, "rewards/rejected": -0.4088983050847458, "step": 115 }, { "epoch": 0.07953376756942064, "grad_norm": 0.7092236791564432, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95386590.96774194, "logits/rejected": -89351385.21212122, "logps/chosen": -272.258064516129, "logps/rejected": -281.93939393939394, "loss": 0.4572, "rewards/chosen": -0.012742565524193549, "rewards/margins": 0.35965326780913975, "rewards/rejected": -0.3723958333333333, "step": 116 }, { "epoch": 0.08021940349674322, "grad_norm": 0.6938599783898868, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92339215.75384615, "logits/rejected": -79891504.76190476, "logps/chosen": -311.38461538461536, "logps/rejected": -313.3968253968254, "loss": 0.457, "rewards/chosen": 2.2597369854266827, "rewards/margins": 0.9070735516856732, "rewards/rejected": 1.3526634337410095, "step": 117 }, { "epoch": 0.08090503942406582, "grad_norm": 0.8495026705363521, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -108845626.75409836, "logits/rejected": -127832369.6716418, "logps/chosen": -260.4590163934426, "logps/rejected": -375.8805970149254, "loss": 0.4387, "rewards/chosen": 1.8694640612993083, "rewards/margins": 2.3909192851799053, "rewards/rejected": -0.5214552238805971, "step": 118 }, { "epoch": 0.0815906753513884, "grad_norm": 0.6049462113232846, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89964781.44927536, "logits/rejected": -90852890.03389831, "logps/chosen": -272.69565217391306, "logps/rejected": -257.35593220338984, "loss": 0.4627, "rewards/chosen": -0.007080078125, "rewards/margins": 0.3409072100105932, "rewards/rejected": -0.3479872881355932, "step": 119 }, { "epoch": 0.082276311278711, "grad_norm": 0.749257413690131, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100792351.5076923, "logits/rejected": -107454073.90476191, "logps/chosen": -283.5692307692308, "logps/rejected": -357.58730158730157, "loss": 0.4453, "rewards/chosen": -0.02931189903846154, "rewards/margins": 0.4825928628663003, "rewards/rejected": -0.5119047619047619, "step": 120 }, { "epoch": 0.0829619472060336, "grad_norm": 0.7048140960507032, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83743900.20338982, "logits/rejected": -99082833.6231884, "logps/chosen": -236.47457627118644, "logps/rejected": -297.27536231884056, "loss": 0.4456, "rewards/chosen": 0.011731163930084746, "rewards/margins": 0.43202101900254847, "rewards/rejected": -0.42028985507246375, "step": 121 }, { "epoch": 0.08364758313335618, "grad_norm": 0.8372568561701347, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103934853.12, "logits/rejected": -82703071.17948718, "logps/chosen": -322.56, "logps/rejected": -301.94871794871796, "loss": 0.4302, "rewards/chosen": 3.2497940063476562, "rewards/margins": 3.7333677242963743, "rewards/rejected": -0.48357371794871795, "step": 122 }, { "epoch": 0.08433321906067878, "grad_norm": 0.5893490919337979, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100439599.78666666, "logits/rejected": -76605401.35849057, "logps/chosen": -301.2266666666667, "logps/rejected": -262.33962264150944, "loss": 0.4587, "rewards/chosen": -0.005364583333333333, "rewards/margins": 0.4409797562893082, "rewards/rejected": -0.44634433962264153, "step": 123 }, { "epoch": 0.08501885498800137, "grad_norm": 0.6722612427460741, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85638923.46268657, "logits/rejected": -80998198.55737706, "logps/chosen": -246.6865671641791, "logps/rejected": -258.3606557377049, "loss": 0.4561, "rewards/chosen": 0.008687033582089552, "rewards/margins": 0.29454768931979447, "rewards/rejected": -0.2858606557377049, "step": 124 }, { "epoch": 0.08570449091532396, "grad_norm": 0.8348794749381252, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95971362.71186441, "logits/rejected": -109659774.14492753, "logps/chosen": -338.9830508474576, "logps/rejected": -328.3478260869565, "loss": 0.4388, "rewards/chosen": -0.012678760593220338, "rewards/margins": 0.4913973263633014, "rewards/rejected": -0.5040760869565217, "step": 125 }, { "epoch": 0.08639012684264656, "grad_norm": 0.7416835262928678, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99795508.96551724, "logits/rejected": -86522499.65714286, "logps/chosen": -344.55172413793105, "logps/rejected": -282.0571428571429, "loss": 0.4359, "rewards/chosen": -0.0012796336206896551, "rewards/margins": 0.5156846520935962, "rewards/rejected": -0.5169642857142858, "step": 126 }, { "epoch": 0.08707576276996914, "grad_norm": 0.7299928943299268, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100921407.01538461, "logits/rejected": -83486622.47619048, "logps/chosen": -221.53846153846155, "logps/rejected": -340.8253968253968, "loss": 0.4436, "rewards/chosen": -0.026908052884615386, "rewards/margins": 0.4973975026709402, "rewards/rejected": -0.5243055555555556, "step": 127 }, { "epoch": 0.08776139869729174, "grad_norm": 0.8287867167556742, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92540993.01587301, "logits/rejected": -87886800.73846154, "logps/chosen": -305.5238095238095, "logps/rejected": -365.04615384615386, "loss": 0.4323, "rewards/chosen": 0.027227492559523808, "rewards/margins": 14050540.334919801, "rewards/rejected": -14050540.307692308, "step": 128 }, { "epoch": 0.08844703462461433, "grad_norm": 0.6239240823955862, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99138094.54545455, "logits/rejected": -108393973.96078432, "logps/chosen": -252.67532467532467, "logps/rejected": -271.37254901960785, "loss": 0.4595, "rewards/chosen": -0.006227171266233766, "rewards/margins": 0.46068459343964857, "rewards/rejected": -0.46691176470588236, "step": 129 }, { "epoch": 0.08913267055193692, "grad_norm": 0.7580046756939915, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97842988.13793103, "logits/rejected": -84245591.77142857, "logps/chosen": -366.8965517241379, "logps/rejected": -265.14285714285717, "loss": 0.4443, "rewards/chosen": -0.04061153017241379, "rewards/margins": 0.4388527555418719, "rewards/rejected": -0.47946428571428573, "step": 130 }, { "epoch": 0.08981830647925951, "grad_norm": 0.8138210931639612, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86022068.14814815, "logits/rejected": -89327339.24324325, "logps/chosen": -236.2962962962963, "logps/rejected": -334.27027027027026, "loss": 0.4259, "rewards/chosen": 2.068253693757234, "rewards/margins": 2.6535577478112877, "rewards/rejected": -0.5853040540540541, "step": 131 }, { "epoch": 0.0905039424065821, "grad_norm": 0.673979841052907, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99284346.73972602, "logits/rejected": -124456438.69090909, "logps/chosen": -309.47945205479454, "logps/rejected": -352.8727272727273, "loss": 0.4421, "rewards/chosen": 1.3154121294413528, "rewards/margins": 1.9989348567140799, "rewards/rejected": -0.6835227272727272, "step": 132 }, { "epoch": 0.0911895783339047, "grad_norm": 0.7553099683123975, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83198489.18032786, "logits/rejected": -82508845.85074627, "logps/chosen": -307.40983606557376, "logps/rejected": -333.85074626865674, "loss": 0.4235, "rewards/chosen": -0.034643954918032786, "rewards/margins": 0.6705799256789822, "rewards/rejected": -0.7052238805970149, "step": 133 }, { "epoch": 0.09187521426122729, "grad_norm": 0.6770952778823458, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80281600.0, "logits/rejected": -96600064.0, "logps/chosen": -284.5, "logps/rejected": -292.25, "loss": 0.4452, "rewards/chosen": 2.059494733810425, "rewards/margins": 2.579514265060425, "rewards/rejected": -0.52001953125, "step": 134 }, { "epoch": 0.09256085018854988, "grad_norm": 0.6398512543828453, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101635888.23188406, "logits/rejected": -85734417.3559322, "logps/chosen": -204.9855072463768, "logps/rejected": -284.7457627118644, "loss": 0.4366, "rewards/chosen": -0.020875792572463768, "rewards/margins": 0.6168360718343159, "rewards/rejected": -0.6377118644067796, "step": 135 }, { "epoch": 0.09324648611587247, "grad_norm": 0.7106552839270407, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104502150.50847457, "logits/rejected": -103459498.66666667, "logps/chosen": -320.8135593220339, "logps/rejected": -345.9710144927536, "loss": 0.4268, "rewards/chosen": -0.022676112288135594, "rewards/margins": 0.6304035978567919, "rewards/rejected": -0.6530797101449275, "step": 136 }, { "epoch": 0.09393212204319507, "grad_norm": 0.6990283317631791, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89128960.0, "logits/rejected": -111149056.0, "logps/chosen": -283.25, "logps/rejected": -317.5, "loss": 0.4458, "rewards/chosen": -0.089599609375, "rewards/margins": 0.506591796875, "rewards/rejected": -0.59619140625, "step": 137 }, { "epoch": 0.09461775797051765, "grad_norm": 0.6712098582844891, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88286661.24590164, "logits/rejected": -88205587.10447761, "logps/chosen": -337.8360655737705, "logps/rejected": -276.7761194029851, "loss": 0.4419, "rewards/chosen": -0.012615266393442622, "rewards/margins": 0.4897168231587962, "rewards/rejected": -0.5023320895522388, "step": 138 }, { "epoch": 0.09530339389784025, "grad_norm": 0.6017411278797948, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91692145.77777778, "logits/rejected": -93997348.57142857, "logps/chosen": -266.6666666666667, "logps/rejected": -286.42857142857144, "loss": 0.4469, "rewards/chosen": -0.009141710069444444, "rewards/margins": 0.5611707899305556, "rewards/rejected": -0.5703125, "step": 139 }, { "epoch": 0.09598902982516284, "grad_norm": 0.8223308361453823, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97867093.33333333, "logits/rejected": -79691776.0, "logps/chosen": -377.02564102564105, "logps/rejected": -241.92, "loss": 0.4543, "rewards/chosen": 1.0848954029572315, "rewards/margins": 1.5886454029572317, "rewards/rejected": -0.50375, "step": 140 }, { "epoch": 0.09667466575248543, "grad_norm": 0.6915845980055697, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88933462.77966101, "logits/rejected": -99933851.82608695, "logps/chosen": -252.20338983050848, "logps/rejected": -316.7536231884058, "loss": 0.4232, "rewards/chosen": -0.03231759798728814, "rewards/margins": 0.6923200831721321, "rewards/rejected": -0.7246376811594203, "step": 141 }, { "epoch": 0.09736030167980803, "grad_norm": 0.717138087624617, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91056986.83870968, "logits/rejected": -96341891.87878788, "logps/chosen": -261.4193548387097, "logps/rejected": -365.09090909090907, "loss": 0.4164, "rewards/chosen": -0.01688508064516129, "rewards/margins": 0.14646632510546018, "rewards/rejected": -0.16335140575062146, "step": 142 }, { "epoch": 0.09804593760713061, "grad_norm": 0.6878121856635392, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90897453.85074627, "logits/rejected": -118815693.63934426, "logps/chosen": -304.95522388059703, "logps/rejected": -354.88524590163934, "loss": 0.4199, "rewards/chosen": 0.010902518656716417, "rewards/margins": 0.7988123547222902, "rewards/rejected": -0.7879098360655737, "step": 143 }, { "epoch": 0.09873157353445321, "grad_norm": 0.7143154763553867, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91639187.39393939, "logits/rejected": -123258417.5483871, "logps/chosen": -309.8181818181818, "logps/rejected": -345.2903225806452, "loss": 0.4318, "rewards/chosen": 1.7621920036547112, "rewards/margins": 1.8075732112746552, "rewards/rejected": -0.04538120761994393, "step": 144 }, { "epoch": 0.09941720946177579, "grad_norm": 0.6721064102907255, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -121421546.30508475, "logits/rejected": -72701269.33333333, "logps/chosen": -343.59322033898303, "logps/rejected": -247.18840579710144, "loss": 0.43, "rewards/chosen": -0.02549407441737288, "rewards/margins": 0.5881834618145112, "rewards/rejected": -0.613677536231884, "step": 145 }, { "epoch": 0.10010284538909839, "grad_norm": 0.6313987866759586, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91855257.6, "logits/rejected": -91551532.13793103, "logps/chosen": -274.9714285714286, "logps/rejected": -356.6896551724138, "loss": 0.4434, "rewards/chosen": -0.04575892857142857, "rewards/margins": 0.3542937010967086, "rewards/rejected": -0.40005262966813715, "step": 146 }, { "epoch": 0.10078848131642099, "grad_norm": 0.6909169265011506, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96468992.0, "logits/rejected": -99690703.76811594, "logps/chosen": -262.50847457627117, "logps/rejected": -287.536231884058, "loss": 0.4238, "rewards/chosen": 0.009086996822033898, "rewards/margins": 0.6449565620394252, "rewards/rejected": -0.6358695652173914, "step": 147 }, { "epoch": 0.10147411724374357, "grad_norm": 0.797071277955522, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -112629398.58823529, "logits/rejected": -90247441.06666666, "logps/chosen": -311.7647058823529, "logps/rejected": -339.46666666666664, "loss": 0.4367, "rewards/chosen": -0.06186810661764706, "rewards/margins": -1.6384561650893266, "rewards/rejected": 1.5765880584716796, "step": 148 }, { "epoch": 0.10215975317106617, "grad_norm": 0.7066453512211442, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90207073.35211268, "logits/rejected": -91244508.07017544, "logps/chosen": -290.7042253521127, "logps/rejected": -292.2105263157895, "loss": 0.4455, "rewards/chosen": -0.02189700704225352, "rewards/margins": 0.5422477297998518, "rewards/rejected": -0.5641447368421053, "step": 149 }, { "epoch": 0.10284538909838875, "grad_norm": 0.6647222641534964, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95331553.62711865, "logits/rejected": -98596537.50724638, "logps/chosen": -328.9491525423729, "logps/rejected": -313.9710144927536, "loss": 0.4285, "rewards/chosen": 2.486233468783104, "rewards/margins": 3.1103276716816546, "rewards/rejected": -0.6240942028985508, "step": 150 }, { "epoch": 0.10353102502571135, "grad_norm": 0.641087593075081, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99975705.18032786, "logits/rejected": -90960055.40298508, "logps/chosen": -280.39344262295083, "logps/rejected": -261.97014925373134, "loss": 0.4311, "rewards/chosen": 2.3559705390304817, "rewards/margins": 2.9166048673886906, "rewards/rejected": -0.5606343283582089, "step": 151 }, { "epoch": 0.10421666095303393, "grad_norm": 0.6122011578698335, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74871456.47761194, "logits/rejected": -83129730.09836066, "logps/chosen": -234.7462686567164, "logps/rejected": -296.1311475409836, "loss": 0.4301, "rewards/chosen": -0.021114520172574626, "rewards/margins": 0.3036805617946385, "rewards/rejected": -0.32479508196721313, "step": 152 }, { "epoch": 0.10490229688035653, "grad_norm": 0.6854479301793017, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86706387.86206897, "logits/rejected": -79991369.14285715, "logps/chosen": -324.41379310344826, "logps/rejected": -312.22857142857146, "loss": 0.4086, "rewards/chosen": -0.0075599407327586205, "rewards/margins": 0.7906543449815271, "rewards/rejected": -0.7982142857142858, "step": 153 }, { "epoch": 0.10558793280767913, "grad_norm": 0.6440165790980693, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91742077.96825397, "logits/rejected": -88273967.26153846, "logps/chosen": -269.2063492063492, "logps/rejected": -340.67692307692306, "loss": 0.4102, "rewards/chosen": -0.014787946428571428, "rewards/margins": 0.8804043612637363, "rewards/rejected": -0.8951923076923077, "step": 154 }, { "epoch": 0.10627356873500171, "grad_norm": 0.6499339749053659, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89097184.96969697, "logits/rejected": -113110907.87096775, "logps/chosen": -323.8787878787879, "logps/rejected": -360.0, "loss": 0.4218, "rewards/chosen": -0.02071496212121212, "rewards/margins": 0.7867447152981427, "rewards/rejected": -0.8074596774193549, "step": 155 }, { "epoch": 0.10695920466232431, "grad_norm": 0.6438727828700436, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85138111.04477613, "logits/rejected": -82235862.03278689, "logps/chosen": -286.56716417910445, "logps/rejected": -331.0163934426229, "loss": 0.4289, "rewards/chosen": 0.014050839552238806, "rewards/margins": 0.7117967411915831, "rewards/rejected": -0.6977459016393442, "step": 156 }, { "epoch": 0.10764484058964689, "grad_norm": 0.6516100516124279, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90898432.0, "logits/rejected": -110100480.0, "logps/chosen": -271.25, "logps/rejected": -371.75, "loss": 0.4226, "rewards/chosen": -0.06512451171875, "rewards/margins": 0.77081298828125, "rewards/rejected": -0.8359375, "step": 157 }, { "epoch": 0.10833047651696949, "grad_norm": 0.6131170279018961, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100064109.71428572, "logits/rejected": -94010262.06896552, "logps/chosen": -268.57142857142856, "logps/rejected": -334.3448275862069, "loss": 0.433, "rewards/chosen": -0.06886858258928572, "rewards/margins": 0.7382434863762315, "rewards/rejected": -0.8071120689655172, "step": 158 }, { "epoch": 0.10901611244429209, "grad_norm": 0.6169418315112161, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91375908.57142857, "logits/rejected": -88080384.0, "logps/chosen": -223.71428571428572, "logps/rejected": -278.6666666666667, "loss": 0.4099, "rewards/chosen": -0.005944388253348214, "rewards/margins": 0.8169722784133184, "rewards/rejected": -0.8229166666666666, "step": 159 }, { "epoch": 0.10970174837161467, "grad_norm": 0.6654526029068637, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94458024.32876712, "logits/rejected": -133607647.41818182, "logps/chosen": -333.5890410958904, "logps/rejected": -357.8181818181818, "loss": 0.4263, "rewards/chosen": 1.297780598679634, "rewards/margins": 2.155166962315998, "rewards/rejected": -0.8573863636363637, "step": 160 }, { "epoch": 0.11038738429893727, "grad_norm": 0.7022432975758548, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92986928.3018868, "logits/rejected": -96972308.48, "logps/chosen": -221.28301886792454, "logps/rejected": -321.28, "loss": 0.431, "rewards/chosen": -0.0582252358490566, "rewards/margins": 0.5276080974842767, "rewards/rejected": -0.5858333333333333, "step": 161 }, { "epoch": 0.11107302022625985, "grad_norm": 0.6477687342025285, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88493906.92957747, "logits/rejected": -89846406.73684211, "logps/chosen": -313.6901408450704, "logps/rejected": -315.2280701754386, "loss": 0.4228, "rewards/chosen": 0.02365757042253521, "rewards/margins": 0.8334163423523598, "rewards/rejected": -0.8097587719298246, "step": 162 }, { "epoch": 0.11175865615358245, "grad_norm": 1.289896127967303, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92838101.97014925, "logits/rejected": -92549724.32786885, "logps/chosen": -253.13432835820896, "logps/rejected": -308.9836065573771, "loss": 0.4155, "rewards/chosen": -0.040345149253731345, "rewards/margins": 0.8879335392708588, "rewards/rejected": -0.9282786885245902, "step": 163 }, { "epoch": 0.11244429208090503, "grad_norm": 0.5010718571109158, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85776470.53521127, "logits/rejected": -100663296.0, "logps/chosen": -226.25352112676057, "logps/rejected": -232.42105263157896, "loss": 0.4475, "rewards/chosen": -0.04918573943661972, "rewards/margins": 0.5697835588089943, "rewards/rejected": -0.6189692982456141, "step": 164 }, { "epoch": 0.11312992800822763, "grad_norm": 0.6153902463210866, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90921686.70967741, "logits/rejected": -89351385.21212122, "logps/chosen": -284.38709677419354, "logps/rejected": -302.06060606060606, "loss": 0.4018, "rewards/chosen": 0.05004095262096774, "rewards/margins": 0.9345106495906648, "rewards/rejected": -0.884469696969697, "step": 165 }, { "epoch": 0.11381556393555023, "grad_norm": 0.6271743331102504, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97949334.58823529, "logits/rejected": -99544814.93333334, "logps/chosen": -350.11764705882354, "logps/rejected": -315.2, "loss": 0.4062, "rewards/chosen": 0.040268841911764705, "rewards/margins": 0.535322546491436, "rewards/rejected": -0.49505370457967124, "step": 166 }, { "epoch": 0.11450119986287281, "grad_norm": 0.5315014431522762, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83510470.68656716, "logits/rejected": -86361406.95081967, "logps/chosen": -288.23880597014926, "logps/rejected": -293.5081967213115, "loss": 0.4377, "rewards/chosen": -0.0887305416277985, "rewards/margins": 0.674589130503349, "rewards/rejected": -0.7633196721311475, "step": 167 }, { "epoch": 0.11518683579019541, "grad_norm": 0.6673222619370237, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101915327.04477613, "logits/rejected": -86086370.62295082, "logps/chosen": -350.089552238806, "logps/rejected": -305.57377049180326, "loss": 0.4126, "rewards/chosen": -0.026469216417910446, "rewards/margins": 1.0052930786640566, "rewards/rejected": -1.0317622950819672, "step": 168 }, { "epoch": 0.115872471717518, "grad_norm": 0.6526912683473082, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97181232.3018868, "logits/rejected": -108716359.68, "logps/chosen": -233.66037735849056, "logps/rejected": -313.38666666666666, "loss": 0.3929, "rewards/chosen": -0.04641435731132076, "rewards/margins": 0.8919189760220126, "rewards/rejected": -0.9383333333333334, "step": 169 }, { "epoch": 0.11655810764484059, "grad_norm": 0.6314119914307964, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95226235.25925925, "logits/rejected": -96582351.56756757, "logps/chosen": -286.51851851851853, "logps/rejected": -364.97297297297297, "loss": 0.3971, "rewards/chosen": 4.298032972547743, "rewards/margins": 5.30310054011531, "rewards/rejected": -1.0050675675675675, "step": 170 }, { "epoch": 0.11724374357216318, "grad_norm": 0.859261228451207, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101012821.33333333, "logits/rejected": -86807113.14285715, "logps/chosen": -331.1111111111111, "logps/rejected": -301.7142857142857, "loss": 0.4261, "rewards/chosen": -0.017903645833333332, "rewards/margins": 0.7153552827380952, "rewards/rejected": -0.7332589285714286, "step": 171 }, { "epoch": 0.11792937949948577, "grad_norm": 0.6105893339884036, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93902328.35820895, "logits/rejected": -72884626.8852459, "logps/chosen": -256.95522388059703, "logps/rejected": -291.9344262295082, "loss": 0.4039, "rewards/chosen": 0.01355527052238806, "rewards/margins": 1.001260188555175, "rewards/rejected": -0.9877049180327869, "step": 172 }, { "epoch": 0.11861501542680837, "grad_norm": 0.6396768585477335, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92629136.22535211, "logits/rejected": -93304867.92982456, "logps/chosen": -312.11267605633805, "logps/rejected": -309.3333333333333, "loss": 0.4285, "rewards/chosen": -0.02955133142605634, "rewards/margins": 0.8531241071704349, "rewards/rejected": -0.8826754385964912, "step": 173 }, { "epoch": 0.11930065135413095, "grad_norm": 0.6890904121416818, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91534516.70588236, "logits/rejected": -85847053.2987013, "logps/chosen": -306.5098039215686, "logps/rejected": -316.2597402597403, "loss": 0.3878, "rewards/chosen": 2.5300041647518383, "rewards/margins": 3.440718450466124, "rewards/rejected": -0.9107142857142857, "step": 174 }, { "epoch": 0.11998628728145355, "grad_norm": 0.5911381509971806, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87889733.81818181, "logits/rejected": -82938979.09677419, "logps/chosen": -243.3939393939394, "logps/rejected": -295.48387096774195, "loss": 0.4079, "rewards/chosen": -0.014393199573863636, "rewards/margins": 0.9644374455874267, "rewards/rejected": -0.9788306451612904, "step": 175 }, { "epoch": 0.12067192320877614, "grad_norm": 0.6548593971390578, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77526973.93548387, "logits/rejected": -93164388.84848484, "logps/chosen": -240.0, "logps/rejected": -336.0, "loss": 0.4092, "rewards/chosen": 0.013215095766129033, "rewards/margins": 0.8560181260691593, "rewards/rejected": -0.8428030303030303, "step": 176 }, { "epoch": 0.12135755913609873, "grad_norm": 0.6029449057013725, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91371299.44615385, "logits/rejected": -87348045.20634921, "logps/chosen": -297.10769230769233, "logps/rejected": -297.6507936507937, "loss": 0.4213, "rewards/chosen": -0.051382211538461536, "rewards/margins": 0.8275860424297924, "rewards/rejected": -0.878968253968254, "step": 177 }, { "epoch": 0.12204319506342132, "grad_norm": 0.6166944537584013, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89772822.45614035, "logits/rejected": -90384297.46478873, "logps/chosen": -269.3333333333333, "logps/rejected": -320.90140845070425, "loss": 0.3842, "rewards/chosen": 2.2754018683182564, "rewards/margins": 3.1107891922619184, "rewards/rejected": -0.835387323943662, "step": 178 }, { "epoch": 0.12272883099074391, "grad_norm": 0.5845103682851918, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86997982.96774194, "logits/rejected": -88588784.48484848, "logps/chosen": -278.4516129032258, "logps/rejected": -320.969696969697, "loss": 0.4068, "rewards/chosen": 1.6538521551316785, "rewards/margins": 2.5506324581619815, "rewards/rejected": -0.896780303030303, "step": 179 }, { "epoch": 0.12341446691806651, "grad_norm": 0.6423614683711513, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90726021.90769231, "logits/rejected": -88413265.26984127, "logps/chosen": -290.46153846153845, "logps/rejected": -352.5079365079365, "loss": 0.394, "rewards/chosen": 0.037259615384615384, "rewards/margins": 1.0952953296703296, "rewards/rejected": -1.0580357142857142, "step": 180 }, { "epoch": 0.1241001028453891, "grad_norm": 0.5776024718782571, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98146713.6, "logits/rejected": -94733417.93103448, "logps/chosen": -304.9142857142857, "logps/rejected": -338.48275862068965, "loss": 0.4032, "rewards/chosen": 1.6884078979492188, "rewards/margins": 2.630218242776805, "rewards/rejected": -0.9418103448275862, "step": 181 }, { "epoch": 0.1247857387727117, "grad_norm": 0.6348478623013413, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82442139.27868852, "logits/rejected": -120069777.19402985, "logps/chosen": -269.37704918032784, "logps/rejected": -354.3880597014925, "loss": 0.3995, "rewards/chosen": -0.02856045081967213, "rewards/margins": 1.0096858178370443, "rewards/rejected": -1.0382462686567164, "step": 182 }, { "epoch": 0.12547137470003428, "grad_norm": 0.5351344797982769, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105923948.47457626, "logits/rejected": -96286630.95652173, "logps/chosen": -308.06779661016947, "logps/rejected": -205.68115942028984, "loss": 0.4278, "rewards/chosen": -0.045633606991525424, "rewards/margins": 0.6380167553273152, "rewards/rejected": -0.6836503623188406, "step": 183 }, { "epoch": 0.12615701062735687, "grad_norm": 0.5796480881405678, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93291488.96969697, "logits/rejected": -69019978.32258065, "logps/chosen": -276.6060606060606, "logps/rejected": -322.83870967741933, "loss": 0.4086, "rewards/chosen": -0.021247632575757576, "rewards/margins": 0.6957394235411464, "rewards/rejected": -0.7169870561169039, "step": 184 }, { "epoch": 0.12684264655467947, "grad_norm": 0.6191367000156907, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -108441823.41818182, "logits/rejected": -92964162.63013698, "logps/chosen": -294.8363636363636, "logps/rejected": -334.027397260274, "loss": 0.3824, "rewards/chosen": -0.0011363636363636363, "rewards/margins": 1.1598225404732254, "rewards/rejected": -1.1609589041095891, "step": 185 }, { "epoch": 0.12752828248200207, "grad_norm": 0.8286995443027816, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95886449.77777778, "logits/rejected": -81189741.71428572, "logps/chosen": -316.0, "logps/rejected": -306.85714285714283, "loss": 0.4271, "rewards/chosen": -0.061360677083333336, "rewards/margins": 0.8906482514880952, "rewards/rejected": -0.9520089285714286, "step": 186 }, { "epoch": 0.12821391840932464, "grad_norm": 0.5729412284594303, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -114988457.35384615, "logits/rejected": -85217605.07936507, "logps/chosen": -358.15384615384613, "logps/rejected": -292.8253968253968, "loss": 0.4156, "rewards/chosen": -0.0734375, "rewards/margins": 0.9255704365079365, "rewards/rejected": -0.9990079365079365, "step": 187 }, { "epoch": 0.12889955433664724, "grad_norm": 0.5456239153645529, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77007421.44, "logits/rejected": -97181232.3018868, "logps/chosen": -238.72, "logps/rejected": -348.07547169811323, "loss": 0.4175, "rewards/chosen": -0.03763020833333333, "rewards/margins": 1.0791150746855347, "rewards/rejected": -1.116745283018868, "step": 188 }, { "epoch": 0.12958519026396983, "grad_norm": 0.7491333915270592, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87599070.4262295, "logits/rejected": -89895829.01492538, "logps/chosen": -232.39344262295083, "logps/rejected": -370.14925373134326, "loss": 0.3815, "rewards/chosen": -0.0005122950819672131, "rewards/margins": 1.299860839246391, "rewards/rejected": -1.3003731343283582, "step": 189 }, { "epoch": 0.13027082619129243, "grad_norm": 0.5723314564487028, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90402230.85714285, "logits/rejected": -105090616.8888889, "logps/chosen": -284.57142857142856, "logps/rejected": -377.3333333333333, "loss": 0.3687, "rewards/chosen": -0.010646275111607142, "rewards/margins": 1.3322356693328372, "rewards/rejected": -1.3428819444444444, "step": 190 }, { "epoch": 0.130956462118615, "grad_norm": 0.5970252052182746, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82970959.12727273, "logits/rejected": -93826005.91780822, "logps/chosen": -187.05454545454546, "logps/rejected": -329.4246575342466, "loss": 0.3882, "rewards/chosen": -0.0011452414772727272, "rewards/margins": 0.7616972242761519, "rewards/rejected": -0.7628424657534246, "step": 191 }, { "epoch": 0.1316420980459376, "grad_norm": 0.6582740627881332, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98910452.53731343, "logits/rejected": -91105783.60655738, "logps/chosen": -343.4029850746269, "logps/rejected": -305.04918032786884, "loss": 0.4034, "rewards/chosen": -0.05685925839552239, "rewards/margins": 1.0630177907848055, "rewards/rejected": -1.1198770491803278, "step": 192 }, { "epoch": 0.1323277339732602, "grad_norm": 0.5617809731712698, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87353371.30666667, "logits/rejected": -106836045.28301887, "logps/chosen": -312.32, "logps/rejected": -315.92452830188677, "loss": 0.4258, "rewards/chosen": -0.12791666666666668, "rewards/margins": 1.1542177672955976, "rewards/rejected": -1.2821344339622642, "step": 193 }, { "epoch": 0.1330133699005828, "grad_norm": 0.5901384109474745, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77491485.37704918, "logits/rejected": -80192588.41791044, "logps/chosen": -248.65573770491804, "logps/rejected": -312.5970149253731, "loss": 0.3775, "rewards/chosen": 0.022797131147540985, "rewards/margins": 1.2457448923415708, "rewards/rejected": -1.2229477611940298, "step": 194 }, { "epoch": 0.1336990058279054, "grad_norm": 0.5632292474041872, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101900959.47540984, "logits/rejected": -90647047.64179105, "logps/chosen": -355.1475409836066, "logps/rejected": -292.2985074626866, "loss": 0.397, "rewards/chosen": -0.09310162653688525, "rewards/margins": 1.055219268985503, "rewards/rejected": -1.148320895522388, "step": 195 }, { "epoch": 0.13438464175522796, "grad_norm": 0.6013344618191889, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104158549.33333333, "logits/rejected": -84452877.83783785, "logps/chosen": -263.7037037037037, "logps/rejected": -292.3243243243243, "loss": 0.3766, "rewards/chosen": 2.0597527115433305, "rewards/margins": 3.1535027115433305, "rewards/rejected": -1.09375, "step": 196 }, { "epoch": 0.13507027768255056, "grad_norm": 0.5301538299734007, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97447662.93333334, "logits/rejected": -76330164.70588236, "logps/chosen": -278.93333333333334, "logps/rejected": -318.8235294117647, "loss": 0.3867, "rewards/chosen": 0.0013671875, "rewards/margins": 1.215521599264706, "rewards/rejected": -1.2141544117647058, "step": 197 }, { "epoch": 0.13575591360987316, "grad_norm": 0.5612714832061899, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72383519.03030303, "logits/rejected": -85509681.5483871, "logps/chosen": -223.03030303030303, "logps/rejected": -342.19354838709677, "loss": 0.382, "rewards/chosen": -0.029000946969696968, "rewards/margins": 1.3339022788367545, "rewards/rejected": -1.3629032258064515, "step": 198 }, { "epoch": 0.13644154953719576, "grad_norm": 0.5485234673372712, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97790485.0410959, "logits/rejected": -97307852.8, "logps/chosen": -302.6849315068493, "logps/rejected": -328.43636363636364, "loss": 0.4053, "rewards/chosen": 0.034514126712328765, "rewards/margins": 1.1208777630759652, "rewards/rejected": -1.0863636363636364, "step": 199 }, { "epoch": 0.13712718546451835, "grad_norm": 0.5974938967390607, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98823688.98245615, "logits/rejected": -80459747.15492958, "logps/chosen": -254.31578947368422, "logps/rejected": -290.0281690140845, "loss": 0.4005, "rewards/chosen": -0.09974643640350878, "rewards/margins": 0.8571197607795898, "rewards/rejected": -0.9568661971830986, "step": 200 }, { "epoch": 0.13781282139184092, "grad_norm": 0.5945652910411876, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83615479.74193548, "logits/rejected": -90876586.66666667, "logps/chosen": -280.7741935483871, "logps/rejected": -305.93939393939394, "loss": 0.3924, "rewards/chosen": 0.032132056451612906, "rewards/margins": 1.1675487231182797, "rewards/rejected": -1.1354166666666667, "step": 201 }, { "epoch": 0.13849845731916352, "grad_norm": 0.5667407613052631, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94988649.41176471, "logits/rejected": -96818517.33333333, "logps/chosen": -255.52941176470588, "logps/rejected": -340.8, "loss": 0.4096, "rewards/chosen": -0.00457763671875, "rewards/margins": 1.0318806966145833, "rewards/rejected": -1.0364583333333333, "step": 202 }, { "epoch": 0.13918409324648612, "grad_norm": 0.5780995442617441, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97336779.03448276, "logits/rejected": -92274688.0, "logps/chosen": -309.51724137931035, "logps/rejected": -293.9428571428571, "loss": 0.3874, "rewards/chosen": 2.196149497196592, "rewards/margins": 3.3782923543394494, "rewards/rejected": -1.1821428571428572, "step": 203 }, { "epoch": 0.13986972917380872, "grad_norm": 0.5282921435237685, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100910019.76470588, "logits/rejected": -112127726.93333334, "logps/chosen": -308.2352941176471, "logps/rejected": -336.53333333333336, "loss": 0.403, "rewards/chosen": -0.020626292509191176, "rewards/margins": 1.1981237074908089, "rewards/rejected": -1.21875, "step": 204 }, { "epoch": 0.14055536510113129, "grad_norm": 0.6194901584669563, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93902328.35820895, "logits/rejected": -133392619.01639344, "logps/chosen": -300.65671641791045, "logps/rejected": -419.672131147541, "loss": 0.3685, "rewards/chosen": 0.04110307835820896, "rewards/margins": 1.5595457013090286, "rewards/rejected": -1.5184426229508197, "step": 205 }, { "epoch": 0.14124100102845388, "grad_norm": 0.6513205245615452, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98090789.54666667, "logits/rejected": -108972766.18867925, "logps/chosen": -324.48, "logps/rejected": -370.41509433962267, "loss": 0.3966, "rewards/chosen": -0.05022135416666667, "rewards/margins": 1.4851560043238994, "rewards/rejected": -1.5353773584905661, "step": 206 }, { "epoch": 0.14192663695577648, "grad_norm": 0.5293028832620854, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80774177.03225806, "logits/rejected": -115025609.6969697, "logps/chosen": -256.51612903225805, "logps/rejected": -342.7878787878788, "loss": 0.3946, "rewards/chosen": -0.04914314516129032, "rewards/margins": 1.156349279081134, "rewards/rejected": -1.2054924242424243, "step": 207 }, { "epoch": 0.14261227288309908, "grad_norm": 0.5986161375371988, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96409917.29577465, "logits/rejected": -100516127.43859649, "logps/chosen": -286.19718309859155, "logps/rejected": -366.5964912280702, "loss": 0.3752, "rewards/chosen": 0.18100792253521128, "rewards/margins": 1.4814465190264394, "rewards/rejected": -1.3004385964912282, "step": 208 }, { "epoch": 0.14329790881042168, "grad_norm": 0.5367273066498671, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105398800.51612903, "logits/rejected": -88143934.06060606, "logps/chosen": -276.9032258064516, "logps/rejected": -275.6363636363636, "loss": 0.3947, "rewards/chosen": -0.02428805443548387, "rewards/margins": 0.86965133950391, "rewards/rejected": -0.8939393939393939, "step": 209 }, { "epoch": 0.14398354473774425, "grad_norm": 0.578590514527458, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104033718.85714285, "logits/rejected": -87497841.77777778, "logps/chosen": -342.0, "logps/rejected": -300.44444444444446, "loss": 0.3633, "rewards/chosen": 0.14191545758928573, "rewards/margins": 1.4196932353670635, "rewards/rejected": -1.2777777777777777, "step": 210 }, { "epoch": 0.14466918066506684, "grad_norm": 0.5216369955529584, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83504779.63636364, "logits/rejected": -90515786.32258065, "logps/chosen": -262.06060606060606, "logps/rejected": -291.0967741935484, "loss": 0.4013, "rewards/chosen": 1.8286216042258523, "rewards/margins": 2.4256468294652676, "rewards/rejected": -0.5970252252394154, "step": 211 }, { "epoch": 0.14535481659238944, "grad_norm": 0.5174610998602692, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92399891.10447761, "logits/rejected": -83748561.83606558, "logps/chosen": -344.35820895522386, "logps/rejected": -332.59016393442624, "loss": 0.3784, "rewards/chosen": 0.01482334421641791, "rewards/margins": 1.5640036720852704, "rewards/rejected": -1.5491803278688525, "step": 212 }, { "epoch": 0.14604045251971204, "grad_norm": 0.5730943894438295, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79616877.71428572, "logits/rejected": -97051534.22222222, "logps/chosen": -267.7142857142857, "logps/rejected": -401.3333333333333, "loss": 0.3685, "rewards/chosen": 2.2263381140572682, "rewards/margins": 1.8773580581422835, "rewards/rejected": 0.3489800559149848, "step": 213 }, { "epoch": 0.14672608844703464, "grad_norm": 0.5385892730737903, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100899594.81690142, "logits/rejected": -83214623.43859649, "logps/chosen": -285.2957746478873, "logps/rejected": -327.0175438596491, "loss": 0.4115, "rewards/chosen": -0.09410761443661972, "rewards/margins": 1.1208046662651345, "rewards/rejected": -1.2149122807017543, "step": 214 }, { "epoch": 0.1474117243743572, "grad_norm": 0.4787004500525132, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105940001.03225806, "logits/rejected": -99265194.66666667, "logps/chosen": -252.38709677419354, "logps/rejected": -301.8181818181818, "loss": 0.392, "rewards/chosen": -0.0871975806451613, "rewards/margins": 1.4042796920821115, "rewards/rejected": -1.4914772727272727, "step": 215 }, { "epoch": 0.1480973603016798, "grad_norm": 0.5391200578247876, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80623843.55555555, "logits/rejected": -84983051.81538461, "logps/chosen": -309.8412698412698, "logps/rejected": -294.6461538461538, "loss": 0.4005, "rewards/chosen": 2.1402166457403276, "rewards/margins": 3.5084858765095586, "rewards/rejected": -1.3682692307692308, "step": 216 }, { "epoch": 0.1487829962290024, "grad_norm": 0.6231792331427327, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85171431.22580644, "logits/rejected": -87571983.51515152, "logps/chosen": -288.51612903225805, "logps/rejected": -356.3636363636364, "loss": 0.3763, "rewards/chosen": 0.0655241935483871, "rewards/margins": 1.4746151026392962, "rewards/rejected": -1.4090909090909092, "step": 217 }, { "epoch": 0.149468632156325, "grad_norm": 0.531340337960459, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94001754.35294117, "logits/rejected": -86961902.93333334, "logps/chosen": -260.70588235294116, "logps/rejected": -299.46666666666664, "loss": 0.4038, "rewards/chosen": 0.015869140625, "rewards/margins": 1.2669108072916666, "rewards/rejected": -1.2510416666666666, "step": 218 }, { "epoch": 0.1501542680836476, "grad_norm": 0.534546937055213, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90029849.23943663, "logits/rejected": -66888111.15789474, "logps/chosen": -290.92957746478874, "logps/rejected": -205.19298245614036, "loss": 0.4211, "rewards/chosen": -0.04162081866197183, "rewards/margins": 0.9397388304608352, "rewards/rejected": -0.981359649122807, "step": 219 }, { "epoch": 0.15083990401097017, "grad_norm": 0.5823042950144128, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83886080.0, "logits/rejected": -88970084.84848484, "logps/chosen": -203.61290322580646, "logps/rejected": -382.06060606060606, "loss": 0.3859, "rewards/chosen": -0.10216103830645161, "rewards/margins": 1.3770056283602152, "rewards/rejected": -1.4791666666666667, "step": 220 }, { "epoch": 0.15152553993829276, "grad_norm": 0.5556089789971745, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104857600.0, "logits/rejected": -87286867.02702703, "logps/chosen": -398.81481481481484, "logps/rejected": -301.8378378378378, "loss": 0.3653, "rewards/chosen": 2.706934611002604, "rewards/margins": 4.036326502894496, "rewards/rejected": -1.3293918918918919, "step": 221 }, { "epoch": 0.15221117586561536, "grad_norm": 0.5166988716906602, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98705954.13333334, "logits/rejected": -85058017.88235295, "logps/chosen": -390.6666666666667, "logps/rejected": -317.1764705882353, "loss": 0.3817, "rewards/chosen": -0.0205078125, "rewards/margins": 1.4188304227941178, "rewards/rejected": -1.4393382352941178, "step": 222 }, { "epoch": 0.15289681179293796, "grad_norm": 0.5350381714838675, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100293210.35294117, "logits/rejected": -82767598.93333334, "logps/chosen": -277.6470588235294, "logps/rejected": -330.6666666666667, "loss": 0.4006, "rewards/chosen": -0.04928768382352941, "rewards/margins": 1.2600873161764705, "rewards/rejected": -1.309375, "step": 223 }, { "epoch": 0.15358244772026053, "grad_norm": 0.5011838091719313, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88360004.26666667, "logits/rejected": -87710298.35294117, "logps/chosen": -254.4, "logps/rejected": -276.94117647058823, "loss": 0.3972, "rewards/chosen": 1.7825674692789713, "rewards/margins": 2.9388174692789715, "rewards/rejected": -1.15625, "step": 224 }, { "epoch": 0.15426808364758313, "grad_norm": 0.5319324687315473, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97953130.33846153, "logits/rejected": -74898285.71428572, "logps/chosen": -322.46153846153845, "logps/rejected": -280.3809523809524, "loss": 0.3812, "rewards/chosen": 0.09026442307692308, "rewards/margins": 1.4632802960927962, "rewards/rejected": -1.373015873015873, "step": 225 }, { "epoch": 0.15495371957490572, "grad_norm": 0.5264299045290647, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84803584.0, "logits/rejected": -88342528.0, "logps/chosen": -279.5, "logps/rejected": -259.25, "loss": 0.399, "rewards/chosen": -0.04376220703125, "rewards/margins": 1.17596435546875, "rewards/rejected": -1.2197265625, "step": 226 }, { "epoch": 0.15563935550222832, "grad_norm": 0.5028445531811759, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89212180.31746031, "logits/rejected": -95372020.18461539, "logps/chosen": -219.68253968253967, "logps/rejected": -341.16923076923075, "loss": 0.3878, "rewards/chosen": 1.7163182818700398, "rewards/margins": 3.164395204946963, "rewards/rejected": -1.448076923076923, "step": 227 }, { "epoch": 0.15632499142955092, "grad_norm": 0.5486353569559097, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78951604.70588236, "logits/rejected": -109960669.86666666, "logps/chosen": -295.7647058823529, "logps/rejected": -386.1333333333333, "loss": 0.3879, "rewards/chosen": -0.09731158088235294, "rewards/margins": 1.6422717524509802, "rewards/rejected": -1.7395833333333333, "step": 228 }, { "epoch": 0.1570106273568735, "grad_norm": 0.5177341540317759, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84848707.14754099, "logits/rejected": -90772250.74626866, "logps/chosen": -294.0327868852459, "logps/rejected": -290.3880597014925, "loss": 0.3767, "rewards/chosen": -0.05321465163934426, "rewards/margins": 1.4262629603009542, "rewards/rejected": -1.4794776119402986, "step": 229 }, { "epoch": 0.1576962632841961, "grad_norm": 0.4665580897305315, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86560708.63768116, "logits/rejected": -79691776.0, "logps/chosen": -238.14492753623188, "logps/rejected": -289.08474576271186, "loss": 0.4039, "rewards/chosen": -0.026721014492753624, "rewards/margins": 0.539477690113642, "rewards/rejected": -0.5661987046063957, "step": 230 }, { "epoch": 0.15838189921151868, "grad_norm": 0.5007812836346517, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92386536.10666667, "logits/rejected": -80404016.3018868, "logps/chosen": -280.74666666666667, "logps/rejected": -335.0943396226415, "loss": 0.3926, "rewards/chosen": -0.0077083333333333335, "rewards/margins": 1.0220637953656273, "rewards/rejected": -1.0297721286989607, "step": 231 }, { "epoch": 0.15906753513884128, "grad_norm": 0.5190349067440994, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82221673.65079366, "logits/rejected": -92016576.98461539, "logps/chosen": -277.58730158730157, "logps/rejected": -343.6307692307692, "loss": 0.3709, "rewards/chosen": -0.004588293650793651, "rewards/margins": 1.061999066437848, "rewards/rejected": -1.0665873600886417, "step": 232 }, { "epoch": 0.15975317106616388, "grad_norm": 0.5037530900128407, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -110916039.1111111, "logits/rejected": -110729625.6, "logps/chosen": -232.12698412698413, "logps/rejected": -306.7076923076923, "loss": 0.3746, "rewards/chosen": -0.06721230158730158, "rewards/margins": 1.5481723137973138, "rewards/rejected": -1.6153846153846154, "step": 233 }, { "epoch": 0.16043880699348645, "grad_norm": 0.5453869685181708, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93810630.30985916, "logits/rejected": -86903035.50877193, "logps/chosen": -291.6056338028169, "logps/rejected": -311.29824561403507, "loss": 0.4019, "rewards/chosen": 0.0818937059859155, "rewards/margins": 1.0687358112490735, "rewards/rejected": -0.9868421052631579, "step": 234 }, { "epoch": 0.16112444292080905, "grad_norm": 0.5203813883322923, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98211241.35384615, "logits/rejected": -114378004.31746031, "logps/chosen": -296.12307692307695, "logps/rejected": -291.04761904761904, "loss": 0.3923, "rewards/chosen": -0.11150841346153846, "rewards/margins": 1.4242058722527473, "rewards/rejected": -1.5357142857142858, "step": 235 }, { "epoch": 0.16181007884813164, "grad_norm": 0.4553803363977109, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104918387.01449275, "logits/rejected": -91421609.22033899, "logps/chosen": -243.94202898550725, "logps/rejected": -264.6779661016949, "loss": 0.4009, "rewards/chosen": -0.06513247282608696, "rewards/margins": 1.0916471881908623, "rewards/rejected": -1.1567796610169492, "step": 236 }, { "epoch": 0.16249571477545424, "grad_norm": 0.5419118331474444, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -127506841.6, "logits/rejected": -91125563.61643836, "logps/chosen": -285.6727272727273, "logps/rejected": -378.3013698630137, "loss": 0.3506, "rewards/chosen": -0.015518465909090909, "rewards/margins": 1.6009198902552926, "rewards/rejected": -1.6164383561643836, "step": 237 }, { "epoch": 0.1631813507027768, "grad_norm": 0.5307430030024263, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93968541.53846154, "logits/rejected": -78698388.21052632, "logps/chosen": -314.46153846153845, "logps/rejected": -266.94736842105266, "loss": 0.3656, "rewards/chosen": -0.09084848257211539, "rewards/margins": 1.3565199384805162, "rewards/rejected": -1.4473684210526316, "step": 238 }, { "epoch": 0.1638669866300994, "grad_norm": 0.5302293425882556, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92274688.0, "logits/rejected": -94845390.4516129, "logps/chosen": -311.27272727272725, "logps/rejected": -283.61290322580646, "loss": 0.3648, "rewards/chosen": 0.095703125, "rewards/margins": 1.2582531590615549, "rewards/rejected": -1.1625500340615549, "step": 239 }, { "epoch": 0.164552622557422, "grad_norm": 0.5069081933919434, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93308899.94520548, "logits/rejected": -89986885.81818181, "logps/chosen": -292.3835616438356, "logps/rejected": -292.94545454545454, "loss": 0.4114, "rewards/chosen": -0.04387842465753425, "rewards/margins": 1.2947579389788293, "rewards/rejected": -1.3386363636363636, "step": 240 }, { "epoch": 0.1652382584847446, "grad_norm": 0.5404925130858234, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97179890.98305085, "logits/rejected": -82244830.60869566, "logps/chosen": -307.79661016949154, "logps/rejected": -328.3478260869565, "loss": 0.3674, "rewards/chosen": 0.08693061440677965, "rewards/margins": -0.5557477984314789, "rewards/rejected": 0.6426784128382586, "step": 241 }, { "epoch": 0.1659238944120672, "grad_norm": 0.51251906163133, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94920325.90769231, "logits/rejected": -101062753.52380952, "logps/chosen": -290.2153846153846, "logps/rejected": -347.93650793650795, "loss": 0.3746, "rewards/chosen": 0.07007211538461539, "rewards/margins": 1.5581673534798535, "rewards/rejected": -1.4880952380952381, "step": 242 }, { "epoch": 0.16660953033938977, "grad_norm": 0.6180344883751349, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94304189.93548387, "logits/rejected": -83758979.87878788, "logps/chosen": -262.7096774193548, "logps/rejected": -353.45454545454544, "loss": 0.3705, "rewards/chosen": 1.9611309420677923, "rewards/margins": 1.4914081942650579, "rewards/rejected": 0.4697227478027344, "step": 243 }, { "epoch": 0.16729516626671237, "grad_norm": 0.49441851461655295, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90749486.54545455, "logits/rejected": -86524432.51612903, "logps/chosen": -249.93939393939394, "logps/rejected": -341.6774193548387, "loss": 0.3852, "rewards/chosen": -0.05622632575757576, "rewards/margins": 1.5591970613391986, "rewards/rejected": -1.6154233870967742, "step": 244 }, { "epoch": 0.16798080219403497, "grad_norm": 0.5578795986512851, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109128164.07272728, "logits/rejected": -98020309.91780822, "logps/chosen": -285.96363636363634, "logps/rejected": -303.1232876712329, "loss": 0.3724, "rewards/chosen": 2.2790754838423295, "rewards/margins": 3.75339055233548, "rewards/rejected": -1.4743150684931507, "step": 245 }, { "epoch": 0.16866643812135756, "grad_norm": 0.6333043624313269, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91117638.62068966, "logits/rejected": -90357291.88571429, "logps/chosen": -265.1034482758621, "logps/rejected": -277.25714285714287, "loss": 0.3596, "rewards/chosen": 0.1000134698275862, "rewards/margins": 1.4714420412561577, "rewards/rejected": -1.3714285714285714, "step": 246 }, { "epoch": 0.16935207404868016, "grad_norm": 0.5158692465154214, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96165056.92753623, "logits/rejected": -88791282.98305085, "logps/chosen": -302.3768115942029, "logps/rejected": -299.1186440677966, "loss": 0.3806, "rewards/chosen": 0.06691576086956522, "rewards/margins": 1.7258140659543109, "rewards/rejected": -1.6588983050847457, "step": 247 }, { "epoch": 0.17003770997600273, "grad_norm": 0.5051353060885188, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98349197.2413793, "logits/rejected": -83047219.2, "logps/chosen": -304.0, "logps/rejected": -271.54285714285714, "loss": 0.3801, "rewards/chosen": -0.10452586206896551, "rewards/margins": 1.336545566502463, "rewards/rejected": -1.4410714285714286, "step": 248 }, { "epoch": 0.17072334590332533, "grad_norm": 0.4707247680986871, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88080384.0, "logits/rejected": -122446616.77419356, "logps/chosen": -311.5151515151515, "logps/rejected": -306.3225806451613, "loss": 0.3761, "rewards/chosen": 0.008552320075757576, "rewards/margins": 1.6375845781402738, "rewards/rejected": -1.6290322580645162, "step": 249 }, { "epoch": 0.17140898183064793, "grad_norm": 0.4762508051581914, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96801873.26984127, "logits/rejected": -103115350.64615385, "logps/chosen": -300.95238095238096, "logps/rejected": -262.6461538461538, "loss": 0.3846, "rewards/chosen": -0.008804563492063492, "rewards/margins": 1.4161954365079366, "rewards/rejected": -1.425, "step": 250 }, { "epoch": 0.17209461775797052, "grad_norm": 0.5679326786194394, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105906176.0, "logits/rejected": -73531392.0, "logps/chosen": -304.75, "logps/rejected": -292.0, "loss": 0.3577, "rewards/chosen": 2.211991548538208, "rewards/margins": 3.831132173538208, "rewards/rejected": -1.619140625, "step": 251 }, { "epoch": 0.17278025368529312, "grad_norm": 0.4690968715807265, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88213536.50793651, "logits/rejected": -97888602.58461538, "logps/chosen": -235.68253968253967, "logps/rejected": -283.32307692307694, "loss": 0.3725, "rewards/chosen": 0.022584945436507936, "rewards/margins": 1.5918157146672771, "rewards/rejected": -1.5692307692307692, "step": 252 }, { "epoch": 0.1734658896126157, "grad_norm": 0.5862047169108748, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -121772334.16393442, "logits/rejected": -85638923.46268657, "logps/chosen": -265.9672131147541, "logps/rejected": -352.0, "loss": 0.3828, "rewards/chosen": -0.1071657274590164, "rewards/margins": 32490744.251043227, "rewards/rejected": -32490744.358208954, "step": 253 }, { "epoch": 0.1741515255399383, "grad_norm": 0.4811574870181459, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78026390.58823529, "logits/rejected": -107094562.13333334, "logps/chosen": -248.7058823529412, "logps/rejected": -340.0, "loss": 0.3674, "rewards/chosen": -0.0006318933823529412, "rewards/margins": 1.8535347732843137, "rewards/rejected": -1.8541666666666667, "step": 254 }, { "epoch": 0.1748371614672609, "grad_norm": 0.4537811175993053, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86970127.05882353, "logits/rejected": -85284181.33333333, "logps/chosen": -277.1764705882353, "logps/rejected": -317.06666666666666, "loss": 0.3701, "rewards/chosen": 0.049057904411764705, "rewards/margins": 1.8469745710784313, "rewards/rejected": -1.7979166666666666, "step": 255 }, { "epoch": 0.17552279739458349, "grad_norm": 0.646219368389777, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -108186412.6984127, "logits/rejected": -92274688.0, "logps/chosen": -427.42857142857144, "logps/rejected": -374.6461538461538, "loss": 0.3577, "rewards/chosen": 0.03433469742063492, "rewards/margins": 1845092.0958731591, "rewards/rejected": -1845092.0615384616, "step": 256 }, { "epoch": 0.17620843332190606, "grad_norm": 0.5304887642371617, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90527061.33333333, "logits/rejected": -84185673.14285715, "logps/chosen": -310.22222222222223, "logps/rejected": -286.0, "loss": 0.3972, "rewards/chosen": 0.09239366319444445, "rewards/margins": -1.3350821298266213, "rewards/rejected": 1.4274757930210658, "step": 257 }, { "epoch": 0.17689406924922865, "grad_norm": 0.4900556261854408, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77428183.36507936, "logits/rejected": -100276129.47692308, "logps/chosen": -250.9206349206349, "logps/rejected": -362.33846153846156, "loss": 0.3464, "rewards/chosen": 0.048487103174603176, "rewards/margins": 0.8537727477058531, "rewards/rejected": -0.80528564453125, "step": 258 }, { "epoch": 0.17757970517655125, "grad_norm": 0.4967433604651902, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88450469.64705883, "logits/rejected": -91296017.06666666, "logps/chosen": -267.52941176470586, "logps/rejected": -289.8666666666667, "loss": 0.389, "rewards/chosen": -0.04428998161764706, "rewards/margins": 1.4411266850490196, "rewards/rejected": -1.4854166666666666, "step": 259 }, { "epoch": 0.17826534110387385, "grad_norm": 0.5081350477408298, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84548338.5263158, "logits/rejected": -119698983.38461539, "logps/chosen": -271.57894736842104, "logps/rejected": -335.53846153846155, "loss": 0.3905, "rewards/chosen": -0.007298519736842105, "rewards/margins": 1.8496726341093117, "rewards/rejected": -1.8569711538461537, "step": 260 }, { "epoch": 0.17895097703119645, "grad_norm": 0.44948238986021183, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86516406.23728813, "logits/rejected": -95557186.7826087, "logps/chosen": -299.1186440677966, "logps/rejected": -324.6376811594203, "loss": 0.3346, "rewards/chosen": 2.0725031707246426, "rewards/margins": 3.9728654895652222, "rewards/rejected": -1.9003623188405796, "step": 261 }, { "epoch": 0.17963661295851902, "grad_norm": 0.5701776212292279, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98972044.38709678, "logits/rejected": -88715884.60606061, "logps/chosen": -293.6774193548387, "logps/rejected": -364.1212121212121, "loss": 0.3553, "rewards/chosen": 0.06265751008064516, "rewards/margins": 1.6829226615957968, "rewards/rejected": -1.6202651515151516, "step": 262 }, { "epoch": 0.1803222488858416, "grad_norm": 0.5133173234066587, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90832896.0, "logits/rejected": -82116608.0, "logps/chosen": -302.75, "logps/rejected": -309.75, "loss": 0.3716, "rewards/chosen": -0.130126953125, "rewards/margins": 1.881591796875, "rewards/rejected": -2.01171875, "step": 263 }, { "epoch": 0.1810078848131642, "grad_norm": 0.5170784582791521, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -122292746.03921568, "logits/rejected": -87481197.71428572, "logps/chosen": -255.2156862745098, "logps/rejected": -252.88311688311688, "loss": 0.354, "rewards/chosen": 2.2692503087660847, "rewards/margins": 3.6702243347401105, "rewards/rejected": -1.400974025974026, "step": 264 }, { "epoch": 0.1816935207404868, "grad_norm": 0.4987638475370412, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95852182.58823529, "logits/rejected": -101641966.93333334, "logps/chosen": -288.2352941176471, "logps/rejected": -330.93333333333334, "loss": 0.3804, "rewards/chosen": 0.07645909926470588, "rewards/margins": 1.6368757659313724, "rewards/rejected": -1.5604166666666666, "step": 265 }, { "epoch": 0.1823791566678094, "grad_norm": 0.48270741146900187, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94054089.6969697, "logits/rejected": -82262478.4516129, "logps/chosen": -264.1212121212121, "logps/rejected": -267.35483870967744, "loss": 0.3944, "rewards/chosen": 2.880584716796875, "rewards/margins": 4.461229878087197, "rewards/rejected": -1.5806451612903225, "step": 266 }, { "epoch": 0.18306479259513198, "grad_norm": 0.5263141554099727, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100287686.68656716, "logits/rejected": -103413659.27868852, "logps/chosen": -369.67164179104475, "logps/rejected": -343.60655737704917, "loss": 0.3834, "rewards/chosen": 0.10439015858208955, "rewards/margins": 1.3461934372706144, "rewards/rejected": -1.2418032786885247, "step": 267 }, { "epoch": 0.18375042852245457, "grad_norm": 0.7140066555810668, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93061120.0, "logits/rejected": -84344832.0, "logps/chosen": -335.0, "logps/rejected": -334.0, "loss": 0.3581, "rewards/chosen": 0.0029144287109375, "rewards/margins": 1.9130706787109375, "rewards/rejected": -1.91015625, "step": 268 }, { "epoch": 0.18443606444977717, "grad_norm": 0.5104095998948502, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101757462.26086956, "logits/rejected": -99383677.83050847, "logps/chosen": -311.8840579710145, "logps/rejected": -349.2881355932203, "loss": 0.3488, "rewards/chosen": 0.15058876811594202, "rewards/margins": 2.546775208793908, "rewards/rejected": -2.3961864406779663, "step": 269 }, { "epoch": 0.18512170037709977, "grad_norm": 0.4850484408879755, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102830353.06666666, "logits/rejected": -85613146.35294117, "logps/chosen": -308.0, "logps/rejected": -295.29411764705884, "loss": 0.364, "rewards/chosen": 0.07127278645833333, "rewards/margins": 5420194.424213964, "rewards/rejected": -5420194.352941177, "step": 270 }, { "epoch": 0.18580733630442234, "grad_norm": 0.5192777554532683, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101908480.0, "logits/rejected": -72286208.0, "logps/chosen": -374.0, "logps/rejected": -274.5, "loss": 0.378, "rewards/chosen": -0.04791259765625, "rewards/margins": 1.42474365234375, "rewards/rejected": -1.47265625, "step": 271 }, { "epoch": 0.18649297223174494, "grad_norm": 0.48350687999290026, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96468992.0, "logits/rejected": -105836270.93333334, "logps/chosen": -255.2941176470588, "logps/rejected": -327.73333333333335, "loss": 0.3761, "rewards/chosen": 0.00505514705882353, "rewards/margins": 1.6967218137254902, "rewards/rejected": -1.6916666666666667, "step": 272 }, { "epoch": 0.18717860815906753, "grad_norm": 0.5123106910795452, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103995756.71232876, "logits/rejected": -100053215.41818182, "logps/chosen": -381.8082191780822, "logps/rejected": -306.3272727272727, "loss": 0.3846, "rewards/chosen": 0.1407320205479452, "rewards/margins": 1694362.177095657, "rewards/rejected": -1694362.0363636364, "step": 273 }, { "epoch": 0.18786424408639013, "grad_norm": 0.5316626728331246, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89224285.0909091, "logits/rejected": -102828098.06451613, "logps/chosen": -280.969696969697, "logps/rejected": -362.3225806451613, "loss": 0.3601, "rewards/chosen": -0.06315104166666667, "rewards/margins": 2.108219926075269, "rewards/rejected": -2.1713709677419355, "step": 274 }, { "epoch": 0.18854988001371273, "grad_norm": 0.4937577582568332, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107894854.62068966, "logits/rejected": -84125754.51428571, "logps/chosen": -287.17241379310343, "logps/rejected": -287.54285714285714, "loss": 0.3512, "rewards/chosen": 0.016870959051724137, "rewards/margins": 1.7329423876231527, "rewards/rejected": -1.7160714285714285, "step": 275 }, { "epoch": 0.1892355159410353, "grad_norm": 0.4711096335307074, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85652102.73684211, "logits/rejected": -74256903.2112676, "logps/chosen": -230.87719298245614, "logps/rejected": -325.85915492957747, "loss": 0.3415, "rewards/chosen": 0.0561609100877193, "rewards/margins": 1.9012313326229306, "rewards/rejected": -1.8450704225352113, "step": 276 }, { "epoch": 0.1899211518683579, "grad_norm": 0.44993163646136597, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93631668.70588236, "logits/rejected": -96468992.0, "logps/chosen": -331.29411764705884, "logps/rejected": -314.6666666666667, "loss": 0.3544, "rewards/chosen": 0.13470818014705882, "rewards/margins": 2.253458180147059, "rewards/rejected": -2.11875, "step": 277 }, { "epoch": 0.1906067877956805, "grad_norm": 0.4993515000988666, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -64014705.31147541, "logits/rejected": -85889329.6716418, "logps/chosen": -203.01639344262296, "logps/rejected": -266.02985074626866, "loss": 0.3716, "rewards/chosen": 2.091400396628458, "rewards/margins": 3.514907859315025, "rewards/rejected": -1.4235074626865671, "step": 278 }, { "epoch": 0.1912924237230031, "grad_norm": 0.49734864187629957, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90032904.8275862, "logits/rejected": -88260139.88571429, "logps/chosen": -240.13793103448276, "logps/rejected": -342.85714285714283, "loss": 0.3498, "rewards/chosen": 0.05455280172413793, "rewards/margins": 1.324195658866995, "rewards/rejected": -1.269642857142857, "step": 279 }, { "epoch": 0.1919780596503257, "grad_norm": 0.5157528024925647, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97470616.8358209, "logits/rejected": -83198489.18032786, "logps/chosen": -382.56716417910445, "logps/rejected": -279.8688524590164, "loss": 0.3685, "rewards/chosen": 0.11438899253731344, "rewards/margins": 1.1512742384389527, "rewards/rejected": -1.0368852459016393, "step": 280 }, { "epoch": 0.19266369557764826, "grad_norm": 0.505783135026677, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93576368.55172414, "logits/rejected": -91855257.6, "logps/chosen": -300.2758620689655, "logps/rejected": -319.54285714285714, "loss": 0.3532, "rewards/chosen": 0.1515355603448276, "rewards/margins": 1.8015355603448275, "rewards/rejected": -1.65, "step": 281 }, { "epoch": 0.19334933150497086, "grad_norm": 0.5371803724224713, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100663296.0, "logits/rejected": -93512351.47540984, "logps/chosen": -277.4925373134328, "logps/rejected": -304.5245901639344, "loss": 0.367, "rewards/chosen": 1.7094824492041745, "rewards/margins": 4667090.496367695, "rewards/rejected": -4667088.786885246, "step": 282 }, { "epoch": 0.19403496743229345, "grad_norm": 0.5032616953404185, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81395712.0, "logits/rejected": -86638592.0, "logps/chosen": -357.25, "logps/rejected": -366.0, "loss": 0.3633, "rewards/chosen": -0.115478515625, "rewards/margins": 1.851318359375, "rewards/rejected": -1.966796875, "step": 283 }, { "epoch": 0.19472060335961605, "grad_norm": 0.5138693073916883, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85590016.0, "logits/rejected": -87752704.0, "logps/chosen": -298.0, "logps/rejected": -353.0, "loss": 0.3398, "rewards/chosen": 0.135009765625, "rewards/margins": 1.1984891891479492, "rewards/rejected": -1.0634794235229492, "step": 284 }, { "epoch": 0.19540623928693865, "grad_norm": 0.48752083268413715, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89395016.59701492, "logits/rejected": -69721709.1147541, "logps/chosen": -304.8358208955224, "logps/rejected": -322.3606557377049, "loss": 0.3644, "rewards/chosen": 2.313480661876166, "rewards/margins": 3.888544648732595, "rewards/rejected": -1.5750639868564293, "step": 285 }, { "epoch": 0.19609187521426122, "grad_norm": 0.4959919196560829, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97770672.55172414, "logits/rejected": -105576623.54285714, "logps/chosen": -326.0689655172414, "logps/rejected": -312.45714285714286, "loss": 0.3459, "rewards/chosen": 1.9316120805411503, "rewards/margins": 3.93161208054115, "rewards/rejected": -2.0, "step": 286 }, { "epoch": 0.19677751114158382, "grad_norm": 0.46966478937164036, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88735744.0, "logits/rejected": -93978624.0, "logps/chosen": -320.5, "logps/rejected": -268.25, "loss": 0.3646, "rewards/chosen": 0.0467529296875, "rewards/margins": 1.6766357421875, "rewards/rejected": -1.6298828125, "step": 287 }, { "epoch": 0.19746314706890641, "grad_norm": 0.6098427972464755, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94702969.26315789, "logits/rejected": -101372192.45070423, "logps/chosen": -269.6140350877193, "logps/rejected": -293.40845070422534, "loss": 0.3681, "rewards/chosen": 0.03522478070175439, "rewards/margins": 1.5616332314059798, "rewards/rejected": -1.5264084507042253, "step": 288 }, { "epoch": 0.198148782996229, "grad_norm": 0.4712997535720034, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79691776.0, "logits/rejected": -84132803.76470588, "logps/chosen": -236.53333333333333, "logps/rejected": -261.4117647058824, "loss": 0.3782, "rewards/chosen": -0.05789388020833333, "rewards/margins": 1.3281355315563725, "rewards/rejected": -1.3860294117647058, "step": 289 }, { "epoch": 0.19883441892355158, "grad_norm": 0.5058013229017009, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96613623.1724138, "logits/rejected": -76516088.68571429, "logps/chosen": -375.7241379310345, "logps/rejected": -283.2, "loss": 0.3557, "rewards/chosen": 0.05691002155172414, "rewards/margins": 1.78012430726601, "rewards/rejected": -1.7232142857142858, "step": 290 }, { "epoch": 0.19952005485087418, "grad_norm": 0.45095246991978954, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89828010.66666667, "logits/rejected": -92027964.23529412, "logps/chosen": -280.26666666666665, "logps/rejected": -360.94117647058823, "loss": 0.3527, "rewards/chosen": -0.022184244791666665, "rewards/margins": 2.106492225796569, "rewards/rejected": -2.1286764705882355, "step": 291 }, { "epoch": 0.20020569077819678, "grad_norm": 0.4652411826825061, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107374182.4, "logits/rejected": -119176086.06896552, "logps/chosen": -275.8857142857143, "logps/rejected": -371.3103448275862, "loss": 0.3546, "rewards/chosen": 0.12388392857142858, "rewards/margins": 2.279056342364532, "rewards/rejected": -2.1551724137931036, "step": 292 }, { "epoch": 0.20089132670551937, "grad_norm": 0.53161062483662, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95452191.03030303, "logits/rejected": -103775198.96774194, "logps/chosen": -294.7878787878788, "logps/rejected": -329.80645161290323, "loss": 0.3666, "rewards/chosen": 0.058830492424242424, "rewards/margins": 1.7382659762952102, "rewards/rejected": -1.6794354838709677, "step": 293 }, { "epoch": 0.20157696263284197, "grad_norm": 0.46481898473697614, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84872975.05882353, "logits/rejected": -93113548.8, "logps/chosen": -220.47058823529412, "logps/rejected": -330.6666666666667, "loss": 0.3734, "rewards/chosen": 2.0239814309512867, "rewards/margins": 4.155231430951287, "rewards/rejected": -2.13125, "step": 294 }, { "epoch": 0.20226259856016454, "grad_norm": 0.5767497692460318, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77797574.19354838, "logits/rejected": -103904349.0909091, "logps/chosen": -215.74193548387098, "logps/rejected": -359.27272727272725, "loss": 0.3519, "rewards/chosen": 2.0166698578865296, "rewards/margins": 3.8575789487956205, "rewards/rejected": -1.8409090909090908, "step": 295 }, { "epoch": 0.20294823448748714, "grad_norm": 0.4454580389823589, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84980246.26086956, "logits/rejected": -74502213.42372881, "logps/chosen": -269.6811594202899, "logps/rejected": -287.45762711864404, "loss": 0.3585, "rewards/chosen": 0.1174705615942029, "rewards/margins": 2.0284875107467455, "rewards/rejected": -1.9110169491525424, "step": 296 }, { "epoch": 0.20363387041480974, "grad_norm": 0.46995595848557437, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97533218.3880597, "logits/rejected": -107607963.27868852, "logps/chosen": -293.97014925373134, "logps/rejected": -366.95081967213116, "loss": 0.3626, "rewards/chosen": 0.03434001865671642, "rewards/margins": 0.678793223702823, "rewards/rejected": -0.6444532050461066, "step": 297 }, { "epoch": 0.20431950634213233, "grad_norm": 0.49154610433140394, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83296256.0, "logits/rejected": -127795200.0, "logps/chosen": -246.5, "logps/rejected": -333.5, "loss": 0.3632, "rewards/chosen": 0.0162353515625, "rewards/margins": 1.7779541015625, "rewards/rejected": -1.76171875, "step": 298 }, { "epoch": 0.20500514226945493, "grad_norm": 0.5417616454607392, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93902328.35820895, "logits/rejected": -115652775.86885247, "logps/chosen": -280.35820895522386, "logps/rejected": -314.2295081967213, "loss": 0.3565, "rewards/chosen": 0.05153917910447761, "rewards/margins": -0.292740876207382, "rewards/rejected": 0.3442800553118596, "step": 299 }, { "epoch": 0.2056907781967775, "grad_norm": 0.49041233730997463, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83427328.0, "logits/rejected": -97779712.0, "logps/chosen": -316.0, "logps/rejected": -381.5, "loss": 0.3548, "rewards/chosen": 1.6761536598205566, "rewards/margins": 5431376.17615366, "rewards/rejected": -5431374.5, "step": 300 }, { "epoch": 0.2063764141241001, "grad_norm": 0.5516091514490956, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100546787.55555555, "logits/rejected": -90926518.85714285, "logps/chosen": -278.22222222222223, "logps/rejected": -297.7142857142857, "loss": 0.3897, "rewards/chosen": -0.08878580729166667, "rewards/margins": 1.806303478422619, "rewards/rejected": -1.8950892857142858, "step": 301 }, { "epoch": 0.2070620500514227, "grad_norm": 0.42159477054538985, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88207484.12121212, "logits/rejected": -74279770.83870968, "logps/chosen": -252.6060606060606, "logps/rejected": -276.38709677419354, "loss": 0.356, "rewards/chosen": 0.11570785984848485, "rewards/margins": 2.0451433437194524, "rewards/rejected": -1.9294354838709677, "step": 302 }, { "epoch": 0.2077476859787453, "grad_norm": 0.4650809761211422, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86936482.9090909, "logits/rejected": -73276958.11764705, "logps/chosen": -308.15584415584414, "logps/rejected": -255.68627450980392, "loss": 0.4031, "rewards/chosen": -0.012073863636363636, "rewards/margins": 1.8224849598930482, "rewards/rejected": -1.8345588235294117, "step": 303 }, { "epoch": 0.20843332190606786, "grad_norm": 0.4445780672967585, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80581476.84848484, "logits/rejected": -101272146.58064516, "logps/chosen": -277.8181818181818, "logps/rejected": -313.2903225806452, "loss": 0.3453, "rewards/chosen": 0.1426373106060606, "rewards/margins": 2.1325566654447705, "rewards/rejected": -1.9899193548387097, "step": 304 }, { "epoch": 0.20911895783339046, "grad_norm": 0.4547028821487842, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86948587.68253969, "logits/rejected": -93823354.09230769, "logps/chosen": -225.015873015873, "logps/rejected": -374.6461538461538, "loss": 0.3275, "rewards/chosen": 0.029947916666666668, "rewards/margins": 2.5337940705128204, "rewards/rejected": -2.503846153846154, "step": 305 }, { "epoch": 0.20980459376071306, "grad_norm": 0.5128970963220834, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101635888.23188406, "logits/rejected": -102867082.84745763, "logps/chosen": -308.17391304347825, "logps/rejected": -306.7118644067797, "loss": 0.3758, "rewards/chosen": 0.03974184782608696, "rewards/margins": 1.2317408629876336, "rewards/rejected": -1.1919990151615467, "step": 306 }, { "epoch": 0.21049022968803566, "grad_norm": 0.4554532885783403, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72197036.06557377, "logits/rejected": -83385267.58208956, "logps/chosen": -198.55737704918033, "logps/rejected": -306.14925373134326, "loss": 0.3452, "rewards/chosen": 0.026479252049180328, "rewards/margins": 1.97237477443724, "rewards/rejected": -1.9458955223880596, "step": 307 }, { "epoch": 0.21117586561535825, "grad_norm": 0.49055864711618324, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91076315.42857143, "logits/rejected": -94488348.44444445, "logps/chosen": -256.85714285714283, "logps/rejected": -313.77777777777777, "loss": 0.3268, "rewards/chosen": 0.2553013392857143, "rewards/margins": 1.9809957837301586, "rewards/rejected": -1.7256944444444444, "step": 308 }, { "epoch": 0.21186150154268082, "grad_norm": 0.44346140009779417, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84267380.36363636, "logits/rejected": -115816910.4516129, "logps/chosen": -269.8181818181818, "logps/rejected": -364.64516129032256, "loss": 0.3546, "rewards/chosen": 1.4349253105394768, "rewards/margins": 3.4812962782814125, "rewards/rejected": -2.0463709677419355, "step": 309 }, { "epoch": 0.21254713747000342, "grad_norm": 0.4599581761281105, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -65807183.448275864, "logits/rejected": -73100726.85714285, "logps/chosen": -215.17241379310346, "logps/rejected": -342.85714285714283, "loss": 0.3425, "rewards/chosen": -0.017847521551724137, "rewards/margins": 2.2125096213054185, "rewards/rejected": -2.2303571428571427, "step": 310 }, { "epoch": 0.21323277339732602, "grad_norm": 0.4664897517240057, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92783088.48484848, "logits/rejected": -86186182.19354838, "logps/chosen": -302.3030303030303, "logps/rejected": -312.51612903225805, "loss": 0.3731, "rewards/chosen": 0.18110795454545456, "rewards/margins": 1.261753115835777, "rewards/rejected": -1.0806451612903225, "step": 311 }, { "epoch": 0.21391840932464862, "grad_norm": 0.5086030421461495, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92639410.08695652, "logits/rejected": -109762802.98305085, "logps/chosen": -229.79710144927537, "logps/rejected": -290.9830508474576, "loss": 0.3701, "rewards/chosen": 0.17266757246376813, "rewards/margins": 1.5180065555146156, "rewards/rejected": -1.3453389830508475, "step": 312 }, { "epoch": 0.21460404525197121, "grad_norm": 0.5139687473912289, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87263311.79220779, "logits/rejected": -71961098.03921568, "logps/chosen": -284.46753246753246, "logps/rejected": -233.09803921568627, "loss": 0.3937, "rewards/chosen": 0.09424462256493507, "rewards/margins": 1.6653230539374841, "rewards/rejected": -1.571078431372549, "step": 313 }, { "epoch": 0.21528968117929378, "grad_norm": 0.4859084572685694, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -119707703.35135135, "logits/rejected": -75652816.5925926, "logps/chosen": -325.18918918918916, "logps/rejected": -296.8888888888889, "loss": 0.362, "rewards/chosen": 0.14189189189189189, "rewards/margins": 2.2854104104104103, "rewards/rejected": -2.1435185185185186, "step": 314 }, { "epoch": 0.21597531710661638, "grad_norm": 0.4907442317376305, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87947231.49206349, "logits/rejected": -78336693.16923077, "logps/chosen": -288.76190476190476, "logps/rejected": -323.9384615384615, "loss": 0.3519, "rewards/chosen": -0.007688492063492063, "rewards/margins": 1.8807730463980463, "rewards/rejected": -1.8884615384615384, "step": 315 }, { "epoch": 0.21666095303393898, "grad_norm": 0.48544145360148594, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99010994.42424242, "logits/rejected": -80774177.03225806, "logps/chosen": -314.90909090909093, "logps/rejected": -316.38709677419354, "loss": 0.3777, "rewards/chosen": 1.7273074063387783, "rewards/margins": 3.3946461160161974, "rewards/rejected": -1.6673387096774193, "step": 316 }, { "epoch": 0.21734658896126158, "grad_norm": 0.4562921268705196, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84035876.57142857, "logits/rejected": -92507704.8888889, "logps/chosen": -200.85714285714286, "logps/rejected": -266.6666666666667, "loss": 0.3586, "rewards/chosen": 0.037806919642857144, "rewards/margins": 1.6037791418650795, "rewards/rejected": -1.5659722222222223, "step": 317 }, { "epoch": 0.21803222488858418, "grad_norm": 0.4717166414140821, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93125706.20289855, "logits/rejected": -83743900.20338982, "logps/chosen": -273.6231884057971, "logps/rejected": -286.10169491525426, "loss": 0.3662, "rewards/chosen": 0.043733016304347824, "rewards/margins": 1.9314448807111275, "rewards/rejected": -1.8877118644067796, "step": 318 }, { "epoch": 0.21871786081590674, "grad_norm": 0.5145535042339248, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84794845.86666666, "logits/rejected": -115528402.8235294, "logps/chosen": -232.8, "logps/rejected": -302.3529411764706, "loss": 0.3544, "rewards/chosen": 2.462629191080729, "rewards/margins": 3.901048308727788, "rewards/rejected": -1.4384191176470589, "step": 319 }, { "epoch": 0.21940349674322934, "grad_norm": 0.4918490247905495, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94431758.62857144, "logits/rejected": -80270300.68965517, "logps/chosen": -282.2857142857143, "logps/rejected": -245.93103448275863, "loss": 0.3966, "rewards/chosen": 0.04210379464285714, "rewards/margins": 0.9427187633044615, "rewards/rejected": -0.9006149686616043, "step": 320 }, { "epoch": 0.22008913267055194, "grad_norm": 0.47567883415555934, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87141360.71641791, "logits/rejected": -84711188.98360656, "logps/chosen": -306.3880597014925, "logps/rejected": -320.26229508196724, "loss": 0.3616, "rewards/chosen": 0.12173507462686567, "rewards/margins": 1.9762432713481772, "rewards/rejected": -1.8545081967213115, "step": 321 }, { "epoch": 0.22077476859787454, "grad_norm": 0.4881834342139794, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97092469.62162162, "logits/rejected": -108119836.44444445, "logps/chosen": -250.8108108108108, "logps/rejected": -408.0, "loss": 0.3633, "rewards/chosen": -0.02465160472972973, "rewards/margins": 2.5980335804554553, "rewards/rejected": -2.622685185185185, "step": 322 }, { "epoch": 0.2214604045251971, "grad_norm": 0.4748184852792245, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98672778.84745763, "logits/rejected": -110997088.46376811, "logps/chosen": -284.20338983050846, "logps/rejected": -322.3188405797101, "loss": 0.3283, "rewards/chosen": 0.2604939088983051, "rewards/margins": 2.385493908898305, "rewards/rejected": -2.125, "step": 323 }, { "epoch": 0.2221460404525197, "grad_norm": 0.5767618278033512, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100952558.34482759, "logits/rejected": -106655158.85714285, "logps/chosen": -388.13793103448273, "logps/rejected": -355.2, "loss": 0.3393, "rewards/chosen": 2.3072720231681036, "rewards/margins": 3.213956898656385, "rewards/rejected": -0.9066848754882812, "step": 324 }, { "epoch": 0.2228316763798423, "grad_norm": 0.5060790212515645, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93832572.34285714, "logits/rejected": -103519761.65517241, "logps/chosen": -293.9428571428571, "logps/rejected": -377.51724137931035, "loss": 0.3723, "rewards/chosen": 0.02880859375, "rewards/margins": 2.317601697198276, "rewards/rejected": -2.288793103448276, "step": 325 }, { "epoch": 0.2235173123071649, "grad_norm": 0.4821692981788757, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90568530.44067797, "logits/rejected": -74403305.73913044, "logps/chosen": -254.3728813559322, "logps/rejected": -293.5652173913044, "loss": 0.3437, "rewards/chosen": 0.01969676906779661, "rewards/margins": 1.583102566169246, "rewards/rejected": -1.5634057971014492, "step": 326 }, { "epoch": 0.2242029482344875, "grad_norm": 0.45442124634873116, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84250802.08695652, "logits/rejected": -83104091.11864407, "logps/chosen": -261.1014492753623, "logps/rejected": -295.864406779661, "loss": 0.3581, "rewards/chosen": 1.5570329306782156, "rewards/margins": 0.7143091488985546, "rewards/rejected": 0.842723781779661, "step": 327 }, { "epoch": 0.22488858416181007, "grad_norm": 0.46231237997691227, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102216741.92592593, "logits/rejected": -81618888.64864865, "logps/chosen": -301.037037037037, "logps/rejected": -323.8918918918919, "loss": 0.3331, "rewards/chosen": 0.1267361111111111, "rewards/margins": 2.2365334084084085, "rewards/rejected": -2.1097972972972974, "step": 328 }, { "epoch": 0.22557422008913267, "grad_norm": 0.5315262722645477, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97320960.0, "logits/rejected": -98828288.0, "logps/chosen": -337.75, "logps/rejected": -337.0, "loss": 0.3496, "rewards/chosen": 0.03302001953125, "rewards/margins": 2.08966064453125, "rewards/rejected": -2.056640625, "step": 329 }, { "epoch": 0.22625985601645526, "grad_norm": 0.5042614757860037, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82648416.52459016, "logits/rejected": -82571447.40298508, "logps/chosen": -274.0983606557377, "logps/rejected": -357.0149253731343, "loss": 0.3323, "rewards/chosen": 0.28125, "rewards/margins": 1.7196828358208955, "rewards/rejected": -1.4384328358208955, "step": 330 }, { "epoch": 0.22694549194377786, "grad_norm": 0.5030183044080642, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84436152.6557377, "logits/rejected": -88769001.07462686, "logps/chosen": -260.72131147540983, "logps/rejected": -289.67164179104475, "loss": 0.3563, "rewards/chosen": 0.1479252049180328, "rewards/margins": 1.778522219843406, "rewards/rejected": -1.6305970149253732, "step": 331 }, { "epoch": 0.22763112787110046, "grad_norm": 0.4908340810517053, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83198489.18032786, "logits/rejected": -92086883.34328358, "logps/chosen": -276.72131147540983, "logps/rejected": -340.05970149253733, "loss": 0.3596, "rewards/chosen": 0.01831454918032787, "rewards/margins": 1.9604787282848055, "rewards/rejected": -1.9421641791044777, "step": 332 }, { "epoch": 0.22831676379842303, "grad_norm": 0.5140100662597951, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81678551.57894737, "logits/rejected": -92510986.81690142, "logps/chosen": -236.0701754385965, "logps/rejected": -346.59154929577466, "loss": 0.3219, "rewards/chosen": 0.03303179824561404, "rewards/margins": 2.4573275728935013, "rewards/rejected": -2.4242957746478875, "step": 333 }, { "epoch": 0.22900239972574563, "grad_norm": 0.44990564084739193, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109310015.01538461, "logits/rejected": -90676857.90476191, "logps/chosen": -267.0769230769231, "logps/rejected": -304.76190476190476, "loss": 0.369, "rewards/chosen": 2.0761195256159857, "rewards/margins": 3.7626274621239224, "rewards/rejected": -1.6865079365079365, "step": 334 }, { "epoch": 0.22968803565306822, "grad_norm": 0.5008501005773538, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91724615.3442623, "logits/rejected": -85826728.11940299, "logps/chosen": -350.6885245901639, "logps/rejected": -343.8805970149254, "loss": 0.3082, "rewards/chosen": 0.4305840163934426, "rewards/margins": 1.9553037101609334, "rewards/rejected": -1.5247196937674907, "step": 335 }, { "epoch": 0.23037367158039082, "grad_norm": 0.42789684260861294, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95420416.0, "logits/rejected": -94371840.0, "logps/chosen": -242.57142857142858, "logps/rejected": -368.44444444444446, "loss": 0.3107, "rewards/chosen": 0.08286830357142858, "rewards/margins": 2.721757192460317, "rewards/rejected": -2.638888888888889, "step": 336 }, { "epoch": 0.2310593075077134, "grad_norm": 0.5032352107526085, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91766287.51515152, "logits/rejected": -84494930.58064516, "logps/chosen": -335.5151515151515, "logps/rejected": -261.93548387096774, "loss": 0.3656, "rewards/chosen": 1.6182891383315579, "rewards/margins": 3.4207084931702676, "rewards/rejected": -1.8024193548387097, "step": 337 }, { "epoch": 0.231744943435036, "grad_norm": 0.4821489951499927, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100197262.22222222, "logits/rejected": -98452784.43243243, "logps/chosen": -330.962962962963, "logps/rejected": -307.2432432432432, "loss": 0.3084, "rewards/chosen": 1.9270973205566406, "rewards/margins": 3.4838938326449007, "rewards/rejected": -1.55679651208826, "step": 338 }, { "epoch": 0.23243057936235859, "grad_norm": 0.45521305104214904, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101711872.0, "logits/rejected": -90993095.1111111, "logps/chosen": -280.42857142857144, "logps/rejected": -333.3333333333333, "loss": 0.3433, "rewards/chosen": 2.3560303279331754, "rewards/margins": 4.559155327933175, "rewards/rejected": -2.203125, "step": 339 }, { "epoch": 0.23311621528968118, "grad_norm": 0.4453971497014749, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90876586.66666667, "logits/rejected": -90338855.38461539, "logps/chosen": -329.6507936507937, "logps/rejected": -286.2769230769231, "loss": 0.334, "rewards/chosen": 2.4523102291046626, "rewards/margins": 4.331156382950816, "rewards/rejected": -1.8788461538461538, "step": 340 }, { "epoch": 0.23380185121700378, "grad_norm": 0.5026765766874794, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95178436.92307693, "logits/rejected": -98235014.73684211, "logps/chosen": -296.0, "logps/rejected": -301.05263157894734, "loss": 0.3169, "rewards/chosen": 0.14547025240384615, "rewards/margins": 0.15078492878902294, "rewards/rejected": -0.005314676385176809, "step": 341 }, { "epoch": 0.23448748714432635, "grad_norm": 0.49126109362932713, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93456182.08450705, "logits/rejected": -118470691.92982456, "logps/chosen": -280.11267605633805, "logps/rejected": -335.719298245614, "loss": 0.3502, "rewards/chosen": 0.171599911971831, "rewards/margins": 2.421599911971831, "rewards/rejected": -2.25, "step": 342 }, { "epoch": 0.23517312307164895, "grad_norm": 0.430067087333576, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90380486.19354838, "logits/rejected": -98439043.87878788, "logps/chosen": -232.0, "logps/rejected": -362.6666666666667, "loss": 0.3457, "rewards/chosen": -0.0025201612903225806, "rewards/margins": 2.3649040811339197, "rewards/rejected": -2.367424242424242, "step": 343 }, { "epoch": 0.23585875899897155, "grad_norm": 0.5015973409526279, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77795720.76712328, "logits/rejected": -84572420.65454546, "logps/chosen": -263.2328767123288, "logps/rejected": -300.8, "loss": 0.3594, "rewards/chosen": 0.17698523116438356, "rewards/margins": 0.6926097456574589, "rewards/rejected": -0.5156245144930753, "step": 344 }, { "epoch": 0.23654439492629414, "grad_norm": 0.481014034243041, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86948587.68253969, "logits/rejected": -79304609.47692308, "logps/chosen": -269.2063492063492, "logps/rejected": -296.12307692307695, "loss": 0.3744, "rewards/chosen": 0.004774305555555556, "rewards/margins": 1.7374666132478633, "rewards/rejected": -1.7326923076923078, "step": 345 }, { "epoch": 0.23723003085361674, "grad_norm": 0.4360951894858122, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -68388482.16949153, "logits/rejected": -78536822.72463769, "logps/chosen": -219.9322033898305, "logps/rejected": -266.4347826086956, "loss": 0.3658, "rewards/chosen": -0.0487453654661017, "rewards/margins": 1.7863995620701303, "rewards/rejected": -1.835144927536232, "step": 346 }, { "epoch": 0.2379156667809393, "grad_norm": 0.43691546019421473, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82687707.42857143, "logits/rejected": -98469352.36923076, "logps/chosen": -265.6507936507937, "logps/rejected": -378.0923076923077, "loss": 0.3368, "rewards/chosen": 0.2166418650793651, "rewards/margins": 2.573372634310134, "rewards/rejected": -2.356730769230769, "step": 347 }, { "epoch": 0.2386013027082619, "grad_norm": 0.4622831656185244, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103284736.0, "logits/rejected": -114556928.0, "logps/chosen": -208.75, "logps/rejected": -352.5, "loss": 0.3558, "rewards/chosen": -0.0953369140625, "rewards/margins": 2.2230224609375, "rewards/rejected": -2.318359375, "step": 348 }, { "epoch": 0.2392869386355845, "grad_norm": 0.4366868000528226, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103539390.17142858, "logits/rejected": -70869274.48275863, "logps/chosen": -379.8857142857143, "logps/rejected": -293.7931034482759, "loss": 0.351, "rewards/chosen": 0.3763950892857143, "rewards/margins": 2.3548433651477834, "rewards/rejected": -1.978448275862069, "step": 349 }, { "epoch": 0.2399725745629071, "grad_norm": 0.6469912868105077, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83953730.06451613, "logits/rejected": -88588784.48484848, "logps/chosen": -318.4516129032258, "logps/rejected": -304.969696969697, "loss": 0.3583, "rewards/chosen": -0.002772177419354839, "rewards/margins": -0.635348281785773, "rewards/rejected": 0.6325761043664181, "step": 350 }, { "epoch": 0.2406582104902297, "grad_norm": 0.5541627302785789, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94741925.64705883, "logits/rejected": -134567253.33333334, "logps/chosen": -317.6470588235294, "logps/rejected": -333.8666666666667, "loss": 0.372, "rewards/chosen": 0.09512867647058823, "rewards/margins": 1.4818472993140128, "rewards/rejected": -1.3867186228434245, "step": 351 }, { "epoch": 0.24134384641755227, "grad_norm": 0.4389652475732392, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99383677.83050847, "logits/rejected": -79083905.85507247, "logps/chosen": -321.6271186440678, "logps/rejected": -294.95652173913044, "loss": 0.3532, "rewards/chosen": 0.2106064618644068, "rewards/margins": 1.9461137082412183, "rewards/rejected": -1.7355072463768115, "step": 352 }, { "epoch": 0.24202948234487487, "grad_norm": 0.47425278657608577, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84009441.88235295, "logits/rejected": -83886080.0, "logps/chosen": -254.35294117647058, "logps/rejected": -256.53333333333336, "loss": 0.3551, "rewards/chosen": 0.14039522058823528, "rewards/margins": 1.5633118872549019, "rewards/rejected": -1.4229166666666666, "step": 353 }, { "epoch": 0.24271511827219747, "grad_norm": 0.5124323579741519, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93353223.31428571, "logits/rejected": -77811570.7586207, "logps/chosen": -300.34285714285716, "logps/rejected": -326.8965517241379, "loss": 0.3406, "rewards/chosen": 0.26674107142857145, "rewards/margins": 1.9951893472906403, "rewards/rejected": -1.728448275862069, "step": 354 }, { "epoch": 0.24340075419952006, "grad_norm": 0.4048463367078053, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97565963.81538461, "logits/rejected": -93206755.55555555, "logps/chosen": -354.46153846153845, "logps/rejected": -350.984126984127, "loss": 0.3175, "rewards/chosen": 0.45697115384615383, "rewards/margins": 2.9014155982905985, "rewards/rejected": -2.4444444444444446, "step": 355 }, { "epoch": 0.24408639012684263, "grad_norm": 0.4736152671640628, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89162785.03225806, "logits/rejected": -110132255.03030303, "logps/chosen": -259.8709677419355, "logps/rejected": -317.3333333333333, "loss": 0.3651, "rewards/chosen": -0.023626512096774195, "rewards/margins": 1.3030780333577712, "rewards/rejected": -1.3267045454545454, "step": 356 }, { "epoch": 0.24477202605416523, "grad_norm": 0.5033823744101059, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103692515.55555555, "logits/rejected": -114144987.42857143, "logps/chosen": -293.1111111111111, "logps/rejected": -367.7142857142857, "loss": 0.3581, "rewards/chosen": 0.14078776041666666, "rewards/margins": 2.4198056175595237, "rewards/rejected": -2.279017857142857, "step": 357 }, { "epoch": 0.24545766198148783, "grad_norm": 0.5264035168286375, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97481410.20689656, "logits/rejected": -99944272.45714286, "logps/chosen": -288.82758620689657, "logps/rejected": -300.8, "loss": 0.3492, "rewards/chosen": 1.7173436921218346, "rewards/margins": 3.8655579778361204, "rewards/rejected": -2.148214285714286, "step": 358 }, { "epoch": 0.24614329790881043, "grad_norm": 0.4127156634504339, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91365922.13333334, "logits/rejected": -73523681.88235295, "logps/chosen": -242.13333333333333, "logps/rejected": -275.7647058823529, "loss": 0.3643, "rewards/chosen": -0.05286458333333333, "rewards/margins": 1.7927236519607843, "rewards/rejected": -1.8455882352941178, "step": 359 }, { "epoch": 0.24682893383613302, "grad_norm": 0.46675345655479666, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105725387.03448276, "logits/rejected": -76036739.65714286, "logps/chosen": -372.9655172413793, "logps/rejected": -246.62857142857143, "loss": 0.3498, "rewards/chosen": 2.094500837654903, "rewards/margins": 3.1503013573256617, "rewards/rejected": -1.055800519670759, "step": 360 }, { "epoch": 0.2475145697634556, "grad_norm": 0.5092712416675254, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93125706.20289855, "logits/rejected": -69028291.2542373, "logps/chosen": -321.8550724637681, "logps/rejected": -327.0508474576271, "loss": 0.3574, "rewards/chosen": 0.10411005434782608, "rewards/margins": 2.3647032746868093, "rewards/rejected": -2.260593220338983, "step": 361 }, { "epoch": 0.2482002056907782, "grad_norm": 0.4687343210426719, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94671433.14285715, "logits/rejected": -96662575.26153846, "logps/chosen": -247.61904761904762, "logps/rejected": -312.3692307692308, "loss": 0.3446, "rewards/chosen": 0.09126984126984126, "rewards/margins": 2.0912698412698414, "rewards/rejected": -2.0, "step": 362 }, { "epoch": 0.2488858416181008, "grad_norm": 0.5319198588261344, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92162839.89333333, "logits/rejected": -84202631.24528302, "logps/chosen": -247.68, "logps/rejected": -352.3018867924528, "loss": 0.3827, "rewards/chosen": 1.1503927612304687, "rewards/margins": 2.441666346136129, "rewards/rejected": -1.2912735849056605, "step": 363 }, { "epoch": 0.2495714775454234, "grad_norm": 0.4613323973067412, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86155129.70491803, "logits/rejected": -102666545.6716418, "logps/chosen": -296.655737704918, "logps/rejected": -341.0149253731343, "loss": 0.3542, "rewards/chosen": 0.0012807377049180327, "rewards/margins": 2.3165792451676044, "rewards/rejected": -2.3152985074626864, "step": 364 }, { "epoch": 0.250257113472746, "grad_norm": 0.6952896033932157, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95281924.83018868, "logits/rejected": -106032005.12, "logps/chosen": -309.4339622641509, "logps/rejected": -363.94666666666666, "loss": 0.3043, "rewards/chosen": 2.6144627984964623, "rewards/margins": 5.057796131829796, "rewards/rejected": -2.4433333333333334, "step": 365 }, { "epoch": 0.25094274940006855, "grad_norm": 0.4805130148152874, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83307555.31034483, "logits/rejected": -85983232.0, "logps/chosen": -256.55172413793105, "logps/rejected": -293.9428571428571, "loss": 0.3498, "rewards/chosen": 0.02646821120689655, "rewards/margins": 1.9675396397783251, "rewards/rejected": -1.9410714285714286, "step": 366 }, { "epoch": 0.2516283853273912, "grad_norm": 0.5110988032147763, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109368455.24528302, "logits/rejected": -108324891.30666667, "logps/chosen": -311.24528301886795, "logps/rejected": -391.25333333333333, "loss": 0.3, "rewards/chosen": 0.37942216981132076, "rewards/margins": 2.9010888364779874, "rewards/rejected": -2.5216666666666665, "step": 367 }, { "epoch": 0.25231402125471375, "grad_norm": 0.5075263959380133, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99386768.69565217, "logits/rejected": -101072496.3902439, "logps/chosen": -212.17391304347825, "logps/rejected": -337.5609756097561, "loss": 0.3073, "rewards/chosen": 4.802943022354789, "rewards/margins": 6.8730649735743015, "rewards/rejected": -2.0701219512195124, "step": 368 }, { "epoch": 0.2529996571820363, "grad_norm": 0.5177587762609839, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83611043.67213115, "logits/rejected": -116063277.85074627, "logps/chosen": -336.78688524590166, "logps/rejected": -390.6865671641791, "loss": 0.3168, "rewards/chosen": 0.2699795081967213, "rewards/margins": 2.6505765231220946, "rewards/rejected": -2.3805970149253732, "step": 369 }, { "epoch": 0.25368529310935894, "grad_norm": 0.4659920974674444, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92416867.79661018, "logits/rejected": -87411726.8405797, "logps/chosen": -305.35593220338984, "logps/rejected": -325.1014492753623, "loss": 0.323, "rewards/chosen": 0.3376588983050847, "rewards/margins": 2.440919767870302, "rewards/rejected": -2.1032608695652173, "step": 370 }, { "epoch": 0.2543709290366815, "grad_norm": 0.4535326362265297, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101432251.73333333, "logits/rejected": -85243060.70588236, "logps/chosen": -274.6666666666667, "logps/rejected": -380.70588235294116, "loss": 0.316, "rewards/chosen": 2.2926376342773436, "rewards/margins": 4.689696457806756, "rewards/rejected": -2.3970588235294117, "step": 371 }, { "epoch": 0.25505656496400414, "grad_norm": 0.4687956003326749, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79383371.29411764, "logits/rejected": -96608802.13333334, "logps/chosen": -214.11764705882354, "logps/rejected": -307.46666666666664, "loss": 0.3677, "rewards/chosen": 0.0759420955882353, "rewards/margins": 1.9342754289215687, "rewards/rejected": -1.8583333333333334, "step": 372 }, { "epoch": 0.2557422008913267, "grad_norm": 0.4976125804860617, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95490321.06666666, "logits/rejected": -88203745.88235295, "logps/chosen": -261.6, "logps/rejected": -294.8235294117647, "loss": 0.3587, "rewards/chosen": 1.5853556315104167, "rewards/margins": 2.4318781310436774, "rewards/rejected": -0.8465224995332605, "step": 373 }, { "epoch": 0.2564278368186493, "grad_norm": 0.443829922700277, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94584594.55072464, "logits/rejected": -88364743.59322034, "logps/chosen": -308.6376811594203, "logps/rejected": -253.83050847457628, "loss": 0.3499, "rewards/chosen": 0.3023097826086957, "rewards/margins": 2.126462324981577, "rewards/rejected": -1.8241525423728813, "step": 374 }, { "epoch": 0.2571134727459719, "grad_norm": 0.49039274614953676, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -119044216.47058824, "logits/rejected": -72351744.0, "logps/chosen": -369.4117647058824, "logps/rejected": -280.26666666666665, "loss": 0.3492, "rewards/chosen": 0.3374597886029412, "rewards/margins": 1.4405948339724073, "rewards/rejected": -1.1031350453694662, "step": 375 }, { "epoch": 0.2577991086732945, "grad_norm": 0.4838409230172499, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97367771.42857143, "logits/rejected": -74507150.22222222, "logps/chosen": -353.42857142857144, "logps/rejected": -275.1111111111111, "loss": 0.3244, "rewards/chosen": 2.6313958849225725, "rewards/margins": 4.467333384922572, "rewards/rejected": -1.8359375, "step": 376 }, { "epoch": 0.25848474460061704, "grad_norm": 0.480780804091721, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88149143.08196722, "logits/rejected": -98284437.01492538, "logps/chosen": -272.26229508196724, "logps/rejected": -289.1940298507463, "loss": 0.3469, "rewards/chosen": 1.8713881695856813, "rewards/margins": 3.751985184511055, "rewards/rejected": -1.8805970149253732, "step": 377 }, { "epoch": 0.25917038052793967, "grad_norm": 0.4823357543115453, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97775414.55737706, "logits/rejected": -85764126.56716418, "logps/chosen": -343.344262295082, "logps/rejected": -373.4925373134328, "loss": 0.3221, "rewards/chosen": 0.319672131147541, "rewards/margins": 2.5099706386102274, "rewards/rejected": -2.1902985074626864, "step": 378 }, { "epoch": 0.25985601645526224, "grad_norm": 0.45468183841173787, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91113188.43076923, "logits/rejected": -84218961.26984127, "logps/chosen": -259.9384615384615, "logps/rejected": -317.968253968254, "loss": 0.3339, "rewards/chosen": 0.12439903846153846, "rewards/margins": 1.409112783578726, "rewards/rejected": -1.2847137451171875, "step": 379 }, { "epoch": 0.26054165238258487, "grad_norm": 0.47252667297391665, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79272345.6, "logits/rejected": -90424259.76470588, "logps/chosen": -345.06666666666666, "logps/rejected": -335.52941176470586, "loss": 0.2871, "rewards/chosen": 0.6359375, "rewards/margins": 2.9263786764705886, "rewards/rejected": -2.2904411764705883, "step": 380 }, { "epoch": 0.26122728830990743, "grad_norm": 0.4624070526940044, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90836640.91428572, "logits/rejected": -73761897.93103448, "logps/chosen": -299.42857142857144, "logps/rejected": -251.0344827586207, "loss": 0.3512, "rewards/chosen": 0.2708705357142857, "rewards/margins": -2.3854478657539255, "rewards/rejected": 2.6563184014682113, "step": 381 }, { "epoch": 0.26191292423723, "grad_norm": 0.6492370736411205, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94405128.12698413, "logits/rejected": -80401581.29230769, "logps/chosen": -297.14285714285717, "logps/rejected": -348.8, "loss": 0.3524, "rewards/chosen": 1.821910676502046, "rewards/margins": 4.054602984194354, "rewards/rejected": -2.2326923076923078, "step": 382 }, { "epoch": 0.26259856016455263, "grad_norm": 0.4503641410581728, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84841744.20253165, "logits/rejected": -99892913.63265306, "logps/chosen": -224.20253164556962, "logps/rejected": -308.57142857142856, "loss": 0.3605, "rewards/chosen": 1.3728788351710839, "rewards/margins": 3.536144141293533, "rewards/rejected": -2.163265306122449, "step": 383 }, { "epoch": 0.2632841960918752, "grad_norm": 0.5061252004374518, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86314361.26315789, "logits/rejected": -107397812.28169014, "logps/chosen": -263.0175438596491, "logps/rejected": -334.4225352112676, "loss": 0.3293, "rewards/chosen": 2.390277260228207, "rewards/margins": 3.471694827521616, "rewards/rejected": -1.081417567293409, "step": 384 }, { "epoch": 0.2639698320191978, "grad_norm": 0.45721162785088537, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105456786.28571428, "logits/rejected": -90790549.66153847, "logps/chosen": -319.23809523809524, "logps/rejected": -320.0, "loss": 0.317, "rewards/chosen": 0.3231336805555556, "rewards/margins": 2.7269798344017095, "rewards/rejected": -2.4038461538461537, "step": 385 }, { "epoch": 0.2646554679465204, "grad_norm": 0.50815848592045, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -118522913.03225806, "logits/rejected": -70794767.51515152, "logps/chosen": -303.48387096774195, "logps/rejected": -270.1818181818182, "loss": 0.334, "rewards/chosen": 0.3503024193548387, "rewards/margins": 1.7016281769305963, "rewards/rejected": -1.3513257575757576, "step": 386 }, { "epoch": 0.26534110387384297, "grad_norm": 0.42746036659685577, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103502517.16923077, "logits/rejected": -85017876.31746031, "logps/chosen": -234.09230769230768, "logps/rejected": -337.77777777777777, "loss": 0.3596, "rewards/chosen": -0.017998798076923075, "rewards/margins": 2.208191678113553, "rewards/rejected": -2.2261904761904763, "step": 387 }, { "epoch": 0.2660267398011656, "grad_norm": 0.504038790982435, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98014261.89473684, "logits/rejected": -96227012.92307693, "logps/chosen": -234.73684210526315, "logps/rejected": -251.53846153846155, "loss": 0.3716, "rewards/chosen": 0.20750668174342105, "rewards/margins": -0.46350135494340294, "rewards/rejected": 0.671008036686824, "step": 388 }, { "epoch": 0.26671237572848816, "grad_norm": 0.5369302138176197, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87752704.0, "logits/rejected": -79298560.0, "logps/chosen": -279.5, "logps/rejected": -302.25, "loss": 0.3211, "rewards/chosen": 0.320556640625, "rewards/margins": 2.457275390625, "rewards/rejected": -2.13671875, "step": 389 }, { "epoch": 0.2673980116558108, "grad_norm": 0.48140229984629557, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85388517.25373134, "logits/rejected": -121566056.91803278, "logps/chosen": -232.59701492537314, "logps/rejected": -374.0327868852459, "loss": 0.3547, "rewards/chosen": 0.32509328358208955, "rewards/margins": 1.992101480303401, "rewards/rejected": -1.6670081967213115, "step": 390 }, { "epoch": 0.26808364758313336, "grad_norm": 0.489396653091148, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96468992.0, "logits/rejected": -101928244.82539682, "logps/chosen": -346.0923076923077, "logps/rejected": -306.7936507936508, "loss": 0.3461, "rewards/chosen": 0.45877403846153847, "rewards/margins": 2.109567689255189, "rewards/rejected": -1.6507936507936507, "step": 391 }, { "epoch": 0.2687692835104559, "grad_norm": 0.4561425630692359, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93189808.87272727, "logits/rejected": -87390909.36986302, "logps/chosen": -369.74545454545455, "logps/rejected": -309.041095890411, "loss": 0.3307, "rewards/chosen": 0.1849431818181818, "rewards/margins": 1.6603261565212, "rewards/rejected": -1.475382974703018, "step": 392 }, { "epoch": 0.26945491943777855, "grad_norm": 0.551042916795334, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97033609.84615384, "logits/rejected": -124173473.68421052, "logps/chosen": -289.2307692307692, "logps/rejected": -346.5263157894737, "loss": 0.3324, "rewards/chosen": 2.8238516587477465, "rewards/margins": 4.9488516587477465, "rewards/rejected": -2.125, "step": 393 }, { "epoch": 0.2701405553651011, "grad_norm": 0.45631991951861417, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -108918751.49206349, "logits/rejected": -97501436.06153846, "logps/chosen": -348.6984126984127, "logps/rejected": -289.7230769230769, "loss": 0.3432, "rewards/chosen": 0.45907738095238093, "rewards/margins": 2.311000457875458, "rewards/rejected": -1.851923076923077, "step": 394 }, { "epoch": 0.27082619129242375, "grad_norm": 0.7596718319968985, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95936381.96825397, "logits/rejected": -99888962.95384616, "logps/chosen": -291.55555555555554, "logps/rejected": -305.2307692307692, "loss": 0.3294, "rewards/chosen": 1.4853790525406125, "rewards/margins": 3.9661482833098436, "rewards/rejected": -2.480769230769231, "step": 395 }, { "epoch": 0.2715118272197463, "grad_norm": 0.5069578238729981, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86657316.57142857, "logits/rejected": -91051349.33333333, "logps/chosen": -278.0, "logps/rejected": -335.1111111111111, "loss": 0.3245, "rewards/chosen": 0.17853655133928573, "rewards/margins": 2.183744884672619, "rewards/rejected": -2.0052083333333335, "step": 396 }, { "epoch": 0.2721974631470689, "grad_norm": 0.47572599849981967, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100663296.0, "logits/rejected": -101044596.36363636, "logps/chosen": -347.8709677419355, "logps/rejected": -383.5151515151515, "loss": 0.3156, "rewards/chosen": 0.25579637096774194, "rewards/margins": 2.9489781891495603, "rewards/rejected": -2.6931818181818183, "step": 397 }, { "epoch": 0.2728830990743915, "grad_norm": 0.4452726989831957, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97455887.05882353, "logits/rejected": -93742694.4, "logps/chosen": -309.1764705882353, "logps/rejected": -331.73333333333335, "loss": 0.3539, "rewards/chosen": 0.19427849264705882, "rewards/margins": 2.040111825980392, "rewards/rejected": -1.8458333333333334, "step": 398 }, { "epoch": 0.2735687350017141, "grad_norm": 0.4976495537582343, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97702610.8235294, "logits/rejected": -77245098.66666667, "logps/chosen": -316.2352941176471, "logps/rejected": -311.46666666666664, "loss": 0.35, "rewards/chosen": 0.3098000919117647, "rewards/margins": 2.5827167585784316, "rewards/rejected": -2.2729166666666667, "step": 399 }, { "epoch": 0.2742543709290367, "grad_norm": 0.5036798791631636, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95706391.27272727, "logits/rejected": -93883462.1369863, "logps/chosen": -345.0181818181818, "logps/rejected": -334.027397260274, "loss": 0.2816, "rewards/chosen": 2.2874368840997867, "rewards/margins": 4.736067021086088, "rewards/rejected": -2.4486301369863015, "step": 400 }, { "epoch": 0.2749400068563593, "grad_norm": 0.5652727958073567, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85721088.0, "logits/rejected": -78118912.0, "logps/chosen": -204.5, "logps/rejected": -354.0, "loss": 0.3364, "rewards/chosen": 1.7262616157531738, "rewards/margins": 3.972355365753174, "rewards/rejected": -2.24609375, "step": 401 }, { "epoch": 0.27562564278368185, "grad_norm": 0.41102218976249366, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105250816.0, "logits/rejected": -94109696.0, "logps/chosen": -237.5, "logps/rejected": -379.5, "loss": 0.2989, "rewards/chosen": 0.384033203125, "rewards/margins": 2.3598610162734985, "rewards/rejected": -1.9758278131484985, "step": 402 }, { "epoch": 0.27631127871100447, "grad_norm": 0.39294633442278964, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77403973.81818181, "logits/rejected": -78947625.29032259, "logps/chosen": -224.4848484848485, "logps/rejected": -274.3225806451613, "loss": 0.3266, "rewards/chosen": 0.3593454071969697, "rewards/margins": 2.6557163749389052, "rewards/rejected": -2.2963709677419355, "step": 403 }, { "epoch": 0.27699691463832704, "grad_norm": 0.4414290763277681, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81229687.46666667, "logits/rejected": -91041069.1764706, "logps/chosen": -244.8, "logps/rejected": -329.88235294117646, "loss": 0.3143, "rewards/chosen": 1.668471908569336, "rewards/margins": 3.923986614451689, "rewards/rejected": -2.255514705882353, "step": 404 }, { "epoch": 0.27768255056564967, "grad_norm": 0.42561955302717935, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86456782.4516129, "logits/rejected": -70667667.39393939, "logps/chosen": -320.258064516129, "logps/rejected": -267.1515151515151, "loss": 0.3307, "rewards/chosen": 0.34576612903225806, "rewards/margins": 2.466978250244379, "rewards/rejected": -2.121212121212121, "step": 405 }, { "epoch": 0.27836818649297224, "grad_norm": 0.45041637030385057, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92495440.84210527, "logits/rejected": -106452617.0140845, "logps/chosen": -287.57894736842104, "logps/rejected": -341.40845070422534, "loss": 0.3254, "rewards/chosen": 0.3072916666666667, "rewards/margins": 2.3988409624413145, "rewards/rejected": -2.091549295774648, "step": 406 }, { "epoch": 0.2790538224202948, "grad_norm": 0.4563199488735066, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102072857.18032786, "logits/rejected": -67734879.52238806, "logps/chosen": -268.1967213114754, "logps/rejected": -260.2985074626866, "loss": 0.341, "rewards/chosen": 2.2393368580302253, "rewards/margins": 3.935232380418285, "rewards/rejected": -1.6958955223880596, "step": 407 }, { "epoch": 0.27973945834761743, "grad_norm": 0.4720058386057998, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107720378.92063493, "logits/rejected": -97049741.78461538, "logps/chosen": -240.25396825396825, "logps/rejected": -339.6923076923077, "loss": 0.3429, "rewards/chosen": 0.1617373511904762, "rewards/margins": 2.271352735805861, "rewards/rejected": -2.1096153846153847, "step": 408 }, { "epoch": 0.28042509427494, "grad_norm": 0.5250573642265153, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87270956.91228071, "logits/rejected": -87725935.77464789, "logps/chosen": -271.1578947368421, "logps/rejected": -308.28169014084506, "loss": 0.337, "rewards/chosen": 0.29272889254385964, "rewards/margins": 2.194137343248085, "rewards/rejected": -1.9014084507042253, "step": 409 }, { "epoch": 0.28111073020226257, "grad_norm": 0.4251696954447427, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95115990.70967741, "logits/rejected": -75497472.0, "logps/chosen": -294.19354838709677, "logps/rejected": -255.5151515151515, "loss": 0.3235, "rewards/chosen": 2.0418074823194936, "rewards/margins": 4.155443845955857, "rewards/rejected": -2.1136363636363638, "step": 410 }, { "epoch": 0.2817963661295852, "grad_norm": 0.46037636577574137, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -106083627.32307692, "logits/rejected": -104857600.0, "logps/chosen": -305.7230769230769, "logps/rejected": -302.4761904761905, "loss": 0.3487, "rewards/chosen": 0.15895432692307693, "rewards/margins": 2.3137162316849818, "rewards/rejected": -2.1547619047619047, "step": 411 }, { "epoch": 0.28248200205690777, "grad_norm": 0.45060171332021015, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96043482.89855072, "logits/rejected": -72653876.0677966, "logps/chosen": -291.2463768115942, "logps/rejected": -293.4237288135593, "loss": 0.3539, "rewards/chosen": 0.26721014492753625, "rewards/margins": 2.087125399164824, "rewards/rejected": -1.819915254237288, "step": 412 }, { "epoch": 0.2831676379842304, "grad_norm": 0.4622028842953323, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109126802.28571428, "logits/rejected": -102410922.66666667, "logps/chosen": -368.85714285714283, "logps/rejected": -334.22222222222223, "loss": 0.3069, "rewards/chosen": 2.5755577087402344, "rewards/margins": 4.846391042073568, "rewards/rejected": -2.2708333333333335, "step": 413 }, { "epoch": 0.28385327391155296, "grad_norm": 0.45400413061274864, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81159782.4, "logits/rejected": -97332525.1764706, "logps/chosen": -293.6, "logps/rejected": -319.7647058823529, "loss": 0.3174, "rewards/chosen": 2.3680577596028645, "rewards/margins": 4.9029842301911, "rewards/rejected": -2.5349264705882355, "step": 414 }, { "epoch": 0.28453890983887553, "grad_norm": 0.43442358147412286, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91475772.95238096, "logits/rejected": -102147434.33846153, "logps/chosen": -307.55555555555554, "logps/rejected": -374.15384615384613, "loss": 0.3013, "rewards/chosen": 0.26351686507936506, "rewards/margins": 1.793321270878644, "rewards/rejected": -1.5298044057992788, "step": 415 }, { "epoch": 0.28522454576619816, "grad_norm": 0.46511744374057223, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93323264.0, "logits/rejected": -78942793.14285715, "logps/chosen": -281.3333333333333, "logps/rejected": -316.0, "loss": 0.3328, "rewards/chosen": 0.3537326388888889, "rewards/margins": 2.766679067460317, "rewards/rejected": -2.4129464285714284, "step": 416 }, { "epoch": 0.2859101816935207, "grad_norm": 0.5114547698443415, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98855406.34482759, "logits/rejected": -88919244.8, "logps/chosen": -356.41379310344826, "logps/rejected": -292.57142857142856, "loss": 0.2915, "rewards/chosen": 0.4501616379310345, "rewards/margins": 2.775161637931035, "rewards/rejected": -2.325, "step": 417 }, { "epoch": 0.28659581762084335, "grad_norm": 0.4685488089413992, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98353389.44927536, "logits/rejected": -84810248.6779661, "logps/chosen": -270.1449275362319, "logps/rejected": -334.10169491525426, "loss": 0.3209, "rewards/chosen": 0.31796308876811596, "rewards/margins": 2.9472003769037096, "rewards/rejected": -2.6292372881355934, "step": 418 }, { "epoch": 0.2872814535481659, "grad_norm": 0.5632401171119192, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85167672.8888889, "logits/rejected": -113695597.71428572, "logps/chosen": -292.8888888888889, "logps/rejected": -279.85714285714283, "loss": 0.3607, "rewards/chosen": 0.058024088541666664, "rewards/margins": 2.429675874255952, "rewards/rejected": -2.3716517857142856, "step": 419 }, { "epoch": 0.2879670894754885, "grad_norm": 0.47078693875475547, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93606213.07936507, "logits/rejected": -79240081.72307692, "logps/chosen": -313.6507936507937, "logps/rejected": -264.12307692307695, "loss": 0.3371, "rewards/chosen": 2.141203865172371, "rewards/margins": 4.118126942095448, "rewards/rejected": -1.976923076923077, "step": 420 }, { "epoch": 0.2886527254028111, "grad_norm": 0.6603891168295627, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101711872.0, "logits/rejected": -65910491.428571425, "logps/chosen": -325.3333333333333, "logps/rejected": -290.85714285714283, "loss": 0.3497, "rewards/chosen": 0.1443142361111111, "rewards/margins": 2.3295820932539684, "rewards/rejected": -2.185267857142857, "step": 421 }, { "epoch": 0.2893383613301337, "grad_norm": 0.4460895048050362, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101250498.56, "logits/rejected": -87650198.97435898, "logps/chosen": -347.84, "logps/rejected": -303.5897435897436, "loss": 0.3089, "rewards/chosen": 0.326875, "rewards/margins": 2.5768750000000002, "rewards/rejected": -2.25, "step": 422 }, { "epoch": 0.2900239972574563, "grad_norm": 0.46835868273091663, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96656796.65671642, "logits/rejected": -92756001.5737705, "logps/chosen": -320.4776119402985, "logps/rejected": -325.7704918032787, "loss": 0.3527, "rewards/chosen": 1.9178585223297575, "rewards/margins": 3.991629014133036, "rewards/rejected": -2.0737704918032787, "step": 423 }, { "epoch": 0.2907096331847789, "grad_norm": 0.5262594902144447, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94992124.39436619, "logits/rejected": -90508665.26315789, "logps/chosen": -360.11267605633805, "logps/rejected": -248.140350877193, "loss": 0.364, "rewards/chosen": 0.2953345070422535, "rewards/margins": 2.2152906473931306, "rewards/rejected": -1.9199561403508771, "step": 424 }, { "epoch": 0.29139526911210145, "grad_norm": 0.45142918279779787, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86660773.41538462, "logits/rejected": -95936381.96825397, "logps/chosen": -200.98461538461538, "logps/rejected": -291.55555555555554, "loss": 0.3291, "rewards/chosen": 0.25987830528846156, "rewards/margins": 2.3729735433837, "rewards/rejected": -2.113095238095238, "step": 425 }, { "epoch": 0.2920809050394241, "grad_norm": 0.4541308835347384, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103040068.26666667, "logits/rejected": -77101176.47058824, "logps/chosen": -261.6, "logps/rejected": -313.88235294117646, "loss": 0.3522, "rewards/chosen": 1.8614051818847657, "rewards/margins": 4.074640476002413, "rewards/rejected": -2.213235294117647, "step": 426 }, { "epoch": 0.29276654096674665, "grad_norm": 0.4622045730109293, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -69028291.2542373, "logits/rejected": -110753940.4057971, "logps/chosen": -164.47457627118644, "logps/rejected": -295.42028985507244, "loss": 0.3521, "rewards/chosen": 0.07279528601694915, "rewards/margins": 1.5927228222488332, "rewards/rejected": -1.519927536231884, "step": 427 }, { "epoch": 0.29345217689406927, "grad_norm": 0.4335018991771979, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88546417.77777778, "logits/rejected": -83319282.16216215, "logps/chosen": -236.2962962962963, "logps/rejected": -307.8918918918919, "loss": 0.3076, "rewards/chosen": 2.226021095558449, "rewards/margins": 4.530075149612503, "rewards/rejected": -2.304054054054054, "step": 428 }, { "epoch": 0.29413781282139184, "grad_norm": 0.4530701194351764, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92701227.38983051, "logits/rejected": -82731126.72463769, "logps/chosen": -259.1186440677966, "logps/rejected": -302.1449275362319, "loss": 0.3196, "rewards/chosen": 0.2272080243644068, "rewards/margins": 2.5224978794368704, "rewards/rejected": -2.295289855072464, "step": 429 }, { "epoch": 0.2948234487487144, "grad_norm": 0.4901850963219579, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99876864.0, "logits/rejected": -122814464.0, "logps/chosen": -336.25, "logps/rejected": -396.5, "loss": 0.3, "rewards/chosen": 0.450836181640625, "rewards/margins": 2.929351806640625, "rewards/rejected": -2.478515625, "step": 430 }, { "epoch": 0.29550908467603704, "grad_norm": 0.4919947818958762, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80785942.26086956, "logits/rejected": -83530630.50847457, "logps/chosen": -229.79710144927537, "logps/rejected": -314.8474576271187, "loss": 0.3489, "rewards/chosen": 0.16802536231884058, "rewards/margins": 2.1764999385900268, "rewards/rejected": -2.0084745762711864, "step": 431 }, { "epoch": 0.2961947206033596, "grad_norm": 0.48818203931984044, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98333127.1111111, "logits/rejected": -107479040.0, "logps/chosen": -288.8888888888889, "logps/rejected": -325.42857142857144, "loss": 0.356, "rewards/chosen": 0.14171006944444445, "rewards/margins": 2.152870783730159, "rewards/rejected": -2.0111607142857144, "step": 432 }, { "epoch": 0.29688035653068223, "grad_norm": 0.48583823444319596, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96646216.11267605, "logits/rejected": -94997306.38596492, "logps/chosen": -238.64788732394365, "logps/rejected": -354.2456140350877, "loss": 0.3381, "rewards/chosen": 1.848141952299736, "rewards/margins": 4.328405110194472, "rewards/rejected": -2.4802631578947367, "step": 433 }, { "epoch": 0.2975659924580048, "grad_norm": 0.5070574863992884, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87381333.33333333, "logits/rejected": -108145027.45945945, "logps/chosen": -315.85185185185185, "logps/rejected": -326.4864864864865, "loss": 0.3254, "rewards/chosen": 1.954734378390842, "rewards/margins": 4.292572216228679, "rewards/rejected": -2.3378378378378377, "step": 434 }, { "epoch": 0.29825162838532737, "grad_norm": 0.6160716961855789, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84964615.31428571, "logits/rejected": -106593174.06896552, "logps/chosen": -252.22857142857143, "logps/rejected": -404.41379310344826, "loss": 0.3523, "rewards/chosen": 0.2722098214285714, "rewards/margins": 2.532985683497537, "rewards/rejected": -2.2607758620689653, "step": 435 }, { "epoch": 0.29893726431265, "grad_norm": 0.44667530400149497, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -117059211.63636364, "logits/rejected": -66094113.03225806, "logps/chosen": -265.45454545454544, "logps/rejected": -241.5483870967742, "loss": 0.357, "rewards/chosen": 0.13896780303030304, "rewards/margins": 1.6873548998044965, "rewards/rejected": -1.5483870967741935, "step": 436 }, { "epoch": 0.29962290023997257, "grad_norm": 0.512495944940066, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -111731598.22222222, "logits/rejected": -89578349.71428572, "logps/chosen": -314.22222222222223, "logps/rejected": -273.0, "loss": 0.3534, "rewards/chosen": 0.2686631944444444, "rewards/margins": 2.404823908730159, "rewards/rejected": -2.1361607142857144, "step": 437 }, { "epoch": 0.3003085361672952, "grad_norm": 0.5390863364768615, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86184328.76712328, "logits/rejected": -81522017.74545455, "logps/chosen": -296.7671232876712, "logps/rejected": -365.09090909090907, "loss": 0.3506, "rewards/chosen": 0.0918236301369863, "rewards/margins": 2.498641811955168, "rewards/rejected": -2.4068181818181817, "step": 438 }, { "epoch": 0.30099417209461776, "grad_norm": 0.4508547473290508, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99614720.0, "logits/rejected": -78905344.0, "logps/chosen": -393.75, "logps/rejected": -328.5, "loss": 0.3214, "rewards/chosen": 3.9247705936431885, "rewards/margins": 5.486534118652344, "rewards/rejected": -1.5617635250091553, "step": 439 }, { "epoch": 0.30167980802194033, "grad_norm": 0.459715740096506, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89190640.94117647, "logits/rejected": -79482060.8, "logps/chosen": -287.29411764705884, "logps/rejected": -320.26666666666665, "loss": 0.3225, "rewards/chosen": 0.2903262867647059, "rewards/margins": 2.348659620098039, "rewards/rejected": -2.058333333333333, "step": 440 }, { "epoch": 0.30236544394926296, "grad_norm": 0.5027892337825036, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86682282.66666667, "logits/rejected": -106535321.6, "logps/chosen": -311.1111111111111, "logps/rejected": -405.4153846153846, "loss": 0.3196, "rewards/chosen": 0.23685515873015872, "rewards/margins": 1.87339362026862, "rewards/rejected": -1.6365384615384615, "step": 441 }, { "epoch": 0.3030510798765855, "grad_norm": 0.41045371529185937, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82235862.03278689, "logits/rejected": -99411264.95522387, "logps/chosen": -402.0983606557377, "logps/rejected": -337.910447761194, "loss": 0.2819, "rewards/chosen": 2.428273935787013, "rewards/margins": 4.8685724432496995, "rewards/rejected": -2.4402985074626864, "step": 442 }, { "epoch": 0.3037367158039081, "grad_norm": 0.45294372126378407, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103631572.67692308, "logits/rejected": -129823695.23809524, "logps/chosen": -324.4307692307692, "logps/rejected": -291.04761904761904, "loss": 0.3241, "rewards/chosen": 0.39471153846153845, "rewards/margins": 2.553441697191697, "rewards/rejected": -2.1587301587301586, "step": 443 }, { "epoch": 0.3044223517312307, "grad_norm": 0.46826068913960306, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95297054.11764705, "logits/rejected": -72701269.33333333, "logps/chosen": -239.05882352941177, "logps/rejected": -243.46666666666667, "loss": 0.3506, "rewards/chosen": 0.28699448529411764, "rewards/margins": 1.2049606921626073, "rewards/rejected": -0.9179662068684896, "step": 444 }, { "epoch": 0.3051079876585533, "grad_norm": 0.43306415101527174, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103910499.09677419, "logits/rejected": -88715884.60606061, "logps/chosen": -300.9032258064516, "logps/rejected": -326.3030303030303, "loss": 0.3407, "rewards/chosen": 0.1493195564516129, "rewards/margins": 2.25159228372434, "rewards/rejected": -2.102272727272727, "step": 445 }, { "epoch": 0.3057936235858759, "grad_norm": 0.4757800471983844, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86184328.76712328, "logits/rejected": -87851603.78181818, "logps/chosen": -302.90410958904107, "logps/rejected": -331.3454545454546, "loss": 0.3509, "rewards/chosen": 0.2737585616438356, "rewards/margins": 2.101031288916563, "rewards/rejected": -1.8272727272727274, "step": 446 }, { "epoch": 0.3064792595131985, "grad_norm": 0.502210514597286, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88207484.12121212, "logits/rejected": -78879975.22580644, "logps/chosen": -289.57575757575756, "logps/rejected": -290.06451612903226, "loss": 0.331, "rewards/chosen": 2.706614870013613, "rewards/margins": 4.62596970872329, "rewards/rejected": -1.9193548387096775, "step": 447 }, { "epoch": 0.30716489544052106, "grad_norm": 0.4926891604482103, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103581072.69565217, "logits/rejected": -95971362.71186441, "logps/chosen": -304.69565217391306, "logps/rejected": -343.864406779661, "loss": 0.3303, "rewards/chosen": 0.2377434329710145, "rewards/margins": 2.8521502126320315, "rewards/rejected": -2.614406779661017, "step": 448 }, { "epoch": 0.3078505313678437, "grad_norm": 0.43495495324420985, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84585130.66666667, "logits/rejected": -81564233.14285715, "logps/chosen": -271.55555555555554, "logps/rejected": -310.57142857142856, "loss": 0.346, "rewards/chosen": 1.4609293407864041, "rewards/margins": 3.7689650550721185, "rewards/rejected": -2.3080357142857144, "step": 449 }, { "epoch": 0.30853616729516625, "grad_norm": 0.4709696055500037, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -117669292.21818182, "logits/rejected": -94343111.89041096, "logps/chosen": -318.8363636363636, "logps/rejected": -371.7260273972603, "loss": 0.3118, "rewards/chosen": 0.29467329545454546, "rewards/margins": 3.032686994084682, "rewards/rejected": -2.738013698630137, "step": 450 }, { "epoch": 0.3092218032224889, "grad_norm": 0.4519171864768458, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94671433.14285715, "logits/rejected": -94716940.96202531, "logps/chosen": -276.57142857142856, "logps/rejected": -332.55696202531647, "loss": 0.3074, "rewards/chosen": 4.238124925263074, "rewards/margins": 6.698567963237758, "rewards/rejected": -2.4604430379746836, "step": 451 }, { "epoch": 0.30990743914981145, "grad_norm": 0.46190782837884475, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79253565.13432837, "logits/rejected": -76528858.22950819, "logps/chosen": -257.67164179104475, "logps/rejected": -300.59016393442624, "loss": 0.3168, "rewards/chosen": 0.28428171641791045, "rewards/margins": 1.650051900043679, "rewards/rejected": -1.3657701836257685, "step": 452 }, { "epoch": 0.310593075077134, "grad_norm": 0.4130362261689787, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81089877.33333333, "logits/rejected": -91904602.35294117, "logps/chosen": -281.06666666666666, "logps/rejected": -302.11764705882354, "loss": 0.309, "rewards/chosen": 0.19427083333333334, "rewards/margins": 2.7200061274509806, "rewards/rejected": -2.525735294117647, "step": 453 }, { "epoch": 0.31127871100445664, "grad_norm": 0.4650862846050903, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81966652.7457627, "logits/rejected": -81880108.52173913, "logps/chosen": -272.271186440678, "logps/rejected": -352.92753623188406, "loss": 0.3119, "rewards/chosen": 2.0927734375, "rewards/margins": 4.629005321557971, "rewards/rejected": -2.536231884057971, "step": 454 }, { "epoch": 0.3119643469317792, "grad_norm": 0.4509911121561556, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79921600.87671232, "logits/rejected": -97155332.65454546, "logps/chosen": -253.15068493150685, "logps/rejected": -317.3818181818182, "loss": 0.3347, "rewards/chosen": 0.2803938356164384, "rewards/margins": 2.8167574719800745, "rewards/rejected": -2.536363636363636, "step": 455 }, { "epoch": 0.31264998285910184, "grad_norm": 0.5044356885346046, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97770672.55172414, "logits/rejected": -67708050.28571428, "logps/chosen": -318.0689655172414, "logps/rejected": -308.8, "loss": 0.2951, "rewards/chosen": 2.681004754428206, "rewards/margins": 5.102433325856778, "rewards/rejected": -2.4214285714285713, "step": 456 }, { "epoch": 0.3133356187864244, "grad_norm": 0.5312250404299266, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -116020901.41538462, "logits/rejected": -72900998.09523809, "logps/chosen": -294.15384615384613, "logps/rejected": -303.23809523809524, "loss": 0.3499, "rewards/chosen": 0.24131610576923077, "rewards/margins": 2.1738557883089134, "rewards/rejected": -1.9325396825396826, "step": 457 }, { "epoch": 0.314021254713747, "grad_norm": 0.6593253692515451, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109611144.53333333, "logits/rejected": -97887653.64705883, "logps/chosen": -308.8, "logps/rejected": -307.52941176470586, "loss": 0.3008, "rewards/chosen": 0.48020833333333335, "rewards/margins": 2.8570465686274513, "rewards/rejected": -2.3768382352941178, "step": 458 }, { "epoch": 0.3147068906410696, "grad_norm": 0.4957988511637606, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79691776.0, "logits/rejected": -105306989.71428572, "logps/chosen": -261.3333333333333, "logps/rejected": -355.7142857142857, "loss": 0.3004, "rewards/chosen": 0.5014105902777778, "rewards/margins": 2.387571304563492, "rewards/rejected": -1.8861607142857142, "step": 459 }, { "epoch": 0.3153925265683922, "grad_norm": 0.9115042020617561, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88848355.15492958, "logits/rejected": -81310630.1754386, "logps/chosen": -210.25352112676057, "logps/rejected": -443.50877192982455, "loss": 0.3422, "rewards/chosen": 0.07298085387323944, "rewards/margins": 2.0283009779138097, "rewards/rejected": -1.9553201240405702, "step": 460 }, { "epoch": 0.3160781624957148, "grad_norm": 0.5050567513222044, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81026327.27272727, "logits/rejected": -114193308.90322581, "logps/chosen": -258.1818181818182, "logps/rejected": -381.93548387096774, "loss": 0.3204, "rewards/chosen": 0.16105883049242425, "rewards/margins": 3.0340427014601663, "rewards/rejected": -2.872983870967742, "step": 461 }, { "epoch": 0.31676379842303737, "grad_norm": 0.4791654481141649, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92973738.66666667, "logits/rejected": -76731090.8235294, "logps/chosen": -254.66666666666666, "logps/rejected": -290.11764705882354, "loss": 0.3229, "rewards/chosen": 0.10455729166666666, "rewards/margins": 2.4666896446078432, "rewards/rejected": -2.3621323529411766, "step": 462 }, { "epoch": 0.31744943435035994, "grad_norm": 0.44691268764638353, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101012821.33333333, "logits/rejected": -84687932.23529412, "logps/chosen": -276.26666666666665, "logps/rejected": -272.0, "loss": 0.3287, "rewards/chosen": 0.3790364583333333, "rewards/margins": 2.568374693627451, "rewards/rejected": -2.1893382352941178, "step": 463 }, { "epoch": 0.31813507027768256, "grad_norm": 0.4091559986469635, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92532799.01538461, "logits/rejected": -89611637.84126984, "logps/chosen": -325.4153846153846, "logps/rejected": -298.92063492063494, "loss": 0.3239, "rewards/chosen": 0.49182692307692305, "rewards/margins": 2.7715888278388277, "rewards/rejected": -2.2797619047619047, "step": 464 }, { "epoch": 0.31882070620500513, "grad_norm": 0.4568444132390184, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86682282.66666667, "logits/rejected": -79353525.67741935, "logps/chosen": -241.21212121212122, "logps/rejected": -308.9032258064516, "loss": 0.322, "rewards/chosen": 0.2947443181818182, "rewards/margins": 2.5407120601173023, "rewards/rejected": -2.245967741935484, "step": 465 }, { "epoch": 0.31950634213232776, "grad_norm": 0.5579032779647763, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79465056.86486487, "logits/rejected": -137013930.66666666, "logps/chosen": -251.8918918918919, "logps/rejected": -347.25925925925924, "loss": 0.3432, "rewards/chosen": 0.16047297297297297, "rewards/margins": 1.8957134853970181, "rewards/rejected": -1.735240512424045, "step": 466 }, { "epoch": 0.32019197805965033, "grad_norm": 0.49104737014841365, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85721088.0, "logits/rejected": -89653248.0, "logps/chosen": -272.25, "logps/rejected": -388.0, "loss": 0.3159, "rewards/chosen": 0.21728515625, "rewards/margins": 2.92431640625, "rewards/rejected": -2.70703125, "step": 467 }, { "epoch": 0.3208776139869729, "grad_norm": 0.48409287082365865, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82652461.1764706, "logits/rejected": -88429909.33333333, "logps/chosen": -304.94117647058823, "logps/rejected": -284.26666666666665, "loss": 0.3371, "rewards/chosen": 0.21518841911764705, "rewards/margins": 2.4089384191176473, "rewards/rejected": -2.19375, "step": 468 }, { "epoch": 0.3215632499142955, "grad_norm": 0.47462307558327627, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80802032.94117647, "logits/rejected": -118698803.2, "logps/chosen": -220.94117647058823, "logps/rejected": -304.8, "loss": 0.3292, "rewards/chosen": 0.3547794117647059, "rewards/margins": 2.4568627450980394, "rewards/rejected": -2.1020833333333333, "step": 469 }, { "epoch": 0.3222488858416181, "grad_norm": 0.5048661137218255, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81247727.48387097, "logits/rejected": -102061397.33333333, "logps/chosen": -248.51612903225808, "logps/rejected": -384.0, "loss": 0.3079, "rewards/chosen": 0.21282762096774194, "rewards/margins": 2.8113124694525906, "rewards/rejected": -2.5984848484848486, "step": 470 }, { "epoch": 0.3229345217689407, "grad_norm": 0.4412490576088495, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86755866.94736843, "logits/rejected": -82350137.69014084, "logps/chosen": -327.57894736842104, "logps/rejected": -332.16901408450707, "loss": 0.2949, "rewards/chosen": 2.689641316731771, "rewards/margins": 5.1192187815205035, "rewards/rejected": -2.4295774647887325, "step": 471 }, { "epoch": 0.3236201576962633, "grad_norm": 0.4603615998874001, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99716195.09677419, "logits/rejected": -85347731.39393939, "logps/chosen": -269.93548387096774, "logps/rejected": -237.57575757575756, "loss": 0.3253, "rewards/chosen": 0.35227129536290325, "rewards/margins": 2.39393796202957, "rewards/rejected": -2.0416666666666665, "step": 472 }, { "epoch": 0.32430579362358586, "grad_norm": 0.4269872853989022, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83646405.48571429, "logits/rejected": -83235239.72413793, "logps/chosen": -276.57142857142856, "logps/rejected": -311.7241379310345, "loss": 0.3242, "rewards/chosen": 0.3912946428571429, "rewards/margins": 2.7361222290640392, "rewards/rejected": -2.3448275862068964, "step": 473 }, { "epoch": 0.3249914295509085, "grad_norm": 0.4936113238954753, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98689505.88235295, "logits/rejected": -86718596.98701298, "logps/chosen": -320.0, "logps/rejected": -254.75324675324674, "loss": 0.2961, "rewards/chosen": 0.6096813725490197, "rewards/margins": 2.7785125413801888, "rewards/rejected": -2.168831168831169, "step": 474 }, { "epoch": 0.32567706547823105, "grad_norm": 0.4542495015587104, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87016157.6119403, "logits/rejected": -91312060.85245901, "logps/chosen": -251.9402985074627, "logps/rejected": -280.1311475409836, "loss": 0.3453, "rewards/chosen": 0.2749533582089552, "rewards/margins": 1.4537393389338527, "rewards/rejected": -1.1787859807248975, "step": 475 }, { "epoch": 0.3263627014055536, "grad_norm": 0.5901793896019488, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -108396544.0, "logits/rejected": -95682560.0, "logps/chosen": -281.5, "logps/rejected": -394.5, "loss": 0.329, "rewards/chosen": 2.076014757156372, "rewards/margins": 4.831874132156372, "rewards/rejected": -2.755859375, "step": 476 }, { "epoch": 0.32704833733287625, "grad_norm": 0.5243626343613408, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -108535681.96923077, "logits/rejected": -82421402.41269842, "logps/chosen": -268.0615384615385, "logps/rejected": -302.984126984127, "loss": 0.3558, "rewards/chosen": 1.7660860501802884, "rewards/margins": 4.121244780339019, "rewards/rejected": -2.3551587301587302, "step": 477 }, { "epoch": 0.3277339732601988, "grad_norm": 0.48409719974845944, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75147946.66666667, "logits/rejected": -91657878.58823529, "logps/chosen": -228.26666666666668, "logps/rejected": -305.88235294117646, "loss": 0.3128, "rewards/chosen": 0.4109375, "rewards/margins": 2740989.3521139706, "rewards/rejected": -2740988.9411764704, "step": 478 }, { "epoch": 0.32841960918752144, "grad_norm": 0.491246229419645, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95630131.2, "logits/rejected": -88746146.53968254, "logps/chosen": -274.7076923076923, "logps/rejected": -385.26984126984127, "loss": 0.3202, "rewards/chosen": 2.178686758188101, "rewards/margins": 4.835432789934133, "rewards/rejected": -2.6567460317460316, "step": 479 }, { "epoch": 0.329105245114844, "grad_norm": 0.4095107860494895, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74703955.02702703, "logits/rejected": -80235482.07407407, "logps/chosen": -153.83783783783784, "logps/rejected": -331.25925925925924, "loss": 0.331, "rewards/chosen": 0.26290646114864863, "rewards/margins": 2.823091646333834, "rewards/rejected": -2.560185185185185, "step": 480 }, { "epoch": 0.3297908810421666, "grad_norm": 0.4118494675590665, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95769941.33333333, "logits/rejected": -91287792.94117647, "logps/chosen": -274.53333333333336, "logps/rejected": -330.5882352941176, "loss": 0.2828, "rewards/chosen": 1.6955660502115886, "rewards/margins": 4.210271932564529, "rewards/rejected": -2.514705882352941, "step": 481 }, { "epoch": 0.3304765169694892, "grad_norm": 0.4923883951245572, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86854356.67692308, "logits/rejected": -104657871.23809524, "logps/chosen": -344.8615384615385, "logps/rejected": -288.5079365079365, "loss": 0.3117, "rewards/chosen": 0.6615384615384615, "rewards/margins": 2.791498778998779, "rewards/rejected": -2.1299603174603177, "step": 482 }, { "epoch": 0.3311621528968118, "grad_norm": 0.5606985213139223, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -123498951.1111111, "logits/rejected": -83211995.42857143, "logps/chosen": -328.22222222222223, "logps/rejected": -307.85714285714283, "loss": 0.3534, "rewards/chosen": 0.3605143229166667, "rewards/margins": 2.755603608630952, "rewards/rejected": -2.3950892857142856, "step": 483 }, { "epoch": 0.3318477888241344, "grad_norm": 0.45270878684379673, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91208339.52542374, "logits/rejected": -103094776.57971014, "logps/chosen": -257.6271186440678, "logps/rejected": -332.28985507246375, "loss": 0.3137, "rewards/chosen": 0.24854343220338984, "rewards/margins": 2.6362245916236797, "rewards/rejected": -2.38768115942029, "step": 484 }, { "epoch": 0.332533424751457, "grad_norm": 0.5233605400611124, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100534240.4923077, "logits/rejected": -96801873.26984127, "logps/chosen": -291.9384615384615, "logps/rejected": -339.55555555555554, "loss": 0.3522, "rewards/chosen": 0.09059495192307693, "rewards/margins": 2.0390076503357752, "rewards/rejected": -1.9484126984126984, "step": 485 }, { "epoch": 0.33321906067877954, "grad_norm": 0.4824166758967104, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92843407.18644068, "logits/rejected": -88384319.07246377, "logps/chosen": -324.8813559322034, "logps/rejected": -290.0869565217391, "loss": 0.2985, "rewards/chosen": 0.4565677966101695, "rewards/margins": 2.8188866371898795, "rewards/rejected": -2.36231884057971, "step": 486 }, { "epoch": 0.33390469660610217, "grad_norm": 0.4422880088179128, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97085801.41176471, "logits/rejected": -94511650.13333334, "logps/chosen": -310.5882352941176, "logps/rejected": -306.93333333333334, "loss": 0.3308, "rewards/chosen": 0.19990808823529413, "rewards/margins": 2.549908088235294, "rewards/rejected": -2.35, "step": 487 }, { "epoch": 0.33459033253342474, "grad_norm": 0.4304775017502752, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90177536.0, "logits/rejected": -107479040.0, "logps/chosen": -256.75, "logps/rejected": -328.0, "loss": 0.304, "rewards/chosen": 0.265380859375, "rewards/margins": 3.105224609375, "rewards/rejected": -2.83984375, "step": 488 }, { "epoch": 0.33527596846074736, "grad_norm": 0.48564885454721735, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -113639424.0, "logits/rejected": -103022592.0, "logps/chosen": -307.0, "logps/rejected": -337.0, "loss": 0.3202, "rewards/chosen": 2.3539631366729736, "rewards/margins": 4.713338136672974, "rewards/rejected": -2.359375, "step": 489 }, { "epoch": 0.33596160438806993, "grad_norm": 0.4742992441620405, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90312836.12903225, "logits/rejected": -78865625.21212122, "logps/chosen": -253.93548387096774, "logps/rejected": -266.6666666666667, "loss": 0.3472, "rewards/chosen": 0.10294858870967742, "rewards/margins": 2.2904485887096775, "rewards/rejected": -2.1875, "step": 490 }, { "epoch": 0.3366472403153925, "grad_norm": 0.6821945287633338, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97477748.65822785, "logits/rejected": -94585835.10204081, "logps/chosen": -339.8481012658228, "logps/rejected": -297.46938775510205, "loss": 0.3649, "rewards/chosen": 0.3214003164556962, "rewards/margins": 2.108711169752093, "rewards/rejected": -1.7873108532963966, "step": 491 }, { "epoch": 0.33733287624271513, "grad_norm": 0.5446848828689234, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92041671.1111111, "logits/rejected": -90926518.85714285, "logps/chosen": -280.44444444444446, "logps/rejected": -308.85714285714283, "loss": 0.3403, "rewards/chosen": 0.2188042534722222, "rewards/margins": 2.580411396329365, "rewards/rejected": -2.361607142857143, "step": 492 }, { "epoch": 0.3380185121700377, "grad_norm": 0.49296016175939317, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89366637.22666667, "logits/rejected": -84756595.9245283, "logps/chosen": -236.8, "logps/rejected": -323.3207547169811, "loss": 0.3495, "rewards/chosen": 0.41583333333333333, "rewards/margins": 1.6030460074562696, "rewards/rejected": -1.1872126741229363, "step": 493 }, { "epoch": 0.3387041480973603, "grad_norm": 0.40540776746657387, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88974252.06557377, "logits/rejected": -79942182.20895523, "logps/chosen": -230.29508196721312, "logps/rejected": -291.5820895522388, "loss": 0.3093, "rewards/chosen": 2.340121409932121, "rewards/margins": 4.580793051723166, "rewards/rejected": -2.2406716417910446, "step": 494 }, { "epoch": 0.3393897840246829, "grad_norm": 0.45441141420527054, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94618563.76470588, "logits/rejected": -78573294.93333334, "logps/chosen": -304.47058823529414, "logps/rejected": -325.06666666666666, "loss": 0.3145, "rewards/chosen": 0.40320542279411764, "rewards/margins": 2.8927887561274512, "rewards/rejected": -2.4895833333333335, "step": 495 }, { "epoch": 0.34007541995200546, "grad_norm": 0.5444780456582471, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95030944.91428572, "logits/rejected": -82150505.93103448, "logps/chosen": -229.71428571428572, "logps/rejected": -317.51724137931035, "loss": 0.3445, "rewards/chosen": 0.24386160714285715, "rewards/margins": 2.5261891933497536, "rewards/rejected": -2.2823275862068964, "step": 496 }, { "epoch": 0.3407610558793281, "grad_norm": 0.49140916502449566, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102127973.58730158, "logits/rejected": -111374903.13846155, "logps/chosen": -321.77777777777777, "logps/rejected": -382.5230769230769, "loss": 0.2979, "rewards/chosen": 0.5262896825396826, "rewards/margins": 3.0935973748473753, "rewards/rejected": -2.5673076923076925, "step": 497 }, { "epoch": 0.34144669180665066, "grad_norm": 0.4985993305053034, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103542967.40298508, "logits/rejected": -82012395.01639344, "logps/chosen": -297.3134328358209, "logps/rejected": -377.1803278688525, "loss": 0.3176, "rewards/chosen": 0.5494402985074627, "rewards/margins": 2.899850134573036, "rewards/rejected": -2.3504098360655736, "step": 498 }, { "epoch": 0.3421323277339733, "grad_norm": 0.4314163592895895, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76850473.29032259, "logits/rejected": -93926989.57575758, "logps/chosen": -207.48387096774192, "logps/rejected": -253.57575757575756, "loss": 0.3232, "rewards/chosen": 0.26001764112903225, "rewards/margins": 2.2183509744623655, "rewards/rejected": -1.9583333333333333, "step": 499 }, { "epoch": 0.34281796366129585, "grad_norm": 0.6166559954027532, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -122198229.97014925, "logits/rejected": -104101250.09836066, "logps/chosen": -271.5223880597015, "logps/rejected": -310.0327868852459, "loss": 0.3004, "rewards/chosen": 2.379152440313083, "rewards/margins": 5.0369393255589845, "rewards/rejected": -2.6577868852459017, "step": 500 }, { "epoch": 0.3435035995886184, "grad_norm": 0.49193008004483685, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95027200.0, "logits/rejected": -125566976.0, "logps/chosen": -287.75, "logps/rejected": -300.25, "loss": 0.3122, "rewards/chosen": 0.440216064453125, "rewards/margins": 2.783966064453125, "rewards/rejected": -2.34375, "step": 501 }, { "epoch": 0.34418923551594105, "grad_norm": 0.5057895182803481, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100913702.20895523, "logits/rejected": -94406219.5409836, "logps/chosen": -286.56716417910445, "logps/rejected": -339.1475409836066, "loss": 0.3245, "rewards/chosen": 1.5293631482480177, "rewards/margins": 4.039609049887362, "rewards/rejected": -2.5102459016393444, "step": 502 }, { "epoch": 0.3448748714432636, "grad_norm": 0.5205912036687357, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88403022.76923077, "logits/rejected": -87614350.22222222, "logps/chosen": -239.5076923076923, "logps/rejected": -244.06349206349208, "loss": 0.3298, "rewards/chosen": 0.4125, "rewards/margins": 1.1575464642237103, "rewards/rejected": -0.7450464642237103, "step": 503 }, { "epoch": 0.34556050737058625, "grad_norm": 0.49950328545216227, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -119738760.76712328, "logits/rejected": -83275999.41818182, "logps/chosen": -313.4246575342466, "logps/rejected": -343.8545454545455, "loss": 0.293, "rewards/chosen": 0.827054794520548, "rewards/margins": 3.4793275217932753, "rewards/rejected": -2.6522727272727273, "step": 504 }, { "epoch": 0.3462461432979088, "grad_norm": 0.49593700231926124, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105283109.10144928, "logits/rejected": -89928721.3559322, "logps/chosen": -251.36231884057972, "logps/rejected": -286.3728813559322, "loss": 0.324, "rewards/chosen": 2.0755723593891533, "rewards/margins": 4.427267274643391, "rewards/rejected": -2.3516949152542375, "step": 505 }, { "epoch": 0.3469317792252314, "grad_norm": 0.5809977646888055, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95118283.9322034, "logits/rejected": -84797885.2173913, "logps/chosen": -265.49152542372883, "logps/rejected": -323.71014492753625, "loss": 0.2891, "rewards/chosen": 0.5328389830508474, "rewards/margins": 2.646969417833456, "rewards/rejected": -2.114130434782609, "step": 506 }, { "epoch": 0.347617415152554, "grad_norm": 0.45585134554428847, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98812867.76470588, "logits/rejected": -79132535.46666667, "logps/chosen": -242.35294117647058, "logps/rejected": -311.73333333333335, "loss": 0.3259, "rewards/chosen": 1.7993693632238053, "rewards/margins": 4.226452696557139, "rewards/rejected": -2.4270833333333335, "step": 507 }, { "epoch": 0.3483030510798766, "grad_norm": 0.4374017060943708, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -114022930.96296297, "logits/rejected": -80371933.4054054, "logps/chosen": -361.77777777777777, "logps/rejected": -285.6216216216216, "loss": 0.2562, "rewards/chosen": 0.8229166666666666, "rewards/margins": 3.3431869369369367, "rewards/rejected": -2.52027027027027, "step": 508 }, { "epoch": 0.34898868700719915, "grad_norm": 0.544776316116157, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89732685.57575758, "logits/rejected": -113855058.58064516, "logps/chosen": -339.8787878787879, "logps/rejected": -325.16129032258067, "loss": 0.3065, "rewards/chosen": 0.4962121212121212, "rewards/margins": 3.4518572825024436, "rewards/rejected": -2.9556451612903225, "step": 509 }, { "epoch": 0.3496743229345218, "grad_norm": 0.4778059450192497, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98474963.47826087, "logits/rejected": -91919238.50847457, "logps/chosen": -356.17391304347825, "logps/rejected": -335.1864406779661, "loss": 0.3218, "rewards/chosen": 0.6440217391304348, "rewards/margins": 2.9745302137067062, "rewards/rejected": -2.330508474576271, "step": 510 }, { "epoch": 0.35035995886184435, "grad_norm": 0.4689211178484436, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109808253.90163934, "logits/rejected": -99411264.95522387, "logps/chosen": -251.54098360655738, "logps/rejected": -342.44776119402985, "loss": 0.3017, "rewards/chosen": 0.34349385245901637, "rewards/margins": 2.9927475838023, "rewards/rejected": -2.6492537313432836, "step": 511 }, { "epoch": 0.35104559478916697, "grad_norm": 0.4601939067779642, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74878640.26229508, "logits/rejected": -89520219.70149253, "logps/chosen": -206.95081967213116, "logps/rejected": -355.82089552238807, "loss": 0.2929, "rewards/chosen": 0.3829405737704918, "rewards/margins": 3.0433883349645217, "rewards/rejected": -2.66044776119403, "step": 512 }, { "epoch": 0.35173123071648954, "grad_norm": 0.4455196323341254, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91003686.78787878, "logits/rejected": -69611916.38709678, "logps/chosen": -298.42424242424244, "logps/rejected": -227.09677419354838, "loss": 0.3416, "rewards/chosen": 0.5784801136363636, "rewards/margins": 2.1339236620234603, "rewards/rejected": -1.5554435483870968, "step": 513 }, { "epoch": 0.3524168666438121, "grad_norm": 0.5347315955447339, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105438349.78461538, "logits/rejected": -91209467.93650794, "logps/chosen": -313.84615384615387, "logps/rejected": -301.2063492063492, "loss": 0.3198, "rewards/chosen": 0.39615384615384613, "rewards/margins": 2.5509157509157507, "rewards/rejected": -2.1547619047619047, "step": 514 }, { "epoch": 0.35310250257113474, "grad_norm": 0.5118112167917758, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102243758.3768116, "logits/rejected": -77061449.76271187, "logps/chosen": -325.3333333333333, "logps/rejected": -279.0508474576271, "loss": 0.3443, "rewards/chosen": 0.2864583333333333, "rewards/margins": 2.4495939265536726, "rewards/rejected": -2.163135593220339, "step": 515 }, { "epoch": 0.3537881384984573, "grad_norm": 0.4836618150251438, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105874400.96969697, "logits/rejected": -79353525.67741935, "logps/chosen": -241.93939393939394, "logps/rejected": -284.9032258064516, "loss": 0.3308, "rewards/chosen": 0.2507102272727273, "rewards/margins": 2.6357908724340176, "rewards/rejected": -2.3850806451612905, "step": 516 }, { "epoch": 0.35447377442577993, "grad_norm": 0.44680257118734273, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98172928.0, "logits/rejected": -81920000.0, "logps/chosen": -280.75, "logps/rejected": -328.25, "loss": 0.2985, "rewards/chosen": 1.8808319568634033, "rewards/margins": 3.711390733718872, "rewards/rejected": -1.8305587768554688, "step": 517 }, { "epoch": 0.3551594103531025, "grad_norm": 0.5370930158125438, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92274688.0, "logits/rejected": -85731573.76, "logps/chosen": -226.71698113207546, "logps/rejected": -267.73333333333335, "loss": 0.3076, "rewards/chosen": 0.45931603773584906, "rewards/margins": -0.18238928452977599, "rewards/rejected": 0.641705322265625, "step": 518 }, { "epoch": 0.35584504628042507, "grad_norm": 0.4354845262062154, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81564233.14285715, "logits/rejected": -87148316.44444445, "logps/chosen": -235.71428571428572, "logps/rejected": -321.55555555555554, "loss": 0.28, "rewards/chosen": 0.3656529017857143, "rewards/margins": 2.580930679563492, "rewards/rejected": -2.2152777777777777, "step": 519 }, { "epoch": 0.3565306822077477, "grad_norm": 0.617188442626123, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102891520.0, "logits/rejected": -82378752.0, "logps/chosen": -355.75, "logps/rejected": -269.5, "loss": 0.3249, "rewards/chosen": 2.545741081237793, "rewards/margins": 4.639491081237793, "rewards/rejected": -2.09375, "step": 520 }, { "epoch": 0.35721631813507027, "grad_norm": 0.5212070692674873, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89366637.22666667, "logits/rejected": -97181232.3018868, "logps/chosen": -331.0933333333333, "logps/rejected": -326.0377358490566, "loss": 0.3209, "rewards/chosen": 0.5147916666666666, "rewards/margins": 3.0265841194968552, "rewards/rejected": -2.5117924528301887, "step": 521 }, { "epoch": 0.3579019540623929, "grad_norm": 0.4108714579304551, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95070890.66666667, "logits/rejected": -96646216.11267605, "logps/chosen": -243.08771929824562, "logps/rejected": -368.22535211267603, "loss": 0.2618, "rewards/chosen": 0.5575657894736842, "rewards/margins": 3.534678465530022, "rewards/rejected": -2.977112676056338, "step": 522 }, { "epoch": 0.35858758998971546, "grad_norm": 0.4836811941932157, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96075776.0, "logits/rejected": -81133568.0, "logps/chosen": -369.25, "logps/rejected": -337.0, "loss": 0.2983, "rewards/chosen": 0.69091796875, "rewards/margins": 3.17138671875, "rewards/rejected": -2.48046875, "step": 523 }, { "epoch": 0.35927322591703803, "grad_norm": 0.5261040823627832, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -110472555.35483871, "logits/rejected": -90876586.66666667, "logps/chosen": -246.70967741935485, "logps/rejected": -336.72727272727275, "loss": 0.2827, "rewards/chosen": 2.4777996924615677, "rewards/margins": 5.206966359128234, "rewards/rejected": -2.7291666666666665, "step": 524 }, { "epoch": 0.35995886184436066, "grad_norm": 0.5011189052224807, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86050882.06451613, "logits/rejected": -99010994.42424242, "logps/chosen": -278.19354838709677, "logps/rejected": -332.1212121212121, "loss": 0.2901, "rewards/chosen": 0.484375, "rewards/margins": 2.855587121212121, "rewards/rejected": -2.371212121212121, "step": 525 }, { "epoch": 0.3606444977716832, "grad_norm": 0.46277933063447474, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92159775.56164384, "logits/rejected": -87927863.85454546, "logps/chosen": -212.82191780821918, "logps/rejected": -295.56363636363636, "loss": 0.3339, "rewards/chosen": 0.293771404109589, "rewards/margins": 2.691498676836862, "rewards/rejected": -2.397727272727273, "step": 526 }, { "epoch": 0.36133013369900585, "grad_norm": 0.9461135039926059, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97658421.49253732, "logits/rejected": -105063877.24590164, "logps/chosen": -271.76119402985074, "logps/rejected": -391.8688524590164, "loss": 0.2871, "rewards/chosen": 0.6977611940298507, "rewards/margins": 2.809692666558027, "rewards/rejected": -2.1119314725281764, "step": 527 }, { "epoch": 0.3620157696263284, "grad_norm": 0.46585634573183166, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99900695.27272727, "logits/rejected": -90718736.51612903, "logps/chosen": -337.93939393939394, "logps/rejected": -337.80645161290323, "loss": 0.3004, "rewards/chosen": 2.429410067471591, "rewards/margins": 4.860861680374817, "rewards/rejected": -2.431451612903226, "step": 528 }, { "epoch": 0.362701405553651, "grad_norm": 0.5254852536950214, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95918919.3442623, "logits/rejected": -109677919.52238806, "logps/chosen": -321.3114754098361, "logps/rejected": -429.85074626865674, "loss": 0.3015, "rewards/chosen": 0.6846823770491803, "rewards/margins": 3.464533123317837, "rewards/rejected": -2.779850746268657, "step": 529 }, { "epoch": 0.3633870414809736, "grad_norm": 0.454340298583057, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93073603.04761904, "logits/rejected": -105567405.29230769, "logps/chosen": -265.14285714285717, "logps/rejected": -278.6461538461538, "loss": 0.3037, "rewards/chosen": 0.6235119047619048, "rewards/margins": 2.6292811355311354, "rewards/rejected": -2.0057692307692307, "step": 530 }, { "epoch": 0.3640726774082962, "grad_norm": 0.4954629399055441, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99731228.44444445, "logits/rejected": -86918884.43076923, "logps/chosen": -307.3015873015873, "logps/rejected": -303.75384615384615, "loss": 0.3021, "rewards/chosen": 0.5952380952380952, "rewards/margins": 2.8894688644688644, "rewards/rejected": -2.294230769230769, "step": 531 }, { "epoch": 0.3647583133356188, "grad_norm": 0.4673021233747073, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97949334.58823529, "logits/rejected": -84934656.0, "logps/chosen": -304.2352941176471, "logps/rejected": -277.3333333333333, "loss": 0.3279, "rewards/chosen": 0.6383272058823529, "rewards/margins": 2.4091605392156863, "rewards/rejected": -1.7708333333333333, "step": 532 }, { "epoch": 0.3654439492629414, "grad_norm": 0.38343159441903296, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75747878.20895523, "logits/rejected": -105476431.73770492, "logps/chosen": -232.83582089552237, "logps/rejected": -345.7049180327869, "loss": 0.2849, "rewards/chosen": 0.6231343283582089, "rewards/margins": 5492239.836249082, "rewards/rejected": -5492239.213114754, "step": 533 }, { "epoch": 0.36612958519026395, "grad_norm": 0.4547346844477424, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85915581.93548387, "logits/rejected": -91257887.03030303, "logps/chosen": -238.70967741935485, "logps/rejected": -390.3030303030303, "loss": 0.3055, "rewards/chosen": 0.3555396295362903, "rewards/margins": 2.9956911446878056, "rewards/rejected": -2.640151515151515, "step": 534 }, { "epoch": 0.3668152211175866, "grad_norm": 0.4199582943019765, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89889691.60784313, "logits/rejected": -98484436.77922077, "logps/chosen": -304.3137254901961, "logps/rejected": -324.15584415584414, "loss": 0.2662, "rewards/chosen": 0.6151960784313726, "rewards/margins": 3.3651960784313726, "rewards/rejected": -2.75, "step": 535 }, { "epoch": 0.36750085704490915, "grad_norm": 0.4609403029868759, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102372086.51851852, "logits/rejected": -83999439.56756757, "logps/chosen": -262.22222222222223, "logps/rejected": -290.1621621621622, "loss": 0.3008, "rewards/chosen": 0.20934606481481483, "rewards/margins": 2.7025893080580583, "rewards/rejected": -2.4932432432432434, "step": 536 }, { "epoch": 0.36818649297223177, "grad_norm": 0.5806113060387673, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102315597.57575758, "logits/rejected": -80097676.38709678, "logps/chosen": -331.1515151515151, "logps/rejected": -290.5806451612903, "loss": 0.3093, "rewards/chosen": 0.3896780303030303, "rewards/margins": 3.1033877077223853, "rewards/rejected": -2.713709677419355, "step": 537 }, { "epoch": 0.36887212889955434, "grad_norm": 0.4718916049222466, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95944704.0, "logits/rejected": -84672512.0, "logps/chosen": -310.0, "logps/rejected": -311.5, "loss": 0.3076, "rewards/chosen": 0.6357421875, "rewards/margins": 3.1865234375, "rewards/rejected": -2.55078125, "step": 538 }, { "epoch": 0.3695577648268769, "grad_norm": 0.67735257412441, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107941647.05882353, "logits/rejected": -79202440.53333333, "logps/chosen": -284.47058823529414, "logps/rejected": -312.8, "loss": 0.325, "rewards/chosen": 0.28722426470588236, "rewards/margins": 2.835140931372549, "rewards/rejected": -2.5479166666666666, "step": 539 }, { "epoch": 0.37024340075419954, "grad_norm": 0.4442128018177032, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103791251.52542374, "logits/rejected": -108079311.76811594, "logps/chosen": -335.45762711864404, "logps/rejected": -324.6376811594203, "loss": 0.3088, "rewards/chosen": 2.45120653055482, "rewards/margins": 3.0870559718619983, "rewards/rejected": -0.6358494413071785, "step": 540 }, { "epoch": 0.3709290366815221, "grad_norm": 0.48524992551964075, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80841827.09677419, "logits/rejected": -87953283.87878788, "logps/chosen": -321.03225806451616, "logps/rejected": -304.24242424242425, "loss": 0.3011, "rewards/chosen": 0.5453629032258065, "rewards/margins": 2.068560077298072, "rewards/rejected": -1.5231971740722656, "step": 541 }, { "epoch": 0.3716146726088447, "grad_norm": 0.402668452871382, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81355034.48275863, "logits/rejected": -72741215.08571428, "logps/chosen": -223.44827586206895, "logps/rejected": -289.8285714285714, "loss": 0.2912, "rewards/chosen": 2.8421033661941, "rewards/margins": 5.4796033661941, "rewards/rejected": -2.6375, "step": 542 }, { "epoch": 0.3723003085361673, "grad_norm": 0.47754001255749867, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98529351.85964912, "logits/rejected": -96764365.52112676, "logps/chosen": -286.5964912280702, "logps/rejected": -354.7042253521127, "loss": 0.2677, "rewards/chosen": 0.7411766721491229, "rewards/margins": 2.7024442777829254, "rewards/rejected": -1.9612676056338028, "step": 543 }, { "epoch": 0.37298594446348987, "grad_norm": 0.456574543160089, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -114378004.31746031, "logits/rejected": -80530636.8, "logps/chosen": -338.031746031746, "logps/rejected": -348.0615384615385, "loss": 0.2948, "rewards/chosen": 0.5793650793650794, "rewards/margins": 3.446672771672772, "rewards/rejected": -2.8673076923076923, "step": 544 }, { "epoch": 0.3736715803908125, "grad_norm": 0.4064728659626285, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82179922.44067797, "logits/rejected": -103337924.63768116, "logps/chosen": -242.84745762711864, "logps/rejected": -352.463768115942, "loss": 0.2731, "rewards/chosen": 0.4417372881355932, "rewards/margins": 3.6265198968312458, "rewards/rejected": -3.1847826086956523, "step": 545 }, { "epoch": 0.37435721631813507, "grad_norm": 0.44040223639409937, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101994821.07936507, "logits/rejected": -87047939.93846154, "logps/chosen": -276.3174603174603, "logps/rejected": -323.9384615384615, "loss": 0.2937, "rewards/chosen": 0.6081349206349206, "rewards/margins": 3.011981074481074, "rewards/rejected": -2.4038461538461537, "step": 546 }, { "epoch": 0.37504285224545764, "grad_norm": 0.4568996878492139, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -108148515.44615385, "logits/rejected": -82887436.1904762, "logps/chosen": -279.1384615384615, "logps/rejected": -329.3968253968254, "loss": 0.2938, "rewards/chosen": 0.55625, "rewards/margins": 3.0284722222222222, "rewards/rejected": -2.4722222222222223, "step": 547 }, { "epoch": 0.37572848817278026, "grad_norm": 0.5738473686138256, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94837873.77777778, "logits/rejected": -100738194.28571428, "logps/chosen": -289.1111111111111, "logps/rejected": -374.2857142857143, "loss": 0.3386, "rewards/chosen": 0.3449164496527778, "rewards/margins": 3.0502735925099205, "rewards/rejected": -2.705357142857143, "step": 548 }, { "epoch": 0.37641412410010283, "grad_norm": 0.5234791469418224, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92477638.19354838, "logits/rejected": -108289303.27272727, "logps/chosen": -257.2903225806452, "logps/rejected": -337.93939393939394, "loss": 0.3217, "rewards/chosen": 0.44329637096774194, "rewards/margins": 2.564508492179863, "rewards/rejected": -2.121212121212121, "step": 549 }, { "epoch": 0.37709976002742546, "grad_norm": 0.45280000910010393, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82670339.71014492, "logits/rejected": -84312619.38983051, "logps/chosen": -266.4347826086956, "logps/rejected": -332.47457627118644, "loss": 0.3051, "rewards/chosen": 0.47282608695652173, "rewards/margins": 1.6372850568486033, "rewards/rejected": -1.1644589698920815, "step": 550 }, { "epoch": 0.377785395954748, "grad_norm": 0.4700698266091913, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89780777.51351352, "logits/rejected": -88080384.0, "logps/chosen": -216.0, "logps/rejected": -348.14814814814815, "loss": 0.3192, "rewards/chosen": 0.3572635135135135, "rewards/margins": 3.130411661661662, "rewards/rejected": -2.7731481481481484, "step": 551 }, { "epoch": 0.3784710318820706, "grad_norm": 0.5068512422701674, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -106989131.5409836, "logits/rejected": -93401515.94029851, "logps/chosen": -393.44262295081967, "logps/rejected": -335.76119402985074, "loss": 0.2901, "rewards/chosen": 0.7095286885245902, "rewards/margins": 3.226319733300709, "rewards/rejected": -2.5167910447761193, "step": 552 }, { "epoch": 0.3791566678093932, "grad_norm": 0.44742184337114493, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101475096.77419356, "logits/rejected": -93164388.84848484, "logps/chosen": -242.83870967741936, "logps/rejected": -361.6969696969697, "loss": 0.3188, "rewards/chosen": 2.3544850503244708, "rewards/margins": 5.3222880806275015, "rewards/rejected": -2.9678030303030303, "step": 553 }, { "epoch": 0.3798423037367158, "grad_norm": 0.44262030381668077, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90467910.8923077, "logits/rejected": -90610281.65079366, "logps/chosen": -348.55384615384617, "logps/rejected": -270.73015873015873, "loss": 0.3118, "rewards/chosen": 2.2629805344801683, "rewards/margins": 4.457424978924613, "rewards/rejected": -2.1944444444444446, "step": 554 }, { "epoch": 0.3805279396640384, "grad_norm": 0.4537764651325065, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107171698.7586207, "logits/rejected": -92154850.74285714, "logps/chosen": -352.55172413793105, "logps/rejected": -369.8285714285714, "loss": 0.2962, "rewards/chosen": 0.5738146551724138, "rewards/margins": 2.563100369458128, "rewards/rejected": -1.9892857142857143, "step": 555 }, { "epoch": 0.381213575591361, "grad_norm": 0.5084428913032416, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90917707.29411764, "logits/rejected": -87521143.46666667, "logps/chosen": -262.3529411764706, "logps/rejected": -302.4, "loss": 0.3347, "rewards/chosen": 0.18211454503676472, "rewards/margins": 2.9008645450367645, "rewards/rejected": -2.71875, "step": 556 }, { "epoch": 0.38189921151868356, "grad_norm": 0.4303221181077931, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86441984.0, "logits/rejected": -81002496.0, "logps/chosen": -245.25, "logps/rejected": -341.25, "loss": 0.3127, "rewards/chosen": 2.226170063018799, "rewards/margins": 4.804295063018799, "rewards/rejected": -2.578125, "step": 557 }, { "epoch": 0.3825848474460062, "grad_norm": 0.4903303922316872, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90477129.14285715, "logits/rejected": -71086221.2413793, "logps/chosen": -308.57142857142856, "logps/rejected": -264.82758620689657, "loss": 0.289, "rewards/chosen": 0.6780691964285714, "rewards/margins": 3.089707127463054, "rewards/rejected": -2.4116379310344827, "step": 558 }, { "epoch": 0.38327048337332875, "grad_norm": 0.4841130229269491, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94733417.93103448, "logits/rejected": -75197878.85714285, "logps/chosen": -356.6896551724138, "logps/rejected": -296.45714285714286, "loss": 0.2999, "rewards/chosen": 0.6179956896551724, "rewards/margins": 2.9929956896551726, "rewards/rejected": -2.375, "step": 559 }, { "epoch": 0.3839561193006514, "grad_norm": 0.49848790985450203, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95655171.82089552, "logits/rejected": -86155129.70491803, "logps/chosen": -344.1194029850746, "logps/rejected": -312.39344262295083, "loss": 0.2994, "rewards/chosen": 0.5713619402985075, "rewards/margins": 3.4443127599706385, "rewards/rejected": -2.872950819672131, "step": 560 }, { "epoch": 0.38464175522797395, "grad_norm": 0.48297377891804194, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109549533.28813559, "logits/rejected": -110571579.36231884, "logps/chosen": -276.47457627118644, "logps/rejected": -340.40579710144925, "loss": 0.3143, "rewards/chosen": 0.3668399099576271, "rewards/margins": 2.5733616490880618, "rewards/rejected": -2.2065217391304346, "step": 561 }, { "epoch": 0.3853273911552965, "grad_norm": 0.5502104432946415, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92597326.76923077, "logits/rejected": -89680842.10526316, "logps/chosen": -219.53846153846155, "logps/rejected": -325.05263157894734, "loss": 0.2871, "rewards/chosen": 0.3641826923076923, "rewards/margins": 2.7654984817813766, "rewards/rejected": -2.401315789473684, "step": 562 }, { "epoch": 0.38601302708261914, "grad_norm": 0.45662373664333344, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -116477884.85245901, "logits/rejected": -92274688.0, "logps/chosen": -247.86885245901638, "logps/rejected": -315.7014925373134, "loss": 0.2945, "rewards/chosen": 0.42315573770491804, "rewards/margins": 2.893304991436261, "rewards/rejected": -2.470149253731343, "step": 563 }, { "epoch": 0.3866986630099417, "grad_norm": 0.49496313740710157, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98000245.84126984, "logits/rejected": -100534240.4923077, "logps/chosen": -337.77777777777777, "logps/rejected": -317.53846153846155, "loss": 0.3279, "rewards/chosen": 0.33271329365079366, "rewards/margins": 2.8557902167277165, "rewards/rejected": -2.523076923076923, "step": 564 }, { "epoch": 0.38738429893726434, "grad_norm": 0.47969252301673193, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103629998.82926829, "logits/rejected": -95921908.86956522, "logps/chosen": -224.1951219512195, "logps/rejected": -381.2173913043478, "loss": 0.3185, "rewards/chosen": 0.4376905487804878, "rewards/margins": 1.991692895980301, "rewards/rejected": -1.5540023471998132, "step": 565 }, { "epoch": 0.3880699348645869, "grad_norm": 0.43845723953910326, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85880093.37704918, "logits/rejected": -83447869.13432837, "logps/chosen": -244.45901639344262, "logps/rejected": -270.8059701492537, "loss": 0.3151, "rewards/chosen": 0.602202868852459, "rewards/margins": 2.680561077807683, "rewards/rejected": -2.078358208955224, "step": 566 }, { "epoch": 0.3887555707919095, "grad_norm": 0.4721810995442323, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91555664.45714286, "logits/rejected": -72387901.79310344, "logps/chosen": -245.71428571428572, "logps/rejected": -271.17241379310343, "loss": 0.3156, "rewards/chosen": 0.4765625, "rewards/margins": 1.6872266572097252, "rewards/rejected": -1.2106641572097252, "step": 567 }, { "epoch": 0.3894412067192321, "grad_norm": 0.49634534605464353, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90984132.92307693, "logits/rejected": -109051904.0, "logps/chosen": -206.92307692307693, "logps/rejected": -322.5263157894737, "loss": 0.2834, "rewards/chosen": 2.959077688363882, "rewards/margins": 5.462367162048093, "rewards/rejected": -2.5032894736842106, "step": 568 }, { "epoch": 0.3901268426465547, "grad_norm": 0.4411445847613973, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93014859.29411764, "logits/rejected": -91296017.06666666, "logps/chosen": -257.88235294117646, "logps/rejected": -296.53333333333336, "loss": 0.3061, "rewards/chosen": 0.4586397058823529, "rewards/margins": 2.1786859400132124, "rewards/rejected": -1.7200462341308593, "step": 569 }, { "epoch": 0.3908124785738773, "grad_norm": 0.40658585967977373, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98594872.10958904, "logits/rejected": -88385424.2909091, "logps/chosen": -332.2739726027397, "logps/rejected": -331.6363636363636, "loss": 0.2625, "rewards/chosen": 0.9717465753424658, "rewards/margins": 3.9603829389788294, "rewards/rejected": -2.9886363636363638, "step": 570 }, { "epoch": 0.39149811450119987, "grad_norm": 0.46954955371153867, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88424179.40983607, "logits/rejected": -90271438.3283582, "logps/chosen": -270.95081967213116, "logps/rejected": -354.86567164179104, "loss": 0.3037, "rewards/chosen": 0.3495133196721312, "rewards/margins": 2.9465282450452657, "rewards/rejected": -2.5970149253731343, "step": 571 }, { "epoch": 0.39218375042852244, "grad_norm": 0.5093983438972617, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109722992.64, "logits/rejected": -112850518.94339623, "logps/chosen": -300.58666666666664, "logps/rejected": -393.35849056603774, "loss": 0.3134, "rewards/chosen": 0.5677083333333334, "rewards/margins": 3.0488404088050314, "rewards/rejected": -2.481132075471698, "step": 572 }, { "epoch": 0.39286938635584506, "grad_norm": 0.5081874638204986, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94010262.06896552, "logits/rejected": -98386388.11428571, "logps/chosen": -354.2068965517241, "logps/rejected": -372.57142857142856, "loss": 0.2722, "rewards/chosen": 0.8033405172413793, "rewards/margins": 3.906911945812808, "rewards/rejected": -3.1035714285714286, "step": 573 }, { "epoch": 0.39355502228316763, "grad_norm": 0.5997453548230828, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97867093.33333333, "logits/rejected": -77391673.80645162, "logps/chosen": -232.4848484848485, "logps/rejected": -324.9032258064516, "loss": 0.3026, "rewards/chosen": 0.5748106060606061, "rewards/margins": 2.8711815738025415, "rewards/rejected": -2.2963709677419355, "step": 574 }, { "epoch": 0.3942406582104902, "grad_norm": 0.5096393925513168, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98135222.35616438, "logits/rejected": -105238900.36363636, "logps/chosen": -272.43835616438355, "logps/rejected": -328.43636363636364, "loss": 0.3098, "rewards/chosen": 1.6968240607274723, "rewards/margins": 3.709603990148637, "rewards/rejected": -2.0127799294211646, "step": 575 }, { "epoch": 0.39492629413781283, "grad_norm": 0.4670985518176378, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99198618.41269842, "logits/rejected": -87628689.72307692, "logps/chosen": -232.38095238095238, "logps/rejected": -323.44615384615383, "loss": 0.3303, "rewards/chosen": 2.012332734607515, "rewards/margins": 4.400794273069053, "rewards/rejected": -2.3884615384615384, "step": 576 }, { "epoch": 0.3956119300651354, "grad_norm": 0.5250169252159373, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -111305559.88059701, "logits/rejected": -107607963.27868852, "logps/chosen": -295.1641791044776, "logps/rejected": -325.24590163934425, "loss": 0.313, "rewards/chosen": 0.5439598880597015, "rewards/margins": 2.8595336585515048, "rewards/rejected": -2.3155737704918034, "step": 577 }, { "epoch": 0.396297565992458, "grad_norm": 0.594928677316284, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88661133.78461538, "logits/rejected": -85084452.57142857, "logps/chosen": -256.0, "logps/rejected": -316.95238095238096, "loss": 0.3039, "rewards/chosen": 2.511992469200721, "rewards/margins": 3.6394769875848976, "rewards/rejected": -1.1274845183841766, "step": 578 }, { "epoch": 0.3969832019197806, "grad_norm": 0.4885286026784159, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -110285522.8235294, "logits/rejected": -72107076.26666667, "logps/chosen": -309.88235294117646, "logps/rejected": -316.8, "loss": 0.3108, "rewards/chosen": 0.4568301930147059, "rewards/margins": 1.8948528065400965, "rewards/rejected": -1.4380226135253906, "step": 579 }, { "epoch": 0.39766883784710316, "grad_norm": 0.7108832996175873, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88424179.40983607, "logits/rejected": -94152734.56716418, "logps/chosen": -299.0163934426229, "logps/rejected": -310.2089552238806, "loss": 0.2872, "rewards/chosen": 2.5696571225025613, "rewards/margins": 4.552866077726442, "rewards/rejected": -1.9832089552238805, "step": 580 }, { "epoch": 0.3983544737744258, "grad_norm": 0.4675956081598382, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98959360.0, "logits/rejected": -92536832.0, "logps/chosen": -256.0, "logps/rejected": -331.25, "loss": 0.2985, "rewards/chosen": 2.12860107421875, "rewards/margins": 5.03094482421875, "rewards/rejected": -2.90234375, "step": 581 }, { "epoch": 0.39904010970174836, "grad_norm": 0.4187223462701225, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89915392.0, "logits/rejected": -96600064.0, "logps/chosen": -259.0, "logps/rejected": -336.25, "loss": 0.2928, "rewards/chosen": 1.8123655319213867, "rewards/margins": 4.628771781921387, "rewards/rejected": -2.81640625, "step": 582 }, { "epoch": 0.399725745629071, "grad_norm": 0.4749721956965877, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86149672.63492064, "logits/rejected": -95242964.67692308, "logps/chosen": -245.5873015873016, "logps/rejected": -301.53846153846155, "loss": 0.3102, "rewards/chosen": 0.2597811259920635, "rewards/margins": 2.9828580490689864, "rewards/rejected": -2.723076923076923, "step": 583 }, { "epoch": 0.40041138155639355, "grad_norm": 0.45119499913575173, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -127008768.0, "logits/rejected": -90570752.0, "logps/chosen": -295.625, "logps/rejected": -350.75, "loss": 0.2964, "rewards/chosen": 0.4189453125, "rewards/margins": 2.245598554611206, "rewards/rejected": -1.826653242111206, "step": 584 }, { "epoch": 0.4010970174837161, "grad_norm": 0.4509762507966821, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98286523.73333333, "logits/rejected": -89807450.35294117, "logps/chosen": -299.73333333333335, "logps/rejected": -303.52941176470586, "loss": 0.3108, "rewards/chosen": 0.54453125, "rewards/margins": 3.0482077205882354, "rewards/rejected": -2.5036764705882355, "step": 585 }, { "epoch": 0.40178265341103875, "grad_norm": 0.45564677224790945, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83811181.71428572, "logits/rejected": -83886080.0, "logps/chosen": -269.14285714285717, "logps/rejected": -293.77777777777777, "loss": 0.262, "rewards/chosen": 2.595402581351144, "rewards/margins": 5.317624803573366, "rewards/rejected": -2.7222222222222223, "step": 586 }, { "epoch": 0.4024682893383613, "grad_norm": 0.44800745955700155, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101641966.93333334, "logits/rejected": -99182953.41176471, "logps/chosen": -315.2, "logps/rejected": -307.29411764705884, "loss": 0.2841, "rewards/chosen": 0.6114583333333333, "rewards/margins": 3.2033700980392155, "rewards/rejected": -2.5919117647058822, "step": 587 }, { "epoch": 0.40315392526568394, "grad_norm": 0.4479760818993347, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88725661.53846154, "logits/rejected": -94538280.63492064, "logps/chosen": -331.32307692307694, "logps/rejected": -351.23809523809524, "loss": 0.2637, "rewards/chosen": 2.3060051551231973, "rewards/margins": 5.385370234488277, "rewards/rejected": -3.0793650793650795, "step": 588 }, { "epoch": 0.4038395611930065, "grad_norm": 0.4090878093119243, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89202803.38028169, "logits/rejected": -91980350.87719299, "logps/chosen": -324.28169014084506, "logps/rejected": -328.140350877193, "loss": 0.2808, "rewards/chosen": 2.155789657377861, "rewards/margins": 4.0935597647140725, "rewards/rejected": -1.9377701073362117, "step": 589 }, { "epoch": 0.4045251971203291, "grad_norm": 0.5182611166248186, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86458586.45333333, "logits/rejected": -117123960.75471698, "logps/chosen": -248.32, "logps/rejected": -314.5660377358491, "loss": 0.33, "rewards/chosen": 0.28739583333333335, "rewards/margins": 2.4572071540880507, "rewards/rejected": -2.169811320754717, "step": 590 }, { "epoch": 0.4052108330476517, "grad_norm": 0.5143055794389059, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92775500.41791044, "logits/rejected": -111664749.1147541, "logps/chosen": -325.97014925373134, "logps/rejected": -378.75409836065575, "loss": 0.3129, "rewards/chosen": 0.8069029850746269, "rewards/margins": 3.0220669195008565, "rewards/rejected": -2.2151639344262297, "step": 591 }, { "epoch": 0.4058964689749743, "grad_norm": 0.4443506954319456, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109051904.0, "logits/rejected": -84449493.97014925, "logps/chosen": -286.1639344262295, "logps/rejected": -256.95522388059703, "loss": 0.2876, "rewards/chosen": 2.4866262967469264, "rewards/margins": 5.0239397295827475, "rewards/rejected": -2.537313432835821, "step": 592 }, { "epoch": 0.4065821049022969, "grad_norm": 0.44681907053029024, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84291980.38709678, "logits/rejected": -84521580.60606061, "logps/chosen": -266.5806451612903, "logps/rejected": -335.5151515151515, "loss": 0.2979, "rewards/chosen": 0.5304939516129032, "rewards/margins": 3.350569709188661, "rewards/rejected": -2.820075757575758, "step": 593 }, { "epoch": 0.4072677408296195, "grad_norm": 0.5400684570723615, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91825298.28571428, "logits/rejected": -88429909.33333333, "logps/chosen": -314.0, "logps/rejected": -316.0, "loss": 0.2993, "rewards/chosen": 0.37667410714285715, "rewards/margins": 2.3384796626984126, "rewards/rejected": -1.9618055555555556, "step": 594 }, { "epoch": 0.40795337675694204, "grad_norm": 0.5799326106662586, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -119092813.57575758, "logits/rejected": -103707548.90322581, "logps/chosen": -332.1212121212121, "logps/rejected": -297.03225806451616, "loss": 0.3317, "rewards/chosen": 0.549508759469697, "rewards/margins": 2.5575732755987293, "rewards/rejected": -2.0080645161290325, "step": 595 }, { "epoch": 0.40863901268426467, "grad_norm": 0.4401488669266548, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79691776.0, "logits/rejected": -86310912.0, "logps/chosen": -226.5, "logps/rejected": -340.5, "loss": 0.2918, "rewards/chosen": 0.6689453125, "rewards/margins": 3.3798828125, "rewards/rejected": -2.7109375, "step": 596 }, { "epoch": 0.40932464861158724, "grad_norm": 0.4481301230262707, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90749486.54545455, "logits/rejected": -77459323.87096775, "logps/chosen": -258.6666666666667, "logps/rejected": -353.03225806451616, "loss": 0.313, "rewards/chosen": 1.9888522986209753, "rewards/margins": 3.5034763712803754, "rewards/rejected": -1.5146240726594002, "step": 597 }, { "epoch": 0.41001028453890986, "grad_norm": 0.4845365649718793, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -111604958.60869566, "logits/rejected": -93767575.86440678, "logps/chosen": -273.6231884057971, "logps/rejected": -350.10169491525426, "loss": 0.2892, "rewards/chosen": 0.6148097826086957, "rewards/margins": 3.4686233419307295, "rewards/rejected": -2.8538135593220337, "step": 598 }, { "epoch": 0.41069592046623243, "grad_norm": 0.5043383373123428, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93748362.37837838, "logits/rejected": -95226235.25925925, "logps/chosen": -335.13513513513516, "logps/rejected": -319.1111111111111, "loss": 0.3148, "rewards/chosen": 0.5915329391891891, "rewards/margins": 3.2952366428928928, "rewards/rejected": -2.7037037037037037, "step": 599 }, { "epoch": 0.411381556393555, "grad_norm": 0.44960953993480435, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84531357.53846154, "logits/rejected": -93473060.57142857, "logps/chosen": -310.6461538461538, "logps/rejected": -355.55555555555554, "loss": 0.2946, "rewards/chosen": 0.5463942307692308, "rewards/margins": 3.232902167277167, "rewards/rejected": -2.6865079365079363, "step": 600 }, { "epoch": 0.41206719232087763, "grad_norm": 0.4815809433677578, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109421989.64705883, "logits/rejected": -96888422.4, "logps/chosen": -344.2352941176471, "logps/rejected": -316.53333333333336, "loss": 0.2795, "rewards/chosen": 1.0753676470588236, "rewards/margins": 3.3357843137254903, "rewards/rejected": -2.2604166666666665, "step": 601 }, { "epoch": 0.4127528282482002, "grad_norm": 0.47583854398627007, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83297405.75438596, "logits/rejected": -91093193.91549295, "logps/chosen": -280.42105263157896, "logps/rejected": -346.59154929577466, "loss": 0.2822, "rewards/chosen": 0.48135964912280704, "rewards/margins": 3.146852606869286, "rewards/rejected": -2.665492957746479, "step": 602 }, { "epoch": 0.4134384641755228, "grad_norm": 0.49939475524353677, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -116550811.15151516, "logits/rejected": -90109885.93548387, "logps/chosen": -376.72727272727275, "logps/rejected": -302.4516129032258, "loss": 0.264, "rewards/chosen": 0.9427083333333334, "rewards/margins": 3.652385752688172, "rewards/rejected": -2.7096774193548385, "step": 603 }, { "epoch": 0.4141241001028454, "grad_norm": 0.4677229606324361, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94661102.34482759, "logits/rejected": -96109480.22857143, "logps/chosen": -319.7241379310345, "logps/rejected": -384.9142857142857, "loss": 0.2812, "rewards/chosen": 0.4964978448275862, "rewards/margins": 2.823736091200354, "rewards/rejected": -2.327238246372768, "step": 604 }, { "epoch": 0.41480973603016796, "grad_norm": 0.4947914527973293, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92554308.26666667, "logits/rejected": -88573831.52941176, "logps/chosen": -298.4, "logps/rejected": -275.29411764705884, "loss": 0.2988, "rewards/chosen": 0.5255208333333333, "rewards/margins": 2.871109068627451, "rewards/rejected": -2.3455882352941178, "step": 605 }, { "epoch": 0.4154953719574906, "grad_norm": 0.41357406495942445, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102471185.65517241, "logits/rejected": -92154850.74285714, "logps/chosen": -298.0689655172414, "logps/rejected": -315.8857142857143, "loss": 0.2563, "rewards/chosen": 0.8224676724137931, "rewards/margins": 3.8617533866995073, "rewards/rejected": -3.039285714285714, "step": 606 }, { "epoch": 0.41618100788481316, "grad_norm": 0.423142832819958, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96240211.78181818, "logits/rejected": -87908015.34246576, "logps/chosen": -254.83636363636364, "logps/rejected": -338.4109589041096, "loss": 0.2693, "rewards/chosen": 0.5167613636363636, "rewards/margins": 3.715391500622665, "rewards/rejected": -3.1986301369863015, "step": 607 }, { "epoch": 0.41686664381213573, "grad_norm": 0.5607726684409713, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91113188.43076923, "logits/rejected": -82421402.41269842, "logps/chosen": -289.2307692307692, "logps/rejected": -343.36507936507934, "loss": 0.2866, "rewards/chosen": 0.42259615384615384, "rewards/margins": 3.692437423687424, "rewards/rejected": -3.2698412698412698, "step": 608 }, { "epoch": 0.41755227973945835, "grad_norm": 0.4329626975239932, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107216896.0, "logits/rejected": -86612377.6, "logps/chosen": -323.0, "logps/rejected": -337.4, "loss": 0.2583, "rewards/chosen": 4.722297032674153, "rewards/margins": 7.7816720326741535, "rewards/rejected": -3.059375, "step": 609 }, { "epoch": 0.4182379156667809, "grad_norm": 0.5996073399497259, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95860141.41935484, "logits/rejected": -84902880.96969697, "logps/chosen": -355.61290322580646, "logps/rejected": -351.75757575757575, "loss": 0.2928, "rewards/chosen": 0.7701612903225806, "rewards/margins": 3.268267350928641, "rewards/rejected": -2.4981060606060606, "step": 610 }, { "epoch": 0.41892355159410355, "grad_norm": 0.4817335038745418, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80470718.17142858, "logits/rejected": -94661102.34482759, "logps/chosen": -251.65714285714284, "logps/rejected": -383.17241379310343, "loss": 0.3148, "rewards/chosen": 0.5040178571428572, "rewards/margins": 2.451483030272235, "rewards/rejected": -1.9474651731293777, "step": 611 }, { "epoch": 0.4196091875214261, "grad_norm": 0.5636018752414663, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102580692.11428571, "logits/rejected": -67398126.34482759, "logps/chosen": -317.0285714285714, "logps/rejected": -260.9655172413793, "loss": 0.3089, "rewards/chosen": 0.5104910714285714, "rewards/margins": 2.771266933497537, "rewards/rejected": -2.2607758620689653, "step": 612 }, { "epoch": 0.4202948234487487, "grad_norm": 0.5127167783801705, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90460934.91891892, "logits/rejected": -86682282.66666667, "logps/chosen": -278.27027027027026, "logps/rejected": -297.48148148148147, "loss": 0.3051, "rewards/chosen": 0.6786317567567568, "rewards/margins": 3.303631756756757, "rewards/rejected": -2.625, "step": 613 }, { "epoch": 0.4209804593760713, "grad_norm": 0.4144689494400939, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78293674.66666667, "logits/rejected": -95605458.8235294, "logps/chosen": -268.4, "logps/rejected": -353.88235294117646, "loss": 0.2806, "rewards/chosen": 0.45940755208333334, "rewards/margins": 3.3160251991421568, "rewards/rejected": -2.8566176470588234, "step": 614 }, { "epoch": 0.4216660953033939, "grad_norm": 0.4888190037874014, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107424263.64179105, "logits/rejected": -95506364.85245901, "logps/chosen": -341.4925373134328, "logps/rejected": -329.9672131147541, "loss": 0.2929, "rewards/chosen": 0.6044776119402985, "rewards/margins": 3.4856251529239053, "rewards/rejected": -2.8811475409836067, "step": 615 }, { "epoch": 0.4223517312307165, "grad_norm": 0.5052348459611163, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94027531.46268657, "logits/rejected": -117578030.16393442, "logps/chosen": -331.2238805970149, "logps/rejected": -361.44262295081967, "loss": 0.2913, "rewards/chosen": 0.6902985074626866, "rewards/margins": 2.8009542451676044, "rewards/rejected": -2.110655737704918, "step": 616 }, { "epoch": 0.4230373671580391, "grad_norm": 0.4154703577855744, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103022592.0, "logits/rejected": -105906176.0, "logps/chosen": -335.5, "logps/rejected": -340.5, "loss": 0.2586, "rewards/chosen": 0.86181640625, "rewards/margins": 3.95947265625, "rewards/rejected": -3.09765625, "step": 617 }, { "epoch": 0.42372300308536165, "grad_norm": 0.4230903980816782, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73774811.42857143, "logits/rejected": -114061767.1111111, "logps/chosen": -273.42857142857144, "logps/rejected": -292.8888888888889, "loss": 0.271, "rewards/chosen": 0.5714285714285714, "rewards/margins": 3.191220238095238, "rewards/rejected": -2.6197916666666665, "step": 618 }, { "epoch": 0.4244086390126843, "grad_norm": 0.5207833812004047, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96239167.12328768, "logits/rejected": -86936482.9090909, "logps/chosen": -269.1506849315069, "logps/rejected": -317.3818181818182, "loss": 0.3316, "rewards/chosen": 1.9305465907266695, "rewards/margins": 4.326001136181215, "rewards/rejected": -2.3954545454545455, "step": 619 }, { "epoch": 0.42509427494000684, "grad_norm": 0.4467629292576887, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82284618.47272727, "logits/rejected": -97675572.60273972, "logps/chosen": -306.6181818181818, "logps/rejected": -339.28767123287673, "loss": 0.2545, "rewards/chosen": 3.1203868519176137, "rewards/margins": 6.291619728629943, "rewards/rejected": -3.171232876712329, "step": 620 }, { "epoch": 0.42577991086732947, "grad_norm": 0.5367958921061569, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74746253.37313433, "logits/rejected": -92343447.08196722, "logps/chosen": -234.02985074626866, "logps/rejected": -326.2950819672131, "loss": 0.2846, "rewards/chosen": 0.6282649253731343, "rewards/margins": 3.1651501712747736, "rewards/rejected": -2.5368852459016393, "step": 621 }, { "epoch": 0.42646554679465204, "grad_norm": 0.44610003442495777, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -117307359.49206349, "logits/rejected": -94081465.1076923, "logps/chosen": -286.22222222222223, "logps/rejected": -294.4, "loss": 0.2809, "rewards/chosen": 2.685518295045883, "rewards/margins": 4.954749064276652, "rewards/rejected": -2.269230769230769, "step": 622 }, { "epoch": 0.4271511827219746, "grad_norm": 0.466289372900567, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95630131.2, "logits/rejected": -116083531.29411764, "logps/chosen": -297.6, "logps/rejected": -392.70588235294116, "loss": 0.2719, "rewards/chosen": 0.56953125, "rewards/margins": 3.793795955882353, "rewards/rejected": -3.224264705882353, "step": 623 }, { "epoch": 0.42783681864929723, "grad_norm": 0.4273917022384625, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80982331.07692307, "logits/rejected": -96069534.47619048, "logps/chosen": -256.73846153846154, "logps/rejected": -345.3968253968254, "loss": 0.2694, "rewards/chosen": 0.8192307692307692, "rewards/margins": 3.7081196581196583, "rewards/rejected": -2.888888888888889, "step": 624 }, { "epoch": 0.4285224545766198, "grad_norm": 0.45249177607323277, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92008382.98412699, "logits/rejected": -87176995.44615385, "logps/chosen": -308.3174603174603, "logps/rejected": -331.32307692307694, "loss": 0.2754, "rewards/chosen": 2.540134006076389, "rewards/margins": 4411964.140134006, "rewards/rejected": -4411961.6, "step": 625 }, { "epoch": 0.42920809050394243, "grad_norm": 0.4912572494488539, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -106452617.0140845, "logits/rejected": -78661596.07017544, "logps/chosen": -393.46478873239437, "logps/rejected": -286.3157894736842, "loss": 0.2945, "rewards/chosen": 0.8142605633802817, "rewards/margins": 3.594962317766247, "rewards/rejected": -2.780701754385965, "step": 626 }, { "epoch": 0.429893726431265, "grad_norm": 0.5670692896474406, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90410552.8888889, "logits/rejected": -82687707.42857143, "logps/chosen": -341.3333333333333, "logps/rejected": -356.2857142857143, "loss": 0.303, "rewards/chosen": 0.6727430555555556, "rewards/margins": 2.6147073412698414, "rewards/rejected": -1.9419642857142858, "step": 627 }, { "epoch": 0.43057936235858757, "grad_norm": 0.5335894889592879, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94102974.35897435, "logits/rejected": -93616865.28, "logps/chosen": -256.20512820512823, "logps/rejected": -253.6, "loss": 0.3163, "rewards/chosen": 0.500801282051282, "rewards/margins": 3.138301282051282, "rewards/rejected": -2.6375, "step": 628 }, { "epoch": 0.4312649982859102, "grad_norm": 0.4739562414350787, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78370282.9589041, "logits/rejected": -85335021.38181818, "logps/chosen": -207.34246575342465, "logps/rejected": -377.6, "loss": 0.3094, "rewards/chosen": 0.4340753424657534, "rewards/margins": 3.252257160647572, "rewards/rejected": -2.8181818181818183, "step": 629 }, { "epoch": 0.43195063421323276, "grad_norm": 0.5268713454361523, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102940203.88571429, "logits/rejected": -74702000.55172414, "logps/chosen": -299.65714285714284, "logps/rejected": -295.7241379310345, "loss": 0.3271, "rewards/chosen": 0.35044642857142855, "rewards/margins": 2.8138084975369457, "rewards/rejected": -2.4633620689655173, "step": 630 }, { "epoch": 0.4326362701405554, "grad_norm": 0.4393942724899644, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92754037.02857143, "logits/rejected": -76726836.96551724, "logps/chosen": -253.71428571428572, "logps/rejected": -297.37931034482756, "loss": 0.3008, "rewards/chosen": 0.5206473214285714, "rewards/margins": 2.3409560067313055, "rewards/rejected": -1.8203086853027344, "step": 631 }, { "epoch": 0.43332190606787796, "grad_norm": 0.4517581410584769, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90211915.5409836, "logits/rejected": -105045404.65671642, "logps/chosen": -281.7049180327869, "logps/rejected": -341.97014925373134, "loss": 0.2816, "rewards/chosen": 0.7202868852459017, "rewards/margins": 3.5001376315145585, "rewards/rejected": -2.779850746268657, "step": 632 }, { "epoch": 0.43400754199520053, "grad_norm": 0.47020239761176563, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83627968.98461539, "logits/rejected": -86415977.65079366, "logps/chosen": -321.7230769230769, "logps/rejected": -264.12698412698415, "loss": 0.2983, "rewards/chosen": 0.6783653846153846, "rewards/margins": 3.102968559218559, "rewards/rejected": -2.4246031746031744, "step": 633 }, { "epoch": 0.43469317792252316, "grad_norm": 0.4871595233138122, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85128836.74074075, "logits/rejected": -85586473.51351352, "logps/chosen": -297.77777777777777, "logps/rejected": -320.43243243243245, "loss": 0.265, "rewards/chosen": 2.854127954553675, "rewards/margins": 5.732506332932053, "rewards/rejected": -2.8783783783783785, "step": 634 }, { "epoch": 0.4353788138498457, "grad_norm": 0.4637832198033027, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79827076.12903225, "logits/rejected": -70286367.03030303, "logps/chosen": -225.80645161290323, "logps/rejected": -285.09090909090907, "loss": 0.286, "rewards/chosen": 0.5264616935483871, "rewards/margins": 3.179870784457478, "rewards/rejected": -2.653409090909091, "step": 635 }, { "epoch": 0.43606444977716835, "grad_norm": 0.5717861029635978, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95806733.4736842, "logits/rejected": -92113368.61538461, "logps/chosen": -238.73684210526315, "logps/rejected": -332.3076923076923, "loss": 0.3156, "rewards/chosen": 0.40542763157894735, "rewards/margins": 3.261196862348178, "rewards/rejected": -2.855769230769231, "step": 636 }, { "epoch": 0.4367500857044909, "grad_norm": 0.517207003947754, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89895829.01492538, "logits/rejected": -83473525.50819673, "logps/chosen": -255.044776119403, "logps/rejected": -281.7049180327869, "loss": 0.3034, "rewards/chosen": 0.5559701492537313, "rewards/margins": 2.0093469192732627, "rewards/rejected": -1.4533767700195312, "step": 637 }, { "epoch": 0.4374357216318135, "grad_norm": 0.47508092887739156, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87898022.95652173, "logits/rejected": -105852858.57627119, "logps/chosen": -295.18840579710144, "logps/rejected": -320.0, "loss": 0.2809, "rewards/chosen": 0.5271739130434783, "rewards/margins": 3.5483603537214448, "rewards/rejected": -3.0211864406779663, "step": 638 }, { "epoch": 0.4381213575591361, "grad_norm": 0.5626417480947475, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85004561.06666666, "logits/rejected": -94727960.1509434, "logps/chosen": -248.10666666666665, "logps/rejected": -284.07547169811323, "loss": 0.3304, "rewards/chosen": 0.40125, "rewards/margins": 3.167759433962264, "rewards/rejected": -2.766509433962264, "step": 639 }, { "epoch": 0.4388069934864587, "grad_norm": 0.4261971393255929, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80740352.0, "logits/rejected": -66959067.428571425, "logps/chosen": -205.11111111111111, "logps/rejected": -262.57142857142856, "loss": 0.3257, "rewards/chosen": 1.8730095757378473, "rewards/margins": 3.7100631471664185, "rewards/rejected": -1.8370535714285714, "step": 640 }, { "epoch": 0.43949262941378126, "grad_norm": 0.43493341869811514, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104500637.95744681, "logits/rejected": -96935025.77777778, "logps/chosen": -267.2340425531915, "logps/rejected": -319.60493827160496, "loss": 0.2453, "rewards/chosen": 0.7716090425531915, "rewards/margins": 3.54012756107171, "rewards/rejected": -2.7685185185185186, "step": 641 }, { "epoch": 0.4401782653411039, "grad_norm": 0.42676573461173234, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107018302.06060606, "logits/rejected": -103504598.70967741, "logps/chosen": -342.7878787878788, "logps/rejected": -347.35483870967744, "loss": 0.2629, "rewards/chosen": 1.0871212121212122, "rewards/margins": 3.885508308895406, "rewards/rejected": -2.7983870967741935, "step": 642 }, { "epoch": 0.44086390126842645, "grad_norm": 0.47135884454400295, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84372376.11594203, "logits/rejected": -109762802.98305085, "logps/chosen": -290.3188405797101, "logps/rejected": -328.6779661016949, "loss": 0.2789, "rewards/chosen": 0.7708333333333334, "rewards/margins": 3.4721045197740112, "rewards/rejected": -2.7012711864406778, "step": 643 }, { "epoch": 0.4415495371957491, "grad_norm": 0.485273651949064, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105825516.3076923, "logits/rejected": -92495440.84210527, "logps/chosen": -309.84615384615387, "logps/rejected": -286.10526315789474, "loss": 0.2632, "rewards/chosen": 2.471149738018329, "rewards/margins": 5.007333948544645, "rewards/rejected": -2.536184210526316, "step": 644 }, { "epoch": 0.44223517312307165, "grad_norm": 0.4807137001277921, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100985934.76923077, "logits/rejected": -92358574.08, "logps/chosen": -258.46153846153845, "logps/rejected": -273.6, "loss": 0.3116, "rewards/chosen": 0.6430288461538461, "rewards/margins": 2.2920879892202524, "rewards/rejected": -1.6490591430664063, "step": 645 }, { "epoch": 0.4429208090503942, "grad_norm": 0.5967790458045119, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -106954752.0, "logits/rejected": -104857600.0, "logps/chosen": -307.55555555555554, "logps/rejected": -383.14285714285717, "loss": 0.2808, "rewards/chosen": 0.6833767361111112, "rewards/margins": 2.3775344727531316, "rewards/rejected": -1.6941577366420202, "step": 646 }, { "epoch": 0.44360644497771684, "grad_norm": 0.47381943233166535, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104045799.22580644, "logits/rejected": -87699083.63636364, "logps/chosen": -310.19354838709677, "logps/rejected": -331.6363636363636, "loss": 0.2668, "rewards/chosen": 0.8462701612903226, "rewards/margins": 3.6038459188660803, "rewards/rejected": -2.757575757575758, "step": 647 }, { "epoch": 0.4442920809050394, "grad_norm": 0.435135218855459, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84560164.57142857, "logits/rejected": -72060472.8888889, "logps/chosen": -267.7142857142857, "logps/rejected": -307.55555555555554, "loss": 0.2401, "rewards/chosen": 1.1060267857142858, "rewards/margins": 3.795262896825397, "rewards/rejected": -2.689236111111111, "step": 648 }, { "epoch": 0.44497771683236204, "grad_norm": 0.707959364287756, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102216741.92592593, "logits/rejected": -112566050.5945946, "logps/chosen": -295.7037037037037, "logps/rejected": -434.1621621621622, "loss": 0.2892, "rewards/chosen": 0.50390625, "rewards/margins": 3.3721494932432434, "rewards/rejected": -2.8682432432432434, "step": 649 }, { "epoch": 0.4456633527596846, "grad_norm": 0.49041952769111646, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96910497.68421052, "logits/rejected": -103971479.43661971, "logps/chosen": -410.6666666666667, "logps/rejected": -325.40845070422534, "loss": 0.2672, "rewards/chosen": 0.6398026315789473, "rewards/margins": 3.3968448851000743, "rewards/rejected": -2.757042253521127, "step": 650 }, { "epoch": 0.4463489886870072, "grad_norm": 0.46639347944137466, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94741925.64705883, "logits/rejected": -89758105.6, "logps/chosen": -254.8235294117647, "logps/rejected": -346.1333333333333, "loss": 0.3001, "rewards/chosen": 0.6410845588235294, "rewards/margins": 2.9028053433287377, "rewards/rejected": -2.2617207845052083, "step": 651 }, { "epoch": 0.4470346246143298, "grad_norm": 0.5266414892455197, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90666871.46666667, "logits/rejected": -101156743.52941176, "logps/chosen": -247.73333333333332, "logps/rejected": -385.4117647058824, "loss": 0.2692, "rewards/chosen": 0.9239583333333333, "rewards/margins": 3.721752450980392, "rewards/rejected": -2.797794117647059, "step": 652 }, { "epoch": 0.44772026054165237, "grad_norm": 0.4971617557450823, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72047318.70967741, "logits/rejected": -103459498.66666667, "logps/chosen": -206.4516129032258, "logps/rejected": -374.7878787878788, "loss": 0.2758, "rewards/chosen": 2.3557847545992945, "rewards/margins": 5.317905966720506, "rewards/rejected": -2.962121212121212, "step": 653 }, { "epoch": 0.448405896468975, "grad_norm": 0.5417998562292199, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102608480.46376811, "logits/rejected": -85663327.45762712, "logps/chosen": -308.17391304347825, "logps/rejected": -334.64406779661016, "loss": 0.3005, "rewards/chosen": 0.7214673913043478, "rewards/margins": 3.5435012896094324, "rewards/rejected": -2.8220338983050848, "step": 654 }, { "epoch": 0.44909153239629757, "grad_norm": 0.43237421669502546, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88842984.72727273, "logits/rejected": -84021380.12903225, "logps/chosen": -262.54545454545456, "logps/rejected": -320.1290322580645, "loss": 0.2682, "rewards/chosen": 0.7838541666666666, "rewards/margins": 3.271757392473118, "rewards/rejected": -2.4879032258064515, "step": 655 }, { "epoch": 0.44977716832362014, "grad_norm": 0.42720229314953145, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75852921.49152543, "logits/rejected": -106498849.39130434, "logps/chosen": -209.89830508474577, "logps/rejected": -310.72463768115944, "loss": 0.2777, "rewards/chosen": 0.4896716101694915, "rewards/margins": 1.3848187402787533, "rewards/rejected": -0.8951471301092617, "step": 656 }, { "epoch": 0.45046280425094276, "grad_norm": 0.47833982459006047, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88612056.33802816, "logits/rejected": -87344541.19298245, "logps/chosen": -234.3661971830986, "logps/rejected": -333.4736842105263, "loss": 0.2981, "rewards/chosen": 0.6047535211267606, "rewards/margins": 3.3810693106004446, "rewards/rejected": -2.776315789473684, "step": 657 }, { "epoch": 0.45114844017826533, "grad_norm": 0.4717192107807279, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91037024.52459016, "logits/rejected": -81632424.11940299, "logps/chosen": -247.60655737704917, "logps/rejected": -310.2089552238806, "loss": 0.2841, "rewards/chosen": 0.2520491803278688, "rewards/margins": 3.21846709077563, "rewards/rejected": -2.966417910447761, "step": 658 }, { "epoch": 0.45183407610558796, "grad_norm": 0.44288021342776235, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85952838.49275362, "logits/rejected": -79194146.71186441, "logps/chosen": -261.5652173913044, "logps/rejected": -320.0, "loss": 0.2939, "rewards/chosen": 0.4201766304347826, "rewards/margins": 0.9426358868817503, "rewards/rejected": -0.5224592564469677, "step": 659 }, { "epoch": 0.4525197120329105, "grad_norm": 0.4275104559431811, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87162880.0, "logits/rejected": -78905344.0, "logps/chosen": -267.75, "logps/rejected": -309.75, "loss": 0.2729, "rewards/chosen": 2.742168426513672, "rewards/margins": 5.421855926513672, "rewards/rejected": -2.6796875, "step": 660 }, { "epoch": 0.4532053479602331, "grad_norm": 0.5117895589448934, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86542472.53333333, "logits/rejected": -102883809.88235295, "logps/chosen": -272.0, "logps/rejected": -374.5882352941176, "loss": 0.2853, "rewards/chosen": 0.7177083333333333, "rewards/margins": 3.287561274509804, "rewards/rejected": -2.5698529411764706, "step": 661 }, { "epoch": 0.4538909838875557, "grad_norm": 0.5448638444130548, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98904394.32258065, "logits/rejected": -106637001.6969697, "logps/chosen": -310.7096774193548, "logps/rejected": -359.5151515151515, "loss": 0.2847, "rewards/chosen": 0.5519153225806451, "rewards/margins": 2.8909304740957964, "rewards/rejected": -2.3390151515151514, "step": 662 }, { "epoch": 0.4545766198148783, "grad_norm": 0.5137183888078891, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91398266.26865672, "logits/rejected": -97912932.72131148, "logps/chosen": -375.4029850746269, "logps/rejected": -319.4754098360656, "loss": 0.2824, "rewards/chosen": 0.8675373134328358, "rewards/margins": 3.613438952777098, "rewards/rejected": -2.7459016393442623, "step": 663 }, { "epoch": 0.4552622557422009, "grad_norm": 0.4330540089062366, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90424259.76470588, "logits/rejected": -80530636.8, "logps/chosen": -240.47058823529412, "logps/rejected": -274.4, "loss": 0.2888, "rewards/chosen": 0.6833639705882353, "rewards/margins": 3.3812806372549016, "rewards/rejected": -2.6979166666666665, "step": 664 }, { "epoch": 0.4559478916695235, "grad_norm": 0.49763240495936917, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93261583.05882353, "logits/rejected": -92554308.26666667, "logps/chosen": -254.11764705882354, "logps/rejected": -298.1333333333333, "loss": 0.3118, "rewards/chosen": 0.40877757352941174, "rewards/margins": 2.898360906862745, "rewards/rejected": -2.4895833333333335, "step": 665 }, { "epoch": 0.45663352759684606, "grad_norm": 0.40880835408083627, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96043482.89855072, "logits/rejected": -119857568.54237288, "logps/chosen": -287.536231884058, "logps/rejected": -307.2542372881356, "loss": 0.2992, "rewards/chosen": 0.7608695652173914, "rewards/margins": 1.3048276149269054, "rewards/rejected": -0.543958049709514, "step": 666 }, { "epoch": 0.4573191635241687, "grad_norm": 0.5306770579367449, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79015275.35483871, "logits/rejected": -87063583.03030303, "logps/chosen": -224.25806451612902, "logps/rejected": -365.57575757575756, "loss": 0.2856, "rewards/chosen": 0.44524949596774194, "rewards/margins": 3.5437343444525906, "rewards/rejected": -3.0984848484848486, "step": 667 }, { "epoch": 0.45800479945149125, "grad_norm": 0.4978410620881906, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96043482.89855072, "logits/rejected": -100805475.79661018, "logps/chosen": -281.27536231884056, "logps/rejected": -301.0169491525424, "loss": 0.311, "rewards/chosen": 0.6354166666666666, "rewards/margins": -2.477225783181056, "rewards/rejected": 3.1126424498477223, "step": 668 }, { "epoch": 0.4586904353788139, "grad_norm": 0.499778123174176, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86402662.4, "logits/rejected": -81912289.88235295, "logps/chosen": -296.26666666666665, "logps/rejected": -291.7647058823529, "loss": 0.2997, "rewards/chosen": 2.31895751953125, "rewards/margins": 4.7932222254136025, "rewards/rejected": -2.474264705882353, "step": 669 }, { "epoch": 0.45937607130613645, "grad_norm": 0.494122943940786, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89172052.16438356, "logits/rejected": -82970959.12727273, "logps/chosen": -279.2328767123288, "logps/rejected": -369.45454545454544, "loss": 0.3075, "rewards/chosen": 0.4300085616438356, "rewards/margins": 3.030008561643836, "rewards/rejected": -2.6, "step": 670 }, { "epoch": 0.460061707233459, "grad_norm": 0.41563576165221255, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -117795961.49152543, "logits/rejected": -81089877.33333333, "logps/chosen": -353.08474576271186, "logps/rejected": -291.2463768115942, "loss": 0.2563, "rewards/chosen": 1.1557203389830508, "rewards/margins": 2.826010194055515, "rewards/rejected": -1.6702898550724639, "step": 671 }, { "epoch": 0.46074734316078164, "grad_norm": 0.5765336327357349, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92407840.50793651, "logits/rejected": -86080023.63076924, "logps/chosen": -296.3809523809524, "logps/rejected": -309.16923076923075, "loss": 0.2966, "rewards/chosen": 0.6939484126984127, "rewards/margins": 3.4747176434676432, "rewards/rejected": -2.7807692307692307, "step": 672 }, { "epoch": 0.4614329790881042, "grad_norm": 0.5510642654712604, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109983971.55555555, "logits/rejected": -100534240.4923077, "logps/chosen": -257.5238095238095, "logps/rejected": -402.2153846153846, "loss": 0.3119, "rewards/chosen": 0.1511656746031746, "rewards/margins": 3.493473366910867, "rewards/rejected": -3.3423076923076924, "step": 673 }, { "epoch": 0.4621186150154268, "grad_norm": 0.4782083845986452, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80581476.84848484, "logits/rejected": -86862682.83870968, "logps/chosen": -244.36363636363637, "logps/rejected": -261.6774193548387, "loss": 0.3, "rewards/chosen": 0.5307765151515151, "rewards/margins": 2.871502321603128, "rewards/rejected": -2.340725806451613, "step": 674 }, { "epoch": 0.4628042509427494, "grad_norm": 0.4448103897636344, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107605592.27586207, "logits/rejected": -91435827.2, "logps/chosen": -316.41379310344826, "logps/rejected": -340.1142857142857, "loss": 0.2987, "rewards/chosen": 0.513739224137931, "rewards/margins": 3.270882081280788, "rewards/rejected": -2.757142857142857, "step": 675 }, { "epoch": 0.463489886870072, "grad_norm": 0.5195884462405251, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -70844416.0, "logits/rejected": -100925440.0, "logps/chosen": -172.25, "logps/rejected": -339.75, "loss": 0.2899, "rewards/chosen": 0.4609375, "rewards/margins": 1.7751381397247314, "rewards/rejected": -1.3142006397247314, "step": 676 }, { "epoch": 0.4641755227973946, "grad_norm": 0.40741212063646864, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96101070.59649123, "logits/rejected": -101608491.26760563, "logps/chosen": -225.40350877192984, "logps/rejected": -321.80281690140845, "loss": 0.273, "rewards/chosen": 2.802875184176261, "rewards/margins": 5.820480817979078, "rewards/rejected": -3.017605633802817, "step": 677 }, { "epoch": 0.46486115872471717, "grad_norm": 0.46963814476493354, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77398889.81333333, "logits/rejected": -95044511.39622642, "logps/chosen": -229.33333333333334, "logps/rejected": -329.0566037735849, "loss": 0.2991, "rewards/chosen": 0.5554166666666667, "rewards/margins": 3.225227987421384, "rewards/rejected": -2.669811320754717, "step": 678 }, { "epoch": 0.46554679465203974, "grad_norm": 0.5251506911325757, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89815958.06896552, "logits/rejected": -75197878.85714285, "logps/chosen": -352.82758620689657, "logps/rejected": -312.6857142857143, "loss": 0.2532, "rewards/chosen": 0.6169181034482759, "rewards/margins": 3.841918103448276, "rewards/rejected": -3.225, "step": 679 }, { "epoch": 0.46623243057936237, "grad_norm": 0.49961699134000465, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97336779.03448276, "logits/rejected": -87840709.48571429, "logps/chosen": -266.2068965517241, "logps/rejected": -326.62857142857143, "loss": 0.2919, "rewards/chosen": 2.246204507761988, "rewards/margins": 4.790847364904845, "rewards/rejected": -2.544642857142857, "step": 680 }, { "epoch": 0.46691806650668494, "grad_norm": 0.5736481159186657, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -71835778.03174603, "logits/rejected": -117698623.01538461, "logps/chosen": -202.66666666666666, "logps/rejected": -365.04615384615386, "loss": 0.2807, "rewards/chosen": 0.46478174603174605, "rewards/margins": 2.6179043985257366, "rewards/rejected": -2.1531226524939906, "step": 681 }, { "epoch": 0.46760370243400756, "grad_norm": 0.5045122903620213, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107460961.10344827, "logits/rejected": -98626062.62857144, "logps/chosen": -341.7931034482759, "logps/rejected": -278.1714285714286, "loss": 0.2838, "rewards/chosen": 2.859851179451778, "rewards/margins": 5.284851179451778, "rewards/rejected": -2.425, "step": 682 }, { "epoch": 0.46828933836133013, "grad_norm": 0.5168936272019826, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -113945258.66666667, "logits/rejected": -100663296.0, "logps/chosen": -308.74074074074076, "logps/rejected": -319.7837837837838, "loss": 0.2679, "rewards/chosen": 1.9768408316153068, "rewards/margins": 4.757246237020712, "rewards/rejected": -2.7804054054054053, "step": 683 }, { "epoch": 0.4689749742886527, "grad_norm": 0.6530407091795432, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -113770496.0, "logits/rejected": -98740906.66666667, "logps/chosen": -335.6, "logps/rejected": -391.0, "loss": 0.3492, "rewards/chosen": 0.51171875, "rewards/margins": 9994056.51171875, "rewards/rejected": -9994056.0, "step": 684 }, { "epoch": 0.4696606102159753, "grad_norm": 0.4893011319327738, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107401686.03278689, "logits/rejected": -91961680.23880596, "logps/chosen": -240.0, "logps/rejected": -354.14925373134326, "loss": 0.2394, "rewards/chosen": 0.9692622950819673, "rewards/margins": 3.588665280156594, "rewards/rejected": -2.6194029850746268, "step": 685 }, { "epoch": 0.4703462461432979, "grad_norm": 0.4600678969605806, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90426350.6440678, "logits/rejected": -77807378.55072464, "logps/chosen": -193.89830508474577, "logps/rejected": -291.71014492753625, "loss": 0.237, "rewards/chosen": 0.9332627118644068, "rewards/margins": 3.5528279292557112, "rewards/rejected": -2.619565217391304, "step": 686 }, { "epoch": 0.4710318820706205, "grad_norm": 0.5603852799174678, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89866846.81481482, "logits/rejected": -89100620.1081081, "logps/chosen": -290.3703703703704, "logps/rejected": -288.43243243243245, "loss": 0.284, "rewards/chosen": 2.6695161042390048, "rewards/margins": 5.122218806941707, "rewards/rejected": -2.4527027027027026, "step": 687 }, { "epoch": 0.4717175179979431, "grad_norm": 0.48415049555831274, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82448032.91428572, "logits/rejected": -86634072.27586207, "logps/chosen": -274.51428571428573, "logps/rejected": -328.0, "loss": 0.3098, "rewards/chosen": 0.4880580357142857, "rewards/margins": 3.457885621921182, "rewards/rejected": -2.9698275862068964, "step": 688 }, { "epoch": 0.47240315392526566, "grad_norm": 0.5163233067745545, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84156680.25806452, "logits/rejected": -89542035.39393939, "logps/chosen": -261.5483870967742, "logps/rejected": -345.2121212121212, "loss": 0.2834, "rewards/chosen": 0.5463709677419355, "rewards/margins": 3.254704301075269, "rewards/rejected": -2.7083333333333335, "step": 689 }, { "epoch": 0.4730887898525883, "grad_norm": 0.42233802504480905, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -71769201.77777778, "logits/rejected": -87757745.23076923, "logps/chosen": -193.52380952380952, "logps/rejected": -369.7230769230769, "loss": 0.2703, "rewards/chosen": 0.6577380952380952, "rewards/margins": 3.638507326007326, "rewards/rejected": -2.980769230769231, "step": 690 }, { "epoch": 0.47377442577991086, "grad_norm": 0.48548366515845454, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81723392.0, "logits/rejected": -95682560.0, "logps/chosen": -299.0, "logps/rejected": -340.5, "loss": 0.2781, "rewards/chosen": 0.562744140625, "rewards/margins": 0.3727571964263916, "rewards/rejected": 0.1899869441986084, "step": 691 }, { "epoch": 0.4744600617072335, "grad_norm": 0.4238774228221741, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87067965.79310344, "logits/rejected": -91555664.45714286, "logps/chosen": -258.48275862068965, "logps/rejected": -354.0571428571429, "loss": 0.257, "rewards/chosen": 0.6196120689655172, "rewards/margins": 3.905326354679803, "rewards/rejected": -3.2857142857142856, "step": 692 }, { "epoch": 0.47514569763455605, "grad_norm": 0.6026604846050292, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94109696.0, "logits/rejected": -84934656.0, "logps/chosen": -430.5, "logps/rejected": -370.25, "loss": 0.2328, "rewards/chosen": 1.1728515625, "rewards/margins": 4.7783203125, "rewards/rejected": -3.60546875, "step": 693 }, { "epoch": 0.4758313335618786, "grad_norm": 0.5347738037032904, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94092219.73333333, "logits/rejected": -89437364.70588236, "logps/chosen": -283.8666666666667, "logps/rejected": -308.94117647058823, "loss": 0.2888, "rewards/chosen": 0.4405598958333333, "rewards/margins": 3.4865157781862743, "rewards/rejected": -3.045955882352941, "step": 694 }, { "epoch": 0.47651696948920125, "grad_norm": 0.5307974190919882, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87947231.49206349, "logits/rejected": -86209079.13846155, "logps/chosen": -278.3492063492063, "logps/rejected": -326.6461538461538, "loss": 0.3097, "rewards/chosen": 0.29811507936507936, "rewards/margins": 0.4774012732127356, "rewards/rejected": -0.17928619384765626, "step": 695 }, { "epoch": 0.4772026054165238, "grad_norm": 0.4621366694108747, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83755008.0, "logits/rejected": -92798976.0, "logps/chosen": -265.75, "logps/rejected": -348.5, "loss": 0.291, "rewards/chosen": 0.61181640625, "rewards/margins": 3.32275390625, "rewards/rejected": -2.7109375, "step": 696 }, { "epoch": 0.47788824134384644, "grad_norm": 0.604963142280322, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76607728.94117647, "logits/rejected": -82034049.66233766, "logps/chosen": -188.94117647058823, "logps/rejected": -341.61038961038963, "loss": 0.2631, "rewards/chosen": 2.7213676303040746, "rewards/margins": 5.552536461472906, "rewards/rejected": -2.831168831168831, "step": 697 }, { "epoch": 0.478573877271169, "grad_norm": 0.523778545694628, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91675501.71428572, "logits/rejected": -102876956.44444445, "logps/chosen": -332.42857142857144, "logps/rejected": -305.1111111111111, "loss": 0.2738, "rewards/chosen": 0.8035714285714286, "rewards/margins": 3.400793650793651, "rewards/rejected": -2.5972222222222223, "step": 698 }, { "epoch": 0.4792595131984916, "grad_norm": 0.4571204832388889, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73649134.6440678, "logits/rejected": -80542794.20289855, "logps/chosen": -216.135593220339, "logps/rejected": -303.30434782608694, "loss": 0.2904, "rewards/chosen": 0.3297139830508475, "rewards/margins": 3.125003838123311, "rewards/rejected": -2.795289855072464, "step": 699 }, { "epoch": 0.4799451491258142, "grad_norm": 0.44704499498432054, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82350137.69014084, "logits/rejected": -94261463.57894737, "logps/chosen": -229.18309859154928, "logps/rejected": -360.9824561403509, "loss": 0.3034, "rewards/chosen": 0.535431338028169, "rewards/margins": 3.616571688905362, "rewards/rejected": -3.081140350877193, "step": 700 }, { "epoch": 0.4806307850531368, "grad_norm": 0.5117845446380463, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83886080.0, "logits/rejected": -89478485.33333333, "logps/chosen": -271.05882352941177, "logps/rejected": -369.6, "loss": 0.2839, "rewards/chosen": 0.5002297794117647, "rewards/margins": 3.5335631127450977, "rewards/rejected": -3.033333333333333, "step": 701 }, { "epoch": 0.4813164209804594, "grad_norm": 0.4984355005758045, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96098906.35294117, "logits/rejected": -70184686.93333334, "logps/chosen": -263.05882352941177, "logps/rejected": -313.06666666666666, "loss": 0.2793, "rewards/chosen": 0.5560661764705882, "rewards/margins": 10678206.422732843, "rewards/rejected": -10678205.866666667, "step": 702 }, { "epoch": 0.482002056907782, "grad_norm": 0.5186169418872186, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83619774.98412699, "logits/rejected": -86273606.8923077, "logps/chosen": -250.79365079365078, "logps/rejected": -341.16923076923075, "loss": 0.2854, "rewards/chosen": 0.6785714285714286, "rewards/margins": 3.313186813186813, "rewards/rejected": -2.6346153846153846, "step": 703 }, { "epoch": 0.48268769283510454, "grad_norm": 0.4739295776777468, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85303872.90140845, "logits/rejected": -125387614.31578948, "logps/chosen": -267.2676056338028, "logps/rejected": -407.57894736842104, "loss": 0.3031, "rewards/chosen": 0.5273987676056338, "rewards/margins": 3.8124864869038797, "rewards/rejected": -3.2850877192982457, "step": 704 }, { "epoch": 0.48337332876242717, "grad_norm": 0.5659248934989475, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92754037.02857143, "logits/rejected": -95167311.44827586, "logps/chosen": -278.1714285714286, "logps/rejected": -376.2758620689655, "loss": 0.3098, "rewards/chosen": 0.39598214285714284, "rewards/margins": 3.4067580049261084, "rewards/rejected": -3.0107758620689653, "step": 705 }, { "epoch": 0.48405896468974974, "grad_norm": 0.4680182074404998, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102101343.08571428, "logits/rejected": -93359421.79310344, "logps/chosen": -308.1142857142857, "logps/rejected": -312.2758620689655, "loss": 0.3121, "rewards/chosen": 0.6072544642857143, "rewards/margins": 3.327082050492611, "rewards/rejected": -2.7198275862068964, "step": 706 }, { "epoch": 0.4847446006170723, "grad_norm": 0.6873748866257772, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104266852.95774648, "logits/rejected": -86608698.38596492, "logps/chosen": -301.7464788732394, "logps/rejected": -303.1578947368421, "loss": 0.2734, "rewards/chosen": 0.6830985915492958, "rewards/margins": 3.8980108722510503, "rewards/rejected": -3.2149122807017543, "step": 707 }, { "epoch": 0.48543023654439493, "grad_norm": 0.4481588733140598, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82104937.20547946, "logits/rejected": -91359567.12727273, "logps/chosen": -315.6164383561644, "logps/rejected": -274.90909090909093, "loss": 0.301, "rewards/chosen": 0.7133989726027398, "rewards/margins": 3.4361262453300125, "rewards/rejected": -2.7227272727272727, "step": 708 }, { "epoch": 0.4861158724717175, "grad_norm": 0.4970287104879086, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95123649.20754717, "logits/rejected": -87465219.41333333, "logps/chosen": -284.6792452830189, "logps/rejected": -359.25333333333333, "loss": 0.2514, "rewards/chosen": 0.568359375, "rewards/margins": 3.948359375, "rewards/rejected": -3.38, "step": 709 }, { "epoch": 0.48680150839904013, "grad_norm": 0.46390186564003183, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79879580.65671642, "logits/rejected": -86361406.95081967, "logps/chosen": -253.8507462686567, "logps/rejected": -324.1967213114754, "loss": 0.2823, "rewards/chosen": 0.5680970149253731, "rewards/margins": 3.7033429165647176, "rewards/rejected": -3.1352459016393444, "step": 710 }, { "epoch": 0.4874871443263627, "grad_norm": 0.4664039293201457, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107694923.29411764, "logits/rejected": -96608802.13333334, "logps/chosen": -325.1764705882353, "logps/rejected": -352.26666666666665, "loss": 0.2809, "rewards/chosen": 0.7720588235294118, "rewards/margins": 3.967892156862745, "rewards/rejected": -3.1958333333333333, "step": 711 }, { "epoch": 0.48817278025368527, "grad_norm": 0.4259310020895518, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83646405.48571429, "logits/rejected": -85910916.4137931, "logps/chosen": -247.54285714285714, "logps/rejected": -221.79310344827587, "loss": 0.325, "rewards/chosen": 0.34614955357142857, "rewards/margins": 2.8116667949507392, "rewards/rejected": -2.4655172413793105, "step": 712 }, { "epoch": 0.4888584161810079, "grad_norm": 0.491166095467593, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95510293.94285715, "logits/rejected": -83307555.31034483, "logps/chosen": -247.31428571428572, "logps/rejected": -324.6896551724138, "loss": 0.3143, "rewards/chosen": 0.3732142857142857, "rewards/margins": 2.679248768472906, "rewards/rejected": -2.3060344827586206, "step": 713 }, { "epoch": 0.48954405210833046, "grad_norm": 0.4334044242138106, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97824074.83076923, "logits/rejected": -105989396.31746031, "logps/chosen": -261.4153846153846, "logps/rejected": -353.015873015873, "loss": 0.2508, "rewards/chosen": 0.7584134615384616, "rewards/margins": 4.484603937728938, "rewards/rejected": -3.7261904761904763, "step": 714 }, { "epoch": 0.4902296880356531, "grad_norm": 0.45835603624215787, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94197077.33333333, "logits/rejected": -88005485.71428572, "logps/chosen": -264.44444444444446, "logps/rejected": -393.14285714285717, "loss": 0.2743, "rewards/chosen": 0.6753472222222222, "rewards/margins": 4.179811507936508, "rewards/rejected": -3.5044642857142856, "step": 715 }, { "epoch": 0.49091532396297566, "grad_norm": 0.45578377988145935, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102107236.72131148, "logits/rejected": -87078759.1641791, "logps/chosen": -273.8360655737705, "logps/rejected": -307.5820895522388, "loss": 0.2396, "rewards/chosen": 0.7902151639344263, "rewards/margins": 3.954394268412038, "rewards/rejected": -3.1641791044776117, "step": 716 }, { "epoch": 0.49160095989029823, "grad_norm": 0.8711844777831625, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109171741.25714286, "logits/rejected": -105653071.44827586, "logps/chosen": -216.68571428571428, "logps/rejected": -353.6551724137931, "loss": 0.3119, "rewards/chosen": 0.3634626116071429, "rewards/margins": 9382974.156566061, "rewards/rejected": -9382973.793103449, "step": 717 }, { "epoch": 0.49228659581762085, "grad_norm": 0.4489997436370703, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91619328.0, "logits/rejected": -120586240.0, "logps/chosen": -356.25, "logps/rejected": -282.0, "loss": 0.2754, "rewards/chosen": 0.7646484375, "rewards/margins": 3.8037109375, "rewards/rejected": -3.0390625, "step": 718 }, { "epoch": 0.4929722317449434, "grad_norm": 0.5369618701985296, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72532532.96551724, "logits/rejected": -105336949.02857143, "logps/chosen": -218.89655172413794, "logps/rejected": -373.0285714285714, "loss": 0.2666, "rewards/chosen": 0.7009698275862069, "rewards/margins": 3.5366841133004927, "rewards/rejected": -2.835714285714286, "step": 719 }, { "epoch": 0.49365786767226605, "grad_norm": 0.4931235517692624, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -117440512.0, "logits/rejected": -93672789.33333333, "logps/chosen": -317.16129032258067, "logps/rejected": -327.75757575757575, "loss": 0.3029, "rewards/chosen": 0.38810483870967744, "rewards/margins": 3.4184078690127078, "rewards/rejected": -3.0303030303030303, "step": 720 }, { "epoch": 0.4943435035995886, "grad_norm": 0.5386238298558534, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -116182220.8, "logits/rejected": -91781240.47058824, "logps/chosen": -375.46666666666664, "logps/rejected": -397.6470588235294, "loss": 0.2581, "rewards/chosen": 1.0833333333333333, "rewards/margins": 4.241421568627451, "rewards/rejected": -3.1580882352941178, "step": 721 }, { "epoch": 0.4950291395269112, "grad_norm": 0.4864317938389758, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101562075.42857143, "logits/rejected": -74565404.44444445, "logps/chosen": -314.85714285714283, "logps/rejected": -283.1111111111111, "loss": 0.2567, "rewards/chosen": 2.9921531677246094, "rewards/margins": 5.773403167724609, "rewards/rejected": -2.78125, "step": 722 }, { "epoch": 0.4957147754542338, "grad_norm": 0.4514799699575106, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107171698.7586207, "logits/rejected": -103539390.17142858, "logps/chosen": -276.6896551724138, "logps/rejected": -395.42857142857144, "loss": 0.2532, "rewards/chosen": 0.5123922413793104, "rewards/margins": 3.948106527093596, "rewards/rejected": -3.4357142857142855, "step": 723 }, { "epoch": 0.4964004113815564, "grad_norm": 0.5251792904027235, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76589140.16438356, "logits/rejected": -75649992.14545454, "logps/chosen": -191.013698630137, "logps/rejected": -300.8, "loss": 0.3149, "rewards/chosen": 0.3792808219178082, "rewards/margins": 3.320189912826899, "rewards/rejected": -2.940909090909091, "step": 724 }, { "epoch": 0.497086047308879, "grad_norm": 0.4777429070630456, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -108444033.85507247, "logits/rejected": -87298395.11864407, "logps/chosen": -270.84057971014494, "logps/rejected": -350.64406779661016, "loss": 0.2836, "rewards/chosen": 0.6367753623188406, "rewards/margins": 4.1155889216408745, "rewards/rejected": -3.4788135593220337, "step": 725 }, { "epoch": 0.4977716832362016, "grad_norm": 0.45655666677275547, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84749613.1764706, "logits/rejected": -83886080.0, "logps/chosen": -271.7647058823529, "logps/rejected": -296.8, "loss": 0.2773, "rewards/chosen": 0.703125, "rewards/margins": 3.548958333333333, "rewards/rejected": -2.845833333333333, "step": 726 }, { "epoch": 0.49845731916352415, "grad_norm": 0.46894426361337405, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87786046.87719299, "logits/rejected": -74493202.02816902, "logps/chosen": -199.71929824561403, "logps/rejected": -296.3380281690141, "loss": 0.2608, "rewards/chosen": 0.6644736842105263, "rewards/margins": 2.9919384729429206, "rewards/rejected": -2.3274647887323945, "step": 727 }, { "epoch": 0.4991429550908468, "grad_norm": 0.42290147272673534, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90963968.0, "logits/rejected": -104857600.0, "logps/chosen": -318.75, "logps/rejected": -342.0, "loss": 0.2504, "rewards/chosen": 2.3784213066101074, "rewards/margins": 5.562015056610107, "rewards/rejected": -3.18359375, "step": 728 }, { "epoch": 0.49982859101816934, "grad_norm": 0.4979592634226754, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88995504.87272727, "logits/rejected": -85839591.4520548, "logps/chosen": -226.61818181818182, "logps/rejected": -356.82191780821915, "loss": 0.2625, "rewards/chosen": 0.47670454545454544, "rewards/margins": 3.570882627646326, "rewards/rejected": -3.094178082191781, "step": 729 }, { "epoch": 0.500514226945492, "grad_norm": 0.4360609875458326, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78293674.66666667, "logits/rejected": -89433385.29032259, "logps/chosen": -235.63636363636363, "logps/rejected": -278.96774193548384, "loss": 0.2906, "rewards/chosen": 0.6036931818181818, "rewards/margins": 3.0674028592375366, "rewards/rejected": -2.463709677419355, "step": 730 }, { "epoch": 0.5011998628728145, "grad_norm": 0.7133223393072444, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81852478.06060606, "logits/rejected": -71709068.38709678, "logps/chosen": -261.3333333333333, "logps/rejected": -279.2258064516129, "loss": 0.3, "rewards/chosen": 0.3888494318181818, "rewards/margins": 3.3606236253665687, "rewards/rejected": -2.971774193548387, "step": 731 }, { "epoch": 0.5018854988001371, "grad_norm": 0.4465735172718752, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83035061.79710145, "logits/rejected": -85378967.86440678, "logps/chosen": -326.95652173913044, "logps/rejected": -288.271186440678, "loss": 0.2789, "rewards/chosen": 1.0144927536231885, "rewards/margins": 3.730594448538443, "rewards/rejected": -2.7161016949152543, "step": 732 }, { "epoch": 0.5025711347274597, "grad_norm": 0.5356112992332732, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92614766.7027027, "logits/rejected": -93439772.44444445, "logps/chosen": -286.27027027027026, "logps/rejected": -378.6666666666667, "loss": 0.2738, "rewards/chosen": 0.9641047297297297, "rewards/margins": 4.501141766766767, "rewards/rejected": -3.537037037037037, "step": 733 }, { "epoch": 0.5032567706547824, "grad_norm": 0.47685289549060206, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91788391.88405797, "logits/rejected": -93341036.47457626, "logps/chosen": -263.18840579710144, "logps/rejected": -301.0169491525424, "loss": 0.268, "rewards/chosen": 0.6005434782608695, "rewards/margins": 3.7700350036845984, "rewards/rejected": -3.169491525423729, "step": 734 }, { "epoch": 0.5039424065821049, "grad_norm": 0.6199433768337387, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95404765.6119403, "logits/rejected": -93649869.63934426, "logps/chosen": -322.86567164179104, "logps/rejected": -344.1311475409836, "loss": 0.2739, "rewards/chosen": 0.6529850746268657, "rewards/margins": 4.046427697577686, "rewards/rejected": -3.3934426229508197, "step": 735 }, { "epoch": 0.5046280425094275, "grad_norm": 0.5188166737423203, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80327276.60606061, "logits/rejected": -87742133.67741935, "logps/chosen": -230.06060606060606, "logps/rejected": -278.96774193548384, "loss": 0.3027, "rewards/chosen": 2.084116386644768, "rewards/margins": 4.8159712253544456, "rewards/rejected": -2.7318548387096775, "step": 736 }, { "epoch": 0.5053136784367501, "grad_norm": 0.5541016780579253, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97332525.1764706, "logits/rejected": -92694118.4, "logps/chosen": -282.8235294117647, "logps/rejected": -390.4, "loss": 0.3027, "rewards/chosen": 0.6300551470588235, "rewards/margins": 3.18422181372549, "rewards/rejected": -2.5541666666666667, "step": 737 }, { "epoch": 0.5059993143640726, "grad_norm": 0.4919239111912419, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94651460.26666667, "logits/rejected": -92027964.23529412, "logps/chosen": -306.6666666666667, "logps/rejected": -304.94117647058823, "loss": 0.2597, "rewards/chosen": 2.883283996582031, "rewards/margins": 5.591004584817325, "rewards/rejected": -2.707720588235294, "step": 738 }, { "epoch": 0.5066849502913953, "grad_norm": 0.5560110447237296, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102208565.89473684, "logits/rejected": -105260898.46153846, "logps/chosen": -284.63157894736844, "logps/rejected": -417.53846153846155, "loss": 0.2861, "rewards/chosen": 0.756578947368421, "rewards/margins": 3.999367408906883, "rewards/rejected": -3.2427884615384617, "step": 739 }, { "epoch": 0.5073705862187179, "grad_norm": 0.4679602693293843, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76038672.51612903, "logits/rejected": -98121293.57575758, "logps/chosen": -242.32258064516128, "logps/rejected": -322.90909090909093, "loss": 0.2731, "rewards/chosen": 2.652672060074345, "rewards/margins": 5.376156908559194, "rewards/rejected": -2.7234848484848486, "step": 740 }, { "epoch": 0.5080562221460404, "grad_norm": 0.46808035155842215, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86835200.0, "logits/rejected": -72548352.0, "logps/chosen": -267.875, "logps/rejected": -260.75, "loss": 0.3081, "rewards/chosen": 0.41650390625, "rewards/margins": 2.92236328125, "rewards/rejected": -2.505859375, "step": 741 }, { "epoch": 0.508741858073363, "grad_norm": 0.43915028944037116, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77256373.67741935, "logits/rejected": -104603399.75757575, "logps/chosen": -282.3225806451613, "logps/rejected": -325.57575757575756, "loss": 0.2561, "rewards/chosen": 0.7207661290322581, "rewards/margins": 3.758644916911046, "rewards/rejected": -3.037878787878788, "step": 742 }, { "epoch": 0.5094274940006857, "grad_norm": 0.44084457808012595, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85578518.45614035, "logits/rejected": -72780035.60563381, "logps/chosen": -272.280701754386, "logps/rejected": -338.4788732394366, "loss": 0.2347, "rewards/chosen": 0.8519736842105263, "rewards/margins": 4.158311712379541, "rewards/rejected": -3.306338028169014, "step": 743 }, { "epoch": 0.5101131299280083, "grad_norm": 0.4687879848631448, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85065728.0, "logits/rejected": -81264640.0, "logps/chosen": -247.75, "logps/rejected": -362.0, "loss": 0.2697, "rewards/chosen": 0.673828125, "rewards/margins": 3.505859375, "rewards/rejected": -2.83203125, "step": 744 }, { "epoch": 0.5107987658553308, "grad_norm": 0.4662108300131742, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95452191.03030303, "logits/rejected": -80232976.51612903, "logps/chosen": -323.8787878787879, "logps/rejected": -328.258064516129, "loss": 0.2672, "rewards/chosen": 0.8816287878787878, "rewards/margins": 3.974370723362659, "rewards/rejected": -3.092741935483871, "step": 745 }, { "epoch": 0.5114844017826534, "grad_norm": 0.48674776903860256, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81273234.8852459, "logits/rejected": -86765751.40298508, "logps/chosen": -283.5409836065574, "logps/rejected": -351.76119402985074, "loss": 0.2584, "rewards/chosen": 0.7453893442622951, "rewards/margins": 2.3657135502586804, "rewards/rejected": -1.6203242059963852, "step": 746 }, { "epoch": 0.512170037709976, "grad_norm": 0.5148510478556072, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82736955.61643836, "logits/rejected": -85792581.81818181, "logps/chosen": -327.013698630137, "logps/rejected": -358.4, "loss": 0.2537, "rewards/chosen": 0.9434931506849316, "rewards/margins": 3.8844022415940227, "rewards/rejected": -2.940909090909091, "step": 747 }, { "epoch": 0.5128556736372986, "grad_norm": 0.42716697958668903, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72926769.5483871, "logits/rejected": -100154895.51515152, "logps/chosen": -210.06451612903226, "logps/rejected": -311.75757575757575, "loss": 0.2838, "rewards/chosen": 2.431145944902974, "rewards/margins": 4.684219666123973, "rewards/rejected": -2.253073721220999, "step": 748 }, { "epoch": 0.5135413095646212, "grad_norm": 0.5128230258753446, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78779970.7826087, "logits/rejected": -85307877.96610169, "logps/chosen": -256.231884057971, "logps/rejected": -313.22033898305085, "loss": 0.2615, "rewards/chosen": 0.7789855072463768, "rewards/margins": 3.2909015870100156, "rewards/rejected": -2.5119160797636386, "step": 749 }, { "epoch": 0.5142269454919438, "grad_norm": 0.40077626778508435, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85843421.86666666, "logits/rejected": -84996336.94117647, "logps/chosen": -288.26666666666665, "logps/rejected": -321.1764705882353, "loss": 0.2516, "rewards/chosen": 0.68984375, "rewards/margins": 4.134696691176471, "rewards/rejected": -3.4448529411764706, "step": 750 }, { "epoch": 0.5149125814192663, "grad_norm": 0.5148551103454084, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90729418.10526316, "logits/rejected": -91920239.77464789, "logps/chosen": -247.2982456140351, "logps/rejected": -352.0, "loss": 0.2613, "rewards/chosen": 0.6089638157894737, "rewards/margins": 3.3871328298739805, "rewards/rejected": -2.778169014084507, "step": 751 }, { "epoch": 0.515598217346589, "grad_norm": 0.4531818659137674, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86338134.64615385, "logits/rejected": -97800517.07936507, "logps/chosen": -248.86153846153846, "logps/rejected": -353.5238095238095, "loss": 0.2775, "rewards/chosen": 0.6105769230769231, "rewards/margins": 3.9518467643467643, "rewards/rejected": -3.3412698412698414, "step": 752 }, { "epoch": 0.5162838532739116, "grad_norm": 0.46342236461775743, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91022656.95522387, "logits/rejected": -75153676.59016393, "logps/chosen": -225.43283582089552, "logps/rejected": -299.5409836065574, "loss": 0.2993, "rewards/chosen": 0.2953008395522388, "rewards/margins": 3.1231696920112553, "rewards/rejected": -2.8278688524590163, "step": 753 }, { "epoch": 0.5169694892012341, "grad_norm": 0.4386905503551421, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82348168.53333333, "logits/rejected": -86106593.88235295, "logps/chosen": -287.73333333333335, "logps/rejected": -332.70588235294116, "loss": 0.2545, "rewards/chosen": 0.8036458333333333, "rewards/margins": 3.932322303921569, "rewards/rejected": -3.1286764705882355, "step": 754 }, { "epoch": 0.5176551251285567, "grad_norm": 0.4701639399655455, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79101028.95774648, "logits/rejected": -87638878.31578948, "logps/chosen": -229.18309859154928, "logps/rejected": -289.12280701754383, "loss": 0.2892, "rewards/chosen": 0.5262984154929577, "rewards/margins": 2.249158919107952, "rewards/rejected": -1.7228605036149944, "step": 755 }, { "epoch": 0.5183407610558793, "grad_norm": 0.5702084286783491, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90732664.47058824, "logits/rejected": -95070890.66666667, "logps/chosen": -327.05882352941177, "logps/rejected": -376.53333333333336, "loss": 0.3047, "rewards/chosen": 0.5338350183823529, "rewards/margins": 3.2150850183823527, "rewards/rejected": -2.68125, "step": 756 }, { "epoch": 0.519026396983202, "grad_norm": 0.7297845492945687, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91086301.86666666, "logits/rejected": -70562996.70588236, "logps/chosen": -311.2, "logps/rejected": -295.7647058823529, "loss": 0.282, "rewards/chosen": 5.049409993489584, "rewards/margins": 7.626615875842525, "rewards/rejected": -2.577205882352941, "step": 757 }, { "epoch": 0.5197120329105245, "grad_norm": 0.4109712351992956, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100810464.56140351, "logits/rejected": -88139458.70422535, "logps/chosen": -268.35087719298247, "logps/rejected": -333.9718309859155, "loss": 0.2676, "rewards/chosen": 0.3715049342105263, "rewards/margins": 3.8996739482950336, "rewards/rejected": -3.528169014084507, "step": 758 }, { "epoch": 0.5203976688378471, "grad_norm": 0.5127407534089364, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84724940.8, "logits/rejected": -99144668.68965517, "logps/chosen": -239.31428571428572, "logps/rejected": -310.0689655172414, "loss": 0.2971, "rewards/chosen": 0.5767857142857142, "rewards/margins": 3.585406403940887, "rewards/rejected": -3.0086206896551726, "step": 759 }, { "epoch": 0.5210833047651697, "grad_norm": 0.5612887297296987, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95869805.71428572, "logits/rejected": -86099740.44444445, "logps/chosen": -374.85714285714283, "logps/rejected": -262.22222222222223, "loss": 0.2556, "rewards/chosen": 2.327530997140067, "rewards/margins": 5.1782254415845115, "rewards/rejected": -2.8506944444444446, "step": 760 }, { "epoch": 0.5217689406924922, "grad_norm": 0.4381188494184626, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84427280.51612903, "logits/rejected": -92655988.36363636, "logps/chosen": -286.7096774193548, "logps/rejected": -315.6363636363636, "loss": 0.2461, "rewards/chosen": 0.9959677419354839, "rewards/margins": 3.9656647116324537, "rewards/rejected": -2.9696969696969697, "step": 761 }, { "epoch": 0.5224545766198149, "grad_norm": 0.5464898691938399, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94401377.35211268, "logits/rejected": -91097339.50877193, "logps/chosen": -359.6619718309859, "logps/rejected": -327.57894736842104, "loss": 0.2844, "rewards/chosen": 0.9938380281690141, "rewards/margins": 4.103487150976032, "rewards/rejected": -3.1096491228070176, "step": 762 }, { "epoch": 0.5231402125471375, "grad_norm": 0.48404894168754825, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84144191.01538461, "logits/rejected": -90011095.36507936, "logps/chosen": -272.0, "logps/rejected": -310.85714285714283, "loss": 0.2881, "rewards/chosen": 0.4151442307692308, "rewards/margins": 3.7960966117216115, "rewards/rejected": -3.380952380952381, "step": 763 }, { "epoch": 0.52382584847446, "grad_norm": 0.5084626602898504, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72980889.6, "logits/rejected": -83153741.20634921, "logps/chosen": -213.16923076923078, "logps/rejected": -287.23809523809524, "loss": 0.3024, "rewards/chosen": 0.6509615384615385, "rewards/margins": 2.2620726495726498, "rewards/rejected": -1.6111111111111112, "step": 764 }, { "epoch": 0.5245114844017826, "grad_norm": 0.47460268952062257, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85621654.06896552, "logits/rejected": -76935519.08571428, "logps/chosen": -290.2068965517241, "logps/rejected": -325.48571428571427, "loss": 0.2552, "rewards/chosen": 2.8797952060041756, "rewards/margins": 6.04408092028989, "rewards/rejected": -3.164285714285714, "step": 765 }, { "epoch": 0.5251971203291053, "grad_norm": 0.49918597707229767, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85613146.35294117, "logits/rejected": -106814941.86666666, "logps/chosen": -266.8235294117647, "logps/rejected": -388.26666666666665, "loss": 0.2685, "rewards/chosen": 2.0661540311925553, "rewards/margins": 3.678105096256032, "rewards/rejected": -1.6119510650634765, "step": 766 }, { "epoch": 0.5258827562564279, "grad_norm": 0.5009752533399848, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78839808.0, "logits/rejected": -87097344.0, "logps/chosen": -213.875, "logps/rejected": -310.5, "loss": 0.2869, "rewards/chosen": 0.6201171875, "rewards/margins": 3.5693359375, "rewards/rejected": -2.94921875, "step": 767 }, { "epoch": 0.5265683921837504, "grad_norm": 0.618945022341287, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73514710.10909091, "logits/rejected": -76876421.26027398, "logps/chosen": -223.70909090909092, "logps/rejected": -300.71232876712327, "loss": 0.2392, "rewards/chosen": 0.7072443181818182, "rewards/margins": 4.118203222291407, "rewards/rejected": -3.410958904109589, "step": 768 }, { "epoch": 0.527254028111073, "grad_norm": 0.7243304931397313, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90247441.06666666, "logits/rejected": -118674130.8235294, "logps/chosen": -248.66666666666666, "logps/rejected": -346.5882352941176, "loss": 0.2798, "rewards/chosen": 0.5690104166666666, "rewards/margins": 3.3484221813725488, "rewards/rejected": -2.7794117647058822, "step": 769 }, { "epoch": 0.5279396640383957, "grad_norm": 0.465108678582054, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77894217.14285715, "logits/rejected": -88770585.92405063, "logps/chosen": -244.08163265306123, "logps/rejected": -317.56962025316454, "loss": 0.2639, "rewards/chosen": 0.27933673469387754, "rewards/margins": 3.295159519504004, "rewards/rejected": -3.0158227848101267, "step": 770 }, { "epoch": 0.5286252999657182, "grad_norm": 0.449155114272281, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79503971.34328358, "logits/rejected": -89386806.55737706, "logps/chosen": -229.97014925373134, "logps/rejected": -325.24590163934425, "loss": 0.2918, "rewards/chosen": 0.3582089552238806, "rewards/margins": 3.6082089552238807, "rewards/rejected": -3.25, "step": 771 }, { "epoch": 0.5293109358930408, "grad_norm": 0.6019685709009917, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77914528.54237288, "logits/rejected": -83764505.9710145, "logps/chosen": -241.89830508474577, "logps/rejected": -334.3768115942029, "loss": 0.2573, "rewards/chosen": 0.739406779661017, "rewards/margins": 3.706798084008843, "rewards/rejected": -2.967391304347826, "step": 772 }, { "epoch": 0.5299965718203634, "grad_norm": 0.5021899149738089, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76306899.08771929, "logits/rejected": -85540171.71830986, "logps/chosen": -234.24561403508773, "logps/rejected": -353.80281690140845, "loss": 0.2709, "rewards/chosen": 0.4199561403508772, "rewards/margins": 3.772068816407215, "rewards/rejected": -3.352112676056338, "step": 773 }, { "epoch": 0.5306822077476859, "grad_norm": 0.6134142245119002, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75497472.0, "logits/rejected": -85181379.76470588, "logps/chosen": -199.73333333333332, "logps/rejected": -354.8235294117647, "loss": 0.2622, "rewards/chosen": 0.6140625, "rewards/margins": 3.889797794117647, "rewards/rejected": -3.275735294117647, "step": 774 }, { "epoch": 0.5313678436750086, "grad_norm": 0.500141950762571, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -106637001.6969697, "logits/rejected": -84765530.83870968, "logps/chosen": -362.1818181818182, "logps/rejected": -386.3225806451613, "loss": 0.2625, "rewards/chosen": 3.0709898977568657, "rewards/margins": 14798378.29679635, "rewards/rejected": -14798375.225806452, "step": 775 }, { "epoch": 0.5320534796023312, "grad_norm": 0.5361699280539683, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82673269.20481928, "logits/rejected": -78060657.77777778, "logps/chosen": -255.2289156626506, "logps/rejected": -285.8666666666667, "loss": 0.2916, "rewards/chosen": 0.9231927710843374, "rewards/margins": 3.4454149933065596, "rewards/rejected": -2.522222222222222, "step": 776 }, { "epoch": 0.5327391155296538, "grad_norm": 0.5072764782463377, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81422758.6031746, "logits/rejected": -102341017.6, "logps/chosen": -261.3333333333333, "logps/rejected": -355.6923076923077, "loss": 0.2397, "rewards/chosen": 0.9761904761904762, "rewards/margins": 4.11080586080586, "rewards/rejected": -3.1346153846153846, "step": 777 }, { "epoch": 0.5334247514569763, "grad_norm": 0.5398570870950556, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79056275.39393939, "logits/rejected": -92004087.74193548, "logps/chosen": -223.03030303030303, "logps/rejected": -332.9032258064516, "loss": 0.2915, "rewards/chosen": 0.4715909090909091, "rewards/margins": 3.5240102639296187, "rewards/rejected": -3.0524193548387095, "step": 778 }, { "epoch": 0.534110387384299, "grad_norm": 0.45884017621611495, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103459498.66666667, "logits/rejected": -98196058.35294117, "logps/chosen": -355.73333333333335, "logps/rejected": -340.94117647058823, "loss": 0.2479, "rewards/chosen": 0.9583333333333334, "rewards/margins": 4.384803921568627, "rewards/rejected": -3.426470588235294, "step": 779 }, { "epoch": 0.5347960233116216, "grad_norm": 0.44015244198837145, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77324023.74193548, "logits/rejected": -117313411.87878788, "logps/chosen": -286.19354838709677, "logps/rejected": -386.42424242424244, "loss": 0.2682, "rewards/chosen": 0.7817540322580645, "rewards/margins": 4.293117668621701, "rewards/rejected": -3.5113636363636362, "step": 780 }, { "epoch": 0.5354816592389441, "grad_norm": 0.5105339156491023, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -111553769.54385965, "logits/rejected": -81404942.42253521, "logps/chosen": -204.6315789473684, "logps/rejected": -349.9718309859155, "loss": 0.2517, "rewards/chosen": 2.656241433662281, "rewards/margins": 5.839340025211577, "rewards/rejected": -3.183098591549296, "step": 781 }, { "epoch": 0.5361672951662667, "grad_norm": 0.5254615537018765, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87874106.75409836, "logits/rejected": -85451118.80597015, "logps/chosen": -287.4754098360656, "logps/rejected": -336.7164179104478, "loss": 0.2407, "rewards/chosen": 0.9654200819672131, "rewards/margins": 4.204226052116467, "rewards/rejected": -3.2388059701492535, "step": 782 }, { "epoch": 0.5368529310935893, "grad_norm": 0.5795425800842487, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82533078.70967741, "logits/rejected": -73336769.93939394, "logps/chosen": -272.0, "logps/rejected": -290.1818181818182, "loss": 0.3032, "rewards/chosen": 1.8412017822265625, "rewards/margins": 3.669327013420336, "rewards/rejected": -1.8281252311937737, "step": 783 }, { "epoch": 0.5375385670209119, "grad_norm": 0.5016830546469448, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72581120.0, "logits/rejected": -91947008.0, "logps/chosen": -255.5, "logps/rejected": -380.0, "loss": 0.2762, "rewards/chosen": 1.6691735982894897, "rewards/margins": 5.30589234828949, "rewards/rejected": -3.63671875, "step": 784 }, { "epoch": 0.5382242029482345, "grad_norm": 0.5555286812262459, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83738911.43859649, "logits/rejected": -91683940.95774648, "logps/chosen": -265.5438596491228, "logps/rejected": -297.46478873239437, "loss": 0.2456, "rewards/chosen": 0.9078947368421053, "rewards/margins": 3.8973313565604153, "rewards/rejected": -2.98943661971831, "step": 785 }, { "epoch": 0.5389098388755571, "grad_norm": 0.4554064728333996, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85876597.15254237, "logits/rejected": -84493950.14492753, "logps/chosen": -260.06779661016947, "logps/rejected": -324.6376811594203, "loss": 0.2466, "rewards/chosen": 0.555614406779661, "rewards/margins": 4.001266580692705, "rewards/rejected": -3.4456521739130435, "step": 786 }, { "epoch": 0.5395954748028796, "grad_norm": 0.47792759836825327, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79754377.5522388, "logits/rejected": -67521418.49180327, "logps/chosen": -261.0149253731343, "logps/rejected": -318.6885245901639, "loss": 0.2841, "rewards/chosen": 0.5251865671641791, "rewards/margins": 3.1153505015904086, "rewards/rejected": -2.5901639344262297, "step": 787 }, { "epoch": 0.5402811107302022, "grad_norm": 0.444650608292553, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102201207.46666667, "logits/rejected": -92953178.35294117, "logps/chosen": -323.46666666666664, "logps/rejected": -338.3529411764706, "loss": 0.2358, "rewards/chosen": 2.939045715332031, "rewards/margins": 5.780957480037913, "rewards/rejected": -2.8419117647058822, "step": 788 }, { "epoch": 0.5409667466575249, "grad_norm": 0.5295806330682833, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86936482.9090909, "logits/rejected": -91240476.05479452, "logps/chosen": -208.43636363636364, "logps/rejected": -339.7260273972603, "loss": 0.2477, "rewards/chosen": 0.5434659090909091, "rewards/margins": 3.8927809775840596, "rewards/rejected": -3.3493150684931505, "step": 789 }, { "epoch": 0.5416523825848475, "grad_norm": 0.44067905529945733, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86507520.0, "logits/rejected": -103481344.0, "logps/chosen": -305.25, "logps/rejected": -362.0, "loss": 0.2668, "rewards/chosen": 2.6772358417510986, "rewards/margins": 5.938954591751099, "rewards/rejected": -3.26171875, "step": 790 }, { "epoch": 0.54233801851217, "grad_norm": 0.5083459782846885, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88769001.07462686, "logits/rejected": -88699215.73770492, "logps/chosen": -271.04477611940297, "logps/rejected": -328.655737704918, "loss": 0.2588, "rewards/chosen": 0.8125, "rewards/margins": 2.959268288534196, "rewards/rejected": -2.146768288534196, "step": 791 }, { "epoch": 0.5430236544394926, "grad_norm": 0.4395773903945821, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91226112.0, "logits/rejected": -106299392.0, "logps/chosen": -235.5, "logps/rejected": -311.5, "loss": 0.2554, "rewards/chosen": 0.66015625, "rewards/margins": 3.890625, "rewards/rejected": -3.23046875, "step": 792 }, { "epoch": 0.5437092903668153, "grad_norm": 0.4146254372495561, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84737098.20289855, "logits/rejected": -84312619.38983051, "logps/chosen": -294.4927536231884, "logps/rejected": -373.6949152542373, "loss": 0.2549, "rewards/chosen": 0.9963768115942029, "rewards/margins": 4.432817489560304, "rewards/rejected": -3.4364406779661016, "step": 793 }, { "epoch": 0.5443949262941378, "grad_norm": 0.5181098955868506, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -110389218.31884058, "logits/rejected": -72938235.66101696, "logps/chosen": -284.7536231884058, "logps/rejected": -299.3898305084746, "loss": 0.2806, "rewards/chosen": 0.6179800724637681, "rewards/margins": 2.456167338668085, "rewards/rejected": -1.8381872662043168, "step": 794 }, { "epoch": 0.5450805622214604, "grad_norm": 0.6463829700492328, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93592897.82857142, "logits/rejected": -78317779.86206897, "logps/chosen": -338.2857142857143, "logps/rejected": -306.7586206896552, "loss": 0.2442, "rewards/chosen": 1.1794642857142856, "rewards/margins": 4.4596366995073895, "rewards/rejected": -3.2801724137931036, "step": 795 }, { "epoch": 0.545766198148783, "grad_norm": 0.4556799611313624, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87425024.0, "logits/rejected": -72810496.0, "logps/chosen": -311.5, "logps/rejected": -282.0, "loss": 0.2706, "rewards/chosen": 0.7587890625, "rewards/margins": 2.8706068992614746, "rewards/rejected": -2.1118178367614746, "step": 796 }, { "epoch": 0.5464518340761055, "grad_norm": 0.5614302260333566, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102791748.7761194, "logits/rejected": -78935426.09836066, "logps/chosen": -269.13432835820896, "logps/rejected": -342.2950819672131, "loss": 0.2894, "rewards/chosen": 0.5121268656716418, "rewards/margins": 3.5244219476388547, "rewards/rejected": -3.012295081967213, "step": 797 }, { "epoch": 0.5471374700034282, "grad_norm": 0.4580793581989081, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76715173.16129032, "logits/rejected": -90113985.93939394, "logps/chosen": -275.8709677419355, "logps/rejected": -333.09090909090907, "loss": 0.2528, "rewards/chosen": 0.9299395161290323, "rewards/margins": 4.403424364613881, "rewards/rejected": -3.4734848484848486, "step": 798 }, { "epoch": 0.5478231059307508, "grad_norm": 0.4684633585805961, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89361976.8888889, "logits/rejected": -79691776.0, "logps/chosen": -243.11111111111111, "logps/rejected": -316.2857142857143, "loss": 0.2787, "rewards/chosen": 0.7669270833333334, "rewards/margins": 4.005766369047619, "rewards/rejected": -3.2388392857142856, "step": 799 }, { "epoch": 0.5485087418580734, "grad_norm": 0.6057028798460544, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99509862.4, "logits/rejected": -79953920.0, "logps/chosen": -354.4, "logps/rejected": -351.0, "loss": 0.295, "rewards/chosen": 0.8509765625, "rewards/margins": 4.6322265625, "rewards/rejected": -3.78125, "step": 800 }, { "epoch": 0.5491943777853959, "grad_norm": 0.4358288626679221, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -65710762.666666664, "logits/rejected": -83432641.72972973, "logps/chosen": -175.40740740740742, "logps/rejected": -291.6756756756757, "loss": 0.235, "rewards/chosen": 0.49442997685185186, "rewards/margins": 3.866051598473473, "rewards/rejected": -3.3716216216216215, "step": 801 }, { "epoch": 0.5498800137127186, "grad_norm": 0.5431458572303783, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92724077.71428572, "logits/rejected": -84468622.22222222, "logps/chosen": -252.28571428571428, "logps/rejected": -326.6666666666667, "loss": 0.2539, "rewards/chosen": 0.7405133928571429, "rewards/margins": 3.914124503968254, "rewards/rejected": -3.173611111111111, "step": 802 }, { "epoch": 0.5505656496400412, "grad_norm": 0.6612954903744946, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88997888.0, "logits/rejected": -87425024.0, "logps/chosen": -304.0, "logps/rejected": -294.75, "loss": 0.2864, "rewards/chosen": 0.53173828125, "rewards/margins": 2.84423828125, "rewards/rejected": -2.3125, "step": 803 }, { "epoch": 0.5512512855673637, "grad_norm": 0.4689798788509223, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92764023.46666667, "logits/rejected": -88327107.76470588, "logps/chosen": -267.46666666666664, "logps/rejected": -303.05882352941177, "loss": 0.273, "rewards/chosen": 2.0301836649576823, "rewards/margins": 5.383124841428271, "rewards/rejected": -3.3529411764705883, "step": 804 }, { "epoch": 0.5519369214946863, "grad_norm": 0.5586389647832954, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96206848.0, "logits/rejected": -101777408.0, "logps/chosen": -272.75, "logps/rejected": -345.75, "loss": 0.2873, "rewards/chosen": 2.5483059883117676, "rewards/margins": 5.811977863311768, "rewards/rejected": -3.263671875, "step": 805 }, { "epoch": 0.5526225574220089, "grad_norm": 0.47581933838339396, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97168042.66666667, "logits/rejected": -95173692.23529412, "logps/chosen": -270.4, "logps/rejected": -350.11764705882354, "loss": 0.2499, "rewards/chosen": 2.181757609049479, "rewards/margins": 5.575139961990656, "rewards/rejected": -3.3933823529411766, "step": 806 }, { "epoch": 0.5533081933493315, "grad_norm": 0.6071282931572302, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -111898038.85714285, "logits/rejected": -100430279.1111111, "logps/chosen": -307.42857142857144, "logps/rejected": -351.77777777777777, "loss": 0.253, "rewards/chosen": 2.124434334891183, "rewards/margins": 5.0966565571134055, "rewards/rejected": -2.9722222222222223, "step": 807 }, { "epoch": 0.5539938292766541, "grad_norm": 0.4575268786344927, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -111038679.57894737, "logits/rejected": -68172208.67605634, "logps/chosen": -318.87719298245617, "logps/rejected": -297.9154929577465, "loss": 0.2469, "rewards/chosen": 0.9237938596491229, "rewards/margins": 4.209005127254757, "rewards/rejected": -3.285211267605634, "step": 808 }, { "epoch": 0.5546794652039767, "grad_norm": 0.5635997688762249, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104464384.0, "logits/rejected": -109707264.0, "logps/chosen": -367.5, "logps/rejected": -382.75, "loss": 0.2782, "rewards/chosen": 2.535122871398926, "rewards/margins": 6.046841621398926, "rewards/rejected": -3.51171875, "step": 809 }, { "epoch": 0.5553651011312993, "grad_norm": 0.46931186713583734, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77562848.96969697, "logits/rejected": -78947625.29032259, "logps/chosen": -235.63636363636363, "logps/rejected": -300.38709677419354, "loss": 0.286, "rewards/chosen": 0.6652462121212122, "rewards/margins": 2.47976234115347, "rewards/rejected": -1.814516129032258, "step": 810 }, { "epoch": 0.5560507370586218, "grad_norm": 0.558312759017508, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86522499.65714286, "logits/rejected": -87574174.89655173, "logps/chosen": -357.7142857142857, "logps/rejected": -391.7241379310345, "loss": 0.2925, "rewards/chosen": 0.7965401785714286, "rewards/margins": 3.9732643165024633, "rewards/rejected": -3.1767241379310347, "step": 811 }, { "epoch": 0.5567363729859445, "grad_norm": 0.6489078406864531, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -108311732.70588236, "logits/rejected": -85983232.0, "logps/chosen": -330.11764705882354, "logps/rejected": -344.26666666666665, "loss": 0.3007, "rewards/chosen": 0.6475183823529411, "rewards/margins": 2.8384012558880976, "rewards/rejected": -2.1908828735351564, "step": 812 }, { "epoch": 0.5574220089132671, "grad_norm": 0.697118910702933, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97114269.53846154, "logits/rejected": -94813345.68421052, "logps/chosen": -364.7692307692308, "logps/rejected": -376.0, "loss": 0.2236, "rewards/chosen": 1.1893028846153846, "rewards/margins": 4.4623292004048585, "rewards/rejected": -3.273026315789474, "step": 813 }, { "epoch": 0.5581076448405896, "grad_norm": 0.5734533938536286, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80401581.29230769, "logits/rejected": -88479841.52380952, "logps/chosen": -311.1384615384615, "logps/rejected": -365.2063492063492, "loss": 0.2613, "rewards/chosen": 0.8947115384615385, "rewards/margins": 4.351060744810745, "rewards/rejected": -3.4563492063492065, "step": 814 }, { "epoch": 0.5587932807679122, "grad_norm": 0.500144555245363, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78230124.60606061, "logits/rejected": -77121073.5483871, "logps/chosen": -269.57575757575756, "logps/rejected": -256.51612903225805, "loss": 0.2726, "rewards/chosen": 0.6174242424242424, "rewards/margins": 3.516617790811339, "rewards/rejected": -2.899193548387097, "step": 815 }, { "epoch": 0.5594789166952349, "grad_norm": 0.5805334740137232, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79768036.07272728, "logits/rejected": -98479959.67123288, "logps/chosen": -300.5090909090909, "logps/rejected": -373.47945205479454, "loss": 0.2262, "rewards/chosen": 1.0613636363636363, "rewards/margins": 4.41752801992528, "rewards/rejected": -3.356164383561644, "step": 816 }, { "epoch": 0.5601645526225574, "grad_norm": 0.48525403878600637, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85350757.58730158, "logits/rejected": -85563801.6, "logps/chosen": -243.55555555555554, "logps/rejected": -322.46153846153845, "loss": 0.2321, "rewards/chosen": 0.8273809523809523, "rewards/margins": 4.250457875457875, "rewards/rejected": -3.423076923076923, "step": 817 }, { "epoch": 0.56085018854988, "grad_norm": 0.9301644769469958, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88539136.0, "logits/rejected": -83820544.0, "logps/chosen": -214.5, "logps/rejected": -339.0, "loss": 0.2572, "rewards/chosen": 0.771484375, "rewards/margins": 0.4730863571166992, "rewards/rejected": 0.2983980178833008, "step": 818 }, { "epoch": 0.5615358244772026, "grad_norm": 0.5787403043176497, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90410552.8888889, "logits/rejected": -94072246.85714285, "logps/chosen": -238.88888888888889, "logps/rejected": -336.14285714285717, "loss": 0.2913, "rewards/chosen": 0.6558837890625, "rewards/margins": 3.4996337890625, "rewards/rejected": -2.84375, "step": 819 }, { "epoch": 0.5622214604045251, "grad_norm": 0.3988384843063598, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80024657.26984127, "logits/rejected": -80143470.27692308, "logps/chosen": -234.15873015873015, "logps/rejected": -338.46153846153845, "loss": 0.2614, "rewards/chosen": 0.7123015873015873, "rewards/margins": 4.143070818070818, "rewards/rejected": -3.4307692307692306, "step": 820 }, { "epoch": 0.5629070963318478, "grad_norm": 0.47157026036749083, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104303635.32075472, "logits/rejected": -81537269.76, "logps/chosen": -226.11320754716982, "logps/rejected": -321.28, "loss": 0.2478, "rewards/chosen": 4.491599028965212, "rewards/margins": 7.878265695631879, "rewards/rejected": -3.3866666666666667, "step": 821 }, { "epoch": 0.5635927322591704, "grad_norm": 0.6280636673226505, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104438169.6, "logits/rejected": -77717985.88235295, "logps/chosen": -268.8, "logps/rejected": -298.5882352941176, "loss": 0.2558, "rewards/chosen": 0.6734375, "rewards/margins": 3.7727022058823527, "rewards/rejected": -3.099264705882353, "step": 822 }, { "epoch": 0.564278368186493, "grad_norm": 0.5183852753409, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91093193.91549295, "logits/rejected": -74835213.4736842, "logps/chosen": -274.7042253521127, "logps/rejected": -362.10526315789474, "loss": 0.2513, "rewards/chosen": 0.960387323943662, "rewards/margins": 4.852931183592785, "rewards/rejected": -3.892543859649123, "step": 823 }, { "epoch": 0.5649640041138155, "grad_norm": 0.5443319239474332, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99321118.72, "logits/rejected": -85069088.82051282, "logps/chosen": -306.88, "logps/rejected": -372.5128205128205, "loss": 0.2573, "rewards/chosen": 2.99390380859375, "rewards/margins": 5.939416629106571, "rewards/rejected": -2.9455128205128207, "step": 824 }, { "epoch": 0.5656496400411382, "grad_norm": 0.5045809458325137, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -110900241.3559322, "logits/rejected": -106255701.33333333, "logps/chosen": -285.2881355932203, "logps/rejected": -326.95652173913044, "loss": 0.2392, "rewards/chosen": 2.992031291379767, "rewards/margins": 6.419567523263825, "rewards/rejected": -3.427536231884058, "step": 825 }, { "epoch": 0.5663352759684608, "grad_norm": 0.463287088231993, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102793736.12698413, "logits/rejected": -80014414.76923077, "logps/chosen": -242.53968253968253, "logps/rejected": -351.26153846153846, "loss": 0.2649, "rewards/chosen": 2.6403299967447915, "rewards/margins": 5.312684357471955, "rewards/rejected": -2.6723543607271636, "step": 826 }, { "epoch": 0.5670209118957833, "grad_norm": 0.5513394935602326, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84872975.05882353, "logits/rejected": -88360004.26666667, "logps/chosen": -254.35294117647058, "logps/rejected": -322.1333333333333, "loss": 0.2719, "rewards/chosen": 0.7711397058823529, "rewards/margins": -0.5109265121759153, "rewards/rejected": 1.2820662180582683, "step": 827 }, { "epoch": 0.5677065478231059, "grad_norm": 0.3824456738335056, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -59209591.46666667, "logits/rejected": -88450469.64705883, "logps/chosen": -241.33333333333334, "logps/rejected": -328.0, "loss": 0.2488, "rewards/chosen": 0.6359375, "rewards/margins": 4.3491727941176475, "rewards/rejected": -3.713235294117647, "step": 828 }, { "epoch": 0.5683921837504285, "grad_norm": 0.6243111001115721, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72105020.23529412, "logits/rejected": -101781777.06666666, "logps/chosen": -318.8235294117647, "logps/rejected": -342.93333333333334, "loss": 0.263, "rewards/chosen": 0.6879595588235294, "rewards/margins": 4.5171262254901965, "rewards/rejected": -3.8291666666666666, "step": 829 }, { "epoch": 0.5690778196777511, "grad_norm": 0.7955071744783527, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104275057.77777778, "logits/rejected": -69880100.57142857, "logps/chosen": -282.8888888888889, "logps/rejected": -303.7142857142857, "loss": 0.2792, "rewards/chosen": 0.5327690972222222, "rewards/margins": 3.755983382936508, "rewards/rejected": -3.2232142857142856, "step": 830 }, { "epoch": 0.5697634556050737, "grad_norm": 0.513650821751055, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78293674.66666667, "logits/rejected": -84015135.5076923, "logps/chosen": -250.15873015873015, "logps/rejected": -307.2, "loss": 0.2333, "rewards/chosen": 0.9136904761904762, "rewards/margins": 4.302152014652014, "rewards/rejected": -3.3884615384615384, "step": 831 }, { "epoch": 0.5704490915323963, "grad_norm": 0.5950897966125482, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88019596.98550725, "logits/rejected": -91990328.40677966, "logps/chosen": -341.1014492753623, "logps/rejected": -326.77966101694915, "loss": 0.274, "rewards/chosen": 0.5981657608695652, "rewards/margins": 4.204097964259396, "rewards/rejected": -3.6059322033898304, "step": 832 }, { "epoch": 0.5711347274597189, "grad_norm": 0.49350341684977855, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76685858.13333334, "logits/rejected": -92891497.41176471, "logps/chosen": -240.0, "logps/rejected": -299.29411764705884, "loss": 0.2379, "rewards/chosen": 0.7729166666666667, "rewards/margins": 4.258210784313725, "rewards/rejected": -3.485294117647059, "step": 833 }, { "epoch": 0.5718203633870415, "grad_norm": 0.601409958335155, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99130761.84615384, "logits/rejected": -96800121.26315789, "logps/chosen": -336.0, "logps/rejected": -438.7368421052632, "loss": 0.2459, "rewards/chosen": 0.7111628605769231, "rewards/margins": 4.010504965840081, "rewards/rejected": -3.299342105263158, "step": 834 }, { "epoch": 0.5725059993143641, "grad_norm": 0.5711490209972662, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94072246.85714285, "logits/rejected": -100446349.2413793, "logps/chosen": -342.1714285714286, "logps/rejected": -361.37931034482756, "loss": 0.2929, "rewards/chosen": 0.6423549107142857, "rewards/margins": 4.094941117610837, "rewards/rejected": -3.4525862068965516, "step": 835 }, { "epoch": 0.5731916352416867, "grad_norm": 0.48451875949422, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82167102.95081967, "logits/rejected": -87329165.37313433, "logps/chosen": -289.57377049180326, "logps/rejected": -314.02985074626866, "loss": 0.2622, "rewards/chosen": 0.7822745901639344, "rewards/margins": 3.8830208588206507, "rewards/rejected": -3.1007462686567164, "step": 836 }, { "epoch": 0.5738772711690092, "grad_norm": 0.4999375715457139, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86854356.67692308, "logits/rejected": -84085808.76190476, "logps/chosen": -234.33846153846153, "logps/rejected": -371.8095238095238, "loss": 0.2604, "rewards/chosen": 0.739423076923077, "rewards/margins": 4.223550061050061, "rewards/rejected": -3.484126984126984, "step": 837 }, { "epoch": 0.5745629070963318, "grad_norm": 0.559291937279088, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103083086.76923077, "logits/rejected": -79029517.4736842, "logps/chosen": -378.46153846153845, "logps/rejected": -320.42105263157896, "loss": 0.2531, "rewards/chosen": 0.6670673076923077, "rewards/margins": 3.7756199392712553, "rewards/rejected": -3.1085526315789473, "step": 838 }, { "epoch": 0.5752485430236545, "grad_norm": 0.49966892646538486, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77594624.0, "logits/rejected": -113627508.36363636, "logps/chosen": -234.32258064516128, "logps/rejected": -343.030303030303, "loss": 0.2364, "rewards/chosen": 0.745148689516129, "rewards/margins": 1825118.199694144, "rewards/rejected": -1825117.4545454546, "step": 839 }, { "epoch": 0.575934178950977, "grad_norm": 0.4804174784749807, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86390142.08955224, "logits/rejected": -84023598.16393442, "logps/chosen": -260.7761194029851, "logps/rejected": -287.4754098360656, "loss": 0.2904, "rewards/chosen": 0.6746735074626866, "rewards/margins": 3.8959849828725224, "rewards/rejected": -3.221311475409836, "step": 840 }, { "epoch": 0.5766198148782996, "grad_norm": 0.7027032570242294, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81657856.0, "logits/rejected": -93192192.0, "logps/chosen": -286.5, "logps/rejected": -337.5, "loss": 0.2672, "rewards/chosen": 0.74365234375, "rewards/margins": 4.30615234375, "rewards/rejected": -3.5625, "step": 841 }, { "epoch": 0.5773054508056222, "grad_norm": 0.5155185654625967, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77894217.14285715, "logits/rejected": -89177355.81538461, "logps/chosen": -224.88888888888889, "logps/rejected": -426.83076923076925, "loss": 0.2331, "rewards/chosen": 1.1349206349206349, "rewards/margins": 4.884920634920634, "rewards/rejected": -3.75, "step": 842 }, { "epoch": 0.5779910867329449, "grad_norm": 0.4729508411991597, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99580894.96774194, "logits/rejected": -89351385.21212122, "logps/chosen": -278.19354838709677, "logps/rejected": -352.0, "loss": 0.2405, "rewards/chosen": 0.8618951612903226, "rewards/margins": 4.430076979472141, "rewards/rejected": -3.5681818181818183, "step": 843 }, { "epoch": 0.5786767226602674, "grad_norm": 0.6005753631866386, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85919681.93939394, "logits/rejected": -81450677.67741935, "logps/chosen": -323.1515151515151, "logps/rejected": -319.48387096774195, "loss": 0.2863, "rewards/chosen": 0.8726325757575758, "rewards/margins": 3.4976325757575757, "rewards/rejected": -2.625, "step": 844 }, { "epoch": 0.57936235858759, "grad_norm": 0.597193590764772, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91459128.8888889, "logits/rejected": -88305078.85714285, "logps/chosen": -268.22222222222223, "logps/rejected": -382.57142857142856, "loss": 0.297, "rewards/chosen": 0.6623263888888888, "rewards/margins": 3.887772817460317, "rewards/rejected": -3.2254464285714284, "step": 845 }, { "epoch": 0.5800479945149126, "grad_norm": 0.6256855088791459, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96468992.0, "logits/rejected": -80977127.22580644, "logps/chosen": -272.969696969697, "logps/rejected": -335.2258064516129, "loss": 0.2545, "rewards/chosen": 0.8797348484848485, "rewards/margins": 3.201553538631018, "rewards/rejected": -2.3218186901461695, "step": 846 }, { "epoch": 0.5807336304422351, "grad_norm": 0.557278451599367, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86435558.90196079, "logits/rejected": -68307236.57142857, "logps/chosen": -242.35294117647058, "logps/rejected": -292.987012987013, "loss": 0.2591, "rewards/chosen": 2.810616287530637, "rewards/margins": 5.742434469348819, "rewards/rejected": -2.9318181818181817, "step": 847 }, { "epoch": 0.5814192663695578, "grad_norm": 0.6494634409212011, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91711274.02985075, "logits/rejected": -81273234.8852459, "logps/chosen": -276.7761194029851, "logps/rejected": -296.91803278688525, "loss": 0.2873, "rewards/chosen": 0.47755363805970147, "rewards/margins": 3.446815933141669, "rewards/rejected": -2.9692622950819674, "step": 848 }, { "epoch": 0.5821049022968804, "grad_norm": 0.5184715851846271, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -114085068.8, "logits/rejected": -79402513.65517241, "logps/chosen": -300.34285714285716, "logps/rejected": -297.1034482758621, "loss": 0.2642, "rewards/chosen": 0.8678571428571429, "rewards/margins": 4.100615763546799, "rewards/rejected": -3.2327586206896552, "step": 849 }, { "epoch": 0.5827905382242029, "grad_norm": 0.4797933993623713, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79051966.91525424, "logits/rejected": -77685804.52173913, "logps/chosen": -238.3728813559322, "logps/rejected": -301.9130434782609, "loss": 0.2499, "rewards/chosen": 0.743114406779661, "rewards/margins": 3.840940493736183, "rewards/rejected": -3.097826086956522, "step": 850 }, { "epoch": 0.5834761741515255, "grad_norm": 0.621241586250268, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107454073.90476191, "logits/rejected": -81692136.36923076, "logps/chosen": -310.85714285714283, "logps/rejected": -352.73846153846154, "loss": 0.2533, "rewards/chosen": 0.7132936507936508, "rewards/margins": 4.217139804639805, "rewards/rejected": -3.503846153846154, "step": 851 }, { "epoch": 0.5841618100788482, "grad_norm": 0.47022950211912606, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83113445.05263157, "logits/rejected": -76142749.53846154, "logps/chosen": -258.94736842105266, "logps/rejected": -320.3076923076923, "loss": 0.2789, "rewards/chosen": 0.7467105263157895, "rewards/margins": 2.5276515358372738, "rewards/rejected": -1.7809410095214844, "step": 852 }, { "epoch": 0.5848474460061707, "grad_norm": 0.46762545959455637, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97145492.64516129, "logits/rejected": -73463870.06060606, "logps/chosen": -283.61290322580646, "logps/rejected": -320.72727272727275, "loss": 0.2518, "rewards/chosen": 4.69703625094506, "rewards/margins": 7.988702917611727, "rewards/rejected": -3.2916666666666665, "step": 853 }, { "epoch": 0.5855330819334933, "grad_norm": 0.6098242248264706, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101135893.63380282, "logits/rejected": -75276719.15789473, "logps/chosen": -258.92957746478874, "logps/rejected": -324.49122807017545, "loss": 0.307, "rewards/chosen": 0.6078345070422535, "rewards/margins": 3.0902906473931306, "rewards/rejected": -2.482456140350877, "step": 854 }, { "epoch": 0.5862187178608159, "grad_norm": 0.7668559478135735, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93739365.58730158, "logits/rejected": -82466469.41538462, "logps/chosen": -279.87301587301585, "logps/rejected": -383.26153846153846, "loss": 0.244, "rewards/chosen": 0.8338293650793651, "rewards/margins": 4.8569062881562886, "rewards/rejected": -4.023076923076923, "step": 855 }, { "epoch": 0.5869043537881385, "grad_norm": 0.5637779308127913, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90410552.8888889, "logits/rejected": -96693686.85714285, "logps/chosen": -262.22222222222223, "logps/rejected": -374.0, "loss": 0.2622, "rewards/chosen": 0.8489583333333334, "rewards/margins": 4.576636904761905, "rewards/rejected": -3.7276785714285716, "step": 856 }, { "epoch": 0.5875899897154611, "grad_norm": 0.5576578391469712, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96763329.12280701, "logits/rejected": -75733770.81690142, "logps/chosen": -237.75438596491227, "logps/rejected": -314.8169014084507, "loss": 0.2272, "rewards/chosen": 2.6310074120237115, "rewards/margins": 5.8528383979392045, "rewards/rejected": -3.221830985915493, "step": 857 }, { "epoch": 0.5882756256427837, "grad_norm": 0.4712018542488198, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83822529.93939394, "logits/rejected": -93357089.03225806, "logps/chosen": -269.09090909090907, "logps/rejected": -360.51612903225805, "loss": 0.2668, "rewards/chosen": 0.6318063446969697, "rewards/margins": 4.228580538245357, "rewards/rejected": -3.596774193548387, "step": 858 }, { "epoch": 0.5889612615701063, "grad_norm": 0.6130963590696513, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105057328.76190476, "logits/rejected": -89822633.35384615, "logps/chosen": -295.1111111111111, "logps/rejected": -344.12307692307695, "loss": 0.2531, "rewards/chosen": 0.7857142857142857, "rewards/margins": 3.9164835164835163, "rewards/rejected": -3.1307692307692307, "step": 859 }, { "epoch": 0.5896468974974288, "grad_norm": 0.5485098498702126, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88779434.66666667, "logits/rejected": -92274688.0, "logps/chosen": -352.0, "logps/rejected": -236.0, "loss": 0.248, "rewards/chosen": 1.4166666666666667, "rewards/margins": 4.126225490196078, "rewards/rejected": -2.7095588235294117, "step": 860 }, { "epoch": 0.5903325334247514, "grad_norm": 0.6559599324550974, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90252434.28571428, "logits/rejected": -77303352.8888889, "logps/chosen": -287.14285714285717, "logps/rejected": -324.22222222222223, "loss": 0.2712, "rewards/chosen": 0.431640625, "rewards/margins": 3.6260850694444446, "rewards/rejected": -3.1944444444444446, "step": 861 }, { "epoch": 0.5910181693520741, "grad_norm": 0.5341634439098352, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95270619.42857143, "logits/rejected": -76654521.37931034, "logps/chosen": -350.4, "logps/rejected": -369.1034482758621, "loss": 0.2878, "rewards/chosen": 0.95625, "rewards/margins": 4.051077586206897, "rewards/rejected": -3.0948275862068964, "step": 862 }, { "epoch": 0.5917038052793966, "grad_norm": 0.4949465203307694, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89600059.36231884, "logits/rejected": -100663296.0, "logps/chosen": -288.231884057971, "logps/rejected": -332.47457627118644, "loss": 0.2772, "rewards/chosen": 0.6557971014492754, "rewards/margins": 4.07105133873741, "rewards/rejected": -3.4152542372881354, "step": 863 }, { "epoch": 0.5923894412067192, "grad_norm": 0.8847789626803256, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94308289.93939394, "logits/rejected": -103369298.58064516, "logps/chosen": -335.5151515151515, "logps/rejected": -435.0967741935484, "loss": 0.2884, "rewards/chosen": 0.6789772727272727, "rewards/margins": 4.0297837243401755, "rewards/rejected": -3.350806451612903, "step": 864 }, { "epoch": 0.5930750771340418, "grad_norm": 0.6046625695374075, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91448537.21212122, "logits/rejected": -99851495.22580644, "logps/chosen": -318.3030303030303, "logps/rejected": -357.6774193548387, "loss": 0.2737, "rewards/chosen": 0.8731060606060606, "rewards/margins": 3.9013318670576735, "rewards/rejected": -3.028225806451613, "step": 865 }, { "epoch": 0.5937607130613645, "grad_norm": 0.4918189604332254, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84004229.40845071, "logits/rejected": -84989844.21052632, "logps/chosen": -344.3380281690141, "logps/rejected": -343.29824561403507, "loss": 0.2679, "rewards/chosen": 0.9929577464788732, "rewards/margins": 4.207870027180627, "rewards/rejected": -3.2149122807017543, "step": 866 }, { "epoch": 0.594446348988687, "grad_norm": 0.5069018569817495, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77201408.0, "logits/rejected": -78577664.0, "logps/chosen": -289.0, "logps/rejected": -320.25, "loss": 0.2506, "rewards/chosen": 0.845703125, "rewards/margins": 4.091796875, "rewards/rejected": -3.24609375, "step": 867 }, { "epoch": 0.5951319849160096, "grad_norm": 0.5849705660775436, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103434532.57142857, "logits/rejected": -76371285.33333333, "logps/chosen": -273.0, "logps/rejected": -320.8888888888889, "loss": 0.2362, "rewards/chosen": 0.7644391741071429, "rewards/margins": 4.358189174107143, "rewards/rejected": -3.59375, "step": 868 }, { "epoch": 0.5958176208433322, "grad_norm": 0.5644297215855826, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87829977.79104477, "logits/rejected": -86086370.62295082, "logps/chosen": -308.05970149253733, "logps/rejected": -379.27868852459017, "loss": 0.2838, "rewards/chosen": 0.5097947761194029, "rewards/margins": 3.948319366283337, "rewards/rejected": -3.4385245901639343, "step": 869 }, { "epoch": 0.5965032567706547, "grad_norm": 0.5594519790572806, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89590333.44, "logits/rejected": -76921952.60377358, "logps/chosen": -302.93333333333334, "logps/rejected": -321.50943396226415, "loss": 0.2844, "rewards/chosen": 0.8791666666666667, "rewards/margins": 3.2230622009661216, "rewards/rejected": -2.3438955342994547, "step": 870 }, { "epoch": 0.5971888926979774, "grad_norm": 0.4669335123805718, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -110554341.25373134, "logits/rejected": -80173089.5737705, "logps/chosen": -249.07462686567163, "logps/rejected": -235.54098360655738, "loss": 0.2754, "rewards/chosen": 0.6007462686567164, "rewards/margins": 3330581.0597626623, "rewards/rejected": -3330580.4590163934, "step": 871 }, { "epoch": 0.5978745286253, "grad_norm": 0.4746821216822704, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96333691.87096775, "logits/rejected": -91956937.6969697, "logps/chosen": -281.4193548387097, "logps/rejected": -379.3939393939394, "loss": 0.2357, "rewards/chosen": 0.9400201612903226, "rewards/margins": 4.280929252199414, "rewards/rejected": -3.340909090909091, "step": 872 }, { "epoch": 0.5985601645526225, "grad_norm": 2.9804687077254535, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -68580549.61403508, "logits/rejected": -98300307.83098592, "logps/chosen": -216.7017543859649, "logps/rejected": -350.6478873239437, "loss": 0.257, "rewards/chosen": 0.41173245614035087, "rewards/margins": 4.070183160365703, "rewards/rejected": -3.658450704225352, "step": 873 }, { "epoch": 0.5992458004799451, "grad_norm": 0.5144958067438937, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92930048.0, "logits/rejected": -79036416.0, "logps/chosen": -259.75, "logps/rejected": -274.75, "loss": 0.2605, "rewards/chosen": 0.526123046875, "rewards/margins": 3.944091796875, "rewards/rejected": -3.41796875, "step": 874 }, { "epoch": 0.5999314364072678, "grad_norm": 0.6439387722816632, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92132508.20338982, "logits/rejected": -99569129.73913044, "logps/chosen": -284.20338983050846, "logps/rejected": -375.6521739130435, "loss": 0.2809, "rewards/chosen": 2.2442725230071505, "rewards/margins": 5.6174609288042525, "rewards/rejected": -3.3731884057971016, "step": 875 }, { "epoch": 0.6006170723345904, "grad_norm": 0.4747086278547648, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104707803.42857143, "logits/rejected": -90294044.44444445, "logps/chosen": -285.7142857142857, "logps/rejected": -397.3333333333333, "loss": 0.2286, "rewards/chosen": 0.7885044642857143, "rewards/margins": 4.4864211309523805, "rewards/rejected": -3.6979166666666665, "step": 876 }, { "epoch": 0.6013027082619129, "grad_norm": 0.4942014456598171, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96774032.2909091, "logits/rejected": -90551001.42465754, "logps/chosen": -268.2181818181818, "logps/rejected": -323.94520547945206, "loss": 0.239, "rewards/chosen": 4.1786737615411935, "rewards/margins": 7.634153213595988, "rewards/rejected": -3.4554794520547945, "step": 877 }, { "epoch": 0.6019883441892355, "grad_norm": 0.5850161816121856, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95147498.9589041, "logits/rejected": -75916902.4, "logps/chosen": -312.7671232876712, "logps/rejected": -274.03636363636366, "loss": 0.3145, "rewards/chosen": 0.4413527397260274, "rewards/margins": 2.6859565383322064, "rewards/rejected": -2.244603798606179, "step": 878 }, { "epoch": 0.6026739801165582, "grad_norm": 0.5842722570137453, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96468992.0, "logits/rejected": -89944519.1111111, "logps/chosen": -306.5945945945946, "logps/rejected": -384.0, "loss": 0.2574, "rewards/chosen": 1.0692567567567568, "rewards/margins": 4.779904904904905, "rewards/rejected": -3.7106481481481484, "step": 879 }, { "epoch": 0.6033596160438807, "grad_norm": 0.46562945940017886, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72105020.23529412, "logits/rejected": -76336332.8, "logps/chosen": -262.70588235294116, "logps/rejected": -265.6, "loss": 0.2454, "rewards/chosen": 1.0955882352941178, "rewards/margins": 3.3864680570714616, "rewards/rejected": -2.290879821777344, "step": 880 }, { "epoch": 0.6040452519712033, "grad_norm": 0.4717719169265064, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84825103.28358209, "logits/rejected": -96468992.0, "logps/chosen": -211.1044776119403, "logps/rejected": -343.08196721311475, "loss": 0.274, "rewards/chosen": 0.4528917910447761, "rewards/margins": 3.8791212992414974, "rewards/rejected": -3.4262295081967213, "step": 881 }, { "epoch": 0.6047308878985259, "grad_norm": 0.6015152712287638, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85899345.92, "logits/rejected": -81353670.03773585, "logps/chosen": -281.81333333333333, "logps/rejected": -304.60377358490564, "loss": 0.3009, "rewards/chosen": 0.655, "rewards/margins": 3.421509433962264, "rewards/rejected": -2.766509433962264, "step": 882 }, { "epoch": 0.6054165238258484, "grad_norm": 0.5287078431037382, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92973738.66666667, "logits/rejected": -73955448.47058824, "logps/chosen": -245.06666666666666, "logps/rejected": -259.05882352941177, "loss": 0.2498, "rewards/chosen": 0.7291666666666666, "rewards/margins": 3.6776960784313726, "rewards/rejected": -2.948529411764706, "step": 883 }, { "epoch": 0.606102159753171, "grad_norm": 0.5065088870056174, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78160522.15873016, "logits/rejected": -71529015.13846155, "logps/chosen": -192.5079365079365, "logps/rejected": -306.2153846153846, "loss": 0.2665, "rewards/chosen": 2.018831283327133, "rewards/margins": 5.3130620525579015, "rewards/rejected": -3.294230769230769, "step": 884 }, { "epoch": 0.6067877956804937, "grad_norm": 0.5121978238206191, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92154850.74285714, "logits/rejected": -97481410.20689656, "logps/chosen": -309.9428571428571, "logps/rejected": -333.51724137931035, "loss": 0.2419, "rewards/chosen": 2.525023978097098, "rewards/margins": 4.197437771200546, "rewards/rejected": -1.6724137931034482, "step": 885 }, { "epoch": 0.6074734316078162, "grad_norm": 0.43880670448765585, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80929439.47540984, "logits/rejected": -84512095.52238806, "logps/chosen": -305.3114754098361, "logps/rejected": -360.35820895522386, "loss": 0.2138, "rewards/chosen": 2.845669105404713, "rewards/margins": 7.024773583016653, "rewards/rejected": -4.17910447761194, "step": 886 }, { "epoch": 0.6081590675351388, "grad_norm": 0.5110596331047471, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95070890.66666667, "logits/rejected": -74146763.9322034, "logps/chosen": -276.40579710144925, "logps/rejected": -367.1864406779661, "loss": 0.2726, "rewards/chosen": 0.5683876811594203, "rewards/margins": 4.627709715057725, "rewards/rejected": -4.059322033898305, "step": 887 }, { "epoch": 0.6088447034624614, "grad_norm": 0.584201336852947, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82525765.1891892, "logits/rejected": -140586856.2962963, "logps/chosen": -215.78378378378378, "logps/rejected": -366.81481481481484, "loss": 0.2895, "rewards/chosen": 1.6809424323004645, "rewards/margins": 5.347609098967131, "rewards/rejected": -3.6666666666666665, "step": 888 }, { "epoch": 0.6095303393897841, "grad_norm": 0.4385158529131589, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90466798.34482759, "logits/rejected": -103299715.65714286, "logps/chosen": -268.9655172413793, "logps/rejected": -302.1714285714286, "loss": 0.2148, "rewards/chosen": 1.0765086206896552, "rewards/margins": 3.579406264732624, "rewards/rejected": -2.5028976440429687, "step": 889 }, { "epoch": 0.6102159753171066, "grad_norm": 0.45767326036291794, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -116231983.72881356, "logits/rejected": -92639410.08695652, "logps/chosen": -357.1525423728813, "logps/rejected": -300.9855072463768, "loss": 0.2206, "rewards/chosen": 1.3209745762711864, "rewards/margins": 3.842713706705969, "rewards/rejected": -2.5217391304347827, "step": 890 }, { "epoch": 0.6109016112444292, "grad_norm": 0.459452017285901, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81048756.70588236, "logits/rejected": -93532979.2, "logps/chosen": -280.47058823529414, "logps/rejected": -305.06666666666666, "loss": 0.2509, "rewards/chosen": 0.9273897058823529, "rewards/margins": 4.489889705882353, "rewards/rejected": -3.5625, "step": 891 }, { "epoch": 0.6115872471717518, "grad_norm": 0.4920671642900236, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100409095.75757575, "logits/rejected": -86186182.19354838, "logps/chosen": -282.42424242424244, "logps/rejected": -355.61290322580646, "loss": 0.243, "rewards/chosen": 0.9867424242424242, "rewards/margins": 3.5819954149650806, "rewards/rejected": -2.5952529907226562, "step": 892 }, { "epoch": 0.6122728830990743, "grad_norm": 0.607094335699412, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86645490.5263158, "logits/rejected": -91145452.3076923, "logps/chosen": -251.3684210526316, "logps/rejected": -369.53846153846155, "loss": 0.2514, "rewards/chosen": 0.962171052631579, "rewards/margins": 4.04870951417004, "rewards/rejected": -3.0865384615384617, "step": 893 }, { "epoch": 0.612958519026397, "grad_norm": 0.5092800152902258, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88983772.55384615, "logits/rejected": -74099370.66666667, "logps/chosen": -279.38461538461536, "logps/rejected": -272.5079365079365, "loss": 0.2472, "rewards/chosen": 0.828125, "rewards/margins": 4.145585317460318, "rewards/rejected": -3.3174603174603177, "step": 894 }, { "epoch": 0.6136441549537196, "grad_norm": 0.5520933568849606, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -65964962.90909091, "logits/rejected": -84427280.51612903, "logps/chosen": -221.8181818181818, "logps/rejected": -348.64516129032256, "loss": 0.2714, "rewards/chosen": 2.1926819772431343, "rewards/margins": 5.958811009501199, "rewards/rejected": -3.7661290322580645, "step": 895 }, { "epoch": 0.6143297908810421, "grad_norm": 0.46610528350437974, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88715884.60606061, "logits/rejected": -85509681.5483871, "logps/chosen": -256.0, "logps/rejected": -370.06451612903226, "loss": 0.2546, "rewards/chosen": 0.6822916666666666, "rewards/margins": 4.170194892473118, "rewards/rejected": -3.4879032258064515, "step": 896 }, { "epoch": 0.6150154268083647, "grad_norm": 0.758619638845252, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89338675.2, "logits/rejected": -86600041.41176471, "logps/chosen": -276.53333333333336, "logps/rejected": -320.47058823529414, "loss": 0.2449, "rewards/chosen": 0.8171875, "rewards/margins": 4.1113051470588236, "rewards/rejected": -3.2941176470588234, "step": 897 }, { "epoch": 0.6157010627356874, "grad_norm": 0.5339792885908663, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93125706.20289855, "logits/rejected": -100450026.30508475, "logps/chosen": -253.2173913043478, "logps/rejected": -405.6949152542373, "loss": 0.2587, "rewards/chosen": 0.7989130434782609, "rewards/margins": 3.2014554163596167, "rewards/rejected": -2.402542372881356, "step": 898 }, { "epoch": 0.61638669866301, "grad_norm": 0.46450875339967285, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82508845.85074627, "logits/rejected": -77422726.29508197, "logps/chosen": -283.46268656716416, "logps/rejected": -296.1311475409836, "loss": 0.256, "rewards/chosen": 0.7602611940298507, "rewards/margins": 4.219277587472474, "rewards/rejected": -3.459016393442623, "step": 899 }, { "epoch": 0.6170723345903325, "grad_norm": 0.5761792819181478, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91684864.0, "logits/rejected": -94633984.0, "logps/chosen": -335.125, "logps/rejected": -343.25, "loss": 0.2313, "rewards/chosen": 2.824432611465454, "rewards/margins": 6.304901361465454, "rewards/rejected": -3.48046875, "step": 900 }, { "epoch": 0.6177579705176551, "grad_norm": 0.5836475822113857, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -111240236.52173913, "logits/rejected": -99810217.22033899, "logps/chosen": -326.72463768115944, "logps/rejected": -303.1864406779661, "loss": 0.2607, "rewards/chosen": 0.802536231884058, "rewards/margins": 1.6475983584316611, "rewards/rejected": -0.8450621265476033, "step": 901 }, { "epoch": 0.6184436064449778, "grad_norm": 0.638293985222504, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90960055.40298508, "logits/rejected": -85811334.29508197, "logps/chosen": -324.7761194029851, "logps/rejected": -312.39344262295083, "loss": 0.2772, "rewards/chosen": 0.8152985074626866, "rewards/margins": 3.8808722779544897, "rewards/rejected": -3.0655737704918034, "step": 902 }, { "epoch": 0.6191292423723003, "grad_norm": 0.4207698987478463, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94271975.61904761, "logits/rejected": -72658250.83076923, "logps/chosen": -280.63492063492066, "logps/rejected": -330.0923076923077, "loss": 0.234, "rewards/chosen": 0.9523809523809523, "rewards/margins": 4.46007326007326, "rewards/rejected": -3.5076923076923077, "step": 903 }, { "epoch": 0.6198148782996229, "grad_norm": 1.2302063176872298, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74864997.58730158, "logits/rejected": -81434025.35384615, "logps/chosen": -232.12698412698413, "logps/rejected": -316.55384615384617, "loss": 0.2649, "rewards/chosen": 0.6775793650793651, "rewards/margins": 4.039117826617827, "rewards/rejected": -3.3615384615384616, "step": 904 }, { "epoch": 0.6205005142269455, "grad_norm": 0.5552016563540217, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87451238.4, "logits/rejected": -94618563.76470588, "logps/chosen": -306.0, "logps/rejected": -396.70588235294116, "loss": 0.2408, "rewards/chosen": 0.9791666666666666, "rewards/margins": 4.776960784313726, "rewards/rejected": -3.797794117647059, "step": 905 }, { "epoch": 0.621186150154268, "grad_norm": 0.4696567714883108, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93712735.08571428, "logits/rejected": -85332391.72413793, "logps/chosen": -215.54285714285714, "logps/rejected": -328.55172413793105, "loss": 0.2606, "rewards/chosen": 0.7133928571428572, "rewards/margins": 4.299599753694581, "rewards/rejected": -3.586206896551724, "step": 906 }, { "epoch": 0.6218717860815907, "grad_norm": 0.6173156864003463, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81444619.46268657, "logits/rejected": -78247835.27868852, "logps/chosen": -287.04477611940297, "logps/rejected": -317.6393442622951, "loss": 0.2539, "rewards/chosen": 1.0867537313432836, "rewards/margins": 4.52117996085148, "rewards/rejected": -3.4344262295081966, "step": 907 }, { "epoch": 0.6225574220089133, "grad_norm": 0.5155883670130456, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87601034.97142857, "logits/rejected": -82295137.10344827, "logps/chosen": -227.88571428571427, "logps/rejected": -337.1034482758621, "loss": 0.2549, "rewards/chosen": 0.8069196428571429, "rewards/margins": 4.33278171182266, "rewards/rejected": -3.5258620689655173, "step": 908 }, { "epoch": 0.6232430579362359, "grad_norm": 0.6174802372828969, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82313216.0, "logits/rejected": -98041856.0, "logps/chosen": -268.22222222222223, "logps/rejected": -369.7142857142857, "loss": 0.2844, "rewards/chosen": 0.7352430555555556, "rewards/margins": 3.0706132555764816, "rewards/rejected": -2.335370200020926, "step": 909 }, { "epoch": 0.6239286938635584, "grad_norm": 0.42954075406179204, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81966652.7457627, "logits/rejected": -74707240.8115942, "logps/chosen": -282.3050847457627, "logps/rejected": -335.768115942029, "loss": 0.2148, "rewards/chosen": 1.05614406779661, "rewards/margins": 4.83150638663719, "rewards/rejected": -3.7753623188405796, "step": 910 }, { "epoch": 0.624614329790881, "grad_norm": 0.45968120596128187, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83377679.51515152, "logits/rejected": -85983232.0, "logps/chosen": -293.57575757575756, "logps/rejected": -381.93548387096774, "loss": 0.2285, "rewards/chosen": 2.3671965165571733, "rewards/margins": 6.012357806879754, "rewards/rejected": -3.6451612903225805, "step": 911 }, { "epoch": 0.6252999657182037, "grad_norm": 0.662063531109895, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99114629.90769231, "logits/rejected": -105323633.77777778, "logps/chosen": -361.10769230769233, "logps/rejected": -410.92063492063494, "loss": 0.2572, "rewards/chosen": 0.7581730769230769, "rewards/margins": 2.982870371612437, "rewards/rejected": -2.22469729468936, "step": 912 }, { "epoch": 0.6259856016455262, "grad_norm": 0.516352290350644, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86515345.19402985, "logits/rejected": -78866667.01639344, "logps/chosen": -273.43283582089555, "logps/rejected": -256.0, "loss": 0.2662, "rewards/chosen": 0.7961753731343284, "rewards/margins": 3.082288924539938, "rewards/rejected": -2.2861135514056095, "step": 913 }, { "epoch": 0.6266712375728488, "grad_norm": 0.6243972906149293, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104134444.13793103, "logits/rejected": -93712735.08571428, "logps/chosen": -296.2758620689655, "logps/rejected": -359.77142857142854, "loss": 0.2367, "rewards/chosen": 0.9423491379310345, "rewards/margins": 4.35306342364532, "rewards/rejected": -3.4107142857142856, "step": 914 }, { "epoch": 0.6273568735001714, "grad_norm": 0.523775853333639, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98231815.42028986, "logits/rejected": -84170439.59322034, "logps/chosen": -284.5217391304348, "logps/rejected": -304.0, "loss": 0.2501, "rewards/chosen": 0.8913043478260869, "rewards/margins": 4.476050110537951, "rewards/rejected": -3.5847457627118646, "step": 915 }, { "epoch": 0.628042509427494, "grad_norm": 0.4966102319887658, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89455565.63934426, "logits/rejected": -95780374.92537314, "logps/chosen": -327.60655737704917, "logps/rejected": -260.7761194029851, "loss": 0.2249, "rewards/chosen": 1.1716188524590163, "rewards/margins": 4.354454673354539, "rewards/rejected": -3.1828358208955225, "step": 916 }, { "epoch": 0.6287281453548166, "grad_norm": 0.4718333703597573, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86498925.1147541, "logits/rejected": -88017782.4477612, "logps/chosen": -309.7704918032787, "logps/rejected": -319.04477611940297, "loss": 0.2175, "rewards/chosen": 3.3088856681448515, "rewards/margins": 7.267840892025449, "rewards/rejected": -3.958955223880597, "step": 917 }, { "epoch": 0.6294137812821392, "grad_norm": 0.5199066360798678, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107463152.48484848, "logits/rejected": -82736028.90322581, "logps/chosen": -247.27272727272728, "logps/rejected": -346.5806451612903, "loss": 0.2647, "rewards/chosen": 0.71875, "rewards/margins": 3.4566532258064515, "rewards/rejected": -2.7379032258064515, "step": 918 }, { "epoch": 0.6300994172094617, "grad_norm": 0.5553940323822139, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101695740.06153846, "logits/rejected": -102660583.61904761, "logps/chosen": -307.2, "logps/rejected": -365.2063492063492, "loss": 0.2255, "rewards/chosen": 1.271153846153846, "rewards/margins": 2.474511361034798, "rewards/rejected": -1.2033575148809523, "step": 919 }, { "epoch": 0.6307850531367843, "grad_norm": 1.0812020955765498, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85284181.33333333, "logits/rejected": -70828341.13207547, "logps/chosen": -245.33333333333334, "logps/rejected": -304.0, "loss": 0.2718, "rewards/chosen": 0.81015625, "rewards/margins": 2.9121608230302916, "rewards/rejected": -2.1020045730302916, "step": 920 }, { "epoch": 0.631470689064107, "grad_norm": 0.5759398204265979, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86773961.44262294, "logits/rejected": -85951931.2238806, "logps/chosen": -237.37704918032787, "logps/rejected": -381.6119402985075, "loss": 0.2674, "rewards/chosen": 2.1124422667456453, "rewards/margins": 4.492339654805347, "rewards/rejected": -2.3798973880597014, "step": 921 }, { "epoch": 0.6321563249914296, "grad_norm": 0.4951886657198319, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104333312.0, "logits/rejected": -109051904.0, "logps/chosen": -270.0, "logps/rejected": -375.5, "loss": 0.2357, "rewards/chosen": 0.8984375, "rewards/margins": 4.9296875, "rewards/rejected": -4.03125, "step": 922 }, { "epoch": 0.6328419609187521, "grad_norm": 0.5332895338001915, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88661133.78461538, "logits/rejected": -98732584.63492064, "logps/chosen": -336.0, "logps/rejected": -312.8888888888889, "loss": 0.252, "rewards/chosen": 1.1307692307692307, "rewards/margins": 4.456166056166056, "rewards/rejected": -3.3253968253968256, "step": 923 }, { "epoch": 0.6335275968460747, "grad_norm": 0.6572453235873829, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75387095.57894737, "logits/rejected": -77433304.61538461, "logps/chosen": -250.31578947368422, "logps/rejected": -333.2307692307692, "loss": 0.2453, "rewards/chosen": 1.9192004956697162, "rewards/margins": 4.914392803362023, "rewards/rejected": -2.9951923076923075, "step": 924 }, { "epoch": 0.6342132327733974, "grad_norm": 0.8034967004049604, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95289344.0, "logits/rejected": -80347136.0, "logps/chosen": -329.5, "logps/rejected": -302.5, "loss": 0.2681, "rewards/chosen": 0.55120849609375, "rewards/margins": 4.15277099609375, "rewards/rejected": -3.6015625, "step": 925 }, { "epoch": 0.6348988687007199, "grad_norm": 0.5232381186982655, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88578013.28813559, "logits/rejected": -133245135.76811594, "logps/chosen": -254.91525423728814, "logps/rejected": -364.9855072463768, "loss": 0.2563, "rewards/chosen": 2.152525045103946, "rewards/margins": 6.123539537857569, "rewards/rejected": -3.971014492753623, "step": 926 }, { "epoch": 0.6355845046280425, "grad_norm": 0.503628443224812, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86485366.9859155, "logits/rejected": -87270956.91228071, "logps/chosen": -232.7887323943662, "logps/rejected": -307.64912280701753, "loss": 0.265, "rewards/chosen": 0.8080985915492958, "rewards/margins": 4.106344205584383, "rewards/rejected": -3.2982456140350878, "step": 927 }, { "epoch": 0.6362701405553651, "grad_norm": 0.48059028497630935, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98566144.0, "logits/rejected": -91766287.51515152, "logps/chosen": -258.3225806451613, "logps/rejected": -384.4848484848485, "loss": 0.2315, "rewards/chosen": 0.9798387096774194, "rewards/margins": 4.225339216454171, "rewards/rejected": -3.245500506776752, "step": 928 }, { "epoch": 0.6369557764826876, "grad_norm": 0.5986169639283653, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80290962.28571428, "logits/rejected": -111848106.66666667, "logps/chosen": -208.85714285714286, "logps/rejected": -449.3333333333333, "loss": 0.2112, "rewards/chosen": 0.9578683035714286, "rewards/margins": 4.982173859126984, "rewards/rejected": -4.024305555555555, "step": 929 }, { "epoch": 0.6376414124100103, "grad_norm": 0.5518995638878387, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78357224.72727273, "logits/rejected": -77053423.48387097, "logps/chosen": -269.09090909090907, "logps/rejected": -384.0, "loss": 0.2305, "rewards/chosen": 1.1628787878787878, "rewards/margins": 5.029814271749755, "rewards/rejected": -3.8669354838709675, "step": 930 }, { "epoch": 0.6383270483373329, "grad_norm": 0.643986500996801, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87940573.86666666, "logits/rejected": -99923124.70588236, "logps/chosen": -340.0, "logps/rejected": -401.88235294117646, "loss": 0.2375, "rewards/chosen": 1.1348958333333334, "rewards/margins": 4.943719362745098, "rewards/rejected": -3.8088235294117645, "step": 931 }, { "epoch": 0.6390126842646555, "grad_norm": 0.6810306436508675, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86168274.8235294, "logits/rejected": -103319688.53333333, "logps/chosen": -252.7058823529412, "logps/rejected": -385.3333333333333, "loss": 0.2632, "rewards/chosen": 0.8308823529411765, "rewards/margins": 4.347549019607843, "rewards/rejected": -3.5166666666666666, "step": 932 }, { "epoch": 0.639698320191978, "grad_norm": 0.5662438361614233, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102608480.46376811, "logits/rejected": -80331585.08474576, "logps/chosen": -345.7391304347826, "logps/rejected": -296.9491525423729, "loss": 0.2898, "rewards/chosen": 2.6005430359771284, "rewards/margins": 5.380204052926281, "rewards/rejected": -2.7796610169491527, "step": 933 }, { "epoch": 0.6403839561193007, "grad_norm": 0.5330750065066214, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93952409.6, "logits/rejected": -86682282.66666667, "logps/chosen": -318.03076923076924, "logps/rejected": -311.87301587301585, "loss": 0.2531, "rewards/chosen": 0.9117788461538462, "rewards/margins": 4.395905830280831, "rewards/rejected": -3.484126984126984, "step": 934 }, { "epoch": 0.6410695920466233, "grad_norm": 0.5009255947594494, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86911479.60655738, "logits/rejected": -96281187.34328358, "logps/chosen": -280.1311475409836, "logps/rejected": -386.3880597014925, "loss": 0.2193, "rewards/chosen": 1.069672131147541, "rewards/margins": 4.733851235625153, "rewards/rejected": -3.6641791044776117, "step": 935 }, { "epoch": 0.6417552279739458, "grad_norm": 0.5366760870364893, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85889329.6716418, "logits/rejected": -66799448.13114754, "logps/chosen": -299.2238805970149, "logps/rejected": -344.655737704918, "loss": 0.2549, "rewards/chosen": 0.8768656716417911, "rewards/margins": 4.66375091754343, "rewards/rejected": -3.7868852459016393, "step": 936 }, { "epoch": 0.6424408639012684, "grad_norm": 0.6043271267328902, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77811570.7586207, "logits/rejected": -70464307.2, "logps/chosen": -253.79310344827587, "logps/rejected": -314.51428571428573, "loss": 0.2448, "rewards/chosen": 2.8895158438846984, "rewards/margins": 6.600230129598984, "rewards/rejected": -3.710714285714286, "step": 937 }, { "epoch": 0.643126499828591, "grad_norm": 0.4186630879486599, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86282825.14285715, "logits/rejected": -84079663.26153846, "logps/chosen": -243.3015873015873, "logps/rejected": -333.04615384615386, "loss": 0.2372, "rewards/chosen": 2.411595662434896, "rewards/margins": 6.376980277819511, "rewards/rejected": -3.9653846153846155, "step": 938 }, { "epoch": 0.6438121357559136, "grad_norm": 0.5179632707164182, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86720069.1891892, "logits/rejected": -87381333.33333333, "logps/chosen": -295.35135135135135, "logps/rejected": -324.14814814814815, "loss": 0.2602, "rewards/chosen": 0.9366554054054054, "rewards/margins": 4.547766516516517, "rewards/rejected": -3.611111111111111, "step": 939 }, { "epoch": 0.6444977716832362, "grad_norm": 0.571565319297677, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98133398.34920634, "logits/rejected": -87628689.72307692, "logps/chosen": -316.44444444444446, "logps/rejected": -308.67692307692306, "loss": 0.2562, "rewards/chosen": 0.7529761904761905, "rewards/margins": 4.106822344322344, "rewards/rejected": -3.353846153846154, "step": 940 }, { "epoch": 0.6451834076105588, "grad_norm": 0.5897374262059653, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89529325.38181818, "logits/rejected": -76933877.47945206, "logps/chosen": -389.8181818181818, "logps/rejected": -356.3835616438356, "loss": 0.217, "rewards/chosen": 1.2471590909090908, "rewards/margins": 5.202638542963886, "rewards/rejected": -3.9554794520547945, "step": 941 }, { "epoch": 0.6458690435378814, "grad_norm": 0.5094150087735618, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97485792.96969697, "logits/rejected": -108510703.48387097, "logps/chosen": -259.8787878787879, "logps/rejected": -337.03225806451616, "loss": 0.258, "rewards/chosen": 0.6652462121212122, "rewards/margins": 4.649117179863148, "rewards/rejected": -3.9838709677419355, "step": 942 }, { "epoch": 0.646554679465204, "grad_norm": 0.5209086439874233, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92495440.84210527, "logits/rejected": -82704585.91549295, "logps/chosen": -300.9122807017544, "logps/rejected": -317.2957746478873, "loss": 0.2203, "rewards/chosen": 1.0320723684210527, "rewards/margins": 4.327847016308377, "rewards/rejected": -3.295774647887324, "step": 943 }, { "epoch": 0.6472403153925266, "grad_norm": 0.4882094993223699, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81304969.84615384, "logits/rejected": -88280112.76190476, "logps/chosen": -260.0615384615385, "logps/rejected": -283.42857142857144, "loss": 0.2682, "rewards/chosen": 0.498046875, "rewards/margins": 3.878999255952381, "rewards/rejected": -3.380952380952381, "step": 944 }, { "epoch": 0.6479259513198492, "grad_norm": 0.4617903454807034, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89148025.01818182, "logits/rejected": -73658872.98630136, "logps/chosen": -282.76363636363635, "logps/rejected": -349.36986301369865, "loss": 0.2266, "rewards/chosen": 0.7119318181818182, "rewards/margins": 4.5304249688667495, "rewards/rejected": -3.8184931506849313, "step": 945 }, { "epoch": 0.6486115872471717, "grad_norm": 0.7252179190244424, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80609280.0, "logits/rejected": -84148224.0, "logps/chosen": -285.75, "logps/rejected": -341.25, "loss": 0.2479, "rewards/chosen": 0.6630859375, "rewards/margins": 4.6669921875, "rewards/rejected": -4.00390625, "step": 946 }, { "epoch": 0.6492972231744943, "grad_norm": 0.6433071800306135, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78661596.07017544, "logits/rejected": -92983584.45070423, "logps/chosen": -252.6315789473684, "logps/rejected": -368.67605633802816, "loss": 0.2349, "rewards/chosen": 0.5002741228070176, "rewards/margins": 4.553091024215469, "rewards/rejected": -4.052816901408451, "step": 947 }, { "epoch": 0.649982859101817, "grad_norm": 0.621433469694733, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83248970.53164557, "logits/rejected": -83714883.91836734, "logps/chosen": -268.55696202531647, "logps/rejected": -272.9795918367347, "loss": 0.2668, "rewards/chosen": 1.0174050632911393, "rewards/margins": 3.9459764918625675, "rewards/rejected": -2.9285714285714284, "step": 948 }, { "epoch": 0.6506684950291395, "grad_norm": 0.5780371004684105, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105002231.1724138, "logits/rejected": -107733694.17142858, "logps/chosen": -340.9655172413793, "logps/rejected": -467.2, "loss": 0.222, "rewards/chosen": 2.753185403758082, "rewards/margins": 6.93175683232951, "rewards/rejected": -4.178571428571429, "step": 949 }, { "epoch": 0.6513541309564621, "grad_norm": 0.5796410510573736, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86389132.38709678, "logits/rejected": -79183375.51515152, "logps/chosen": -301.93548387096774, "logps/rejected": -325.3333333333333, "loss": 0.2447, "rewards/chosen": 2.591314008159022, "rewards/margins": 6.208738250583265, "rewards/rejected": -3.617424242424242, "step": 950 }, { "epoch": 0.6520397668837847, "grad_norm": 0.6268227946600575, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90061027.55555555, "logits/rejected": -82313216.0, "logps/chosen": -283.1111111111111, "logps/rejected": -333.7142857142857, "loss": 0.2525, "rewards/chosen": 0.8567708333333334, "rewards/margins": 3.5755208333333335, "rewards/rejected": -2.71875, "step": 951 }, { "epoch": 0.6527254028111072, "grad_norm": 0.649425554316605, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -106388853.84126984, "logits/rejected": -77691415.63076924, "logps/chosen": -245.33333333333334, "logps/rejected": -389.9076923076923, "loss": 0.2241, "rewards/chosen": 0.6631944444444444, "rewards/margins": 4.886271367521368, "rewards/rejected": -4.223076923076923, "step": 952 }, { "epoch": 0.6534110387384299, "grad_norm": 0.6282902701576659, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109175265.88235295, "logits/rejected": -82278263.46666667, "logps/chosen": -320.94117647058823, "logps/rejected": -390.93333333333334, "loss": 0.2881, "rewards/chosen": 0.4765625, "rewards/margins": 4.0640625, "rewards/rejected": -3.5875, "step": 953 }, { "epoch": 0.6540966746657525, "grad_norm": 0.562693039130917, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95575123.93442623, "logits/rejected": -80317791.52238806, "logps/chosen": -323.672131147541, "logps/rejected": -403.5820895522388, "loss": 0.2407, "rewards/chosen": 0.8570696721311475, "rewards/margins": 4.756323403474431, "rewards/rejected": -3.8992537313432836, "step": 954 }, { "epoch": 0.6547823105930751, "grad_norm": 0.7044653950963454, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -117567612.12121212, "logits/rejected": -85239081.29032259, "logps/chosen": -397.57575757575756, "logps/rejected": -353.03225806451616, "loss": 0.2648, "rewards/chosen": 0.9995265151515151, "rewards/margins": 4.717268450635386, "rewards/rejected": -3.717741935483871, "step": 955 }, { "epoch": 0.6554679465203976, "grad_norm": 0.5327599007792089, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74353570.9090909, "logits/rejected": -79285875.61290322, "logps/chosen": -255.75757575757575, "logps/rejected": -304.0, "loss": 0.2758, "rewards/chosen": 0.43134469696969696, "rewards/margins": 4.201505987292277, "rewards/rejected": -3.7701612903225805, "step": 956 }, { "epoch": 0.6561535824477203, "grad_norm": 0.6644904806140695, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78828242.8235294, "logits/rejected": -99544814.93333334, "logps/chosen": -289.4117647058824, "logps/rejected": -338.1333333333333, "loss": 0.2495, "rewards/chosen": 2.0941610897288605, "rewards/margins": 6.002494423062194, "rewards/rejected": -3.908333333333333, "step": 957 }, { "epoch": 0.6568392183750429, "grad_norm": 0.4483789275894026, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78564948.05970149, "logits/rejected": -71646963.40983607, "logps/chosen": -275.34328358208955, "logps/rejected": -344.655737704918, "loss": 0.2144, "rewards/chosen": 1.2798507462686568, "rewards/margins": 5.378211402006361, "rewards/rejected": -4.098360655737705, "step": 958 }, { "epoch": 0.6575248543023654, "grad_norm": 0.4878677191549166, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94082577.65517241, "logits/rejected": -81908765.25714286, "logps/chosen": -316.13793103448273, "logps/rejected": -330.9714285714286, "loss": 0.2075, "rewards/chosen": 1.1853448275862069, "rewards/margins": 4.999630541871921, "rewards/rejected": -3.8142857142857145, "step": 959 }, { "epoch": 0.658210490229688, "grad_norm": 0.5201841523780729, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100358255.7090909, "logits/rejected": -81472918.79452054, "logps/chosen": -277.23636363636365, "logps/rejected": -313.2054794520548, "loss": 0.2184, "rewards/chosen": 1.0355113636363635, "rewards/margins": 4.645100404732254, "rewards/rejected": -3.6095890410958904, "step": 960 }, { "epoch": 0.6588961261570107, "grad_norm": 0.5850999299384294, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73460238.62857144, "logits/rejected": -80848825.37931034, "logps/chosen": -296.9142857142857, "logps/rejected": -329.6551724137931, "loss": 0.2403, "rewards/chosen": 1.277232142857143, "rewards/margins": 4.953956280788177, "rewards/rejected": -3.6767241379310347, "step": 961 }, { "epoch": 0.6595817620843332, "grad_norm": 0.5910673980042072, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -113796280.6557377, "logits/rejected": -87767376.23880596, "logps/chosen": -303.4754098360656, "logps/rejected": -332.4179104477612, "loss": 0.222, "rewards/chosen": 0.9211065573770492, "rewards/margins": 5.394987154391975, "rewards/rejected": -4.473880597014926, "step": 962 }, { "epoch": 0.6602673980116558, "grad_norm": 0.5130398051982408, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -70807477.52727273, "logits/rejected": -79864144.65753424, "logps/chosen": -246.4, "logps/rejected": -313.2054794520548, "loss": 0.2294, "rewards/chosen": 1.1403409090909091, "rewards/margins": 4.804724470734745, "rewards/rejected": -3.664383561643836, "step": 963 }, { "epoch": 0.6609530339389784, "grad_norm": 0.4796265965036501, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109661984.58181818, "logits/rejected": -102961544.76712328, "logps/chosen": -244.94545454545454, "logps/rejected": -380.93150684931504, "loss": 0.1822, "rewards/chosen": 0.9045454545454545, "rewards/margins": 5.390846824408468, "rewards/rejected": -4.486301369863014, "step": 964 }, { "epoch": 0.661638669866301, "grad_norm": 0.5808552805948939, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82348168.53333333, "logits/rejected": -86106593.88235295, "logps/chosen": -263.73333333333335, "logps/rejected": -384.47058823529414, "loss": 0.2259, "rewards/chosen": 0.8291666666666667, "rewards/margins": 5.233578431372549, "rewards/rejected": -4.404411764705882, "step": 965 }, { "epoch": 0.6623243057936236, "grad_norm": 0.6655523000445683, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103042154.98507462, "logits/rejected": -66971345.836065575, "logps/chosen": -289.43283582089555, "logps/rejected": -287.4754098360656, "loss": 0.2676, "rewards/chosen": 0.6548507462686567, "rewards/margins": 3.962227795448985, "rewards/rejected": -3.307377049180328, "step": 966 }, { "epoch": 0.6630099417209462, "grad_norm": 0.5724912945355606, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77327713.74545455, "logits/rejected": -74922909.80821918, "logps/chosen": -262.9818181818182, "logps/rejected": -318.6849315068493, "loss": 0.2385, "rewards/chosen": 0.5353693181818182, "rewards/margins": 1.5815549553554056, "rewards/rejected": -1.0461856371735874, "step": 967 }, { "epoch": 0.6636955776482688, "grad_norm": 0.588695331323904, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92512101.43396227, "logits/rejected": -69737294.50666666, "logps/chosen": -256.60377358490564, "logps/rejected": -294.82666666666665, "loss": 0.2445, "rewards/chosen": 0.6140182783018868, "rewards/margins": 4.414018278301887, "rewards/rejected": -3.8, "step": 968 }, { "epoch": 0.6643812135755913, "grad_norm": 0.6351639285153525, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88080384.0, "logits/rejected": -90352298.66666667, "logps/chosen": -312.94117647058823, "logps/rejected": -366.4, "loss": 0.2313, "rewards/chosen": 0.9384191176470589, "rewards/margins": 4.905085784313726, "rewards/rejected": -3.966666666666667, "step": 969 }, { "epoch": 0.665066849502914, "grad_norm": 0.5753798476568399, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87949312.0, "logits/rejected": -124780544.0, "logps/chosen": -296.0, "logps/rejected": -421.5, "loss": 0.2383, "rewards/chosen": 0.81103515625, "rewards/margins": 4.36962890625, "rewards/rejected": -3.55859375, "step": 970 }, { "epoch": 0.6657524854302366, "grad_norm": 0.609235187702961, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96839077.64705883, "logits/rejected": -92134877.86666666, "logps/chosen": -320.5882352941176, "logps/rejected": -393.3333333333333, "loss": 0.262, "rewards/chosen": 0.6725643382352942, "rewards/margins": 4.5892310049019605, "rewards/rejected": -3.9166666666666665, "step": 971 }, { "epoch": 0.6664381213575591, "grad_norm": 0.770521482087364, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105063877.24590164, "logits/rejected": -82821853.6119403, "logps/chosen": -288.0, "logps/rejected": -376.35820895522386, "loss": 0.2595, "rewards/chosen": 0.9538934426229508, "rewards/margins": 4.218818815757279, "rewards/rejected": -3.264925373134328, "step": 972 }, { "epoch": 0.6671237572848817, "grad_norm": 0.4433067081773024, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99983138.5945946, "logits/rejected": -108585870.22222222, "logps/chosen": -282.3783783783784, "logps/rejected": -398.22222222222223, "loss": 0.2394, "rewards/chosen": 0.953125, "rewards/margins": 4.179976851851851, "rewards/rejected": -3.2268518518518516, "step": 973 }, { "epoch": 0.6678093932122043, "grad_norm": 0.557301596885999, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101763441.31147541, "logits/rejected": -82947056.71641791, "logps/chosen": -282.4918032786885, "logps/rejected": -313.3134328358209, "loss": 0.2355, "rewards/chosen": 0.8165983606557377, "rewards/margins": 4.219583435282603, "rewards/rejected": -3.4029850746268657, "step": 974 }, { "epoch": 0.668495029139527, "grad_norm": 0.6542509047880425, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88461684.36363636, "logits/rejected": -111216706.06451613, "logps/chosen": -280.72727272727275, "logps/rejected": -410.83870967741933, "loss": 0.2325, "rewards/chosen": 0.9422348484848485, "rewards/margins": 5.474492913000978, "rewards/rejected": -4.532258064516129, "step": 975 }, { "epoch": 0.6691806650668495, "grad_norm": 0.5496422935023878, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86573056.0, "logits/rejected": -86048768.0, "logps/chosen": -222.375, "logps/rejected": -338.0, "loss": 0.2558, "rewards/chosen": 2.342270851135254, "rewards/margins": 5.959458351135254, "rewards/rejected": -3.6171875, "step": 976 }, { "epoch": 0.6698663009941721, "grad_norm": 0.6211768139752447, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72678349.63934426, "logits/rejected": -79378768.23880596, "logps/chosen": -231.08196721311475, "logps/rejected": -325.4925373134328, "loss": 0.2253, "rewards/chosen": 0.8703893442622951, "rewards/margins": 4.747255015904086, "rewards/rejected": -3.876865671641791, "step": 977 }, { "epoch": 0.6705519369214947, "grad_norm": 0.571351271536355, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80332572.44444445, "logits/rejected": -87256502.85714285, "logps/chosen": -247.33333333333334, "logps/rejected": -315.42857142857144, "loss": 0.2566, "rewards/chosen": 0.8372395833333334, "rewards/margins": 4.636346726190476, "rewards/rejected": -3.799107142857143, "step": 978 }, { "epoch": 0.6712375728488172, "grad_norm": 0.7226754750500317, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81542204.23529412, "logits/rejected": -91296017.06666666, "logps/chosen": -323.52941176470586, "logps/rejected": -350.4, "loss": 0.2317, "rewards/chosen": 0.96875, "rewards/margins": 4.985416666666667, "rewards/rejected": -4.016666666666667, "step": 979 }, { "epoch": 0.6719232087761399, "grad_norm": 0.5185044773310838, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97966957.71428572, "logits/rejected": -79808284.44444445, "logps/chosen": -366.0, "logps/rejected": -286.0, "loss": 0.2122, "rewards/chosen": 1.3381696428571428, "rewards/margins": 4.956225198412698, "rewards/rejected": -3.6180555555555554, "step": 980 }, { "epoch": 0.6726088447034625, "grad_norm": 0.5117727606437006, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74698556.95238096, "logits/rejected": -80982331.07692307, "logps/chosen": -244.31746031746033, "logps/rejected": -324.9230769230769, "loss": 0.2183, "rewards/chosen": 0.9563492063492064, "rewards/margins": 5.0909645909645915, "rewards/rejected": -4.134615384615385, "step": 981 }, { "epoch": 0.673294480630785, "grad_norm": 0.5528765560222023, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78611424.96969697, "logits/rejected": -101948647.22580644, "logps/chosen": -227.15151515151516, "logps/rejected": -364.9032258064516, "loss": 0.2317, "rewards/chosen": 0.8925189393939394, "rewards/margins": 4.7150995845552295, "rewards/rejected": -3.8225806451612905, "step": 982 }, { "epoch": 0.6739801165581076, "grad_norm": 0.6867330392602917, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78410183.1111111, "logits/rejected": -101187584.0, "logps/chosen": -259.1111111111111, "logps/rejected": -318.0, "loss": 0.2694, "rewards/chosen": 0.7604166666666666, "rewards/margins": 3.602487927391415, "rewards/rejected": -2.8420712607247487, "step": 983 }, { "epoch": 0.6746657524854303, "grad_norm": 0.5515736020247114, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93339908.06349206, "logits/rejected": -94597687.13846155, "logps/chosen": -359.1111111111111, "logps/rejected": -401.2307692307692, "loss": 0.2453, "rewards/chosen": 0.7765376984126984, "rewards/margins": 4.426537698412698, "rewards/rejected": -3.65, "step": 984 }, { "epoch": 0.6753513884127528, "grad_norm": 0.6020525111985673, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -69099381.15254237, "logits/rejected": -74707240.8115942, "logps/chosen": -200.94915254237287, "logps/rejected": -281.9710144927536, "loss": 0.2539, "rewards/chosen": 2.6360804670948093, "rewards/margins": 5.683181916370172, "rewards/rejected": -3.0471014492753623, "step": 985 }, { "epoch": 0.6760370243400754, "grad_norm": 0.556927654478526, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92760984.11594203, "logits/rejected": -80900304.27118644, "logps/chosen": -297.27536231884056, "logps/rejected": -357.1525423728813, "loss": 0.2372, "rewards/chosen": 1.0615942028985508, "rewards/margins": 2.943988379317049, "rewards/rejected": -1.8823941764184984, "step": 986 }, { "epoch": 0.676722660267398, "grad_norm": 0.52616080387297, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85538381.57575758, "logits/rejected": -91192286.96774194, "logps/chosen": -172.6060606060606, "logps/rejected": -340.64516129032256, "loss": 0.2632, "rewards/chosen": 0.5501893939393939, "rewards/margins": 4.55422165200391, "rewards/rejected": -4.004032258064516, "step": 987 }, { "epoch": 0.6774082961947206, "grad_norm": 0.5082210749571603, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82021944.8888889, "logits/rejected": -86725301.16923077, "logps/chosen": -227.8095238095238, "logps/rejected": -329.3538461538462, "loss": 0.2373, "rewards/chosen": 0.675843253968254, "rewards/margins": 4.506612484737484, "rewards/rejected": -3.830769230769231, "step": 988 }, { "epoch": 0.6780939321220432, "grad_norm": 0.5268983640458696, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79425470.98412699, "logits/rejected": -84789468.55384615, "logps/chosen": -218.4126984126984, "logps/rejected": -357.9076923076923, "loss": 0.2468, "rewards/chosen": 0.8695436507936508, "rewards/margins": 5.000312881562882, "rewards/rejected": -4.130769230769231, "step": 989 }, { "epoch": 0.6787795680493658, "grad_norm": 0.5817783918657512, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76437574.62068966, "logits/rejected": -78133891.65714286, "logps/chosen": -287.44827586206895, "logps/rejected": -329.6, "loss": 0.2339, "rewards/chosen": 0.96875, "rewards/margins": 3.8580357142857142, "rewards/rejected": -2.8892857142857142, "step": 990 }, { "epoch": 0.6794652039766884, "grad_norm": 0.7658827242391106, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95162569.44262294, "logits/rejected": -84637298.62686567, "logps/chosen": -293.7704918032787, "logps/rejected": -411.7014925373134, "loss": 0.2201, "rewards/chosen": 3.027639670450179, "rewards/margins": 5.643311312241224, "rewards/rejected": -2.6156716417910446, "step": 991 }, { "epoch": 0.6801508399040109, "grad_norm": 0.5315679413151205, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97380797.2173913, "logits/rejected": -73791314.44067797, "logps/chosen": -246.2608695652174, "logps/rejected": -342.77966101694915, "loss": 0.2678, "rewards/chosen": 1.9775702435037363, "rewards/margins": 5.532654989266448, "rewards/rejected": -3.555084745762712, "step": 992 }, { "epoch": 0.6808364758313336, "grad_norm": 0.6798340536971544, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94937738.15873016, "logits/rejected": -65753781.16923077, "logps/chosen": -299.93650793650795, "logps/rejected": -275.44615384615383, "loss": 0.2321, "rewards/chosen": 1.035218253968254, "rewards/margins": 4.6313721001221, "rewards/rejected": -3.5961538461538463, "step": 993 }, { "epoch": 0.6815221117586562, "grad_norm": 0.46748749628677005, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83547829.67741935, "logits/rejected": -69650866.42424242, "logps/chosen": -378.5806451612903, "logps/rejected": -246.3030303030303, "loss": 0.2285, "rewards/chosen": 3.0871965962071575, "rewards/margins": 6.450832959843521, "rewards/rejected": -3.3636363636363638, "step": 994 }, { "epoch": 0.6822077476859787, "grad_norm": 0.5693314032828575, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109545351.52941176, "logits/rejected": -81439402.66666667, "logps/chosen": -299.7647058823529, "logps/rejected": -334.93333333333334, "loss": 0.2546, "rewards/chosen": 0.9564568014705882, "rewards/margins": 4.593956801470588, "rewards/rejected": -3.6375, "step": 995 }, { "epoch": 0.6828933836133013, "grad_norm": 0.6497070345491094, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93847552.0, "logits/rejected": -95158272.0, "logps/chosen": -330.75, "logps/rejected": -462.5, "loss": 0.2626, "rewards/chosen": 0.654296875, "rewards/margins": 4.611328125, "rewards/rejected": -3.95703125, "step": 996 }, { "epoch": 0.6835790195406239, "grad_norm": 0.7258838182167832, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90105220.4137931, "logits/rejected": -73220564.11428571, "logps/chosen": -353.6551724137931, "logps/rejected": -304.45714285714286, "loss": 0.2256, "rewards/chosen": 1.0387931034482758, "rewards/margins": 4.674507389162562, "rewards/rejected": -3.6357142857142857, "step": 997 }, { "epoch": 0.6842646554679466, "grad_norm": 0.5082070024947759, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85324127.08571428, "logits/rejected": -81355034.48275863, "logps/chosen": -263.54285714285714, "logps/rejected": -288.9655172413793, "loss": 0.2468, "rewards/chosen": 0.9321428571428572, "rewards/margins": 4.231711822660099, "rewards/rejected": -3.2995689655172415, "step": 998 }, { "epoch": 0.6849502913952691, "grad_norm": 0.491481890774805, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85764126.56716418, "logits/rejected": -93649869.63934426, "logps/chosen": -247.88059701492537, "logps/rejected": -360.91803278688525, "loss": 0.2459, "rewards/chosen": 0.7737873134328358, "rewards/margins": 1.7381260225292754, "rewards/rejected": -0.9643387090964396, "step": 999 }, { "epoch": 0.6856359273225917, "grad_norm": 0.7545173581834995, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73400320.0, "logits/rejected": -69533696.0, "logps/chosen": -255.75, "logps/rejected": -277.0, "loss": 0.2501, "rewards/chosen": 0.6419677734375, "rewards/margins": 2.9725213050842285, "rewards/rejected": -2.3305535316467285, "step": 1000 }, { "epoch": 0.6863215632499143, "grad_norm": 0.6474301006172967, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91488256.0, "logits/rejected": -82313216.0, "logps/chosen": -332.5, "logps/rejected": -319.5, "loss": 0.2364, "rewards/chosen": 1.021484375, "rewards/margins": 4.650390625, "rewards/rejected": -3.62890625, "step": 1001 }, { "epoch": 0.6870071991772368, "grad_norm": 0.475773856227534, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -64674669.71428572, "logits/rejected": -93090247.1111111, "logps/chosen": -177.57142857142858, "logps/rejected": -331.55555555555554, "loss": 0.2258, "rewards/chosen": 0.6485770089285714, "rewards/margins": 4.5166325644841265, "rewards/rejected": -3.8680555555555554, "step": 1002 }, { "epoch": 0.6876928351045595, "grad_norm": 0.6369930574829671, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74317824.0, "logits/rejected": -90767360.0, "logps/chosen": -283.75, "logps/rejected": -311.5, "loss": 0.2651, "rewards/chosen": 0.6103515625, "rewards/margins": 3.8876953125, "rewards/rejected": -3.27734375, "step": 1003 }, { "epoch": 0.6883784710318821, "grad_norm": 0.6465018904431131, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84158437.4025974, "logits/rejected": -85942111.37254901, "logps/chosen": -272.83116883116884, "logps/rejected": -351.37254901960785, "loss": 0.2515, "rewards/chosen": 0.9415584415584416, "rewards/margins": 10187778.196460402, "rewards/rejected": -10187777.25490196, "step": 1004 }, { "epoch": 0.6890641069592046, "grad_norm": 0.6289109340576569, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90756060.68965517, "logits/rejected": -92274688.0, "logps/chosen": -317.7931034482759, "logps/rejected": -361.6, "loss": 0.2256, "rewards/chosen": 1.0140086206896552, "rewards/margins": 4.671151477832512, "rewards/rejected": -3.657142857142857, "step": 1005 }, { "epoch": 0.6897497428865272, "grad_norm": 0.7069737066340558, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77017147.36231884, "logits/rejected": -82748641.62711865, "logps/chosen": -179.94202898550725, "logps/rejected": -338.7118644067797, "loss": 0.2369, "rewards/chosen": 1.6483094588569973, "rewards/margins": 5.1758518317383535, "rewards/rejected": -3.527542372881356, "step": 1006 }, { "epoch": 0.6904353788138499, "grad_norm": 0.6053606831269649, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88080384.0, "logits/rejected": -104464384.0, "logps/chosen": -256.5, "logps/rejected": -407.5, "loss": 0.2119, "rewards/chosen": 0.98046875, "rewards/margins": 5.19140625, "rewards/rejected": -4.2109375, "step": 1007 }, { "epoch": 0.6911210147411725, "grad_norm": 0.5211630947959489, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88812722.79365079, "logits/rejected": -95242964.67692308, "logps/chosen": -280.8888888888889, "logps/rejected": -422.89230769230767, "loss": 0.2501, "rewards/chosen": 0.7645089285714286, "rewards/margins": 5.018355082417583, "rewards/rejected": -4.253846153846154, "step": 1008 }, { "epoch": 0.691806650668495, "grad_norm": 0.5568089991878372, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90709649.19402985, "logits/rejected": -80723162.22950819, "logps/chosen": -238.2089552238806, "logps/rejected": -304.0, "loss": 0.2418, "rewards/chosen": 1.9607926553754664, "rewards/margins": 6.120628720949237, "rewards/rejected": -4.159836065573771, "step": 1009 }, { "epoch": 0.6924922865958176, "grad_norm": 0.5494438598457816, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93772653.71428572, "logits/rejected": -84585130.66666667, "logps/chosen": -256.57142857142856, "logps/rejected": -328.0, "loss": 0.2229, "rewards/chosen": 0.5998883928571429, "rewards/margins": 4.64155505952381, "rewards/rejected": -4.041666666666667, "step": 1010 }, { "epoch": 0.6931779225231403, "grad_norm": 0.6628877747461375, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -111009245.86666666, "logits/rejected": -91781240.47058824, "logps/chosen": -355.2, "logps/rejected": -304.0, "loss": 0.2537, "rewards/chosen": 0.9369791666666667, "rewards/margins": 4.60609681372549, "rewards/rejected": -3.6691176470588234, "step": 1011 }, { "epoch": 0.6938635584504628, "grad_norm": 0.7174342063670136, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86596245.66153847, "logits/rejected": -65977067.68253968, "logps/chosen": -312.12307692307695, "logps/rejected": -300.44444444444446, "loss": 0.2378, "rewards/chosen": 1.164423076923077, "rewards/margins": 4.612835775335776, "rewards/rejected": -3.4484126984126986, "step": 1012 }, { "epoch": 0.6945491943777854, "grad_norm": 0.495014764432738, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84254001.40350877, "logits/rejected": -79869000.11267605, "logps/chosen": -227.3684210526316, "logps/rejected": -274.0281690140845, "loss": 0.2357, "rewards/chosen": 0.631578947368421, "rewards/margins": 4.029466271312083, "rewards/rejected": -3.397887323943662, "step": 1013 }, { "epoch": 0.695234830305108, "grad_norm": 0.8231193431288598, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92498384.21333334, "logits/rejected": -86576765.58490565, "logps/chosen": -286.50666666666666, "logps/rejected": -343.8490566037736, "loss": 0.257, "rewards/chosen": 0.9525, "rewards/margins": 3.56953836692954, "rewards/rejected": -2.61703836692954, "step": 1014 }, { "epoch": 0.6959204662324305, "grad_norm": 0.5661193298731324, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81089877.33333333, "logits/rejected": -76396251.42857143, "logps/chosen": -270.6666666666667, "logps/rejected": -261.14285714285717, "loss": 0.2576, "rewards/chosen": 1.0789930555555556, "rewards/margins": 4.440600198412699, "rewards/rejected": -3.361607142857143, "step": 1015 }, { "epoch": 0.6966061021597532, "grad_norm": 0.5657775953225983, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73252633.23943663, "logits/rejected": -68948471.01754385, "logps/chosen": -294.76056338028167, "logps/rejected": -240.8421052631579, "loss": 0.251, "rewards/chosen": 1.0105633802816902, "rewards/margins": 4.462317766246603, "rewards/rejected": -3.4517543859649122, "step": 1016 }, { "epoch": 0.6972917380870758, "grad_norm": 0.6609530589540364, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81162505.97402598, "logits/rejected": -93138221.1764706, "logps/chosen": -273.038961038961, "logps/rejected": -379.6078431372549, "loss": 0.2534, "rewards/chosen": 2.1333134638798703, "rewards/margins": 5.334293856036734, "rewards/rejected": -3.200980392156863, "step": 1017 }, { "epoch": 0.6979773740143983, "grad_norm": 0.5891026023743975, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84724940.8, "logits/rejected": -95137466.92063493, "logps/chosen": -247.3846153846154, "logps/rejected": -345.6507936507937, "loss": 0.2585, "rewards/chosen": 0.6298076923076923, "rewards/margins": 2.9641224322010453, "rewards/rejected": -2.3343147398933533, "step": 1018 }, { "epoch": 0.6986630099417209, "grad_norm": 0.5235396036669533, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84859757.71428572, "logits/rejected": -91226112.0, "logps/chosen": -298.0, "logps/rejected": -327.55555555555554, "loss": 0.2096, "rewards/chosen": 0.9056919642857143, "rewards/margins": 5.009858630952381, "rewards/rejected": -4.104166666666667, "step": 1019 }, { "epoch": 0.6993486458690436, "grad_norm": 0.5575933495964643, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76096658.28571428, "logits/rejected": -97481410.20689656, "logps/chosen": -259.54285714285714, "logps/rejected": -450.7586206896552, "loss": 0.2388, "rewards/chosen": 1.025, "rewards/margins": 5.425862068965516, "rewards/rejected": -4.400862068965517, "step": 1020 }, { "epoch": 0.7000342817963662, "grad_norm": 0.8232979065551164, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82530997.16923077, "logits/rejected": -85949943.87301587, "logps/chosen": -272.0, "logps/rejected": -400.76190476190476, "loss": 0.2295, "rewards/chosen": 1.0028846153846154, "rewards/margins": 3.986263676644536, "rewards/rejected": -2.9833790612599205, "step": 1021 }, { "epoch": 0.7007199177236887, "grad_norm": 0.5744000934577158, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83755008.0, "logits/rejected": -82968576.0, "logps/chosen": -255.75, "logps/rejected": -294.5, "loss": 0.2293, "rewards/chosen": 0.97265625, "rewards/margins": 4.46484375, "rewards/rejected": -3.4921875, "step": 1022 }, { "epoch": 0.7014055536510113, "grad_norm": 0.6907200942450614, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78889923.76470588, "logits/rejected": -86752187.73333333, "logps/chosen": -293.4117647058824, "logps/rejected": -274.1333333333333, "loss": 0.2571, "rewards/chosen": 0.9715073529411765, "rewards/margins": 4.4798406862745095, "rewards/rejected": -3.5083333333333333, "step": 1023 }, { "epoch": 0.7020911895783339, "grad_norm": 0.6025865829456105, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76803894.55737706, "logits/rejected": -87141360.71641791, "logps/chosen": -330.4918032786885, "logps/rejected": -333.13432835820896, "loss": 0.2134, "rewards/chosen": 1.4651639344262295, "rewards/margins": 2.0126807145393357, "rewards/rejected": -0.5475167801131063, "step": 1024 }, { "epoch": 0.7027768255056565, "grad_norm": 0.7697208214498805, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79040935.72413793, "logits/rejected": -95869805.71428572, "logps/chosen": -247.17241379310346, "logps/rejected": -409.6, "loss": 0.2333, "rewards/chosen": 1.1109913793103448, "rewards/margins": 2542818.253848522, "rewards/rejected": -2542817.1428571427, "step": 1025 }, { "epoch": 0.7034624614329791, "grad_norm": 0.7152774884957073, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100593390.93333334, "logits/rejected": -88018703.05882353, "logps/chosen": -280.53333333333336, "logps/rejected": -393.4117647058824, "loss": 0.2601, "rewards/chosen": 2.8577489217122394, "rewards/margins": 6.409219509947533, "rewards/rejected": -3.551470588235294, "step": 1026 }, { "epoch": 0.7041480973603017, "grad_norm": 0.5500054435327776, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96468992.0, "logits/rejected": -75986807.46666667, "logps/chosen": -249.1764705882353, "logps/rejected": -294.93333333333334, "loss": 0.2164, "rewards/chosen": 2.6861002304974724, "rewards/margins": 7.148600230497473, "rewards/rejected": -4.4625, "step": 1027 }, { "epoch": 0.7048337332876242, "grad_norm": 0.6978442597281118, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89499994.58461538, "logits/rejected": -95270619.42857143, "logps/chosen": -287.5076923076923, "logps/rejected": -364.1904761904762, "loss": 0.2263, "rewards/chosen": 1.2701923076923076, "rewards/margins": 4.97257326007326, "rewards/rejected": -3.7023809523809526, "step": 1028 }, { "epoch": 0.7055193692149468, "grad_norm": 0.660331584730065, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89067279.05882353, "logits/rejected": -93812599.46666667, "logps/chosen": -357.4117647058824, "logps/rejected": -374.4, "loss": 0.2251, "rewards/chosen": 1.3823529411764706, "rewards/margins": 5.374019607843137, "rewards/rejected": -3.9916666666666667, "step": 1029 }, { "epoch": 0.7062050051422695, "grad_norm": 0.652928886026559, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104048172.91228071, "logits/rejected": -78392132.50704226, "logps/chosen": -312.140350877193, "logps/rejected": -435.38028169014086, "loss": 0.2091, "rewards/chosen": 1.0153508771929824, "rewards/margins": 5.032956510995799, "rewards/rejected": -4.017605633802817, "step": 1030 }, { "epoch": 0.7068906410695921, "grad_norm": 0.6272909081067055, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78028517.51724137, "logits/rejected": -87660953.6, "logps/chosen": -259.86206896551727, "logps/rejected": -269.0285714285714, "loss": 0.2212, "rewards/chosen": 5.33502302498653, "rewards/margins": 8.68502302498653, "rewards/rejected": -3.35, "step": 1031 }, { "epoch": 0.7075762769969146, "grad_norm": 0.5249235919662965, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -71954008.27586207, "logits/rejected": -81968683.88571429, "logps/chosen": -256.82758620689657, "logps/rejected": -298.51428571428573, "loss": 0.2286, "rewards/chosen": 0.8599137931034483, "rewards/margins": 4.374199507389163, "rewards/rejected": -3.5142857142857142, "step": 1032 }, { "epoch": 0.7082619129242372, "grad_norm": 0.5999385060159267, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97280792.77419356, "logits/rejected": -87571983.51515152, "logps/chosen": -332.9032258064516, "logps/rejected": -374.06060606060606, "loss": 0.2375, "rewards/chosen": 1.1703629032258065, "rewards/margins": 5.140059872922777, "rewards/rejected": -3.9696969696969697, "step": 1033 }, { "epoch": 0.7089475488515599, "grad_norm": 0.5895716853246239, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91327587.09677419, "logits/rejected": -82297328.48484848, "logps/chosen": -253.5483870967742, "logps/rejected": -321.93939393939394, "loss": 0.2422, "rewards/chosen": 0.8568548387096774, "rewards/margins": 2.374384181823432, "rewards/rejected": -1.5175293431137546, "step": 1034 }, { "epoch": 0.7096331847788824, "grad_norm": 0.6278449491559774, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91848148.61016949, "logits/rejected": -71728677.10144928, "logps/chosen": -292.06779661016947, "logps/rejected": -314.6666666666667, "loss": 0.215, "rewards/chosen": 2.698597019001589, "rewards/margins": 6.361640497262458, "rewards/rejected": -3.6630434782608696, "step": 1035 }, { "epoch": 0.710318820706205, "grad_norm": 0.5336127891143478, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75978785.5737705, "logits/rejected": -91523469.37313433, "logps/chosen": -212.19672131147541, "logps/rejected": -349.13432835820896, "loss": 0.2325, "rewards/chosen": 0.7653688524590164, "rewards/margins": 4.974324076339614, "rewards/rejected": -4.208955223880597, "step": 1036 }, { "epoch": 0.7110044566335276, "grad_norm": 0.5416174240191192, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81505529.08108108, "logits/rejected": -79691776.0, "logps/chosen": -280.2162162162162, "logps/rejected": -290.3703703703704, "loss": 0.2661, "rewards/chosen": 0.8688766891891891, "rewards/margins": 4.674432244744745, "rewards/rejected": -3.8055555555555554, "step": 1037 }, { "epoch": 0.7116900925608501, "grad_norm": 0.738956731356291, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96868449.52380952, "logits/rejected": -96856158.52307692, "logps/chosen": -307.8095238095238, "logps/rejected": -360.61538461538464, "loss": 0.2643, "rewards/chosen": 2.134674072265625, "rewards/margins": 6.053904841496394, "rewards/rejected": -3.919230769230769, "step": 1038 }, { "epoch": 0.7123757284881728, "grad_norm": 0.6254454614622224, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81131611.70149253, "logits/rejected": -71165649.83606558, "logps/chosen": -295.64179104477614, "logps/rejected": -318.1639344262295, "loss": 0.2487, "rewards/chosen": 1.1492537313432836, "rewards/margins": 4.8705652067531195, "rewards/rejected": -3.721311475409836, "step": 1039 }, { "epoch": 0.7130613644154954, "grad_norm": 0.5860018113652117, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107145402.18181819, "logits/rejected": -70441324.71232876, "logps/chosen": -290.3272727272727, "logps/rejected": -309.47945205479454, "loss": 0.2448, "rewards/chosen": 0.631534090909091, "rewards/margins": 4.0596162826899125, "rewards/rejected": -3.4280821917808217, "step": 1040 }, { "epoch": 0.713747000342818, "grad_norm": 0.6333401284412324, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -58453950.984126985, "logits/rejected": -80272525.78461538, "logps/chosen": -172.82539682539684, "logps/rejected": -368.73846153846154, "loss": 0.2211, "rewards/chosen": 0.691468253968254, "rewards/margins": 5.0376221001221, "rewards/rejected": -4.346153846153846, "step": 1041 }, { "epoch": 0.7144326362701405, "grad_norm": 0.5359806925761068, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86374226.44067797, "logits/rejected": -77746591.53623189, "logps/chosen": -348.20338983050846, "logps/rejected": -355.94202898550725, "loss": 0.2324, "rewards/chosen": 3.004212460275424, "rewards/margins": 6.786821155927598, "rewards/rejected": -3.782608695652174, "step": 1042 }, { "epoch": 0.7151182721974632, "grad_norm": 0.5313617878640146, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82706432.0, "logits/rejected": -81068032.0, "logps/chosen": -373.75, "logps/rejected": -330.0, "loss": 0.2616, "rewards/chosen": 0.806640625, "rewards/margins": 4.431640625, "rewards/rejected": -3.625, "step": 1043 }, { "epoch": 0.7158039081247858, "grad_norm": 0.581094606129714, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89703985.5483871, "logits/rejected": -86428082.42424242, "logps/chosen": -359.741935483871, "logps/rejected": -327.27272727272725, "loss": 0.2121, "rewards/chosen": 1.2348790322580645, "rewards/margins": 5.507606304985337, "rewards/rejected": -4.2727272727272725, "step": 1044 }, { "epoch": 0.7164895440521083, "grad_norm": 0.7771071355399423, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81924228.12903225, "logits/rejected": -104857600.0, "logps/chosen": -275.61290322580646, "logps/rejected": -432.4848484848485, "loss": 0.222, "rewards/chosen": 0.6307963709677419, "rewards/margins": 5.69897818914956, "rewards/rejected": -5.068181818181818, "step": 1045 }, { "epoch": 0.7171751799794309, "grad_norm": 0.5724592875206458, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80774177.03225806, "logits/rejected": -88398134.3030303, "logps/chosen": -217.29032258064515, "logps/rejected": -330.1818181818182, "loss": 0.2271, "rewards/chosen": 3.7215022425497732, "rewards/margins": 7.960138606186137, "rewards/rejected": -4.238636363636363, "step": 1046 }, { "epoch": 0.7178608159067535, "grad_norm": 0.45390960297877236, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89198865.06666666, "logits/rejected": -74942343.52941176, "logps/chosen": -322.93333333333334, "logps/rejected": -290.5882352941176, "loss": 0.2157, "rewards/chosen": 0.8651041666666667, "rewards/margins": 4.394515931372549, "rewards/rejected": -3.5294117647058822, "step": 1047 }, { "epoch": 0.7185464518340761, "grad_norm": 0.5673168770258306, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89693577.84615384, "logits/rejected": -74299099.42857143, "logps/chosen": -250.58461538461538, "logps/rejected": -319.4920634920635, "loss": 0.2417, "rewards/chosen": 0.8605769230769231, "rewards/margins": 5.035180097680098, "rewards/rejected": -4.174603174603175, "step": 1048 }, { "epoch": 0.7192320877613987, "grad_norm": 0.45168633956268733, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89573271.86440678, "logits/rejected": -84493950.14492753, "logps/chosen": -272.135593220339, "logps/rejected": -319.07246376811594, "loss": 0.2279, "rewards/chosen": 0.893604343220339, "rewards/margins": 4.001617772723849, "rewards/rejected": -3.10801342950351, "step": 1049 }, { "epoch": 0.7199177236887213, "grad_norm": 0.5675161671651437, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93492389.16129032, "logits/rejected": -90368186.18181819, "logps/chosen": -301.93548387096774, "logps/rejected": -372.3636363636364, "loss": 0.2315, "rewards/chosen": 0.8064516129032258, "rewards/margins": 4.984481915933529, "rewards/rejected": -4.178030303030303, "step": 1050 }, { "epoch": 0.7206033596160438, "grad_norm": 0.5025235974649394, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89128960.0, "logits/rejected": -91692145.77777778, "logps/chosen": -319.7142857142857, "logps/rejected": -439.1111111111111, "loss": 0.1979, "rewards/chosen": 3.180250440325056, "rewards/margins": 7.784417106991723, "rewards/rejected": -4.604166666666667, "step": 1051 }, { "epoch": 0.7212889955433665, "grad_norm": 0.5290841337962674, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101539717.73134328, "logits/rejected": -83954839.08196722, "logps/chosen": -266.74626865671644, "logps/rejected": -343.8688524590164, "loss": 0.2442, "rewards/chosen": 0.6613805970149254, "rewards/margins": 3553915.1531838756, "rewards/rejected": -3553914.4918032787, "step": 1052 }, { "epoch": 0.7219746314706891, "grad_norm": 0.5950420707398795, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73619425.43283582, "logits/rejected": -69927986.36065574, "logps/chosen": -177.43283582089552, "logps/rejected": -305.3114754098361, "loss": 0.2421, "rewards/chosen": 0.6907649253731343, "rewards/margins": 4.850600990946905, "rewards/rejected": -4.159836065573771, "step": 1053 }, { "epoch": 0.7226602673980117, "grad_norm": 0.5875314699774761, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90280674.62295082, "logits/rejected": -85012907.94029851, "logps/chosen": -338.0983606557377, "logps/rejected": -310.92537313432837, "loss": 0.2311, "rewards/chosen": 1.0020491803278688, "rewards/margins": 3.5468252997308536, "rewards/rejected": -2.544776119402985, "step": 1054 }, { "epoch": 0.7233459033253342, "grad_norm": 0.49210895001437316, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99090432.0, "logits/rejected": -107610112.0, "logps/chosen": -291.5, "logps/rejected": -363.5, "loss": 0.2351, "rewards/chosen": 0.8779296875, "rewards/margins": 5.2216796875, "rewards/rejected": -4.34375, "step": 1055 }, { "epoch": 0.7240315392526568, "grad_norm": 0.48478921916303924, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86376448.0, "logits/rejected": -69533696.0, "logps/chosen": -246.25, "logps/rejected": -318.0, "loss": 0.2113, "rewards/chosen": 0.99853515625, "rewards/margins": 5.06884765625, "rewards/rejected": -4.0703125, "step": 1056 }, { "epoch": 0.7247171751799795, "grad_norm": 0.6150802516640981, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101089835.38983051, "logits/rejected": -96651353.04347827, "logps/chosen": -226.16949152542372, "logps/rejected": -306.3188405797101, "loss": 0.2241, "rewards/chosen": 3.9252795203257413, "rewards/margins": 8.443395462354726, "rewards/rejected": -4.518115942028985, "step": 1057 }, { "epoch": 0.725402811107302, "grad_norm": 0.6403495449591547, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105747300.84848484, "logits/rejected": -87471533.41935484, "logps/chosen": -322.1818181818182, "logps/rejected": -339.61290322580646, "loss": 0.251, "rewards/chosen": 2.526123046875, "rewards/margins": 7.247897240423387, "rewards/rejected": -4.721774193548387, "step": 1058 }, { "epoch": 0.7260884470346246, "grad_norm": 0.44084080590441194, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82487978.66666667, "logits/rejected": -85489784.47058824, "logps/chosen": -328.8, "logps/rejected": -347.29411764705884, "loss": 0.2207, "rewards/chosen": 1.1947916666666667, "rewards/margins": 5.621262254901961, "rewards/rejected": -4.426470588235294, "step": 1059 }, { "epoch": 0.7267740829619472, "grad_norm": 0.5287319382918122, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95850160.26229508, "logits/rejected": -69174715.2238806, "logps/chosen": -293.7704918032787, "logps/rejected": -315.7014925373134, "loss": 0.2281, "rewards/chosen": 1.132172131147541, "rewards/margins": 4.497140046696748, "rewards/rejected": -3.364967915549207, "step": 1060 }, { "epoch": 0.7274597188892697, "grad_norm": 0.8695743019438379, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88005485.71428572, "logits/rejected": -112197632.0, "logps/chosen": -260.85714285714283, "logps/rejected": -356.8888888888889, "loss": 0.2223, "rewards/chosen": 3.130254473005022, "rewards/margins": 7.210115584116133, "rewards/rejected": -4.079861111111111, "step": 1061 }, { "epoch": 0.7281453548165924, "grad_norm": 0.5693336015514494, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78690151.1641791, "logits/rejected": -93237315.14754099, "logps/chosen": -258.3880597014925, "logps/rejected": -378.75409836065575, "loss": 0.2359, "rewards/chosen": 0.9402985074626866, "rewards/margins": 5.346036212380719, "rewards/rejected": -4.405737704918033, "step": 1062 }, { "epoch": 0.728830990743915, "grad_norm": 0.6765029375269849, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83466649.6, "logits/rejected": -84749613.1764706, "logps/chosen": -286.1333333333333, "logps/rejected": -358.5882352941176, "loss": 0.2316, "rewards/chosen": 1.0072916666666667, "rewards/margins": 4.764644607843137, "rewards/rejected": -3.7573529411764706, "step": 1063 }, { "epoch": 0.7295166266712376, "grad_norm": 0.735560094836349, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87877433.80645162, "logits/rejected": -75243271.75757575, "logps/chosen": -220.6451612903226, "logps/rejected": -329.45454545454544, "loss": 0.2365, "rewards/chosen": 2.5090511691185737, "rewards/margins": 6.5166269266943315, "rewards/rejected": -4.007575757575758, "step": 1064 }, { "epoch": 0.7302022625985601, "grad_norm": 0.5069850528412906, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94633984.0, "logits/rejected": -82182144.0, "logps/chosen": -344.75, "logps/rejected": -339.5, "loss": 0.225, "rewards/chosen": 1.28857421875, "rewards/margins": 4.90576171875, "rewards/rejected": -3.6171875, "step": 1065 }, { "epoch": 0.7308878985258828, "grad_norm": 0.6768704362306479, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -68886111.45762712, "logits/rejected": -113793291.13043478, "logps/chosen": -187.38983050847457, "logps/rejected": -401.6231884057971, "loss": 0.2274, "rewards/chosen": 0.685646186440678, "rewards/margins": 5.2834722733972, "rewards/rejected": -4.5978260869565215, "step": 1066 }, { "epoch": 0.7315735344532054, "grad_norm": 0.5255234710985266, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -69771914.15873016, "logits/rejected": -80982331.07692307, "logps/chosen": -173.71428571428572, "logps/rejected": -323.9384615384615, "loss": 0.2195, "rewards/chosen": 0.7951388888888888, "rewards/margins": 4.8605235042735035, "rewards/rejected": -4.065384615384615, "step": 1067 }, { "epoch": 0.7322591703805279, "grad_norm": 0.6503770372218576, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96275408.73846154, "logits/rejected": -102261126.09523809, "logps/chosen": -278.15384615384613, "logps/rejected": -315.6825396825397, "loss": 0.2109, "rewards/chosen": 0.9245192307692308, "rewards/margins": 5.789598595848595, "rewards/rejected": -4.865079365079365, "step": 1068 }, { "epoch": 0.7329448063078505, "grad_norm": 0.6866326006398147, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77240559.37662338, "logits/rejected": -68877050.98039216, "logps/chosen": -232.93506493506493, "logps/rejected": -369.4117647058824, "loss": 0.243, "rewards/chosen": 0.859577922077922, "rewards/margins": 3.76061480167351, "rewards/rejected": -2.9010368795955883, "step": 1069 }, { "epoch": 0.7336304422351732, "grad_norm": 0.5820305517822095, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87898022.95652173, "logits/rejected": -90923979.9322034, "logps/chosen": -309.7971014492754, "logps/rejected": -366.10169491525426, "loss": 0.2483, "rewards/chosen": 0.9015936367753623, "rewards/margins": 3.984742399106533, "rewards/rejected": -3.0831487623311706, "step": 1070 }, { "epoch": 0.7343160781624957, "grad_norm": 0.6299225155909468, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83230720.0, "logits/rejected": -99483648.0, "logps/chosen": -253.5, "logps/rejected": -435.5, "loss": 0.2302, "rewards/chosen": 0.7490234375, "rewards/margins": 5.4208984375, "rewards/rejected": -4.671875, "step": 1071 }, { "epoch": 0.7350017140898183, "grad_norm": 0.6305010310792092, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99836250.14084508, "logits/rejected": -66593774.03508772, "logps/chosen": -303.77464788732397, "logps/rejected": -309.05263157894734, "loss": 0.2326, "rewards/chosen": 0.9797535211267606, "rewards/margins": 5.444665801828515, "rewards/rejected": -4.464912280701754, "step": 1072 }, { "epoch": 0.7356873500171409, "grad_norm": 0.5148506007936313, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -66556981.89473684, "logits/rejected": -70496571.07692307, "logps/chosen": -212.42105263157896, "logps/rejected": -313.53846153846155, "loss": 0.2692, "rewards/chosen": 0.7806332236842105, "rewards/margins": 4.295056300607287, "rewards/rejected": -3.514423076923077, "step": 1073 }, { "epoch": 0.7363729859444635, "grad_norm": 0.6351658210648208, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90647047.64179105, "logits/rejected": -71921999.73770492, "logps/chosen": -261.0149253731343, "logps/rejected": -314.75409836065575, "loss": 0.2234, "rewards/chosen": 2.6095745029734143, "rewards/margins": 7.146459748875054, "rewards/rejected": -4.536885245901639, "step": 1074 }, { "epoch": 0.7370586218717861, "grad_norm": 0.5830303430936867, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88966504.56338029, "logits/rejected": -75423887.71929824, "logps/chosen": -260.7323943661972, "logps/rejected": -320.56140350877195, "loss": 0.2531, "rewards/chosen": 0.9463028169014085, "rewards/margins": 3.89367123795404, "rewards/rejected": -2.9473684210526314, "step": 1075 }, { "epoch": 0.7377442577991087, "grad_norm": 0.4777342267413033, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91881472.0, "logits/rejected": -98172928.0, "logps/chosen": -200.75, "logps/rejected": -317.75, "loss": 0.2272, "rewards/chosen": 0.73974609375, "rewards/margins": 3.8015098571777344, "rewards/rejected": -3.0617637634277344, "step": 1076 }, { "epoch": 0.7384298937264313, "grad_norm": 0.5582021475488199, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -112091254.72463769, "logits/rejected": -75213112.40677966, "logps/chosen": -239.07246376811594, "logps/rejected": -289.6271186440678, "loss": 0.2329, "rewards/chosen": 2.148765232252038, "rewards/margins": 6.140290655980852, "rewards/rejected": -3.9915254237288136, "step": 1077 }, { "epoch": 0.7391155296537538, "grad_norm": 0.510782158716888, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89951688.86153845, "logits/rejected": -81755639.87301587, "logps/chosen": -243.69230769230768, "logps/rejected": -366.73015873015873, "loss": 0.2133, "rewards/chosen": 2.4523709810697114, "rewards/margins": 7.083323362022092, "rewards/rejected": -4.630952380952381, "step": 1078 }, { "epoch": 0.7398011655810764, "grad_norm": 0.6233802591997375, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82742178.9090909, "logits/rejected": -94710090.32258065, "logps/chosen": -254.3030303030303, "logps/rejected": -324.64516129032256, "loss": 0.2509, "rewards/chosen": 0.7310606060606061, "rewards/margins": 4.6584799608993155, "rewards/rejected": -3.9274193548387095, "step": 1079 }, { "epoch": 0.7404868015083991, "grad_norm": 0.647721510135222, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92419319.1724138, "logits/rejected": -96708666.51428571, "logps/chosen": -312.2758620689655, "logps/rejected": -308.1142857142857, "loss": 0.226, "rewards/chosen": 0.804266567887931, "rewards/margins": 4.922123710745074, "rewards/rejected": -4.117857142857143, "step": 1080 }, { "epoch": 0.7411724374357216, "grad_norm": 0.5482771543905315, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103967899.15151516, "logits/rejected": -95183640.77419356, "logps/chosen": -297.6969696969697, "logps/rejected": -300.1290322580645, "loss": 0.2368, "rewards/chosen": 2.457620331735322, "rewards/margins": 6.393104202703064, "rewards/rejected": -3.935483870967742, "step": 1081 }, { "epoch": 0.7418580733630442, "grad_norm": 0.49162132174921247, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78373566.17142858, "logits/rejected": -76871468.13793103, "logps/chosen": -280.45714285714286, "logps/rejected": -340.13793103448273, "loss": 0.2135, "rewards/chosen": 1.1839285714285714, "rewards/margins": 6.041687192118227, "rewards/rejected": -4.857758620689655, "step": 1082 }, { "epoch": 0.7425437092903668, "grad_norm": 0.4836707489329513, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87842970.56603773, "logits/rejected": -101669928.96, "logps/chosen": -277.58490566037733, "logps/rejected": -364.8, "loss": 0.1851, "rewards/chosen": 1.1485849056603774, "rewards/margins": 5.798584905660378, "rewards/rejected": -4.65, "step": 1083 }, { "epoch": 0.7432293452176894, "grad_norm": 0.6271931608036599, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74182839.40298508, "logits/rejected": -82442139.27868852, "logps/chosen": -222.32835820895522, "logps/rejected": -340.72131147540983, "loss": 0.2519, "rewards/chosen": 0.7332089552238806, "rewards/margins": 4.975012233912405, "rewards/rejected": -4.241803278688525, "step": 1084 }, { "epoch": 0.743914981145012, "grad_norm": 0.5136301101284293, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89071764.94545455, "logits/rejected": -78025545.64383562, "logps/chosen": -276.94545454545454, "logps/rejected": -353.3150684931507, "loss": 0.209, "rewards/chosen": 2.9360695578835228, "rewards/margins": 6.990864078431468, "rewards/rejected": -4.054794520547945, "step": 1085 }, { "epoch": 0.7446006170723346, "grad_norm": 0.4868911243512252, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72656169.29032259, "logits/rejected": -91639187.39393939, "logps/chosen": -203.3548387096774, "logps/rejected": -347.1515151515151, "loss": 0.1977, "rewards/chosen": 3.0977689681514615, "rewards/margins": 7.203829574212068, "rewards/rejected": -4.106060606060606, "step": 1086 }, { "epoch": 0.7452862529996572, "grad_norm": 0.9050809988601249, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93799889.45454545, "logits/rejected": -74348347.61643836, "logps/chosen": -308.3636363636364, "logps/rejected": -285.8082191780822, "loss": 0.2473, "rewards/chosen": 0.5089488636363636, "rewards/margins": 4.169907767745952, "rewards/rejected": -3.660958904109589, "step": 1087 }, { "epoch": 0.7459718889269797, "grad_norm": 0.5996766106407693, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88334584.24242425, "logits/rejected": -85712631.74193548, "logps/chosen": -263.27272727272725, "logps/rejected": -370.06451612903226, "loss": 0.2131, "rewards/chosen": 2.261601765950521, "rewards/margins": 6.511601765950521, "rewards/rejected": -4.25, "step": 1088 }, { "epoch": 0.7466575248543024, "grad_norm": 0.582066244625886, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88364743.59322034, "logits/rejected": -61881180.75362319, "logps/chosen": -242.16949152542372, "logps/rejected": -317.2173913043478, "loss": 0.2231, "rewards/chosen": 0.7981991525423728, "rewards/margins": 5.080807848194547, "rewards/rejected": -4.282608695652174, "step": 1089 }, { "epoch": 0.747343160781625, "grad_norm": 0.7015491230435563, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75730488.8888889, "logits/rejected": -84709961.14285715, "logps/chosen": -322.6666666666667, "logps/rejected": -298.57142857142856, "loss": 0.2661, "rewards/chosen": 2.2354660034179688, "rewards/margins": 6.141716003417969, "rewards/rejected": -3.90625, "step": 1090 }, { "epoch": 0.7480287967089475, "grad_norm": 0.7031647162673388, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95529968.71641791, "logits/rejected": -82717175.60655738, "logps/chosen": -302.8059701492537, "logps/rejected": -360.39344262295083, "loss": 0.2354, "rewards/chosen": 0.6707089552238806, "rewards/margins": 5.3018564962074874, "rewards/rejected": -4.631147540983607, "step": 1091 }, { "epoch": 0.7487144326362701, "grad_norm": 0.5100285785399814, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73995034.74626866, "logits/rejected": -81548271.21311475, "logps/chosen": -290.6268656716418, "logps/rejected": -388.72131147540983, "loss": 0.2198, "rewards/chosen": 2.842805321536847, "rewards/margins": 6.9083790920286505, "rewards/rejected": -4.065573770491803, "step": 1092 }, { "epoch": 0.7494000685635928, "grad_norm": 0.554080842121244, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76173972.64516129, "logits/rejected": -79691776.0, "logps/chosen": -256.0, "logps/rejected": -318.54545454545456, "loss": 0.2099, "rewards/chosen": 1.0544354838709677, "rewards/margins": 4.895344574780059, "rewards/rejected": -3.840909090909091, "step": 1093 }, { "epoch": 0.7500857044909153, "grad_norm": 0.6378591152238235, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109620623.18644068, "logits/rejected": -81697747.47826087, "logps/chosen": -372.06779661016947, "logps/rejected": -320.231884057971, "loss": 0.1965, "rewards/chosen": 3.005296092922405, "rewards/margins": 6.8639917450963175, "rewards/rejected": -3.858695652173913, "step": 1094 }, { "epoch": 0.7507713404182379, "grad_norm": 0.6905825719729594, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95358735.05882353, "logits/rejected": -113455923.2, "logps/chosen": -293.4117647058824, "logps/rejected": -356.0, "loss": 0.2443, "rewards/chosen": 0.6723345588235294, "rewards/margins": 5.076501225490196, "rewards/rejected": -4.404166666666667, "step": 1095 }, { "epoch": 0.7514569763455605, "grad_norm": 0.5180931739172522, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72351744.0, "logits/rejected": -76669409.88235295, "logps/chosen": -204.53333333333333, "logps/rejected": -332.94117647058823, "loss": 0.2398, "rewards/chosen": 0.6698567708333333, "rewards/margins": 4.710297947303921, "rewards/rejected": -4.040441176470588, "step": 1096 }, { "epoch": 0.7521426122728831, "grad_norm": 0.7790986324111076, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75824300.88311689, "logits/rejected": -76648849.56862745, "logps/chosen": -257.24675324675326, "logps/rejected": -322.19607843137254, "loss": 0.2779, "rewards/chosen": 0.7670454545454546, "rewards/margins": 4.149398395721925, "rewards/rejected": -3.3823529411764706, "step": 1097 }, { "epoch": 0.7528282482002057, "grad_norm": 0.5450265886022038, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83267248.26229508, "logits/rejected": -69174715.2238806, "logps/chosen": -281.9672131147541, "logps/rejected": -263.4029850746269, "loss": 0.2268, "rewards/chosen": 1.0625, "rewards/margins": 4.043139557340252, "rewards/rejected": -2.9806395573402518, "step": 1098 }, { "epoch": 0.7535138841275283, "grad_norm": 0.6559247160975332, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104546910.81481482, "logits/rejected": -74363876.32432432, "logps/chosen": -382.81481481481484, "logps/rejected": -354.1621621621622, "loss": 0.1805, "rewards/chosen": 1.5451388888888888, "rewards/margins": 5.788382132132131, "rewards/rejected": -4.243243243243243, "step": 1099 }, { "epoch": 0.7541995200548509, "grad_norm": 0.673967841956396, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90521844.53731343, "logits/rejected": -83267248.26229508, "logps/chosen": -263.8805970149254, "logps/rejected": -341.7704918032787, "loss": 0.2132, "rewards/chosen": 1.3973880597014925, "rewards/margins": 5.1883716662588695, "rewards/rejected": -3.790983606557377, "step": 1100 }, { "epoch": 0.7548851559821734, "grad_norm": 0.8524059356619811, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87660953.6, "logits/rejected": -99090432.0, "logps/chosen": -276.2, "logps/rejected": -383.6666666666667, "loss": 0.2772, "rewards/chosen": 0.7296875, "rewards/margins": 3.731952794392904, "rewards/rejected": -3.002265294392904, "step": 1101 }, { "epoch": 0.755570791909496, "grad_norm": 0.5954066391461791, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85063428.49122807, "logits/rejected": -78805655.43661971, "logps/chosen": -269.4736842105263, "logps/rejected": -324.50704225352115, "loss": 0.2286, "rewards/chosen": 0.8689692982456141, "rewards/margins": 2.3904880592491353, "rewards/rejected": -1.5215187610035212, "step": 1102 }, { "epoch": 0.7562564278368187, "grad_norm": 0.5428771656187908, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98506225.37142856, "logits/rejected": -88225015.1724138, "logps/chosen": -250.28571428571428, "logps/rejected": -390.0689655172414, "loss": 0.2334, "rewards/chosen": 0.9464285714285714, "rewards/margins": 5.511083743842364, "rewards/rejected": -4.564655172413793, "step": 1103 }, { "epoch": 0.7569420637641412, "grad_norm": 0.9502932634308228, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98936229.64705883, "logits/rejected": -87940573.86666666, "logps/chosen": -291.29411764705884, "logps/rejected": -314.1333333333333, "loss": 0.2334, "rewards/chosen": 0.6985294117647058, "rewards/margins": 4.865196078431373, "rewards/rejected": -4.166666666666667, "step": 1104 }, { "epoch": 0.7576276996914638, "grad_norm": 0.5679240812796075, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87141360.71641791, "logits/rejected": -62708282.75409836, "logps/chosen": -263.1641791044776, "logps/rejected": -273.04918032786884, "loss": 0.2347, "rewards/chosen": 1.0578358208955223, "rewards/margins": 3.7455872156989294, "rewards/rejected": -2.687751394803407, "step": 1105 }, { "epoch": 0.7583133356187864, "grad_norm": 0.6668671777525337, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79953920.0, "logits/rejected": -74842112.0, "logps/chosen": -235.0, "logps/rejected": -357.75, "loss": 0.25, "rewards/chosen": 2.6088626384735107, "rewards/margins": 6.409643888473511, "rewards/rejected": -3.80078125, "step": 1106 }, { "epoch": 0.7589989715461091, "grad_norm": 0.5291004979540014, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78363579.73333333, "logits/rejected": -69514420.70588236, "logps/chosen": -304.0, "logps/rejected": -376.47058823529414, "loss": 0.2252, "rewards/chosen": 0.6721354166666667, "rewards/margins": 4.995664828431372, "rewards/rejected": -4.323529411764706, "step": 1107 }, { "epoch": 0.7596846074734316, "grad_norm": 0.5154049249865105, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78839808.0, "logits/rejected": -92340224.0, "logps/chosen": -278.75, "logps/rejected": -348.75, "loss": 0.2223, "rewards/chosen": 0.86328125, "rewards/margins": 5.44140625, "rewards/rejected": -4.578125, "step": 1108 }, { "epoch": 0.7603702434007542, "grad_norm": 0.6043525043732673, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103989812.96551724, "logits/rejected": -55304894.17142857, "logps/chosen": -278.3448275862069, "logps/rejected": -260.1142857142857, "loss": 0.2252, "rewards/chosen": 3.042016917261584, "rewards/margins": 6.749159774404442, "rewards/rejected": -3.7071428571428573, "step": 1109 }, { "epoch": 0.7610558793280768, "grad_norm": 0.5173476738728467, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91113188.43076923, "logits/rejected": -85217605.07936507, "logps/chosen": -381.53846153846155, "logps/rejected": -324.57142857142856, "loss": 0.1951, "rewards/chosen": 1.6115384615384616, "rewards/margins": 5.245712142404037, "rewards/rejected": -3.6341736808655756, "step": 1110 }, { "epoch": 0.7617415152553993, "grad_norm": 0.9202965831001237, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105472764.58666667, "logits/rejected": -76763676.98113208, "logps/chosen": -272.85333333333335, "logps/rejected": -278.64150943396226, "loss": 0.2953, "rewards/chosen": 0.515, "rewards/margins": 4.33811320754717, "rewards/rejected": -3.82311320754717, "step": 1111 }, { "epoch": 0.762427151182722, "grad_norm": 0.6555622539590004, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82565650.96296297, "logits/rejected": -70112892.54054055, "logps/chosen": -239.85185185185185, "logps/rejected": -319.13513513513516, "loss": 0.1942, "rewards/chosen": 7.015552662037037, "rewards/margins": 11.265552662037038, "rewards/rejected": -4.25, "step": 1112 }, { "epoch": 0.7631127871100446, "grad_norm": 0.6131916091812173, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77537944.21621622, "logits/rejected": -76274194.96296297, "logps/chosen": -263.56756756756755, "logps/rejected": -287.4074074074074, "loss": 0.2588, "rewards/chosen": 0.8146114864864865, "rewards/margins": 4.370167042042042, "rewards/rejected": -3.5555555555555554, "step": 1113 }, { "epoch": 0.7637984230373671, "grad_norm": 0.5033752687687324, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78951604.70588236, "logits/rejected": -80950067.2, "logps/chosen": -299.29411764705884, "logps/rejected": -266.1333333333333, "loss": 0.2297, "rewards/chosen": 1.3864889705882353, "rewards/margins": 4.478155637254902, "rewards/rejected": -3.091666666666667, "step": 1114 }, { "epoch": 0.7644840589646897, "grad_norm": 0.630237236814628, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80917803.32307692, "logits/rejected": -73167303.1111111, "logps/chosen": -234.09230769230768, "logps/rejected": -329.3968253968254, "loss": 0.2401, "rewards/chosen": 0.619951923076923, "rewards/margins": 4.790586843711844, "rewards/rejected": -4.170634920634921, "step": 1115 }, { "epoch": 0.7651696948920124, "grad_norm": 0.4859569860012425, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94166236.86274509, "logits/rejected": -79419418.5974026, "logps/chosen": -233.72549019607843, "logps/rejected": -322.90909090909093, "loss": 0.1936, "rewards/chosen": 5.1794762704886645, "rewards/margins": 9.169736010748405, "rewards/rejected": -3.99025974025974, "step": 1116 }, { "epoch": 0.7658553308193349, "grad_norm": 0.6223961054125469, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83954839.08196722, "logits/rejected": -90271438.3283582, "logps/chosen": -306.2295081967213, "logps/rejected": -417.43283582089555, "loss": 0.2101, "rewards/chosen": 0.8878073770491803, "rewards/margins": 4.884076033765599, "rewards/rejected": -3.996268656716418, "step": 1117 }, { "epoch": 0.7665409667466575, "grad_norm": 0.5888798753410954, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86237432.24242425, "logits/rejected": -71032567.74193548, "logps/chosen": -276.3636363636364, "logps/rejected": -308.1290322580645, "loss": 0.195, "rewards/chosen": 1.3920454545454546, "rewards/margins": 4.411445774290918, "rewards/rejected": -3.0194003197454635, "step": 1118 }, { "epoch": 0.7672266026739801, "grad_norm": 0.6187053519145309, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72658250.83076923, "logits/rejected": -74299099.42857143, "logps/chosen": -319.0153846153846, "logps/rejected": -365.7142857142857, "loss": 0.2387, "rewards/chosen": 0.8620192307692308, "rewards/margins": 4.834241452991453, "rewards/rejected": -3.9722222222222223, "step": 1119 }, { "epoch": 0.7679122386013028, "grad_norm": 0.610530725712522, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92694118.4, "logits/rejected": -85613146.35294117, "logps/chosen": -250.8, "logps/rejected": -370.8235294117647, "loss": 0.2248, "rewards/chosen": 2.5994672139485675, "rewards/margins": 6.87887897865445, "rewards/rejected": -4.279411764705882, "step": 1120 }, { "epoch": 0.7685978745286253, "grad_norm": 0.7374541482441666, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91768478.89655173, "logits/rejected": -84365429.02857143, "logps/chosen": -277.51724137931035, "logps/rejected": -366.62857142857143, "loss": 0.1862, "rewards/chosen": 1.396551724137931, "rewards/margins": 5.782266009852217, "rewards/rejected": -4.385714285714286, "step": 1121 }, { "epoch": 0.7692835104559479, "grad_norm": 0.5245012996907337, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101572061.86666666, "logits/rejected": -81418842.35294117, "logps/chosen": -384.53333333333336, "logps/rejected": -360.94117647058823, "loss": 0.2103, "rewards/chosen": 1.4385416666666666, "rewards/margins": 5.644424019607843, "rewards/rejected": -4.205882352941177, "step": 1122 }, { "epoch": 0.7699691463832705, "grad_norm": 0.5003899532720933, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73038742.06896552, "logits/rejected": -117620267.88571429, "logps/chosen": -277.51724137931035, "logps/rejected": -376.6857142857143, "loss": 0.2101, "rewards/chosen": 1.0172413793103448, "rewards/margins": 5.981527093596059, "rewards/rejected": -4.964285714285714, "step": 1123 }, { "epoch": 0.770654782310593, "grad_norm": 0.5631983260952476, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102175995.80327868, "logits/rejected": -66044637.6119403, "logps/chosen": -288.5245901639344, "logps/rejected": -352.4776119402985, "loss": 0.2415, "rewards/chosen": 0.6239754098360656, "rewards/margins": 3.6948709322241253, "rewards/rejected": -3.0708955223880596, "step": 1124 }, { "epoch": 0.7713404182379157, "grad_norm": 0.7802072735153409, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82111566.76923077, "logits/rejected": -70104795.42857143, "logps/chosen": -324.18461538461537, "logps/rejected": -336.76190476190476, "loss": 0.2266, "rewards/chosen": 2.4935021033653846, "rewards/margins": 6.1284227382860195, "rewards/rejected": -3.634920634920635, "step": 1125 }, { "epoch": 0.7720260541652383, "grad_norm": 0.48687131317844945, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92068410.75409836, "logits/rejected": -97908827.70149253, "logps/chosen": -349.11475409836066, "logps/rejected": -338.6268656716418, "loss": 0.1956, "rewards/chosen": 2.4948880555199797, "rewards/margins": 7.166529846564757, "rewards/rejected": -4.6716417910447765, "step": 1126 }, { "epoch": 0.7727116900925608, "grad_norm": 0.6778585932384119, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98855406.34482759, "logits/rejected": -74658611.2, "logps/chosen": -289.6551724137931, "logps/rejected": -318.1714285714286, "loss": 0.2038, "rewards/chosen": 0.9571659482758621, "rewards/margins": 5.189308805418719, "rewards/rejected": -4.232142857142857, "step": 1127 }, { "epoch": 0.7733973260198834, "grad_norm": 0.5383142481437201, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90277400.38095239, "logits/rejected": -108664737.47692308, "logps/chosen": -263.87301587301585, "logps/rejected": -325.4153846153846, "loss": 0.2247, "rewards/chosen": 1.0952380952380953, "rewards/margins": 5.318315018315019, "rewards/rejected": -4.223076923076923, "step": 1128 }, { "epoch": 0.774082961947206, "grad_norm": 0.7397328122830312, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96218585.79104477, "logits/rejected": -100869573.24590164, "logps/chosen": -359.1641791044776, "logps/rejected": -348.8524590163934, "loss": 0.2018, "rewards/chosen": 1.4188432835820894, "rewards/margins": 4.400903450238084, "rewards/rejected": -2.982060166655994, "step": 1129 }, { "epoch": 0.7747685978745287, "grad_norm": 0.5429353190878433, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88569719.46666667, "logits/rejected": -92768135.52941176, "logps/chosen": -236.53333333333333, "logps/rejected": -326.11764705882354, "loss": 0.2196, "rewards/chosen": 0.6979166666666666, "rewards/margins": 5.003063725490196, "rewards/rejected": -4.305147058823529, "step": 1130 }, { "epoch": 0.7754542338018512, "grad_norm": 0.5465148895754072, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -67920664.77419356, "logits/rejected": -83377679.51515152, "logps/chosen": -233.29032258064515, "logps/rejected": -409.6969696969697, "loss": 0.2114, "rewards/chosen": 0.780241935483871, "rewards/margins": 5.166605571847508, "rewards/rejected": -4.386363636363637, "step": 1131 }, { "epoch": 0.7761398697291738, "grad_norm": 0.5619748260083356, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82512083.86206897, "logits/rejected": -80770311.31428571, "logps/chosen": -230.06896551724137, "logps/rejected": -378.0571428571429, "loss": 0.1927, "rewards/chosen": 1.0237068965517242, "rewards/margins": 4.616564039408868, "rewards/rejected": -3.592857142857143, "step": 1132 }, { "epoch": 0.7768255056564964, "grad_norm": 0.6071628290434249, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88892184.77419356, "logits/rejected": -78738525.0909091, "logps/chosen": -304.51612903225805, "logps/rejected": -336.0, "loss": 0.2258, "rewards/chosen": 0.9879032258064516, "rewards/margins": 5.593963831867058, "rewards/rejected": -4.606060606060606, "step": 1133 }, { "epoch": 0.777511141583819, "grad_norm": 0.48368319086715517, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89353654.85714285, "logits/rejected": -80507335.1111111, "logps/chosen": -359.42857142857144, "logps/rejected": -351.55555555555554, "loss": 0.1961, "rewards/chosen": 3.373720441545759, "rewards/margins": 8.509137108212427, "rewards/rejected": -5.135416666666667, "step": 1134 }, { "epoch": 0.7781967775111416, "grad_norm": 0.6167638768386194, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78028517.51724137, "logits/rejected": -69984958.17142858, "logps/chosen": -286.8965517241379, "logps/rejected": -337.14285714285717, "loss": 0.2031, "rewards/chosen": 1.2327586206896552, "rewards/margins": 5.918472906403942, "rewards/rejected": -4.685714285714286, "step": 1135 }, { "epoch": 0.7788824134384642, "grad_norm": 0.6145202225846513, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100663296.0, "logits/rejected": -108551091.58208956, "logps/chosen": -277.37704918032784, "logps/rejected": -389.7313432835821, "loss": 0.1994, "rewards/chosen": 1.1711065573770492, "rewards/margins": 4.917375214093467, "rewards/rejected": -3.746268656716418, "step": 1136 }, { "epoch": 0.7795680493657867, "grad_norm": 0.8024074830029159, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86515345.19402985, "logits/rejected": -82785934.68852459, "logps/chosen": -304.95522388059703, "logps/rejected": -419.672131147541, "loss": 0.2463, "rewards/chosen": 2.090239795286264, "rewards/margins": 4.991879139548558, "rewards/rejected": -2.901639344262295, "step": 1137 }, { "epoch": 0.7802536852931093, "grad_norm": 0.5480267647862982, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86918884.43076923, "logits/rejected": -73034150.6031746, "logps/chosen": -290.7076923076923, "logps/rejected": -374.6031746031746, "loss": 0.2478, "rewards/chosen": 0.9625, "rewards/margins": 3.9260363382006447, "rewards/rejected": -2.9635363382006448, "step": 1138 }, { "epoch": 0.780939321220432, "grad_norm": 0.5780850597387739, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -69383740.7457627, "logits/rejected": -90694225.6231884, "logps/chosen": -196.20338983050848, "logps/rejected": -324.8695652173913, "loss": 0.2249, "rewards/chosen": 0.8654661016949152, "rewards/margins": 5.09735015966593, "rewards/rejected": -4.231884057971015, "step": 1139 }, { "epoch": 0.7816249571477546, "grad_norm": 0.6398383520490893, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87903159.88732395, "logits/rejected": -76233314.80701755, "logps/chosen": -258.2535211267606, "logps/rejected": -342.17543859649123, "loss": 0.2691, "rewards/chosen": 1.8391272316516285, "rewards/margins": 5.527723722879699, "rewards/rejected": -3.68859649122807, "step": 1140 }, { "epoch": 0.7823105930750771, "grad_norm": 0.5374557245712719, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82887436.1904762, "logits/rejected": -78788387.44615385, "logps/chosen": -285.46031746031747, "logps/rejected": -304.73846153846154, "loss": 0.2032, "rewards/chosen": 1.1130952380952381, "rewards/margins": 5.682326007326007, "rewards/rejected": -4.569230769230769, "step": 1141 }, { "epoch": 0.7829962290023997, "grad_norm": 0.544283723625695, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97118813.74647887, "logits/rejected": -87050204.07017544, "logps/chosen": -271.77464788732397, "logps/rejected": -345.82456140350877, "loss": 0.2206, "rewards/chosen": 0.9014084507042254, "rewards/margins": 3.007993903135352, "rewards/rejected": -2.1065854524311267, "step": 1142 }, { "epoch": 0.7836818649297224, "grad_norm": 0.6749286051820065, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98057743.51515152, "logits/rejected": -99580894.96774194, "logps/chosen": -371.8787878787879, "logps/rejected": -374.96774193548384, "loss": 0.2346, "rewards/chosen": 2.990972576719342, "rewards/margins": 7.563553221880632, "rewards/rejected": -4.57258064516129, "step": 1143 }, { "epoch": 0.7843675008570449, "grad_norm": 0.7954226966600926, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77801385.46478873, "logits/rejected": -107285881.26315789, "logps/chosen": -249.2394366197183, "logps/rejected": -391.29824561403507, "loss": 0.2505, "rewards/chosen": 0.9955985915492958, "rewards/margins": 5.368405609093156, "rewards/rejected": -4.37280701754386, "step": 1144 }, { "epoch": 0.7850531367843675, "grad_norm": 0.7495949439616832, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -106439058.8852459, "logits/rejected": -92587695.76119404, "logps/chosen": -244.19672131147541, "logps/rejected": -324.7761194029851, "loss": 0.2109, "rewards/chosen": 3.077328291095671, "rewards/margins": 7.872104410498656, "rewards/rejected": -4.794776119402985, "step": 1145 }, { "epoch": 0.7857387727116901, "grad_norm": 0.6642708785389055, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87370578.70769231, "logits/rejected": -87348045.20634921, "logps/chosen": -291.2, "logps/rejected": -367.74603174603175, "loss": 0.2343, "rewards/chosen": 2.6389371431790867, "rewards/margins": 6.797667301909246, "rewards/rejected": -4.158730158730159, "step": 1146 }, { "epoch": 0.7864244086390126, "grad_norm": 0.5682729282958113, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87050204.07017544, "logits/rejected": -84831275.26760563, "logps/chosen": -288.56140350877195, "logps/rejected": -329.46478873239437, "loss": 0.2321, "rewards/chosen": 2.7548900403474508, "rewards/margins": 7.103481589643225, "rewards/rejected": -4.348591549295775, "step": 1147 }, { "epoch": 0.7871100445663353, "grad_norm": 0.5430655657331181, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73619425.43283582, "logits/rejected": -75978785.5737705, "logps/chosen": -282.5074626865672, "logps/rejected": -320.0, "loss": 0.2301, "rewards/chosen": 1.1632462686567164, "rewards/margins": 5.114065940787864, "rewards/rejected": -3.9508196721311477, "step": 1148 }, { "epoch": 0.7877956804936579, "grad_norm": 0.7282070427725258, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86223888.78688525, "logits/rejected": -81695025.6716418, "logps/chosen": -293.5081967213115, "logps/rejected": -346.02985074626866, "loss": 0.2145, "rewards/chosen": 2.8713899205942623, "rewards/margins": 941983.5281063385, "rewards/rejected": -941980.6567164179, "step": 1149 }, { "epoch": 0.7884813164209804, "grad_norm": 0.6968559607187887, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82736955.61643836, "logits/rejected": -59711636.945454545, "logps/chosen": -287.56164383561645, "logps/rejected": -298.4727272727273, "loss": 0.2584, "rewards/chosen": 0.6583904109589042, "rewards/margins": 3.2021276490625974, "rewards/rejected": -2.5437372381036933, "step": 1150 }, { "epoch": 0.789166952348303, "grad_norm": 0.4659487797435672, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80368276.64516129, "logits/rejected": -96087691.63636364, "logps/chosen": -212.6451612903226, "logps/rejected": -343.27272727272725, "loss": 0.2067, "rewards/chosen": 0.8235887096774194, "rewards/margins": 2.6005186228225425, "rewards/rejected": -1.776929913145123, "step": 1151 }, { "epoch": 0.7898525882756257, "grad_norm": 0.518038254653391, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86900736.0, "logits/rejected": -70320128.0, "logps/chosen": -318.0, "logps/rejected": -262.0, "loss": 0.226, "rewards/chosen": 3.0439066886901855, "rewards/margins": 5.39008903503418, "rewards/rejected": -2.346182346343994, "step": 1152 }, { "epoch": 0.7905382242029483, "grad_norm": 0.5711689590811175, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88215684.12903225, "logits/rejected": -84775780.84848484, "logps/chosen": -286.96774193548384, "logps/rejected": -365.09090909090907, "loss": 0.211, "rewards/chosen": 1.127016129032258, "rewards/margins": 5.2292888563049855, "rewards/rejected": -4.1022727272727275, "step": 1153 }, { "epoch": 0.7912238601302708, "grad_norm": 0.5348803857899946, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79869000.11267605, "logits/rejected": -91759598.03508772, "logps/chosen": -334.4225352112676, "logps/rejected": -302.03508771929825, "loss": 0.2242, "rewards/chosen": 1.341549295774648, "rewards/margins": 4.889794909809735, "rewards/rejected": -3.5482456140350878, "step": 1154 }, { "epoch": 0.7919094960575934, "grad_norm": 0.5789211154905167, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79578416.43243243, "logits/rejected": -66720502.518518515, "logps/chosen": -307.6756756756757, "logps/rejected": -285.9259259259259, "loss": 0.2679, "rewards/chosen": 0.8859797297297297, "rewards/margins": 4.923016766766767, "rewards/rejected": -4.037037037037037, "step": 1155 }, { "epoch": 0.792595131984916, "grad_norm": 0.6316340769348839, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94837873.77777778, "logits/rejected": -96843483.42857143, "logps/chosen": -262.6666666666667, "logps/rejected": -303.42857142857144, "loss": 0.2577, "rewards/chosen": 1.0225694444444444, "rewards/margins": 4.982390873015873, "rewards/rejected": -3.9598214285714284, "step": 1156 }, { "epoch": 0.7932807679122386, "grad_norm": 0.6987084005073915, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79225742.22222222, "logits/rejected": -100340657.23076923, "logps/chosen": -239.4920634920635, "logps/rejected": -389.4153846153846, "loss": 0.1877, "rewards/chosen": 1.119047619047619, "rewards/margins": 4.053164862887764, "rewards/rejected": -2.934117243840144, "step": 1157 }, { "epoch": 0.7939664038395612, "grad_norm": 0.5772173499771198, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80670446.93333334, "logits/rejected": -72043339.29411764, "logps/chosen": -264.8, "logps/rejected": -332.70588235294116, "loss": 0.2094, "rewards/chosen": 1.01875, "rewards/margins": 5.658455882352941, "rewards/rejected": -4.639705882352941, "step": 1158 }, { "epoch": 0.7946520397668838, "grad_norm": 0.5893429484133764, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101558080.85333334, "logits/rejected": -99792780.0754717, "logps/chosen": -263.46666666666664, "logps/rejected": -360.1509433962264, "loss": 0.2253, "rewards/chosen": 1.0316666666666667, "rewards/margins": 5.682610062893081, "rewards/rejected": -4.650943396226415, "step": 1159 }, { "epoch": 0.7953376756942063, "grad_norm": 0.5421635501870976, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86925430.72463769, "logits/rejected": -101942914.16949153, "logps/chosen": -316.7536231884058, "logps/rejected": -377.22033898305085, "loss": 0.2275, "rewards/chosen": 1.4293478260869565, "rewards/margins": 5.997144436256448, "rewards/rejected": -4.567796610169491, "step": 1160 }, { "epoch": 0.796023311621529, "grad_norm": 0.47893054355778897, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80171125.02857143, "logits/rejected": -76365259.03448276, "logps/chosen": -246.62857142857143, "logps/rejected": -350.0689655172414, "loss": 0.2397, "rewards/chosen": 2.4482070922851564, "rewards/margins": 7.073207092285156, "rewards/rejected": -4.625, "step": 1161 }, { "epoch": 0.7967089475488516, "grad_norm": 0.5212640418627833, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76799152.55172414, "logits/rejected": -95270619.42857143, "logps/chosen": -252.68965517241378, "logps/rejected": -385.37142857142857, "loss": 0.2091, "rewards/chosen": 0.9730603448275862, "rewards/margins": 5.455203201970443, "rewards/rejected": -4.482142857142857, "step": 1162 }, { "epoch": 0.7973945834761742, "grad_norm": 0.4843139893810499, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74811131.34545454, "logits/rejected": -77106246.1369863, "logps/chosen": -268.8, "logps/rejected": -308.6027397260274, "loss": 0.1877, "rewards/chosen": 3.0107649369673295, "rewards/margins": 7.63062795066596, "rewards/rejected": -4.61986301369863, "step": 1163 }, { "epoch": 0.7980802194034967, "grad_norm": 0.6456774996334074, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -67895296.0, "logits/rejected": -92143616.0, "logps/chosen": -221.0, "logps/rejected": -367.75, "loss": 0.2179, "rewards/chosen": 2.39573335647583, "rewards/margins": 7.56760835647583, "rewards/rejected": -5.171875, "step": 1164 }, { "epoch": 0.7987658553308193, "grad_norm": 0.46458390127094124, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95724841.29032259, "logits/rejected": -76132972.60606061, "logps/chosen": -203.61290322580646, "logps/rejected": -297.2121212121212, "loss": 0.1972, "rewards/chosen": 1.1265120967741935, "rewards/margins": 6.058330278592376, "rewards/rejected": -4.931818181818182, "step": 1165 }, { "epoch": 0.799451491258142, "grad_norm": 0.6130862367565524, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95452191.03030303, "logits/rejected": -92680588.38709678, "logps/chosen": -321.93939393939394, "logps/rejected": -328.7741935483871, "loss": 0.2203, "rewards/chosen": 0.9190340909090909, "rewards/margins": 5.314195381231672, "rewards/rejected": -4.395161290322581, "step": 1166 }, { "epoch": 0.8001371271854645, "grad_norm": 0.5084509539349159, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74847650.25352113, "logits/rejected": -70493740.91228071, "logps/chosen": -233.69014084507043, "logps/rejected": -349.4736842105263, "loss": 0.2238, "rewards/chosen": 1.090669014084507, "rewards/margins": 4.9941777860143315, "rewards/rejected": -3.9035087719298245, "step": 1167 }, { "epoch": 0.8008227631127871, "grad_norm": 0.6609854881084777, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72801133.71428572, "logits/rejected": -81427350.06896552, "logps/chosen": -232.9142857142857, "logps/rejected": -305.9310344827586, "loss": 0.2574, "rewards/chosen": 0.7803571428571429, "rewards/margins": 5.013115763546797, "rewards/rejected": -4.232758620689655, "step": 1168 }, { "epoch": 0.8015083990401097, "grad_norm": 0.48157884062061207, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -69031253.33333333, "logits/rejected": -82687707.42857143, "logps/chosen": -258.44444444444446, "logps/rejected": -363.42857142857144, "loss": 0.2274, "rewards/chosen": 1.0529513888888888, "rewards/margins": 5.865451388888889, "rewards/rejected": -4.8125, "step": 1169 }, { "epoch": 0.8021940349674322, "grad_norm": 1.001519841356803, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87361360.45714286, "logits/rejected": -97336779.03448276, "logps/chosen": -333.7142857142857, "logps/rejected": -341.7931034482759, "loss": 0.2411, "rewards/chosen": 1.1241071428571427, "rewards/margins": 4.709506902083975, "rewards/rejected": -3.585399759226832, "step": 1170 }, { "epoch": 0.8028796708947549, "grad_norm": 0.7305051951977627, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92962278.81967214, "logits/rejected": -85951931.2238806, "logps/chosen": -322.62295081967216, "logps/rejected": -276.7761194029851, "loss": 0.2203, "rewards/chosen": 1.0060194672131149, "rewards/margins": 4.976168720944458, "rewards/rejected": -3.970149253731343, "step": 1171 }, { "epoch": 0.8035653068220775, "grad_norm": 0.5634209576969085, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82940884.73239437, "logits/rejected": -69831482.38596492, "logps/chosen": -209.1267605633803, "logps/rejected": -340.7719298245614, "loss": 0.2456, "rewards/chosen": 0.9507042253521126, "rewards/margins": 3.081454781093272, "rewards/rejected": -2.1307505557411597, "step": 1172 }, { "epoch": 0.8042509427494001, "grad_norm": 0.6315424996565994, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -71819390.03076923, "logits/rejected": -83686351.23809524, "logps/chosen": -225.96923076923076, "logps/rejected": -334.73015873015873, "loss": 0.2203, "rewards/chosen": 0.9211538461538461, "rewards/margins": 5.722741147741147, "rewards/rejected": -4.801587301587301, "step": 1173 }, { "epoch": 0.8049365786767226, "grad_norm": 0.772810395268705, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78127798.23728813, "logits/rejected": -97137649.1594203, "logps/chosen": -227.79661016949152, "logps/rejected": -328.57971014492756, "loss": 0.2097, "rewards/chosen": 2.7971199164956304, "rewards/margins": 5.576996846621876, "rewards/rejected": -2.7798769301262456, "step": 1174 }, { "epoch": 0.8056222146040453, "grad_norm": 0.5420834139623557, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78594804.18461539, "logits/rejected": -86415977.65079366, "logps/chosen": -210.95384615384614, "logps/rejected": -314.41269841269843, "loss": 0.2489, "rewards/chosen": 0.5413461538461538, "rewards/margins": 4.882615995115994, "rewards/rejected": -4.341269841269841, "step": 1175 }, { "epoch": 0.8063078505313679, "grad_norm": 0.7161691341215493, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90327332.57142857, "logits/rejected": -84585130.66666667, "logps/chosen": -306.85714285714283, "logps/rejected": -315.55555555555554, "loss": 0.2009, "rewards/chosen": 0.8895089285714286, "rewards/margins": 5.514508928571429, "rewards/rejected": -4.625, "step": 1176 }, { "epoch": 0.8069934864586904, "grad_norm": 0.5384364477746167, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -68601751.86440678, "logits/rejected": -79691776.0, "logps/chosen": -257.89830508474574, "logps/rejected": -334.84057971014494, "loss": 0.2178, "rewards/chosen": 0.888771186440678, "rewards/margins": 5.588046548759519, "rewards/rejected": -4.699275362318841, "step": 1177 }, { "epoch": 0.807679122386013, "grad_norm": 0.7990789875602843, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -70349917.0909091, "logits/rejected": -107022402.06451613, "logps/chosen": -240.4848484848485, "logps/rejected": -362.3225806451613, "loss": 0.2367, "rewards/chosen": 0.9583333333333334, "rewards/margins": 3.1327166813676075, "rewards/rejected": -2.174383348034274, "step": 1178 }, { "epoch": 0.8083647583133357, "grad_norm": 0.8628177434422757, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88143934.06060606, "logits/rejected": -66702963.61290322, "logps/chosen": -295.27272727272725, "logps/rejected": -307.0967741935484, "loss": 0.2093, "rewards/chosen": 1.4299242424242424, "rewards/margins": 4.429168194037145, "rewards/rejected": -2.999243951612903, "step": 1179 }, { "epoch": 0.8090503942406582, "grad_norm": 0.7828233046084363, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92405760.0, "logits/rejected": -96534528.0, "logps/chosen": -320.25, "logps/rejected": -299.5, "loss": 0.2306, "rewards/chosen": 2.6684257984161377, "rewards/margins": 6.957488298416138, "rewards/rejected": -4.2890625, "step": 1180 }, { "epoch": 0.8097360301679808, "grad_norm": 0.6392286838024736, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91371299.44615385, "logits/rejected": -88546417.77777778, "logps/chosen": -291.2, "logps/rejected": -353.26984126984127, "loss": 0.2161, "rewards/chosen": 1.0754807692307693, "rewards/margins": 5.484210927960929, "rewards/rejected": -4.408730158730159, "step": 1181 }, { "epoch": 0.8104216660953034, "grad_norm": 0.5675257776598388, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -106422638.80597015, "logits/rejected": -104926359.08196722, "logps/chosen": -343.4029850746269, "logps/rejected": -375.08196721311475, "loss": 0.2185, "rewards/chosen": 1.1389925373134329, "rewards/margins": 5.634894176657695, "rewards/rejected": -4.495901639344262, "step": 1182 }, { "epoch": 0.8111073020226259, "grad_norm": 0.9246983880237928, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77938932.53731343, "logits/rejected": -76597617.31147541, "logps/chosen": -268.17910447761193, "logps/rejected": -360.91803278688525, "loss": 0.2551, "rewards/chosen": 0.8111007462686567, "rewards/margins": 2.202463296097358, "rewards/rejected": -1.3913625498287012, "step": 1183 }, { "epoch": 0.8117929379499486, "grad_norm": 0.7377336280896928, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85084452.57142857, "logits/rejected": -78177166.22222222, "logps/chosen": -283.7142857142857, "logps/rejected": -341.77777777777777, "loss": 0.2365, "rewards/chosen": 0.6531808035714286, "rewards/margins": 4.6809585813492065, "rewards/rejected": -4.027777777777778, "step": 1184 }, { "epoch": 0.8124785738772712, "grad_norm": 0.617544205463003, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82968576.0, "logits/rejected": -91226112.0, "logps/chosen": -247.5, "logps/rejected": -365.25, "loss": 0.2224, "rewards/chosen": 0.9755859375, "rewards/margins": 6.0146484375, "rewards/rejected": -5.0390625, "step": 1185 }, { "epoch": 0.8131642098045938, "grad_norm": 0.6669081763066635, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80454376.72727273, "logits/rejected": -88215684.12903225, "logps/chosen": -265.6969696969697, "logps/rejected": -322.3225806451613, "loss": 0.2371, "rewards/chosen": 0.8579545454545454, "rewards/margins": 5.160373900293255, "rewards/rejected": -4.30241935483871, "step": 1186 }, { "epoch": 0.8138498457319163, "grad_norm": 0.6853188552309056, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90300897.88235295, "logits/rejected": -91855257.6, "logps/chosen": -257.1764705882353, "logps/rejected": -345.73333333333335, "loss": 0.2238, "rewards/chosen": 0.8138786764705882, "rewards/margins": 4.334712009803922, "rewards/rejected": -3.5208333333333335, "step": 1187 }, { "epoch": 0.814535481659239, "grad_norm": 0.9151479717482106, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -69034118.29508197, "logits/rejected": -105045404.65671642, "logps/chosen": -221.11475409836066, "logps/rejected": -345.3134328358209, "loss": 0.2047, "rewards/chosen": 1.0645491803278688, "rewards/margins": 5.303355150477122, "rewards/rejected": -4.2388059701492535, "step": 1188 }, { "epoch": 0.8152211175865616, "grad_norm": 0.6174722472403895, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102341017.6, "logits/rejected": -77680808.32876712, "logps/chosen": -392.43636363636364, "logps/rejected": -298.73972602739724, "loss": 0.207, "rewards/chosen": 1.5369318181818181, "rewards/margins": 5.512959215442092, "rewards/rejected": -3.9760273972602738, "step": 1189 }, { "epoch": 0.8159067535138841, "grad_norm": 0.6020185565951574, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81848846.62857144, "logits/rejected": -84536920.27586207, "logps/chosen": -256.45714285714286, "logps/rejected": -345.37931034482756, "loss": 0.2343, "rewards/chosen": 0.8285714285714286, "rewards/margins": 5.5958128078817735, "rewards/rejected": -4.767241379310345, "step": 1190 }, { "epoch": 0.8165923894412067, "grad_norm": 0.7339027990348564, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74448896.0, "logits/rejected": -119029263.51515152, "logps/chosen": -293.93548387096774, "logps/rejected": -407.27272727272725, "loss": 0.2389, "rewards/chosen": 0.7486769153225806, "rewards/margins": 5.619889036534702, "rewards/rejected": -4.871212121212121, "step": 1191 }, { "epoch": 0.8172780253685293, "grad_norm": 0.8648255539483097, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -71811568.48484848, "logits/rejected": -71370818.06451613, "logps/chosen": -312.24242424242425, "logps/rejected": -292.1290322580645, "loss": 0.2363, "rewards/chosen": 0.8494318181818182, "rewards/margins": 4.025330954632801, "rewards/rejected": -3.1758991364509828, "step": 1192 }, { "epoch": 0.8179636612958519, "grad_norm": 0.5551694055039345, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94468631.63076924, "logits/rejected": -80557267.3015873, "logps/chosen": -287.75384615384615, "logps/rejected": -389.58730158730157, "loss": 0.2181, "rewards/chosen": 2.574029541015625, "rewards/margins": 8.097839064825148, "rewards/rejected": -5.523809523809524, "step": 1193 }, { "epoch": 0.8186492972231745, "grad_norm": 0.574107108945184, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -69827394.37037037, "logits/rejected": -73683718.91891892, "logps/chosen": -274.6666666666667, "logps/rejected": -303.13513513513516, "loss": 0.2281, "rewards/chosen": 0.7158564814814815, "rewards/margins": 4.705721346346347, "rewards/rejected": -3.989864864864865, "step": 1194 }, { "epoch": 0.8193349331504971, "grad_norm": 0.5878866466525856, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97178797.29230769, "logits/rejected": -68506965.33333333, "logps/chosen": -241.96923076923076, "logps/rejected": -304.76190476190476, "loss": 0.2602, "rewards/chosen": 0.5129807692307692, "rewards/margins": 4.798695054945055, "rewards/rejected": -4.285714285714286, "step": 1195 }, { "epoch": 0.8200205690778197, "grad_norm": 0.49726314445117037, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -66191360.0, "logits/rejected": -87556096.0, "logps/chosen": -302.625, "logps/rejected": -318.75, "loss": 0.2092, "rewards/chosen": 1.2900390625, "rewards/margins": 6.1923828125, "rewards/rejected": -4.90234375, "step": 1196 }, { "epoch": 0.8207062050051422, "grad_norm": 0.695469956507942, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97565963.81538461, "logits/rejected": -91941806.73015873, "logps/chosen": -349.53846153846155, "logps/rejected": -406.85714285714283, "loss": 0.2387, "rewards/chosen": 2.2951873779296874, "rewards/margins": 6.7396318223741325, "rewards/rejected": -4.444444444444445, "step": 1197 }, { "epoch": 0.8213918409324649, "grad_norm": 0.5511018676127865, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82884455.1641791, "logits/rejected": -79623016.91803278, "logps/chosen": -276.05970149253733, "logps/rejected": -375.08196721311475, "loss": 0.2055, "rewards/chosen": 1.2276119402985075, "rewards/margins": 5.752202104232934, "rewards/rejected": -4.524590163934426, "step": 1198 }, { "epoch": 0.8220774768597875, "grad_norm": 0.558647185688393, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90477129.14285715, "logits/rejected": -105002231.1724138, "logps/chosen": -344.6857142857143, "logps/rejected": -424.82758620689657, "loss": 0.2146, "rewards/chosen": 1.4017857142857142, "rewards/margins": 6.242302955665025, "rewards/rejected": -4.8405172413793105, "step": 1199 }, { "epoch": 0.82276311278711, "grad_norm": 0.6453145031992722, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89653248.0, "logits/rejected": -96665600.0, "logps/chosen": -284.5, "logps/rejected": -363.5, "loss": 0.2413, "rewards/chosen": 0.828857421875, "rewards/margins": 5.446044921875, "rewards/rejected": -4.6171875, "step": 1200 }, { "epoch": 0.8234487487144326, "grad_norm": 0.6380389660367625, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97436908.3076923, "logits/rejected": -73566760.63492064, "logps/chosen": -354.95384615384614, "logps/rejected": -315.93650793650795, "loss": 0.2321, "rewards/chosen": 1.0334134615384616, "rewards/margins": 4.312923475179335, "rewards/rejected": -3.279510013640873, "step": 1201 }, { "epoch": 0.8241343846417553, "grad_norm": 0.9735435447364243, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86130918.76056337, "logits/rejected": -109934915.36842105, "logps/chosen": -265.01408450704224, "logps/rejected": -355.9298245614035, "loss": 0.2339, "rewards/chosen": 2.0384968502420775, "rewards/margins": 5.766567025680674, "rewards/rejected": -3.7280701754385963, "step": 1202 }, { "epoch": 0.8248200205690778, "grad_norm": 0.6818043558366214, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96259276.8, "logits/rejected": -82899184.94117647, "logps/chosen": -221.6, "logps/rejected": -290.8235294117647, "loss": 0.2291, "rewards/chosen": 0.6966145833333334, "rewards/margins": 4.96499693627451, "rewards/rejected": -4.268382352941177, "step": 1203 }, { "epoch": 0.8255056564964004, "grad_norm": 0.5558238912011488, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85766285.2413793, "logits/rejected": -76935519.08571428, "logps/chosen": -300.41379310344826, "logps/rejected": -334.1714285714286, "loss": 0.1969, "rewards/chosen": 0.9568965517241379, "rewards/margins": 5.964039408866995, "rewards/rejected": -5.007142857142857, "step": 1204 }, { "epoch": 0.826191292423723, "grad_norm": 0.6280265869577731, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84161116.32786885, "logits/rejected": -84136486.20895523, "logps/chosen": -310.8196721311475, "logps/rejected": -341.0149253731343, "loss": 0.2141, "rewards/chosen": 2.731045582255379, "rewards/margins": 7.119105283747916, "rewards/rejected": -4.388059701492537, "step": 1205 }, { "epoch": 0.8268769283510456, "grad_norm": 0.9980638962116208, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82088521.14285715, "logits/rejected": -73472635.5862069, "logps/chosen": -318.1714285714286, "logps/rejected": -302.62068965517244, "loss": 0.264, "rewards/chosen": 0.9589285714285715, "rewards/margins": 4.13134236453202, "rewards/rejected": -3.1724137931034484, "step": 1206 }, { "epoch": 0.8275625642783682, "grad_norm": 0.5757135482692957, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75339196.3773585, "logits/rejected": -82767598.93333334, "logps/chosen": -254.03773584905662, "logps/rejected": -278.82666666666665, "loss": 0.1727, "rewards/chosen": 1.1733490566037736, "rewards/margins": 5.70001572327044, "rewards/rejected": -4.526666666666666, "step": 1207 }, { "epoch": 0.8282482002056908, "grad_norm": 0.5372921670447707, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103017992.98245615, "logits/rejected": -93456182.08450705, "logps/chosen": -296.42105263157896, "logps/rejected": -382.6478873239437, "loss": 0.1999, "rewards/chosen": 0.9742324561403509, "rewards/margins": 5.713669075858661, "rewards/rejected": -4.73943661971831, "step": 1208 }, { "epoch": 0.8289338361330134, "grad_norm": 0.6564087989173669, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84655035.73333333, "logits/rejected": -81110437.64705883, "logps/chosen": -271.46666666666664, "logps/rejected": -286.3529411764706, "loss": 0.2126, "rewards/chosen": 0.9484375, "rewards/margins": 5.047702205882353, "rewards/rejected": -4.099264705882353, "step": 1209 }, { "epoch": 0.8296194720603359, "grad_norm": 0.7696870818692452, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82421402.41269842, "logits/rejected": -108793792.98461539, "logps/chosen": -198.85714285714286, "logps/rejected": -447.5076923076923, "loss": 0.2295, "rewards/chosen": 0.7549603174603174, "rewards/margins": 2.9143299181964952, "rewards/rejected": -2.159369600736178, "step": 1210 }, { "epoch": 0.8303051079876586, "grad_norm": 0.5456000707484676, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74375052.61971831, "logits/rejected": -85137012.77192983, "logps/chosen": -242.14084507042253, "logps/rejected": -385.4035087719298, "loss": 0.2595, "rewards/chosen": 0.7526408450704225, "rewards/margins": 5.441237336298492, "rewards/rejected": -4.68859649122807, "step": 1211 }, { "epoch": 0.8309907439149812, "grad_norm": 0.6544263950614754, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78526691.55555555, "logits/rejected": -81039945.14285715, "logps/chosen": -277.1111111111111, "logps/rejected": -369.14285714285717, "loss": 0.2444, "rewards/chosen": 0.8832465277777778, "rewards/margins": 5.735925099206349, "rewards/rejected": -4.852678571428571, "step": 1212 }, { "epoch": 0.8316763798423037, "grad_norm": 0.576909522279451, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101437629.04615384, "logits/rejected": -89611637.84126984, "logps/chosen": -326.89230769230767, "logps/rejected": -369.26984126984127, "loss": 0.2035, "rewards/chosen": 1.1673076923076924, "rewards/margins": 5.853815628815629, "rewards/rejected": -4.686507936507937, "step": 1213 }, { "epoch": 0.8323620157696263, "grad_norm": 0.6064183309483167, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83959664.28070176, "logits/rejected": -76265443.15492958, "logps/chosen": -249.26315789473685, "logps/rejected": -322.7042253521127, "loss": 0.2098, "rewards/chosen": 1.3048245614035088, "rewards/margins": 5.1005992092908325, "rewards/rejected": -3.795774647887324, "step": 1214 }, { "epoch": 0.8330476516969489, "grad_norm": 0.5749893949198932, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89398593.82857142, "logits/rejected": -82295137.10344827, "logps/chosen": -291.8857142857143, "logps/rejected": -377.6551724137931, "loss": 0.2102, "rewards/chosen": 1.2607142857142857, "rewards/margins": 6.510714285714286, "rewards/rejected": -5.25, "step": 1215 }, { "epoch": 0.8337332876242715, "grad_norm": 0.6705801394879469, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100663296.0, "logits/rejected": -73778494.95081967, "logps/chosen": -265.3134328358209, "logps/rejected": -350.95081967213116, "loss": 0.2202, "rewards/chosen": 1.1595149253731343, "rewards/margins": 1.274729038702412, "rewards/rejected": -0.11521411332927767, "step": 1216 }, { "epoch": 0.8344189235515941, "grad_norm": 0.5625419661642358, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81821191.87692308, "logits/rejected": -80290962.28571428, "logps/chosen": -231.13846153846154, "logps/rejected": -314.6666666666667, "loss": 0.2303, "rewards/chosen": 1.802160175030048, "rewards/margins": 6.413271286141159, "rewards/rejected": -4.611111111111111, "step": 1217 }, { "epoch": 0.8351045594789167, "grad_norm": 0.5973708078727373, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84900830.96774194, "logits/rejected": -77467523.87878788, "logps/chosen": -303.741935483871, "logps/rejected": -311.27272727272725, "loss": 0.2212, "rewards/chosen": 1.0952620967741935, "rewards/margins": 5.464580278592376, "rewards/rejected": -4.369318181818182, "step": 1218 }, { "epoch": 0.8357901954062393, "grad_norm": 0.6394556535805619, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84989844.21052632, "logits/rejected": -81062990.76923077, "logps/chosen": -282.7368421052632, "logps/rejected": -312.0, "loss": 0.2647, "rewards/chosen": 1.042763157894737, "rewards/margins": 10233796.119686235, "rewards/rejected": -10233795.076923076, "step": 1219 }, { "epoch": 0.8364758313335618, "grad_norm": 0.6687365437549204, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83220317.46031746, "logits/rejected": -90016216.61538461, "logps/chosen": -290.53968253968253, "logps/rejected": -366.7692307692308, "loss": 0.2355, "rewards/chosen": 0.9459325396825397, "rewards/margins": 3.961317155067155, "rewards/rejected": -3.0153846153846153, "step": 1220 }, { "epoch": 0.8371614672608845, "grad_norm": 0.5288267349723424, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81037118.79245283, "logits/rejected": -85228257.28, "logps/chosen": -297.66037735849056, "logps/rejected": -344.32, "loss": 0.1795, "rewards/chosen": 1.1609669811320755, "rewards/margins": 5.987633647798742, "rewards/rejected": -4.826666666666667, "step": 1221 }, { "epoch": 0.8378471031882071, "grad_norm": 0.6648372455576839, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79353525.67741935, "logits/rejected": -91956937.6969697, "logps/chosen": -252.1290322580645, "logps/rejected": -390.7878787878788, "loss": 0.2135, "rewards/chosen": 1.0942540322580645, "rewards/margins": 5.476829789833822, "rewards/rejected": -4.382575757575758, "step": 1222 }, { "epoch": 0.8385327391155296, "grad_norm": 0.7970670364699768, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82371470.22222222, "logits/rejected": -83137097.14285715, "logps/chosen": -289.3333333333333, "logps/rejected": -374.2857142857143, "loss": 0.2678, "rewards/chosen": 0.9769965277777778, "rewards/margins": 1214734.9769965278, "rewards/rejected": -1214734.0, "step": 1223 }, { "epoch": 0.8392183750428522, "grad_norm": 0.5375481033973699, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76142749.53846154, "logits/rejected": -106788311.36507936, "logps/chosen": -242.95384615384614, "logps/rejected": -427.1746031746032, "loss": 0.1979, "rewards/chosen": 1.2413461538461539, "rewards/margins": 0.9817973005320417, "rewards/rejected": 0.2595488533141121, "step": 1224 }, { "epoch": 0.8399040109701749, "grad_norm": 0.6890374236409891, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94371840.0, "logits/rejected": -79260009.41176471, "logps/chosen": -311.73333333333335, "logps/rejected": -312.0, "loss": 0.1891, "rewards/chosen": 3.1025705973307294, "rewards/margins": 7.499629420860142, "rewards/rejected": -4.397058823529412, "step": 1225 }, { "epoch": 0.8405896468974974, "grad_norm": 0.7039709142819542, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79160103.66197184, "logits/rejected": -107874555.50877193, "logps/chosen": -297.9154929577465, "logps/rejected": -345.2631578947368, "loss": 0.2331, "rewards/chosen": 1.345950704225352, "rewards/margins": 5.543319125277984, "rewards/rejected": -4.197368421052632, "step": 1226 }, { "epoch": 0.84127528282482, "grad_norm": 0.48360176086297463, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -71850251.13043478, "logits/rejected": -81966652.7457627, "logps/chosen": -289.39130434782606, "logps/rejected": -331.66101694915255, "loss": 0.2531, "rewards/chosen": 1.2318840579710144, "rewards/margins": 5.227646769835421, "rewards/rejected": -3.9957627118644066, "step": 1227 }, { "epoch": 0.8419609187521426, "grad_norm": 0.5814267361038001, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73118613.01492538, "logits/rejected": -77422726.29508197, "logps/chosen": -246.92537313432837, "logps/rejected": -297.1803278688525, "loss": 0.2325, "rewards/chosen": 1.0334654850746268, "rewards/margins": 5.09903925556643, "rewards/rejected": -4.065573770491803, "step": 1228 }, { "epoch": 0.8426465546794653, "grad_norm": 0.6785006158275841, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104529920.0, "logits/rejected": -94371840.0, "logps/chosen": -343.0, "logps/rejected": -355.0, "loss": 0.1895, "rewards/chosen": 1.7734375, "rewards/margins": 5.859375, "rewards/rejected": -4.0859375, "step": 1229 }, { "epoch": 0.8433321906067878, "grad_norm": 0.732486179399271, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88369646.34482759, "logits/rejected": -85024533.94285715, "logps/chosen": -265.37931034482756, "logps/rejected": -364.34285714285716, "loss": 0.216, "rewards/chosen": 2.7580892628636855, "rewards/margins": 6.565232120006542, "rewards/rejected": -3.807142857142857, "step": 1230 }, { "epoch": 0.8440178265341104, "grad_norm": 0.5949341467053885, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76241622.70967741, "logits/rejected": -90495286.3030303, "logps/chosen": -252.90322580645162, "logps/rejected": -334.54545454545456, "loss": 0.2101, "rewards/chosen": 1.1038306451612903, "rewards/margins": 6.028073069403714, "rewards/rejected": -4.924242424242424, "step": 1231 }, { "epoch": 0.844703462461433, "grad_norm": 0.606206383742918, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -70720625.77777778, "logits/rejected": -83211995.42857143, "logps/chosen": -244.22222222222223, "logps/rejected": -356.0, "loss": 0.2383, "rewards/chosen": 1.2074652777777777, "rewards/margins": 4.371805705721416, "rewards/rejected": -3.1643404279436385, "step": 1232 }, { "epoch": 0.8453890983887555, "grad_norm": 0.8036410390121055, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87050204.07017544, "logits/rejected": -85422022.30985916, "logps/chosen": -260.2105263157895, "logps/rejected": -435.38028169014086, "loss": 0.2338, "rewards/chosen": 3.1386761581688596, "rewards/margins": 7.364028270844916, "rewards/rejected": -4.225352112676056, "step": 1233 }, { "epoch": 0.8460747343160782, "grad_norm": 0.5512723117396899, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92517836.05797102, "logits/rejected": -82322102.23728813, "logps/chosen": -270.1449275362319, "logps/rejected": -333.0169491525424, "loss": 0.2395, "rewards/chosen": 0.8319746376811594, "rewards/margins": 4.908245824121837, "rewards/rejected": -4.076271186440678, "step": 1234 }, { "epoch": 0.8467603702434008, "grad_norm": 0.7153611103719237, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101038905.31343284, "logits/rejected": -72059517.90163934, "logps/chosen": -280.5970149253731, "logps/rejected": -330.4918032786885, "loss": 0.2391, "rewards/chosen": 1.055037313432836, "rewards/margins": 5.325529116711524, "rewards/rejected": -4.270491803278689, "step": 1235 }, { "epoch": 0.8474460061707233, "grad_norm": 0.5667483667162103, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84695507.08771929, "logits/rejected": -89320952.7887324, "logps/chosen": -308.2105263157895, "logps/rejected": -365.5211267605634, "loss": 0.216, "rewards/chosen": 1.050952576754386, "rewards/margins": 6.022783562669879, "rewards/rejected": -4.971830985915493, "step": 1236 }, { "epoch": 0.8481316420980459, "grad_norm": 0.8113208634664857, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86882011.42857143, "logits/rejected": -74991262.89655173, "logps/chosen": -347.42857142857144, "logps/rejected": -335.17241379310343, "loss": 0.2236, "rewards/chosen": 2.6895154680524556, "rewards/margins": 7.142101674949007, "rewards/rejected": -4.452586206896552, "step": 1237 }, { "epoch": 0.8488172780253685, "grad_norm": 0.5731234084112572, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94528343.88059701, "logits/rejected": -77388346.75409836, "logps/chosen": -368.4776119402985, "logps/rejected": -308.72131147540983, "loss": 0.1998, "rewards/chosen": 1.319962686567164, "rewards/margins": 5.926520063616344, "rewards/rejected": -4.60655737704918, "step": 1238 }, { "epoch": 0.8495029139526912, "grad_norm": 0.5488269888488602, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -68433381.05263157, "logits/rejected": -82586436.50704226, "logps/chosen": -233.82456140350877, "logps/rejected": -309.1830985915493, "loss": 0.1952, "rewards/chosen": 0.9890350877192983, "rewards/margins": 5.460866073634791, "rewards/rejected": -4.471830985915493, "step": 1239 }, { "epoch": 0.8501885498800137, "grad_norm": 0.5066433196251383, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90387251.2, "logits/rejected": -65011712.0, "logps/chosen": -312.0, "logps/rejected": -304.0, "loss": 0.1803, "rewards/chosen": 1.7604166666666667, "rewards/margins": 6.5692401960784315, "rewards/rejected": -4.8088235294117645, "step": 1240 }, { "epoch": 0.8508741858073363, "grad_norm": 0.9553267485409823, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82305617.6231884, "logits/rejected": -86089866.84745763, "logps/chosen": -240.92753623188406, "logps/rejected": -392.135593220339, "loss": 0.2499, "rewards/chosen": 1.0018115942028984, "rewards/margins": 5.650116678948661, "rewards/rejected": -4.648305084745763, "step": 1241 }, { "epoch": 0.8515598217346589, "grad_norm": 0.6307879216647723, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102620637.86666666, "logits/rejected": -81542204.23529412, "logps/chosen": -370.1333333333333, "logps/rejected": -357.6470588235294, "loss": 0.1939, "rewards/chosen": 1.5479166666666666, "rewards/margins": 6.069975490196079, "rewards/rejected": -4.522058823529412, "step": 1242 }, { "epoch": 0.8522454576619815, "grad_norm": 0.6456299142533108, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80758124.47457626, "logits/rejected": -90451077.56521739, "logps/chosen": -325.4237288135593, "logps/rejected": -332.5217391304348, "loss": 0.1941, "rewards/chosen": 2.7701033252780722, "rewards/margins": 7.791842455712855, "rewards/rejected": -5.021739130434782, "step": 1243 }, { "epoch": 0.8529310935893041, "grad_norm": 0.6274285856393994, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88014848.0, "logits/rejected": -86245376.0, "logps/chosen": -292.0, "logps/rejected": -340.0, "loss": 0.2266, "rewards/chosen": 1.1181640625, "rewards/margins": 5.9423828125, "rewards/rejected": -4.82421875, "step": 1244 }, { "epoch": 0.8536167295166267, "grad_norm": 0.7162084375681403, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83059034.14084508, "logits/rejected": -70714493.75438596, "logps/chosen": -276.28169014084506, "logps/rejected": -361.2631578947368, "loss": 0.2484, "rewards/chosen": 2.433967267963248, "rewards/margins": 6.855019899542196, "rewards/rejected": -4.421052631578948, "step": 1245 }, { "epoch": 0.8543023654439492, "grad_norm": 0.5710959633921264, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101044596.36363636, "logits/rejected": -117778762.32258065, "logps/chosen": -295.75757575757575, "logps/rejected": -387.0967741935484, "loss": 0.2161, "rewards/chosen": 2.6813372987689394, "rewards/margins": 8.294240524575391, "rewards/rejected": -5.612903225806452, "step": 1246 }, { "epoch": 0.8549880013712718, "grad_norm": 0.5676114967255346, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93206755.55555555, "logits/rejected": -96319195.42857143, "logps/chosen": -409.77777777777777, "logps/rejected": -342.85714285714283, "loss": 0.1903, "rewards/chosen": 1.90625, "rewards/margins": 5.808035714285714, "rewards/rejected": -3.9017857142857144, "step": 1247 }, { "epoch": 0.8556736372985945, "grad_norm": 0.7749591243067205, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102900258.13333334, "logits/rejected": -77656304.94117647, "logps/chosen": -336.8, "logps/rejected": -386.8235294117647, "loss": 0.2079, "rewards/chosen": 1.0104166666666667, "rewards/margins": 5.341299019607844, "rewards/rejected": -4.330882352941177, "step": 1248 }, { "epoch": 0.856359273225917, "grad_norm": 0.7362684837252618, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77562360.12307692, "logits/rejected": -94338551.87301587, "logps/chosen": -304.73846153846154, "logps/rejected": -371.3015873015873, "loss": 0.2491, "rewards/chosen": 0.9399038461538461, "rewards/margins": 5.388316544566544, "rewards/rejected": -4.448412698412699, "step": 1249 }, { "epoch": 0.8570449091532396, "grad_norm": 0.6081972316747861, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75362171.87096775, "logits/rejected": -98883894.3030303, "logps/chosen": -289.5483870967742, "logps/rejected": -315.6363636363636, "loss": 0.2274, "rewards/chosen": 2.1270006241336947, "rewards/margins": 6.89215213928521, "rewards/rejected": -4.765151515151516, "step": 1250 }, { "epoch": 0.8577305450805622, "grad_norm": 0.6039403294367605, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93622857.14285715, "logits/rejected": -80274318.22222222, "logps/chosen": -266.42857142857144, "logps/rejected": -355.77777777777777, "loss": 0.2056, "rewards/chosen": 1.0206473214285714, "rewards/margins": 5.631758432539682, "rewards/rejected": -4.611111111111111, "step": 1251 }, { "epoch": 0.8584161810078849, "grad_norm": 0.5014158155188653, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -106001501.0909091, "logits/rejected": -85644981.67741935, "logps/chosen": -281.45454545454544, "logps/rejected": -305.2903225806452, "loss": 0.2049, "rewards/chosen": 1.3148674242424243, "rewards/margins": 5.879383553274682, "rewards/rejected": -4.564516129032258, "step": 1252 }, { "epoch": 0.8591018169352074, "grad_norm": 0.5603925413092735, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74631980.6984127, "logits/rejected": -68980168.86153845, "logps/chosen": -252.1904761904762, "logps/rejected": -309.66153846153844, "loss": 0.2109, "rewards/chosen": 0.8442460317460317, "rewards/margins": 3.5644153338794453, "rewards/rejected": -2.7201693021334137, "step": 1253 }, { "epoch": 0.85978745286253, "grad_norm": 0.6843241242132483, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84033248.56140351, "logits/rejected": -72425587.38028169, "logps/chosen": -249.5438596491228, "logps/rejected": -367.5492957746479, "loss": 0.2091, "rewards/chosen": 3.0816446940104165, "rewards/margins": 8.342208074292106, "rewards/rejected": -5.26056338028169, "step": 1254 }, { "epoch": 0.8604730887898526, "grad_norm": 0.6857043511099248, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83686351.23809524, "logits/rejected": -85370218.33846153, "logps/chosen": -246.6031746031746, "logps/rejected": -364.55384615384617, "loss": 0.2136, "rewards/chosen": 1.1721230158730158, "rewards/margins": 6.2567384004884, "rewards/rejected": -5.084615384615384, "step": 1255 }, { "epoch": 0.8611587247171751, "grad_norm": 0.5010194161708896, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72351744.0, "logits/rejected": -102835346.28571428, "logps/chosen": -266.44444444444446, "logps/rejected": -429.7142857142857, "loss": 0.2314, "rewards/chosen": 1.3715277777777777, "rewards/margins": 5.795634920634921, "rewards/rejected": -4.424107142857143, "step": 1256 }, { "epoch": 0.8618443606444978, "grad_norm": 0.5673610743292498, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92337289.5522388, "logits/rejected": -73434699.5409836, "logps/chosen": -234.26865671641792, "logps/rejected": -352.0, "loss": 0.2269, "rewards/chosen": 0.7817164179104478, "rewards/margins": 5.8800770736481525, "rewards/rejected": -5.098360655737705, "step": 1257 }, { "epoch": 0.8625299965718204, "grad_norm": 0.6353926802865196, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77471262.11764705, "logits/rejected": -68227345.06666666, "logps/chosen": -288.94117647058823, "logps/rejected": -287.2, "loss": 0.244, "rewards/chosen": 1.0229779411764706, "rewards/margins": 5.464644607843137, "rewards/rejected": -4.441666666666666, "step": 1258 }, { "epoch": 0.8632156324991429, "grad_norm": 0.6843558500352837, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90451077.56521739, "logits/rejected": -79051966.91525424, "logps/chosen": -284.7536231884058, "logps/rejected": -299.3898305084746, "loss": 0.2233, "rewards/chosen": 1.1494565217391304, "rewards/margins": 4.776575165806927, "rewards/rejected": -3.6271186440677967, "step": 1259 }, { "epoch": 0.8639012684264655, "grad_norm": 0.6040477588648626, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85162607.30434783, "logits/rejected": -109762802.98305085, "logps/chosen": -311.18840579710144, "logps/rejected": -376.6779661016949, "loss": 0.1972, "rewards/chosen": 1.3704710144927537, "rewards/margins": 5.497589658560551, "rewards/rejected": -4.127118644067797, "step": 1260 }, { "epoch": 0.8645869043537882, "grad_norm": 0.6462147744799207, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72916361.84615384, "logits/rejected": -107948139.78947368, "logps/chosen": -223.69230769230768, "logps/rejected": -297.6842105263158, "loss": 0.228, "rewards/chosen": 0.6700721153846154, "rewards/margins": 4.7194142206477725, "rewards/rejected": -4.0493421052631575, "step": 1261 }, { "epoch": 0.8652725402811108, "grad_norm": 0.7511684062076364, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -101526829.1764706, "logits/rejected": -81998643.2, "logps/chosen": -327.05882352941177, "logps/rejected": -320.0, "loss": 0.2392, "rewards/chosen": 1.1829044117647058, "rewards/margins": 4.116237745098039, "rewards/rejected": -2.933333333333333, "step": 1262 }, { "epoch": 0.8659581762084333, "grad_norm": 0.6812314350678317, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80725155.24637681, "logits/rejected": -63696548.881355934, "logps/chosen": -231.18840579710144, "logps/rejected": -316.20338983050846, "loss": 0.2638, "rewards/chosen": 0.7531702898550725, "rewards/margins": 4.945966900024564, "rewards/rejected": -4.192796610169491, "step": 1263 }, { "epoch": 0.8666438121357559, "grad_norm": 0.7962075754832069, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92141535.49206349, "logits/rejected": -88273967.26153846, "logps/chosen": -300.6984126984127, "logps/rejected": -389.4153846153846, "loss": 0.2308, "rewards/chosen": 1.0337301587301588, "rewards/margins": 5.849114774114774, "rewards/rejected": -4.815384615384615, "step": 1264 }, { "epoch": 0.8673294480630785, "grad_norm": 0.5741629658538804, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87963875.55555555, "logits/rejected": -84035876.57142857, "logps/chosen": -290.22222222222223, "logps/rejected": -390.85714285714283, "loss": 0.2424, "rewards/chosen": 1.0243055555555556, "rewards/margins": 5.698412698412699, "rewards/rejected": -4.674107142857143, "step": 1265 }, { "epoch": 0.8680150839904011, "grad_norm": 0.5645018007546575, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84713125.85915492, "logits/rejected": -84989844.21052632, "logps/chosen": -257.5774647887324, "logps/rejected": -373.6140350877193, "loss": 0.2305, "rewards/chosen": 0.9823943661971831, "rewards/margins": 6.03063998023227, "rewards/rejected": -5.048245614035087, "step": 1266 }, { "epoch": 0.8687007199177237, "grad_norm": 0.5704680455913295, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95235373.1764706, "logits/rejected": -81089877.33333333, "logps/chosen": -296.94117647058823, "logps/rejected": -364.26666666666665, "loss": 0.2211, "rewards/chosen": 2.4806931439568016, "rewards/margins": 3.847833341710708, "rewards/rejected": -1.3671401977539062, "step": 1267 }, { "epoch": 0.8693863558450463, "grad_norm": 0.8786673350227502, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88670208.0, "logits/rejected": -85655552.0, "logps/chosen": -319.75, "logps/rejected": -323.0, "loss": 0.2271, "rewards/chosen": 0.901611328125, "rewards/margins": 5.296142578125, "rewards/rejected": -4.39453125, "step": 1268 }, { "epoch": 0.8700719917723688, "grad_norm": 0.6363253796150556, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72024064.0, "logits/rejected": -85327872.0, "logps/chosen": -212.75, "logps/rejected": -368.5, "loss": 0.2361, "rewards/chosen": 0.78125, "rewards/margins": 5.5078125, "rewards/rejected": -4.7265625, "step": 1269 }, { "epoch": 0.8707576276996914, "grad_norm": 0.6600941325319417, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94687849.20547946, "logits/rejected": -80759417.01818182, "logps/chosen": -286.4657534246575, "logps/rejected": -341.23636363636365, "loss": 0.2438, "rewards/chosen": 1.0616438356164384, "rewards/margins": 5.4298256537982565, "rewards/rejected": -4.368181818181818, "step": 1270 }, { "epoch": 0.8714432636270141, "grad_norm": 0.5699751704598102, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96781999.76119404, "logits/rejected": -65321127.86885246, "logps/chosen": -318.56716417910445, "logps/rejected": -284.0655737704918, "loss": 0.2139, "rewards/chosen": 1.3843283582089552, "rewards/margins": 6.097443112307316, "rewards/rejected": -4.713114754098361, "step": 1271 }, { "epoch": 0.8721288995543367, "grad_norm": 0.6712983907046859, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86317560.57971014, "logits/rejected": -87867114.30508475, "logps/chosen": -254.6086956521739, "logps/rejected": -401.08474576271186, "loss": 0.2434, "rewards/chosen": 0.8623188405797102, "rewards/margins": 5.921640874478015, "rewards/rejected": -5.059322033898305, "step": 1272 }, { "epoch": 0.8728145354816592, "grad_norm": 0.9079368013402603, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85509681.5483871, "logits/rejected": -94117639.75757575, "logps/chosen": -332.38709677419354, "logps/rejected": -360.24242424242425, "loss": 0.2202, "rewards/chosen": 1.2913306451612903, "rewards/margins": 4.661116994371162, "rewards/rejected": -3.369786349209872, "step": 1273 }, { "epoch": 0.8735001714089818, "grad_norm": 0.687724054599068, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84573670.81967214, "logits/rejected": -112933200.23880596, "logps/chosen": -319.4754098360656, "logps/rejected": -432.23880597014926, "loss": 0.2067, "rewards/chosen": 1.2110655737704918, "rewards/margins": 6.031961096158552, "rewards/rejected": -4.82089552238806, "step": 1274 }, { "epoch": 0.8741858073363045, "grad_norm": 0.632815158963683, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77210638.42253521, "logits/rejected": -82782315.78947368, "logps/chosen": -191.54929577464787, "logps/rejected": -355.0877192982456, "loss": 0.2603, "rewards/chosen": 0.6963028169014085, "rewards/margins": 2442128.9770045714, "rewards/rejected": -2442128.2807017546, "step": 1275 }, { "epoch": 0.874871443263627, "grad_norm": 0.44261352401886656, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84279296.0, "logits/rejected": -69992448.0, "logps/chosen": -268.5, "logps/rejected": -361.5, "loss": 0.1995, "rewards/chosen": 2.4999969005584717, "rewards/margins": 7.214840650558472, "rewards/rejected": -4.71484375, "step": 1276 }, { "epoch": 0.8755570791909496, "grad_norm": 0.9449727395824538, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83426430.24657534, "logits/rejected": -84801200.87272727, "logps/chosen": -331.83561643835617, "logps/rejected": -301.09090909090907, "loss": 0.2337, "rewards/chosen": 1.4700342465753424, "rewards/margins": 6.074579701120797, "rewards/rejected": -4.6045454545454545, "step": 1277 }, { "epoch": 0.8762427151182722, "grad_norm": 0.652232087986064, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109739494.81967214, "logits/rejected": -75497472.0, "logps/chosen": -274.3606557377049, "logps/rejected": -321.43283582089555, "loss": 0.2033, "rewards/chosen": 1.4221311475409837, "rewards/margins": 5.802728162466357, "rewards/rejected": -4.380597014925373, "step": 1278 }, { "epoch": 0.8769283510455947, "grad_norm": 0.6465236261743411, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89433385.29032259, "logits/rejected": -95070890.66666667, "logps/chosen": -299.61290322580646, "logps/rejected": -401.93939393939394, "loss": 0.2353, "rewards/chosen": 0.9566532258064516, "rewards/margins": 6.199077468230694, "rewards/rejected": -5.242424242424242, "step": 1279 }, { "epoch": 0.8776139869729174, "grad_norm": 0.6061766677075733, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95314038.72463769, "logits/rejected": -76350550.77966101, "logps/chosen": -260.17391304347825, "logps/rejected": -347.1186440677966, "loss": 0.2421, "rewards/chosen": 2.095792024031929, "rewards/margins": 5.20940638115587, "rewards/rejected": -3.1136143571239407, "step": 1280 }, { "epoch": 0.87829962290024, "grad_norm": 0.5907345700289455, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88789280.45070423, "logits/rejected": -87933215.43859649, "logps/chosen": -311.4366197183099, "logps/rejected": -300.0701754385965, "loss": 0.2392, "rewards/chosen": 1.136443661971831, "rewards/margins": 4.1918070995574945, "rewards/rejected": -3.0553634375856635, "step": 1281 }, { "epoch": 0.8789852588275625, "grad_norm": 0.7263094953208996, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82185686.48648648, "logits/rejected": -85983232.0, "logps/chosen": -236.32432432432432, "logps/rejected": -365.6296296296296, "loss": 0.2276, "rewards/chosen": 1.2643581081081081, "rewards/margins": 6.079172922922923, "rewards/rejected": -4.814814814814815, "step": 1282 }, { "epoch": 0.8796708947548851, "grad_norm": 0.5690737139567553, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92274688.0, "logits/rejected": -73679940.26666667, "logps/chosen": -270.8235294117647, "logps/rejected": -326.4, "loss": 0.2258, "rewards/chosen": 1.1176470588235294, "rewards/margins": 6.459313725490196, "rewards/rejected": -5.341666666666667, "step": 1283 }, { "epoch": 0.8803565306822078, "grad_norm": 0.8057971284764792, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78829932.71232876, "logits/rejected": -87394043.34545454, "logps/chosen": -285.5890410958904, "logps/rejected": -368.8727272727273, "loss": 0.2247, "rewards/chosen": 1.1849315068493151, "rewards/margins": 5.266749688667497, "rewards/rejected": -4.081818181818182, "step": 1284 }, { "epoch": 0.8810421666095304, "grad_norm": 0.5063061799077649, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76895573.33333333, "logits/rejected": -89112828.06153846, "logps/chosen": -295.87301587301585, "logps/rejected": -313.6, "loss": 0.2363, "rewards/chosen": 0.8756200396825397, "rewards/margins": 4.752543116605617, "rewards/rejected": -3.876923076923077, "step": 1285 }, { "epoch": 0.8817278025368529, "grad_norm": 0.551334089179995, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100525777.83606558, "logits/rejected": -70113738.50746268, "logps/chosen": -241.04918032786884, "logps/rejected": -297.07462686567163, "loss": 0.2188, "rewards/chosen": 3.081441660396388, "rewards/margins": 7.738158078306836, "rewards/rejected": -4.656716417910448, "step": 1286 }, { "epoch": 0.8824134384641755, "grad_norm": 0.6399610726697471, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90521844.53731343, "logits/rejected": -97844173.63934426, "logps/chosen": -303.2835820895522, "logps/rejected": -382.42622950819674, "loss": 0.2396, "rewards/chosen": 0.7280783582089553, "rewards/margins": 4.686325808740461, "rewards/rejected": -3.958247450531506, "step": 1287 }, { "epoch": 0.8830990743914982, "grad_norm": 0.6443636070255823, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93517444.74074075, "logits/rejected": -106104555.24324325, "logps/chosen": -301.9259259259259, "logps/rejected": -395.2432432432432, "loss": 0.1804, "rewards/chosen": 1.3194444444444444, "rewards/margins": 5.873498498498499, "rewards/rejected": -4.554054054054054, "step": 1288 }, { "epoch": 0.8837847103188207, "grad_norm": 0.9673576682148576, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87264824.8888889, "logits/rejected": -92499382.85714285, "logps/chosen": -280.6666666666667, "logps/rejected": -350.0, "loss": 0.2313, "rewards/chosen": 1.0972222222222223, "rewards/margins": 5.7936507936507935, "rewards/rejected": -4.696428571428571, "step": 1289 }, { "epoch": 0.8844703462461433, "grad_norm": 0.6050227797430223, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80630799.28358209, "logits/rejected": -64083464.39344262, "logps/chosen": -276.53731343283584, "logps/rejected": -313.44262295081967, "loss": 0.2223, "rewards/chosen": 0.9421641791044776, "rewards/margins": 6.483147785661855, "rewards/rejected": -5.540983606557377, "step": 1290 }, { "epoch": 0.8851559821734659, "grad_norm": 0.5079465281040283, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83750779.87096775, "logits/rejected": -61214595.878787875, "logps/chosen": -263.2258064516129, "logps/rejected": -282.6666666666667, "loss": 0.224, "rewards/chosen": 2.4798404324439263, "rewards/margins": 6.911658614262108, "rewards/rejected": -4.431818181818182, "step": 1291 }, { "epoch": 0.8858416181007884, "grad_norm": 0.599839135749421, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80696045.97183098, "logits/rejected": -73878617.8245614, "logps/chosen": -214.53521126760563, "logps/rejected": -362.3859649122807, "loss": 0.2092, "rewards/chosen": 2.508572860502861, "rewards/margins": 7.030502685064264, "rewards/rejected": -4.521929824561403, "step": 1292 }, { "epoch": 0.8865272540281111, "grad_norm": 0.6535703563836684, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79015275.35483871, "logits/rejected": -86046782.06060606, "logps/chosen": -260.64516129032256, "logps/rejected": -406.7878787878788, "loss": 0.2273, "rewards/chosen": 0.7439516129032258, "rewards/margins": 6.100012218963832, "rewards/rejected": -5.356060606060606, "step": 1293 }, { "epoch": 0.8872128899554337, "grad_norm": 0.5389165522082277, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75768072.25806452, "logits/rejected": -86173882.18181819, "logps/chosen": -251.8709677419355, "logps/rejected": -400.969696969697, "loss": 0.2062, "rewards/chosen": 3.221739245999244, "rewards/margins": 7.7255271247871224, "rewards/rejected": -4.503787878787879, "step": 1294 }, { "epoch": 0.8878985258827563, "grad_norm": 0.789881867331259, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75963505.77777778, "logits/rejected": -82837504.0, "logps/chosen": -304.44444444444446, "logps/rejected": -398.2857142857143, "loss": 0.1963, "rewards/chosen": 1.6770833333333333, "rewards/margins": 5.81016967410133, "rewards/rejected": -4.1330863407679965, "step": 1295 }, { "epoch": 0.8885841618100788, "grad_norm": 0.5604062561683875, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83230720.0, "logits/rejected": -73465856.0, "logps/chosen": -321.25, "logps/rejected": -370.25, "loss": 0.2086, "rewards/chosen": 1.375, "rewards/margins": 5.77734375, "rewards/rejected": -4.40234375, "step": 1296 }, { "epoch": 0.8892697977374014, "grad_norm": 0.5864414682118918, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81216977.45454545, "logits/rejected": -93830639.48387097, "logps/chosen": -346.6666666666667, "logps/rejected": -391.2258064516129, "loss": 0.1975, "rewards/chosen": 1.5151515151515151, "rewards/margins": 6.450635386119258, "rewards/rejected": -4.935483870967742, "step": 1297 }, { "epoch": 0.8899554336647241, "grad_norm": 0.6097241939915995, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85067574.08450705, "logits/rejected": -80501203.08771929, "logps/chosen": -315.0422535211268, "logps/rejected": -344.42105263157896, "loss": 0.2418, "rewards/chosen": 1.1663732394366197, "rewards/margins": 5.79356622189276, "rewards/rejected": -4.62719298245614, "step": 1298 }, { "epoch": 0.8906410695920466, "grad_norm": 0.5527570914053018, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -65723952.30188679, "logits/rejected": -66549623.46666667, "logps/chosen": -293.4339622641509, "logps/rejected": -339.2, "loss": 0.1892, "rewards/chosen": 0.9150943396226415, "rewards/margins": 5.655094339622642, "rewards/rejected": -4.74, "step": 1299 }, { "epoch": 0.8913267055193692, "grad_norm": 0.6284895460700526, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85138111.04477613, "logits/rejected": -87874106.75409836, "logps/chosen": -273.1940298507463, "logps/rejected": -381.9016393442623, "loss": 0.2241, "rewards/chosen": 1.1851679104477613, "rewards/margins": 5.672872828480548, "rewards/rejected": -4.487704918032787, "step": 1300 }, { "epoch": 0.8920123414466918, "grad_norm": 0.5293929929512188, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90329503.53623189, "logits/rejected": -80047225.49152543, "logps/chosen": -315.1304347826087, "logps/rejected": -332.47457627118644, "loss": 0.214, "rewards/chosen": 2.2509091142295063, "rewards/margins": 7.632265046432897, "rewards/rejected": -5.38135593220339, "step": 1301 }, { "epoch": 0.8926979773740144, "grad_norm": 0.6506014839844307, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75091571.61290322, "logits/rejected": -72256418.9090909, "logps/chosen": -258.3225806451613, "logps/rejected": -327.030303030303, "loss": 0.2293, "rewards/chosen": 0.7852822580645161, "rewards/margins": 4.633767106549365, "rewards/rejected": -3.8484848484848486, "step": 1302 }, { "epoch": 0.893383613301337, "grad_norm": 0.9091228614679325, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76030082.03174603, "logits/rejected": -81756664.12307692, "logps/chosen": -189.71428571428572, "logps/rejected": -399.75384615384615, "loss": 0.1877, "rewards/chosen": 0.9454365079365079, "rewards/margins": 6.383898046398047, "rewards/rejected": -5.438461538461539, "step": 1303 }, { "epoch": 0.8940692492286596, "grad_norm": 0.7003256280631115, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103775198.96774194, "logits/rejected": -82106678.3030303, "logps/chosen": -328.51612903225805, "logps/rejected": -358.7878787878788, "loss": 0.1895, "rewards/chosen": 1.5766129032258065, "rewards/margins": 6.902370478983382, "rewards/rejected": -5.325757575757576, "step": 1304 }, { "epoch": 0.8947548851559822, "grad_norm": 0.5962910658401671, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76876421.26027398, "logits/rejected": -72523329.16363636, "logps/chosen": -265.4246575342466, "logps/rejected": -303.1272727272727, "loss": 0.2506, "rewards/chosen": 0.9828767123287672, "rewards/margins": 4.687422166874222, "rewards/rejected": -3.7045454545454546, "step": 1305 }, { "epoch": 0.8954405210833047, "grad_norm": 0.44258371901062377, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74702000.55172414, "logits/rejected": -72441621.94285715, "logps/chosen": -229.3793103448276, "logps/rejected": -296.9142857142857, "loss": 0.2054, "rewards/chosen": 0.9601293103448276, "rewards/margins": 6.074415024630541, "rewards/rejected": -5.114285714285714, "step": 1306 }, { "epoch": 0.8961261570106274, "grad_norm": 0.572883954291973, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -70391362.7826087, "logits/rejected": -80615944.6779661, "logps/chosen": -215.8840579710145, "logps/rejected": -373.6949152542373, "loss": 0.2231, "rewards/chosen": 0.7780797101449275, "rewards/margins": 5.58316445590764, "rewards/rejected": -4.805084745762712, "step": 1307 }, { "epoch": 0.89681179293795, "grad_norm": 0.6910744911213377, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92214769.37142856, "logits/rejected": -90756060.68965517, "logps/chosen": -246.85714285714286, "logps/rejected": -374.0689655172414, "loss": 0.2639, "rewards/chosen": 0.6549107142857142, "rewards/margins": 5.999738300492611, "rewards/rejected": -5.344827586206897, "step": 1308 }, { "epoch": 0.8974974288652725, "grad_norm": 0.7178347833446656, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91355388.49315068, "logits/rejected": -96316471.85454546, "logps/chosen": -285.36986301369865, "logps/rejected": -381.96363636363634, "loss": 0.2391, "rewards/chosen": 0.922945205479452, "rewards/margins": 3.8422378635050203, "rewards/rejected": -2.9192926580255683, "step": 1309 }, { "epoch": 0.8981830647925951, "grad_norm": 0.6902413408624405, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77557831.85964912, "logits/rejected": -85185723.49295774, "logps/chosen": -261.6140350877193, "logps/rejected": -320.22535211267603, "loss": 0.176, "rewards/chosen": 1.1513157894736843, "rewards/margins": 6.982301704966642, "rewards/rejected": -5.830985915492958, "step": 1310 }, { "epoch": 0.8988687007199178, "grad_norm": 0.6648273233707621, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79041954.25352113, "logits/rejected": -85799271.29824561, "logps/chosen": -232.33802816901408, "logps/rejected": -341.89473684210526, "loss": 0.22, "rewards/chosen": 2.3600313428422095, "rewards/margins": 6.500382220035192, "rewards/rejected": -4.140350877192983, "step": 1311 }, { "epoch": 0.8995543366472403, "grad_norm": 0.5576588215458788, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84035876.57142857, "logits/rejected": -76779064.8888889, "logps/chosen": -245.14285714285714, "logps/rejected": -351.55555555555554, "loss": 0.2193, "rewards/chosen": 3.9932738712855746, "rewards/margins": 8.889107204618908, "rewards/rejected": -4.895833333333333, "step": 1312 }, { "epoch": 0.9002399725745629, "grad_norm": 0.5251486898787789, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74448896.0, "logits/rejected": -100401152.0, "logps/chosen": -310.25, "logps/rejected": -373.0, "loss": 0.2169, "rewards/chosen": 1.126953125, "rewards/margins": 5.791015625, "rewards/rejected": -4.6640625, "step": 1313 }, { "epoch": 0.9009256085018855, "grad_norm": 0.5472130617634738, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89828010.66666667, "logits/rejected": -82313216.0, "logps/chosen": -237.11111111111111, "logps/rejected": -365.42857142857144, "loss": 0.2022, "rewards/chosen": 1.1857638888888888, "rewards/margins": 4.783134036593967, "rewards/rejected": -3.597370147705078, "step": 1314 }, { "epoch": 0.901611244429208, "grad_norm": 0.8306560748466163, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82508845.85074627, "logits/rejected": -86842720.52459016, "logps/chosen": -243.5820895522388, "logps/rejected": -489.9672131147541, "loss": 0.2112, "rewards/chosen": 1.1333955223880596, "rewards/margins": 6.092411915830683, "rewards/rejected": -4.959016393442623, "step": 1315 }, { "epoch": 0.9022968803565307, "grad_norm": 0.6790524225067062, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -94937738.15873016, "logits/rejected": -71754862.27692308, "logps/chosen": -230.34920634920636, "logps/rejected": -357.9076923076923, "loss": 0.2099, "rewards/chosen": 2.3869076683407737, "rewards/margins": 6.806138437571543, "rewards/rejected": -4.4192307692307695, "step": 1316 }, { "epoch": 0.9029825162838533, "grad_norm": 0.5353377227678682, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -71158536.8275862, "logits/rejected": -82627788.8, "logps/chosen": -291.3103448275862, "logps/rejected": -330.51428571428573, "loss": 0.1945, "rewards/chosen": 1.2489224137931034, "rewards/margins": 6.063208128078817, "rewards/rejected": -4.814285714285714, "step": 1317 }, { "epoch": 0.9036681522111759, "grad_norm": 0.7033172394679775, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88413265.26984127, "logits/rejected": -86273606.8923077, "logps/chosen": -336.76190476190476, "logps/rejected": -320.0, "loss": 0.2205, "rewards/chosen": 1.1051587301587302, "rewards/margins": 5.478235653235653, "rewards/rejected": -4.373076923076923, "step": 1318 }, { "epoch": 0.9043537881384984, "grad_norm": 0.6152006571411371, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91925162.66666667, "logits/rejected": -77903028.70588236, "logps/chosen": -314.93333333333334, "logps/rejected": -339.52941176470586, "loss": 0.213, "rewards/chosen": 1.2197916666666666, "rewards/margins": 5.598468137254902, "rewards/rejected": -4.3786764705882355, "step": 1319 }, { "epoch": 0.905039424065821, "grad_norm": 0.6794856854286366, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75426382.10169491, "logits/rejected": -82184043.5942029, "logps/chosen": -317.2881355932203, "logps/rejected": -315.3623188405797, "loss": 0.1932, "rewards/chosen": 0.9184322033898306, "rewards/margins": 6.0017655367231635, "rewards/rejected": -5.083333333333333, "step": 1320 }, { "epoch": 0.9057250599931437, "grad_norm": 0.711635041852226, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74861971.39393939, "logits/rejected": -99174994.58064516, "logps/chosen": -254.54545454545453, "logps/rejected": -402.06451612903226, "loss": 0.2188, "rewards/chosen": 1.1136363636363635, "rewards/margins": 6.299120234604105, "rewards/rejected": -5.185483870967742, "step": 1321 }, { "epoch": 0.9064106959204662, "grad_norm": 0.7383740069723945, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93937946.48275863, "logits/rejected": -104018739.2, "logps/chosen": -281.37931034482756, "logps/rejected": -356.57142857142856, "loss": 0.2145, "rewards/chosen": 2.6185231702081087, "rewards/margins": 8.311380313065252, "rewards/rejected": -5.692857142857143, "step": 1322 }, { "epoch": 0.9070963318477888, "grad_norm": 0.7586467816530952, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77627912.12698413, "logits/rejected": -85112107.32307692, "logps/chosen": -264.3809523809524, "logps/rejected": -339.2, "loss": 0.2189, "rewards/chosen": 1.1101190476190477, "rewards/margins": 6.00242673992674, "rewards/rejected": -4.892307692307693, "step": 1323 }, { "epoch": 0.9077819677751114, "grad_norm": 0.6279792652069178, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84952428.47457626, "logits/rejected": -73309139.47826087, "logps/chosen": -302.10169491525426, "logps/rejected": -339.4782608695652, "loss": 0.1951, "rewards/chosen": 1.1551906779661016, "rewards/margins": 4.523389238597895, "rewards/rejected": -3.3681985606317935, "step": 1324 }, { "epoch": 0.908467603702434, "grad_norm": 0.5274786548585074, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75792384.0, "logits/rejected": -81133568.0, "logps/chosen": -326.0, "logps/rejected": -305.5, "loss": 0.202, "rewards/chosen": 3.302672863006592, "rewards/margins": 8.244079113006592, "rewards/rejected": -4.94140625, "step": 1325 }, { "epoch": 0.9091532396297566, "grad_norm": 0.4898542108797155, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98814958.6440678, "logits/rejected": -71576709.56521739, "logps/chosen": -247.864406779661, "logps/rejected": -352.463768115942, "loss": 0.2084, "rewards/chosen": 0.78125, "rewards/margins": 5.542119565217392, "rewards/rejected": -4.760869565217392, "step": 1326 }, { "epoch": 0.9098388755570792, "grad_norm": 0.6292833307752986, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83746269.86666666, "logits/rejected": -81912289.88235295, "logps/chosen": -333.06666666666666, "logps/rejected": -393.88235294117646, "loss": 0.1988, "rewards/chosen": 1.1916666666666667, "rewards/margins": 6.287254901960784, "rewards/rejected": -5.095588235294118, "step": 1327 }, { "epoch": 0.9105245114844018, "grad_norm": 0.722100314545074, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -105923948.47457626, "logits/rejected": -76348490.20289855, "logps/chosen": -270.64406779661016, "logps/rejected": -379.3623188405797, "loss": 0.2097, "rewards/chosen": 2.883465265823623, "rewards/margins": 7.738537729591739, "rewards/rejected": -4.855072463768116, "step": 1328 }, { "epoch": 0.9112101474117243, "grad_norm": 0.6585381990256932, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -104730499.87878788, "logits/rejected": -67920664.77419356, "logps/chosen": -382.3030303030303, "logps/rejected": -331.0967741935484, "loss": 0.229, "rewards/chosen": 1.0104166666666667, "rewards/margins": 6.127352150537635, "rewards/rejected": -5.116935483870968, "step": 1329 }, { "epoch": 0.911895783339047, "grad_norm": 0.8220031301779567, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86046782.06060606, "logits/rejected": -86524432.51612903, "logps/chosen": -350.6666666666667, "logps/rejected": -371.35483870967744, "loss": 0.2251, "rewards/chosen": 1.1297348484848484, "rewards/margins": 5.38703740959876, "rewards/rejected": -4.257302561113911, "step": 1330 }, { "epoch": 0.9125814192663696, "grad_norm": 0.7058225640433066, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78465748.67692308, "logits/rejected": -86482553.90476191, "logps/chosen": -282.83076923076925, "logps/rejected": -401.77777777777777, "loss": 0.1891, "rewards/chosen": 1.4365384615384615, "rewards/margins": 6.738125763125763, "rewards/rejected": -5.301587301587301, "step": 1331 }, { "epoch": 0.9132670551936921, "grad_norm": 0.8884622280362213, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83099648.0, "logits/rejected": -78446592.0, "logps/chosen": -338.75, "logps/rejected": -373.0, "loss": 0.2437, "rewards/chosen": 0.845703125, "rewards/margins": 4.212890625, "rewards/rejected": -3.3671875, "step": 1332 }, { "epoch": 0.9139526911210147, "grad_norm": 0.6537909193841999, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75707187.2, "logits/rejected": -75620833.88235295, "logps/chosen": -234.66666666666666, "logps/rejected": -392.94117647058823, "loss": 0.2093, "rewards/chosen": 0.9401041666666666, "rewards/margins": 1917918.5871629904, "rewards/rejected": -1917917.6470588236, "step": 1333 }, { "epoch": 0.9146383270483374, "grad_norm": 0.6175594304581957, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77419861.33333333, "logits/rejected": -83436690.28571428, "logps/chosen": -306.6666666666667, "logps/rejected": -337.7142857142857, "loss": 0.2475, "rewards/chosen": 2.190530776977539, "rewards/margins": 4.080257552010672, "rewards/rejected": -1.8897267750331335, "step": 1334 }, { "epoch": 0.9153239629756599, "grad_norm": 0.591243536863653, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79183375.51515152, "logits/rejected": -88350984.25806452, "logps/chosen": -281.93939393939394, "logps/rejected": -354.5806451612903, "loss": 0.2112, "rewards/chosen": 1.1325757575757576, "rewards/margins": 6.237414467253177, "rewards/rejected": -5.104838709677419, "step": 1335 }, { "epoch": 0.9160095989029825, "grad_norm": 0.6679625429827194, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73174472.86153845, "logits/rejected": -89744790.34920634, "logps/chosen": -244.43076923076924, "logps/rejected": -366.73015873015873, "loss": 0.2084, "rewards/chosen": 1.225, "rewards/margins": 4.763934132409474, "rewards/rejected": -3.5389341324094743, "step": 1336 }, { "epoch": 0.9166952348303051, "grad_norm": 0.9165877641467632, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79198328.47058824, "logits/rejected": -84724940.8, "logps/chosen": -279.52941176470586, "logps/rejected": -395.46666666666664, "loss": 0.2558, "rewards/chosen": 1.0110294117647058, "rewards/margins": 6.211029411764706, "rewards/rejected": -5.2, "step": 1337 }, { "epoch": 0.9173808707576278, "grad_norm": 0.5910124660801667, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80654403.14754099, "logits/rejected": -81256814.80597015, "logps/chosen": -333.37704918032784, "logps/rejected": -372.53731343283584, "loss": 0.2067, "rewards/chosen": 1.3227459016393444, "rewards/margins": 6.199611573281135, "rewards/rejected": -4.876865671641791, "step": 1338 }, { "epoch": 0.9180665066849503, "grad_norm": 0.5525383223027774, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -103222532.33898304, "logits/rejected": -89478485.33333333, "logps/chosen": -317.2881355932203, "logps/rejected": -315.82608695652175, "loss": 0.1738, "rewards/chosen": 1.3093220338983051, "rewards/margins": 6.454249570130189, "rewards/rejected": -5.144927536231884, "step": 1339 }, { "epoch": 0.9187521426122729, "grad_norm": 0.630863020545403, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -74761629.19298245, "logits/rejected": -87076114.02816902, "logps/chosen": -233.40350877192984, "logps/rejected": -340.28169014084506, "loss": 0.1909, "rewards/chosen": 1.087719298245614, "rewards/margins": 6.137015072893501, "rewards/rejected": -5.049295774647887, "step": 1340 }, { "epoch": 0.9194377785395955, "grad_norm": 1.205100506402918, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88662926.22222222, "logits/rejected": -94072246.85714285, "logps/chosen": -316.44444444444446, "logps/rejected": -358.2857142857143, "loss": 0.2405, "rewards/chosen": 0.8428819444444444, "rewards/margins": 6.798239087301588, "rewards/rejected": -5.955357142857143, "step": 1341 }, { "epoch": 0.920123414466918, "grad_norm": 0.642310741737863, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90701824.0, "logits/rejected": -90832896.0, "logps/chosen": -349.5, "logps/rejected": -332.5, "loss": 0.2181, "rewards/chosen": 1.2216796875, "rewards/margins": 6.2607421875, "rewards/rejected": -5.0390625, "step": 1342 }, { "epoch": 0.9208090503942407, "grad_norm": 0.533295810907958, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85913326.93333334, "logits/rejected": -92768135.52941176, "logps/chosen": -272.8, "logps/rejected": -346.3529411764706, "loss": 0.1982, "rewards/chosen": 1.0208333333333333, "rewards/margins": 5.160539215686274, "rewards/rejected": -4.139705882352941, "step": 1343 }, { "epoch": 0.9214946863215633, "grad_norm": 0.7224215844660457, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76429539.55555555, "logits/rejected": -74916722.21538462, "logps/chosen": -264.8888888888889, "logps/rejected": -297.6, "loss": 0.218, "rewards/chosen": 0.75, "rewards/margins": 3.512684983473558, "rewards/rejected": -2.762684983473558, "step": 1344 }, { "epoch": 0.9221803222488858, "grad_norm": 0.5843201513675871, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84021380.12903225, "logits/rejected": -79183375.51515152, "logps/chosen": -273.5483870967742, "logps/rejected": -339.8787878787879, "loss": 0.2116, "rewards/chosen": 0.8931451612903226, "rewards/margins": 5.9537512218963835, "rewards/rejected": -5.0606060606060606, "step": 1345 }, { "epoch": 0.9228659581762084, "grad_norm": 0.6947650572362112, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93694298.58461538, "logits/rejected": -76895573.33333333, "logps/chosen": -365.04615384615386, "logps/rejected": -357.58730158730157, "loss": 0.2434, "rewards/chosen": 0.8586538461538461, "rewards/margins": 5.295161782661783, "rewards/rejected": -4.436507936507937, "step": 1346 }, { "epoch": 0.923551594103531, "grad_norm": 0.7444673692568252, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88882236.23529412, "logits/rejected": -108772283.73333333, "logps/chosen": -272.47058823529414, "logps/rejected": -395.2, "loss": 0.2036, "rewards/chosen": 1.275735294117647, "rewards/margins": 6.83406862745098, "rewards/rejected": -5.558333333333334, "step": 1347 }, { "epoch": 0.9242372300308536, "grad_norm": 0.7243527676692298, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84208718.76923077, "logits/rejected": -79625199.74603175, "logps/chosen": -250.33846153846153, "logps/rejected": -409.3968253968254, "loss": 0.2306, "rewards/chosen": 1.0846153846153845, "rewards/margins": 6.060805860805861, "rewards/rejected": -4.976190476190476, "step": 1348 }, { "epoch": 0.9249228659581762, "grad_norm": 0.4949557942818648, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83009658.26865672, "logits/rejected": -106301540.72131148, "logps/chosen": -255.28358208955223, "logps/rejected": -359.60655737704917, "loss": 0.2009, "rewards/chosen": 1.3041044776119404, "rewards/margins": 6.935252018595547, "rewards/rejected": -5.631147540983607, "step": 1349 }, { "epoch": 0.9256085018854988, "grad_norm": 0.7361161166818848, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83207589.64705883, "logits/rejected": -77804339.2, "logps/chosen": -236.47058823529412, "logps/rejected": -330.1333333333333, "loss": 0.2442, "rewards/chosen": 0.7858455882352942, "rewards/margins": 5.410845588235294, "rewards/rejected": -4.625, "step": 1350 }, { "epoch": 0.9262941378128214, "grad_norm": 0.8703517992085327, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -69138365.93548387, "logits/rejected": -82869279.03030303, "logps/chosen": -280.0, "logps/rejected": -368.969696969697, "loss": 0.1977, "rewards/chosen": 1.216733870967742, "rewards/margins": 5.345521749755621, "rewards/rejected": -4.128787878787879, "step": 1351 }, { "epoch": 0.926979773740144, "grad_norm": 0.5164327580038095, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -67458389.33333333, "logits/rejected": -90120856.21621622, "logps/chosen": -232.88888888888889, "logps/rejected": -352.0, "loss": 0.2112, "rewards/chosen": 0.9259259259259259, "rewards/margins": 5.648898898898898, "rewards/rejected": -4.722972972972973, "step": 1352 }, { "epoch": 0.9276654096674666, "grad_norm": 0.5587545091609183, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72966426.48275863, "logits/rejected": -75856983.77142857, "logps/chosen": -314.48275862068965, "logps/rejected": -327.3142857142857, "loss": 0.2249, "rewards/chosen": 0.8650323275862069, "rewards/margins": 5.8150323275862075, "rewards/rejected": -4.95, "step": 1353 }, { "epoch": 0.9283510455947892, "grad_norm": 0.4567383541412071, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72071139.15492958, "logits/rejected": -91980350.87719299, "logps/chosen": -233.01408450704224, "logps/rejected": -358.17543859649123, "loss": 0.2223, "rewards/chosen": 0.9867957746478874, "rewards/margins": 4.4245645434938305, "rewards/rejected": -3.437768768845943, "step": 1354 }, { "epoch": 0.9290366815221117, "grad_norm": 0.5835994860080082, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87809783.74193548, "logits/rejected": -92147587.87878788, "logps/chosen": -233.80645161290323, "logps/rejected": -312.72727272727275, "loss": 0.1906, "rewards/chosen": 1.1693548387096775, "rewards/margins": 6.464809384164223, "rewards/rejected": -5.295454545454546, "step": 1355 }, { "epoch": 0.9297223174494343, "grad_norm": 0.4731482859712536, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96604292.12903225, "logits/rejected": -91639187.39393939, "logps/chosen": -268.1290322580645, "logps/rejected": -342.7878787878788, "loss": 0.2091, "rewards/chosen": 2.3432560582314768, "rewards/margins": 7.180377270352689, "rewards/rejected": -4.837121212121212, "step": 1356 }, { "epoch": 0.930407953376757, "grad_norm": 0.5658017806928475, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76935519.08571428, "logits/rejected": -92563950.34482759, "logps/chosen": -254.85714285714286, "logps/rejected": -368.0, "loss": 0.2371, "rewards/chosen": 0.971875, "rewards/margins": 5.769288793103448, "rewards/rejected": -4.797413793103448, "step": 1357 }, { "epoch": 0.9310935893040795, "grad_norm": 0.8584367617950814, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92424484.57142857, "logits/rejected": -105673159.1111111, "logps/chosen": -272.2857142857143, "logps/rejected": -374.6666666666667, "loss": 0.2031, "rewards/chosen": 1.2388392857142858, "rewards/margins": 11024819.905505951, "rewards/rejected": -11024818.666666666, "step": 1358 }, { "epoch": 0.9317792252314021, "grad_norm": 0.8083739176740881, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -99975705.18032786, "logits/rejected": -76373893.73134328, "logps/chosen": -296.91803278688525, "logps/rejected": -367.2835820895522, "loss": 0.2238, "rewards/chosen": 0.9723360655737705, "rewards/margins": 5.267112184976756, "rewards/rejected": -4.294776119402985, "step": 1359 }, { "epoch": 0.9324648611587247, "grad_norm": 0.7582794292999744, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95318940.90322581, "logits/rejected": -88207484.12121212, "logps/chosen": -350.4516129032258, "logps/rejected": -413.09090909090907, "loss": 0.1966, "rewards/chosen": 1.3538306451612903, "rewards/margins": 4.350965923111692, "rewards/rejected": -2.9971352779504024, "step": 1360 }, { "epoch": 0.9331504970860474, "grad_norm": 0.8536863799794524, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80030026.32258065, "logits/rejected": -97104492.60606061, "logps/chosen": -206.83870967741936, "logps/rejected": -466.42424242424244, "loss": 0.239, "rewards/chosen": 2.5644907797536542, "rewards/margins": 7.560702900965776, "rewards/rejected": -4.996212121212121, "step": 1361 }, { "epoch": 0.9338361330133699, "grad_norm": 0.8297435167084697, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79945976.24242425, "logits/rejected": -69138365.93548387, "logps/chosen": -304.72727272727275, "logps/rejected": -352.51612903225805, "loss": 0.2119, "rewards/chosen": 1.3740530303030303, "rewards/margins": 6.148246578690127, "rewards/rejected": -4.774193548387097, "step": 1362 }, { "epoch": 0.9345217689406925, "grad_norm": 0.5865415163944678, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -71412110.96103896, "logits/rejected": -99347435.92156863, "logps/chosen": -238.96103896103895, "logps/rejected": -371.45098039215685, "loss": 0.2335, "rewards/chosen": 1.140422077922078, "rewards/margins": 6.311990705373058, "rewards/rejected": -5.171568627450981, "step": 1363 }, { "epoch": 0.9352074048680151, "grad_norm": 0.579916228906381, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80061861.64705883, "logits/rejected": -86402662.4, "logps/chosen": -279.29411764705884, "logps/rejected": -330.4, "loss": 0.2223, "rewards/chosen": 1.0900735294117647, "rewards/margins": 4.665073529411765, "rewards/rejected": -3.575, "step": 1364 }, { "epoch": 0.9358930407953376, "grad_norm": 0.8062346260099513, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89822633.35384615, "logits/rejected": -77960793.3968254, "logps/chosen": -287.26153846153846, "logps/rejected": -396.1904761904762, "loss": 0.2377, "rewards/chosen": 2.5105248084435097, "rewards/margins": 7.506556554475256, "rewards/rejected": -4.996031746031746, "step": 1365 }, { "epoch": 0.9365786767226603, "grad_norm": 0.5712273324955558, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97000664.33802816, "logits/rejected": -79691776.0, "logps/chosen": -216.7887323943662, "logps/rejected": -335.43859649122805, "loss": 0.2401, "rewards/chosen": 0.9947183098591549, "rewards/margins": 5.205244625648628, "rewards/rejected": -4.2105263157894735, "step": 1366 }, { "epoch": 0.9372643126499829, "grad_norm": 0.7708248210252733, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -69555541.33333333, "logits/rejected": -72201947.42857143, "logps/chosen": -278.0, "logps/rejected": -278.85714285714283, "loss": 0.2413, "rewards/chosen": 2.4773809644911022, "rewards/margins": 6.910416678776817, "rewards/rejected": -4.433035714285714, "step": 1367 }, { "epoch": 0.9379499485773054, "grad_norm": 0.6460614953990985, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87392793.18032786, "logits/rejected": -113934825.07462686, "logps/chosen": -306.3606557377049, "logps/rejected": -389.25373134328356, "loss": 0.241, "rewards/chosen": 0.6005379098360656, "rewards/margins": 5.160239402373379, "rewards/rejected": -4.559701492537314, "step": 1368 }, { "epoch": 0.938635584504628, "grad_norm": 0.5068386625063114, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73228422.29508197, "logits/rejected": -84699900.17910448, "logps/chosen": -236.85245901639345, "logps/rejected": -382.089552238806, "loss": 0.1887, "rewards/chosen": 1.0635245901639345, "rewards/margins": 6.168002202104233, "rewards/rejected": -5.104477611940299, "step": 1369 }, { "epoch": 0.9393212204319507, "grad_norm": 0.764885909875878, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93739365.58730158, "logits/rejected": -93565243.07692307, "logps/chosen": -281.3968253968254, "logps/rejected": -418.95384615384614, "loss": 0.2046, "rewards/chosen": 4.351690867590526, "rewards/margins": 9.567075482975142, "rewards/rejected": -5.2153846153846155, "step": 1370 }, { "epoch": 0.9400068563592733, "grad_norm": 0.69361922744287, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77046138.09230769, "logits/rejected": -83153741.20634921, "logps/chosen": -234.95384615384614, "logps/rejected": -373.3333333333333, "loss": 0.2282, "rewards/chosen": 0.7798076923076923, "rewards/margins": 5.799648962148962, "rewards/rejected": -5.01984126984127, "step": 1371 }, { "epoch": 0.9406924922865958, "grad_norm": 0.6367763296208271, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76919269.96610169, "logits/rejected": -83825292.98550725, "logps/chosen": -258.1694915254237, "logps/rejected": -346.8985507246377, "loss": 0.1859, "rewards/chosen": 3.2388678081965043, "rewards/margins": 8.427273605297954, "rewards/rejected": -5.188405797101449, "step": 1372 }, { "epoch": 0.9413781282139184, "grad_norm": 0.899374130940108, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -102000610.31884058, "logits/rejected": -106990296.94915254, "logps/chosen": -310.0289855072464, "logps/rejected": -331.66101694915255, "loss": 0.2217, "rewards/chosen": 1.3170289855072463, "rewards/margins": 18181431.55431712, "rewards/rejected": -18181430.237288136, "step": 1373 }, { "epoch": 0.942063764141241, "grad_norm": 0.8014631509731925, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -67167938.70422535, "logits/rejected": -95659564.91228071, "logps/chosen": -262.76056338028167, "logps/rejected": -333.1929824561403, "loss": 0.2563, "rewards/chosen": 0.9388204225352113, "rewards/margins": 5.816013404991351, "rewards/rejected": -4.87719298245614, "step": 1374 }, { "epoch": 0.9427494000685636, "grad_norm": 0.6574163556329489, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -109677919.52238806, "logits/rejected": -101625923.14754099, "logps/chosen": -322.3880597014925, "logps/rejected": -386.0983606557377, "loss": 0.2386, "rewards/chosen": 1.1128731343283582, "rewards/margins": 6.334184609738195, "rewards/rejected": -5.221311475409836, "step": 1375 }, { "epoch": 0.9434350359958862, "grad_norm": 0.7616983303538795, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73319660.3076923, "logits/rejected": -85652102.73684211, "logps/chosen": -281.53846153846155, "logps/rejected": -361.2631578947368, "loss": 0.1927, "rewards/chosen": 1.2542067307692308, "rewards/margins": 6.010785678137652, "rewards/rejected": -4.756578947368421, "step": 1376 }, { "epoch": 0.9441206719232088, "grad_norm": 0.5804439518229091, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81682293.15254237, "logits/rejected": -89356911.30434783, "logps/chosen": -245.96610169491527, "logps/rejected": -327.42028985507244, "loss": 0.2052, "rewards/chosen": 2.666278903767214, "rewards/margins": 7.52135136753533, "rewards/rejected": -4.855072463768116, "step": 1377 }, { "epoch": 0.9448063078505313, "grad_norm": 0.781236010401636, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -72941568.0, "logits/rejected": -78381056.0, "logps/chosen": -190.25, "logps/rejected": -348.25, "loss": 0.21, "rewards/chosen": 1.0205078125, "rewards/margins": 6.0908203125, "rewards/rejected": -5.0703125, "step": 1378 }, { "epoch": 0.945491943777854, "grad_norm": 0.56456412199915, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83769571.55555555, "logits/rejected": -82200868.57142857, "logps/chosen": -213.33333333333334, "logps/rejected": -358.2857142857143, "loss": 0.2539, "rewards/chosen": 0.7352430555555556, "rewards/margins": 5.76202876984127, "rewards/rejected": -5.026785714285714, "step": 1379 }, { "epoch": 0.9461775797051766, "grad_norm": 0.6207166600049521, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73544951.1724138, "logits/rejected": -108692392.22857143, "logps/chosen": -299.0344827586207, "logps/rejected": -391.3142857142857, "loss": 0.2059, "rewards/chosen": 1.0862068965517242, "rewards/margins": 5.836206896551724, "rewards/rejected": -4.75, "step": 1380 }, { "epoch": 0.9468632156324991, "grad_norm": 0.5808208501647255, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95995441.5483871, "logits/rejected": -92274688.0, "logps/chosen": -259.35483870967744, "logps/rejected": -416.969696969697, "loss": 0.196, "rewards/chosen": 1.0504032258064515, "rewards/margins": 6.929191104594331, "rewards/rejected": -5.878787878787879, "step": 1381 }, { "epoch": 0.9475488515598217, "grad_norm": 0.5742199137435591, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -79752563.01449275, "logits/rejected": -74075674.03389831, "logps/chosen": -252.28985507246378, "logps/rejected": -335.45762711864404, "loss": 0.1967, "rewards/chosen": 1.3659420289855073, "rewards/margins": 6.128653893392286, "rewards/rejected": -4.762711864406779, "step": 1382 }, { "epoch": 0.9482344874871443, "grad_norm": 0.6437265303246765, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87086996.21052632, "logits/rejected": -108245307.07692307, "logps/chosen": -306.10526315789474, "logps/rejected": -362.7692307692308, "loss": 0.2485, "rewards/chosen": 1.2417763157894737, "rewards/margins": 4.583122469635628, "rewards/rejected": -3.3413461538461537, "step": 1383 }, { "epoch": 0.948920123414467, "grad_norm": 0.5952970602837213, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -91226112.0, "logits/rejected": -81678551.57894737, "logps/chosen": -241.84615384615384, "logps/rejected": -376.0, "loss": 0.1775, "rewards/chosen": 3.073894500732422, "rewards/margins": 8.422578711258737, "rewards/rejected": -5.348684210526316, "step": 1384 }, { "epoch": 0.9496057593417895, "grad_norm": 0.8385190313876743, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89020486.62068966, "logits/rejected": -66000369.37142857, "logps/chosen": -282.2068965517241, "logps/rejected": -322.74285714285713, "loss": 0.2021, "rewards/chosen": 0.7359913793103449, "rewards/margins": 5.678848522167488, "rewards/rejected": -4.942857142857143, "step": 1385 }, { "epoch": 0.9502913952691121, "grad_norm": 0.5121613002434962, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83370990.03508772, "logits/rejected": -93574331.49295774, "logps/chosen": -291.36842105263156, "logps/rejected": -411.49295774647885, "loss": 0.2116, "rewards/chosen": 0.9643640350877193, "rewards/margins": 5.893941499876451, "rewards/rejected": -4.929577464788732, "step": 1386 }, { "epoch": 0.9509770311964347, "grad_norm": 0.7127267624949553, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78103024.48484848, "logits/rejected": -76038672.51612903, "logps/chosen": -279.5151515151515, "logps/rejected": -333.6774193548387, "loss": 0.2321, "rewards/chosen": 0.5066287878787878, "rewards/margins": 5.809048142717498, "rewards/rejected": -5.30241935483871, "step": 1387 }, { "epoch": 0.9516626671237572, "grad_norm": 0.5681409991769002, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88355420.32786885, "logits/rejected": -76436495.28358209, "logps/chosen": -242.62295081967213, "logps/rejected": -365.6119402985075, "loss": 0.1913, "rewards/chosen": 1.1798155737704918, "rewards/margins": 5.171653260337656, "rewards/rejected": -3.9918376865671643, "step": 1388 }, { "epoch": 0.9523483030510799, "grad_norm": 0.4866287169437619, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -89605585.45454545, "logits/rejected": -65212808.76712329, "logps/chosen": -283.05454545454546, "logps/rejected": -341.47945205479454, "loss": 0.1715, "rewards/chosen": 1.3545454545454545, "rewards/margins": 6.286052303860523, "rewards/rejected": -4.931506849315069, "step": 1389 }, { "epoch": 0.9530339389784025, "grad_norm": 0.45283941210438555, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -73269248.0, "logits/rejected": -82575360.0, "logps/chosen": -218.0, "logps/rejected": -332.0, "loss": 0.1988, "rewards/chosen": 2.8569226264953613, "rewards/margins": 8.259266376495361, "rewards/rejected": -5.40234375, "step": 1390 }, { "epoch": 0.953719574905725, "grad_norm": 0.6877676286513548, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83099648.0, "logits/rejected": -78118912.0, "logps/chosen": -302.0, "logps/rejected": -334.25, "loss": 0.2188, "rewards/chosen": 1.9997538328170776, "rewards/margins": 5.609128832817078, "rewards/rejected": -3.609375, "step": 1391 }, { "epoch": 0.9544052108330476, "grad_norm": 0.5637082491285869, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -95183640.77419356, "logits/rejected": -72002218.66666667, "logps/chosen": -336.0, "logps/rejected": -313.6969696969697, "loss": 0.2258, "rewards/chosen": 0.9475806451612904, "rewards/margins": 4.492322651405372, "rewards/rejected": -3.5447420062440815, "step": 1392 }, { "epoch": 0.9550908467603703, "grad_norm": 0.6532508939283319, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -67171465.5522388, "logits/rejected": -68415286.55737706, "logps/chosen": -234.7462686567164, "logps/rejected": -324.72131147540983, "loss": 0.2278, "rewards/chosen": 2.552715244577892, "rewards/margins": 7.290420162610678, "rewards/rejected": -4.737704918032787, "step": 1393 }, { "epoch": 0.9557764826876929, "grad_norm": 0.7888090217608744, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -69599232.0, "logits/rejected": -90177536.0, "logps/chosen": -175.5, "logps/rejected": -384.5, "loss": 0.1935, "rewards/chosen": 1.0712890625, "rewards/margins": 5.6181640625, "rewards/rejected": -4.546875, "step": 1394 }, { "epoch": 0.9564621186150154, "grad_norm": 0.6660816960246785, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83526568.22857143, "logits/rejected": -92563950.34482759, "logps/chosen": -280.9142857142857, "logps/rejected": -388.9655172413793, "loss": 0.2207, "rewards/chosen": 1.0084821428571429, "rewards/margins": 6.6421028325123155, "rewards/rejected": -5.633620689655173, "step": 1395 }, { "epoch": 0.957147754542338, "grad_norm": 0.7025472690120668, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86123042.13333334, "logits/rejected": -69699463.52941176, "logps/chosen": -338.1333333333333, "logps/rejected": -329.1764705882353, "loss": 0.2162, "rewards/chosen": 0.7791666666666667, "rewards/margins": 5.918872549019608, "rewards/rejected": -5.139705882352941, "step": 1396 }, { "epoch": 0.9578333904696607, "grad_norm": 0.6378096127894074, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80965046.85714285, "logits/rejected": -86565774.22222222, "logps/chosen": -256.2857142857143, "logps/rejected": -366.0, "loss": 0.1786, "rewards/chosen": 1.2845982142857142, "rewards/margins": 5.756820436507937, "rewards/rejected": -4.472222222222222, "step": 1397 }, { "epoch": 0.9585190263969832, "grad_norm": 0.764708387722287, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80631878.62068966, "logits/rejected": -94791270.4, "logps/chosen": -292.13793103448273, "logps/rejected": -293.9428571428571, "loss": 0.2082, "rewards/chosen": 3.8830771610654633, "rewards/margins": 8.804505732494036, "rewards/rejected": -4.921428571428572, "step": 1398 }, { "epoch": 0.9592046623243058, "grad_norm": 0.5260737329193138, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76973245.62962963, "logits/rejected": -64501593.94594595, "logps/chosen": -293.6296296296296, "logps/rejected": -318.27027027027026, "loss": 0.2052, "rewards/chosen": 3.323494240089699, "rewards/margins": 7.742413159008619, "rewards/rejected": -4.418918918918919, "step": 1399 }, { "epoch": 0.9598902982516284, "grad_norm": 0.5115698671561819, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -62390272.0, "logits/rejected": -78060657.77777778, "logps/chosen": -240.0, "logps/rejected": -376.0, "loss": 0.1858, "rewards/chosen": 1.5100446428571428, "rewards/margins": 7.010044642857142, "rewards/rejected": -5.5, "step": 1400 }, { "epoch": 0.9605759341789509, "grad_norm": 0.8981928725283134, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -69721709.1147541, "logits/rejected": -82258439.64179105, "logps/chosen": -232.78688524590163, "logps/rejected": -343.4029850746269, "loss": 0.2266, "rewards/chosen": 0.694672131147541, "rewards/margins": 5.601388549057988, "rewards/rejected": -4.906716417910448, "step": 1401 }, { "epoch": 0.9612615701062736, "grad_norm": 0.7266928205024891, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85468836.2264151, "logits/rejected": -79691776.0, "logps/chosen": -220.52830188679246, "logps/rejected": -331.94666666666666, "loss": 0.2005, "rewards/chosen": 3.1609425094892396, "rewards/margins": 7.814275842822573, "rewards/rejected": -4.653333333333333, "step": 1402 }, { "epoch": 0.9619472060335962, "grad_norm": 0.6616565146768785, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84291980.38709678, "logits/rejected": -98121293.57575758, "logps/chosen": -319.48387096774195, "logps/rejected": -409.6969696969697, "loss": 0.2155, "rewards/chosen": 0.811491935483871, "rewards/margins": 4.705431329423265, "rewards/rejected": -3.893939393939394, "step": 1403 }, { "epoch": 0.9626328419609188, "grad_norm": 0.5726444977849656, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -96812787.40983607, "logits/rejected": -85513720.35820895, "logps/chosen": -339.672131147541, "logps/rejected": -342.44776119402985, "loss": 0.1961, "rewards/chosen": 1.4528688524590163, "rewards/margins": 6.2215255688769275, "rewards/rejected": -4.768656716417911, "step": 1404 }, { "epoch": 0.9633184778882413, "grad_norm": 0.5142176620155048, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -83486622.47619048, "logits/rejected": -82143830.64615385, "logps/chosen": -253.96825396825398, "logps/rejected": -327.6307692307692, "loss": 0.2081, "rewards/chosen": 2.414670671735491, "rewards/margins": 7.068516825581645, "rewards/rejected": -4.653846153846154, "step": 1405 }, { "epoch": 0.964004113815564, "grad_norm": 0.5481296194522928, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80982331.07692307, "logits/rejected": -64911847.61904762, "logps/chosen": -283.5692307692308, "logps/rejected": -343.1111111111111, "loss": 0.2213, "rewards/chosen": 0.9633413461538461, "rewards/margins": 5.558579441391941, "rewards/rejected": -4.595238095238095, "step": 1406 }, { "epoch": 0.9646897497428866, "grad_norm": 0.5320130555057708, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -66891917.24137931, "logits/rejected": -83167056.45714286, "logps/chosen": -276.9655172413793, "logps/rejected": -414.1714285714286, "loss": 0.1592, "rewards/chosen": 1.8599137931034482, "rewards/margins": 6.88134236453202, "rewards/rejected": -5.021428571428571, "step": 1407 }, { "epoch": 0.9653753856702091, "grad_norm": 0.5575177583958345, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75432944.24615385, "logits/rejected": -72168659.3015873, "logps/chosen": -283.32307692307694, "logps/rejected": -321.77777777777777, "loss": 0.2237, "rewards/chosen": 1.0033653846153847, "rewards/margins": 6.32876221001221, "rewards/rejected": -5.325396825396825, "step": 1408 }, { "epoch": 0.9660610215975317, "grad_norm": 0.5700315897112617, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75733770.81690142, "logits/rejected": -84180417.12280701, "logps/chosen": -264.7887323943662, "logps/rejected": -317.1929824561403, "loss": 0.2283, "rewards/chosen": 0.9388204225352113, "rewards/margins": 6.500223931307141, "rewards/rejected": -5.56140350877193, "step": 1409 }, { "epoch": 0.9667466575248543, "grad_norm": 0.5488006000676223, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100139008.0, "logits/rejected": -78839808.0, "logps/chosen": -304.75, "logps/rejected": -380.25, "loss": 0.1988, "rewards/chosen": 1.1767578125, "rewards/margins": 6.5517578125, "rewards/rejected": -5.375, "step": 1410 }, { "epoch": 0.9674322934521769, "grad_norm": 0.6488801629919494, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80290962.28571428, "logits/rejected": -106857960.36923076, "logps/chosen": -234.9206349206349, "logps/rejected": -336.0, "loss": 0.202, "rewards/chosen": 1.4751984126984128, "rewards/margins": 5.4636599511599515, "rewards/rejected": -3.9884615384615385, "step": 1411 }, { "epoch": 0.9681179293794995, "grad_norm": 0.5991945154549638, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88879299.04761904, "logits/rejected": -85047579.56923077, "logps/chosen": -245.33333333333334, "logps/rejected": -336.24615384615385, "loss": 0.2432, "rewards/chosen": 2.1227475120907737, "rewards/margins": 6.653516742860004, "rewards/rejected": -4.530769230769231, "step": 1412 }, { "epoch": 0.9688035653068221, "grad_norm": 0.6317703594461285, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85043129.37931034, "logits/rejected": -76336332.8, "logps/chosen": -296.2758620689655, "logps/rejected": -285.25714285714287, "loss": 0.1779, "rewards/chosen": 1.4234913793103448, "rewards/margins": 6.58777709359606, "rewards/rejected": -5.164285714285715, "step": 1413 }, { "epoch": 0.9694892012341446, "grad_norm": 0.5615227114178913, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93696485.96610169, "logits/rejected": -65649975.652173914, "logps/chosen": -276.33898305084745, "logps/rejected": -333.4492753623188, "loss": 0.1818, "rewards/chosen": 1.2669491525423728, "rewards/margins": 7.1075288626873006, "rewards/rejected": -5.840579710144928, "step": 1414 }, { "epoch": 0.9701748371614672, "grad_norm": 0.606277397090152, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85241162.83076923, "logits/rejected": -84019232.50793651, "logps/chosen": -261.53846153846155, "logps/rejected": -316.95238095238096, "loss": 0.2297, "rewards/chosen": 0.7376201923076923, "rewards/margins": 5.436032890720391, "rewards/rejected": -4.698412698412699, "step": 1415 }, { "epoch": 0.9708604730887899, "grad_norm": 0.8171705838257762, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86542472.53333333, "logits/rejected": -97085801.41176471, "logps/chosen": -307.06666666666666, "logps/rejected": -360.0, "loss": 0.2131, "rewards/chosen": 1.2020833333333334, "rewards/margins": 6.0917892156862745, "rewards/rejected": -4.889705882352941, "step": 1416 }, { "epoch": 0.9715461090161125, "grad_norm": 0.6792047458041702, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78293674.66666667, "logits/rejected": -72851834.09230769, "logps/chosen": -226.79365079365078, "logps/rejected": -337.2307692307692, "loss": 0.2554, "rewards/chosen": 0.5705605158730159, "rewards/margins": 5.709022054334555, "rewards/rejected": -5.138461538461539, "step": 1417 }, { "epoch": 0.972231744943435, "grad_norm": 0.5832427878981364, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76895573.33333333, "logits/rejected": -69530926.87323944, "logps/chosen": -284.35087719298247, "logps/rejected": -328.5633802816901, "loss": 0.189, "rewards/chosen": 1.1337719298245614, "rewards/margins": 6.176025450951323, "rewards/rejected": -5.042253521126761, "step": 1418 }, { "epoch": 0.9729173808707576, "grad_norm": 0.5692010590883186, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84191120.2909091, "logits/rejected": -76186946.63013698, "logps/chosen": -211.63636363636363, "logps/rejected": -316.93150684931504, "loss": 0.2027, "rewards/chosen": 0.7045454545454546, "rewards/margins": 6.3757783312577825, "rewards/rejected": -5.671232876712328, "step": 1419 }, { "epoch": 0.9736030167980803, "grad_norm": 0.5647581788459524, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75691055.26153846, "logits/rejected": -79292318.47619048, "logps/chosen": -208.24615384615385, "logps/rejected": -335.74603174603175, "loss": 0.1976, "rewards/chosen": 1.0826923076923076, "rewards/margins": 4.5382923801594455, "rewards/rejected": -3.455600072467138, "step": 1420 }, { "epoch": 0.9742886527254028, "grad_norm": 0.7662390497601617, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -71248696.51948053, "logits/rejected": -83310391.21568628, "logps/chosen": -292.3636363636364, "logps/rejected": -348.54901960784315, "loss": 0.2402, "rewards/chosen": 2.255257544579444, "rewards/margins": 6.049375191638267, "rewards/rejected": -3.7941176470588234, "step": 1421 }, { "epoch": 0.9749742886527254, "grad_norm": 0.6569493291058179, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76350550.77966101, "logits/rejected": -89721633.39130434, "logps/chosen": -233.89830508474577, "logps/rejected": -334.3768115942029, "loss": 0.1976, "rewards/chosen": 3.07783637612553, "rewards/margins": 8.287981303661761, "rewards/rejected": -5.2101449275362315, "step": 1422 }, { "epoch": 0.975659924580048, "grad_norm": 0.7409672929572446, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81518327.74193548, "logits/rejected": -79374025.6969697, "logps/chosen": -326.96774193548384, "logps/rejected": -351.030303030303, "loss": 0.2192, "rewards/chosen": 0.9836189516129032, "rewards/margins": 5.858618951612903, "rewards/rejected": -4.875, "step": 1423 }, { "epoch": 0.9763455605073705, "grad_norm": 0.5026253772740008, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97994193.45454545, "logits/rejected": -75226871.74193548, "logps/chosen": -297.45454545454544, "logps/rejected": -305.03225806451616, "loss": 0.202, "rewards/chosen": 3.0558672818270596, "rewards/margins": 6.682619581474237, "rewards/rejected": -3.6267522996471775, "step": 1424 }, { "epoch": 0.9770311964346932, "grad_norm": 0.6883610909191118, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84165700.26666667, "logits/rejected": -84256165.64705883, "logps/chosen": -305.8666666666667, "logps/rejected": -420.2352941176471, "loss": 0.2257, "rewards/chosen": 0.8833333333333333, "rewards/margins": 6.250980392156862, "rewards/rejected": -5.367647058823529, "step": 1425 }, { "epoch": 0.9777168323620158, "grad_norm": 0.6097495786363766, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -65173031.384615384, "logits/rejected": -67674762.15873016, "logps/chosen": -253.2923076923077, "logps/rejected": -330.6666666666667, "loss": 0.1866, "rewards/chosen": 1.376923076923077, "rewards/margins": 5.599145299145299, "rewards/rejected": -4.222222222222222, "step": 1426 }, { "epoch": 0.9784024682893384, "grad_norm": 0.6536335301236272, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -88662926.22222222, "logits/rejected": -67108864.0, "logps/chosen": -305.55555555555554, "logps/rejected": -321.14285714285717, "loss": 0.2398, "rewards/chosen": 1.0954861111111112, "rewards/margins": 6.140128968253968, "rewards/rejected": -5.044642857142857, "step": 1427 }, { "epoch": 0.9790881042166609, "grad_norm": 0.7880894201857666, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -68846504.22857143, "logits/rejected": -79981038.34482759, "logps/chosen": -233.6, "logps/rejected": -374.0689655172414, "loss": 0.243, "rewards/chosen": 3.6479771205357143, "rewards/margins": 7.484184017087438, "rewards/rejected": -3.836206896551724, "step": 1428 }, { "epoch": 0.9797737401439836, "grad_norm": 0.5316665605747344, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -78512128.0, "logits/rejected": -80740352.0, "logps/chosen": -233.25, "logps/rejected": -364.0, "loss": 0.2378, "rewards/chosen": 0.72412109375, "rewards/margins": 5.69287109375, "rewards/rejected": -4.96875, "step": 1429 }, { "epoch": 0.9804593760713062, "grad_norm": 0.5927849747012017, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -90917707.29411764, "logits/rejected": -87171618.13333334, "logps/chosen": -320.0, "logps/rejected": -352.53333333333336, "loss": 0.224, "rewards/chosen": 2.551469690659467, "rewards/margins": 7.343136357326134, "rewards/rejected": -4.791666666666667, "step": 1430 }, { "epoch": 0.9811450119986287, "grad_norm": 0.5245793923597616, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76096658.28571428, "logits/rejected": -94339576.12307692, "logps/chosen": -262.6031746031746, "logps/rejected": -360.8615384615385, "loss": 0.2013, "rewards/chosen": 1.2311507936507937, "rewards/margins": 6.915766178266178, "rewards/rejected": -5.684615384615385, "step": 1431 }, { "epoch": 0.9818306479259513, "grad_norm": 0.5972667514227588, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85306731.35483871, "logits/rejected": -83886080.0, "logps/chosen": -207.74193548387098, "logps/rejected": -407.27272727272725, "loss": 0.2237, "rewards/chosen": 1.9244041442871094, "rewards/margins": 6.20470717459014, "rewards/rejected": -4.28030303030303, "step": 1432 }, { "epoch": 0.9825162838532739, "grad_norm": 0.6806600470788254, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98506225.37142856, "logits/rejected": -70001487.44827586, "logps/chosen": -268.1142857142857, "logps/rejected": -338.7586206896552, "loss": 0.2431, "rewards/chosen": 0.9767857142857143, "rewards/margins": 4.717357361140509, "rewards/rejected": -3.740571646854795, "step": 1433 }, { "epoch": 0.9832019197805965, "grad_norm": 0.5424796925719918, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -69206016.0, "logits/rejected": -85084452.57142857, "logps/chosen": -254.66666666666666, "logps/rejected": -306.2857142857143, "loss": 0.2209, "rewards/chosen": 1.2934027777777777, "rewards/margins": 5.6594742063492065, "rewards/rejected": -4.366071428571429, "step": 1434 }, { "epoch": 0.9838875557079191, "grad_norm": 0.5135473217592257, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80835677.0909091, "logits/rejected": -103708475.61643836, "logps/chosen": -282.1818181818182, "logps/rejected": -384.8767123287671, "loss": 0.1832, "rewards/chosen": 1.3386363636363636, "rewards/margins": 7.420828144458282, "rewards/rejected": -6.082191780821918, "step": 1435 }, { "epoch": 0.9845731916352417, "grad_norm": 0.6696781648619416, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107293002.32258065, "logits/rejected": -71239617.93939394, "logps/chosen": -305.80645161290323, "logps/rejected": -324.6060606060606, "loss": 0.21, "rewards/chosen": 2.912260978452621, "rewards/margins": 7.6660488572405, "rewards/rejected": -4.753787878787879, "step": 1436 }, { "epoch": 0.9852588275625643, "grad_norm": 0.7800449286549709, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76429539.55555555, "logits/rejected": -81046858.83076923, "logps/chosen": -276.06349206349205, "logps/rejected": -343.1384615384615, "loss": 0.2302, "rewards/chosen": 0.9007936507936508, "rewards/margins": 4.823144020760598, "rewards/rejected": -3.922350369966947, "step": 1437 }, { "epoch": 0.9859444634898868, "grad_norm": 0.6210607034513972, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -77705000.42105263, "logits/rejected": -70535196.84507042, "logps/chosen": -266.3859649122807, "logps/rejected": -345.23943661971833, "loss": 0.1894, "rewards/chosen": 1.4144736842105263, "rewards/margins": 6.1539103039288365, "rewards/rejected": -4.73943661971831, "step": 1438 }, { "epoch": 0.9866300994172095, "grad_norm": 0.7098371853278423, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85404707.31034483, "logits/rejected": -92154850.74285714, "logps/chosen": -290.48275862068965, "logps/rejected": -331.8857142857143, "loss": 0.1937, "rewards/chosen": 1.3760775862068966, "rewards/margins": 6.233220443349754, "rewards/rejected": -4.857142857142857, "step": 1439 }, { "epoch": 0.9873157353445321, "grad_norm": 0.5162786317927671, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92569025.12280701, "logits/rejected": -72425587.38028169, "logps/chosen": -275.36842105263156, "logps/rejected": -329.46478873239437, "loss": 0.1991, "rewards/chosen": 1.006578947368421, "rewards/margins": 6.006578947368421, "rewards/rejected": -5.0, "step": 1440 }, { "epoch": 0.9880013712718546, "grad_norm": 0.8010463092219567, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -100122095.48387097, "logits/rejected": -77658174.06060606, "logps/chosen": -329.80645161290323, "logps/rejected": -389.3333333333333, "loss": 0.2217, "rewards/chosen": 1.059475806451613, "rewards/margins": 5.813263685239492, "rewards/rejected": -4.753787878787879, "step": 1441 }, { "epoch": 0.9886870071991772, "grad_norm": 0.6856759211926727, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -92382234.25641026, "logits/rejected": -81033953.28, "logps/chosen": -323.6923076923077, "logps/rejected": -366.72, "loss": 0.2285, "rewards/chosen": 1.0945512820512822, "rewards/margins": 1257000.454551282, "rewards/rejected": -1256999.36, "step": 1442 }, { "epoch": 0.9893726431264999, "grad_norm": 0.5234482199947542, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -98491245.71428572, "logits/rejected": -79517013.33333333, "logps/chosen": -297.14285714285717, "logps/rejected": -342.6666666666667, "loss": 0.222, "rewards/chosen": 2.579730987548828, "rewards/margins": 7.850564320882161, "rewards/rejected": -5.270833333333333, "step": 1443 }, { "epoch": 0.9900582790538224, "grad_norm": 0.6658401982982244, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97907039.08571428, "logits/rejected": -75786734.34482759, "logps/chosen": -281.37142857142857, "logps/rejected": -318.8965517241379, "loss": 0.2294, "rewards/chosen": 1.0455357142857142, "rewards/margins": 5.554156403940887, "rewards/rejected": -4.508620689655173, "step": 1444 }, { "epoch": 0.990743914981145, "grad_norm": 0.5622630242314303, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -81507221.01492538, "logits/rejected": -76528858.22950819, "logps/chosen": -328.35820895522386, "logps/rejected": -319.21311475409834, "loss": 0.2159, "rewards/chosen": 1.357276119402985, "rewards/margins": 10021857.881866284, "rewards/rejected": -10021856.524590164, "step": 1445 }, { "epoch": 0.9914295509084676, "grad_norm": 0.5683077274480368, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -93206755.55555555, "logits/rejected": -77691415.63076924, "logps/chosen": -251.42857142857142, "logps/rejected": -313.6, "loss": 0.2256, "rewards/chosen": 0.9508928571428571, "rewards/margins": 5.985508241758241, "rewards/rejected": -5.0346153846153845, "step": 1446 }, { "epoch": 0.9921151868357901, "grad_norm": 0.765390499489493, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -75440015.78082192, "logits/rejected": -69396666.18181819, "logps/chosen": -248.1095890410959, "logps/rejected": -344.1454545454545, "loss": 0.2462, "rewards/chosen": 0.8056506849315068, "rewards/margins": 5.773832503113325, "rewards/rejected": -4.968181818181818, "step": 1447 }, { "epoch": 0.9928008227631128, "grad_norm": 0.7093014910741557, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -107694923.29411764, "logits/rejected": -86682282.66666667, "logps/chosen": -295.29411764705884, "logps/rejected": -368.26666666666665, "loss": 0.226, "rewards/chosen": 0.9181985294117647, "rewards/margins": 2.7535559411142385, "rewards/rejected": -1.835357411702474, "step": 1448 }, { "epoch": 0.9934864586904354, "grad_norm": 0.6597670654098943, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -87254233.21212122, "logits/rejected": -70356067.09677419, "logps/chosen": -269.3333333333333, "logps/rejected": -340.38709677419354, "loss": 0.236, "rewards/chosen": 0.7107007575757576, "rewards/margins": 2.14008603510852, "rewards/rejected": -1.429385277532762, "step": 1449 }, { "epoch": 0.994172094617758, "grad_norm": 0.5815323510270639, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85644981.67741935, "logits/rejected": -74290020.84848484, "logps/chosen": -355.0967741935484, "logps/rejected": -332.1212121212121, "loss": 0.2285, "rewards/chosen": 1.0216733870967742, "rewards/margins": 6.051976417399804, "rewards/rejected": -5.03030303030303, "step": 1450 }, { "epoch": 0.9948577305450805, "grad_norm": 0.6005897741274092, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -86851019.03448276, "logits/rejected": -63753420.8, "logps/chosen": -288.0, "logps/rejected": -298.0571428571429, "loss": 0.1906, "rewards/chosen": 2.970898069184402, "rewards/margins": 8.24946949775583, "rewards/rejected": -5.2785714285714285, "step": 1451 }, { "epoch": 0.9955433664724032, "grad_norm": 0.47699073559435995, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -85171431.22580644, "logits/rejected": -62978110.06060606, "logps/chosen": -321.80645161290323, "logps/rejected": -337.6969696969697, "loss": 0.2216, "rewards/chosen": 1.2993951612903225, "rewards/margins": 6.144092130987293, "rewards/rejected": -4.84469696969697, "step": 1452 }, { "epoch": 0.9962290023997258, "grad_norm": 0.5656303739892541, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76977814.58823529, "logits/rejected": -83536554.66666667, "logps/chosen": -301.4117647058824, "logps/rejected": -324.26666666666665, "loss": 0.2138, "rewards/chosen": 1.3365119485294117, "rewards/margins": 4.789105942670037, "rewards/rejected": -3.452593994140625, "step": 1453 }, { "epoch": 0.9969146383270483, "grad_norm": 0.5716571237287702, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -84256165.64705883, "logits/rejected": -78293674.66666667, "logps/chosen": -250.58823529411765, "logps/rejected": -366.4, "loss": 0.2148, "rewards/chosen": 1.224264705882353, "rewards/margins": 1946446.8242647059, "rewards/rejected": -1946445.6, "step": 1454 }, { "epoch": 0.9976002742543709, "grad_norm": 0.5868135202165174, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -71052761.79104477, "logits/rejected": -75978785.5737705, "logps/chosen": -289.67164179104475, "logps/rejected": -385.04918032786884, "loss": 0.2244, "rewards/chosen": 1.234141791044776, "rewards/margins": 6.725945069733301, "rewards/rejected": -5.491803278688525, "step": 1455 }, { "epoch": 0.9982859101816935, "grad_norm": 0.7491902508107366, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -82621131.17460318, "logits/rejected": -102921767.38461539, "logps/chosen": -251.17460317460316, "logps/rejected": -410.33846153846156, "loss": 0.2223, "rewards/chosen": 0.9131944444444444, "rewards/margins": 6.2593482905982905, "rewards/rejected": -5.346153846153846, "step": 1456 }, { "epoch": 0.9989715461090161, "grad_norm": 0.7973157167833345, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -97000664.33802816, "logits/rejected": -95806733.4736842, "logps/chosen": -251.94366197183098, "logps/rejected": -462.5964912280702, "loss": 0.2439, "rewards/chosen": 0.6461267605633803, "rewards/margins": 5.2426179886335555, "rewards/rejected": -4.5964912280701755, "step": 1457 }, { "epoch": 0.9996571820363387, "grad_norm": 0.604025687134776, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -80909477.16129032, "logits/rejected": -103523048.72727273, "logps/chosen": -277.93548387096774, "logps/rejected": -362.1818181818182, "loss": 0.1999, "rewards/chosen": 2.8019135998141382, "rewards/margins": 7.404186327086865, "rewards/rejected": -4.6022727272727275, "step": 1458 }, { "epoch": 1.0, "grad_norm": 0.604025687134776, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -76895573.33333333, "logits/rejected": -69056219.42857143, "logps/chosen": -345.77777777777777, "logps/rejected": -400.0, "loss": 0.1001, "rewards/chosen": 1.9444444444444444, "rewards/margins": 6.944444444444445, "rewards/rejected": -5.0, "step": 1459 } ], "logging_steps": 1, "max_steps": 1459, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }