diff --git "a/checkpoint-3386/trainer_state.json" "b/checkpoint-3386/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-3386/trainer_state.json" @@ -0,0 +1,23735 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3386, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00029533372711163615, + "grad_norm": 0.5710846885870289, + "learning_rate": 9.80392156862745e-07, + "loss": 2.4753, + "step": 1 + }, + { + "epoch": 0.0005906674542232723, + "grad_norm": 0.6857016625051384, + "learning_rate": 1.96078431372549e-06, + "loss": 2.3734, + "step": 2 + }, + { + "epoch": 0.0008860011813349084, + "grad_norm": 0.6256508299595333, + "learning_rate": 2.9411764705882355e-06, + "loss": 2.4376, + "step": 3 + }, + { + "epoch": 0.0011813349084465446, + "grad_norm": 0.6149855337534975, + "learning_rate": 3.92156862745098e-06, + "loss": 2.3998, + "step": 4 + }, + { + "epoch": 0.0014766686355581807, + "grad_norm": 0.5737661017857456, + "learning_rate": 4.901960784313726e-06, + "loss": 2.4253, + "step": 5 + }, + { + "epoch": 0.0017720023626698169, + "grad_norm": 0.5066720333672611, + "learning_rate": 5.882352941176471e-06, + "loss": 2.4079, + "step": 6 + }, + { + "epoch": 0.002067336089781453, + "grad_norm": 0.4231547090533186, + "learning_rate": 6.862745098039216e-06, + "loss": 2.4201, + "step": 7 + }, + { + "epoch": 0.002362669816893089, + "grad_norm": 0.4302956698747884, + "learning_rate": 7.84313725490196e-06, + "loss": 2.4233, + "step": 8 + }, + { + "epoch": 0.0026580035440047253, + "grad_norm": 0.3285332592235351, + "learning_rate": 8.823529411764707e-06, + "loss": 2.3609, + "step": 9 + }, + { + "epoch": 0.0029533372711163615, + "grad_norm": 0.3088802384002176, + "learning_rate": 9.803921568627451e-06, + "loss": 2.3957, + "step": 10 + }, + { + "epoch": 0.0032486709982279976, + "grad_norm": 0.29968824828037566, + "learning_rate": 1.0784313725490197e-05, + "loss": 2.3981, + "step": 11 + }, + { + "epoch": 0.0035440047253396337, + "grad_norm": 0.5075309266218019, + "learning_rate": 1.1764705882352942e-05, + "loss": 2.3501, + "step": 12 + }, + { + "epoch": 0.00383933845245127, + "grad_norm": 0.29924986256729325, + "learning_rate": 1.2745098039215686e-05, + "loss": 2.4136, + "step": 13 + }, + { + "epoch": 0.004134672179562906, + "grad_norm": 0.28390574934416124, + "learning_rate": 1.3725490196078432e-05, + "loss": 2.3868, + "step": 14 + }, + { + "epoch": 0.004430005906674543, + "grad_norm": 0.2955156626544259, + "learning_rate": 1.4705882352941177e-05, + "loss": 2.349, + "step": 15 + }, + { + "epoch": 0.004725339633786178, + "grad_norm": 0.2817325293947159, + "learning_rate": 1.568627450980392e-05, + "loss": 2.3478, + "step": 16 + }, + { + "epoch": 0.005020673360897815, + "grad_norm": 0.27817728483393594, + "learning_rate": 1.6666666666666667e-05, + "loss": 2.422, + "step": 17 + }, + { + "epoch": 0.005316007088009451, + "grad_norm": 0.2719411506416558, + "learning_rate": 1.7647058823529414e-05, + "loss": 2.3529, + "step": 18 + }, + { + "epoch": 0.005611340815121087, + "grad_norm": 0.3494691834063752, + "learning_rate": 1.862745098039216e-05, + "loss": 2.6302, + "step": 19 + }, + { + "epoch": 0.005906674542232723, + "grad_norm": 0.2798300369242536, + "learning_rate": 1.9607843137254903e-05, + "loss": 2.4106, + "step": 20 + }, + { + "epoch": 0.0062020082693443595, + "grad_norm": 0.4036462865847432, + "learning_rate": 2.058823529411765e-05, + "loss": 2.3687, + "step": 21 + }, + { + "epoch": 0.006497341996455995, + "grad_norm": 0.2778560581109051, + "learning_rate": 2.1568627450980395e-05, + "loss": 2.3658, + "step": 22 + }, + { + "epoch": 0.006792675723567632, + "grad_norm": 0.2787407092611701, + "learning_rate": 2.2549019607843138e-05, + "loss": 2.2678, + "step": 23 + }, + { + "epoch": 0.0070880094506792675, + "grad_norm": 0.2706431109026445, + "learning_rate": 2.3529411764705884e-05, + "loss": 2.3848, + "step": 24 + }, + { + "epoch": 0.007383343177790904, + "grad_norm": 0.2627352716054599, + "learning_rate": 2.4509803921568626e-05, + "loss": 2.3651, + "step": 25 + }, + { + "epoch": 0.00767867690490254, + "grad_norm": 0.25100290776179457, + "learning_rate": 2.5490196078431373e-05, + "loss": 2.3965, + "step": 26 + }, + { + "epoch": 0.007974010632014175, + "grad_norm": 0.23948535608924562, + "learning_rate": 2.647058823529412e-05, + "loss": 2.3892, + "step": 27 + }, + { + "epoch": 0.008269344359125812, + "grad_norm": 0.32760873552824804, + "learning_rate": 2.7450980392156865e-05, + "loss": 2.4417, + "step": 28 + }, + { + "epoch": 0.008564678086237449, + "grad_norm": 0.259393655152066, + "learning_rate": 2.8431372549019608e-05, + "loss": 2.387, + "step": 29 + }, + { + "epoch": 0.008860011813349085, + "grad_norm": 0.2562564942071991, + "learning_rate": 2.9411764705882354e-05, + "loss": 2.3671, + "step": 30 + }, + { + "epoch": 0.00915534554046072, + "grad_norm": 0.26320815104494905, + "learning_rate": 3.0392156862745097e-05, + "loss": 2.3868, + "step": 31 + }, + { + "epoch": 0.009450679267572357, + "grad_norm": 0.25815934633901616, + "learning_rate": 3.137254901960784e-05, + "loss": 2.3485, + "step": 32 + }, + { + "epoch": 0.009746012994683993, + "grad_norm": 0.24418370439017023, + "learning_rate": 3.235294117647059e-05, + "loss": 2.383, + "step": 33 + }, + { + "epoch": 0.01004134672179563, + "grad_norm": 0.24701385456102007, + "learning_rate": 3.3333333333333335e-05, + "loss": 2.4669, + "step": 34 + }, + { + "epoch": 0.010336680448907265, + "grad_norm": 0.23726052395258151, + "learning_rate": 3.431372549019608e-05, + "loss": 2.4427, + "step": 35 + }, + { + "epoch": 0.010632014176018901, + "grad_norm": 0.249723333513617, + "learning_rate": 3.529411764705883e-05, + "loss": 2.4135, + "step": 36 + }, + { + "epoch": 0.010927347903130538, + "grad_norm": 0.24842772194675528, + "learning_rate": 3.627450980392157e-05, + "loss": 2.4197, + "step": 37 + }, + { + "epoch": 0.011222681630242174, + "grad_norm": 0.24251687816991394, + "learning_rate": 3.725490196078432e-05, + "loss": 2.3664, + "step": 38 + }, + { + "epoch": 0.01151801535735381, + "grad_norm": 0.2776890740557114, + "learning_rate": 3.8235294117647055e-05, + "loss": 2.3475, + "step": 39 + }, + { + "epoch": 0.011813349084465446, + "grad_norm": 0.24148165997176338, + "learning_rate": 3.9215686274509805e-05, + "loss": 2.3283, + "step": 40 + }, + { + "epoch": 0.012108682811577082, + "grad_norm": 0.24683354458235338, + "learning_rate": 4.0196078431372555e-05, + "loss": 2.3371, + "step": 41 + }, + { + "epoch": 0.012404016538688719, + "grad_norm": 0.23832394264356382, + "learning_rate": 4.11764705882353e-05, + "loss": 2.3517, + "step": 42 + }, + { + "epoch": 0.012699350265800354, + "grad_norm": 0.22616199741964443, + "learning_rate": 4.215686274509804e-05, + "loss": 2.2656, + "step": 43 + }, + { + "epoch": 0.01299468399291199, + "grad_norm": 0.23610770768084366, + "learning_rate": 4.313725490196079e-05, + "loss": 2.3267, + "step": 44 + }, + { + "epoch": 0.013290017720023627, + "grad_norm": 0.24426850593667337, + "learning_rate": 4.411764705882353e-05, + "loss": 2.429, + "step": 45 + }, + { + "epoch": 0.013585351447135264, + "grad_norm": 0.23540908458759813, + "learning_rate": 4.5098039215686275e-05, + "loss": 2.4142, + "step": 46 + }, + { + "epoch": 0.013880685174246898, + "grad_norm": 0.24719627094743066, + "learning_rate": 4.607843137254902e-05, + "loss": 2.4022, + "step": 47 + }, + { + "epoch": 0.014176018901358535, + "grad_norm": 0.23254506669061972, + "learning_rate": 4.705882352941177e-05, + "loss": 2.3377, + "step": 48 + }, + { + "epoch": 0.014471352628470172, + "grad_norm": 0.2367249144956946, + "learning_rate": 4.803921568627452e-05, + "loss": 2.3455, + "step": 49 + }, + { + "epoch": 0.014766686355581808, + "grad_norm": 0.2350908394389593, + "learning_rate": 4.901960784313725e-05, + "loss": 2.4205, + "step": 50 + }, + { + "epoch": 0.015062020082693443, + "grad_norm": 0.2708679446087991, + "learning_rate": 5e-05, + "loss": 2.3923, + "step": 51 + }, + { + "epoch": 0.01535735380980508, + "grad_norm": 0.23273626269765535, + "learning_rate": 5.0980392156862745e-05, + "loss": 2.3839, + "step": 52 + }, + { + "epoch": 0.015652687536916714, + "grad_norm": 0.23157975360361266, + "learning_rate": 5.1960784313725495e-05, + "loss": 2.4155, + "step": 53 + }, + { + "epoch": 0.01594802126402835, + "grad_norm": 0.23001752745144333, + "learning_rate": 5.294117647058824e-05, + "loss": 2.3878, + "step": 54 + }, + { + "epoch": 0.016243354991139988, + "grad_norm": 0.23726316960093932, + "learning_rate": 5.392156862745098e-05, + "loss": 2.4332, + "step": 55 + }, + { + "epoch": 0.016538688718251624, + "grad_norm": 0.28431805784638436, + "learning_rate": 5.490196078431373e-05, + "loss": 2.3809, + "step": 56 + }, + { + "epoch": 0.01683402244536326, + "grad_norm": 0.24721454285557176, + "learning_rate": 5.588235294117647e-05, + "loss": 2.373, + "step": 57 + }, + { + "epoch": 0.017129356172474897, + "grad_norm": 0.22647916971404983, + "learning_rate": 5.6862745098039215e-05, + "loss": 2.2881, + "step": 58 + }, + { + "epoch": 0.017424689899586534, + "grad_norm": 0.239755168051603, + "learning_rate": 5.784313725490197e-05, + "loss": 2.4051, + "step": 59 + }, + { + "epoch": 0.01772002362669817, + "grad_norm": 0.23889253720435066, + "learning_rate": 5.882352941176471e-05, + "loss": 2.3781, + "step": 60 + }, + { + "epoch": 0.018015357353809804, + "grad_norm": 0.24281947680677748, + "learning_rate": 5.980392156862745e-05, + "loss": 2.3691, + "step": 61 + }, + { + "epoch": 0.01831069108092144, + "grad_norm": 0.24316699627558058, + "learning_rate": 6.078431372549019e-05, + "loss": 2.3939, + "step": 62 + }, + { + "epoch": 0.018606024808033077, + "grad_norm": 0.23380100375905116, + "learning_rate": 6.176470588235295e-05, + "loss": 2.3725, + "step": 63 + }, + { + "epoch": 0.018901358535144713, + "grad_norm": 0.24571102899837935, + "learning_rate": 6.274509803921569e-05, + "loss": 2.3882, + "step": 64 + }, + { + "epoch": 0.01919669226225635, + "grad_norm": 0.2353230296840185, + "learning_rate": 6.372549019607843e-05, + "loss": 2.448, + "step": 65 + }, + { + "epoch": 0.019492025989367986, + "grad_norm": 0.2232550049153507, + "learning_rate": 6.470588235294118e-05, + "loss": 2.3766, + "step": 66 + }, + { + "epoch": 0.019787359716479623, + "grad_norm": 0.26379743341613787, + "learning_rate": 6.568627450980392e-05, + "loss": 2.3468, + "step": 67 + }, + { + "epoch": 0.02008269344359126, + "grad_norm": 0.2545448196100546, + "learning_rate": 6.666666666666667e-05, + "loss": 2.3481, + "step": 68 + }, + { + "epoch": 0.020378027170702893, + "grad_norm": 0.23134631039383766, + "learning_rate": 6.764705882352942e-05, + "loss": 2.1055, + "step": 69 + }, + { + "epoch": 0.02067336089781453, + "grad_norm": 0.2382586187027627, + "learning_rate": 6.862745098039216e-05, + "loss": 2.3892, + "step": 70 + }, + { + "epoch": 0.020968694624926166, + "grad_norm": 0.23528253959822182, + "learning_rate": 6.96078431372549e-05, + "loss": 2.2759, + "step": 71 + }, + { + "epoch": 0.021264028352037802, + "grad_norm": 0.23517983055382846, + "learning_rate": 7.058823529411765e-05, + "loss": 2.3829, + "step": 72 + }, + { + "epoch": 0.02155936207914944, + "grad_norm": 0.2352420108712536, + "learning_rate": 7.156862745098039e-05, + "loss": 2.4445, + "step": 73 + }, + { + "epoch": 0.021854695806261076, + "grad_norm": 0.24565918716054877, + "learning_rate": 7.254901960784314e-05, + "loss": 2.396, + "step": 74 + }, + { + "epoch": 0.022150029533372712, + "grad_norm": 0.2458807235167, + "learning_rate": 7.352941176470589e-05, + "loss": 2.4107, + "step": 75 + }, + { + "epoch": 0.02244536326048435, + "grad_norm": 0.317017588652439, + "learning_rate": 7.450980392156864e-05, + "loss": 2.314, + "step": 76 + }, + { + "epoch": 0.022740696987595982, + "grad_norm": 0.25638225503585355, + "learning_rate": 7.549019607843137e-05, + "loss": 2.3757, + "step": 77 + }, + { + "epoch": 0.02303603071470762, + "grad_norm": 0.24774776657129416, + "learning_rate": 7.647058823529411e-05, + "loss": 2.3491, + "step": 78 + }, + { + "epoch": 0.023331364441819255, + "grad_norm": 0.2672340938029326, + "learning_rate": 7.745098039215687e-05, + "loss": 2.3403, + "step": 79 + }, + { + "epoch": 0.02362669816893089, + "grad_norm": 0.2435031961861835, + "learning_rate": 7.843137254901961e-05, + "loss": 2.1814, + "step": 80 + }, + { + "epoch": 0.023922031896042528, + "grad_norm": 0.24114619478867358, + "learning_rate": 7.941176470588235e-05, + "loss": 2.3507, + "step": 81 + }, + { + "epoch": 0.024217365623154165, + "grad_norm": 0.23630896403924262, + "learning_rate": 8.039215686274511e-05, + "loss": 2.3835, + "step": 82 + }, + { + "epoch": 0.0245126993502658, + "grad_norm": 0.24134243921153384, + "learning_rate": 8.137254901960785e-05, + "loss": 2.35, + "step": 83 + }, + { + "epoch": 0.024808033077377438, + "grad_norm": 0.23381927887726564, + "learning_rate": 8.23529411764706e-05, + "loss": 2.3346, + "step": 84 + }, + { + "epoch": 0.02510336680448907, + "grad_norm": 0.2507421088645706, + "learning_rate": 8.333333333333334e-05, + "loss": 2.3519, + "step": 85 + }, + { + "epoch": 0.025398700531600708, + "grad_norm": 0.25914558468000776, + "learning_rate": 8.431372549019608e-05, + "loss": 2.5393, + "step": 86 + }, + { + "epoch": 0.025694034258712344, + "grad_norm": 0.2518497077368789, + "learning_rate": 8.529411764705883e-05, + "loss": 2.3714, + "step": 87 + }, + { + "epoch": 0.02598936798582398, + "grad_norm": 0.2422279585607537, + "learning_rate": 8.627450980392158e-05, + "loss": 2.4661, + "step": 88 + }, + { + "epoch": 0.026284701712935617, + "grad_norm": 0.24659408690942353, + "learning_rate": 8.725490196078432e-05, + "loss": 2.3772, + "step": 89 + }, + { + "epoch": 0.026580035440047254, + "grad_norm": 0.23614415188075613, + "learning_rate": 8.823529411764706e-05, + "loss": 2.3805, + "step": 90 + }, + { + "epoch": 0.02687536916715889, + "grad_norm": 0.23697002187138796, + "learning_rate": 8.921568627450981e-05, + "loss": 2.303, + "step": 91 + }, + { + "epoch": 0.027170702894270527, + "grad_norm": 0.24200694213911653, + "learning_rate": 9.019607843137255e-05, + "loss": 2.4726, + "step": 92 + }, + { + "epoch": 0.02746603662138216, + "grad_norm": 0.24633112313054828, + "learning_rate": 9.11764705882353e-05, + "loss": 2.3771, + "step": 93 + }, + { + "epoch": 0.027761370348493797, + "grad_norm": 0.25314859174355153, + "learning_rate": 9.215686274509804e-05, + "loss": 2.3917, + "step": 94 + }, + { + "epoch": 0.028056704075605433, + "grad_norm": 0.22776212932089013, + "learning_rate": 9.313725490196079e-05, + "loss": 2.3529, + "step": 95 + }, + { + "epoch": 0.02835203780271707, + "grad_norm": 0.2513760011635016, + "learning_rate": 9.411764705882353e-05, + "loss": 2.3548, + "step": 96 + }, + { + "epoch": 0.028647371529828707, + "grad_norm": 0.24411694984083857, + "learning_rate": 9.509803921568627e-05, + "loss": 2.3439, + "step": 97 + }, + { + "epoch": 0.028942705256940343, + "grad_norm": 0.2521991711200174, + "learning_rate": 9.607843137254903e-05, + "loss": 2.2791, + "step": 98 + }, + { + "epoch": 0.02923803898405198, + "grad_norm": 0.2486266621610336, + "learning_rate": 9.705882352941177e-05, + "loss": 2.3002, + "step": 99 + }, + { + "epoch": 0.029533372711163616, + "grad_norm": 0.2457103846437245, + "learning_rate": 9.80392156862745e-05, + "loss": 2.3831, + "step": 100 + }, + { + "epoch": 0.02982870643827525, + "grad_norm": 0.2569776803865247, + "learning_rate": 9.901960784313727e-05, + "loss": 2.3966, + "step": 101 + }, + { + "epoch": 0.030124040165386886, + "grad_norm": 0.24524610100669236, + "learning_rate": 0.0001, + "loss": 2.2895, + "step": 102 + }, + { + "epoch": 0.030419373892498523, + "grad_norm": 0.24221290947304217, + "learning_rate": 9.99999771211904e-05, + "loss": 2.4076, + "step": 103 + }, + { + "epoch": 0.03071470761961016, + "grad_norm": 0.2502720961423902, + "learning_rate": 9.999990848478256e-05, + "loss": 2.3409, + "step": 104 + }, + { + "epoch": 0.031010041346721796, + "grad_norm": 0.2793441945785226, + "learning_rate": 9.999979409083924e-05, + "loss": 2.3586, + "step": 105 + }, + { + "epoch": 0.03130537507383343, + "grad_norm": 0.2367980635313326, + "learning_rate": 9.999963393946519e-05, + "loss": 2.3736, + "step": 106 + }, + { + "epoch": 0.031600708800945065, + "grad_norm": 0.26212150272788687, + "learning_rate": 9.999942803080691e-05, + "loss": 2.3463, + "step": 107 + }, + { + "epoch": 0.0318960425280567, + "grad_norm": 0.24541103941230627, + "learning_rate": 9.99991763650529e-05, + "loss": 2.3611, + "step": 108 + }, + { + "epoch": 0.03219137625516834, + "grad_norm": 0.2662199733933049, + "learning_rate": 9.999887894243344e-05, + "loss": 2.3638, + "step": 109 + }, + { + "epoch": 0.032486709982279975, + "grad_norm": 0.26884317335443625, + "learning_rate": 9.999853576322071e-05, + "loss": 2.3848, + "step": 110 + }, + { + "epoch": 0.03278204370939161, + "grad_norm": 0.24207560969428307, + "learning_rate": 9.99981468277288e-05, + "loss": 2.2988, + "step": 111 + }, + { + "epoch": 0.03307737743650325, + "grad_norm": 0.24520261575282198, + "learning_rate": 9.999771213631362e-05, + "loss": 2.3466, + "step": 112 + }, + { + "epoch": 0.033372711163614885, + "grad_norm": 0.2505386480367069, + "learning_rate": 9.999723168937298e-05, + "loss": 2.3782, + "step": 113 + }, + { + "epoch": 0.03366804489072652, + "grad_norm": 0.2705839855486598, + "learning_rate": 9.999670548734657e-05, + "loss": 2.3003, + "step": 114 + }, + { + "epoch": 0.03396337861783816, + "grad_norm": 0.23565469902478192, + "learning_rate": 9.999613353071595e-05, + "loss": 2.2465, + "step": 115 + }, + { + "epoch": 0.034258712344949795, + "grad_norm": 0.23633717291039566, + "learning_rate": 9.999551582000454e-05, + "loss": 2.3707, + "step": 116 + }, + { + "epoch": 0.03455404607206143, + "grad_norm": 0.2634342252905638, + "learning_rate": 9.999485235577764e-05, + "loss": 2.3644, + "step": 117 + }, + { + "epoch": 0.03484937979917307, + "grad_norm": 0.2507575448737134, + "learning_rate": 9.999414313864241e-05, + "loss": 2.3627, + "step": 118 + }, + { + "epoch": 0.035144713526284704, + "grad_norm": 0.24226284393318412, + "learning_rate": 9.999338816924793e-05, + "loss": 2.386, + "step": 119 + }, + { + "epoch": 0.03544004725339634, + "grad_norm": 0.2538690833252653, + "learning_rate": 9.999258744828507e-05, + "loss": 2.2906, + "step": 120 + }, + { + "epoch": 0.03573538098050797, + "grad_norm": 0.25729367040453666, + "learning_rate": 9.999174097648663e-05, + "loss": 2.2542, + "step": 121 + }, + { + "epoch": 0.03603071470761961, + "grad_norm": 0.25159268879266855, + "learning_rate": 9.999084875462726e-05, + "loss": 2.3696, + "step": 122 + }, + { + "epoch": 0.036326048434731244, + "grad_norm": 0.279694520872852, + "learning_rate": 9.998991078352346e-05, + "loss": 2.1832, + "step": 123 + }, + { + "epoch": 0.03662138216184288, + "grad_norm": 0.2383167756285088, + "learning_rate": 9.998892706403365e-05, + "loss": 2.3569, + "step": 124 + }, + { + "epoch": 0.03691671588895452, + "grad_norm": 0.24245192336320429, + "learning_rate": 9.998789759705807e-05, + "loss": 2.3526, + "step": 125 + }, + { + "epoch": 0.03721204961606615, + "grad_norm": 0.23845695052550558, + "learning_rate": 9.998682238353882e-05, + "loss": 2.3469, + "step": 126 + }, + { + "epoch": 0.03750738334317779, + "grad_norm": 0.24513058416832922, + "learning_rate": 9.998570142445991e-05, + "loss": 2.3722, + "step": 127 + }, + { + "epoch": 0.03780271707028943, + "grad_norm": 0.24425855458651832, + "learning_rate": 9.998453472084717e-05, + "loss": 2.3844, + "step": 128 + }, + { + "epoch": 0.03809805079740106, + "grad_norm": 0.231503652310037, + "learning_rate": 9.998332227376834e-05, + "loss": 2.3684, + "step": 129 + }, + { + "epoch": 0.0383933845245127, + "grad_norm": 0.23135478828743994, + "learning_rate": 9.998206408433295e-05, + "loss": 2.3425, + "step": 130 + }, + { + "epoch": 0.038688718251624336, + "grad_norm": 0.23761692619654273, + "learning_rate": 9.998076015369246e-05, + "loss": 2.3914, + "step": 131 + }, + { + "epoch": 0.03898405197873597, + "grad_norm": 0.2433430728993561, + "learning_rate": 9.997941048304018e-05, + "loss": 2.3583, + "step": 132 + }, + { + "epoch": 0.03927938570584761, + "grad_norm": 0.3375489155324886, + "learning_rate": 9.997801507361125e-05, + "loss": 2.3139, + "step": 133 + }, + { + "epoch": 0.039574719432959246, + "grad_norm": 0.2356625160787471, + "learning_rate": 9.997657392668267e-05, + "loss": 2.3048, + "step": 134 + }, + { + "epoch": 0.03987005316007088, + "grad_norm": 0.2500425838919163, + "learning_rate": 9.997508704357332e-05, + "loss": 2.3881, + "step": 135 + }, + { + "epoch": 0.04016538688718252, + "grad_norm": 0.25680527304578854, + "learning_rate": 9.997355442564393e-05, + "loss": 2.3009, + "step": 136 + }, + { + "epoch": 0.040460720614294156, + "grad_norm": 0.2580282435676882, + "learning_rate": 9.997197607429707e-05, + "loss": 2.3655, + "step": 137 + }, + { + "epoch": 0.040756054341405785, + "grad_norm": 0.3127273924346099, + "learning_rate": 9.997035199097717e-05, + "loss": 2.3427, + "step": 138 + }, + { + "epoch": 0.04105138806851742, + "grad_norm": 0.24063401205352722, + "learning_rate": 9.996868217717052e-05, + "loss": 2.3808, + "step": 139 + }, + { + "epoch": 0.04134672179562906, + "grad_norm": 0.25442222107712215, + "learning_rate": 9.996696663440526e-05, + "loss": 2.3616, + "step": 140 + }, + { + "epoch": 0.041642055522740695, + "grad_norm": 0.2604474748825547, + "learning_rate": 9.996520536425137e-05, + "loss": 2.4607, + "step": 141 + }, + { + "epoch": 0.04193738924985233, + "grad_norm": 0.24177215708663724, + "learning_rate": 9.996339836832068e-05, + "loss": 2.3614, + "step": 142 + }, + { + "epoch": 0.04223272297696397, + "grad_norm": 0.23137658945894762, + "learning_rate": 9.996154564826685e-05, + "loss": 2.3782, + "step": 143 + }, + { + "epoch": 0.042528056704075605, + "grad_norm": 0.23795313992463624, + "learning_rate": 9.995964720578541e-05, + "loss": 2.3677, + "step": 144 + }, + { + "epoch": 0.04282339043118724, + "grad_norm": 0.2583745695986252, + "learning_rate": 9.995770304261374e-05, + "loss": 2.3934, + "step": 145 + }, + { + "epoch": 0.04311872415829888, + "grad_norm": 0.25219324778987334, + "learning_rate": 9.995571316053102e-05, + "loss": 2.4591, + "step": 146 + }, + { + "epoch": 0.043414057885410515, + "grad_norm": 0.23547432099949253, + "learning_rate": 9.995367756135832e-05, + "loss": 2.3204, + "step": 147 + }, + { + "epoch": 0.04370939161252215, + "grad_norm": 0.23901270553502624, + "learning_rate": 9.995159624695851e-05, + "loss": 2.3201, + "step": 148 + }, + { + "epoch": 0.04400472533963379, + "grad_norm": 0.2552429373629925, + "learning_rate": 9.994946921923633e-05, + "loss": 2.3056, + "step": 149 + }, + { + "epoch": 0.044300059066745424, + "grad_norm": 0.23913084281198177, + "learning_rate": 9.99472964801383e-05, + "loss": 2.3817, + "step": 150 + }, + { + "epoch": 0.04459539279385706, + "grad_norm": 0.2618276454927748, + "learning_rate": 9.994507803165283e-05, + "loss": 2.3601, + "step": 151 + }, + { + "epoch": 0.0448907265209687, + "grad_norm": 0.24507959237113902, + "learning_rate": 9.994281387581013e-05, + "loss": 2.3245, + "step": 152 + }, + { + "epoch": 0.045186060248080334, + "grad_norm": 0.24235501956131048, + "learning_rate": 9.994050401468224e-05, + "loss": 2.3395, + "step": 153 + }, + { + "epoch": 0.045481393975191964, + "grad_norm": 0.24742524658160775, + "learning_rate": 9.993814845038307e-05, + "loss": 2.3164, + "step": 154 + }, + { + "epoch": 0.0457767277023036, + "grad_norm": 0.2488270377789614, + "learning_rate": 9.993574718506829e-05, + "loss": 2.3358, + "step": 155 + }, + { + "epoch": 0.04607206142941524, + "grad_norm": 0.24748879827571854, + "learning_rate": 9.993330022093541e-05, + "loss": 2.3784, + "step": 156 + }, + { + "epoch": 0.046367395156526874, + "grad_norm": 0.23349167146726868, + "learning_rate": 9.993080756022381e-05, + "loss": 2.2487, + "step": 157 + }, + { + "epoch": 0.04666272888363851, + "grad_norm": 0.23654194294037195, + "learning_rate": 9.992826920521463e-05, + "loss": 2.3817, + "step": 158 + }, + { + "epoch": 0.04695806261075015, + "grad_norm": 0.24473433212039272, + "learning_rate": 9.992568515823087e-05, + "loss": 2.3944, + "step": 159 + }, + { + "epoch": 0.04725339633786178, + "grad_norm": 0.2775984387167661, + "learning_rate": 9.992305542163732e-05, + "loss": 2.2982, + "step": 160 + }, + { + "epoch": 0.04754873006497342, + "grad_norm": 0.24329228831235353, + "learning_rate": 9.992037999784059e-05, + "loss": 2.3722, + "step": 161 + }, + { + "epoch": 0.047844063792085056, + "grad_norm": 0.2676465162877864, + "learning_rate": 9.991765888928908e-05, + "loss": 2.3829, + "step": 162 + }, + { + "epoch": 0.04813939751919669, + "grad_norm": 0.2501406720878809, + "learning_rate": 9.991489209847305e-05, + "loss": 2.3039, + "step": 163 + }, + { + "epoch": 0.04843473124630833, + "grad_norm": 0.26032233278037764, + "learning_rate": 9.991207962792451e-05, + "loss": 2.2749, + "step": 164 + }, + { + "epoch": 0.048730064973419966, + "grad_norm": 0.2534710749436169, + "learning_rate": 9.990922148021731e-05, + "loss": 2.3122, + "step": 165 + }, + { + "epoch": 0.0490253987005316, + "grad_norm": 0.23391882689968543, + "learning_rate": 9.990631765796711e-05, + "loss": 2.3512, + "step": 166 + }, + { + "epoch": 0.04932073242764324, + "grad_norm": 0.2470514041465442, + "learning_rate": 9.990336816383133e-05, + "loss": 2.4127, + "step": 167 + }, + { + "epoch": 0.049616066154754876, + "grad_norm": 0.24476290932959066, + "learning_rate": 9.990037300050918e-05, + "loss": 2.3331, + "step": 168 + }, + { + "epoch": 0.04991139988186651, + "grad_norm": 0.23716664321831832, + "learning_rate": 9.989733217074175e-05, + "loss": 2.3714, + "step": 169 + }, + { + "epoch": 0.05020673360897814, + "grad_norm": 0.24097901751015455, + "learning_rate": 9.989424567731183e-05, + "loss": 2.3457, + "step": 170 + }, + { + "epoch": 0.05050206733608978, + "grad_norm": 0.25444343667915853, + "learning_rate": 9.989111352304402e-05, + "loss": 2.3087, + "step": 171 + }, + { + "epoch": 0.050797401063201415, + "grad_norm": 0.26235568416458194, + "learning_rate": 9.988793571080473e-05, + "loss": 2.4252, + "step": 172 + }, + { + "epoch": 0.05109273479031305, + "grad_norm": 0.23453698918120705, + "learning_rate": 9.988471224350215e-05, + "loss": 2.3191, + "step": 173 + }, + { + "epoch": 0.05138806851742469, + "grad_norm": 0.25520381955936466, + "learning_rate": 9.988144312408624e-05, + "loss": 2.2645, + "step": 174 + }, + { + "epoch": 0.051683402244536325, + "grad_norm": 0.2797406678837787, + "learning_rate": 9.987812835554875e-05, + "loss": 2.3056, + "step": 175 + }, + { + "epoch": 0.05197873597164796, + "grad_norm": 0.23833428246107938, + "learning_rate": 9.987476794092316e-05, + "loss": 2.3373, + "step": 176 + }, + { + "epoch": 0.0522740696987596, + "grad_norm": 0.3155754508415011, + "learning_rate": 9.98713618832848e-05, + "loss": 2.4275, + "step": 177 + }, + { + "epoch": 0.052569403425871235, + "grad_norm": 0.23703943349080572, + "learning_rate": 9.986791018575074e-05, + "loss": 2.3948, + "step": 178 + }, + { + "epoch": 0.05286473715298287, + "grad_norm": 0.2427267562166566, + "learning_rate": 9.986441285147979e-05, + "loss": 2.3214, + "step": 179 + }, + { + "epoch": 0.05316007088009451, + "grad_norm": 0.24508908479242084, + "learning_rate": 9.986086988367254e-05, + "loss": 2.3424, + "step": 180 + }, + { + "epoch": 0.053455404607206145, + "grad_norm": 0.2486820498825831, + "learning_rate": 9.985728128557134e-05, + "loss": 2.3209, + "step": 181 + }, + { + "epoch": 0.05375073833431778, + "grad_norm": 0.2451639776671281, + "learning_rate": 9.985364706046031e-05, + "loss": 2.312, + "step": 182 + }, + { + "epoch": 0.05404607206142942, + "grad_norm": 1.4346047975720264, + "learning_rate": 9.984996721166535e-05, + "loss": 2.3592, + "step": 183 + }, + { + "epoch": 0.054341405788541054, + "grad_norm": 0.27741056631378014, + "learning_rate": 9.984624174255404e-05, + "loss": 2.2816, + "step": 184 + }, + { + "epoch": 0.05463673951565269, + "grad_norm": 0.3121745560705298, + "learning_rate": 9.984247065653576e-05, + "loss": 2.3583, + "step": 185 + }, + { + "epoch": 0.05493207324276432, + "grad_norm": 0.3988304818108965, + "learning_rate": 9.983865395706164e-05, + "loss": 2.3792, + "step": 186 + }, + { + "epoch": 0.05522740696987596, + "grad_norm": 0.31406280033609635, + "learning_rate": 9.983479164762456e-05, + "loss": 2.362, + "step": 187 + }, + { + "epoch": 0.055522740696987594, + "grad_norm": 0.38361719762922797, + "learning_rate": 9.983088373175909e-05, + "loss": 2.2404, + "step": 188 + }, + { + "epoch": 0.05581807442409923, + "grad_norm": 0.2581811783043668, + "learning_rate": 9.982693021304157e-05, + "loss": 2.371, + "step": 189 + }, + { + "epoch": 0.05611340815121087, + "grad_norm": 0.6108030556655425, + "learning_rate": 9.982293109509009e-05, + "loss": 2.3337, + "step": 190 + }, + { + "epoch": 0.0564087418783225, + "grad_norm": 0.3011464218288721, + "learning_rate": 9.981888638156443e-05, + "loss": 2.3997, + "step": 191 + }, + { + "epoch": 0.05670407560543414, + "grad_norm": 0.30703687881085723, + "learning_rate": 9.981479607616615e-05, + "loss": 2.3046, + "step": 192 + }, + { + "epoch": 0.056999409332545777, + "grad_norm": 0.47351228419203034, + "learning_rate": 9.981066018263848e-05, + "loss": 2.4419, + "step": 193 + }, + { + "epoch": 0.05729474305965741, + "grad_norm": 0.31523055177093146, + "learning_rate": 9.980647870476639e-05, + "loss": 2.3477, + "step": 194 + }, + { + "epoch": 0.05759007678676905, + "grad_norm": 0.25040798870945974, + "learning_rate": 9.980225164637659e-05, + "loss": 2.3266, + "step": 195 + }, + { + "epoch": 0.057885410513880686, + "grad_norm": 0.2731552165576713, + "learning_rate": 9.979797901133746e-05, + "loss": 2.3401, + "step": 196 + }, + { + "epoch": 0.05818074424099232, + "grad_norm": 0.2766700944822215, + "learning_rate": 9.979366080355911e-05, + "loss": 2.3655, + "step": 197 + }, + { + "epoch": 0.05847607796810396, + "grad_norm": 0.276282253626431, + "learning_rate": 9.97892970269934e-05, + "loss": 2.3884, + "step": 198 + }, + { + "epoch": 0.058771411695215596, + "grad_norm": 0.2601470584792166, + "learning_rate": 9.978488768563381e-05, + "loss": 2.3726, + "step": 199 + }, + { + "epoch": 0.05906674542232723, + "grad_norm": 0.24947427069227404, + "learning_rate": 9.978043278351556e-05, + "loss": 2.3068, + "step": 200 + }, + { + "epoch": 0.05936207914943887, + "grad_norm": 0.2436200198823698, + "learning_rate": 9.977593232471558e-05, + "loss": 2.3569, + "step": 201 + }, + { + "epoch": 0.0596574128765505, + "grad_norm": 0.23684212456495363, + "learning_rate": 9.977138631335247e-05, + "loss": 2.2432, + "step": 202 + }, + { + "epoch": 0.059952746603662135, + "grad_norm": 0.2596749599243013, + "learning_rate": 9.976679475358653e-05, + "loss": 2.3238, + "step": 203 + }, + { + "epoch": 0.06024808033077377, + "grad_norm": 0.2487771780056702, + "learning_rate": 9.976215764961974e-05, + "loss": 2.3125, + "step": 204 + }, + { + "epoch": 0.06054341405788541, + "grad_norm": 0.27472054685235653, + "learning_rate": 9.975747500569572e-05, + "loss": 2.3852, + "step": 205 + }, + { + "epoch": 0.060838747784997045, + "grad_norm": 0.2531661029692831, + "learning_rate": 9.975274682609984e-05, + "loss": 2.4412, + "step": 206 + }, + { + "epoch": 0.06113408151210868, + "grad_norm": 2.356158255250568, + "learning_rate": 9.97479731151591e-05, + "loss": 2.5623, + "step": 207 + }, + { + "epoch": 0.06142941523922032, + "grad_norm": 0.2965900659964393, + "learning_rate": 9.974315387724216e-05, + "loss": 2.3624, + "step": 208 + }, + { + "epoch": 0.061724748966331955, + "grad_norm": 0.3263727129429106, + "learning_rate": 9.973828911675937e-05, + "loss": 2.4055, + "step": 209 + }, + { + "epoch": 0.06202008269344359, + "grad_norm": 0.2913851472101534, + "learning_rate": 9.973337883816272e-05, + "loss": 2.3145, + "step": 210 + }, + { + "epoch": 0.06231541642055523, + "grad_norm": 0.2779338629939058, + "learning_rate": 9.972842304594585e-05, + "loss": 2.3871, + "step": 211 + }, + { + "epoch": 0.06261075014766686, + "grad_norm": 0.305947985425589, + "learning_rate": 9.972342174464411e-05, + "loss": 2.3516, + "step": 212 + }, + { + "epoch": 0.0629060838747785, + "grad_norm": 0.27605095283915265, + "learning_rate": 9.97183749388344e-05, + "loss": 2.379, + "step": 213 + }, + { + "epoch": 0.06320141760189013, + "grad_norm": 0.3112257728186484, + "learning_rate": 9.971328263313535e-05, + "loss": 2.378, + "step": 214 + }, + { + "epoch": 0.06349675132900177, + "grad_norm": 0.2954009756127675, + "learning_rate": 9.970814483220717e-05, + "loss": 2.398, + "step": 215 + }, + { + "epoch": 0.0637920850561134, + "grad_norm": 0.2667969766507716, + "learning_rate": 9.970296154075177e-05, + "loss": 2.3672, + "step": 216 + }, + { + "epoch": 0.06408741878322505, + "grad_norm": 0.25199347180376447, + "learning_rate": 9.969773276351259e-05, + "loss": 2.3251, + "step": 217 + }, + { + "epoch": 0.06438275251033668, + "grad_norm": 0.5515238037799824, + "learning_rate": 9.969245850527482e-05, + "loss": 2.2876, + "step": 218 + }, + { + "epoch": 0.06467808623744832, + "grad_norm": 0.2965275337258932, + "learning_rate": 9.968713877086518e-05, + "loss": 2.3722, + "step": 219 + }, + { + "epoch": 0.06497341996455995, + "grad_norm": 0.273267979525968, + "learning_rate": 9.968177356515205e-05, + "loss": 2.2743, + "step": 220 + }, + { + "epoch": 0.0652687536916716, + "grad_norm": 0.3096269982842901, + "learning_rate": 9.96763628930454e-05, + "loss": 2.3795, + "step": 221 + }, + { + "epoch": 0.06556408741878322, + "grad_norm": 0.25634762718563825, + "learning_rate": 9.967090675949682e-05, + "loss": 2.382, + "step": 222 + }, + { + "epoch": 0.06585942114589487, + "grad_norm": 0.25781247110077665, + "learning_rate": 9.96654051694995e-05, + "loss": 2.4114, + "step": 223 + }, + { + "epoch": 0.0661547548730065, + "grad_norm": 0.2565333248049412, + "learning_rate": 9.965985812808824e-05, + "loss": 2.3166, + "step": 224 + }, + { + "epoch": 0.06645008860011814, + "grad_norm": 0.2452611918155985, + "learning_rate": 9.965426564033944e-05, + "loss": 2.4186, + "step": 225 + }, + { + "epoch": 0.06674542232722977, + "grad_norm": 0.42394397400416517, + "learning_rate": 9.964862771137107e-05, + "loss": 2.3253, + "step": 226 + }, + { + "epoch": 0.0670407560543414, + "grad_norm": 0.2649084692904018, + "learning_rate": 9.964294434634267e-05, + "loss": 2.3754, + "step": 227 + }, + { + "epoch": 0.06733608978145304, + "grad_norm": 0.2894602693610413, + "learning_rate": 9.963721555045542e-05, + "loss": 2.3811, + "step": 228 + }, + { + "epoch": 0.06763142350856467, + "grad_norm": 6.528881172733639, + "learning_rate": 9.963144132895203e-05, + "loss": 2.647, + "step": 229 + }, + { + "epoch": 0.06792675723567632, + "grad_norm": 0.24087038594683177, + "learning_rate": 9.962562168711678e-05, + "loss": 2.3385, + "step": 230 + }, + { + "epoch": 0.06822209096278795, + "grad_norm": 0.3958436458482407, + "learning_rate": 9.961975663027555e-05, + "loss": 2.2609, + "step": 231 + }, + { + "epoch": 0.06851742468989959, + "grad_norm": 0.2730343025816668, + "learning_rate": 9.961384616379573e-05, + "loss": 2.3763, + "step": 232 + }, + { + "epoch": 0.06881275841701122, + "grad_norm": 0.26718720887820796, + "learning_rate": 9.960789029308632e-05, + "loss": 2.346, + "step": 233 + }, + { + "epoch": 0.06910809214412286, + "grad_norm": 0.2573207876711409, + "learning_rate": 9.960188902359786e-05, + "loss": 2.2552, + "step": 234 + }, + { + "epoch": 0.06940342587123449, + "grad_norm": 0.24875932787413899, + "learning_rate": 9.95958423608224e-05, + "loss": 2.3144, + "step": 235 + }, + { + "epoch": 0.06969875959834614, + "grad_norm": 0.24588300370191452, + "learning_rate": 9.958975031029359e-05, + "loss": 2.3494, + "step": 236 + }, + { + "epoch": 0.06999409332545777, + "grad_norm": 0.2521954782871306, + "learning_rate": 9.958361287758653e-05, + "loss": 2.3592, + "step": 237 + }, + { + "epoch": 0.07028942705256941, + "grad_norm": 0.24830376846435775, + "learning_rate": 9.957743006831797e-05, + "loss": 2.3551, + "step": 238 + }, + { + "epoch": 0.07058476077968104, + "grad_norm": 0.26323893289044276, + "learning_rate": 9.957120188814608e-05, + "loss": 2.343, + "step": 239 + }, + { + "epoch": 0.07088009450679268, + "grad_norm": 0.24600976691686727, + "learning_rate": 9.95649283427706e-05, + "loss": 2.3651, + "step": 240 + }, + { + "epoch": 0.07117542823390431, + "grad_norm": 0.23350216144892838, + "learning_rate": 9.95586094379328e-05, + "loss": 2.3574, + "step": 241 + }, + { + "epoch": 0.07147076196101594, + "grad_norm": 0.2482952766465434, + "learning_rate": 9.955224517941542e-05, + "loss": 2.3451, + "step": 242 + }, + { + "epoch": 0.07176609568812758, + "grad_norm": 0.23654599826357217, + "learning_rate": 9.954583557304275e-05, + "loss": 2.4116, + "step": 243 + }, + { + "epoch": 0.07206142941523921, + "grad_norm": 0.23371843722878857, + "learning_rate": 9.953938062468052e-05, + "loss": 2.3501, + "step": 244 + }, + { + "epoch": 0.07235676314235086, + "grad_norm": 0.23991435530768052, + "learning_rate": 9.953288034023602e-05, + "loss": 2.3418, + "step": 245 + }, + { + "epoch": 0.07265209686946249, + "grad_norm": 0.24094630408374454, + "learning_rate": 9.9526334725658e-05, + "loss": 2.2219, + "step": 246 + }, + { + "epoch": 0.07294743059657413, + "grad_norm": 0.24031312938702018, + "learning_rate": 9.95197437869367e-05, + "loss": 2.3325, + "step": 247 + }, + { + "epoch": 0.07324276432368576, + "grad_norm": 0.22612545182661103, + "learning_rate": 9.951310753010379e-05, + "loss": 2.2858, + "step": 248 + }, + { + "epoch": 0.0735380980507974, + "grad_norm": 0.2318060674924234, + "learning_rate": 9.950642596123249e-05, + "loss": 2.4182, + "step": 249 + }, + { + "epoch": 0.07383343177790903, + "grad_norm": 0.2266579295471816, + "learning_rate": 9.949969908643747e-05, + "loss": 2.3576, + "step": 250 + }, + { + "epoch": 0.07412876550502068, + "grad_norm": 0.24221701595968748, + "learning_rate": 9.949292691187481e-05, + "loss": 2.3717, + "step": 251 + }, + { + "epoch": 0.0744240992321323, + "grad_norm": 0.23092170429023387, + "learning_rate": 9.948610944374213e-05, + "loss": 2.3595, + "step": 252 + }, + { + "epoch": 0.07471943295924395, + "grad_norm": 0.24212531860480274, + "learning_rate": 9.94792466882784e-05, + "loss": 2.2945, + "step": 253 + }, + { + "epoch": 0.07501476668635558, + "grad_norm": 0.24051120748845192, + "learning_rate": 9.947233865176409e-05, + "loss": 2.2915, + "step": 254 + }, + { + "epoch": 0.07531010041346722, + "grad_norm": 0.24260966361618244, + "learning_rate": 9.946538534052114e-05, + "loss": 2.2193, + "step": 255 + }, + { + "epoch": 0.07560543414057885, + "grad_norm": 0.22534062311893827, + "learning_rate": 9.945838676091288e-05, + "loss": 2.3252, + "step": 256 + }, + { + "epoch": 0.0759007678676905, + "grad_norm": 0.2514865162222922, + "learning_rate": 9.945134291934406e-05, + "loss": 2.2699, + "step": 257 + }, + { + "epoch": 0.07619610159480213, + "grad_norm": 0.23860162553361414, + "learning_rate": 9.944425382226088e-05, + "loss": 2.3414, + "step": 258 + }, + { + "epoch": 0.07649143532191376, + "grad_norm": 0.23073216110565437, + "learning_rate": 9.943711947615093e-05, + "loss": 2.379, + "step": 259 + }, + { + "epoch": 0.0767867690490254, + "grad_norm": 0.23259377734096848, + "learning_rate": 9.942993988754325e-05, + "loss": 2.3526, + "step": 260 + }, + { + "epoch": 0.07708210277613703, + "grad_norm": 0.22346806183790263, + "learning_rate": 9.942271506300825e-05, + "loss": 2.3224, + "step": 261 + }, + { + "epoch": 0.07737743650324867, + "grad_norm": 0.24741808724281972, + "learning_rate": 9.941544500915772e-05, + "loss": 2.3797, + "step": 262 + }, + { + "epoch": 0.0776727702303603, + "grad_norm": 0.23804923784706458, + "learning_rate": 9.940812973264491e-05, + "loss": 2.2447, + "step": 263 + }, + { + "epoch": 0.07796810395747195, + "grad_norm": 0.23307615238834856, + "learning_rate": 9.940076924016437e-05, + "loss": 2.3253, + "step": 264 + }, + { + "epoch": 0.07826343768458358, + "grad_norm": 2.7960098036565526, + "learning_rate": 9.93933635384521e-05, + "loss": 2.5548, + "step": 265 + }, + { + "epoch": 0.07855877141169522, + "grad_norm": 0.24501490643706508, + "learning_rate": 9.938591263428543e-05, + "loss": 2.3043, + "step": 266 + }, + { + "epoch": 0.07885410513880685, + "grad_norm": 0.29971914903910296, + "learning_rate": 9.937841653448309e-05, + "loss": 2.336, + "step": 267 + }, + { + "epoch": 0.07914943886591849, + "grad_norm": 0.3027693328362641, + "learning_rate": 9.937087524590514e-05, + "loss": 2.3754, + "step": 268 + }, + { + "epoch": 0.07944477259303012, + "grad_norm": 0.2763257888682253, + "learning_rate": 9.9363288775453e-05, + "loss": 2.3259, + "step": 269 + }, + { + "epoch": 0.07974010632014177, + "grad_norm": 0.263806259383311, + "learning_rate": 9.935565713006946e-05, + "loss": 2.2496, + "step": 270 + }, + { + "epoch": 0.0800354400472534, + "grad_norm": 0.26585311070678325, + "learning_rate": 9.934798031673866e-05, + "loss": 2.317, + "step": 271 + }, + { + "epoch": 0.08033077377436504, + "grad_norm": 0.2561089121465438, + "learning_rate": 9.9340258342486e-05, + "loss": 2.3029, + "step": 272 + }, + { + "epoch": 0.08062610750147667, + "grad_norm": 0.24037949900500818, + "learning_rate": 9.933249121437831e-05, + "loss": 2.3407, + "step": 273 + }, + { + "epoch": 0.08092144122858831, + "grad_norm": 0.27326146317525185, + "learning_rate": 9.932467893952367e-05, + "loss": 2.3729, + "step": 274 + }, + { + "epoch": 0.08121677495569994, + "grad_norm": 0.2528656101435129, + "learning_rate": 9.931682152507152e-05, + "loss": 2.3694, + "step": 275 + }, + { + "epoch": 0.08151210868281157, + "grad_norm": 0.28294634523378887, + "learning_rate": 9.930891897821258e-05, + "loss": 2.3457, + "step": 276 + }, + { + "epoch": 0.08180744240992321, + "grad_norm": 0.2393214549842228, + "learning_rate": 9.93009713061789e-05, + "loss": 2.3319, + "step": 277 + }, + { + "epoch": 0.08210277613703484, + "grad_norm": 0.22205108250652825, + "learning_rate": 9.929297851624378e-05, + "loss": 2.0385, + "step": 278 + }, + { + "epoch": 0.08239810986414649, + "grad_norm": 0.23597908253579974, + "learning_rate": 9.928494061572186e-05, + "loss": 2.3503, + "step": 279 + }, + { + "epoch": 0.08269344359125812, + "grad_norm": 0.23734079094172914, + "learning_rate": 9.927685761196906e-05, + "loss": 2.3328, + "step": 280 + }, + { + "epoch": 0.08298877731836976, + "grad_norm": 0.27605778119297075, + "learning_rate": 9.926872951238252e-05, + "loss": 2.3347, + "step": 281 + }, + { + "epoch": 0.08328411104548139, + "grad_norm": 0.25106018988676815, + "learning_rate": 9.926055632440073e-05, + "loss": 2.3625, + "step": 282 + }, + { + "epoch": 0.08357944477259303, + "grad_norm": 0.2414559266911514, + "learning_rate": 9.925233805550338e-05, + "loss": 2.3149, + "step": 283 + }, + { + "epoch": 0.08387477849970466, + "grad_norm": 0.2348971828173819, + "learning_rate": 9.924407471321145e-05, + "loss": 2.3609, + "step": 284 + }, + { + "epoch": 0.08417011222681631, + "grad_norm": 0.24528995064039635, + "learning_rate": 9.923576630508713e-05, + "loss": 2.3822, + "step": 285 + }, + { + "epoch": 0.08446544595392794, + "grad_norm": 0.22552898232946783, + "learning_rate": 9.922741283873392e-05, + "loss": 2.3584, + "step": 286 + }, + { + "epoch": 0.08476077968103958, + "grad_norm": 0.24368399068610108, + "learning_rate": 9.921901432179649e-05, + "loss": 2.3108, + "step": 287 + }, + { + "epoch": 0.08505611340815121, + "grad_norm": 0.24057538555211094, + "learning_rate": 9.921057076196076e-05, + "loss": 2.3272, + "step": 288 + }, + { + "epoch": 0.08535144713526285, + "grad_norm": 0.27387299598559584, + "learning_rate": 9.920208216695391e-05, + "loss": 2.3284, + "step": 289 + }, + { + "epoch": 0.08564678086237448, + "grad_norm": 0.23011411440443227, + "learning_rate": 9.919354854454423e-05, + "loss": 2.3561, + "step": 290 + }, + { + "epoch": 0.08594211458948611, + "grad_norm": 0.24532391466930634, + "learning_rate": 9.918496990254134e-05, + "loss": 2.3771, + "step": 291 + }, + { + "epoch": 0.08623744831659776, + "grad_norm": 0.23870666237899546, + "learning_rate": 9.9176346248796e-05, + "loss": 2.3292, + "step": 292 + }, + { + "epoch": 0.08653278204370939, + "grad_norm": 0.23418711442918191, + "learning_rate": 9.916767759120016e-05, + "loss": 2.3425, + "step": 293 + }, + { + "epoch": 0.08682811577082103, + "grad_norm": 0.24974982589041642, + "learning_rate": 9.915896393768694e-05, + "loss": 2.4147, + "step": 294 + }, + { + "epoch": 0.08712344949793266, + "grad_norm": 0.22749443598901928, + "learning_rate": 9.91502052962307e-05, + "loss": 2.3431, + "step": 295 + }, + { + "epoch": 0.0874187832250443, + "grad_norm": 0.24140581760552948, + "learning_rate": 9.91414016748469e-05, + "loss": 2.3062, + "step": 296 + }, + { + "epoch": 0.08771411695215593, + "grad_norm": 0.23629039274028238, + "learning_rate": 9.913255308159222e-05, + "loss": 2.3569, + "step": 297 + }, + { + "epoch": 0.08800945067926758, + "grad_norm": 0.22195483873371033, + "learning_rate": 9.912365952456444e-05, + "loss": 2.3822, + "step": 298 + }, + { + "epoch": 0.0883047844063792, + "grad_norm": 0.23528912614338374, + "learning_rate": 9.911472101190254e-05, + "loss": 2.2377, + "step": 299 + }, + { + "epoch": 0.08860011813349085, + "grad_norm": 0.23164680820080066, + "learning_rate": 9.910573755178662e-05, + "loss": 2.3076, + "step": 300 + }, + { + "epoch": 0.08889545186060248, + "grad_norm": 0.2364432787339013, + "learning_rate": 9.909670915243793e-05, + "loss": 2.3685, + "step": 301 + }, + { + "epoch": 0.08919078558771412, + "grad_norm": 0.2310730479435411, + "learning_rate": 9.90876358221188e-05, + "loss": 2.3486, + "step": 302 + }, + { + "epoch": 0.08948611931482575, + "grad_norm": 0.2669960568642284, + "learning_rate": 9.907851756913274e-05, + "loss": 2.3731, + "step": 303 + }, + { + "epoch": 0.0897814530419374, + "grad_norm": 0.2410030550283286, + "learning_rate": 9.906935440182432e-05, + "loss": 2.3046, + "step": 304 + }, + { + "epoch": 0.09007678676904902, + "grad_norm": 0.23462114124854294, + "learning_rate": 9.906014632857923e-05, + "loss": 2.3067, + "step": 305 + }, + { + "epoch": 0.09037212049616067, + "grad_norm": 0.22828826026675025, + "learning_rate": 9.905089335782428e-05, + "loss": 2.4471, + "step": 306 + }, + { + "epoch": 0.0906674542232723, + "grad_norm": 0.23888510654764067, + "learning_rate": 9.904159549802734e-05, + "loss": 2.2856, + "step": 307 + }, + { + "epoch": 0.09096278795038393, + "grad_norm": 0.24277830356867827, + "learning_rate": 9.903225275769736e-05, + "loss": 2.4833, + "step": 308 + }, + { + "epoch": 0.09125812167749557, + "grad_norm": 0.3005895870946432, + "learning_rate": 9.902286514538438e-05, + "loss": 2.0451, + "step": 309 + }, + { + "epoch": 0.0915534554046072, + "grad_norm": 0.2871703777252406, + "learning_rate": 9.901343266967951e-05, + "loss": 2.2913, + "step": 310 + }, + { + "epoch": 0.09184878913171884, + "grad_norm": 0.2504881325288149, + "learning_rate": 9.900395533921487e-05, + "loss": 2.3333, + "step": 311 + }, + { + "epoch": 0.09214412285883047, + "grad_norm": 0.23013884159349968, + "learning_rate": 9.899443316266368e-05, + "loss": 2.2832, + "step": 312 + }, + { + "epoch": 0.09243945658594212, + "grad_norm": 0.23043796366010452, + "learning_rate": 9.898486614874019e-05, + "loss": 2.2679, + "step": 313 + }, + { + "epoch": 0.09273479031305375, + "grad_norm": 0.2332595427877249, + "learning_rate": 9.897525430619965e-05, + "loss": 2.2267, + "step": 314 + }, + { + "epoch": 0.09303012404016539, + "grad_norm": 0.24389977591339287, + "learning_rate": 9.896559764383839e-05, + "loss": 2.296, + "step": 315 + }, + { + "epoch": 0.09332545776727702, + "grad_norm": 0.2191718495744851, + "learning_rate": 9.895589617049372e-05, + "loss": 2.2929, + "step": 316 + }, + { + "epoch": 0.09362079149438866, + "grad_norm": 0.22349822479648201, + "learning_rate": 9.894614989504395e-05, + "loss": 2.3036, + "step": 317 + }, + { + "epoch": 0.0939161252215003, + "grad_norm": 0.235985933789319, + "learning_rate": 9.893635882640842e-05, + "loss": 2.3974, + "step": 318 + }, + { + "epoch": 0.09421145894861194, + "grad_norm": 0.2261568416942464, + "learning_rate": 9.892652297354745e-05, + "loss": 2.3488, + "step": 319 + }, + { + "epoch": 0.09450679267572357, + "grad_norm": 0.23426091278451996, + "learning_rate": 9.891664234546236e-05, + "loss": 2.404, + "step": 320 + }, + { + "epoch": 0.09480212640283521, + "grad_norm": 0.2765743436270929, + "learning_rate": 9.89067169511954e-05, + "loss": 2.364, + "step": 321 + }, + { + "epoch": 0.09509746012994684, + "grad_norm": 0.22782339087043268, + "learning_rate": 9.889674679982982e-05, + "loss": 2.3527, + "step": 322 + }, + { + "epoch": 0.09539279385705847, + "grad_norm": 0.2974983121719887, + "learning_rate": 9.888673190048986e-05, + "loss": 2.2649, + "step": 323 + }, + { + "epoch": 0.09568812758417011, + "grad_norm": 0.24276870544928547, + "learning_rate": 9.887667226234064e-05, + "loss": 2.3782, + "step": 324 + }, + { + "epoch": 0.09598346131128174, + "grad_norm": 0.23905959969051505, + "learning_rate": 9.886656789458829e-05, + "loss": 2.3442, + "step": 325 + }, + { + "epoch": 0.09627879503839339, + "grad_norm": 0.23057295173166228, + "learning_rate": 9.885641880647983e-05, + "loss": 2.3441, + "step": 326 + }, + { + "epoch": 0.09657412876550502, + "grad_norm": 0.22744757323300657, + "learning_rate": 9.884622500730323e-05, + "loss": 2.2602, + "step": 327 + }, + { + "epoch": 0.09686946249261666, + "grad_norm": 0.2347055488432316, + "learning_rate": 9.883598650638737e-05, + "loss": 2.2867, + "step": 328 + }, + { + "epoch": 0.09716479621972829, + "grad_norm": 0.23227916471483706, + "learning_rate": 9.882570331310204e-05, + "loss": 2.3989, + "step": 329 + }, + { + "epoch": 0.09746012994683993, + "grad_norm": 0.22880635962573906, + "learning_rate": 9.881537543685792e-05, + "loss": 2.3249, + "step": 330 + }, + { + "epoch": 0.09775546367395156, + "grad_norm": 0.24071689157760265, + "learning_rate": 9.88050028871066e-05, + "loss": 2.3203, + "step": 331 + }, + { + "epoch": 0.0980507974010632, + "grad_norm": 0.22819289216185434, + "learning_rate": 9.879458567334052e-05, + "loss": 2.3062, + "step": 332 + }, + { + "epoch": 0.09834613112817484, + "grad_norm": 0.7261375086687551, + "learning_rate": 9.878412380509307e-05, + "loss": 2.3477, + "step": 333 + }, + { + "epoch": 0.09864146485528648, + "grad_norm": 0.23207345861187906, + "learning_rate": 9.877361729193839e-05, + "loss": 2.359, + "step": 334 + }, + { + "epoch": 0.09893679858239811, + "grad_norm": 0.22601756792917874, + "learning_rate": 9.87630661434916e-05, + "loss": 2.3908, + "step": 335 + }, + { + "epoch": 0.09923213230950975, + "grad_norm": 0.21967337918378765, + "learning_rate": 9.875247036940856e-05, + "loss": 2.3002, + "step": 336 + }, + { + "epoch": 0.09952746603662138, + "grad_norm": 0.2199948601366141, + "learning_rate": 9.874182997938605e-05, + "loss": 2.3038, + "step": 337 + }, + { + "epoch": 0.09982279976373302, + "grad_norm": 0.2536067957410575, + "learning_rate": 9.873114498316163e-05, + "loss": 2.3759, + "step": 338 + }, + { + "epoch": 0.10011813349084465, + "grad_norm": 0.22184421298844811, + "learning_rate": 9.872041539051371e-05, + "loss": 2.3208, + "step": 339 + }, + { + "epoch": 0.10041346721795628, + "grad_norm": 0.22897481835269898, + "learning_rate": 9.870964121126149e-05, + "loss": 2.3725, + "step": 340 + }, + { + "epoch": 0.10070880094506793, + "grad_norm": 0.2262864239494575, + "learning_rate": 9.869882245526501e-05, + "loss": 2.3145, + "step": 341 + }, + { + "epoch": 0.10100413467217956, + "grad_norm": 0.23160738039036055, + "learning_rate": 9.868795913242505e-05, + "loss": 2.2828, + "step": 342 + }, + { + "epoch": 0.1012994683992912, + "grad_norm": 0.2952437759844837, + "learning_rate": 9.867705125268323e-05, + "loss": 2.4063, + "step": 343 + }, + { + "epoch": 0.10159480212640283, + "grad_norm": 0.4988840285411136, + "learning_rate": 9.866609882602192e-05, + "loss": 2.3309, + "step": 344 + }, + { + "epoch": 0.10189013585351447, + "grad_norm": 0.23043933777713907, + "learning_rate": 9.865510186246423e-05, + "loss": 2.3044, + "step": 345 + }, + { + "epoch": 0.1021854695806261, + "grad_norm": 0.2793627707900518, + "learning_rate": 9.864406037207409e-05, + "loss": 2.3245, + "step": 346 + }, + { + "epoch": 0.10248080330773775, + "grad_norm": 0.25288314096467834, + "learning_rate": 9.863297436495614e-05, + "loss": 2.2132, + "step": 347 + }, + { + "epoch": 0.10277613703484938, + "grad_norm": 0.24854251573379754, + "learning_rate": 9.862184385125578e-05, + "loss": 2.3369, + "step": 348 + }, + { + "epoch": 0.10307147076196102, + "grad_norm": 0.23749213017425, + "learning_rate": 9.861066884115907e-05, + "loss": 2.0824, + "step": 349 + }, + { + "epoch": 0.10336680448907265, + "grad_norm": 0.24551369410398707, + "learning_rate": 9.85994493448929e-05, + "loss": 2.3291, + "step": 350 + }, + { + "epoch": 0.1036621382161843, + "grad_norm": 0.2300363622562957, + "learning_rate": 9.85881853727248e-05, + "loss": 2.3185, + "step": 351 + }, + { + "epoch": 0.10395747194329592, + "grad_norm": 0.2299429585508057, + "learning_rate": 9.857687693496304e-05, + "loss": 2.3165, + "step": 352 + }, + { + "epoch": 0.10425280567040757, + "grad_norm": 0.23387458945023729, + "learning_rate": 9.856552404195653e-05, + "loss": 2.2831, + "step": 353 + }, + { + "epoch": 0.1045481393975192, + "grad_norm": 0.25252133253365167, + "learning_rate": 9.855412670409493e-05, + "loss": 2.3316, + "step": 354 + }, + { + "epoch": 0.10484347312463083, + "grad_norm": 0.2771466051488714, + "learning_rate": 9.854268493180851e-05, + "loss": 2.414, + "step": 355 + }, + { + "epoch": 0.10513880685174247, + "grad_norm": 0.23686887022305417, + "learning_rate": 9.853119873556828e-05, + "loss": 2.3466, + "step": 356 + }, + { + "epoch": 0.1054341405788541, + "grad_norm": 0.2620078952096577, + "learning_rate": 9.851966812588582e-05, + "loss": 2.3267, + "step": 357 + }, + { + "epoch": 0.10572947430596574, + "grad_norm": 0.24116499459424737, + "learning_rate": 9.850809311331339e-05, + "loss": 2.344, + "step": 358 + }, + { + "epoch": 0.10602480803307737, + "grad_norm": 0.23000152556286604, + "learning_rate": 9.849647370844393e-05, + "loss": 2.2885, + "step": 359 + }, + { + "epoch": 0.10632014176018902, + "grad_norm": 0.2378678801112989, + "learning_rate": 9.848480992191091e-05, + "loss": 2.238, + "step": 360 + }, + { + "epoch": 0.10661547548730065, + "grad_norm": 0.23138345624153636, + "learning_rate": 9.847310176438853e-05, + "loss": 2.3559, + "step": 361 + }, + { + "epoch": 0.10691080921441229, + "grad_norm": 0.22897273585589853, + "learning_rate": 9.846134924659152e-05, + "loss": 2.2749, + "step": 362 + }, + { + "epoch": 0.10720614294152392, + "grad_norm": 0.23723162401472717, + "learning_rate": 9.84495523792752e-05, + "loss": 2.2698, + "step": 363 + }, + { + "epoch": 0.10750147666863556, + "grad_norm": 0.23477589970350182, + "learning_rate": 9.843771117323553e-05, + "loss": 2.4274, + "step": 364 + }, + { + "epoch": 0.10779681039574719, + "grad_norm": 0.2434778831173159, + "learning_rate": 9.8425825639309e-05, + "loss": 2.3294, + "step": 365 + }, + { + "epoch": 0.10809214412285884, + "grad_norm": 0.23883122152330769, + "learning_rate": 9.841389578837269e-05, + "loss": 2.3465, + "step": 366 + }, + { + "epoch": 0.10838747784997046, + "grad_norm": 0.2264448386531687, + "learning_rate": 9.840192163134423e-05, + "loss": 2.3236, + "step": 367 + }, + { + "epoch": 0.10868281157708211, + "grad_norm": 0.22899224225172082, + "learning_rate": 9.838990317918182e-05, + "loss": 2.3065, + "step": 368 + }, + { + "epoch": 0.10897814530419374, + "grad_norm": 0.7646795390735404, + "learning_rate": 9.837784044288413e-05, + "loss": 2.1908, + "step": 369 + }, + { + "epoch": 0.10927347903130538, + "grad_norm": 1.5856325955748432, + "learning_rate": 9.836573343349047e-05, + "loss": 2.4753, + "step": 370 + }, + { + "epoch": 0.10956881275841701, + "grad_norm": 0.29471993005973585, + "learning_rate": 9.835358216208053e-05, + "loss": 2.3294, + "step": 371 + }, + { + "epoch": 0.10986414648552864, + "grad_norm": 0.3783244552739968, + "learning_rate": 9.834138663977458e-05, + "loss": 2.2674, + "step": 372 + }, + { + "epoch": 0.11015948021264028, + "grad_norm": 0.2977378377960799, + "learning_rate": 9.832914687773345e-05, + "loss": 2.3742, + "step": 373 + }, + { + "epoch": 0.11045481393975191, + "grad_norm": 0.26836470386261707, + "learning_rate": 9.831686288715832e-05, + "loss": 2.2602, + "step": 374 + }, + { + "epoch": 0.11075014766686356, + "grad_norm": 0.28880453197563705, + "learning_rate": 9.830453467929093e-05, + "loss": 2.4187, + "step": 375 + }, + { + "epoch": 0.11104548139397519, + "grad_norm": 0.2649789414495717, + "learning_rate": 9.829216226541347e-05, + "loss": 2.3539, + "step": 376 + }, + { + "epoch": 0.11134081512108683, + "grad_norm": 0.2707169340720728, + "learning_rate": 9.827974565684859e-05, + "loss": 2.3443, + "step": 377 + }, + { + "epoch": 0.11163614884819846, + "grad_norm": 0.5855311701464524, + "learning_rate": 9.826728486495938e-05, + "loss": 2.2874, + "step": 378 + }, + { + "epoch": 0.1119314825753101, + "grad_norm": 0.2564657030330141, + "learning_rate": 9.825477990114936e-05, + "loss": 2.3789, + "step": 379 + }, + { + "epoch": 0.11222681630242173, + "grad_norm": 0.261296515270812, + "learning_rate": 9.824223077686248e-05, + "loss": 2.2674, + "step": 380 + }, + { + "epoch": 0.11252215002953338, + "grad_norm": 0.26265293616224633, + "learning_rate": 9.822963750358308e-05, + "loss": 2.2723, + "step": 381 + }, + { + "epoch": 0.112817483756645, + "grad_norm": 0.25612169752137515, + "learning_rate": 9.821700009283596e-05, + "loss": 2.3922, + "step": 382 + }, + { + "epoch": 0.11311281748375665, + "grad_norm": 0.2412623918147581, + "learning_rate": 9.820431855618626e-05, + "loss": 2.231, + "step": 383 + }, + { + "epoch": 0.11340815121086828, + "grad_norm": 0.2418632644201591, + "learning_rate": 9.81915929052395e-05, + "loss": 2.3386, + "step": 384 + }, + { + "epoch": 0.11370348493797992, + "grad_norm": 0.2834838265325025, + "learning_rate": 9.817882315164162e-05, + "loss": 2.3689, + "step": 385 + }, + { + "epoch": 0.11399881866509155, + "grad_norm": 0.24270724086897624, + "learning_rate": 9.816600930707887e-05, + "loss": 2.3228, + "step": 386 + }, + { + "epoch": 0.11429415239220318, + "grad_norm": 0.2234415127320527, + "learning_rate": 9.815315138327788e-05, + "loss": 2.3145, + "step": 387 + }, + { + "epoch": 0.11458948611931483, + "grad_norm": 0.24304564016795463, + "learning_rate": 9.814024939200561e-05, + "loss": 2.3447, + "step": 388 + }, + { + "epoch": 0.11488481984642646, + "grad_norm": 0.2456364469918429, + "learning_rate": 9.812730334506934e-05, + "loss": 2.3923, + "step": 389 + }, + { + "epoch": 0.1151801535735381, + "grad_norm": 0.22847100987945307, + "learning_rate": 9.81143132543167e-05, + "loss": 2.2905, + "step": 390 + }, + { + "epoch": 0.11547548730064973, + "grad_norm": 0.3182108362894562, + "learning_rate": 9.810127913163556e-05, + "loss": 2.2519, + "step": 391 + }, + { + "epoch": 0.11577082102776137, + "grad_norm": 0.23554442643241977, + "learning_rate": 9.808820098895416e-05, + "loss": 2.2569, + "step": 392 + }, + { + "epoch": 0.116066154754873, + "grad_norm": 0.25005751186697855, + "learning_rate": 9.807507883824099e-05, + "loss": 2.2692, + "step": 393 + }, + { + "epoch": 0.11636148848198465, + "grad_norm": 0.22210575951509381, + "learning_rate": 9.806191269150479e-05, + "loss": 2.2978, + "step": 394 + }, + { + "epoch": 0.11665682220909628, + "grad_norm": 0.3104777349123211, + "learning_rate": 9.804870256079465e-05, + "loss": 2.5199, + "step": 395 + }, + { + "epoch": 0.11695215593620792, + "grad_norm": 0.2576625995073389, + "learning_rate": 9.80354484581998e-05, + "loss": 2.3658, + "step": 396 + }, + { + "epoch": 0.11724748966331955, + "grad_norm": 0.23521301338505493, + "learning_rate": 9.802215039584976e-05, + "loss": 2.3534, + "step": 397 + }, + { + "epoch": 0.11754282339043119, + "grad_norm": 0.2353767128323444, + "learning_rate": 9.800880838591433e-05, + "loss": 2.3064, + "step": 398 + }, + { + "epoch": 0.11783815711754282, + "grad_norm": 0.23881783806106927, + "learning_rate": 9.799542244060344e-05, + "loss": 2.252, + "step": 399 + }, + { + "epoch": 0.11813349084465447, + "grad_norm": 0.9025693669171184, + "learning_rate": 9.79819925721673e-05, + "loss": 2.4663, + "step": 400 + }, + { + "epoch": 0.1184288245717661, + "grad_norm": 0.2526939796109501, + "learning_rate": 9.796851879289626e-05, + "loss": 2.2758, + "step": 401 + }, + { + "epoch": 0.11872415829887774, + "grad_norm": 0.27670911254125924, + "learning_rate": 9.79550011151209e-05, + "loss": 2.431, + "step": 402 + }, + { + "epoch": 0.11901949202598937, + "grad_norm": 0.2933164821714527, + "learning_rate": 9.794143955121195e-05, + "loss": 2.2915, + "step": 403 + }, + { + "epoch": 0.119314825753101, + "grad_norm": 0.2570290800194319, + "learning_rate": 9.792783411358029e-05, + "loss": 2.3208, + "step": 404 + }, + { + "epoch": 0.11961015948021264, + "grad_norm": 0.2635305520429204, + "learning_rate": 9.7914184814677e-05, + "loss": 2.3647, + "step": 405 + }, + { + "epoch": 0.11990549320732427, + "grad_norm": 0.2654430804266702, + "learning_rate": 9.790049166699325e-05, + "loss": 2.3087, + "step": 406 + }, + { + "epoch": 0.12020082693443591, + "grad_norm": 0.24751816368643312, + "learning_rate": 9.788675468306034e-05, + "loss": 2.2822, + "step": 407 + }, + { + "epoch": 0.12049616066154754, + "grad_norm": 0.2558382768158543, + "learning_rate": 9.787297387544974e-05, + "loss": 2.3681, + "step": 408 + }, + { + "epoch": 0.12079149438865919, + "grad_norm": 0.26188888997771537, + "learning_rate": 9.785914925677297e-05, + "loss": 2.4645, + "step": 409 + }, + { + "epoch": 0.12108682811577082, + "grad_norm": 0.24796522854309844, + "learning_rate": 9.784528083968165e-05, + "loss": 2.2985, + "step": 410 + }, + { + "epoch": 0.12138216184288246, + "grad_norm": 0.22947309698688684, + "learning_rate": 9.783136863686752e-05, + "loss": 2.3274, + "step": 411 + }, + { + "epoch": 0.12167749556999409, + "grad_norm": 0.266052365111578, + "learning_rate": 9.781741266106236e-05, + "loss": 2.2798, + "step": 412 + }, + { + "epoch": 0.12197282929710573, + "grad_norm": 0.2449343329855391, + "learning_rate": 9.7803412925038e-05, + "loss": 2.339, + "step": 413 + }, + { + "epoch": 0.12226816302421736, + "grad_norm": 0.2377575372073797, + "learning_rate": 9.778936944160634e-05, + "loss": 2.2948, + "step": 414 + }, + { + "epoch": 0.122563496751329, + "grad_norm": 0.2380419999622803, + "learning_rate": 9.777528222361932e-05, + "loss": 2.3053, + "step": 415 + }, + { + "epoch": 0.12285883047844064, + "grad_norm": 0.24813662429696556, + "learning_rate": 9.776115128396887e-05, + "loss": 2.2815, + "step": 416 + }, + { + "epoch": 0.12315416420555228, + "grad_norm": 0.23165100550430237, + "learning_rate": 9.774697663558696e-05, + "loss": 2.2714, + "step": 417 + }, + { + "epoch": 0.12344949793266391, + "grad_norm": 0.22261044382435974, + "learning_rate": 9.773275829144556e-05, + "loss": 2.2482, + "step": 418 + }, + { + "epoch": 0.12374483165977555, + "grad_norm": 0.25368335917576107, + "learning_rate": 9.771849626455663e-05, + "loss": 2.4625, + "step": 419 + }, + { + "epoch": 0.12404016538688718, + "grad_norm": 0.42051558994554866, + "learning_rate": 9.770419056797206e-05, + "loss": 2.2871, + "step": 420 + }, + { + "epoch": 0.12433549911399881, + "grad_norm": 0.2283441438140334, + "learning_rate": 9.768984121478379e-05, + "loss": 2.3278, + "step": 421 + }, + { + "epoch": 0.12463083284111046, + "grad_norm": 0.24014147846458211, + "learning_rate": 9.767544821812364e-05, + "loss": 2.2696, + "step": 422 + }, + { + "epoch": 0.12492616656822209, + "grad_norm": 23.324846449956805, + "learning_rate": 9.766101159116338e-05, + "loss": 2.4176, + "step": 423 + }, + { + "epoch": 0.12522150029533372, + "grad_norm": 0.25216212217041384, + "learning_rate": 9.764653134711476e-05, + "loss": 2.3309, + "step": 424 + }, + { + "epoch": 0.12551683402244537, + "grad_norm": 0.2371976948734646, + "learning_rate": 9.763200749922938e-05, + "loss": 2.2738, + "step": 425 + }, + { + "epoch": 0.125812167749557, + "grad_norm": 0.24244610761520002, + "learning_rate": 9.76174400607988e-05, + "loss": 2.3199, + "step": 426 + }, + { + "epoch": 0.12610750147666863, + "grad_norm": 0.449607249900376, + "learning_rate": 9.76028290451544e-05, + "loss": 2.3521, + "step": 427 + }, + { + "epoch": 0.12640283520378026, + "grad_norm": 0.2350598105829786, + "learning_rate": 9.758817446566755e-05, + "loss": 2.2615, + "step": 428 + }, + { + "epoch": 0.12669816893089192, + "grad_norm": 0.24918368550268843, + "learning_rate": 9.757347633574938e-05, + "loss": 2.3792, + "step": 429 + }, + { + "epoch": 0.12699350265800355, + "grad_norm": 0.23723429353988085, + "learning_rate": 9.755873466885093e-05, + "loss": 2.3049, + "step": 430 + }, + { + "epoch": 0.12728883638511518, + "grad_norm": 0.28011772269550395, + "learning_rate": 9.754394947846305e-05, + "loss": 2.2867, + "step": 431 + }, + { + "epoch": 0.1275841701122268, + "grad_norm": 0.24021098170318975, + "learning_rate": 9.752912077811649e-05, + "loss": 2.3306, + "step": 432 + }, + { + "epoch": 0.12787950383933847, + "grad_norm": 0.25284934516876256, + "learning_rate": 9.751424858138174e-05, + "loss": 2.3225, + "step": 433 + }, + { + "epoch": 0.1281748375664501, + "grad_norm": 0.23190953180077753, + "learning_rate": 9.749933290186913e-05, + "loss": 2.3736, + "step": 434 + }, + { + "epoch": 0.12847017129356172, + "grad_norm": 0.2636382328753065, + "learning_rate": 9.748437375322877e-05, + "loss": 2.4079, + "step": 435 + }, + { + "epoch": 0.12876550502067335, + "grad_norm": 0.23565146621383132, + "learning_rate": 9.746937114915056e-05, + "loss": 2.3396, + "step": 436 + }, + { + "epoch": 0.12906083874778498, + "grad_norm": 0.2417985037901317, + "learning_rate": 9.745432510336418e-05, + "loss": 2.4418, + "step": 437 + }, + { + "epoch": 0.12935617247489664, + "grad_norm": 0.3426284157214968, + "learning_rate": 9.743923562963906e-05, + "loss": 2.348, + "step": 438 + }, + { + "epoch": 0.12965150620200827, + "grad_norm": 0.23170661661711084, + "learning_rate": 9.742410274178437e-05, + "loss": 2.3217, + "step": 439 + }, + { + "epoch": 0.1299468399291199, + "grad_norm": 0.2996238759695716, + "learning_rate": 9.7408926453649e-05, + "loss": 2.3619, + "step": 440 + }, + { + "epoch": 0.13024217365623153, + "grad_norm": 0.2257611308654106, + "learning_rate": 9.739370677912156e-05, + "loss": 2.2739, + "step": 441 + }, + { + "epoch": 0.1305375073833432, + "grad_norm": 0.2420042867768518, + "learning_rate": 9.737844373213037e-05, + "loss": 2.2681, + "step": 442 + }, + { + "epoch": 0.13083284111045482, + "grad_norm": 0.23246360051651035, + "learning_rate": 9.736313732664346e-05, + "loss": 2.274, + "step": 443 + }, + { + "epoch": 0.13112817483756645, + "grad_norm": 0.256813048296464, + "learning_rate": 9.734778757666852e-05, + "loss": 2.2913, + "step": 444 + }, + { + "epoch": 0.13142350856467808, + "grad_norm": 0.233768313705355, + "learning_rate": 9.73323944962529e-05, + "loss": 2.3792, + "step": 445 + }, + { + "epoch": 0.13171884229178973, + "grad_norm": 0.22497577636180816, + "learning_rate": 9.731695809948361e-05, + "loss": 2.3505, + "step": 446 + }, + { + "epoch": 0.13201417601890136, + "grad_norm": 0.293339367729038, + "learning_rate": 9.730147840048731e-05, + "loss": 2.3466, + "step": 447 + }, + { + "epoch": 0.132309509746013, + "grad_norm": 0.4204309615637162, + "learning_rate": 9.728595541343031e-05, + "loss": 2.3524, + "step": 448 + }, + { + "epoch": 0.13260484347312462, + "grad_norm": 0.22691318075094435, + "learning_rate": 9.727038915251847e-05, + "loss": 2.2978, + "step": 449 + }, + { + "epoch": 0.13290017720023628, + "grad_norm": 0.2747277065922056, + "learning_rate": 9.725477963199731e-05, + "loss": 2.2622, + "step": 450 + }, + { + "epoch": 0.1331955109273479, + "grad_norm": 0.24701893690718113, + "learning_rate": 9.723912686615191e-05, + "loss": 2.3226, + "step": 451 + }, + { + "epoch": 0.13349084465445954, + "grad_norm": 0.2692627606144092, + "learning_rate": 9.722343086930696e-05, + "loss": 2.3815, + "step": 452 + }, + { + "epoch": 0.13378617838157117, + "grad_norm": 0.24677339436699794, + "learning_rate": 9.720769165582667e-05, + "loss": 2.357, + "step": 453 + }, + { + "epoch": 0.1340815121086828, + "grad_norm": 0.23561138041619506, + "learning_rate": 9.719190924011481e-05, + "loss": 2.3417, + "step": 454 + }, + { + "epoch": 0.13437684583579446, + "grad_norm": 0.21710127861737155, + "learning_rate": 9.71760836366147e-05, + "loss": 2.3721, + "step": 455 + }, + { + "epoch": 0.13467217956290609, + "grad_norm": 0.25544935963621906, + "learning_rate": 9.71602148598092e-05, + "loss": 2.3029, + "step": 456 + }, + { + "epoch": 0.13496751329001772, + "grad_norm": 0.27600294717307067, + "learning_rate": 9.714430292422064e-05, + "loss": 2.2922, + "step": 457 + }, + { + "epoch": 0.13526284701712935, + "grad_norm": 0.22909641634079328, + "learning_rate": 9.712834784441088e-05, + "loss": 2.3061, + "step": 458 + }, + { + "epoch": 0.135558180744241, + "grad_norm": 0.22846867005570257, + "learning_rate": 9.711234963498122e-05, + "loss": 2.3327, + "step": 459 + }, + { + "epoch": 0.13585351447135263, + "grad_norm": 0.23864098269360162, + "learning_rate": 9.709630831057249e-05, + "loss": 2.3954, + "step": 460 + }, + { + "epoch": 0.13614884819846426, + "grad_norm": 0.27382784638002944, + "learning_rate": 9.708022388586492e-05, + "loss": 2.2723, + "step": 461 + }, + { + "epoch": 0.1364441819255759, + "grad_norm": 0.2278457751913323, + "learning_rate": 9.706409637557823e-05, + "loss": 2.3306, + "step": 462 + }, + { + "epoch": 0.13673951565268755, + "grad_norm": 0.2238719194984107, + "learning_rate": 9.704792579447156e-05, + "loss": 2.2991, + "step": 463 + }, + { + "epoch": 0.13703484937979918, + "grad_norm": 0.22121328283592726, + "learning_rate": 9.703171215734342e-05, + "loss": 2.2905, + "step": 464 + }, + { + "epoch": 0.1373301831069108, + "grad_norm": 0.2258653767809954, + "learning_rate": 9.701545547903179e-05, + "loss": 2.3081, + "step": 465 + }, + { + "epoch": 0.13762551683402244, + "grad_norm": 0.22155755833816798, + "learning_rate": 9.699915577441398e-05, + "loss": 2.2725, + "step": 466 + }, + { + "epoch": 0.1379208505611341, + "grad_norm": 0.24131144232865187, + "learning_rate": 9.698281305840673e-05, + "loss": 2.3269, + "step": 467 + }, + { + "epoch": 0.13821618428824572, + "grad_norm": 0.23234431791973675, + "learning_rate": 9.696642734596612e-05, + "loss": 2.3014, + "step": 468 + }, + { + "epoch": 0.13851151801535735, + "grad_norm": 0.24008190930379045, + "learning_rate": 9.694999865208753e-05, + "loss": 2.3033, + "step": 469 + }, + { + "epoch": 0.13880685174246898, + "grad_norm": 0.2913543343000824, + "learning_rate": 9.693352699180577e-05, + "loss": 2.4152, + "step": 470 + }, + { + "epoch": 0.1391021854695806, + "grad_norm": 0.23488528808781284, + "learning_rate": 9.69170123801949e-05, + "loss": 2.3386, + "step": 471 + }, + { + "epoch": 0.13939751919669227, + "grad_norm": 0.3853135056803527, + "learning_rate": 9.69004548323683e-05, + "loss": 2.3014, + "step": 472 + }, + { + "epoch": 0.1396928529238039, + "grad_norm": 0.2270789918499106, + "learning_rate": 9.688385436347865e-05, + "loss": 2.3434, + "step": 473 + }, + { + "epoch": 0.13998818665091553, + "grad_norm": 0.22836278221832898, + "learning_rate": 9.686721098871789e-05, + "loss": 2.3056, + "step": 474 + }, + { + "epoch": 0.14028352037802716, + "grad_norm": 0.2521146212357935, + "learning_rate": 9.685052472331728e-05, + "loss": 2.4042, + "step": 475 + }, + { + "epoch": 0.14057885410513882, + "grad_norm": 0.23427571736988068, + "learning_rate": 9.683379558254728e-05, + "loss": 2.3306, + "step": 476 + }, + { + "epoch": 0.14087418783225045, + "grad_norm": 0.21938660495852166, + "learning_rate": 9.68170235817176e-05, + "loss": 2.3369, + "step": 477 + }, + { + "epoch": 0.14116952155936208, + "grad_norm": 0.23149305567459993, + "learning_rate": 9.68002087361772e-05, + "loss": 2.2962, + "step": 478 + }, + { + "epoch": 0.1414648552864737, + "grad_norm": 0.23630912956632957, + "learning_rate": 9.678335106131419e-05, + "loss": 2.1793, + "step": 479 + }, + { + "epoch": 0.14176018901358536, + "grad_norm": 0.2247454852805964, + "learning_rate": 9.676645057255592e-05, + "loss": 2.2993, + "step": 480 + }, + { + "epoch": 0.142055522740697, + "grad_norm": 0.22486753968814044, + "learning_rate": 9.674950728536894e-05, + "loss": 2.2382, + "step": 481 + }, + { + "epoch": 0.14235085646780862, + "grad_norm": 0.23697091793893216, + "learning_rate": 9.67325212152589e-05, + "loss": 2.253, + "step": 482 + }, + { + "epoch": 0.14264619019492025, + "grad_norm": 0.2614454294087895, + "learning_rate": 9.671549237777066e-05, + "loss": 2.3295, + "step": 483 + }, + { + "epoch": 0.14294152392203188, + "grad_norm": 0.2385676648133286, + "learning_rate": 9.669842078848823e-05, + "loss": 2.2935, + "step": 484 + }, + { + "epoch": 0.14323685764914354, + "grad_norm": 10.279369425748625, + "learning_rate": 9.668130646303466e-05, + "loss": 2.4802, + "step": 485 + }, + { + "epoch": 0.14353219137625517, + "grad_norm": 0.24194917181060105, + "learning_rate": 9.666414941707222e-05, + "loss": 2.4026, + "step": 486 + }, + { + "epoch": 0.1438275251033668, + "grad_norm": 0.3604702431028188, + "learning_rate": 9.664694966630218e-05, + "loss": 2.3286, + "step": 487 + }, + { + "epoch": 0.14412285883047843, + "grad_norm": 0.2509566570062168, + "learning_rate": 9.662970722646494e-05, + "loss": 2.3538, + "step": 488 + }, + { + "epoch": 0.14441819255759009, + "grad_norm": 0.37752250796915815, + "learning_rate": 9.661242211333998e-05, + "loss": 2.2527, + "step": 489 + }, + { + "epoch": 0.14471352628470172, + "grad_norm": 0.23588204647954747, + "learning_rate": 9.659509434274582e-05, + "loss": 2.3164, + "step": 490 + }, + { + "epoch": 0.14500886001181335, + "grad_norm": 0.21629094897910311, + "learning_rate": 9.657772393053997e-05, + "loss": 2.3849, + "step": 491 + }, + { + "epoch": 0.14530419373892497, + "grad_norm": 0.23469017612694074, + "learning_rate": 9.656031089261903e-05, + "loss": 2.3276, + "step": 492 + }, + { + "epoch": 0.14559952746603663, + "grad_norm": 0.247738939915093, + "learning_rate": 9.654285524491858e-05, + "loss": 2.2603, + "step": 493 + }, + { + "epoch": 0.14589486119314826, + "grad_norm": 0.2621473105845063, + "learning_rate": 9.65253570034132e-05, + "loss": 2.354, + "step": 494 + }, + { + "epoch": 0.1461901949202599, + "grad_norm": 0.2302309603740462, + "learning_rate": 9.650781618411646e-05, + "loss": 2.3038, + "step": 495 + }, + { + "epoch": 0.14648552864737152, + "grad_norm": 0.2271545419176143, + "learning_rate": 9.649023280308084e-05, + "loss": 2.2874, + "step": 496 + }, + { + "epoch": 0.14678086237448318, + "grad_norm": 0.4111207973872018, + "learning_rate": 9.647260687639786e-05, + "loss": 2.4009, + "step": 497 + }, + { + "epoch": 0.1470761961015948, + "grad_norm": 0.23524278683351854, + "learning_rate": 9.645493842019788e-05, + "loss": 2.3366, + "step": 498 + }, + { + "epoch": 0.14737152982870644, + "grad_norm": 0.24732263218270573, + "learning_rate": 9.643722745065029e-05, + "loss": 2.3413, + "step": 499 + }, + { + "epoch": 0.14766686355581807, + "grad_norm": 0.2329423910102936, + "learning_rate": 9.641947398396328e-05, + "loss": 2.2531, + "step": 500 + }, + { + "epoch": 0.1479621972829297, + "grad_norm": 0.21731026322258945, + "learning_rate": 9.6401678036384e-05, + "loss": 2.2748, + "step": 501 + }, + { + "epoch": 0.14825753101004135, + "grad_norm": 0.23462581724259274, + "learning_rate": 9.638383962419843e-05, + "loss": 2.3884, + "step": 502 + }, + { + "epoch": 0.14855286473715298, + "grad_norm": 0.22533598589611456, + "learning_rate": 9.636595876373147e-05, + "loss": 2.3142, + "step": 503 + }, + { + "epoch": 0.1488481984642646, + "grad_norm": 0.24528607785016227, + "learning_rate": 9.634803547134681e-05, + "loss": 2.2823, + "step": 504 + }, + { + "epoch": 0.14914353219137624, + "grad_norm": 0.24110163787626032, + "learning_rate": 9.633006976344698e-05, + "loss": 2.3553, + "step": 505 + }, + { + "epoch": 0.1494388659184879, + "grad_norm": 2.319414369562005, + "learning_rate": 9.631206165647338e-05, + "loss": 2.2802, + "step": 506 + }, + { + "epoch": 0.14973419964559953, + "grad_norm": 0.27624157049082715, + "learning_rate": 9.629401116690616e-05, + "loss": 2.2738, + "step": 507 + }, + { + "epoch": 0.15002953337271116, + "grad_norm": 0.24068248638011938, + "learning_rate": 9.627591831126423e-05, + "loss": 2.3479, + "step": 508 + }, + { + "epoch": 0.1503248670998228, + "grad_norm": 0.2525849367798979, + "learning_rate": 9.625778310610534e-05, + "loss": 2.2893, + "step": 509 + }, + { + "epoch": 0.15062020082693445, + "grad_norm": 0.31367995891866657, + "learning_rate": 9.623960556802598e-05, + "loss": 2.3371, + "step": 510 + }, + { + "epoch": 0.15091553455404608, + "grad_norm": 0.2350535821320985, + "learning_rate": 9.622138571366134e-05, + "loss": 2.3246, + "step": 511 + }, + { + "epoch": 0.1512108682811577, + "grad_norm": 0.23178809973161824, + "learning_rate": 9.620312355968536e-05, + "loss": 2.4534, + "step": 512 + }, + { + "epoch": 0.15150620200826934, + "grad_norm": 0.23249648203034085, + "learning_rate": 9.618481912281074e-05, + "loss": 2.3618, + "step": 513 + }, + { + "epoch": 0.151801535735381, + "grad_norm": 0.23441722807346313, + "learning_rate": 9.61664724197888e-05, + "loss": 2.3431, + "step": 514 + }, + { + "epoch": 0.15209686946249262, + "grad_norm": 0.24229607148128565, + "learning_rate": 9.614808346740954e-05, + "loss": 2.413, + "step": 515 + }, + { + "epoch": 0.15239220318960425, + "grad_norm": 0.24380677857930752, + "learning_rate": 9.612965228250171e-05, + "loss": 2.3954, + "step": 516 + }, + { + "epoch": 0.15268753691671588, + "grad_norm": 0.23788188078896827, + "learning_rate": 9.61111788819326e-05, + "loss": 2.3277, + "step": 517 + }, + { + "epoch": 0.1529828706438275, + "grad_norm": 0.23951032898048707, + "learning_rate": 9.609266328260821e-05, + "loss": 2.3772, + "step": 518 + }, + { + "epoch": 0.15327820437093917, + "grad_norm": 0.246447415006566, + "learning_rate": 9.607410550147313e-05, + "loss": 2.3723, + "step": 519 + }, + { + "epoch": 0.1535735380980508, + "grad_norm": 0.26090255549002855, + "learning_rate": 9.605550555551058e-05, + "loss": 2.2783, + "step": 520 + }, + { + "epoch": 0.15386887182516243, + "grad_norm": 0.4419435926990359, + "learning_rate": 9.603686346174232e-05, + "loss": 2.3948, + "step": 521 + }, + { + "epoch": 0.15416420555227406, + "grad_norm": 0.2165843076170916, + "learning_rate": 9.601817923722871e-05, + "loss": 2.2469, + "step": 522 + }, + { + "epoch": 0.15445953927938572, + "grad_norm": 0.22413608608922106, + "learning_rate": 9.599945289906867e-05, + "loss": 2.2973, + "step": 523 + }, + { + "epoch": 0.15475487300649735, + "grad_norm": 0.48805097108956735, + "learning_rate": 9.598068446439965e-05, + "loss": 2.481, + "step": 524 + }, + { + "epoch": 0.15505020673360898, + "grad_norm": 0.23777428618245633, + "learning_rate": 9.596187395039763e-05, + "loss": 2.3924, + "step": 525 + }, + { + "epoch": 0.1553455404607206, + "grad_norm": 0.24123228804889316, + "learning_rate": 9.594302137427709e-05, + "loss": 2.383, + "step": 526 + }, + { + "epoch": 0.15564087418783226, + "grad_norm": 0.23706555189355158, + "learning_rate": 9.592412675329101e-05, + "loss": 2.2442, + "step": 527 + }, + { + "epoch": 0.1559362079149439, + "grad_norm": 0.22533224133103708, + "learning_rate": 9.590519010473086e-05, + "loss": 2.3149, + "step": 528 + }, + { + "epoch": 0.15623154164205552, + "grad_norm": 0.26943128915971537, + "learning_rate": 9.588621144592656e-05, + "loss": 2.365, + "step": 529 + }, + { + "epoch": 0.15652687536916715, + "grad_norm": 0.2642494375138084, + "learning_rate": 9.586719079424645e-05, + "loss": 2.4359, + "step": 530 + }, + { + "epoch": 0.1568222090962788, + "grad_norm": 0.23165264580914074, + "learning_rate": 9.584812816709733e-05, + "loss": 2.3273, + "step": 531 + }, + { + "epoch": 0.15711754282339044, + "grad_norm": 0.23878876778278482, + "learning_rate": 9.582902358192442e-05, + "loss": 2.3519, + "step": 532 + }, + { + "epoch": 0.15741287655050207, + "grad_norm": 0.23814963004373635, + "learning_rate": 9.580987705621134e-05, + "loss": 2.3872, + "step": 533 + }, + { + "epoch": 0.1577082102776137, + "grad_norm": 0.22455370669264701, + "learning_rate": 9.579068860748007e-05, + "loss": 2.3664, + "step": 534 + }, + { + "epoch": 0.15800354400472533, + "grad_norm": 0.23998747908445447, + "learning_rate": 9.577145825329093e-05, + "loss": 2.4231, + "step": 535 + }, + { + "epoch": 0.15829887773183698, + "grad_norm": 0.24718479474512786, + "learning_rate": 9.575218601124268e-05, + "loss": 2.3455, + "step": 536 + }, + { + "epoch": 0.15859421145894861, + "grad_norm": 0.23576285011741208, + "learning_rate": 9.57328718989723e-05, + "loss": 2.3528, + "step": 537 + }, + { + "epoch": 0.15888954518606024, + "grad_norm": 0.2421575650819284, + "learning_rate": 9.57135159341552e-05, + "loss": 2.3293, + "step": 538 + }, + { + "epoch": 0.15918487891317187, + "grad_norm": 0.23156979589255688, + "learning_rate": 9.569411813450502e-05, + "loss": 2.4071, + "step": 539 + }, + { + "epoch": 0.15948021264028353, + "grad_norm": 0.2346069062821118, + "learning_rate": 9.56746785177737e-05, + "loss": 2.4468, + "step": 540 + }, + { + "epoch": 0.15977554636739516, + "grad_norm": 0.22030154972052976, + "learning_rate": 9.565519710175144e-05, + "loss": 2.2574, + "step": 541 + }, + { + "epoch": 0.1600708800945068, + "grad_norm": 0.2503088741778062, + "learning_rate": 9.563567390426672e-05, + "loss": 2.4002, + "step": 542 + }, + { + "epoch": 0.16036621382161842, + "grad_norm": 0.22983759594995842, + "learning_rate": 9.56161089431862e-05, + "loss": 2.3284, + "step": 543 + }, + { + "epoch": 0.16066154754873008, + "grad_norm": 0.23365548475884304, + "learning_rate": 9.559650223641488e-05, + "loss": 2.3026, + "step": 544 + }, + { + "epoch": 0.1609568812758417, + "grad_norm": 0.23205335235648267, + "learning_rate": 9.557685380189581e-05, + "loss": 2.3614, + "step": 545 + }, + { + "epoch": 0.16125221500295334, + "grad_norm": 0.22589935065079467, + "learning_rate": 9.555716365761034e-05, + "loss": 2.3682, + "step": 546 + }, + { + "epoch": 0.16154754873006497, + "grad_norm": 0.3080042024254516, + "learning_rate": 9.553743182157794e-05, + "loss": 2.2679, + "step": 547 + }, + { + "epoch": 0.16184288245717662, + "grad_norm": 0.22266376214990735, + "learning_rate": 9.551765831185625e-05, + "loss": 2.3516, + "step": 548 + }, + { + "epoch": 0.16213821618428825, + "grad_norm": 0.23141384313207294, + "learning_rate": 9.549784314654104e-05, + "loss": 2.2285, + "step": 549 + }, + { + "epoch": 0.16243354991139988, + "grad_norm": 0.24552561256138844, + "learning_rate": 9.547798634376622e-05, + "loss": 2.2268, + "step": 550 + }, + { + "epoch": 0.1627288836385115, + "grad_norm": 0.22704580157982263, + "learning_rate": 9.545808792170378e-05, + "loss": 2.3176, + "step": 551 + }, + { + "epoch": 0.16302421736562314, + "grad_norm": 0.25873130297718916, + "learning_rate": 9.54381478985638e-05, + "loss": 2.3347, + "step": 552 + }, + { + "epoch": 0.1633195510927348, + "grad_norm": 1.10567281072997, + "learning_rate": 9.541816629259445e-05, + "loss": 2.2969, + "step": 553 + }, + { + "epoch": 0.16361488481984643, + "grad_norm": 0.2634879707145821, + "learning_rate": 9.539814312208195e-05, + "loss": 2.3217, + "step": 554 + }, + { + "epoch": 0.16391021854695806, + "grad_norm": 0.22199745761321962, + "learning_rate": 9.537807840535055e-05, + "loss": 2.243, + "step": 555 + }, + { + "epoch": 0.1642055522740697, + "grad_norm": 0.22862960548569647, + "learning_rate": 9.53579721607625e-05, + "loss": 2.3292, + "step": 556 + }, + { + "epoch": 0.16450088600118135, + "grad_norm": 0.26351395575647524, + "learning_rate": 9.533782440671812e-05, + "loss": 2.3342, + "step": 557 + }, + { + "epoch": 0.16479621972829298, + "grad_norm": 0.22805583375621352, + "learning_rate": 9.531763516165563e-05, + "loss": 2.3606, + "step": 558 + }, + { + "epoch": 0.1650915534554046, + "grad_norm": 0.23226761707692406, + "learning_rate": 9.52974044440513e-05, + "loss": 2.3545, + "step": 559 + }, + { + "epoch": 0.16538688718251623, + "grad_norm": 0.25374203286559266, + "learning_rate": 9.52771322724193e-05, + "loss": 2.3148, + "step": 560 + }, + { + "epoch": 0.1656822209096279, + "grad_norm": 0.24234619632521687, + "learning_rate": 9.525681866531177e-05, + "loss": 2.4001, + "step": 561 + }, + { + "epoch": 0.16597755463673952, + "grad_norm": 0.23888052952233765, + "learning_rate": 9.523646364131875e-05, + "loss": 2.2998, + "step": 562 + }, + { + "epoch": 0.16627288836385115, + "grad_norm": 0.23122028888108315, + "learning_rate": 9.521606721906817e-05, + "loss": 2.3381, + "step": 563 + }, + { + "epoch": 0.16656822209096278, + "grad_norm": 0.22273245561429028, + "learning_rate": 9.51956294172259e-05, + "loss": 2.3417, + "step": 564 + }, + { + "epoch": 0.1668635558180744, + "grad_norm": 0.3075134708004817, + "learning_rate": 9.517515025449561e-05, + "loss": 2.3576, + "step": 565 + }, + { + "epoch": 0.16715888954518607, + "grad_norm": 0.24164830451846483, + "learning_rate": 9.515462974961888e-05, + "loss": 2.3145, + "step": 566 + }, + { + "epoch": 0.1674542232722977, + "grad_norm": 0.223218333854752, + "learning_rate": 9.51340679213751e-05, + "loss": 2.3294, + "step": 567 + }, + { + "epoch": 0.16774955699940933, + "grad_norm": 0.21981836732306823, + "learning_rate": 9.511346478858142e-05, + "loss": 2.3079, + "step": 568 + }, + { + "epoch": 0.16804489072652096, + "grad_norm": 0.22730433754927618, + "learning_rate": 9.509282037009293e-05, + "loss": 2.2941, + "step": 569 + }, + { + "epoch": 0.16834022445363261, + "grad_norm": 0.23051145933725262, + "learning_rate": 9.507213468480239e-05, + "loss": 2.4321, + "step": 570 + }, + { + "epoch": 0.16863555818074424, + "grad_norm": 0.22123970363387427, + "learning_rate": 9.505140775164032e-05, + "loss": 2.3488, + "step": 571 + }, + { + "epoch": 0.16893089190785587, + "grad_norm": 0.24210957842581649, + "learning_rate": 9.503063958957506e-05, + "loss": 2.3912, + "step": 572 + }, + { + "epoch": 0.1692262256349675, + "grad_norm": 0.23730961672839998, + "learning_rate": 9.50098302176126e-05, + "loss": 2.3087, + "step": 573 + }, + { + "epoch": 0.16952155936207916, + "grad_norm": 0.23903921612314688, + "learning_rate": 9.498897965479674e-05, + "loss": 2.4313, + "step": 574 + }, + { + "epoch": 0.1698168930891908, + "grad_norm": 0.22875210353269776, + "learning_rate": 9.49680879202089e-05, + "loss": 2.3391, + "step": 575 + }, + { + "epoch": 0.17011222681630242, + "grad_norm": 2.495629304709712, + "learning_rate": 9.494715503296818e-05, + "loss": 2.4673, + "step": 576 + }, + { + "epoch": 0.17040756054341405, + "grad_norm": 0.28472990013382193, + "learning_rate": 9.492618101223141e-05, + "loss": 2.2426, + "step": 577 + }, + { + "epoch": 0.1707028942705257, + "grad_norm": 0.2764977460151948, + "learning_rate": 9.490516587719296e-05, + "loss": 2.3635, + "step": 578 + }, + { + "epoch": 0.17099822799763734, + "grad_norm": 0.3105864469638926, + "learning_rate": 9.488410964708492e-05, + "loss": 2.3281, + "step": 579 + }, + { + "epoch": 0.17129356172474897, + "grad_norm": 0.29015197867853265, + "learning_rate": 9.486301234117691e-05, + "loss": 2.3949, + "step": 580 + }, + { + "epoch": 0.1715888954518606, + "grad_norm": 0.260937035725803, + "learning_rate": 9.484187397877624e-05, + "loss": 2.2845, + "step": 581 + }, + { + "epoch": 0.17188422917897223, + "grad_norm": 0.25342797937631606, + "learning_rate": 9.482069457922769e-05, + "loss": 2.2731, + "step": 582 + }, + { + "epoch": 0.17217956290608388, + "grad_norm": 0.2378052584940391, + "learning_rate": 9.479947416191363e-05, + "loss": 2.3975, + "step": 583 + }, + { + "epoch": 0.1724748966331955, + "grad_norm": 0.274064095404406, + "learning_rate": 9.4778212746254e-05, + "loss": 2.3519, + "step": 584 + }, + { + "epoch": 0.17277023036030714, + "grad_norm": 0.27116897155033814, + "learning_rate": 9.475691035170622e-05, + "loss": 2.2883, + "step": 585 + }, + { + "epoch": 0.17306556408741877, + "grad_norm": 0.27132949253202265, + "learning_rate": 9.473556699776525e-05, + "loss": 2.2701, + "step": 586 + }, + { + "epoch": 0.17336089781453043, + "grad_norm": 0.23975628677733432, + "learning_rate": 9.471418270396348e-05, + "loss": 2.2621, + "step": 587 + }, + { + "epoch": 0.17365623154164206, + "grad_norm": 0.23992670718276618, + "learning_rate": 9.46927574898708e-05, + "loss": 2.2963, + "step": 588 + }, + { + "epoch": 0.1739515652687537, + "grad_norm": 0.2617130563600596, + "learning_rate": 9.467129137509459e-05, + "loss": 2.3996, + "step": 589 + }, + { + "epoch": 0.17424689899586532, + "grad_norm": 0.24582704593948013, + "learning_rate": 9.464978437927955e-05, + "loss": 2.4112, + "step": 590 + }, + { + "epoch": 0.17454223272297698, + "grad_norm": 0.25886002053614177, + "learning_rate": 9.462823652210793e-05, + "loss": 2.2288, + "step": 591 + }, + { + "epoch": 0.1748375664500886, + "grad_norm": 0.2692167409776503, + "learning_rate": 9.460664782329925e-05, + "loss": 2.3667, + "step": 592 + }, + { + "epoch": 0.17513290017720023, + "grad_norm": 0.25153537983157453, + "learning_rate": 9.458501830261046e-05, + "loss": 2.3692, + "step": 593 + }, + { + "epoch": 0.17542823390431186, + "grad_norm": 0.23299628690495144, + "learning_rate": 9.45633479798359e-05, + "loss": 2.3299, + "step": 594 + }, + { + "epoch": 0.17572356763142352, + "grad_norm": 0.2287877492165189, + "learning_rate": 9.454163687480719e-05, + "loss": 2.2421, + "step": 595 + }, + { + "epoch": 0.17601890135853515, + "grad_norm": 0.2376737276054848, + "learning_rate": 9.451988500739333e-05, + "loss": 2.364, + "step": 596 + }, + { + "epoch": 0.17631423508564678, + "grad_norm": 0.5299205694867238, + "learning_rate": 9.449809239750054e-05, + "loss": 2.3576, + "step": 597 + }, + { + "epoch": 0.1766095688127584, + "grad_norm": 0.2796382551688035, + "learning_rate": 9.447625906507242e-05, + "loss": 2.3491, + "step": 598 + }, + { + "epoch": 0.17690490253987004, + "grad_norm": 0.2837802531167416, + "learning_rate": 9.445438503008982e-05, + "loss": 2.3876, + "step": 599 + }, + { + "epoch": 0.1772002362669817, + "grad_norm": 0.2750042985450137, + "learning_rate": 9.443247031257074e-05, + "loss": 2.2858, + "step": 600 + }, + { + "epoch": 0.17749556999409333, + "grad_norm": 0.24865978840784958, + "learning_rate": 9.441051493257055e-05, + "loss": 2.3453, + "step": 601 + }, + { + "epoch": 0.17779090372120496, + "grad_norm": 0.23157085763981583, + "learning_rate": 9.438851891018174e-05, + "loss": 2.3333, + "step": 602 + }, + { + "epoch": 0.1780862374483166, + "grad_norm": 0.24162368364759987, + "learning_rate": 9.436648226553404e-05, + "loss": 2.2552, + "step": 603 + }, + { + "epoch": 0.17838157117542824, + "grad_norm": 0.2507302229356777, + "learning_rate": 9.434440501879433e-05, + "loss": 2.3039, + "step": 604 + }, + { + "epoch": 0.17867690490253987, + "grad_norm": 0.25982219395597345, + "learning_rate": 9.432228719016665e-05, + "loss": 2.3266, + "step": 605 + }, + { + "epoch": 0.1789722386296515, + "grad_norm": 0.22991751357882695, + "learning_rate": 9.430012879989217e-05, + "loss": 2.3909, + "step": 606 + }, + { + "epoch": 0.17926757235676313, + "grad_norm": 0.2337173055412132, + "learning_rate": 9.427792986824922e-05, + "loss": 2.412, + "step": 607 + }, + { + "epoch": 0.1795629060838748, + "grad_norm": 0.23517708226694603, + "learning_rate": 9.42556904155532e-05, + "loss": 2.4193, + "step": 608 + }, + { + "epoch": 0.17985823981098642, + "grad_norm": 0.22529285763265303, + "learning_rate": 9.423341046215659e-05, + "loss": 2.318, + "step": 609 + }, + { + "epoch": 0.18015357353809805, + "grad_norm": 0.24952189355697008, + "learning_rate": 9.421109002844894e-05, + "loss": 2.3706, + "step": 610 + }, + { + "epoch": 0.18044890726520968, + "grad_norm": 0.23262804968493683, + "learning_rate": 9.418872913485686e-05, + "loss": 2.3197, + "step": 611 + }, + { + "epoch": 0.18074424099232134, + "grad_norm": 0.24194069561613055, + "learning_rate": 9.416632780184396e-05, + "loss": 2.2908, + "step": 612 + }, + { + "epoch": 0.18103957471943297, + "grad_norm": 0.23971785866605597, + "learning_rate": 9.414388604991089e-05, + "loss": 2.3736, + "step": 613 + }, + { + "epoch": 0.1813349084465446, + "grad_norm": 0.22796505847266416, + "learning_rate": 9.412140389959528e-05, + "loss": 2.3203, + "step": 614 + }, + { + "epoch": 0.18163024217365623, + "grad_norm": 0.22368014138815792, + "learning_rate": 9.409888137147168e-05, + "loss": 2.3034, + "step": 615 + }, + { + "epoch": 0.18192557590076786, + "grad_norm": 0.22873896094039678, + "learning_rate": 9.407631848615168e-05, + "loss": 2.3063, + "step": 616 + }, + { + "epoch": 0.1822209096278795, + "grad_norm": 0.23241089546492164, + "learning_rate": 9.405371526428373e-05, + "loss": 2.2212, + "step": 617 + }, + { + "epoch": 0.18251624335499114, + "grad_norm": 0.5708456029988758, + "learning_rate": 9.403107172655324e-05, + "loss": 2.3093, + "step": 618 + }, + { + "epoch": 0.18281157708210277, + "grad_norm": 0.24390083744414398, + "learning_rate": 9.400838789368249e-05, + "loss": 2.2484, + "step": 619 + }, + { + "epoch": 0.1831069108092144, + "grad_norm": 0.22124989054294616, + "learning_rate": 9.398566378643065e-05, + "loss": 2.234, + "step": 620 + }, + { + "epoch": 0.18340224453632606, + "grad_norm": 0.22960142417139545, + "learning_rate": 9.396289942559374e-05, + "loss": 2.2995, + "step": 621 + }, + { + "epoch": 0.1836975782634377, + "grad_norm": 0.231858773460613, + "learning_rate": 9.39400948320046e-05, + "loss": 2.259, + "step": 622 + }, + { + "epoch": 0.18399291199054932, + "grad_norm": 0.24381391408686717, + "learning_rate": 9.391725002653295e-05, + "loss": 2.2562, + "step": 623 + }, + { + "epoch": 0.18428824571766095, + "grad_norm": 0.2280765619194993, + "learning_rate": 9.389436503008521e-05, + "loss": 2.3111, + "step": 624 + }, + { + "epoch": 0.1845835794447726, + "grad_norm": 0.2438225467023005, + "learning_rate": 9.387143986360471e-05, + "loss": 2.3499, + "step": 625 + }, + { + "epoch": 0.18487891317188423, + "grad_norm": 0.22759140624117943, + "learning_rate": 9.38484745480714e-05, + "loss": 2.3246, + "step": 626 + }, + { + "epoch": 0.18517424689899586, + "grad_norm": 0.2191431820562164, + "learning_rate": 9.38254691045021e-05, + "loss": 2.3449, + "step": 627 + }, + { + "epoch": 0.1854695806261075, + "grad_norm": 0.2282555885696218, + "learning_rate": 9.380242355395025e-05, + "loss": 2.3575, + "step": 628 + }, + { + "epoch": 0.18576491435321912, + "grad_norm": 0.2240770915931752, + "learning_rate": 9.377933791750605e-05, + "loss": 2.3203, + "step": 629 + }, + { + "epoch": 0.18606024808033078, + "grad_norm": 0.22639377658358248, + "learning_rate": 9.375621221629643e-05, + "loss": 2.3646, + "step": 630 + }, + { + "epoch": 0.1863555818074424, + "grad_norm": 0.22201505156292697, + "learning_rate": 9.373304647148487e-05, + "loss": 2.3169, + "step": 631 + }, + { + "epoch": 0.18665091553455404, + "grad_norm": 0.2179260403481267, + "learning_rate": 9.370984070427156e-05, + "loss": 2.2696, + "step": 632 + }, + { + "epoch": 0.18694624926166567, + "grad_norm": 0.23175003819275, + "learning_rate": 9.368659493589334e-05, + "loss": 2.3223, + "step": 633 + }, + { + "epoch": 0.18724158298877733, + "grad_norm": 0.22000399558817207, + "learning_rate": 9.366330918762361e-05, + "loss": 2.3414, + "step": 634 + }, + { + "epoch": 0.18753691671588896, + "grad_norm": 0.2915606547314636, + "learning_rate": 9.363998348077238e-05, + "loss": 2.3405, + "step": 635 + }, + { + "epoch": 0.1878322504430006, + "grad_norm": 0.22091449473444286, + "learning_rate": 9.361661783668625e-05, + "loss": 2.3561, + "step": 636 + }, + { + "epoch": 0.18812758417011222, + "grad_norm": 0.21620142846014934, + "learning_rate": 9.359321227674833e-05, + "loss": 2.3137, + "step": 637 + }, + { + "epoch": 0.18842291789722387, + "grad_norm": 0.23389260398134049, + "learning_rate": 9.356976682237826e-05, + "loss": 2.3402, + "step": 638 + }, + { + "epoch": 0.1887182516243355, + "grad_norm": 0.21515068324027328, + "learning_rate": 9.354628149503221e-05, + "loss": 2.3338, + "step": 639 + }, + { + "epoch": 0.18901358535144713, + "grad_norm": 0.23222643384617098, + "learning_rate": 9.352275631620284e-05, + "loss": 2.3597, + "step": 640 + }, + { + "epoch": 0.18930891907855876, + "grad_norm": 0.2409969724488727, + "learning_rate": 9.349919130741928e-05, + "loss": 2.3517, + "step": 641 + }, + { + "epoch": 0.18960425280567042, + "grad_norm": 0.23928881853012673, + "learning_rate": 9.34755864902471e-05, + "loss": 2.3313, + "step": 642 + }, + { + "epoch": 0.18989958653278205, + "grad_norm": 0.22885073844733436, + "learning_rate": 9.345194188628828e-05, + "loss": 2.3042, + "step": 643 + }, + { + "epoch": 0.19019492025989368, + "grad_norm": 0.22750557910974006, + "learning_rate": 9.342825751718127e-05, + "loss": 2.402, + "step": 644 + }, + { + "epoch": 0.1904902539870053, + "grad_norm": 0.23507408943655103, + "learning_rate": 9.340453340460086e-05, + "loss": 2.4282, + "step": 645 + }, + { + "epoch": 0.19078558771411694, + "grad_norm": 0.22561435603028726, + "learning_rate": 9.338076957025823e-05, + "loss": 2.3233, + "step": 646 + }, + { + "epoch": 0.1910809214412286, + "grad_norm": 0.21878567472837557, + "learning_rate": 9.335696603590092e-05, + "loss": 2.3516, + "step": 647 + }, + { + "epoch": 0.19137625516834023, + "grad_norm": 0.21602143112574734, + "learning_rate": 9.333312282331277e-05, + "loss": 2.3411, + "step": 648 + }, + { + "epoch": 0.19167158889545186, + "grad_norm": 0.22322847219072248, + "learning_rate": 9.330923995431396e-05, + "loss": 2.2993, + "step": 649 + }, + { + "epoch": 0.19196692262256349, + "grad_norm": 0.287859591774444, + "learning_rate": 9.328531745076096e-05, + "loss": 2.2704, + "step": 650 + }, + { + "epoch": 0.19226225634967514, + "grad_norm": 0.8761360763195796, + "learning_rate": 9.326135533454652e-05, + "loss": 2.3704, + "step": 651 + }, + { + "epoch": 0.19255759007678677, + "grad_norm": 0.22416997305890868, + "learning_rate": 9.32373536275996e-05, + "loss": 2.2448, + "step": 652 + }, + { + "epoch": 0.1928529238038984, + "grad_norm": 0.24539475094721983, + "learning_rate": 9.321331235188544e-05, + "loss": 2.3984, + "step": 653 + }, + { + "epoch": 0.19314825753101003, + "grad_norm": 0.22770265093260433, + "learning_rate": 9.318923152940546e-05, + "loss": 2.3433, + "step": 654 + }, + { + "epoch": 0.1934435912581217, + "grad_norm": 0.21711352138122006, + "learning_rate": 9.316511118219729e-05, + "loss": 2.3189, + "step": 655 + }, + { + "epoch": 0.19373892498523332, + "grad_norm": 0.21118004417918523, + "learning_rate": 9.31409513323347e-05, + "loss": 2.1936, + "step": 656 + }, + { + "epoch": 0.19403425871234495, + "grad_norm": 0.22182967871535425, + "learning_rate": 9.311675200192767e-05, + "loss": 2.2782, + "step": 657 + }, + { + "epoch": 0.19432959243945658, + "grad_norm": 0.22355489747822266, + "learning_rate": 9.309251321312225e-05, + "loss": 2.3754, + "step": 658 + }, + { + "epoch": 0.19462492616656824, + "grad_norm": 0.3055896119924856, + "learning_rate": 9.306823498810064e-05, + "loss": 2.2732, + "step": 659 + }, + { + "epoch": 0.19492025989367986, + "grad_norm": 0.22491663772670686, + "learning_rate": 9.304391734908111e-05, + "loss": 2.4247, + "step": 660 + }, + { + "epoch": 0.1952155936207915, + "grad_norm": 0.23599506585369995, + "learning_rate": 9.3019560318318e-05, + "loss": 2.3961, + "step": 661 + }, + { + "epoch": 0.19551092734790312, + "grad_norm": 0.22593068948877915, + "learning_rate": 9.299516391810171e-05, + "loss": 2.334, + "step": 662 + }, + { + "epoch": 0.19580626107501475, + "grad_norm": 0.22226983554116872, + "learning_rate": 9.297072817075865e-05, + "loss": 2.2832, + "step": 663 + }, + { + "epoch": 0.1961015948021264, + "grad_norm": 0.215686722228141, + "learning_rate": 9.294625309865126e-05, + "loss": 2.3757, + "step": 664 + }, + { + "epoch": 0.19639692852923804, + "grad_norm": 0.22184284440600124, + "learning_rate": 9.2921738724178e-05, + "loss": 2.3174, + "step": 665 + }, + { + "epoch": 0.19669226225634967, + "grad_norm": 0.22432398702311335, + "learning_rate": 9.28971850697732e-05, + "loss": 2.2652, + "step": 666 + }, + { + "epoch": 0.1969875959834613, + "grad_norm": 0.24843252045061526, + "learning_rate": 9.287259215790722e-05, + "loss": 2.2833, + "step": 667 + }, + { + "epoch": 0.19728292971057296, + "grad_norm": 0.2641201870448617, + "learning_rate": 9.284796001108632e-05, + "loss": 2.3756, + "step": 668 + }, + { + "epoch": 0.1975782634376846, + "grad_norm": 0.22980952135728075, + "learning_rate": 9.282328865185268e-05, + "loss": 2.2391, + "step": 669 + }, + { + "epoch": 0.19787359716479622, + "grad_norm": 0.21510693295865263, + "learning_rate": 9.279857810278431e-05, + "loss": 2.2446, + "step": 670 + }, + { + "epoch": 0.19816893089190785, + "grad_norm": 0.22902462180330954, + "learning_rate": 9.27738283864952e-05, + "loss": 2.247, + "step": 671 + }, + { + "epoch": 0.1984642646190195, + "grad_norm": 0.23556484356314344, + "learning_rate": 9.274903952563505e-05, + "loss": 2.3727, + "step": 672 + }, + { + "epoch": 0.19875959834613113, + "grad_norm": 0.22936992229069686, + "learning_rate": 9.272421154288948e-05, + "loss": 2.2938, + "step": 673 + }, + { + "epoch": 0.19905493207324276, + "grad_norm": 0.2332795690359681, + "learning_rate": 9.269934446097986e-05, + "loss": 2.3648, + "step": 674 + }, + { + "epoch": 0.1993502658003544, + "grad_norm": 0.21987049051315524, + "learning_rate": 9.267443830266336e-05, + "loss": 2.3335, + "step": 675 + }, + { + "epoch": 0.19964559952746605, + "grad_norm": 0.23180545680427644, + "learning_rate": 9.26494930907329e-05, + "loss": 2.3493, + "step": 676 + }, + { + "epoch": 0.19994093325457768, + "grad_norm": 0.2195465876695706, + "learning_rate": 9.262450884801719e-05, + "loss": 2.3866, + "step": 677 + }, + { + "epoch": 0.2002362669816893, + "grad_norm": 0.23420961425808526, + "learning_rate": 9.259948559738057e-05, + "loss": 2.2797, + "step": 678 + }, + { + "epoch": 0.20053160070880094, + "grad_norm": 0.2265852966020691, + "learning_rate": 9.257442336172317e-05, + "loss": 2.291, + "step": 679 + }, + { + "epoch": 0.20082693443591257, + "grad_norm": 0.2227250963149621, + "learning_rate": 9.254932216398074e-05, + "loss": 2.2984, + "step": 680 + }, + { + "epoch": 0.20112226816302423, + "grad_norm": 0.24537938748982963, + "learning_rate": 9.252418202712467e-05, + "loss": 2.3177, + "step": 681 + }, + { + "epoch": 0.20141760189013586, + "grad_norm": 0.22744338025308866, + "learning_rate": 9.249900297416205e-05, + "loss": 2.3119, + "step": 682 + }, + { + "epoch": 0.20171293561724749, + "grad_norm": 0.23861587198272816, + "learning_rate": 9.247378502813555e-05, + "loss": 2.4043, + "step": 683 + }, + { + "epoch": 0.20200826934435911, + "grad_norm": 0.2394900148859688, + "learning_rate": 9.244852821212345e-05, + "loss": 2.3894, + "step": 684 + }, + { + "epoch": 0.20230360307147077, + "grad_norm": 0.23352250189560425, + "learning_rate": 9.242323254923953e-05, + "loss": 2.4405, + "step": 685 + }, + { + "epoch": 0.2025989367985824, + "grad_norm": 0.2200399071900516, + "learning_rate": 9.239789806263321e-05, + "loss": 2.2731, + "step": 686 + }, + { + "epoch": 0.20289427052569403, + "grad_norm": 0.22912926892540414, + "learning_rate": 9.237252477548941e-05, + "loss": 2.3459, + "step": 687 + }, + { + "epoch": 0.20318960425280566, + "grad_norm": 0.24815791941034276, + "learning_rate": 9.234711271102855e-05, + "loss": 2.3403, + "step": 688 + }, + { + "epoch": 0.20348493797991732, + "grad_norm": 0.22167615845681804, + "learning_rate": 9.232166189250653e-05, + "loss": 2.3282, + "step": 689 + }, + { + "epoch": 0.20378027170702895, + "grad_norm": 0.2361168823674407, + "learning_rate": 9.229617234321474e-05, + "loss": 2.3156, + "step": 690 + }, + { + "epoch": 0.20407560543414058, + "grad_norm": 0.2576523485692224, + "learning_rate": 9.227064408648e-05, + "loss": 2.3518, + "step": 691 + }, + { + "epoch": 0.2043709391612522, + "grad_norm": 0.2338158772960619, + "learning_rate": 9.224507714566457e-05, + "loss": 2.3243, + "step": 692 + }, + { + "epoch": 0.20466627288836386, + "grad_norm": 0.2534998061501027, + "learning_rate": 9.221947154416605e-05, + "loss": 2.2838, + "step": 693 + }, + { + "epoch": 0.2049616066154755, + "grad_norm": 0.22556684669202579, + "learning_rate": 9.219382730541752e-05, + "loss": 2.2899, + "step": 694 + }, + { + "epoch": 0.20525694034258712, + "grad_norm": 0.2207627426916151, + "learning_rate": 9.216814445288734e-05, + "loss": 2.2566, + "step": 695 + }, + { + "epoch": 0.20555227406969875, + "grad_norm": 0.346595646517422, + "learning_rate": 9.214242301007923e-05, + "loss": 2.3466, + "step": 696 + }, + { + "epoch": 0.20584760779681038, + "grad_norm": 0.23818358780664214, + "learning_rate": 9.211666300053225e-05, + "loss": 2.35, + "step": 697 + }, + { + "epoch": 0.20614294152392204, + "grad_norm": 0.23317518689904831, + "learning_rate": 9.20908644478207e-05, + "loss": 2.3438, + "step": 698 + }, + { + "epoch": 0.20643827525103367, + "grad_norm": 0.2510503222660411, + "learning_rate": 9.206502737555424e-05, + "loss": 2.3672, + "step": 699 + }, + { + "epoch": 0.2067336089781453, + "grad_norm": 0.23137472178763424, + "learning_rate": 9.203915180737768e-05, + "loss": 2.3428, + "step": 700 + }, + { + "epoch": 0.20702894270525693, + "grad_norm": 0.23098169239475708, + "learning_rate": 9.201323776697111e-05, + "loss": 2.3056, + "step": 701 + }, + { + "epoch": 0.2073242764323686, + "grad_norm": 0.2441174839496659, + "learning_rate": 9.198728527804987e-05, + "loss": 2.3973, + "step": 702 + }, + { + "epoch": 0.20761961015948022, + "grad_norm": 0.2324825816654159, + "learning_rate": 9.196129436436438e-05, + "loss": 2.3839, + "step": 703 + }, + { + "epoch": 0.20791494388659185, + "grad_norm": 0.22715644428708254, + "learning_rate": 9.193526504970033e-05, + "loss": 2.3381, + "step": 704 + }, + { + "epoch": 0.20821027761370348, + "grad_norm": 0.23039435139485398, + "learning_rate": 9.190919735787852e-05, + "loss": 2.3243, + "step": 705 + }, + { + "epoch": 0.20850561134081513, + "grad_norm": 0.24191100734738438, + "learning_rate": 9.188309131275482e-05, + "loss": 2.36, + "step": 706 + }, + { + "epoch": 0.20880094506792676, + "grad_norm": 0.2256067687485837, + "learning_rate": 9.185694693822026e-05, + "loss": 2.2815, + "step": 707 + }, + { + "epoch": 0.2090962787950384, + "grad_norm": 0.22220995998740914, + "learning_rate": 9.183076425820092e-05, + "loss": 2.2966, + "step": 708 + }, + { + "epoch": 0.20939161252215002, + "grad_norm": 0.251895099434275, + "learning_rate": 9.180454329665795e-05, + "loss": 2.2478, + "step": 709 + }, + { + "epoch": 0.20968694624926165, + "grad_norm": 0.24155599030165847, + "learning_rate": 9.177828407758753e-05, + "loss": 2.3012, + "step": 710 + }, + { + "epoch": 0.2099822799763733, + "grad_norm": 0.22327682112992325, + "learning_rate": 9.175198662502084e-05, + "loss": 2.1531, + "step": 711 + }, + { + "epoch": 0.21027761370348494, + "grad_norm": 0.21878537675354381, + "learning_rate": 9.172565096302404e-05, + "loss": 2.3311, + "step": 712 + }, + { + "epoch": 0.21057294743059657, + "grad_norm": 0.2285542007229866, + "learning_rate": 9.16992771156983e-05, + "loss": 2.3318, + "step": 713 + }, + { + "epoch": 0.2108682811577082, + "grad_norm": 0.23099689260628328, + "learning_rate": 9.16728651071797e-05, + "loss": 2.3121, + "step": 714 + }, + { + "epoch": 0.21116361488481986, + "grad_norm": 0.2210051739926347, + "learning_rate": 9.164641496163925e-05, + "loss": 2.3543, + "step": 715 + }, + { + "epoch": 0.21145894861193149, + "grad_norm": 0.25572854985294163, + "learning_rate": 9.161992670328287e-05, + "loss": 2.2405, + "step": 716 + }, + { + "epoch": 0.21175428233904312, + "grad_norm": 0.24200407896466264, + "learning_rate": 9.159340035635133e-05, + "loss": 2.3707, + "step": 717 + }, + { + "epoch": 0.21204961606615474, + "grad_norm": 0.2854707499007571, + "learning_rate": 9.156683594512032e-05, + "loss": 2.3321, + "step": 718 + }, + { + "epoch": 0.2123449497932664, + "grad_norm": 0.3054900481465205, + "learning_rate": 9.154023349390029e-05, + "loss": 2.2528, + "step": 719 + }, + { + "epoch": 0.21264028352037803, + "grad_norm": 0.23379005712362913, + "learning_rate": 9.151359302703655e-05, + "loss": 2.3288, + "step": 720 + }, + { + "epoch": 0.21293561724748966, + "grad_norm": 0.2262118190615799, + "learning_rate": 9.148691456890918e-05, + "loss": 2.3534, + "step": 721 + }, + { + "epoch": 0.2132309509746013, + "grad_norm": 0.22913681272917966, + "learning_rate": 9.146019814393305e-05, + "loss": 2.355, + "step": 722 + }, + { + "epoch": 0.21352628470171295, + "grad_norm": 0.21679641104983613, + "learning_rate": 9.143344377655774e-05, + "loss": 2.3453, + "step": 723 + }, + { + "epoch": 0.21382161842882458, + "grad_norm": 0.2367477710799216, + "learning_rate": 9.140665149126759e-05, + "loss": 2.3615, + "step": 724 + }, + { + "epoch": 0.2141169521559362, + "grad_norm": 0.23295292171792048, + "learning_rate": 9.137982131258163e-05, + "loss": 2.3401, + "step": 725 + }, + { + "epoch": 0.21441228588304784, + "grad_norm": 0.23083221740544727, + "learning_rate": 9.135295326505353e-05, + "loss": 2.3597, + "step": 726 + }, + { + "epoch": 0.21470761961015947, + "grad_norm": 0.24943582384592483, + "learning_rate": 9.132604737327166e-05, + "loss": 2.3028, + "step": 727 + }, + { + "epoch": 0.21500295333727112, + "grad_norm": 0.2296739386103728, + "learning_rate": 9.129910366185903e-05, + "loss": 2.4076, + "step": 728 + }, + { + "epoch": 0.21529828706438275, + "grad_norm": 0.23404571130484786, + "learning_rate": 9.127212215547323e-05, + "loss": 2.2521, + "step": 729 + }, + { + "epoch": 0.21559362079149438, + "grad_norm": 0.2684942836454878, + "learning_rate": 9.124510287880645e-05, + "loss": 2.3647, + "step": 730 + }, + { + "epoch": 0.215888954518606, + "grad_norm": 0.22788542879213883, + "learning_rate": 9.121804585658545e-05, + "loss": 2.282, + "step": 731 + }, + { + "epoch": 0.21618428824571767, + "grad_norm": 0.23579196957387127, + "learning_rate": 9.11909511135715e-05, + "loss": 2.2798, + "step": 732 + }, + { + "epoch": 0.2164796219728293, + "grad_norm": 0.22149067857979202, + "learning_rate": 9.116381867456046e-05, + "loss": 2.3608, + "step": 733 + }, + { + "epoch": 0.21677495569994093, + "grad_norm": 0.2207262229648134, + "learning_rate": 9.113664856438262e-05, + "loss": 2.3234, + "step": 734 + }, + { + "epoch": 0.21707028942705256, + "grad_norm": 0.22535211243994638, + "learning_rate": 9.110944080790278e-05, + "loss": 2.3518, + "step": 735 + }, + { + "epoch": 0.21736562315416422, + "grad_norm": 0.2266810370166292, + "learning_rate": 9.10821954300202e-05, + "loss": 2.3673, + "step": 736 + }, + { + "epoch": 0.21766095688127585, + "grad_norm": 0.22405490438071118, + "learning_rate": 9.10549124556685e-05, + "loss": 2.3993, + "step": 737 + }, + { + "epoch": 0.21795629060838748, + "grad_norm": 1.1470647899118664, + "learning_rate": 9.10275919098158e-05, + "loss": 2.346, + "step": 738 + }, + { + "epoch": 0.2182516243354991, + "grad_norm": 0.2317611856631413, + "learning_rate": 9.100023381746459e-05, + "loss": 2.3814, + "step": 739 + }, + { + "epoch": 0.21854695806261076, + "grad_norm": 0.2343673227960154, + "learning_rate": 9.097283820365164e-05, + "loss": 2.3287, + "step": 740 + }, + { + "epoch": 0.2188422917897224, + "grad_norm": 0.22838285452704296, + "learning_rate": 9.094540509344812e-05, + "loss": 2.3151, + "step": 741 + }, + { + "epoch": 0.21913762551683402, + "grad_norm": 0.2911321803436316, + "learning_rate": 9.091793451195953e-05, + "loss": 2.2849, + "step": 742 + }, + { + "epoch": 0.21943295924394565, + "grad_norm": 0.2331376793816367, + "learning_rate": 9.089042648432562e-05, + "loss": 2.2956, + "step": 743 + }, + { + "epoch": 0.21972829297105728, + "grad_norm": 0.24029624731466917, + "learning_rate": 9.086288103572042e-05, + "loss": 2.4115, + "step": 744 + }, + { + "epoch": 0.22002362669816894, + "grad_norm": 0.2371624885534751, + "learning_rate": 9.083529819135225e-05, + "loss": 2.2989, + "step": 745 + }, + { + "epoch": 0.22031896042528057, + "grad_norm": 0.24064600190248206, + "learning_rate": 9.080767797646358e-05, + "loss": 2.2736, + "step": 746 + }, + { + "epoch": 0.2206142941523922, + "grad_norm": 0.24277673843361602, + "learning_rate": 9.078002041633112e-05, + "loss": 2.2769, + "step": 747 + }, + { + "epoch": 0.22090962787950383, + "grad_norm": 0.23748655375262975, + "learning_rate": 9.075232553626576e-05, + "loss": 2.3228, + "step": 748 + }, + { + "epoch": 0.22120496160661549, + "grad_norm": 0.43586238842948277, + "learning_rate": 9.072459336161254e-05, + "loss": 2.3175, + "step": 749 + }, + { + "epoch": 0.22150029533372712, + "grad_norm": 0.6264718130412346, + "learning_rate": 9.069682391775062e-05, + "loss": 2.4861, + "step": 750 + }, + { + "epoch": 0.22179562906083874, + "grad_norm": 0.22257154091455825, + "learning_rate": 9.066901723009328e-05, + "loss": 2.3804, + "step": 751 + }, + { + "epoch": 0.22209096278795037, + "grad_norm": 0.23764647685063298, + "learning_rate": 9.064117332408786e-05, + "loss": 2.3265, + "step": 752 + }, + { + "epoch": 0.22238629651506203, + "grad_norm": 0.30845326231670384, + "learning_rate": 9.06132922252158e-05, + "loss": 2.5038, + "step": 753 + }, + { + "epoch": 0.22268163024217366, + "grad_norm": 0.24804685998150638, + "learning_rate": 9.058537395899252e-05, + "loss": 2.2878, + "step": 754 + }, + { + "epoch": 0.2229769639692853, + "grad_norm": 0.31822174701480005, + "learning_rate": 9.05574185509675e-05, + "loss": 2.3317, + "step": 755 + }, + { + "epoch": 0.22327229769639692, + "grad_norm": 0.3815172040767372, + "learning_rate": 9.052942602672423e-05, + "loss": 2.2638, + "step": 756 + }, + { + "epoch": 0.22356763142350858, + "grad_norm": 0.24055633060495896, + "learning_rate": 9.050139641188011e-05, + "loss": 2.3187, + "step": 757 + }, + { + "epoch": 0.2238629651506202, + "grad_norm": 0.25596865293221305, + "learning_rate": 9.047332973208651e-05, + "loss": 2.371, + "step": 758 + }, + { + "epoch": 0.22415829887773184, + "grad_norm": 0.23812574252566016, + "learning_rate": 9.044522601302872e-05, + "loss": 2.2915, + "step": 759 + }, + { + "epoch": 0.22445363260484347, + "grad_norm": 0.22291975992201638, + "learning_rate": 9.041708528042592e-05, + "loss": 2.3435, + "step": 760 + }, + { + "epoch": 0.2247489663319551, + "grad_norm": 0.23014996998513454, + "learning_rate": 9.038890756003117e-05, + "loss": 2.255, + "step": 761 + }, + { + "epoch": 0.22504430005906675, + "grad_norm": 0.2355338297026091, + "learning_rate": 9.036069287763138e-05, + "loss": 2.3534, + "step": 762 + }, + { + "epoch": 0.22533963378617838, + "grad_norm": 0.2440160051816617, + "learning_rate": 9.03324412590473e-05, + "loss": 2.3743, + "step": 763 + }, + { + "epoch": 0.22563496751329, + "grad_norm": 0.2269423933659883, + "learning_rate": 9.030415273013344e-05, + "loss": 2.3026, + "step": 764 + }, + { + "epoch": 0.22593030124040164, + "grad_norm": 0.2212125839638269, + "learning_rate": 9.027582731677813e-05, + "loss": 2.2935, + "step": 765 + }, + { + "epoch": 0.2262256349675133, + "grad_norm": 0.38704305594606253, + "learning_rate": 9.024746504490343e-05, + "loss": 2.3681, + "step": 766 + }, + { + "epoch": 0.22652096869462493, + "grad_norm": 0.22310064539503963, + "learning_rate": 9.021906594046514e-05, + "loss": 2.2238, + "step": 767 + }, + { + "epoch": 0.22681630242173656, + "grad_norm": 0.3474402453140299, + "learning_rate": 9.019063002945279e-05, + "loss": 2.3755, + "step": 768 + }, + { + "epoch": 0.2271116361488482, + "grad_norm": 0.23094493358453608, + "learning_rate": 9.016215733788955e-05, + "loss": 2.365, + "step": 769 + }, + { + "epoch": 0.22740696987595985, + "grad_norm": 0.22545181349591575, + "learning_rate": 9.013364789183228e-05, + "loss": 2.3115, + "step": 770 + }, + { + "epoch": 0.22770230360307148, + "grad_norm": 0.2205664033990264, + "learning_rate": 9.010510171737147e-05, + "loss": 2.4849, + "step": 771 + }, + { + "epoch": 0.2279976373301831, + "grad_norm": 0.2273426190246556, + "learning_rate": 9.007651884063122e-05, + "loss": 2.3117, + "step": 772 + }, + { + "epoch": 0.22829297105729474, + "grad_norm": 0.25712871906463497, + "learning_rate": 9.00478992877692e-05, + "loss": 2.3692, + "step": 773 + }, + { + "epoch": 0.22858830478440637, + "grad_norm": 0.2345374100960686, + "learning_rate": 9.001924308497667e-05, + "loss": 2.3839, + "step": 774 + }, + { + "epoch": 0.22888363851151802, + "grad_norm": 0.24311184609728417, + "learning_rate": 8.999055025847844e-05, + "loss": 2.3841, + "step": 775 + }, + { + "epoch": 0.22917897223862965, + "grad_norm": 0.22416590157511823, + "learning_rate": 8.996182083453279e-05, + "loss": 2.3568, + "step": 776 + }, + { + "epoch": 0.22947430596574128, + "grad_norm": 0.2338894583077747, + "learning_rate": 8.993305483943155e-05, + "loss": 2.3171, + "step": 777 + }, + { + "epoch": 0.2297696396928529, + "grad_norm": 0.22756606666213117, + "learning_rate": 8.990425229949998e-05, + "loss": 2.335, + "step": 778 + }, + { + "epoch": 0.23006497341996457, + "grad_norm": 17.107525228202757, + "learning_rate": 8.987541324109678e-05, + "loss": 2.3615, + "step": 779 + }, + { + "epoch": 0.2303603071470762, + "grad_norm": 0.24092710065601775, + "learning_rate": 8.98465376906141e-05, + "loss": 2.306, + "step": 780 + }, + { + "epoch": 0.23065564087418783, + "grad_norm": 0.23102049328676963, + "learning_rate": 8.981762567447745e-05, + "loss": 2.2919, + "step": 781 + }, + { + "epoch": 0.23095097460129946, + "grad_norm": 0.24874456161276134, + "learning_rate": 8.978867721914574e-05, + "loss": 2.3066, + "step": 782 + }, + { + "epoch": 0.23124630832841112, + "grad_norm": 0.23140857903997525, + "learning_rate": 8.975969235111124e-05, + "loss": 2.3271, + "step": 783 + }, + { + "epoch": 0.23154164205552275, + "grad_norm": 0.23390567201021856, + "learning_rate": 8.973067109689948e-05, + "loss": 2.2213, + "step": 784 + }, + { + "epoch": 0.23183697578263437, + "grad_norm": 0.23543390440214257, + "learning_rate": 8.970161348306934e-05, + "loss": 2.3907, + "step": 785 + }, + { + "epoch": 0.232132309509746, + "grad_norm": 0.21893873757827415, + "learning_rate": 8.967251953621299e-05, + "loss": 2.2968, + "step": 786 + }, + { + "epoch": 0.23242764323685766, + "grad_norm": 0.2608399368680147, + "learning_rate": 8.964338928295581e-05, + "loss": 2.3531, + "step": 787 + }, + { + "epoch": 0.2327229769639693, + "grad_norm": 0.24702824924342667, + "learning_rate": 8.96142227499564e-05, + "loss": 2.3372, + "step": 788 + }, + { + "epoch": 0.23301831069108092, + "grad_norm": 0.23207013578204322, + "learning_rate": 8.958501996390664e-05, + "loss": 2.3141, + "step": 789 + }, + { + "epoch": 0.23331364441819255, + "grad_norm": 0.23323719088943143, + "learning_rate": 8.955578095153148e-05, + "loss": 2.3542, + "step": 790 + }, + { + "epoch": 0.23360897814530418, + "grad_norm": 0.6752169022158125, + "learning_rate": 8.952650573958908e-05, + "loss": 2.3147, + "step": 791 + }, + { + "epoch": 0.23390431187241584, + "grad_norm": 0.22086681777131367, + "learning_rate": 8.949719435487071e-05, + "loss": 2.3224, + "step": 792 + }, + { + "epoch": 0.23419964559952747, + "grad_norm": 0.2280785872654713, + "learning_rate": 8.946784682420077e-05, + "loss": 2.3693, + "step": 793 + }, + { + "epoch": 0.2344949793266391, + "grad_norm": 0.21811388543512225, + "learning_rate": 8.943846317443673e-05, + "loss": 2.3267, + "step": 794 + }, + { + "epoch": 0.23479031305375073, + "grad_norm": 0.21931930098447977, + "learning_rate": 8.94090434324691e-05, + "loss": 2.2945, + "step": 795 + }, + { + "epoch": 0.23508564678086238, + "grad_norm": 0.22537999020614707, + "learning_rate": 8.93795876252214e-05, + "loss": 2.2352, + "step": 796 + }, + { + "epoch": 0.235380980507974, + "grad_norm": 0.21747344356404832, + "learning_rate": 8.935009577965023e-05, + "loss": 2.3542, + "step": 797 + }, + { + "epoch": 0.23567631423508564, + "grad_norm": 0.21611189671558587, + "learning_rate": 8.932056792274509e-05, + "loss": 2.4005, + "step": 798 + }, + { + "epoch": 0.23597164796219727, + "grad_norm": 0.24815477442401773, + "learning_rate": 8.929100408152849e-05, + "loss": 2.3772, + "step": 799 + }, + { + "epoch": 0.23626698168930893, + "grad_norm": 0.22450851995670817, + "learning_rate": 8.926140428305582e-05, + "loss": 2.3605, + "step": 800 + }, + { + "epoch": 0.23656231541642056, + "grad_norm": 0.22177841072044227, + "learning_rate": 8.923176855441544e-05, + "loss": 2.3268, + "step": 801 + }, + { + "epoch": 0.2368576491435322, + "grad_norm": 0.24766311094485663, + "learning_rate": 8.920209692272851e-05, + "loss": 2.2784, + "step": 802 + }, + { + "epoch": 0.23715298287064382, + "grad_norm": 0.22530585402211822, + "learning_rate": 8.917238941514915e-05, + "loss": 2.3134, + "step": 803 + }, + { + "epoch": 0.23744831659775548, + "grad_norm": 0.23443381843025382, + "learning_rate": 8.914264605886424e-05, + "loss": 2.4006, + "step": 804 + }, + { + "epoch": 0.2377436503248671, + "grad_norm": 0.2256330878453965, + "learning_rate": 8.911286688109346e-05, + "loss": 2.2438, + "step": 805 + }, + { + "epoch": 0.23803898405197874, + "grad_norm": 0.21972966509853795, + "learning_rate": 8.90830519090893e-05, + "loss": 2.2541, + "step": 806 + }, + { + "epoch": 0.23833431777909037, + "grad_norm": 0.24010031146073388, + "learning_rate": 8.905320117013703e-05, + "loss": 2.3143, + "step": 807 + }, + { + "epoch": 0.238629651506202, + "grad_norm": 0.22351497560718098, + "learning_rate": 8.902331469155461e-05, + "loss": 2.331, + "step": 808 + }, + { + "epoch": 0.23892498523331365, + "grad_norm": 0.22678404696536048, + "learning_rate": 8.899339250069272e-05, + "loss": 2.3356, + "step": 809 + }, + { + "epoch": 0.23922031896042528, + "grad_norm": 0.250053042506835, + "learning_rate": 8.896343462493472e-05, + "loss": 2.3074, + "step": 810 + }, + { + "epoch": 0.2395156526875369, + "grad_norm": 0.22331041309781247, + "learning_rate": 8.893344109169664e-05, + "loss": 2.2922, + "step": 811 + }, + { + "epoch": 0.23981098641464854, + "grad_norm": 2.8843457452978774, + "learning_rate": 8.890341192842713e-05, + "loss": 2.2978, + "step": 812 + }, + { + "epoch": 0.2401063201417602, + "grad_norm": 0.23357737238190668, + "learning_rate": 8.887334716260745e-05, + "loss": 2.3504, + "step": 813 + }, + { + "epoch": 0.24040165386887183, + "grad_norm": 0.22574872175279348, + "learning_rate": 8.884324682175145e-05, + "loss": 2.2766, + "step": 814 + }, + { + "epoch": 0.24069698759598346, + "grad_norm": 0.23096299113352348, + "learning_rate": 8.88131109334055e-05, + "loss": 2.318, + "step": 815 + }, + { + "epoch": 0.2409923213230951, + "grad_norm": 0.22711269395920639, + "learning_rate": 8.878293952514856e-05, + "loss": 2.3436, + "step": 816 + }, + { + "epoch": 0.24128765505020675, + "grad_norm": 0.2219176926305029, + "learning_rate": 8.875273262459208e-05, + "loss": 2.2694, + "step": 817 + }, + { + "epoch": 0.24158298877731837, + "grad_norm": 0.2284660040949066, + "learning_rate": 8.872249025937992e-05, + "loss": 2.2904, + "step": 818 + }, + { + "epoch": 0.24187832250443, + "grad_norm": 0.25991785278510354, + "learning_rate": 8.86922124571885e-05, + "loss": 2.2974, + "step": 819 + }, + { + "epoch": 0.24217365623154163, + "grad_norm": 0.2275990828565386, + "learning_rate": 8.866189924572661e-05, + "loss": 2.336, + "step": 820 + }, + { + "epoch": 0.2424689899586533, + "grad_norm": 0.22434401385850664, + "learning_rate": 8.863155065273546e-05, + "loss": 2.3565, + "step": 821 + }, + { + "epoch": 0.24276432368576492, + "grad_norm": 0.2220451854088289, + "learning_rate": 8.860116670598864e-05, + "loss": 2.3449, + "step": 822 + }, + { + "epoch": 0.24305965741287655, + "grad_norm": 0.219153721395951, + "learning_rate": 8.857074743329209e-05, + "loss": 2.3511, + "step": 823 + }, + { + "epoch": 0.24335499113998818, + "grad_norm": 0.226543368156575, + "learning_rate": 8.854029286248404e-05, + "loss": 2.3409, + "step": 824 + }, + { + "epoch": 0.2436503248670998, + "grad_norm": 0.22675814074115505, + "learning_rate": 8.850980302143513e-05, + "loss": 2.289, + "step": 825 + }, + { + "epoch": 0.24394565859421147, + "grad_norm": 0.21972888521450287, + "learning_rate": 8.847927793804818e-05, + "loss": 2.2859, + "step": 826 + }, + { + "epoch": 0.2442409923213231, + "grad_norm": 0.22867287824884402, + "learning_rate": 8.844871764025829e-05, + "loss": 2.3756, + "step": 827 + }, + { + "epoch": 0.24453632604843473, + "grad_norm": 0.2344318638802665, + "learning_rate": 8.841812215603277e-05, + "loss": 2.2618, + "step": 828 + }, + { + "epoch": 0.24483165977554636, + "grad_norm": 0.22303262488011016, + "learning_rate": 8.83874915133712e-05, + "loss": 2.3133, + "step": 829 + }, + { + "epoch": 0.245126993502658, + "grad_norm": 0.22409083176631656, + "learning_rate": 8.835682574030525e-05, + "loss": 2.2265, + "step": 830 + }, + { + "epoch": 0.24542232722976964, + "grad_norm": 0.21592089522926405, + "learning_rate": 8.832612486489878e-05, + "loss": 2.3483, + "step": 831 + }, + { + "epoch": 0.24571766095688127, + "grad_norm": 0.22256238531588027, + "learning_rate": 8.829538891524778e-05, + "loss": 2.3182, + "step": 832 + }, + { + "epoch": 0.2460129946839929, + "grad_norm": 0.23152050776564434, + "learning_rate": 8.826461791948031e-05, + "loss": 2.3024, + "step": 833 + }, + { + "epoch": 0.24630832841110456, + "grad_norm": 0.22102266152791675, + "learning_rate": 8.823381190575654e-05, + "loss": 2.3111, + "step": 834 + }, + { + "epoch": 0.2466036621382162, + "grad_norm": 0.2250313733665659, + "learning_rate": 8.820297090226865e-05, + "loss": 2.3423, + "step": 835 + }, + { + "epoch": 0.24689899586532782, + "grad_norm": 0.21904897694334652, + "learning_rate": 8.817209493724088e-05, + "loss": 2.3392, + "step": 836 + }, + { + "epoch": 0.24719432959243945, + "grad_norm": 0.23103870617116648, + "learning_rate": 8.814118403892941e-05, + "loss": 2.2882, + "step": 837 + }, + { + "epoch": 0.2474896633195511, + "grad_norm": 0.21984998839202427, + "learning_rate": 8.811023823562244e-05, + "loss": 2.3281, + "step": 838 + }, + { + "epoch": 0.24778499704666274, + "grad_norm": 0.22223715071817804, + "learning_rate": 8.80792575556401e-05, + "loss": 2.3008, + "step": 839 + }, + { + "epoch": 0.24808033077377437, + "grad_norm": 0.2157975612057998, + "learning_rate": 8.804824202733443e-05, + "loss": 2.2892, + "step": 840 + }, + { + "epoch": 0.248375664500886, + "grad_norm": 0.25429880167850916, + "learning_rate": 8.801719167908937e-05, + "loss": 2.3284, + "step": 841 + }, + { + "epoch": 0.24867099822799762, + "grad_norm": 0.22230816284905813, + "learning_rate": 8.798610653932071e-05, + "loss": 2.3276, + "step": 842 + }, + { + "epoch": 0.24896633195510928, + "grad_norm": 0.36450522358085724, + "learning_rate": 8.79549866364761e-05, + "loss": 2.3063, + "step": 843 + }, + { + "epoch": 0.2492616656822209, + "grad_norm": 0.2137864539644035, + "learning_rate": 8.792383199903498e-05, + "loss": 2.3515, + "step": 844 + }, + { + "epoch": 0.24955699940933254, + "grad_norm": 0.22024902958066195, + "learning_rate": 8.78926426555086e-05, + "loss": 2.3404, + "step": 845 + }, + { + "epoch": 0.24985233313644417, + "grad_norm": 0.21953900279409494, + "learning_rate": 8.786141863443996e-05, + "loss": 2.3371, + "step": 846 + }, + { + "epoch": 0.25014766686355583, + "grad_norm": 0.2377815870502483, + "learning_rate": 8.783015996440377e-05, + "loss": 2.3769, + "step": 847 + }, + { + "epoch": 0.25044300059066743, + "grad_norm": 0.21915004968351062, + "learning_rate": 8.779886667400654e-05, + "loss": 2.3374, + "step": 848 + }, + { + "epoch": 0.2507383343177791, + "grad_norm": 0.24541236764114147, + "learning_rate": 8.776753879188635e-05, + "loss": 2.3294, + "step": 849 + }, + { + "epoch": 0.25103366804489075, + "grad_norm": 0.2253014723715268, + "learning_rate": 8.7736176346713e-05, + "loss": 2.138, + "step": 850 + }, + { + "epoch": 0.25132900177200235, + "grad_norm": 0.23984975962611899, + "learning_rate": 8.77047793671879e-05, + "loss": 2.3325, + "step": 851 + }, + { + "epoch": 0.251624335499114, + "grad_norm": 0.22460451535482479, + "learning_rate": 8.76733478820441e-05, + "loss": 2.2892, + "step": 852 + }, + { + "epoch": 0.25191966922622566, + "grad_norm": 0.21796220885822487, + "learning_rate": 8.764188192004616e-05, + "loss": 2.3199, + "step": 853 + }, + { + "epoch": 0.25221500295333726, + "grad_norm": 0.22099522022811321, + "learning_rate": 8.761038150999024e-05, + "loss": 2.2618, + "step": 854 + }, + { + "epoch": 0.2525103366804489, + "grad_norm": 0.22072299935307715, + "learning_rate": 8.757884668070402e-05, + "loss": 2.2639, + "step": 855 + }, + { + "epoch": 0.2528056704075605, + "grad_norm": 0.22291859848011322, + "learning_rate": 8.754727746104667e-05, + "loss": 2.3379, + "step": 856 + }, + { + "epoch": 0.2531010041346722, + "grad_norm": 0.2239489839879338, + "learning_rate": 8.751567387990884e-05, + "loss": 2.3023, + "step": 857 + }, + { + "epoch": 0.25339633786178384, + "grad_norm": 0.21855497181237268, + "learning_rate": 8.748403596621264e-05, + "loss": 2.3332, + "step": 858 + }, + { + "epoch": 0.25369167158889544, + "grad_norm": 0.22984205320550133, + "learning_rate": 8.745236374891155e-05, + "loss": 2.3259, + "step": 859 + }, + { + "epoch": 0.2539870053160071, + "grad_norm": 0.218079851746479, + "learning_rate": 8.74206572569905e-05, + "loss": 2.2808, + "step": 860 + }, + { + "epoch": 0.2542823390431187, + "grad_norm": 0.22101877646493398, + "learning_rate": 8.738891651946575e-05, + "loss": 2.4046, + "step": 861 + }, + { + "epoch": 0.25457767277023036, + "grad_norm": 0.22839459857875416, + "learning_rate": 8.735714156538491e-05, + "loss": 2.3851, + "step": 862 + }, + { + "epoch": 0.254873006497342, + "grad_norm": 0.22322214725819192, + "learning_rate": 8.732533242382692e-05, + "loss": 2.2917, + "step": 863 + }, + { + "epoch": 0.2551683402244536, + "grad_norm": 0.21736315066796147, + "learning_rate": 8.729348912390198e-05, + "loss": 2.26, + "step": 864 + }, + { + "epoch": 0.2554636739515653, + "grad_norm": 0.24931028711958314, + "learning_rate": 8.726161169475155e-05, + "loss": 2.2941, + "step": 865 + }, + { + "epoch": 0.25575900767867693, + "grad_norm": 0.22696360893787465, + "learning_rate": 8.722970016554834e-05, + "loss": 2.333, + "step": 866 + }, + { + "epoch": 0.25605434140578853, + "grad_norm": 0.22414251819341577, + "learning_rate": 8.719775456549631e-05, + "loss": 2.3932, + "step": 867 + }, + { + "epoch": 0.2563496751329002, + "grad_norm": 0.21749565129755866, + "learning_rate": 8.716577492383046e-05, + "loss": 2.3795, + "step": 868 + }, + { + "epoch": 0.2566450088600118, + "grad_norm": 0.2333900897077449, + "learning_rate": 8.713376126981712e-05, + "loss": 2.2857, + "step": 869 + }, + { + "epoch": 0.25694034258712345, + "grad_norm": 0.23233729514498153, + "learning_rate": 8.710171363275363e-05, + "loss": 2.2591, + "step": 870 + }, + { + "epoch": 0.2572356763142351, + "grad_norm": 0.20752744771779702, + "learning_rate": 8.706963204196845e-05, + "loss": 2.089, + "step": 871 + }, + { + "epoch": 0.2575310100413467, + "grad_norm": 0.21898424855560997, + "learning_rate": 8.703751652682114e-05, + "loss": 2.3684, + "step": 872 + }, + { + "epoch": 0.25782634376845837, + "grad_norm": 0.21817900009721766, + "learning_rate": 8.700536711670228e-05, + "loss": 2.29, + "step": 873 + }, + { + "epoch": 0.25812167749556997, + "grad_norm": 0.22393931078890708, + "learning_rate": 8.69731838410335e-05, + "loss": 2.4286, + "step": 874 + }, + { + "epoch": 0.2584170112226816, + "grad_norm": 0.22800253461539466, + "learning_rate": 8.694096672926738e-05, + "loss": 2.2877, + "step": 875 + }, + { + "epoch": 0.2587123449497933, + "grad_norm": 0.2150078753481713, + "learning_rate": 8.690871581088747e-05, + "loss": 2.3197, + "step": 876 + }, + { + "epoch": 0.2590076786769049, + "grad_norm": 0.21328306337081732, + "learning_rate": 8.68764311154083e-05, + "loss": 2.2989, + "step": 877 + }, + { + "epoch": 0.25930301240401654, + "grad_norm": 0.2223594175999059, + "learning_rate": 8.68441126723753e-05, + "loss": 2.2037, + "step": 878 + }, + { + "epoch": 0.2595983461311282, + "grad_norm": 0.21152594398429467, + "learning_rate": 8.681176051136477e-05, + "loss": 2.2383, + "step": 879 + }, + { + "epoch": 0.2598936798582398, + "grad_norm": 0.2258942713873613, + "learning_rate": 8.677937466198383e-05, + "loss": 2.2468, + "step": 880 + }, + { + "epoch": 0.26018901358535146, + "grad_norm": 0.22306728069273304, + "learning_rate": 8.674695515387048e-05, + "loss": 2.2319, + "step": 881 + }, + { + "epoch": 0.26048434731246306, + "grad_norm": 0.2153403784416406, + "learning_rate": 8.671450201669354e-05, + "loss": 2.3097, + "step": 882 + }, + { + "epoch": 0.2607796810395747, + "grad_norm": 0.2145858256803292, + "learning_rate": 8.668201528015254e-05, + "loss": 2.3083, + "step": 883 + }, + { + "epoch": 0.2610750147666864, + "grad_norm": 0.21481936056243378, + "learning_rate": 8.664949497397781e-05, + "loss": 2.3267, + "step": 884 + }, + { + "epoch": 0.261370348493798, + "grad_norm": 0.21570500361623765, + "learning_rate": 8.661694112793039e-05, + "loss": 2.4063, + "step": 885 + }, + { + "epoch": 0.26166568222090963, + "grad_norm": 0.21764897095392935, + "learning_rate": 8.6584353771802e-05, + "loss": 2.2773, + "step": 886 + }, + { + "epoch": 0.26196101594802124, + "grad_norm": 0.22240349996843609, + "learning_rate": 8.655173293541503e-05, + "loss": 2.3434, + "step": 887 + }, + { + "epoch": 0.2622563496751329, + "grad_norm": 0.46675902975365674, + "learning_rate": 8.651907864862254e-05, + "loss": 2.2575, + "step": 888 + }, + { + "epoch": 0.26255168340224455, + "grad_norm": 0.22651605294405514, + "learning_rate": 8.648639094130817e-05, + "loss": 2.3025, + "step": 889 + }, + { + "epoch": 0.26284701712935615, + "grad_norm": 0.2631549855043196, + "learning_rate": 8.645366984338613e-05, + "loss": 2.3339, + "step": 890 + }, + { + "epoch": 0.2631423508564678, + "grad_norm": 0.24266297826631902, + "learning_rate": 8.642091538480123e-05, + "loss": 2.3794, + "step": 891 + }, + { + "epoch": 0.26343768458357947, + "grad_norm": 0.24184356386596756, + "learning_rate": 8.638812759552882e-05, + "loss": 2.2397, + "step": 892 + }, + { + "epoch": 0.26373301831069107, + "grad_norm": 0.25415012678412996, + "learning_rate": 8.635530650557466e-05, + "loss": 2.483, + "step": 893 + }, + { + "epoch": 0.2640283520378027, + "grad_norm": 0.22850448222584743, + "learning_rate": 8.632245214497511e-05, + "loss": 2.4365, + "step": 894 + }, + { + "epoch": 0.26432368576491433, + "grad_norm": 0.21801522009109386, + "learning_rate": 8.628956454379687e-05, + "loss": 2.3225, + "step": 895 + }, + { + "epoch": 0.264619019492026, + "grad_norm": 0.23276496056211365, + "learning_rate": 8.625664373213713e-05, + "loss": 2.3742, + "step": 896 + }, + { + "epoch": 0.26491435321913764, + "grad_norm": 0.2276059081158625, + "learning_rate": 8.622368974012345e-05, + "loss": 2.233, + "step": 897 + }, + { + "epoch": 0.26520968694624925, + "grad_norm": 0.22404560155737416, + "learning_rate": 8.619070259791374e-05, + "loss": 2.418, + "step": 898 + }, + { + "epoch": 0.2655050206733609, + "grad_norm": 0.22827278177637236, + "learning_rate": 8.615768233569628e-05, + "loss": 2.4078, + "step": 899 + }, + { + "epoch": 0.26580035440047256, + "grad_norm": 0.22861177915709457, + "learning_rate": 8.612462898368963e-05, + "loss": 2.4145, + "step": 900 + }, + { + "epoch": 0.26609568812758416, + "grad_norm": 0.22772735368953115, + "learning_rate": 8.609154257214264e-05, + "loss": 2.3653, + "step": 901 + }, + { + "epoch": 0.2663910218546958, + "grad_norm": 0.22528949266917753, + "learning_rate": 8.605842313133444e-05, + "loss": 2.43, + "step": 902 + }, + { + "epoch": 0.2666863555818074, + "grad_norm": 0.3228593731885588, + "learning_rate": 8.602527069157433e-05, + "loss": 2.3318, + "step": 903 + }, + { + "epoch": 0.2669816893089191, + "grad_norm": 0.22878712233036141, + "learning_rate": 8.599208528320187e-05, + "loss": 2.3484, + "step": 904 + }, + { + "epoch": 0.26727702303603074, + "grad_norm": 0.21759034915191028, + "learning_rate": 8.595886693658677e-05, + "loss": 2.3507, + "step": 905 + }, + { + "epoch": 0.26757235676314234, + "grad_norm": 0.20783457459538224, + "learning_rate": 8.592561568212887e-05, + "loss": 2.2996, + "step": 906 + }, + { + "epoch": 0.267867690490254, + "grad_norm": 0.21884259750463658, + "learning_rate": 8.589233155025811e-05, + "loss": 2.3326, + "step": 907 + }, + { + "epoch": 0.2681630242173656, + "grad_norm": 0.21543608419928653, + "learning_rate": 8.585901457143459e-05, + "loss": 2.2755, + "step": 908 + }, + { + "epoch": 0.26845835794447725, + "grad_norm": 0.2301495653257755, + "learning_rate": 8.58256647761484e-05, + "loss": 2.3381, + "step": 909 + }, + { + "epoch": 0.2687536916715889, + "grad_norm": 0.22418849322234877, + "learning_rate": 8.579228219491968e-05, + "loss": 2.3079, + "step": 910 + }, + { + "epoch": 0.2690490253987005, + "grad_norm": 0.2303548871040262, + "learning_rate": 8.575886685829856e-05, + "loss": 2.3235, + "step": 911 + }, + { + "epoch": 0.26934435912581217, + "grad_norm": 0.21862048333978587, + "learning_rate": 8.572541879686523e-05, + "loss": 2.3407, + "step": 912 + }, + { + "epoch": 0.26963969285292383, + "grad_norm": 0.22361670356935084, + "learning_rate": 8.569193804122968e-05, + "loss": 2.3436, + "step": 913 + }, + { + "epoch": 0.26993502658003543, + "grad_norm": 0.22787718147692154, + "learning_rate": 8.565842462203197e-05, + "loss": 2.3328, + "step": 914 + }, + { + "epoch": 0.2702303603071471, + "grad_norm": 0.24953104530330855, + "learning_rate": 8.562487856994194e-05, + "loss": 2.3557, + "step": 915 + }, + { + "epoch": 0.2705256940342587, + "grad_norm": 0.22171018617528845, + "learning_rate": 8.559129991565938e-05, + "loss": 2.3187, + "step": 916 + }, + { + "epoch": 0.27082102776137035, + "grad_norm": 0.2532293540224747, + "learning_rate": 8.555768868991383e-05, + "loss": 2.3721, + "step": 917 + }, + { + "epoch": 0.271116361488482, + "grad_norm": 0.2264676799983133, + "learning_rate": 8.552404492346473e-05, + "loss": 2.3844, + "step": 918 + }, + { + "epoch": 0.2714116952155936, + "grad_norm": 0.22753428179912716, + "learning_rate": 8.549036864710123e-05, + "loss": 2.2628, + "step": 919 + }, + { + "epoch": 0.27170702894270526, + "grad_norm": 0.2268556638364573, + "learning_rate": 8.545665989164224e-05, + "loss": 2.3512, + "step": 920 + }, + { + "epoch": 0.27200236266981687, + "grad_norm": 0.2247454852805964, + "learning_rate": 8.542291868793644e-05, + "loss": 2.3873, + "step": 921 + }, + { + "epoch": 0.2722976963969285, + "grad_norm": 0.21741216140963715, + "learning_rate": 8.538914506686215e-05, + "loss": 2.3949, + "step": 922 + }, + { + "epoch": 0.2725930301240402, + "grad_norm": 0.2184785197112524, + "learning_rate": 8.535533905932738e-05, + "loss": 2.2821, + "step": 923 + }, + { + "epoch": 0.2728883638511518, + "grad_norm": 0.23236956929432018, + "learning_rate": 8.532150069626978e-05, + "loss": 2.3821, + "step": 924 + }, + { + "epoch": 0.27318369757826344, + "grad_norm": 0.21681532051042063, + "learning_rate": 8.528763000865662e-05, + "loss": 2.3326, + "step": 925 + }, + { + "epoch": 0.2734790313053751, + "grad_norm": 0.2269947844333617, + "learning_rate": 8.525372702748476e-05, + "loss": 2.3446, + "step": 926 + }, + { + "epoch": 0.2737743650324867, + "grad_norm": 0.23227235648944325, + "learning_rate": 8.521979178378055e-05, + "loss": 2.3257, + "step": 927 + }, + { + "epoch": 0.27406969875959836, + "grad_norm": 0.21265950913072507, + "learning_rate": 8.518582430859991e-05, + "loss": 2.3166, + "step": 928 + }, + { + "epoch": 0.27436503248670996, + "grad_norm": 0.23516756201622527, + "learning_rate": 8.515182463302828e-05, + "loss": 2.446, + "step": 929 + }, + { + "epoch": 0.2746603662138216, + "grad_norm": 0.22732667465581088, + "learning_rate": 8.511779278818055e-05, + "loss": 2.3004, + "step": 930 + }, + { + "epoch": 0.2749556999409333, + "grad_norm": 0.21635380567362286, + "learning_rate": 8.508372880520102e-05, + "loss": 2.2707, + "step": 931 + }, + { + "epoch": 0.2752510336680449, + "grad_norm": 0.22577856352636694, + "learning_rate": 8.504963271526343e-05, + "loss": 2.3307, + "step": 932 + }, + { + "epoch": 0.27554636739515653, + "grad_norm": 0.22801385713021458, + "learning_rate": 8.501550454957092e-05, + "loss": 2.2716, + "step": 933 + }, + { + "epoch": 0.2758417011222682, + "grad_norm": 0.28510916334202796, + "learning_rate": 8.498134433935594e-05, + "loss": 2.2795, + "step": 934 + }, + { + "epoch": 0.2761370348493798, + "grad_norm": 0.2521723451516042, + "learning_rate": 8.49471521158803e-05, + "loss": 2.3819, + "step": 935 + }, + { + "epoch": 0.27643236857649145, + "grad_norm": 0.21933749190719798, + "learning_rate": 8.491292791043508e-05, + "loss": 2.2775, + "step": 936 + }, + { + "epoch": 0.27672770230360305, + "grad_norm": 0.22505092342607205, + "learning_rate": 8.487867175434065e-05, + "loss": 2.3699, + "step": 937 + }, + { + "epoch": 0.2770230360307147, + "grad_norm": 0.22703312630872408, + "learning_rate": 8.484438367894662e-05, + "loss": 2.2689, + "step": 938 + }, + { + "epoch": 0.27731836975782637, + "grad_norm": 0.22219182819062092, + "learning_rate": 8.48100637156318e-05, + "loss": 2.388, + "step": 939 + }, + { + "epoch": 0.27761370348493797, + "grad_norm": 0.23414951445054485, + "learning_rate": 8.47757118958042e-05, + "loss": 2.354, + "step": 940 + }, + { + "epoch": 0.2779090372120496, + "grad_norm": 0.22635372177767288, + "learning_rate": 8.474132825090093e-05, + "loss": 2.302, + "step": 941 + }, + { + "epoch": 0.2782043709391612, + "grad_norm": 0.22517674974227792, + "learning_rate": 8.470691281238832e-05, + "loss": 2.2648, + "step": 942 + }, + { + "epoch": 0.2784997046662729, + "grad_norm": 0.23442639741004095, + "learning_rate": 8.467246561176169e-05, + "loss": 2.3298, + "step": 943 + }, + { + "epoch": 0.27879503839338454, + "grad_norm": 0.22415275599406184, + "learning_rate": 8.46379866805455e-05, + "loss": 2.3701, + "step": 944 + }, + { + "epoch": 0.27909037212049614, + "grad_norm": 0.22208899467211352, + "learning_rate": 8.460347605029322e-05, + "loss": 2.2878, + "step": 945 + }, + { + "epoch": 0.2793857058476078, + "grad_norm": 0.2163384978761503, + "learning_rate": 8.456893375258734e-05, + "loss": 2.3041, + "step": 946 + }, + { + "epoch": 0.27968103957471946, + "grad_norm": 0.23012061414885412, + "learning_rate": 8.453435981903932e-05, + "loss": 2.3741, + "step": 947 + }, + { + "epoch": 0.27997637330183106, + "grad_norm": 0.27178752906127907, + "learning_rate": 8.44997542812896e-05, + "loss": 2.3258, + "step": 948 + }, + { + "epoch": 0.2802717070289427, + "grad_norm": 0.21817541442952815, + "learning_rate": 8.446511717100751e-05, + "loss": 2.4205, + "step": 949 + }, + { + "epoch": 0.2805670407560543, + "grad_norm": 0.2261307647712524, + "learning_rate": 8.443044851989126e-05, + "loss": 2.3807, + "step": 950 + }, + { + "epoch": 0.280862374483166, + "grad_norm": 0.22275570271263878, + "learning_rate": 8.439574835966797e-05, + "loss": 2.3508, + "step": 951 + }, + { + "epoch": 0.28115770821027763, + "grad_norm": 0.2510478144931006, + "learning_rate": 8.436101672209357e-05, + "loss": 2.2465, + "step": 952 + }, + { + "epoch": 0.28145304193738924, + "grad_norm": 0.22515253651645337, + "learning_rate": 8.432625363895281e-05, + "loss": 2.3269, + "step": 953 + }, + { + "epoch": 0.2817483756645009, + "grad_norm": 0.22997186729722563, + "learning_rate": 8.42914591420592e-05, + "loss": 2.3454, + "step": 954 + }, + { + "epoch": 0.2820437093916125, + "grad_norm": 0.2654529182423846, + "learning_rate": 8.425663326325499e-05, + "loss": 2.4404, + "step": 955 + }, + { + "epoch": 0.28233904311872415, + "grad_norm": 0.21771906700018118, + "learning_rate": 8.422177603441122e-05, + "loss": 2.3019, + "step": 956 + }, + { + "epoch": 0.2826343768458358, + "grad_norm": 0.22946539377487826, + "learning_rate": 8.418688748742748e-05, + "loss": 2.262, + "step": 957 + }, + { + "epoch": 0.2829297105729474, + "grad_norm": 0.21576393910164507, + "learning_rate": 8.415196765423219e-05, + "loss": 2.3415, + "step": 958 + }, + { + "epoch": 0.28322504430005907, + "grad_norm": 0.23326903710459163, + "learning_rate": 8.411701656678228e-05, + "loss": 2.3565, + "step": 959 + }, + { + "epoch": 0.2835203780271707, + "grad_norm": 0.22372430487358622, + "learning_rate": 8.408203425706333e-05, + "loss": 2.3405, + "step": 960 + }, + { + "epoch": 0.28381571175428233, + "grad_norm": 0.21467615873595586, + "learning_rate": 8.404702075708946e-05, + "loss": 2.3809, + "step": 961 + }, + { + "epoch": 0.284111045481394, + "grad_norm": 0.762503195583964, + "learning_rate": 8.401197609890338e-05, + "loss": 2.3185, + "step": 962 + }, + { + "epoch": 0.2844063792085056, + "grad_norm": 0.237092437327713, + "learning_rate": 8.397690031457632e-05, + "loss": 2.3033, + "step": 963 + }, + { + "epoch": 0.28470171293561725, + "grad_norm": 0.24334198597216833, + "learning_rate": 8.394179343620792e-05, + "loss": 2.391, + "step": 964 + }, + { + "epoch": 0.2849970466627289, + "grad_norm": 0.3065807259728631, + "learning_rate": 8.390665549592633e-05, + "loss": 2.2553, + "step": 965 + }, + { + "epoch": 0.2852923803898405, + "grad_norm": 0.23263873073389046, + "learning_rate": 8.387148652588815e-05, + "loss": 2.3253, + "step": 966 + }, + { + "epoch": 0.28558771411695216, + "grad_norm": 0.2243550892924071, + "learning_rate": 8.383628655827832e-05, + "loss": 2.303, + "step": 967 + }, + { + "epoch": 0.28588304784406376, + "grad_norm": 0.22609830031861197, + "learning_rate": 8.380105562531019e-05, + "loss": 2.3006, + "step": 968 + }, + { + "epoch": 0.2861783815711754, + "grad_norm": 0.2335547478435392, + "learning_rate": 8.376579375922542e-05, + "loss": 2.2735, + "step": 969 + }, + { + "epoch": 0.2864737152982871, + "grad_norm": 0.2285279327729792, + "learning_rate": 8.373050099229398e-05, + "loss": 2.0304, + "step": 970 + }, + { + "epoch": 0.2867690490253987, + "grad_norm": 0.2416191584995491, + "learning_rate": 8.369517735681418e-05, + "loss": 2.2257, + "step": 971 + }, + { + "epoch": 0.28706438275251034, + "grad_norm": 0.23095804742504877, + "learning_rate": 8.365982288511246e-05, + "loss": 2.3534, + "step": 972 + }, + { + "epoch": 0.287359716479622, + "grad_norm": 0.22704075616833783, + "learning_rate": 8.36244376095436e-05, + "loss": 2.2899, + "step": 973 + }, + { + "epoch": 0.2876550502067336, + "grad_norm": 0.25447672799173277, + "learning_rate": 8.35890215624905e-05, + "loss": 2.3202, + "step": 974 + }, + { + "epoch": 0.28795038393384526, + "grad_norm": 0.24797452787699897, + "learning_rate": 8.355357477636424e-05, + "loss": 2.2412, + "step": 975 + }, + { + "epoch": 0.28824571766095686, + "grad_norm": 0.22919257064559181, + "learning_rate": 8.351809728360403e-05, + "loss": 2.3824, + "step": 976 + }, + { + "epoch": 0.2885410513880685, + "grad_norm": 0.23604220441744075, + "learning_rate": 8.348258911667719e-05, + "loss": 2.295, + "step": 977 + }, + { + "epoch": 0.28883638511518017, + "grad_norm": 0.2418377411900936, + "learning_rate": 8.344705030807912e-05, + "loss": 2.3481, + "step": 978 + }, + { + "epoch": 0.2891317188422918, + "grad_norm": 0.2178679889644105, + "learning_rate": 8.341148089033319e-05, + "loss": 2.2649, + "step": 979 + }, + { + "epoch": 0.28942705256940343, + "grad_norm": 0.2825242027321086, + "learning_rate": 8.337588089599088e-05, + "loss": 2.4041, + "step": 980 + }, + { + "epoch": 0.2897223862965151, + "grad_norm": 0.2268038565769699, + "learning_rate": 8.334025035763162e-05, + "loss": 2.2954, + "step": 981 + }, + { + "epoch": 0.2900177200236267, + "grad_norm": 0.23088308848343023, + "learning_rate": 8.330458930786276e-05, + "loss": 2.3363, + "step": 982 + }, + { + "epoch": 0.29031305375073835, + "grad_norm": 0.22712474971730157, + "learning_rate": 8.326889777931959e-05, + "loss": 2.322, + "step": 983 + }, + { + "epoch": 0.29060838747784995, + "grad_norm": 0.2391098265457601, + "learning_rate": 8.32331758046653e-05, + "loss": 2.2823, + "step": 984 + }, + { + "epoch": 0.2909037212049616, + "grad_norm": 0.2327509641929552, + "learning_rate": 8.319742341659096e-05, + "loss": 2.3148, + "step": 985 + }, + { + "epoch": 0.29119905493207326, + "grad_norm": 0.24187205906738513, + "learning_rate": 8.316164064781544e-05, + "loss": 2.3501, + "step": 986 + }, + { + "epoch": 0.29149438865918487, + "grad_norm": 0.22092616365099735, + "learning_rate": 8.312582753108542e-05, + "loss": 2.3356, + "step": 987 + }, + { + "epoch": 0.2917897223862965, + "grad_norm": 0.22465390303040222, + "learning_rate": 8.308998409917536e-05, + "loss": 2.333, + "step": 988 + }, + { + "epoch": 0.2920850561134081, + "grad_norm": 0.2272412475410169, + "learning_rate": 8.305411038488748e-05, + "loss": 2.3266, + "step": 989 + }, + { + "epoch": 0.2923803898405198, + "grad_norm": 0.25333866355855783, + "learning_rate": 8.301820642105168e-05, + "loss": 2.4075, + "step": 990 + }, + { + "epoch": 0.29267572356763144, + "grad_norm": 0.22046963823330154, + "learning_rate": 8.298227224052553e-05, + "loss": 2.3937, + "step": 991 + }, + { + "epoch": 0.29297105729474304, + "grad_norm": 0.22732190587604, + "learning_rate": 8.294630787619433e-05, + "loss": 2.3004, + "step": 992 + }, + { + "epoch": 0.2932663910218547, + "grad_norm": 0.23591716754536315, + "learning_rate": 8.291031336097096e-05, + "loss": 2.2859, + "step": 993 + }, + { + "epoch": 0.29356172474896636, + "grad_norm": 0.22061575789180732, + "learning_rate": 8.287428872779583e-05, + "loss": 2.3119, + "step": 994 + }, + { + "epoch": 0.29385705847607796, + "grad_norm": 0.20982727409324908, + "learning_rate": 8.283823400963702e-05, + "loss": 2.2529, + "step": 995 + }, + { + "epoch": 0.2941523922031896, + "grad_norm": 0.21814964711265428, + "learning_rate": 8.280214923949005e-05, + "loss": 2.2652, + "step": 996 + }, + { + "epoch": 0.2944477259303012, + "grad_norm": 0.2660133244083088, + "learning_rate": 8.276603445037803e-05, + "loss": 2.366, + "step": 997 + }, + { + "epoch": 0.2947430596574129, + "grad_norm": 0.22938371895114398, + "learning_rate": 8.272988967535146e-05, + "loss": 2.3392, + "step": 998 + }, + { + "epoch": 0.29503839338452453, + "grad_norm": 0.22411316502327916, + "learning_rate": 8.269371494748833e-05, + "loss": 2.2313, + "step": 999 + }, + { + "epoch": 0.29533372711163614, + "grad_norm": 0.23449753101468443, + "learning_rate": 8.265751029989402e-05, + "loss": 2.3568, + "step": 1000 + }, + { + "epoch": 0.2956290608387478, + "grad_norm": 0.23383042328963968, + "learning_rate": 8.262127576570131e-05, + "loss": 2.2248, + "step": 1001 + }, + { + "epoch": 0.2959243945658594, + "grad_norm": 0.23228417653180541, + "learning_rate": 8.25850113780703e-05, + "loss": 2.3577, + "step": 1002 + }, + { + "epoch": 0.29621972829297105, + "grad_norm": 0.23379858980708035, + "learning_rate": 8.254871717018847e-05, + "loss": 2.2675, + "step": 1003 + }, + { + "epoch": 0.2965150620200827, + "grad_norm": 0.227046859871421, + "learning_rate": 8.251239317527051e-05, + "loss": 2.2653, + "step": 1004 + }, + { + "epoch": 0.2968103957471943, + "grad_norm": 0.2228056339607774, + "learning_rate": 8.247603942655844e-05, + "loss": 2.2706, + "step": 1005 + }, + { + "epoch": 0.29710572947430597, + "grad_norm": 0.21941258355787102, + "learning_rate": 8.243965595732146e-05, + "loss": 2.3378, + "step": 1006 + }, + { + "epoch": 0.2974010632014176, + "grad_norm": 0.22931909729665462, + "learning_rate": 8.2403242800856e-05, + "loss": 2.3273, + "step": 1007 + }, + { + "epoch": 0.2976963969285292, + "grad_norm": 0.23167428227269504, + "learning_rate": 8.236679999048566e-05, + "loss": 2.3604, + "step": 1008 + }, + { + "epoch": 0.2979917306556409, + "grad_norm": 0.2213566480759305, + "learning_rate": 8.233032755956112e-05, + "loss": 2.2485, + "step": 1009 + }, + { + "epoch": 0.2982870643827525, + "grad_norm": 0.21700194686244845, + "learning_rate": 8.229382554146026e-05, + "loss": 2.3554, + "step": 1010 + }, + { + "epoch": 0.29858239810986414, + "grad_norm": 0.225166202791107, + "learning_rate": 8.225729396958794e-05, + "loss": 2.3023, + "step": 1011 + }, + { + "epoch": 0.2988777318369758, + "grad_norm": 0.2277803737714067, + "learning_rate": 8.222073287737617e-05, + "loss": 2.3399, + "step": 1012 + }, + { + "epoch": 0.2991730655640874, + "grad_norm": 0.22452234987207656, + "learning_rate": 8.21841422982839e-05, + "loss": 2.4061, + "step": 1013 + }, + { + "epoch": 0.29946839929119906, + "grad_norm": 0.24643365909525303, + "learning_rate": 8.214752226579707e-05, + "loss": 2.3264, + "step": 1014 + }, + { + "epoch": 0.2997637330183107, + "grad_norm": 0.2216987601658502, + "learning_rate": 8.21108728134286e-05, + "loss": 2.3767, + "step": 1015 + }, + { + "epoch": 0.3000590667454223, + "grad_norm": 0.21378262036691076, + "learning_rate": 8.207419397471831e-05, + "loss": 2.3675, + "step": 1016 + }, + { + "epoch": 0.300354400472534, + "grad_norm": 0.22995894021510974, + "learning_rate": 8.203748578323296e-05, + "loss": 2.3418, + "step": 1017 + }, + { + "epoch": 0.3006497341996456, + "grad_norm": 0.22124057922134346, + "learning_rate": 8.200074827256612e-05, + "loss": 2.3409, + "step": 1018 + }, + { + "epoch": 0.30094506792675724, + "grad_norm": 0.255051845464478, + "learning_rate": 8.19639814763382e-05, + "loss": 2.3388, + "step": 1019 + }, + { + "epoch": 0.3012404016538689, + "grad_norm": 0.2356461545063868, + "learning_rate": 8.192718542819644e-05, + "loss": 2.2959, + "step": 1020 + }, + { + "epoch": 0.3015357353809805, + "grad_norm": 0.2190626243585514, + "learning_rate": 8.189036016181482e-05, + "loss": 2.3448, + "step": 1021 + }, + { + "epoch": 0.30183106910809215, + "grad_norm": 0.22423219939510766, + "learning_rate": 8.185350571089408e-05, + "loss": 2.3157, + "step": 1022 + }, + { + "epoch": 0.30212640283520376, + "grad_norm": 0.2448176723559054, + "learning_rate": 8.181662210916165e-05, + "loss": 2.2897, + "step": 1023 + }, + { + "epoch": 0.3024217365623154, + "grad_norm": 0.2447475975355119, + "learning_rate": 8.177970939037164e-05, + "loss": 2.2849, + "step": 1024 + }, + { + "epoch": 0.30271707028942707, + "grad_norm": 0.24184149205688846, + "learning_rate": 8.174276758830481e-05, + "loss": 2.4467, + "step": 1025 + }, + { + "epoch": 0.3030124040165387, + "grad_norm": 0.22021118973028592, + "learning_rate": 8.170579673676857e-05, + "loss": 2.3211, + "step": 1026 + }, + { + "epoch": 0.30330773774365033, + "grad_norm": 0.2283621296959548, + "learning_rate": 8.166879686959686e-05, + "loss": 2.3399, + "step": 1027 + }, + { + "epoch": 0.303603071470762, + "grad_norm": 0.23623192618378977, + "learning_rate": 8.163176802065017e-05, + "loss": 2.3129, + "step": 1028 + }, + { + "epoch": 0.3038984051978736, + "grad_norm": 0.2358335569187448, + "learning_rate": 8.159471022381561e-05, + "loss": 2.2631, + "step": 1029 + }, + { + "epoch": 0.30419373892498525, + "grad_norm": 0.23171883528406242, + "learning_rate": 8.155762351300664e-05, + "loss": 2.3149, + "step": 1030 + }, + { + "epoch": 0.30448907265209685, + "grad_norm": 0.225685031808706, + "learning_rate": 8.15205079221633e-05, + "loss": 2.3215, + "step": 1031 + }, + { + "epoch": 0.3047844063792085, + "grad_norm": 0.2205629325477637, + "learning_rate": 8.148336348525197e-05, + "loss": 2.2931, + "step": 1032 + }, + { + "epoch": 0.30507974010632016, + "grad_norm": 0.22579279411397202, + "learning_rate": 8.144619023626551e-05, + "loss": 2.3311, + "step": 1033 + }, + { + "epoch": 0.30537507383343176, + "grad_norm": 0.2718507092685017, + "learning_rate": 8.140898820922307e-05, + "loss": 2.248, + "step": 1034 + }, + { + "epoch": 0.3056704075605434, + "grad_norm": 0.22851218522572198, + "learning_rate": 8.13717574381702e-05, + "loss": 2.328, + "step": 1035 + }, + { + "epoch": 0.305965741287655, + "grad_norm": 0.2190117121932955, + "learning_rate": 8.133449795717874e-05, + "loss": 2.3356, + "step": 1036 + }, + { + "epoch": 0.3062610750147667, + "grad_norm": 0.22906590905456503, + "learning_rate": 8.129720980034676e-05, + "loss": 2.2837, + "step": 1037 + }, + { + "epoch": 0.30655640874187834, + "grad_norm": 0.23624628399646203, + "learning_rate": 8.125989300179862e-05, + "loss": 2.3198, + "step": 1038 + }, + { + "epoch": 0.30685174246898994, + "grad_norm": 0.2557224897586447, + "learning_rate": 8.122254759568488e-05, + "loss": 2.2714, + "step": 1039 + }, + { + "epoch": 0.3071470761961016, + "grad_norm": 0.31986507708060574, + "learning_rate": 8.118517361618228e-05, + "loss": 2.4149, + "step": 1040 + }, + { + "epoch": 0.30744240992321326, + "grad_norm": 0.21167388173131543, + "learning_rate": 8.114777109749369e-05, + "loss": 2.2198, + "step": 1041 + }, + { + "epoch": 0.30773774365032486, + "grad_norm": 0.23626665624304294, + "learning_rate": 8.111034007384814e-05, + "loss": 2.4117, + "step": 1042 + }, + { + "epoch": 0.3080330773774365, + "grad_norm": 0.22189111751794266, + "learning_rate": 8.107288057950069e-05, + "loss": 2.2879, + "step": 1043 + }, + { + "epoch": 0.3083284111045481, + "grad_norm": 0.2294858079388801, + "learning_rate": 8.103539264873252e-05, + "loss": 2.3775, + "step": 1044 + }, + { + "epoch": 0.3086237448316598, + "grad_norm": 0.22799139127405038, + "learning_rate": 8.099787631585076e-05, + "loss": 2.3315, + "step": 1045 + }, + { + "epoch": 0.30891907855877143, + "grad_norm": 0.2105092982942122, + "learning_rate": 8.096033161518861e-05, + "loss": 2.28, + "step": 1046 + }, + { + "epoch": 0.30921441228588303, + "grad_norm": 0.2109875707892116, + "learning_rate": 8.092275858110517e-05, + "loss": 2.374, + "step": 1047 + }, + { + "epoch": 0.3095097460129947, + "grad_norm": 0.23350408389637073, + "learning_rate": 8.088515724798549e-05, + "loss": 2.274, + "step": 1048 + }, + { + "epoch": 0.3098050797401063, + "grad_norm": 0.22119163361594532, + "learning_rate": 8.084752765024052e-05, + "loss": 2.2427, + "step": 1049 + }, + { + "epoch": 0.31010041346721795, + "grad_norm": 0.21909865625133504, + "learning_rate": 8.08098698223071e-05, + "loss": 2.2771, + "step": 1050 + }, + { + "epoch": 0.3103957471943296, + "grad_norm": 0.22875691578004909, + "learning_rate": 8.077218379864784e-05, + "loss": 2.2492, + "step": 1051 + }, + { + "epoch": 0.3106910809214412, + "grad_norm": 0.4307792838144798, + "learning_rate": 8.073446961375125e-05, + "loss": 2.3902, + "step": 1052 + }, + { + "epoch": 0.31098641464855287, + "grad_norm": 0.2138707321779014, + "learning_rate": 8.069672730213151e-05, + "loss": 2.3951, + "step": 1053 + }, + { + "epoch": 0.3112817483756645, + "grad_norm": 0.23247246231794813, + "learning_rate": 8.065895689832859e-05, + "loss": 2.3959, + "step": 1054 + }, + { + "epoch": 0.3115770821027761, + "grad_norm": 0.22205534375419259, + "learning_rate": 8.062115843690818e-05, + "loss": 2.3108, + "step": 1055 + }, + { + "epoch": 0.3118724158298878, + "grad_norm": 0.22267749751921634, + "learning_rate": 8.058333195246162e-05, + "loss": 2.2813, + "step": 1056 + }, + { + "epoch": 0.3121677495569994, + "grad_norm": 0.2281457646600056, + "learning_rate": 8.054547747960594e-05, + "loss": 2.2768, + "step": 1057 + }, + { + "epoch": 0.31246308328411104, + "grad_norm": 0.3852168308302159, + "learning_rate": 8.050759505298371e-05, + "loss": 2.3256, + "step": 1058 + }, + { + "epoch": 0.3127584170112227, + "grad_norm": 0.22561223425903681, + "learning_rate": 8.046968470726314e-05, + "loss": 2.3743, + "step": 1059 + }, + { + "epoch": 0.3130537507383343, + "grad_norm": 0.21845832179732375, + "learning_rate": 8.043174647713797e-05, + "loss": 2.2945, + "step": 1060 + }, + { + "epoch": 0.31334908446544596, + "grad_norm": 0.215934231411808, + "learning_rate": 8.039378039732745e-05, + "loss": 2.3445, + "step": 1061 + }, + { + "epoch": 0.3136444181925576, + "grad_norm": 0.22598605980858621, + "learning_rate": 8.035578650257636e-05, + "loss": 2.3634, + "step": 1062 + }, + { + "epoch": 0.3139397519196692, + "grad_norm": 0.22409783036912895, + "learning_rate": 8.03177648276549e-05, + "loss": 2.3783, + "step": 1063 + }, + { + "epoch": 0.3142350856467809, + "grad_norm": 0.22585277546411728, + "learning_rate": 8.027971540735866e-05, + "loss": 2.3141, + "step": 1064 + }, + { + "epoch": 0.3145304193738925, + "grad_norm": 0.20692622921896453, + "learning_rate": 8.024163827650868e-05, + "loss": 2.2613, + "step": 1065 + }, + { + "epoch": 0.31482575310100414, + "grad_norm": 0.22114563356774739, + "learning_rate": 8.020353346995135e-05, + "loss": 2.3271, + "step": 1066 + }, + { + "epoch": 0.3151210868281158, + "grad_norm": 0.2369747301182446, + "learning_rate": 8.016540102255834e-05, + "loss": 2.2894, + "step": 1067 + }, + { + "epoch": 0.3154164205552274, + "grad_norm": 0.2209179853564615, + "learning_rate": 8.01272409692267e-05, + "loss": 2.268, + "step": 1068 + }, + { + "epoch": 0.31571175428233905, + "grad_norm": 0.2331885507396867, + "learning_rate": 8.008905334487864e-05, + "loss": 2.3667, + "step": 1069 + }, + { + "epoch": 0.31600708800945065, + "grad_norm": 0.22009138550111806, + "learning_rate": 8.005083818446168e-05, + "loss": 2.2963, + "step": 1070 + }, + { + "epoch": 0.3163024217365623, + "grad_norm": 0.2285833095505211, + "learning_rate": 8.001259552294855e-05, + "loss": 2.3392, + "step": 1071 + }, + { + "epoch": 0.31659775546367397, + "grad_norm": 0.21925664030489264, + "learning_rate": 7.997432539533705e-05, + "loss": 2.2836, + "step": 1072 + }, + { + "epoch": 0.31689308919078557, + "grad_norm": 0.3559490354710031, + "learning_rate": 7.993602783665021e-05, + "loss": 2.391, + "step": 1073 + }, + { + "epoch": 0.31718842291789723, + "grad_norm": 0.2300420706952755, + "learning_rate": 7.989770288193614e-05, + "loss": 2.3391, + "step": 1074 + }, + { + "epoch": 0.3174837566450089, + "grad_norm": 0.21440884179841016, + "learning_rate": 7.985935056626799e-05, + "loss": 2.3809, + "step": 1075 + }, + { + "epoch": 0.3177790903721205, + "grad_norm": 0.21949422632707366, + "learning_rate": 7.982097092474398e-05, + "loss": 2.3266, + "step": 1076 + }, + { + "epoch": 0.31807442409923214, + "grad_norm": 0.23148216082994238, + "learning_rate": 7.978256399248736e-05, + "loss": 2.3447, + "step": 1077 + }, + { + "epoch": 0.31836975782634375, + "grad_norm": 0.21847337875624603, + "learning_rate": 7.97441298046463e-05, + "loss": 2.2974, + "step": 1078 + }, + { + "epoch": 0.3186650915534554, + "grad_norm": 0.2306988666450158, + "learning_rate": 7.970566839639393e-05, + "loss": 2.2473, + "step": 1079 + }, + { + "epoch": 0.31896042528056706, + "grad_norm": 0.21692052528265343, + "learning_rate": 7.96671798029283e-05, + "loss": 2.3201, + "step": 1080 + }, + { + "epoch": 0.31925575900767866, + "grad_norm": 0.2180842930811879, + "learning_rate": 7.962866405947237e-05, + "loss": 2.2907, + "step": 1081 + }, + { + "epoch": 0.3195510927347903, + "grad_norm": 0.2324817484182914, + "learning_rate": 7.959012120127389e-05, + "loss": 2.3351, + "step": 1082 + }, + { + "epoch": 0.3198464264619019, + "grad_norm": 0.20799489048323874, + "learning_rate": 7.955155126360544e-05, + "loss": 2.3136, + "step": 1083 + }, + { + "epoch": 0.3201417601890136, + "grad_norm": 0.2252236880235741, + "learning_rate": 7.95129542817644e-05, + "loss": 2.3845, + "step": 1084 + }, + { + "epoch": 0.32043709391612524, + "grad_norm": 0.2152769835818076, + "learning_rate": 7.947433029107291e-05, + "loss": 2.1001, + "step": 1085 + }, + { + "epoch": 0.32073242764323684, + "grad_norm": 0.2208634275422862, + "learning_rate": 7.943567932687776e-05, + "loss": 2.2985, + "step": 1086 + }, + { + "epoch": 0.3210277613703485, + "grad_norm": 0.2199934715824982, + "learning_rate": 7.939700142455053e-05, + "loss": 2.2884, + "step": 1087 + }, + { + "epoch": 0.32132309509746015, + "grad_norm": 0.21992332126013145, + "learning_rate": 7.935829661948736e-05, + "loss": 2.3801, + "step": 1088 + }, + { + "epoch": 0.32161842882457176, + "grad_norm": 0.2112329586056906, + "learning_rate": 7.931956494710905e-05, + "loss": 2.2796, + "step": 1089 + }, + { + "epoch": 0.3219137625516834, + "grad_norm": 0.21669410297256897, + "learning_rate": 7.928080644286098e-05, + "loss": 2.2802, + "step": 1090 + }, + { + "epoch": 0.322209096278795, + "grad_norm": 0.22273752335404257, + "learning_rate": 7.924202114221309e-05, + "loss": 2.3892, + "step": 1091 + }, + { + "epoch": 0.3225044300059067, + "grad_norm": 0.22718774089774224, + "learning_rate": 7.920320908065983e-05, + "loss": 2.3373, + "step": 1092 + }, + { + "epoch": 0.32279976373301833, + "grad_norm": 0.21434171280572, + "learning_rate": 7.916437029372017e-05, + "loss": 2.2589, + "step": 1093 + }, + { + "epoch": 0.32309509746012993, + "grad_norm": 0.22125358633863243, + "learning_rate": 7.91255048169375e-05, + "loss": 2.2719, + "step": 1094 + }, + { + "epoch": 0.3233904311872416, + "grad_norm": 0.22757244274563657, + "learning_rate": 7.908661268587967e-05, + "loss": 2.3305, + "step": 1095 + }, + { + "epoch": 0.32368576491435325, + "grad_norm": 0.21422077369986148, + "learning_rate": 7.904769393613892e-05, + "loss": 2.3015, + "step": 1096 + }, + { + "epoch": 0.32398109864146485, + "grad_norm": 0.21060467883767356, + "learning_rate": 7.900874860333179e-05, + "loss": 2.2789, + "step": 1097 + }, + { + "epoch": 0.3242764323685765, + "grad_norm": 0.24033549746881766, + "learning_rate": 7.896977672309922e-05, + "loss": 2.3166, + "step": 1098 + }, + { + "epoch": 0.3245717660956881, + "grad_norm": 0.2187066375805733, + "learning_rate": 7.893077833110643e-05, + "loss": 2.3173, + "step": 1099 + }, + { + "epoch": 0.32486709982279977, + "grad_norm": 0.23462087132413986, + "learning_rate": 7.889175346304287e-05, + "loss": 2.3529, + "step": 1100 + }, + { + "epoch": 0.3251624335499114, + "grad_norm": 0.21771368566957763, + "learning_rate": 7.885270215462226e-05, + "loss": 2.2091, + "step": 1101 + }, + { + "epoch": 0.325457767277023, + "grad_norm": 0.23196180473217684, + "learning_rate": 7.881362444158249e-05, + "loss": 2.4113, + "step": 1102 + }, + { + "epoch": 0.3257531010041347, + "grad_norm": 0.22356039648850068, + "learning_rate": 7.877452035968564e-05, + "loss": 2.4037, + "step": 1103 + }, + { + "epoch": 0.3260484347312463, + "grad_norm": 0.21653495466146427, + "learning_rate": 7.873538994471787e-05, + "loss": 2.2997, + "step": 1104 + }, + { + "epoch": 0.32634376845835794, + "grad_norm": 0.22479531428518573, + "learning_rate": 7.86962332324895e-05, + "loss": 2.4048, + "step": 1105 + }, + { + "epoch": 0.3266391021854696, + "grad_norm": 0.2436730140984742, + "learning_rate": 7.865705025883486e-05, + "loss": 2.3001, + "step": 1106 + }, + { + "epoch": 0.3269344359125812, + "grad_norm": 0.2182045249334076, + "learning_rate": 7.861784105961235e-05, + "loss": 2.3236, + "step": 1107 + }, + { + "epoch": 0.32722976963969286, + "grad_norm": 0.21836906711858103, + "learning_rate": 7.857860567070439e-05, + "loss": 2.3493, + "step": 1108 + }, + { + "epoch": 0.3275251033668045, + "grad_norm": 0.24219879770076985, + "learning_rate": 7.853934412801732e-05, + "loss": 2.2851, + "step": 1109 + }, + { + "epoch": 0.3278204370939161, + "grad_norm": 0.2387949690110912, + "learning_rate": 7.850005646748142e-05, + "loss": 2.2688, + "step": 1110 + }, + { + "epoch": 0.3281157708210278, + "grad_norm": 0.22008949823211738, + "learning_rate": 7.846074272505091e-05, + "loss": 2.2076, + "step": 1111 + }, + { + "epoch": 0.3284111045481394, + "grad_norm": 0.21382036959514855, + "learning_rate": 7.842140293670385e-05, + "loss": 2.3059, + "step": 1112 + }, + { + "epoch": 0.32870643827525103, + "grad_norm": 0.2202323855837645, + "learning_rate": 7.838203713844214e-05, + "loss": 2.3796, + "step": 1113 + }, + { + "epoch": 0.3290017720023627, + "grad_norm": 0.21426646094311091, + "learning_rate": 7.834264536629148e-05, + "loss": 2.2386, + "step": 1114 + }, + { + "epoch": 0.3292971057294743, + "grad_norm": 0.2183890941779377, + "learning_rate": 7.830322765630133e-05, + "loss": 2.2662, + "step": 1115 + }, + { + "epoch": 0.32959243945658595, + "grad_norm": 0.21711503130286955, + "learning_rate": 7.826378404454492e-05, + "loss": 2.3278, + "step": 1116 + }, + { + "epoch": 0.32988777318369755, + "grad_norm": 0.2852891324972888, + "learning_rate": 7.822431456711918e-05, + "loss": 2.1962, + "step": 1117 + }, + { + "epoch": 0.3301831069108092, + "grad_norm": 0.22174431787165083, + "learning_rate": 7.818481926014466e-05, + "loss": 2.2175, + "step": 1118 + }, + { + "epoch": 0.33047844063792087, + "grad_norm": 0.21863530762628647, + "learning_rate": 7.814529815976564e-05, + "loss": 2.2424, + "step": 1119 + }, + { + "epoch": 0.33077377436503247, + "grad_norm": 0.21253466814206606, + "learning_rate": 7.810575130214987e-05, + "loss": 2.3371, + "step": 1120 + }, + { + "epoch": 0.3310691080921441, + "grad_norm": 0.7934738955539543, + "learning_rate": 7.806617872348882e-05, + "loss": 2.1723, + "step": 1121 + }, + { + "epoch": 0.3313644418192558, + "grad_norm": 0.22583620630145948, + "learning_rate": 7.802658045999738e-05, + "loss": 2.3746, + "step": 1122 + }, + { + "epoch": 0.3316597755463674, + "grad_norm": 0.2345541110704929, + "learning_rate": 7.798695654791403e-05, + "loss": 2.3565, + "step": 1123 + }, + { + "epoch": 0.33195510927347904, + "grad_norm": 0.9502225050607847, + "learning_rate": 7.794730702350068e-05, + "loss": 2.3054, + "step": 1124 + }, + { + "epoch": 0.33225044300059065, + "grad_norm": 0.23182090832684443, + "learning_rate": 7.790763192304269e-05, + "loss": 2.2703, + "step": 1125 + }, + { + "epoch": 0.3325457767277023, + "grad_norm": 0.23355753914584654, + "learning_rate": 7.78679312828488e-05, + "loss": 2.3518, + "step": 1126 + }, + { + "epoch": 0.33284111045481396, + "grad_norm": 0.2580389704289059, + "learning_rate": 7.782820513925118e-05, + "loss": 2.4705, + "step": 1127 + }, + { + "epoch": 0.33313644418192556, + "grad_norm": 0.2208057013279413, + "learning_rate": 7.778845352860526e-05, + "loss": 2.2866, + "step": 1128 + }, + { + "epoch": 0.3334317779090372, + "grad_norm": 0.22977592297317698, + "learning_rate": 7.774867648728985e-05, + "loss": 2.3771, + "step": 1129 + }, + { + "epoch": 0.3337271116361488, + "grad_norm": 0.22441358740534678, + "learning_rate": 7.770887405170701e-05, + "loss": 2.3407, + "step": 1130 + }, + { + "epoch": 0.3340224453632605, + "grad_norm": 0.22358583181717223, + "learning_rate": 7.766904625828204e-05, + "loss": 2.3403, + "step": 1131 + }, + { + "epoch": 0.33431777909037214, + "grad_norm": 0.2234386033917906, + "learning_rate": 7.76291931434634e-05, + "loss": 2.3225, + "step": 1132 + }, + { + "epoch": 0.33461311281748374, + "grad_norm": 0.22481824865460004, + "learning_rate": 7.75893147437228e-05, + "loss": 2.3273, + "step": 1133 + }, + { + "epoch": 0.3349084465445954, + "grad_norm": 0.23985830194520133, + "learning_rate": 7.754941109555504e-05, + "loss": 2.441, + "step": 1134 + }, + { + "epoch": 0.33520378027170705, + "grad_norm": 0.21179415567046103, + "learning_rate": 7.750948223547805e-05, + "loss": 2.2018, + "step": 1135 + }, + { + "epoch": 0.33549911399881865, + "grad_norm": 0.22035601842168967, + "learning_rate": 7.74695282000328e-05, + "loss": 2.2858, + "step": 1136 + }, + { + "epoch": 0.3357944477259303, + "grad_norm": 0.22815048356313244, + "learning_rate": 7.742954902578333e-05, + "loss": 2.2801, + "step": 1137 + }, + { + "epoch": 0.3360897814530419, + "grad_norm": 0.2232642322245625, + "learning_rate": 7.738954474931667e-05, + "loss": 2.2946, + "step": 1138 + }, + { + "epoch": 0.33638511518015357, + "grad_norm": 0.21570119548129416, + "learning_rate": 7.734951540724285e-05, + "loss": 2.3147, + "step": 1139 + }, + { + "epoch": 0.33668044890726523, + "grad_norm": 0.5964867362604576, + "learning_rate": 7.73094610361948e-05, + "loss": 2.3847, + "step": 1140 + }, + { + "epoch": 0.33697578263437683, + "grad_norm": 0.2200507506305735, + "learning_rate": 7.72693816728284e-05, + "loss": 2.3161, + "step": 1141 + }, + { + "epoch": 0.3372711163614885, + "grad_norm": 0.28033136971963807, + "learning_rate": 7.722927735382232e-05, + "loss": 2.2635, + "step": 1142 + }, + { + "epoch": 0.33756645008860015, + "grad_norm": 0.23493565417805504, + "learning_rate": 7.718914811587819e-05, + "loss": 2.4226, + "step": 1143 + }, + { + "epoch": 0.33786178381571175, + "grad_norm": 0.22331969650261, + "learning_rate": 7.714899399572033e-05, + "loss": 2.3249, + "step": 1144 + }, + { + "epoch": 0.3381571175428234, + "grad_norm": 0.2586653503682705, + "learning_rate": 7.710881503009588e-05, + "loss": 2.2468, + "step": 1145 + }, + { + "epoch": 0.338452451269935, + "grad_norm": 0.22331301549280277, + "learning_rate": 7.706861125577473e-05, + "loss": 2.3016, + "step": 1146 + }, + { + "epoch": 0.33874778499704666, + "grad_norm": 0.22945620477913006, + "learning_rate": 7.702838270954948e-05, + "loss": 2.3656, + "step": 1147 + }, + { + "epoch": 0.3390431187241583, + "grad_norm": 0.2405177825006221, + "learning_rate": 7.698812942823534e-05, + "loss": 2.3825, + "step": 1148 + }, + { + "epoch": 0.3393384524512699, + "grad_norm": 0.23132597184395, + "learning_rate": 7.694785144867023e-05, + "loss": 2.2307, + "step": 1149 + }, + { + "epoch": 0.3396337861783816, + "grad_norm": 0.23540166266207843, + "learning_rate": 7.690754880771462e-05, + "loss": 2.2997, + "step": 1150 + }, + { + "epoch": 0.3399291199054932, + "grad_norm": 0.2301816202360988, + "learning_rate": 7.686722154225157e-05, + "loss": 2.384, + "step": 1151 + }, + { + "epoch": 0.34022445363260484, + "grad_norm": 0.22673207546853283, + "learning_rate": 7.682686968918667e-05, + "loss": 2.3373, + "step": 1152 + }, + { + "epoch": 0.3405197873597165, + "grad_norm": 0.22483447032545067, + "learning_rate": 7.678649328544803e-05, + "loss": 2.3751, + "step": 1153 + }, + { + "epoch": 0.3408151210868281, + "grad_norm": 0.2301144138985587, + "learning_rate": 7.674609236798621e-05, + "loss": 2.3335, + "step": 1154 + }, + { + "epoch": 0.34111045481393976, + "grad_norm": 0.22065998601026465, + "learning_rate": 7.670566697377419e-05, + "loss": 2.3793, + "step": 1155 + }, + { + "epoch": 0.3414057885410514, + "grad_norm": 0.21494256261491107, + "learning_rate": 7.666521713980737e-05, + "loss": 2.3448, + "step": 1156 + }, + { + "epoch": 0.341701122268163, + "grad_norm": 0.20762821717270197, + "learning_rate": 7.662474290310351e-05, + "loss": 2.2304, + "step": 1157 + }, + { + "epoch": 0.3419964559952747, + "grad_norm": 0.21184830590264678, + "learning_rate": 7.658424430070273e-05, + "loss": 2.3321, + "step": 1158 + }, + { + "epoch": 0.3422917897223863, + "grad_norm": 0.21257920191673252, + "learning_rate": 7.654372136966739e-05, + "loss": 2.2885, + "step": 1159 + }, + { + "epoch": 0.34258712344949793, + "grad_norm": 0.23512653815807227, + "learning_rate": 7.650317414708215e-05, + "loss": 2.2524, + "step": 1160 + }, + { + "epoch": 0.3428824571766096, + "grad_norm": 0.22233986544885, + "learning_rate": 7.646260267005392e-05, + "loss": 2.339, + "step": 1161 + }, + { + "epoch": 0.3431777909037212, + "grad_norm": 0.2621850372126038, + "learning_rate": 7.642200697571178e-05, + "loss": 2.3285, + "step": 1162 + }, + { + "epoch": 0.34347312463083285, + "grad_norm": 0.2171026427716969, + "learning_rate": 7.638138710120695e-05, + "loss": 2.3564, + "step": 1163 + }, + { + "epoch": 0.34376845835794445, + "grad_norm": 0.2138915026355032, + "learning_rate": 7.634074308371283e-05, + "loss": 2.1543, + "step": 1164 + }, + { + "epoch": 0.3440637920850561, + "grad_norm": 0.218160302753888, + "learning_rate": 7.630007496042488e-05, + "loss": 2.2956, + "step": 1165 + }, + { + "epoch": 0.34435912581216777, + "grad_norm": 0.2145399112578358, + "learning_rate": 7.625938276856064e-05, + "loss": 2.3447, + "step": 1166 + }, + { + "epoch": 0.34465445953927937, + "grad_norm": 0.22528014987809764, + "learning_rate": 7.621866654535965e-05, + "loss": 2.3235, + "step": 1167 + }, + { + "epoch": 0.344949793266391, + "grad_norm": 0.22239952177403255, + "learning_rate": 7.617792632808347e-05, + "loss": 2.3235, + "step": 1168 + }, + { + "epoch": 0.3452451269935027, + "grad_norm": 0.22773179499373572, + "learning_rate": 7.61371621540156e-05, + "loss": 2.2899, + "step": 1169 + }, + { + "epoch": 0.3455404607206143, + "grad_norm": 0.228945181663852, + "learning_rate": 7.609637406046149e-05, + "loss": 2.2548, + "step": 1170 + }, + { + "epoch": 0.34583579444772594, + "grad_norm": 0.21603530428544826, + "learning_rate": 7.605556208474842e-05, + "loss": 2.2668, + "step": 1171 + }, + { + "epoch": 0.34613112817483754, + "grad_norm": 0.21382942911516253, + "learning_rate": 7.60147262642256e-05, + "loss": 2.3719, + "step": 1172 + }, + { + "epoch": 0.3464264619019492, + "grad_norm": 0.22086684307133467, + "learning_rate": 7.597386663626404e-05, + "loss": 2.3196, + "step": 1173 + }, + { + "epoch": 0.34672179562906086, + "grad_norm": 0.214304429149538, + "learning_rate": 7.59329832382565e-05, + "loss": 2.3295, + "step": 1174 + }, + { + "epoch": 0.34701712935617246, + "grad_norm": 0.21845671884363488, + "learning_rate": 7.589207610761752e-05, + "loss": 2.289, + "step": 1175 + }, + { + "epoch": 0.3473124630832841, + "grad_norm": 0.2195550630761236, + "learning_rate": 7.585114528178338e-05, + "loss": 2.2569, + "step": 1176 + }, + { + "epoch": 0.3476077968103958, + "grad_norm": 0.22926560437260793, + "learning_rate": 7.5810190798212e-05, + "loss": 2.3699, + "step": 1177 + }, + { + "epoch": 0.3479031305375074, + "grad_norm": 0.22438052585034746, + "learning_rate": 7.576921269438298e-05, + "loss": 2.3016, + "step": 1178 + }, + { + "epoch": 0.34819846426461903, + "grad_norm": 0.22095790436986174, + "learning_rate": 7.572821100779753e-05, + "loss": 2.3388, + "step": 1179 + }, + { + "epoch": 0.34849379799173064, + "grad_norm": 0.21610331212841413, + "learning_rate": 7.568718577597842e-05, + "loss": 2.3042, + "step": 1180 + }, + { + "epoch": 0.3487891317188423, + "grad_norm": 0.2230859675446632, + "learning_rate": 7.564613703647004e-05, + "loss": 2.2892, + "step": 1181 + }, + { + "epoch": 0.34908446544595395, + "grad_norm": 0.25366541371107093, + "learning_rate": 7.56050648268382e-05, + "loss": 2.3327, + "step": 1182 + }, + { + "epoch": 0.34937979917306555, + "grad_norm": 0.2217577238151584, + "learning_rate": 7.556396918467023e-05, + "loss": 2.243, + "step": 1183 + }, + { + "epoch": 0.3496751329001772, + "grad_norm": 0.23686717167653606, + "learning_rate": 7.552285014757495e-05, + "loss": 2.3585, + "step": 1184 + }, + { + "epoch": 0.3499704666272888, + "grad_norm": 0.2139454963882489, + "learning_rate": 7.54817077531825e-05, + "loss": 2.3497, + "step": 1185 + }, + { + "epoch": 0.35026580035440047, + "grad_norm": 0.2286579634302601, + "learning_rate": 7.544054203914444e-05, + "loss": 2.3635, + "step": 1186 + }, + { + "epoch": 0.3505611340815121, + "grad_norm": 0.2133651833737105, + "learning_rate": 7.539935304313368e-05, + "loss": 2.3077, + "step": 1187 + }, + { + "epoch": 0.35085646780862373, + "grad_norm": 0.21420173084924787, + "learning_rate": 7.535814080284445e-05, + "loss": 2.3019, + "step": 1188 + }, + { + "epoch": 0.3511518015357354, + "grad_norm": 0.2209182045724288, + "learning_rate": 7.531690535599221e-05, + "loss": 2.3036, + "step": 1189 + }, + { + "epoch": 0.35144713526284704, + "grad_norm": 0.21805833561573482, + "learning_rate": 7.527564674031366e-05, + "loss": 2.2598, + "step": 1190 + }, + { + "epoch": 0.35174246898995865, + "grad_norm": 0.22986324424720667, + "learning_rate": 7.523436499356677e-05, + "loss": 2.2416, + "step": 1191 + }, + { + "epoch": 0.3520378027170703, + "grad_norm": 2.1330850168589035, + "learning_rate": 7.519306015353058e-05, + "loss": 2.3133, + "step": 1192 + }, + { + "epoch": 0.3523331364441819, + "grad_norm": 0.2258371135543673, + "learning_rate": 7.515173225800534e-05, + "loss": 2.306, + "step": 1193 + }, + { + "epoch": 0.35262847017129356, + "grad_norm": 0.2917698935784474, + "learning_rate": 7.511038134481237e-05, + "loss": 2.3919, + "step": 1194 + }, + { + "epoch": 0.3529238038984052, + "grad_norm": 0.21811824922034412, + "learning_rate": 7.506900745179403e-05, + "loss": 2.2283, + "step": 1195 + }, + { + "epoch": 0.3532191376255168, + "grad_norm": 0.21279515741911212, + "learning_rate": 7.502761061681376e-05, + "loss": 2.3274, + "step": 1196 + }, + { + "epoch": 0.3535144713526285, + "grad_norm": 0.2240580883682487, + "learning_rate": 7.498619087775597e-05, + "loss": 2.3773, + "step": 1197 + }, + { + "epoch": 0.3538098050797401, + "grad_norm": 0.2307461346284966, + "learning_rate": 7.494474827252606e-05, + "loss": 2.3299, + "step": 1198 + }, + { + "epoch": 0.35410513880685174, + "grad_norm": 0.25975465392844066, + "learning_rate": 7.490328283905027e-05, + "loss": 2.387, + "step": 1199 + }, + { + "epoch": 0.3544004725339634, + "grad_norm": 0.2167478540346199, + "learning_rate": 7.486179461527584e-05, + "loss": 2.2534, + "step": 1200 + }, + { + "epoch": 0.354695806261075, + "grad_norm": 0.21384897546433576, + "learning_rate": 7.482028363917079e-05, + "loss": 2.3423, + "step": 1201 + }, + { + "epoch": 0.35499113998818665, + "grad_norm": 0.2153279051093439, + "learning_rate": 7.477874994872399e-05, + "loss": 2.3201, + "step": 1202 + }, + { + "epoch": 0.3552864737152983, + "grad_norm": 0.2289620058163175, + "learning_rate": 7.473719358194511e-05, + "loss": 2.3268, + "step": 1203 + }, + { + "epoch": 0.3555818074424099, + "grad_norm": 0.2163104709517956, + "learning_rate": 7.469561457686455e-05, + "loss": 2.3344, + "step": 1204 + }, + { + "epoch": 0.35587714116952157, + "grad_norm": 0.21728132437423953, + "learning_rate": 7.465401297153345e-05, + "loss": 2.2737, + "step": 1205 + }, + { + "epoch": 0.3561724748966332, + "grad_norm": 0.22003220387487582, + "learning_rate": 7.46123888040236e-05, + "loss": 2.2769, + "step": 1206 + }, + { + "epoch": 0.35646780862374483, + "grad_norm": 0.210951875267455, + "learning_rate": 7.457074211242746e-05, + "loss": 2.3868, + "step": 1207 + }, + { + "epoch": 0.3567631423508565, + "grad_norm": 0.21009189159133676, + "learning_rate": 7.452907293485811e-05, + "loss": 2.3062, + "step": 1208 + }, + { + "epoch": 0.3570584760779681, + "grad_norm": 0.21259149482009781, + "learning_rate": 7.448738130944916e-05, + "loss": 2.3905, + "step": 1209 + }, + { + "epoch": 0.35735380980507975, + "grad_norm": 0.22075392551971157, + "learning_rate": 7.444566727435485e-05, + "loss": 2.3269, + "step": 1210 + }, + { + "epoch": 0.35764914353219135, + "grad_norm": 0.216810311940628, + "learning_rate": 7.440393086774985e-05, + "loss": 2.267, + "step": 1211 + }, + { + "epoch": 0.357944477259303, + "grad_norm": 0.21436607838575264, + "learning_rate": 7.436217212782934e-05, + "loss": 2.3347, + "step": 1212 + }, + { + "epoch": 0.35823981098641466, + "grad_norm": 0.2169626993629281, + "learning_rate": 7.432039109280893e-05, + "loss": 2.2892, + "step": 1213 + }, + { + "epoch": 0.35853514471352627, + "grad_norm": 0.22000038886577744, + "learning_rate": 7.427858780092463e-05, + "loss": 2.3117, + "step": 1214 + }, + { + "epoch": 0.3588304784406379, + "grad_norm": 0.21696433052463954, + "learning_rate": 7.423676229043283e-05, + "loss": 2.3064, + "step": 1215 + }, + { + "epoch": 0.3591258121677496, + "grad_norm": 0.21307353814935104, + "learning_rate": 7.419491459961023e-05, + "loss": 2.3259, + "step": 1216 + }, + { + "epoch": 0.3594211458948612, + "grad_norm": 0.22041943978029635, + "learning_rate": 7.415304476675387e-05, + "loss": 2.2719, + "step": 1217 + }, + { + "epoch": 0.35971647962197284, + "grad_norm": 0.22298677072233095, + "learning_rate": 7.4111152830181e-05, + "loss": 2.3653, + "step": 1218 + }, + { + "epoch": 0.36001181334908444, + "grad_norm": 0.21656821624565906, + "learning_rate": 7.406923882822914e-05, + "loss": 2.333, + "step": 1219 + }, + { + "epoch": 0.3603071470761961, + "grad_norm": 0.22557093417655033, + "learning_rate": 7.402730279925601e-05, + "loss": 2.2324, + "step": 1220 + }, + { + "epoch": 0.36060248080330776, + "grad_norm": 0.22777059342295436, + "learning_rate": 7.398534478163941e-05, + "loss": 2.2238, + "step": 1221 + }, + { + "epoch": 0.36089781453041936, + "grad_norm": 0.22033775096956276, + "learning_rate": 7.394336481377737e-05, + "loss": 2.3337, + "step": 1222 + }, + { + "epoch": 0.361193148257531, + "grad_norm": 0.2196309877859273, + "learning_rate": 7.390136293408793e-05, + "loss": 2.259, + "step": 1223 + }, + { + "epoch": 0.3614884819846427, + "grad_norm": 0.22622309946346736, + "learning_rate": 7.385933918100923e-05, + "loss": 2.3052, + "step": 1224 + }, + { + "epoch": 0.3617838157117543, + "grad_norm": 0.26220610776486436, + "learning_rate": 7.38172935929994e-05, + "loss": 2.4623, + "step": 1225 + }, + { + "epoch": 0.36207914943886593, + "grad_norm": 0.23729737195956152, + "learning_rate": 7.377522620853654e-05, + "loss": 2.3537, + "step": 1226 + }, + { + "epoch": 0.36237448316597753, + "grad_norm": 0.2273026412816699, + "learning_rate": 7.373313706611876e-05, + "loss": 2.2875, + "step": 1227 + }, + { + "epoch": 0.3626698168930892, + "grad_norm": 0.2356941136201305, + "learning_rate": 7.369102620426403e-05, + "loss": 2.3716, + "step": 1228 + }, + { + "epoch": 0.36296515062020085, + "grad_norm": 0.23079708117382283, + "learning_rate": 7.364889366151016e-05, + "loss": 2.3329, + "step": 1229 + }, + { + "epoch": 0.36326048434731245, + "grad_norm": 0.22182214671569556, + "learning_rate": 7.360673947641489e-05, + "loss": 2.2909, + "step": 1230 + }, + { + "epoch": 0.3635558180744241, + "grad_norm": 0.3926334057502918, + "learning_rate": 7.356456368755571e-05, + "loss": 2.517, + "step": 1231 + }, + { + "epoch": 0.3638511518015357, + "grad_norm": 0.21989832628379924, + "learning_rate": 7.35223663335299e-05, + "loss": 2.3889, + "step": 1232 + }, + { + "epoch": 0.36414648552864737, + "grad_norm": 0.29883026920122785, + "learning_rate": 7.348014745295444e-05, + "loss": 2.2989, + "step": 1233 + }, + { + "epoch": 0.364441819255759, + "grad_norm": 0.2193023995687847, + "learning_rate": 7.343790708446609e-05, + "loss": 2.3435, + "step": 1234 + }, + { + "epoch": 0.3647371529828706, + "grad_norm": 0.22399422539086017, + "learning_rate": 7.33956452667212e-05, + "loss": 2.269, + "step": 1235 + }, + { + "epoch": 0.3650324867099823, + "grad_norm": 0.22600770305485976, + "learning_rate": 7.335336203839575e-05, + "loss": 2.2924, + "step": 1236 + }, + { + "epoch": 0.36532782043709394, + "grad_norm": 0.22471203323891265, + "learning_rate": 7.331105743818536e-05, + "loss": 2.3733, + "step": 1237 + }, + { + "epoch": 0.36562315416420554, + "grad_norm": 0.34855112352935724, + "learning_rate": 7.326873150480519e-05, + "loss": 2.4152, + "step": 1238 + }, + { + "epoch": 0.3659184878913172, + "grad_norm": 0.22511832582979271, + "learning_rate": 7.322638427698991e-05, + "loss": 2.2751, + "step": 1239 + }, + { + "epoch": 0.3662138216184288, + "grad_norm": 0.22250898410039574, + "learning_rate": 7.318401579349369e-05, + "loss": 2.2626, + "step": 1240 + }, + { + "epoch": 0.36650915534554046, + "grad_norm": 0.22692915417246756, + "learning_rate": 7.314162609309014e-05, + "loss": 2.3195, + "step": 1241 + }, + { + "epoch": 0.3668044890726521, + "grad_norm": 0.2433745227955008, + "learning_rate": 7.309921521457233e-05, + "loss": 2.3506, + "step": 1242 + }, + { + "epoch": 0.3670998227997637, + "grad_norm": 0.22192129337218727, + "learning_rate": 7.305678319675262e-05, + "loss": 2.3963, + "step": 1243 + }, + { + "epoch": 0.3673951565268754, + "grad_norm": 0.2143471961716973, + "learning_rate": 7.301433007846282e-05, + "loss": 2.3057, + "step": 1244 + }, + { + "epoch": 0.367690490253987, + "grad_norm": 0.22704006703026133, + "learning_rate": 7.297185589855397e-05, + "loss": 2.3253, + "step": 1245 + }, + { + "epoch": 0.36798582398109864, + "grad_norm": 0.24577887387833794, + "learning_rate": 7.292936069589642e-05, + "loss": 2.2964, + "step": 1246 + }, + { + "epoch": 0.3682811577082103, + "grad_norm": 0.22148942554777254, + "learning_rate": 7.288684450937979e-05, + "loss": 2.3443, + "step": 1247 + }, + { + "epoch": 0.3685764914353219, + "grad_norm": 0.22226930759375974, + "learning_rate": 7.284430737791283e-05, + "loss": 2.3561, + "step": 1248 + }, + { + "epoch": 0.36887182516243355, + "grad_norm": 0.2172663991311447, + "learning_rate": 7.280174934042352e-05, + "loss": 2.3074, + "step": 1249 + }, + { + "epoch": 0.3691671588895452, + "grad_norm": 0.2161298060315797, + "learning_rate": 7.275917043585895e-05, + "loss": 2.357, + "step": 1250 + }, + { + "epoch": 0.3694624926166568, + "grad_norm": 0.7146153319583781, + "learning_rate": 7.27165707031853e-05, + "loss": 2.312, + "step": 1251 + }, + { + "epoch": 0.36975782634376847, + "grad_norm": 0.23227052007880955, + "learning_rate": 7.267395018138781e-05, + "loss": 2.4438, + "step": 1252 + }, + { + "epoch": 0.37005316007088007, + "grad_norm": 0.26426622725549903, + "learning_rate": 7.263130890947075e-05, + "loss": 2.3255, + "step": 1253 + }, + { + "epoch": 0.37034849379799173, + "grad_norm": 0.2718421581865016, + "learning_rate": 7.25886469264574e-05, + "loss": 2.2608, + "step": 1254 + }, + { + "epoch": 0.3706438275251034, + "grad_norm": 0.26016282425555376, + "learning_rate": 7.254596427138995e-05, + "loss": 2.2684, + "step": 1255 + }, + { + "epoch": 0.370939161252215, + "grad_norm": 0.2589200102396111, + "learning_rate": 7.250326098332957e-05, + "loss": 2.4029, + "step": 1256 + }, + { + "epoch": 0.37123449497932665, + "grad_norm": 0.24794158052366183, + "learning_rate": 7.246053710135625e-05, + "loss": 2.2919, + "step": 1257 + }, + { + "epoch": 0.37152982870643825, + "grad_norm": 0.2274399242623096, + "learning_rate": 7.241779266456885e-05, + "loss": 2.356, + "step": 1258 + }, + { + "epoch": 0.3718251624335499, + "grad_norm": 0.24479393332358595, + "learning_rate": 7.237502771208507e-05, + "loss": 2.3208, + "step": 1259 + }, + { + "epoch": 0.37212049616066156, + "grad_norm": 0.24825742751311144, + "learning_rate": 7.233224228304131e-05, + "loss": 2.3011, + "step": 1260 + }, + { + "epoch": 0.37241582988777316, + "grad_norm": 0.21852831165744502, + "learning_rate": 7.22894364165928e-05, + "loss": 2.287, + "step": 1261 + }, + { + "epoch": 0.3727111636148848, + "grad_norm": 0.2207163832066494, + "learning_rate": 7.224661015191341e-05, + "loss": 2.2493, + "step": 1262 + }, + { + "epoch": 0.3730064973419965, + "grad_norm": 0.2228862511452705, + "learning_rate": 7.22037635281957e-05, + "loss": 2.3351, + "step": 1263 + }, + { + "epoch": 0.3733018310691081, + "grad_norm": 0.22558786135427022, + "learning_rate": 7.216089658465088e-05, + "loss": 2.3534, + "step": 1264 + }, + { + "epoch": 0.37359716479621974, + "grad_norm": 0.2215171587865974, + "learning_rate": 7.211800936050872e-05, + "loss": 2.3162, + "step": 1265 + }, + { + "epoch": 0.37389249852333134, + "grad_norm": 0.22493583340410334, + "learning_rate": 7.207510189501756e-05, + "loss": 2.3455, + "step": 1266 + }, + { + "epoch": 0.374187832250443, + "grad_norm": 0.22210243012972236, + "learning_rate": 7.203217422744427e-05, + "loss": 2.2933, + "step": 1267 + }, + { + "epoch": 0.37448316597755466, + "grad_norm": 0.22033156285431868, + "learning_rate": 7.198922639707422e-05, + "loss": 2.3991, + "step": 1268 + }, + { + "epoch": 0.37477849970466626, + "grad_norm": 0.21639714893249523, + "learning_rate": 7.19462584432112e-05, + "loss": 2.3629, + "step": 1269 + }, + { + "epoch": 0.3750738334317779, + "grad_norm": 0.23168364056289298, + "learning_rate": 7.190327040517746e-05, + "loss": 2.1969, + "step": 1270 + }, + { + "epoch": 0.37536916715888957, + "grad_norm": 0.22141352399939113, + "learning_rate": 7.186026232231359e-05, + "loss": 2.3308, + "step": 1271 + }, + { + "epoch": 0.3756645008860012, + "grad_norm": 0.20848498884372418, + "learning_rate": 7.181723423397855e-05, + "loss": 2.313, + "step": 1272 + }, + { + "epoch": 0.37595983461311283, + "grad_norm": 0.22484402217146018, + "learning_rate": 7.177418617954957e-05, + "loss": 2.2925, + "step": 1273 + }, + { + "epoch": 0.37625516834022443, + "grad_norm": 0.22812696972088065, + "learning_rate": 7.173111819842222e-05, + "loss": 2.3406, + "step": 1274 + }, + { + "epoch": 0.3765505020673361, + "grad_norm": 0.20885133519397708, + "learning_rate": 7.168803033001024e-05, + "loss": 2.3023, + "step": 1275 + }, + { + "epoch": 0.37684583579444775, + "grad_norm": 0.2278564433505856, + "learning_rate": 7.164492261374558e-05, + "loss": 2.3019, + "step": 1276 + }, + { + "epoch": 0.37714116952155935, + "grad_norm": 0.2271799110000368, + "learning_rate": 7.160179508907839e-05, + "loss": 2.4182, + "step": 1277 + }, + { + "epoch": 0.377436503248671, + "grad_norm": 0.22737961614000837, + "learning_rate": 7.155864779547695e-05, + "loss": 2.2759, + "step": 1278 + }, + { + "epoch": 0.3777318369757826, + "grad_norm": 0.23314420668253516, + "learning_rate": 7.151548077242757e-05, + "loss": 2.4424, + "step": 1279 + }, + { + "epoch": 0.37802717070289427, + "grad_norm": 0.21332603515370271, + "learning_rate": 7.14722940594347e-05, + "loss": 2.3622, + "step": 1280 + }, + { + "epoch": 0.3783225044300059, + "grad_norm": 0.20969581721108566, + "learning_rate": 7.14290876960207e-05, + "loss": 2.2939, + "step": 1281 + }, + { + "epoch": 0.3786178381571175, + "grad_norm": 0.20150904501097058, + "learning_rate": 7.138586172172601e-05, + "loss": 2.0388, + "step": 1282 + }, + { + "epoch": 0.3789131718842292, + "grad_norm": 0.22880323357354113, + "learning_rate": 7.1342616176109e-05, + "loss": 2.2646, + "step": 1283 + }, + { + "epoch": 0.37920850561134084, + "grad_norm": 0.2171961055395494, + "learning_rate": 7.12993510987459e-05, + "loss": 2.3949, + "step": 1284 + }, + { + "epoch": 0.37950383933845244, + "grad_norm": 0.2313469786531405, + "learning_rate": 7.125606652923088e-05, + "loss": 2.3488, + "step": 1285 + }, + { + "epoch": 0.3797991730655641, + "grad_norm": 0.2174151856616402, + "learning_rate": 7.12127625071759e-05, + "loss": 2.3495, + "step": 1286 + }, + { + "epoch": 0.3800945067926757, + "grad_norm": 0.2165185069115149, + "learning_rate": 7.116943907221074e-05, + "loss": 2.2786, + "step": 1287 + }, + { + "epoch": 0.38038984051978736, + "grad_norm": 0.21662653855571506, + "learning_rate": 7.112609626398295e-05, + "loss": 2.3644, + "step": 1288 + }, + { + "epoch": 0.380685174246899, + "grad_norm": 0.24740466386528245, + "learning_rate": 7.108273412215778e-05, + "loss": 2.3454, + "step": 1289 + }, + { + "epoch": 0.3809805079740106, + "grad_norm": 0.23512717982979678, + "learning_rate": 7.103935268641822e-05, + "loss": 2.3909, + "step": 1290 + }, + { + "epoch": 0.3812758417011223, + "grad_norm": 0.2140561318668493, + "learning_rate": 7.099595199646492e-05, + "loss": 2.2329, + "step": 1291 + }, + { + "epoch": 0.3815711754282339, + "grad_norm": 0.21508867867679396, + "learning_rate": 7.095253209201607e-05, + "loss": 2.2941, + "step": 1292 + }, + { + "epoch": 0.38186650915534553, + "grad_norm": 0.22157166491654912, + "learning_rate": 7.090909301280755e-05, + "loss": 2.3379, + "step": 1293 + }, + { + "epoch": 0.3821618428824572, + "grad_norm": 0.248530029963479, + "learning_rate": 7.08656347985927e-05, + "loss": 2.2867, + "step": 1294 + }, + { + "epoch": 0.3824571766095688, + "grad_norm": 0.21262865834898584, + "learning_rate": 7.082215748914242e-05, + "loss": 2.2901, + "step": 1295 + }, + { + "epoch": 0.38275251033668045, + "grad_norm": 0.2283010453447754, + "learning_rate": 7.077866112424508e-05, + "loss": 2.3002, + "step": 1296 + }, + { + "epoch": 0.3830478440637921, + "grad_norm": 0.22918829581031752, + "learning_rate": 7.073514574370647e-05, + "loss": 2.2847, + "step": 1297 + }, + { + "epoch": 0.3833431777909037, + "grad_norm": 0.21697949974124842, + "learning_rate": 7.069161138734981e-05, + "loss": 2.3671, + "step": 1298 + }, + { + "epoch": 0.38363851151801537, + "grad_norm": 0.22077554172984612, + "learning_rate": 7.064805809501566e-05, + "loss": 2.3045, + "step": 1299 + }, + { + "epoch": 0.38393384524512697, + "grad_norm": 0.22229572018648971, + "learning_rate": 7.060448590656193e-05, + "loss": 2.2814, + "step": 1300 + }, + { + "epoch": 0.3842291789722386, + "grad_norm": 0.2244905987031683, + "learning_rate": 7.056089486186378e-05, + "loss": 2.4469, + "step": 1301 + }, + { + "epoch": 0.3845245126993503, + "grad_norm": 0.22648467174088197, + "learning_rate": 7.05172850008137e-05, + "loss": 2.3185, + "step": 1302 + }, + { + "epoch": 0.3848198464264619, + "grad_norm": 0.2189449820741445, + "learning_rate": 7.047365636332133e-05, + "loss": 2.3488, + "step": 1303 + }, + { + "epoch": 0.38511518015357354, + "grad_norm": 0.21872986973372988, + "learning_rate": 7.043000898931353e-05, + "loss": 2.3084, + "step": 1304 + }, + { + "epoch": 0.3854105138806852, + "grad_norm": 0.21903077906697965, + "learning_rate": 7.038634291873429e-05, + "loss": 2.3529, + "step": 1305 + }, + { + "epoch": 0.3857058476077968, + "grad_norm": 0.23153794121080576, + "learning_rate": 7.034265819154474e-05, + "loss": 2.325, + "step": 1306 + }, + { + "epoch": 0.38600118133490846, + "grad_norm": 0.21760037306721172, + "learning_rate": 7.029895484772305e-05, + "loss": 2.252, + "step": 1307 + }, + { + "epoch": 0.38629651506202006, + "grad_norm": 0.22139652164868298, + "learning_rate": 7.025523292726446e-05, + "loss": 2.3073, + "step": 1308 + }, + { + "epoch": 0.3865918487891317, + "grad_norm": 0.21666546008953952, + "learning_rate": 7.021149247018114e-05, + "loss": 2.3345, + "step": 1309 + }, + { + "epoch": 0.3868871825162434, + "grad_norm": 0.23706465618370903, + "learning_rate": 7.016773351650232e-05, + "loss": 2.3612, + "step": 1310 + }, + { + "epoch": 0.387182516243355, + "grad_norm": 0.2245592061280838, + "learning_rate": 7.012395610627407e-05, + "loss": 2.2693, + "step": 1311 + }, + { + "epoch": 0.38747784997046664, + "grad_norm": 0.7004960822704472, + "learning_rate": 7.008016027955943e-05, + "loss": 2.2788, + "step": 1312 + }, + { + "epoch": 0.38777318369757824, + "grad_norm": 0.22699317611560452, + "learning_rate": 7.003634607643825e-05, + "loss": 2.3024, + "step": 1313 + }, + { + "epoch": 0.3880685174246899, + "grad_norm": 0.2103302479945636, + "learning_rate": 6.999251353700718e-05, + "loss": 2.2833, + "step": 1314 + }, + { + "epoch": 0.38836385115180155, + "grad_norm": 0.22331261512631667, + "learning_rate": 6.99486627013797e-05, + "loss": 2.3315, + "step": 1315 + }, + { + "epoch": 0.38865918487891316, + "grad_norm": 0.21889470627466953, + "learning_rate": 6.990479360968597e-05, + "loss": 2.3231, + "step": 1316 + }, + { + "epoch": 0.3889545186060248, + "grad_norm": 0.21283427201315236, + "learning_rate": 6.986090630207293e-05, + "loss": 2.3613, + "step": 1317 + }, + { + "epoch": 0.38924985233313647, + "grad_norm": 0.21352071872557613, + "learning_rate": 6.981700081870414e-05, + "loss": 2.2999, + "step": 1318 + }, + { + "epoch": 0.38954518606024807, + "grad_norm": 0.2169400421170575, + "learning_rate": 6.97730771997598e-05, + "loss": 2.2872, + "step": 1319 + }, + { + "epoch": 0.38984051978735973, + "grad_norm": 0.22286722159972963, + "learning_rate": 6.972913548543674e-05, + "loss": 2.2944, + "step": 1320 + }, + { + "epoch": 0.39013585351447133, + "grad_norm": 0.23581377916067478, + "learning_rate": 6.968517571594828e-05, + "loss": 2.3264, + "step": 1321 + }, + { + "epoch": 0.390431187241583, + "grad_norm": 0.2271211166548471, + "learning_rate": 6.964119793152435e-05, + "loss": 2.3072, + "step": 1322 + }, + { + "epoch": 0.39072652096869465, + "grad_norm": 0.22358456553450076, + "learning_rate": 6.959720217241133e-05, + "loss": 2.3456, + "step": 1323 + }, + { + "epoch": 0.39102185469580625, + "grad_norm": 0.21441506185244863, + "learning_rate": 6.9553188478872e-05, + "loss": 2.2873, + "step": 1324 + }, + { + "epoch": 0.3913171884229179, + "grad_norm": 0.2271942669727765, + "learning_rate": 6.950915689118562e-05, + "loss": 2.3494, + "step": 1325 + }, + { + "epoch": 0.3916125221500295, + "grad_norm": 0.2303519518743662, + "learning_rate": 6.946510744964781e-05, + "loss": 2.3776, + "step": 1326 + }, + { + "epoch": 0.39190785587714116, + "grad_norm": 0.22040845391153147, + "learning_rate": 6.94210401945705e-05, + "loss": 2.3346, + "step": 1327 + }, + { + "epoch": 0.3922031896042528, + "grad_norm": 0.22799477354812478, + "learning_rate": 6.937695516628196e-05, + "loss": 2.3959, + "step": 1328 + }, + { + "epoch": 0.3924985233313644, + "grad_norm": 0.24115084467158016, + "learning_rate": 6.933285240512673e-05, + "loss": 2.4143, + "step": 1329 + }, + { + "epoch": 0.3927938570584761, + "grad_norm": 0.21746364506166055, + "learning_rate": 6.928873195146552e-05, + "loss": 2.3181, + "step": 1330 + }, + { + "epoch": 0.39308919078558774, + "grad_norm": 0.24265173288231998, + "learning_rate": 6.924459384567527e-05, + "loss": 2.3288, + "step": 1331 + }, + { + "epoch": 0.39338452451269934, + "grad_norm": 0.2286309008548553, + "learning_rate": 6.920043812814909e-05, + "loss": 2.2772, + "step": 1332 + }, + { + "epoch": 0.393679858239811, + "grad_norm": 0.20969832209078165, + "learning_rate": 6.915626483929619e-05, + "loss": 2.2907, + "step": 1333 + }, + { + "epoch": 0.3939751919669226, + "grad_norm": 0.2720933026789516, + "learning_rate": 6.911207401954185e-05, + "loss": 2.3066, + "step": 1334 + }, + { + "epoch": 0.39427052569403426, + "grad_norm": 0.22447945527650434, + "learning_rate": 6.906786570932742e-05, + "loss": 2.3029, + "step": 1335 + }, + { + "epoch": 0.3945658594211459, + "grad_norm": 0.23335414929153336, + "learning_rate": 6.90236399491102e-05, + "loss": 2.278, + "step": 1336 + }, + { + "epoch": 0.3948611931482575, + "grad_norm": 0.2278395130353173, + "learning_rate": 6.897939677936356e-05, + "loss": 2.3195, + "step": 1337 + }, + { + "epoch": 0.3951565268753692, + "grad_norm": 0.20659891006749076, + "learning_rate": 6.89351362405767e-05, + "loss": 2.2663, + "step": 1338 + }, + { + "epoch": 0.3954518606024808, + "grad_norm": 0.2251169357814702, + "learning_rate": 6.889085837325477e-05, + "loss": 2.3694, + "step": 1339 + }, + { + "epoch": 0.39574719432959243, + "grad_norm": 0.21196514164472555, + "learning_rate": 6.884656321791875e-05, + "loss": 2.2732, + "step": 1340 + }, + { + "epoch": 0.3960425280567041, + "grad_norm": 0.21937735040063072, + "learning_rate": 6.88022508151055e-05, + "loss": 2.323, + "step": 1341 + }, + { + "epoch": 0.3963378617838157, + "grad_norm": 0.22021736431159522, + "learning_rate": 6.875792120536758e-05, + "loss": 2.3045, + "step": 1342 + }, + { + "epoch": 0.39663319551092735, + "grad_norm": 0.22039546452793204, + "learning_rate": 6.871357442927336e-05, + "loss": 2.4119, + "step": 1343 + }, + { + "epoch": 0.396928529238039, + "grad_norm": 0.22082705942424657, + "learning_rate": 6.866921052740687e-05, + "loss": 2.3221, + "step": 1344 + }, + { + "epoch": 0.3972238629651506, + "grad_norm": 0.2200781827418069, + "learning_rate": 6.862482954036787e-05, + "loss": 2.2966, + "step": 1345 + }, + { + "epoch": 0.39751919669226227, + "grad_norm": 0.2126063890464044, + "learning_rate": 6.858043150877172e-05, + "loss": 2.366, + "step": 1346 + }, + { + "epoch": 0.39781453041937387, + "grad_norm": 0.3969288856990338, + "learning_rate": 6.853601647324938e-05, + "loss": 2.3923, + "step": 1347 + }, + { + "epoch": 0.3981098641464855, + "grad_norm": 0.21236571949871474, + "learning_rate": 6.849158447444736e-05, + "loss": 2.3842, + "step": 1348 + }, + { + "epoch": 0.3984051978735972, + "grad_norm": 0.2227970648417703, + "learning_rate": 6.844713555302774e-05, + "loss": 2.3456, + "step": 1349 + }, + { + "epoch": 0.3987005316007088, + "grad_norm": 0.24284597066743363, + "learning_rate": 6.840266974966804e-05, + "loss": 2.2543, + "step": 1350 + }, + { + "epoch": 0.39899586532782044, + "grad_norm": 0.26408115693494877, + "learning_rate": 6.835818710506125e-05, + "loss": 2.4305, + "step": 1351 + }, + { + "epoch": 0.3992911990549321, + "grad_norm": 0.22329074401638746, + "learning_rate": 6.831368765991575e-05, + "loss": 2.3785, + "step": 1352 + }, + { + "epoch": 0.3995865327820437, + "grad_norm": 0.21221457765084256, + "learning_rate": 6.826917145495534e-05, + "loss": 2.281, + "step": 1353 + }, + { + "epoch": 0.39988186650915536, + "grad_norm": 0.22177544596778478, + "learning_rate": 6.822463853091911e-05, + "loss": 2.3221, + "step": 1354 + }, + { + "epoch": 0.40017720023626696, + "grad_norm": 0.2328402576284461, + "learning_rate": 6.81800889285615e-05, + "loss": 2.4046, + "step": 1355 + }, + { + "epoch": 0.4004725339633786, + "grad_norm": 0.22299311903688915, + "learning_rate": 6.813552268865216e-05, + "loss": 2.269, + "step": 1356 + }, + { + "epoch": 0.4007678676904903, + "grad_norm": 0.23334516930930768, + "learning_rate": 6.8090939851976e-05, + "loss": 2.3209, + "step": 1357 + }, + { + "epoch": 0.4010632014176019, + "grad_norm": 0.2104562638734303, + "learning_rate": 6.804634045933311e-05, + "loss": 2.315, + "step": 1358 + }, + { + "epoch": 0.40135853514471354, + "grad_norm": 0.2242593774971827, + "learning_rate": 6.800172455153871e-05, + "loss": 2.3871, + "step": 1359 + }, + { + "epoch": 0.40165386887182514, + "grad_norm": 0.21956912016124935, + "learning_rate": 6.795709216942319e-05, + "loss": 2.3204, + "step": 1360 + }, + { + "epoch": 0.4019492025989368, + "grad_norm": 0.21628745257739715, + "learning_rate": 6.791244335383196e-05, + "loss": 2.3437, + "step": 1361 + }, + { + "epoch": 0.40224453632604845, + "grad_norm": 0.23669086550154927, + "learning_rate": 6.786777814562548e-05, + "loss": 2.4194, + "step": 1362 + }, + { + "epoch": 0.40253987005316005, + "grad_norm": 0.22101328163886863, + "learning_rate": 6.782309658567925e-05, + "loss": 2.2517, + "step": 1363 + }, + { + "epoch": 0.4028352037802717, + "grad_norm": 0.21890438966688974, + "learning_rate": 6.777839871488367e-05, + "loss": 2.3547, + "step": 1364 + }, + { + "epoch": 0.40313053750738337, + "grad_norm": 0.21779032072986065, + "learning_rate": 6.773368457414414e-05, + "loss": 2.3371, + "step": 1365 + }, + { + "epoch": 0.40342587123449497, + "grad_norm": 0.20512909936905804, + "learning_rate": 6.76889542043809e-05, + "loss": 2.3333, + "step": 1366 + }, + { + "epoch": 0.40372120496160663, + "grad_norm": 0.24239589585151952, + "learning_rate": 6.764420764652903e-05, + "loss": 2.3742, + "step": 1367 + }, + { + "epoch": 0.40401653868871823, + "grad_norm": 0.2186234567973797, + "learning_rate": 6.759944494153848e-05, + "loss": 2.3187, + "step": 1368 + }, + { + "epoch": 0.4043118724158299, + "grad_norm": 0.21105611080160683, + "learning_rate": 6.755466613037393e-05, + "loss": 2.2838, + "step": 1369 + }, + { + "epoch": 0.40460720614294154, + "grad_norm": 0.20910331870712812, + "learning_rate": 6.750987125401484e-05, + "loss": 2.2268, + "step": 1370 + }, + { + "epoch": 0.40490253987005315, + "grad_norm": 0.21857095952689354, + "learning_rate": 6.746506035345531e-05, + "loss": 2.3181, + "step": 1371 + }, + { + "epoch": 0.4051978735971648, + "grad_norm": 0.2212033131744065, + "learning_rate": 6.742023346970417e-05, + "loss": 2.2978, + "step": 1372 + }, + { + "epoch": 0.4054932073242764, + "grad_norm": 0.2222946308966803, + "learning_rate": 6.737539064378484e-05, + "loss": 2.3268, + "step": 1373 + }, + { + "epoch": 0.40578854105138806, + "grad_norm": 0.22130465608284758, + "learning_rate": 6.733053191673536e-05, + "loss": 2.2697, + "step": 1374 + }, + { + "epoch": 0.4060838747784997, + "grad_norm": 0.2122194489287943, + "learning_rate": 6.728565732960826e-05, + "loss": 2.3658, + "step": 1375 + }, + { + "epoch": 0.4063792085056113, + "grad_norm": 0.2302190996402733, + "learning_rate": 6.724076692347064e-05, + "loss": 2.4069, + "step": 1376 + }, + { + "epoch": 0.406674542232723, + "grad_norm": 0.22393263994735416, + "learning_rate": 6.719586073940408e-05, + "loss": 2.1966, + "step": 1377 + }, + { + "epoch": 0.40696987595983464, + "grad_norm": 0.2215674784333723, + "learning_rate": 6.715093881850458e-05, + "loss": 2.0335, + "step": 1378 + }, + { + "epoch": 0.40726520968694624, + "grad_norm": 0.2162108103433871, + "learning_rate": 6.71060012018825e-05, + "loss": 2.2438, + "step": 1379 + }, + { + "epoch": 0.4075605434140579, + "grad_norm": 0.21859117259645888, + "learning_rate": 6.706104793066265e-05, + "loss": 2.3871, + "step": 1380 + }, + { + "epoch": 0.4078558771411695, + "grad_norm": 0.21984161755490775, + "learning_rate": 6.701607904598414e-05, + "loss": 2.3105, + "step": 1381 + }, + { + "epoch": 0.40815121086828116, + "grad_norm": 0.20339221983670652, + "learning_rate": 6.697109458900032e-05, + "loss": 2.2823, + "step": 1382 + }, + { + "epoch": 0.4084465445953928, + "grad_norm": 0.22192847789429793, + "learning_rate": 6.692609460087882e-05, + "loss": 2.3575, + "step": 1383 + }, + { + "epoch": 0.4087418783225044, + "grad_norm": 0.22302776428498367, + "learning_rate": 6.688107912280149e-05, + "loss": 2.3091, + "step": 1384 + }, + { + "epoch": 0.4090372120496161, + "grad_norm": 0.40544325246739077, + "learning_rate": 6.683604819596436e-05, + "loss": 2.398, + "step": 1385 + }, + { + "epoch": 0.40933254577672773, + "grad_norm": 0.21224437407876978, + "learning_rate": 6.67910018615776e-05, + "loss": 2.3081, + "step": 1386 + }, + { + "epoch": 0.40962787950383933, + "grad_norm": 0.212353553868455, + "learning_rate": 6.674594016086544e-05, + "loss": 2.3149, + "step": 1387 + }, + { + "epoch": 0.409923213230951, + "grad_norm": 0.2291537203527535, + "learning_rate": 6.670086313506622e-05, + "loss": 2.3729, + "step": 1388 + }, + { + "epoch": 0.4102185469580626, + "grad_norm": 0.22652847758062333, + "learning_rate": 6.665577082543232e-05, + "loss": 2.3121, + "step": 1389 + }, + { + "epoch": 0.41051388068517425, + "grad_norm": 0.22911405862613876, + "learning_rate": 6.661066327323001e-05, + "loss": 2.1739, + "step": 1390 + }, + { + "epoch": 0.4108092144122859, + "grad_norm": 0.2307453435446597, + "learning_rate": 6.656554051973961e-05, + "loss": 2.223, + "step": 1391 + }, + { + "epoch": 0.4111045481393975, + "grad_norm": 0.21873827460062634, + "learning_rate": 6.652040260625532e-05, + "loss": 2.3093, + "step": 1392 + }, + { + "epoch": 0.41139988186650917, + "grad_norm": 0.21790294474244012, + "learning_rate": 6.64752495740852e-05, + "loss": 2.3278, + "step": 1393 + }, + { + "epoch": 0.41169521559362077, + "grad_norm": 0.22205847253161487, + "learning_rate": 6.643008146455114e-05, + "loss": 2.3178, + "step": 1394 + }, + { + "epoch": 0.4119905493207324, + "grad_norm": 0.20910641858957524, + "learning_rate": 6.638489831898889e-05, + "loss": 2.315, + "step": 1395 + }, + { + "epoch": 0.4122858830478441, + "grad_norm": 0.22484370737294104, + "learning_rate": 6.633970017874787e-05, + "loss": 2.2463, + "step": 1396 + }, + { + "epoch": 0.4125812167749557, + "grad_norm": 0.20878357019150132, + "learning_rate": 6.629448708519127e-05, + "loss": 2.3526, + "step": 1397 + }, + { + "epoch": 0.41287655050206734, + "grad_norm": 0.21544669249333648, + "learning_rate": 6.624925907969598e-05, + "loss": 2.2417, + "step": 1398 + }, + { + "epoch": 0.413171884229179, + "grad_norm": 0.21243939131433268, + "learning_rate": 6.620401620365249e-05, + "loss": 2.2848, + "step": 1399 + }, + { + "epoch": 0.4134672179562906, + "grad_norm": 0.28385752346375465, + "learning_rate": 6.615875849846495e-05, + "loss": 2.3024, + "step": 1400 + }, + { + "epoch": 0.41376255168340226, + "grad_norm": 0.597164749798088, + "learning_rate": 6.611348600555107e-05, + "loss": 2.3289, + "step": 1401 + }, + { + "epoch": 0.41405788541051386, + "grad_norm": 0.24931240145809724, + "learning_rate": 6.606819876634203e-05, + "loss": 2.2698, + "step": 1402 + }, + { + "epoch": 0.4143532191376255, + "grad_norm": 0.25550770068927997, + "learning_rate": 6.602289682228259e-05, + "loss": 2.3231, + "step": 1403 + }, + { + "epoch": 0.4146485528647372, + "grad_norm": 0.23011651844088635, + "learning_rate": 6.597758021483093e-05, + "loss": 2.3636, + "step": 1404 + }, + { + "epoch": 0.4149438865918488, + "grad_norm": 0.23207135576374208, + "learning_rate": 6.593224898545864e-05, + "loss": 2.2548, + "step": 1405 + }, + { + "epoch": 0.41523922031896043, + "grad_norm": 0.2184812308129351, + "learning_rate": 6.588690317565072e-05, + "loss": 2.2802, + "step": 1406 + }, + { + "epoch": 0.41553455404607204, + "grad_norm": 0.22568802773491328, + "learning_rate": 6.584154282690546e-05, + "loss": 2.3291, + "step": 1407 + }, + { + "epoch": 0.4158298877731837, + "grad_norm": 0.2185810236876692, + "learning_rate": 6.579616798073455e-05, + "loss": 2.2859, + "step": 1408 + }, + { + "epoch": 0.41612522150029535, + "grad_norm": 0.22098664833749282, + "learning_rate": 6.575077867866284e-05, + "loss": 2.3099, + "step": 1409 + }, + { + "epoch": 0.41642055522740695, + "grad_norm": 0.22960818179749534, + "learning_rate": 6.570537496222847e-05, + "loss": 2.3291, + "step": 1410 + }, + { + "epoch": 0.4167158889545186, + "grad_norm": 0.21008447960222212, + "learning_rate": 6.565995687298278e-05, + "loss": 2.1427, + "step": 1411 + }, + { + "epoch": 0.41701122268163027, + "grad_norm": 0.23487581152214923, + "learning_rate": 6.561452445249021e-05, + "loss": 2.3244, + "step": 1412 + }, + { + "epoch": 0.41730655640874187, + "grad_norm": 0.2310767639654529, + "learning_rate": 6.556907774232837e-05, + "loss": 2.293, + "step": 1413 + }, + { + "epoch": 0.4176018901358535, + "grad_norm": 0.20536263122391865, + "learning_rate": 6.55236167840879e-05, + "loss": 2.3385, + "step": 1414 + }, + { + "epoch": 0.41789722386296513, + "grad_norm": 0.24168386699588745, + "learning_rate": 6.547814161937255e-05, + "loss": 2.2369, + "step": 1415 + }, + { + "epoch": 0.4181925575900768, + "grad_norm": 0.23862473947991386, + "learning_rate": 6.543265228979898e-05, + "loss": 2.3514, + "step": 1416 + }, + { + "epoch": 0.41848789131718844, + "grad_norm": 0.2314960810353291, + "learning_rate": 6.538714883699687e-05, + "loss": 2.3809, + "step": 1417 + }, + { + "epoch": 0.41878322504430004, + "grad_norm": 0.39577791386526173, + "learning_rate": 6.534163130260885e-05, + "loss": 2.3534, + "step": 1418 + }, + { + "epoch": 0.4190785587714117, + "grad_norm": 0.22435590290842183, + "learning_rate": 6.529609972829034e-05, + "loss": 2.3456, + "step": 1419 + }, + { + "epoch": 0.4193738924985233, + "grad_norm": 0.23110401565365482, + "learning_rate": 6.525055415570971e-05, + "loss": 2.2529, + "step": 1420 + }, + { + "epoch": 0.41966922622563496, + "grad_norm": 0.23160666462873777, + "learning_rate": 6.520499462654808e-05, + "loss": 2.4899, + "step": 1421 + }, + { + "epoch": 0.4199645599527466, + "grad_norm": 0.2216884006350742, + "learning_rate": 6.515942118249938e-05, + "loss": 2.2806, + "step": 1422 + }, + { + "epoch": 0.4202598936798582, + "grad_norm": 0.2283516238290581, + "learning_rate": 6.511383386527023e-05, + "loss": 2.329, + "step": 1423 + }, + { + "epoch": 0.4205552274069699, + "grad_norm": 0.211834774038759, + "learning_rate": 6.506823271658e-05, + "loss": 2.3022, + "step": 1424 + }, + { + "epoch": 0.42085056113408154, + "grad_norm": 0.21308349482100417, + "learning_rate": 6.502261777816069e-05, + "loss": 2.3008, + "step": 1425 + }, + { + "epoch": 0.42114589486119314, + "grad_norm": 0.22899199822867575, + "learning_rate": 6.49769890917569e-05, + "loss": 2.3909, + "step": 1426 + }, + { + "epoch": 0.4214412285883048, + "grad_norm": 0.27059483419479335, + "learning_rate": 6.493134669912583e-05, + "loss": 2.3515, + "step": 1427 + }, + { + "epoch": 0.4217365623154164, + "grad_norm": 0.22149690999616203, + "learning_rate": 6.488569064203724e-05, + "loss": 2.2739, + "step": 1428 + }, + { + "epoch": 0.42203189604252805, + "grad_norm": 0.21652477819367302, + "learning_rate": 6.484002096227336e-05, + "loss": 2.3438, + "step": 1429 + }, + { + "epoch": 0.4223272297696397, + "grad_norm": 0.22191888449086244, + "learning_rate": 6.479433770162893e-05, + "loss": 2.3586, + "step": 1430 + }, + { + "epoch": 0.4226225634967513, + "grad_norm": 0.2574086783561908, + "learning_rate": 6.474864090191108e-05, + "loss": 2.2967, + "step": 1431 + }, + { + "epoch": 0.42291789722386297, + "grad_norm": 0.22206117347827842, + "learning_rate": 6.470293060493933e-05, + "loss": 2.273, + "step": 1432 + }, + { + "epoch": 0.42321323095097463, + "grad_norm": 0.23807853142701838, + "learning_rate": 6.465720685254563e-05, + "loss": 2.3121, + "step": 1433 + }, + { + "epoch": 0.42350856467808623, + "grad_norm": 0.22685857040774407, + "learning_rate": 6.46114696865741e-05, + "loss": 2.2945, + "step": 1434 + }, + { + "epoch": 0.4238038984051979, + "grad_norm": 0.22285050568259, + "learning_rate": 6.456571914888128e-05, + "loss": 2.3101, + "step": 1435 + }, + { + "epoch": 0.4240992321323095, + "grad_norm": 0.21173078988388383, + "learning_rate": 6.451995528133583e-05, + "loss": 2.3033, + "step": 1436 + }, + { + "epoch": 0.42439456585942115, + "grad_norm": 0.21121749132619178, + "learning_rate": 6.447417812581871e-05, + "loss": 2.2108, + "step": 1437 + }, + { + "epoch": 0.4246898995865328, + "grad_norm": 0.296653062279907, + "learning_rate": 6.442838772422295e-05, + "loss": 2.423, + "step": 1438 + }, + { + "epoch": 0.4249852333136444, + "grad_norm": 0.2403072153708352, + "learning_rate": 6.43825841184538e-05, + "loss": 2.3537, + "step": 1439 + }, + { + "epoch": 0.42528056704075606, + "grad_norm": 0.22581057077645356, + "learning_rate": 6.433676735042848e-05, + "loss": 2.3704, + "step": 1440 + }, + { + "epoch": 0.42557590076786767, + "grad_norm": 0.2205402566644421, + "learning_rate": 6.429093746207635e-05, + "loss": 2.3109, + "step": 1441 + }, + { + "epoch": 0.4258712344949793, + "grad_norm": 0.2153545808987101, + "learning_rate": 6.424509449533874e-05, + "loss": 2.2699, + "step": 1442 + }, + { + "epoch": 0.426166568222091, + "grad_norm": 0.23089487477372367, + "learning_rate": 6.419923849216893e-05, + "loss": 2.3817, + "step": 1443 + }, + { + "epoch": 0.4264619019492026, + "grad_norm": 0.2209750753362445, + "learning_rate": 6.415336949453217e-05, + "loss": 2.3593, + "step": 1444 + }, + { + "epoch": 0.42675723567631424, + "grad_norm": 0.22855864225912767, + "learning_rate": 6.410748754440556e-05, + "loss": 2.3446, + "step": 1445 + }, + { + "epoch": 0.4270525694034259, + "grad_norm": 0.21875218833101237, + "learning_rate": 6.406159268377811e-05, + "loss": 2.3779, + "step": 1446 + }, + { + "epoch": 0.4273479031305375, + "grad_norm": 0.22266357811362852, + "learning_rate": 6.40156849546506e-05, + "loss": 2.3229, + "step": 1447 + }, + { + "epoch": 0.42764323685764916, + "grad_norm": 0.22806890137570102, + "learning_rate": 6.396976439903559e-05, + "loss": 2.3868, + "step": 1448 + }, + { + "epoch": 0.42793857058476076, + "grad_norm": 0.2203952955003282, + "learning_rate": 6.392383105895739e-05, + "loss": 2.3425, + "step": 1449 + }, + { + "epoch": 0.4282339043118724, + "grad_norm": 0.22250287311284833, + "learning_rate": 6.387788497645198e-05, + "loss": 2.4052, + "step": 1450 + }, + { + "epoch": 0.4285292380389841, + "grad_norm": 0.22483852141810998, + "learning_rate": 6.383192619356706e-05, + "loss": 2.3311, + "step": 1451 + }, + { + "epoch": 0.4288245717660957, + "grad_norm": 0.23674507246559237, + "learning_rate": 6.378595475236191e-05, + "loss": 2.3476, + "step": 1452 + }, + { + "epoch": 0.42911990549320733, + "grad_norm": 0.21570403647722422, + "learning_rate": 6.37399706949074e-05, + "loss": 2.3882, + "step": 1453 + }, + { + "epoch": 0.42941523922031893, + "grad_norm": 0.2098194088882128, + "learning_rate": 6.369397406328596e-05, + "loss": 2.299, + "step": 1454 + }, + { + "epoch": 0.4297105729474306, + "grad_norm": 0.2131100320174511, + "learning_rate": 6.364796489959151e-05, + "loss": 2.2726, + "step": 1455 + }, + { + "epoch": 0.43000590667454225, + "grad_norm": 0.220970599380801, + "learning_rate": 6.360194324592945e-05, + "loss": 2.3822, + "step": 1456 + }, + { + "epoch": 0.43030124040165385, + "grad_norm": 0.2329681852015321, + "learning_rate": 6.355590914441661e-05, + "loss": 2.3255, + "step": 1457 + }, + { + "epoch": 0.4305965741287655, + "grad_norm": 0.21068106181125473, + "learning_rate": 6.350986263718117e-05, + "loss": 2.2735, + "step": 1458 + }, + { + "epoch": 0.43089190785587717, + "grad_norm": 0.20384548860446222, + "learning_rate": 6.346380376636275e-05, + "loss": 2.2814, + "step": 1459 + }, + { + "epoch": 0.43118724158298877, + "grad_norm": 0.21896419086449134, + "learning_rate": 6.34177325741122e-05, + "loss": 2.3529, + "step": 1460 + }, + { + "epoch": 0.4314825753101004, + "grad_norm": 0.21499650033331447, + "learning_rate": 6.337164910259172e-05, + "loss": 2.3325, + "step": 1461 + }, + { + "epoch": 0.431777909037212, + "grad_norm": 0.21720024764433682, + "learning_rate": 6.33255533939747e-05, + "loss": 2.3036, + "step": 1462 + }, + { + "epoch": 0.4320732427643237, + "grad_norm": 0.23401198088967007, + "learning_rate": 6.32794454904457e-05, + "loss": 2.3871, + "step": 1463 + }, + { + "epoch": 0.43236857649143534, + "grad_norm": 0.209127653330281, + "learning_rate": 6.323332543420051e-05, + "loss": 2.3632, + "step": 1464 + }, + { + "epoch": 0.43266391021854694, + "grad_norm": 0.22131922483474878, + "learning_rate": 6.318719326744598e-05, + "loss": 2.3602, + "step": 1465 + }, + { + "epoch": 0.4329592439456586, + "grad_norm": 0.22452940139062602, + "learning_rate": 6.314104903240011e-05, + "loss": 2.1821, + "step": 1466 + }, + { + "epoch": 0.43325457767277026, + "grad_norm": 0.25125956628626883, + "learning_rate": 6.309489277129189e-05, + "loss": 2.3124, + "step": 1467 + }, + { + "epoch": 0.43354991139988186, + "grad_norm": 0.22068911482642314, + "learning_rate": 6.304872452636133e-05, + "loss": 2.3197, + "step": 1468 + }, + { + "epoch": 0.4338452451269935, + "grad_norm": 0.3253934674636865, + "learning_rate": 6.300254433985942e-05, + "loss": 2.2521, + "step": 1469 + }, + { + "epoch": 0.4341405788541051, + "grad_norm": 0.21272682748126812, + "learning_rate": 6.295635225404806e-05, + "loss": 2.3134, + "step": 1470 + }, + { + "epoch": 0.4344359125812168, + "grad_norm": 0.23290723722855947, + "learning_rate": 6.291014831120004e-05, + "loss": 2.3131, + "step": 1471 + }, + { + "epoch": 0.43473124630832843, + "grad_norm": 0.23399999340324312, + "learning_rate": 6.286393255359901e-05, + "loss": 2.316, + "step": 1472 + }, + { + "epoch": 0.43502658003544004, + "grad_norm": 0.23506888353990518, + "learning_rate": 6.281770502353946e-05, + "loss": 2.3925, + "step": 1473 + }, + { + "epoch": 0.4353219137625517, + "grad_norm": 0.2229598635564752, + "learning_rate": 6.277146576332657e-05, + "loss": 2.3428, + "step": 1474 + }, + { + "epoch": 0.4356172474896633, + "grad_norm": 0.22511464383781413, + "learning_rate": 6.272521481527637e-05, + "loss": 2.3065, + "step": 1475 + }, + { + "epoch": 0.43591258121677495, + "grad_norm": 0.2132111944346833, + "learning_rate": 6.267895222171548e-05, + "loss": 2.2666, + "step": 1476 + }, + { + "epoch": 0.4362079149438866, + "grad_norm": 0.21897989352183078, + "learning_rate": 6.263267802498125e-05, + "loss": 2.3145, + "step": 1477 + }, + { + "epoch": 0.4365032486709982, + "grad_norm": 0.2138537311239835, + "learning_rate": 6.25863922674216e-05, + "loss": 2.219, + "step": 1478 + }, + { + "epoch": 0.43679858239810987, + "grad_norm": 0.2245040978544101, + "learning_rate": 6.254009499139506e-05, + "loss": 2.262, + "step": 1479 + }, + { + "epoch": 0.4370939161252215, + "grad_norm": 0.23836554927824974, + "learning_rate": 6.24937862392707e-05, + "loss": 2.318, + "step": 1480 + }, + { + "epoch": 0.43738924985233313, + "grad_norm": 0.21866451023424913, + "learning_rate": 6.244746605342808e-05, + "loss": 2.4199, + "step": 1481 + }, + { + "epoch": 0.4376845835794448, + "grad_norm": 0.22740554985545644, + "learning_rate": 6.24011344762572e-05, + "loss": 2.1876, + "step": 1482 + }, + { + "epoch": 0.4379799173065564, + "grad_norm": 0.21292799875320684, + "learning_rate": 6.235479155015857e-05, + "loss": 2.3052, + "step": 1483 + }, + { + "epoch": 0.43827525103366805, + "grad_norm": 0.21697396271377797, + "learning_rate": 6.230843731754299e-05, + "loss": 2.2943, + "step": 1484 + }, + { + "epoch": 0.4385705847607797, + "grad_norm": 0.27252551991716073, + "learning_rate": 6.226207182083165e-05, + "loss": 2.3133, + "step": 1485 + }, + { + "epoch": 0.4388659184878913, + "grad_norm": 0.21965615734012098, + "learning_rate": 6.221569510245608e-05, + "loss": 2.2561, + "step": 1486 + }, + { + "epoch": 0.43916125221500296, + "grad_norm": 0.23995934226443352, + "learning_rate": 6.216930720485798e-05, + "loss": 2.2878, + "step": 1487 + }, + { + "epoch": 0.43945658594211456, + "grad_norm": 0.23099215121528108, + "learning_rate": 6.21229081704894e-05, + "loss": 2.2657, + "step": 1488 + }, + { + "epoch": 0.4397519196692262, + "grad_norm": 0.25145992533494027, + "learning_rate": 6.207649804181251e-05, + "loss": 2.3454, + "step": 1489 + }, + { + "epoch": 0.4400472533963379, + "grad_norm": 0.21410710886296294, + "learning_rate": 6.203007686129962e-05, + "loss": 2.3665, + "step": 1490 + }, + { + "epoch": 0.4403425871234495, + "grad_norm": 0.22034185937631123, + "learning_rate": 6.198364467143324e-05, + "loss": 2.2482, + "step": 1491 + }, + { + "epoch": 0.44063792085056114, + "grad_norm": 0.21660409552672757, + "learning_rate": 6.193720151470587e-05, + "loss": 2.3629, + "step": 1492 + }, + { + "epoch": 0.4409332545776728, + "grad_norm": 0.22792330212380665, + "learning_rate": 6.189074743362006e-05, + "loss": 2.335, + "step": 1493 + }, + { + "epoch": 0.4412285883047844, + "grad_norm": 0.22644209951108962, + "learning_rate": 6.18442824706884e-05, + "loss": 2.3065, + "step": 1494 + }, + { + "epoch": 0.44152392203189605, + "grad_norm": 0.4551895889188202, + "learning_rate": 6.179780666843339e-05, + "loss": 2.3104, + "step": 1495 + }, + { + "epoch": 0.44181925575900766, + "grad_norm": 0.21940745599956138, + "learning_rate": 6.17513200693875e-05, + "loss": 2.3054, + "step": 1496 + }, + { + "epoch": 0.4421145894861193, + "grad_norm": 0.21496975411536745, + "learning_rate": 6.170482271609301e-05, + "loss": 2.4207, + "step": 1497 + }, + { + "epoch": 0.44240992321323097, + "grad_norm": 0.2144051670168875, + "learning_rate": 6.165831465110213e-05, + "loss": 2.3146, + "step": 1498 + }, + { + "epoch": 0.4427052569403426, + "grad_norm": 0.23069181800155858, + "learning_rate": 6.16117959169768e-05, + "loss": 2.2688, + "step": 1499 + }, + { + "epoch": 0.44300059066745423, + "grad_norm": 0.2245495758064101, + "learning_rate": 6.156526655628876e-05, + "loss": 2.3094, + "step": 1500 + }, + { + "epoch": 0.44329592439456583, + "grad_norm": 0.2299039839051765, + "learning_rate": 6.151872661161945e-05, + "loss": 2.3469, + "step": 1501 + }, + { + "epoch": 0.4435912581216775, + "grad_norm": 0.21163406860402384, + "learning_rate": 6.147217612556002e-05, + "loss": 2.277, + "step": 1502 + }, + { + "epoch": 0.44388659184878915, + "grad_norm": 0.21475798021894416, + "learning_rate": 6.142561514071125e-05, + "loss": 2.2959, + "step": 1503 + }, + { + "epoch": 0.44418192557590075, + "grad_norm": 0.2125436948283001, + "learning_rate": 6.137904369968357e-05, + "loss": 2.2334, + "step": 1504 + }, + { + "epoch": 0.4444772593030124, + "grad_norm": 1.0025420066423139, + "learning_rate": 6.13324618450969e-05, + "loss": 2.2986, + "step": 1505 + }, + { + "epoch": 0.44477259303012406, + "grad_norm": 0.21890884830803412, + "learning_rate": 6.128586961958077e-05, + "loss": 2.3198, + "step": 1506 + }, + { + "epoch": 0.44506792675723567, + "grad_norm": 0.23118656519154002, + "learning_rate": 6.123926706577416e-05, + "loss": 2.2444, + "step": 1507 + }, + { + "epoch": 0.4453632604843473, + "grad_norm": 0.21232892225366928, + "learning_rate": 6.119265422632548e-05, + "loss": 2.3243, + "step": 1508 + }, + { + "epoch": 0.4456585942114589, + "grad_norm": 0.2212425156044023, + "learning_rate": 6.114603114389261e-05, + "loss": 2.3046, + "step": 1509 + }, + { + "epoch": 0.4459539279385706, + "grad_norm": 0.21171532378292476, + "learning_rate": 6.109939786114277e-05, + "loss": 2.3158, + "step": 1510 + }, + { + "epoch": 0.44624926166568224, + "grad_norm": 0.22519287114668654, + "learning_rate": 6.105275442075251e-05, + "loss": 2.3115, + "step": 1511 + }, + { + "epoch": 0.44654459539279384, + "grad_norm": 0.27734426041677007, + "learning_rate": 6.10061008654077e-05, + "loss": 2.3989, + "step": 1512 + }, + { + "epoch": 0.4468399291199055, + "grad_norm": 0.23017703198824244, + "learning_rate": 6.095943723780344e-05, + "loss": 2.3418, + "step": 1513 + }, + { + "epoch": 0.44713526284701716, + "grad_norm": 0.22272506286304636, + "learning_rate": 6.091276358064408e-05, + "loss": 2.2833, + "step": 1514 + }, + { + "epoch": 0.44743059657412876, + "grad_norm": 0.2141198794876711, + "learning_rate": 6.08660799366431e-05, + "loss": 2.43, + "step": 1515 + }, + { + "epoch": 0.4477259303012404, + "grad_norm": 0.21607661670994494, + "learning_rate": 6.081938634852317e-05, + "loss": 2.3765, + "step": 1516 + }, + { + "epoch": 0.448021264028352, + "grad_norm": 0.21750740942995256, + "learning_rate": 6.077268285901603e-05, + "loss": 2.36, + "step": 1517 + }, + { + "epoch": 0.4483165977554637, + "grad_norm": 0.2388554206002271, + "learning_rate": 6.072596951086248e-05, + "loss": 2.4329, + "step": 1518 + }, + { + "epoch": 0.44861193148257533, + "grad_norm": 3.0344557966481442, + "learning_rate": 6.067924634681237e-05, + "loss": 2.3333, + "step": 1519 + }, + { + "epoch": 0.44890726520968693, + "grad_norm": 0.21393768681421352, + "learning_rate": 6.0632513409624505e-05, + "loss": 2.3242, + "step": 1520 + }, + { + "epoch": 0.4492025989367986, + "grad_norm": 0.22821035943714385, + "learning_rate": 6.0585770742066635e-05, + "loss": 2.3107, + "step": 1521 + }, + { + "epoch": 0.4494979326639102, + "grad_norm": 0.23272192844583317, + "learning_rate": 6.053901838691545e-05, + "loss": 2.3286, + "step": 1522 + }, + { + "epoch": 0.44979326639102185, + "grad_norm": 0.21762991139962748, + "learning_rate": 6.049225638695646e-05, + "loss": 2.1094, + "step": 1523 + }, + { + "epoch": 0.4500886001181335, + "grad_norm": 0.22001805785422746, + "learning_rate": 6.044548478498402e-05, + "loss": 2.3343, + "step": 1524 + }, + { + "epoch": 0.4503839338452451, + "grad_norm": 0.24097572472448803, + "learning_rate": 6.039870362380128e-05, + "loss": 2.3612, + "step": 1525 + }, + { + "epoch": 0.45067926757235677, + "grad_norm": 0.22635607523463888, + "learning_rate": 6.0351912946220126e-05, + "loss": 2.1512, + "step": 1526 + }, + { + "epoch": 0.4509746012994684, + "grad_norm": 0.2440914943851154, + "learning_rate": 6.030511279506116e-05, + "loss": 2.3076, + "step": 1527 + }, + { + "epoch": 0.45126993502658, + "grad_norm": 0.2528048173941868, + "learning_rate": 6.0258303213153666e-05, + "loss": 2.2819, + "step": 1528 + }, + { + "epoch": 0.4515652687536917, + "grad_norm": 0.2398576108052417, + "learning_rate": 6.0211484243335524e-05, + "loss": 2.2523, + "step": 1529 + }, + { + "epoch": 0.4518606024808033, + "grad_norm": 0.8791761534903682, + "learning_rate": 6.016465592845323e-05, + "loss": 2.2816, + "step": 1530 + }, + { + "epoch": 0.45215593620791494, + "grad_norm": 0.2387784554465628, + "learning_rate": 6.011781831136183e-05, + "loss": 2.379, + "step": 1531 + }, + { + "epoch": 0.4524512699350266, + "grad_norm": 0.24484896327483402, + "learning_rate": 6.0070971434924885e-05, + "loss": 2.2427, + "step": 1532 + }, + { + "epoch": 0.4527466036621382, + "grad_norm": 0.2643616730913493, + "learning_rate": 6.002411534201441e-05, + "loss": 2.3081, + "step": 1533 + }, + { + "epoch": 0.45304193738924986, + "grad_norm": 0.2977328579858246, + "learning_rate": 5.9977250075510896e-05, + "loss": 2.3311, + "step": 1534 + }, + { + "epoch": 0.45333727111636146, + "grad_norm": 0.2657247524325999, + "learning_rate": 5.993037567830319e-05, + "loss": 2.2767, + "step": 1535 + }, + { + "epoch": 0.4536326048434731, + "grad_norm": 0.260913521127453, + "learning_rate": 5.9883492193288516e-05, + "loss": 2.3645, + "step": 1536 + }, + { + "epoch": 0.4539279385705848, + "grad_norm": 0.5873762396056349, + "learning_rate": 5.983659966337239e-05, + "loss": 2.3691, + "step": 1537 + }, + { + "epoch": 0.4542232722976964, + "grad_norm": 0.24017157935693323, + "learning_rate": 5.978969813146863e-05, + "loss": 2.3265, + "step": 1538 + }, + { + "epoch": 0.45451860602480804, + "grad_norm": 0.2452207323808244, + "learning_rate": 5.9742787640499295e-05, + "loss": 2.227, + "step": 1539 + }, + { + "epoch": 0.4548139397519197, + "grad_norm": 0.24552440632753308, + "learning_rate": 5.9695868233394614e-05, + "loss": 2.354, + "step": 1540 + }, + { + "epoch": 0.4551092734790313, + "grad_norm": 0.23216622982944798, + "learning_rate": 5.9648939953093016e-05, + "loss": 2.3039, + "step": 1541 + }, + { + "epoch": 0.45540460720614295, + "grad_norm": 0.23735204467443202, + "learning_rate": 5.9602002842541005e-05, + "loss": 2.3249, + "step": 1542 + }, + { + "epoch": 0.45569994093325455, + "grad_norm": 0.22812415279596904, + "learning_rate": 5.955505694469322e-05, + "loss": 2.3653, + "step": 1543 + }, + { + "epoch": 0.4559952746603662, + "grad_norm": 1.0367648886997538, + "learning_rate": 5.9508102302512294e-05, + "loss": 2.2787, + "step": 1544 + }, + { + "epoch": 0.45629060838747787, + "grad_norm": 0.22768835973668933, + "learning_rate": 5.9461138958968856e-05, + "loss": 2.3245, + "step": 1545 + }, + { + "epoch": 0.45658594211458947, + "grad_norm": 0.23228801751513553, + "learning_rate": 5.941416695704155e-05, + "loss": 2.3459, + "step": 1546 + }, + { + "epoch": 0.45688127584170113, + "grad_norm": 0.25312748837130933, + "learning_rate": 5.936718633971692e-05, + "loss": 2.3993, + "step": 1547 + }, + { + "epoch": 0.45717660956881273, + "grad_norm": 0.22552439853472567, + "learning_rate": 5.9320197149989376e-05, + "loss": 2.353, + "step": 1548 + }, + { + "epoch": 0.4574719432959244, + "grad_norm": 0.23174041734202047, + "learning_rate": 5.927319943086118e-05, + "loss": 2.387, + "step": 1549 + }, + { + "epoch": 0.45776727702303605, + "grad_norm": 0.23381192598269884, + "learning_rate": 5.922619322534245e-05, + "loss": 2.3074, + "step": 1550 + }, + { + "epoch": 0.45806261075014765, + "grad_norm": 0.2180085303126105, + "learning_rate": 5.917917857645098e-05, + "loss": 2.3598, + "step": 1551 + }, + { + "epoch": 0.4583579444772593, + "grad_norm": 0.2301287972521856, + "learning_rate": 5.913215552721234e-05, + "loss": 2.2436, + "step": 1552 + }, + { + "epoch": 0.45865327820437096, + "grad_norm": 0.21330331471649053, + "learning_rate": 5.908512412065981e-05, + "loss": 2.2061, + "step": 1553 + }, + { + "epoch": 0.45894861193148256, + "grad_norm": 0.23112023134144974, + "learning_rate": 5.903808439983428e-05, + "loss": 2.094, + "step": 1554 + }, + { + "epoch": 0.4592439456585942, + "grad_norm": 0.2181617627411117, + "learning_rate": 5.899103640778426e-05, + "loss": 2.2633, + "step": 1555 + }, + { + "epoch": 0.4595392793857058, + "grad_norm": 0.2263681136894862, + "learning_rate": 5.894398018756584e-05, + "loss": 2.2483, + "step": 1556 + }, + { + "epoch": 0.4598346131128175, + "grad_norm": 0.207668556109939, + "learning_rate": 5.8896915782242634e-05, + "loss": 2.2606, + "step": 1557 + }, + { + "epoch": 0.46012994683992914, + "grad_norm": 0.21318859281458333, + "learning_rate": 5.884984323488574e-05, + "loss": 2.2045, + "step": 1558 + }, + { + "epoch": 0.46042528056704074, + "grad_norm": 0.22306515139444003, + "learning_rate": 5.880276258857371e-05, + "loss": 2.3471, + "step": 1559 + }, + { + "epoch": 0.4607206142941524, + "grad_norm": 0.22116113923001904, + "learning_rate": 5.875567388639251e-05, + "loss": 2.2986, + "step": 1560 + }, + { + "epoch": 0.46101594802126405, + "grad_norm": 0.22268940862736514, + "learning_rate": 5.8708577171435483e-05, + "loss": 2.2903, + "step": 1561 + }, + { + "epoch": 0.46131128174837566, + "grad_norm": 0.2206792058895894, + "learning_rate": 5.866147248680329e-05, + "loss": 2.3548, + "step": 1562 + }, + { + "epoch": 0.4616066154754873, + "grad_norm": 0.210662353285171, + "learning_rate": 5.861435987560391e-05, + "loss": 2.398, + "step": 1563 + }, + { + "epoch": 0.4619019492025989, + "grad_norm": 0.28295693055297017, + "learning_rate": 5.856723938095256e-05, + "loss": 2.3392, + "step": 1564 + }, + { + "epoch": 0.4621972829297106, + "grad_norm": 0.21445528804056466, + "learning_rate": 5.852011104597166e-05, + "loss": 2.2697, + "step": 1565 + }, + { + "epoch": 0.46249261665682223, + "grad_norm": 0.2234489651386081, + "learning_rate": 5.847297491379082e-05, + "loss": 2.3097, + "step": 1566 + }, + { + "epoch": 0.46278795038393383, + "grad_norm": 0.6551306351490472, + "learning_rate": 5.8425831027546796e-05, + "loss": 2.5308, + "step": 1567 + }, + { + "epoch": 0.4630832841110455, + "grad_norm": 0.22395798309801498, + "learning_rate": 5.8378679430383424e-05, + "loss": 2.4397, + "step": 1568 + }, + { + "epoch": 0.4633786178381571, + "grad_norm": 0.2160989162740493, + "learning_rate": 5.833152016545158e-05, + "loss": 2.3472, + "step": 1569 + }, + { + "epoch": 0.46367395156526875, + "grad_norm": 0.2194792648614544, + "learning_rate": 5.82843532759092e-05, + "loss": 2.3367, + "step": 1570 + }, + { + "epoch": 0.4639692852923804, + "grad_norm": 0.2272096139883521, + "learning_rate": 5.823717880492118e-05, + "loss": 2.2651, + "step": 1571 + }, + { + "epoch": 0.464264619019492, + "grad_norm": 0.2196308351315834, + "learning_rate": 5.818999679565933e-05, + "loss": 2.3754, + "step": 1572 + }, + { + "epoch": 0.46455995274660367, + "grad_norm": 0.22943461081416675, + "learning_rate": 5.814280729130239e-05, + "loss": 2.3286, + "step": 1573 + }, + { + "epoch": 0.4648552864737153, + "grad_norm": 0.21771583308454295, + "learning_rate": 5.809561033503594e-05, + "loss": 2.3934, + "step": 1574 + }, + { + "epoch": 0.4651506202008269, + "grad_norm": 0.20759608034947483, + "learning_rate": 5.804840597005241e-05, + "loss": 2.3077, + "step": 1575 + }, + { + "epoch": 0.4654459539279386, + "grad_norm": 0.23753708970176424, + "learning_rate": 5.8001194239550946e-05, + "loss": 2.3501, + "step": 1576 + }, + { + "epoch": 0.4657412876550502, + "grad_norm": 0.21980180935410007, + "learning_rate": 5.795397518673752e-05, + "loss": 2.3219, + "step": 1577 + }, + { + "epoch": 0.46603662138216184, + "grad_norm": 0.22150275441764902, + "learning_rate": 5.790674885482473e-05, + "loss": 2.3391, + "step": 1578 + }, + { + "epoch": 0.4663319551092735, + "grad_norm": 0.21912296040375134, + "learning_rate": 5.7859515287031885e-05, + "loss": 2.3688, + "step": 1579 + }, + { + "epoch": 0.4666272888363851, + "grad_norm": 0.22831193698110847, + "learning_rate": 5.7812274526584884e-05, + "loss": 2.3752, + "step": 1580 + }, + { + "epoch": 0.46692262256349676, + "grad_norm": 0.21249418355891428, + "learning_rate": 5.776502661671624e-05, + "loss": 2.2955, + "step": 1581 + }, + { + "epoch": 0.46721795629060836, + "grad_norm": 0.21623857454204615, + "learning_rate": 5.771777160066495e-05, + "loss": 2.3307, + "step": 1582 + }, + { + "epoch": 0.46751329001772, + "grad_norm": 2.4782426111085574, + "learning_rate": 5.767050952167659e-05, + "loss": 2.1994, + "step": 1583 + }, + { + "epoch": 0.4678086237448317, + "grad_norm": 0.22349898319365116, + "learning_rate": 5.7623240423003155e-05, + "loss": 2.3466, + "step": 1584 + }, + { + "epoch": 0.4681039574719433, + "grad_norm": 0.21911005633112987, + "learning_rate": 5.757596434790307e-05, + "loss": 2.3163, + "step": 1585 + }, + { + "epoch": 0.46839929119905493, + "grad_norm": 0.21066359998247983, + "learning_rate": 5.752868133964114e-05, + "loss": 2.3459, + "step": 1586 + }, + { + "epoch": 0.4686946249261666, + "grad_norm": 0.21164886299454666, + "learning_rate": 5.7481391441488565e-05, + "loss": 2.3608, + "step": 1587 + }, + { + "epoch": 0.4689899586532782, + "grad_norm": 0.22269398386260653, + "learning_rate": 5.743409469672276e-05, + "loss": 2.2611, + "step": 1588 + }, + { + "epoch": 0.46928529238038985, + "grad_norm": 0.24886375992509824, + "learning_rate": 5.738679114862748e-05, + "loss": 2.3216, + "step": 1589 + }, + { + "epoch": 0.46958062610750145, + "grad_norm": 0.23057981013461448, + "learning_rate": 5.733948084049267e-05, + "loss": 2.3344, + "step": 1590 + }, + { + "epoch": 0.4698759598346131, + "grad_norm": 0.2027130167153171, + "learning_rate": 5.729216381561446e-05, + "loss": 2.2021, + "step": 1591 + }, + { + "epoch": 0.47017129356172477, + "grad_norm": 0.22233144593390916, + "learning_rate": 5.724484011729515e-05, + "loss": 2.2843, + "step": 1592 + }, + { + "epoch": 0.47046662728883637, + "grad_norm": 0.23242588200040212, + "learning_rate": 5.719750978884314e-05, + "loss": 2.3836, + "step": 1593 + }, + { + "epoch": 0.470761961015948, + "grad_norm": 0.22104273466996147, + "learning_rate": 5.7150172873572906e-05, + "loss": 2.3128, + "step": 1594 + }, + { + "epoch": 0.4710572947430597, + "grad_norm": 0.21129535413907144, + "learning_rate": 5.710282941480491e-05, + "loss": 2.3919, + "step": 1595 + }, + { + "epoch": 0.4713526284701713, + "grad_norm": 0.2128567887461109, + "learning_rate": 5.705547945586562e-05, + "loss": 2.3011, + "step": 1596 + }, + { + "epoch": 0.47164796219728294, + "grad_norm": 0.21720614764892496, + "learning_rate": 5.7008123040087505e-05, + "loss": 2.2858, + "step": 1597 + }, + { + "epoch": 0.47194329592439455, + "grad_norm": 0.21598167773526553, + "learning_rate": 5.696076021080887e-05, + "loss": 2.3518, + "step": 1598 + }, + { + "epoch": 0.4722386296515062, + "grad_norm": 0.2137994789695948, + "learning_rate": 5.691339101137395e-05, + "loss": 2.2796, + "step": 1599 + }, + { + "epoch": 0.47253396337861786, + "grad_norm": 0.2254725662643394, + "learning_rate": 5.686601548513276e-05, + "loss": 2.3146, + "step": 1600 + }, + { + "epoch": 0.47282929710572946, + "grad_norm": 0.2059551197323242, + "learning_rate": 5.681863367544114e-05, + "loss": 2.2957, + "step": 1601 + }, + { + "epoch": 0.4731246308328411, + "grad_norm": 0.22391603684049882, + "learning_rate": 5.677124562566065e-05, + "loss": 2.2888, + "step": 1602 + }, + { + "epoch": 0.4734199645599527, + "grad_norm": 0.21219608333064638, + "learning_rate": 5.672385137915858e-05, + "loss": 2.2872, + "step": 1603 + }, + { + "epoch": 0.4737152982870644, + "grad_norm": 0.22303830379178124, + "learning_rate": 5.667645097930789e-05, + "loss": 2.4452, + "step": 1604 + }, + { + "epoch": 0.47401063201417604, + "grad_norm": 0.21003535526544811, + "learning_rate": 5.662904446948717e-05, + "loss": 2.3833, + "step": 1605 + }, + { + "epoch": 0.47430596574128764, + "grad_norm": 0.2229820177211462, + "learning_rate": 5.65816318930806e-05, + "loss": 2.2968, + "step": 1606 + }, + { + "epoch": 0.4746012994683993, + "grad_norm": 0.22132849918472539, + "learning_rate": 5.653421329347791e-05, + "loss": 2.2909, + "step": 1607 + }, + { + "epoch": 0.47489663319551095, + "grad_norm": 0.2210058566645439, + "learning_rate": 5.648678871407436e-05, + "loss": 2.3552, + "step": 1608 + }, + { + "epoch": 0.47519196692262256, + "grad_norm": 0.213867440071131, + "learning_rate": 5.6439358198270645e-05, + "loss": 2.3052, + "step": 1609 + }, + { + "epoch": 0.4754873006497342, + "grad_norm": 0.21136485184334453, + "learning_rate": 5.639192178947292e-05, + "loss": 2.33, + "step": 1610 + }, + { + "epoch": 0.4757826343768458, + "grad_norm": 0.2161110520624959, + "learning_rate": 5.634447953109274e-05, + "loss": 2.3393, + "step": 1611 + }, + { + "epoch": 0.47607796810395747, + "grad_norm": 0.21415866515566362, + "learning_rate": 5.629703146654699e-05, + "loss": 2.2899, + "step": 1612 + }, + { + "epoch": 0.47637330183106913, + "grad_norm": 0.21964131715004276, + "learning_rate": 5.624957763925788e-05, + "loss": 2.2754, + "step": 1613 + }, + { + "epoch": 0.47666863555818073, + "grad_norm": 0.21331471016524317, + "learning_rate": 5.6202118092652886e-05, + "loss": 2.353, + "step": 1614 + }, + { + "epoch": 0.4769639692852924, + "grad_norm": 0.2206425372656036, + "learning_rate": 5.615465287016475e-05, + "loss": 2.2348, + "step": 1615 + }, + { + "epoch": 0.477259303012404, + "grad_norm": 0.23512409820515526, + "learning_rate": 5.6107182015231375e-05, + "loss": 2.3278, + "step": 1616 + }, + { + "epoch": 0.47755463673951565, + "grad_norm": 0.29141712015040516, + "learning_rate": 5.605970557129579e-05, + "loss": 2.2918, + "step": 1617 + }, + { + "epoch": 0.4778499704666273, + "grad_norm": 0.22166940270315239, + "learning_rate": 5.601222358180623e-05, + "loss": 2.2265, + "step": 1618 + }, + { + "epoch": 0.4781453041937389, + "grad_norm": 0.21386814552684788, + "learning_rate": 5.596473609021592e-05, + "loss": 2.3885, + "step": 1619 + }, + { + "epoch": 0.47844063792085056, + "grad_norm": 0.6283863833299187, + "learning_rate": 5.591724313998314e-05, + "loss": 2.3462, + "step": 1620 + }, + { + "epoch": 0.4787359716479622, + "grad_norm": 0.22011057886934618, + "learning_rate": 5.586974477457122e-05, + "loss": 2.2646, + "step": 1621 + }, + { + "epoch": 0.4790313053750738, + "grad_norm": 0.21763904341950727, + "learning_rate": 5.5822241037448364e-05, + "loss": 2.2144, + "step": 1622 + }, + { + "epoch": 0.4793266391021855, + "grad_norm": 0.215258163965232, + "learning_rate": 5.5774731972087744e-05, + "loss": 2.3184, + "step": 1623 + }, + { + "epoch": 0.4796219728292971, + "grad_norm": 0.22519864446287524, + "learning_rate": 5.5727217621967386e-05, + "loss": 2.3514, + "step": 1624 + }, + { + "epoch": 0.47991730655640874, + "grad_norm": 0.2185747006196004, + "learning_rate": 5.567969803057018e-05, + "loss": 2.3469, + "step": 1625 + }, + { + "epoch": 0.4802126402835204, + "grad_norm": 2.302351048525609, + "learning_rate": 5.5632173241383775e-05, + "loss": 2.3017, + "step": 1626 + }, + { + "epoch": 0.480507974010632, + "grad_norm": 0.22685410379696244, + "learning_rate": 5.5584643297900594e-05, + "loss": 2.2633, + "step": 1627 + }, + { + "epoch": 0.48080330773774366, + "grad_norm": 0.2324361957017, + "learning_rate": 5.55371082436178e-05, + "loss": 2.3295, + "step": 1628 + }, + { + "epoch": 0.48109864146485526, + "grad_norm": 0.22569437434529577, + "learning_rate": 5.54895681220372e-05, + "loss": 2.3146, + "step": 1629 + }, + { + "epoch": 0.4813939751919669, + "grad_norm": 0.2118830855973221, + "learning_rate": 5.544202297666523e-05, + "loss": 2.3066, + "step": 1630 + }, + { + "epoch": 0.4816893089190786, + "grad_norm": 0.22548197542924991, + "learning_rate": 5.539447285101297e-05, + "loss": 2.196, + "step": 1631 + }, + { + "epoch": 0.4819846426461902, + "grad_norm": 0.2141936610392956, + "learning_rate": 5.534691778859602e-05, + "loss": 2.2971, + "step": 1632 + }, + { + "epoch": 0.48227997637330183, + "grad_norm": 0.3283051495792205, + "learning_rate": 5.529935783293451e-05, + "loss": 2.3402, + "step": 1633 + }, + { + "epoch": 0.4825753101004135, + "grad_norm": 0.21141015186642145, + "learning_rate": 5.525179302755303e-05, + "loss": 2.2795, + "step": 1634 + }, + { + "epoch": 0.4828706438275251, + "grad_norm": 2.111580832688172, + "learning_rate": 5.520422341598066e-05, + "loss": 2.2861, + "step": 1635 + }, + { + "epoch": 0.48316597755463675, + "grad_norm": 0.22670755182553481, + "learning_rate": 5.5156649041750797e-05, + "loss": 2.3301, + "step": 1636 + }, + { + "epoch": 0.48346131128174835, + "grad_norm": 0.22577571730121188, + "learning_rate": 5.5109069948401296e-05, + "loss": 2.3222, + "step": 1637 + }, + { + "epoch": 0.48375664500886, + "grad_norm": 0.2209502583506529, + "learning_rate": 5.506148617947425e-05, + "loss": 2.4877, + "step": 1638 + }, + { + "epoch": 0.48405197873597167, + "grad_norm": 0.21687397144469486, + "learning_rate": 5.5013897778516056e-05, + "loss": 2.3067, + "step": 1639 + }, + { + "epoch": 0.48434731246308327, + "grad_norm": 0.21084575070565112, + "learning_rate": 5.496630478907736e-05, + "loss": 2.2767, + "step": 1640 + }, + { + "epoch": 0.4846426461901949, + "grad_norm": 0.24007388701144664, + "learning_rate": 5.4918707254713e-05, + "loss": 2.4654, + "step": 1641 + }, + { + "epoch": 0.4849379799173066, + "grad_norm": 0.21246732397452336, + "learning_rate": 5.4871105218981955e-05, + "loss": 2.3075, + "step": 1642 + }, + { + "epoch": 0.4852333136444182, + "grad_norm": 0.21526001571542863, + "learning_rate": 5.482349872544737e-05, + "loss": 2.3091, + "step": 1643 + }, + { + "epoch": 0.48552864737152984, + "grad_norm": 0.27575141864266434, + "learning_rate": 5.477588781767642e-05, + "loss": 2.2621, + "step": 1644 + }, + { + "epoch": 0.48582398109864144, + "grad_norm": 0.22253103248768769, + "learning_rate": 5.472827253924036e-05, + "loss": 2.2852, + "step": 1645 + }, + { + "epoch": 0.4861193148257531, + "grad_norm": 0.22096030686812512, + "learning_rate": 5.4680652933714405e-05, + "loss": 2.2812, + "step": 1646 + }, + { + "epoch": 0.48641464855286476, + "grad_norm": 0.22428303936652055, + "learning_rate": 5.4633029044677765e-05, + "loss": 2.3591, + "step": 1647 + }, + { + "epoch": 0.48670998227997636, + "grad_norm": 0.21655930570418194, + "learning_rate": 5.4585400915713524e-05, + "loss": 2.3447, + "step": 1648 + }, + { + "epoch": 0.487005316007088, + "grad_norm": 0.21523228106025363, + "learning_rate": 5.4537768590408735e-05, + "loss": 2.151, + "step": 1649 + }, + { + "epoch": 0.4873006497341996, + "grad_norm": 0.2068738789110261, + "learning_rate": 5.449013211235419e-05, + "loss": 2.3359, + "step": 1650 + }, + { + "epoch": 0.4875959834613113, + "grad_norm": 0.2533221495480129, + "learning_rate": 5.444249152514453e-05, + "loss": 2.3482, + "step": 1651 + }, + { + "epoch": 0.48789131718842293, + "grad_norm": 0.20985922012908473, + "learning_rate": 5.439484687237817e-05, + "loss": 2.0593, + "step": 1652 + }, + { + "epoch": 0.48818665091553454, + "grad_norm": 0.2118278714880735, + "learning_rate": 5.4347198197657234e-05, + "loss": 2.2457, + "step": 1653 + }, + { + "epoch": 0.4884819846426462, + "grad_norm": 0.21757140431734065, + "learning_rate": 5.429954554458748e-05, + "loss": 2.3074, + "step": 1654 + }, + { + "epoch": 0.48877731836975785, + "grad_norm": 0.21433893196429402, + "learning_rate": 5.425188895677839e-05, + "loss": 2.3465, + "step": 1655 + }, + { + "epoch": 0.48907265209686945, + "grad_norm": 0.2112357450647208, + "learning_rate": 5.4204228477842966e-05, + "loss": 2.2716, + "step": 1656 + }, + { + "epoch": 0.4893679858239811, + "grad_norm": 0.21459591182088125, + "learning_rate": 5.415656415139784e-05, + "loss": 2.2586, + "step": 1657 + }, + { + "epoch": 0.4896633195510927, + "grad_norm": 0.21162340121558137, + "learning_rate": 5.4108896021063116e-05, + "loss": 2.3708, + "step": 1658 + }, + { + "epoch": 0.48995865327820437, + "grad_norm": 0.21613893248943913, + "learning_rate": 5.4061224130462395e-05, + "loss": 2.267, + "step": 1659 + }, + { + "epoch": 0.490253987005316, + "grad_norm": 0.21667654977015133, + "learning_rate": 5.4013548523222746e-05, + "loss": 2.3414, + "step": 1660 + }, + { + "epoch": 0.49054932073242763, + "grad_norm": 0.2368419908683972, + "learning_rate": 5.3965869242974585e-05, + "loss": 2.2707, + "step": 1661 + }, + { + "epoch": 0.4908446544595393, + "grad_norm": 0.2206312079272167, + "learning_rate": 5.391818633335174e-05, + "loss": 2.2566, + "step": 1662 + }, + { + "epoch": 0.4911399881866509, + "grad_norm": 0.21653190091112773, + "learning_rate": 5.387049983799133e-05, + "loss": 2.3932, + "step": 1663 + }, + { + "epoch": 0.49143532191376255, + "grad_norm": 0.21901940037418766, + "learning_rate": 5.382280980053376e-05, + "loss": 2.3292, + "step": 1664 + }, + { + "epoch": 0.4917306556408742, + "grad_norm": 0.21925522158961058, + "learning_rate": 5.377511626462269e-05, + "loss": 2.3645, + "step": 1665 + }, + { + "epoch": 0.4920259893679858, + "grad_norm": 0.2190396230835838, + "learning_rate": 5.372741927390498e-05, + "loss": 2.3228, + "step": 1666 + }, + { + "epoch": 0.49232132309509746, + "grad_norm": 0.2667796619498049, + "learning_rate": 5.367971887203063e-05, + "loss": 2.1881, + "step": 1667 + }, + { + "epoch": 0.4926166568222091, + "grad_norm": 0.2237138559674943, + "learning_rate": 5.363201510265279e-05, + "loss": 2.301, + "step": 1668 + }, + { + "epoch": 0.4929119905493207, + "grad_norm": 0.21303498334608506, + "learning_rate": 5.3584308009427656e-05, + "loss": 2.3547, + "step": 1669 + }, + { + "epoch": 0.4932073242764324, + "grad_norm": 0.2149121609055615, + "learning_rate": 5.3536597636014506e-05, + "loss": 2.2263, + "step": 1670 + }, + { + "epoch": 0.493502658003544, + "grad_norm": 0.2083781353143105, + "learning_rate": 5.348888402607559e-05, + "loss": 2.3221, + "step": 1671 + }, + { + "epoch": 0.49379799173065564, + "grad_norm": 0.2056157467450036, + "learning_rate": 5.344116722327616e-05, + "loss": 2.2977, + "step": 1672 + }, + { + "epoch": 0.4940933254577673, + "grad_norm": 0.21515551401426586, + "learning_rate": 5.339344727128432e-05, + "loss": 2.2711, + "step": 1673 + }, + { + "epoch": 0.4943886591848789, + "grad_norm": 0.21025929211541078, + "learning_rate": 5.3345724213771145e-05, + "loss": 2.3203, + "step": 1674 + }, + { + "epoch": 0.49468399291199056, + "grad_norm": 0.219722654519826, + "learning_rate": 5.329799809441046e-05, + "loss": 2.2059, + "step": 1675 + }, + { + "epoch": 0.4949793266391022, + "grad_norm": 0.21871397028252954, + "learning_rate": 5.3250268956878965e-05, + "loss": 2.314, + "step": 1676 + }, + { + "epoch": 0.4952746603662138, + "grad_norm": 0.2083765263286999, + "learning_rate": 5.320253684485609e-05, + "loss": 2.2835, + "step": 1677 + }, + { + "epoch": 0.49556999409332547, + "grad_norm": 0.2174779658008666, + "learning_rate": 5.3154801802023965e-05, + "loss": 2.3567, + "step": 1678 + }, + { + "epoch": 0.4958653278204371, + "grad_norm": 0.29409393406040285, + "learning_rate": 5.310706387206748e-05, + "loss": 2.2652, + "step": 1679 + }, + { + "epoch": 0.49616066154754873, + "grad_norm": 0.21485592200738043, + "learning_rate": 5.3059323098674065e-05, + "loss": 2.3043, + "step": 1680 + }, + { + "epoch": 0.4964559952746604, + "grad_norm": 0.21538721186355322, + "learning_rate": 5.301157952553383e-05, + "loss": 2.3265, + "step": 1681 + }, + { + "epoch": 0.496751329001772, + "grad_norm": 0.21371169512500598, + "learning_rate": 5.296383319633942e-05, + "loss": 2.2457, + "step": 1682 + }, + { + "epoch": 0.49704666272888365, + "grad_norm": 0.2152553170845601, + "learning_rate": 5.2916084154785984e-05, + "loss": 2.2834, + "step": 1683 + }, + { + "epoch": 0.49734199645599525, + "grad_norm": 0.20812332277223572, + "learning_rate": 5.286833244457119e-05, + "loss": 2.3682, + "step": 1684 + }, + { + "epoch": 0.4976373301831069, + "grad_norm": 0.2243519095198872, + "learning_rate": 5.282057810939511e-05, + "loss": 2.3412, + "step": 1685 + }, + { + "epoch": 0.49793266391021856, + "grad_norm": 0.21648288017315778, + "learning_rate": 5.277282119296025e-05, + "loss": 2.4083, + "step": 1686 + }, + { + "epoch": 0.49822799763733017, + "grad_norm": 0.2147154077911866, + "learning_rate": 5.272506173897147e-05, + "loss": 2.2364, + "step": 1687 + }, + { + "epoch": 0.4985233313644418, + "grad_norm": 0.21502373698796898, + "learning_rate": 5.267729979113594e-05, + "loss": 2.307, + "step": 1688 + }, + { + "epoch": 0.4988186650915535, + "grad_norm": 0.21230438428925566, + "learning_rate": 5.262953539316312e-05, + "loss": 2.3393, + "step": 1689 + }, + { + "epoch": 0.4991139988186651, + "grad_norm": 0.20761865380272684, + "learning_rate": 5.258176858876473e-05, + "loss": 2.3542, + "step": 1690 + }, + { + "epoch": 0.49940933254577674, + "grad_norm": 0.31101280623162136, + "learning_rate": 5.253399942165467e-05, + "loss": 2.2967, + "step": 1691 + }, + { + "epoch": 0.49970466627288834, + "grad_norm": 0.21607196169807708, + "learning_rate": 5.248622793554897e-05, + "loss": 2.2403, + "step": 1692 + }, + { + "epoch": 0.5, + "grad_norm": 0.20832065106573083, + "learning_rate": 5.243845417416586e-05, + "loss": 2.309, + "step": 1693 + }, + { + "epoch": 0.5002953337271117, + "grad_norm": 0.23765871147019696, + "learning_rate": 5.2390678181225606e-05, + "loss": 2.3544, + "step": 1694 + }, + { + "epoch": 0.5005906674542233, + "grad_norm": 0.22154154233179865, + "learning_rate": 5.234290000045051e-05, + "loss": 2.3539, + "step": 1695 + }, + { + "epoch": 0.5008860011813349, + "grad_norm": 0.2142188694909762, + "learning_rate": 5.229511967556489e-05, + "loss": 2.3236, + "step": 1696 + }, + { + "epoch": 0.5011813349084465, + "grad_norm": 0.2190450738813356, + "learning_rate": 5.2247337250295036e-05, + "loss": 2.3983, + "step": 1697 + }, + { + "epoch": 0.5014766686355582, + "grad_norm": 0.22151261810588901, + "learning_rate": 5.219955276836914e-05, + "loss": 2.3951, + "step": 1698 + }, + { + "epoch": 0.5017720023626698, + "grad_norm": 0.21575346727115735, + "learning_rate": 5.2151766273517264e-05, + "loss": 2.2668, + "step": 1699 + }, + { + "epoch": 0.5020673360897815, + "grad_norm": 0.20842346784722418, + "learning_rate": 5.210397780947137e-05, + "loss": 2.2778, + "step": 1700 + }, + { + "epoch": 0.502362669816893, + "grad_norm": 0.2161898577716777, + "learning_rate": 5.2056187419965165e-05, + "loss": 2.3866, + "step": 1701 + }, + { + "epoch": 0.5026580035440047, + "grad_norm": 0.21382460322631486, + "learning_rate": 5.2008395148734135e-05, + "loss": 2.2762, + "step": 1702 + }, + { + "epoch": 0.5029533372711164, + "grad_norm": 0.2154419719966732, + "learning_rate": 5.19606010395155e-05, + "loss": 2.2944, + "step": 1703 + }, + { + "epoch": 0.503248670998228, + "grad_norm": 0.22477476416322906, + "learning_rate": 5.191280513604817e-05, + "loss": 2.3647, + "step": 1704 + }, + { + "epoch": 0.5035440047253397, + "grad_norm": 0.2186371648510975, + "learning_rate": 5.186500748207264e-05, + "loss": 2.3455, + "step": 1705 + }, + { + "epoch": 0.5038393384524513, + "grad_norm": 0.21066451068577977, + "learning_rate": 5.1817208121331064e-05, + "loss": 2.4223, + "step": 1706 + }, + { + "epoch": 0.5041346721795629, + "grad_norm": 0.2210309539284888, + "learning_rate": 5.176940709756715e-05, + "loss": 2.2721, + "step": 1707 + }, + { + "epoch": 0.5044300059066745, + "grad_norm": 0.21456409807162888, + "learning_rate": 5.1721604454526106e-05, + "loss": 2.2871, + "step": 1708 + }, + { + "epoch": 0.5047253396337862, + "grad_norm": 0.21611224147100042, + "learning_rate": 5.167380023595465e-05, + "loss": 2.3078, + "step": 1709 + }, + { + "epoch": 0.5050206733608978, + "grad_norm": 0.21731343460375738, + "learning_rate": 5.162599448560091e-05, + "loss": 2.2331, + "step": 1710 + }, + { + "epoch": 0.5053160070880095, + "grad_norm": 0.21588427256122633, + "learning_rate": 5.157818724721445e-05, + "loss": 2.3011, + "step": 1711 + }, + { + "epoch": 0.505611340815121, + "grad_norm": 0.2127720738971815, + "learning_rate": 5.153037856454617e-05, + "loss": 2.2796, + "step": 1712 + }, + { + "epoch": 0.5059066745422327, + "grad_norm": 0.21739131605819484, + "learning_rate": 5.148256848134828e-05, + "loss": 2.3315, + "step": 1713 + }, + { + "epoch": 0.5062020082693444, + "grad_norm": 0.21117249397359295, + "learning_rate": 5.143475704137433e-05, + "loss": 2.3135, + "step": 1714 + }, + { + "epoch": 0.506497341996456, + "grad_norm": 0.2153558869241659, + "learning_rate": 5.1386944288379036e-05, + "loss": 2.3274, + "step": 1715 + }, + { + "epoch": 0.5067926757235677, + "grad_norm": 0.2311126152679537, + "learning_rate": 5.133913026611837e-05, + "loss": 2.4541, + "step": 1716 + }, + { + "epoch": 0.5070880094506792, + "grad_norm": 0.21213007170863937, + "learning_rate": 5.129131501834946e-05, + "loss": 2.3133, + "step": 1717 + }, + { + "epoch": 0.5073833431777909, + "grad_norm": 0.2191455789505875, + "learning_rate": 5.124349858883053e-05, + "loss": 2.2326, + "step": 1718 + }, + { + "epoch": 0.5076786769049025, + "grad_norm": 0.20663987359701744, + "learning_rate": 5.119568102132092e-05, + "loss": 2.2792, + "step": 1719 + }, + { + "epoch": 0.5079740106320142, + "grad_norm": 0.21808548026879954, + "learning_rate": 5.1147862359580944e-05, + "loss": 2.3251, + "step": 1720 + }, + { + "epoch": 0.5082693443591259, + "grad_norm": 0.20895285680661382, + "learning_rate": 5.1100042647372005e-05, + "loss": 2.3014, + "step": 1721 + }, + { + "epoch": 0.5085646780862374, + "grad_norm": 0.22207514741678616, + "learning_rate": 5.105222192845641e-05, + "loss": 2.3096, + "step": 1722 + }, + { + "epoch": 0.5088600118133491, + "grad_norm": 0.225567878892078, + "learning_rate": 5.1004400246597416e-05, + "loss": 2.2783, + "step": 1723 + }, + { + "epoch": 0.5091553455404607, + "grad_norm": 0.20888322539259652, + "learning_rate": 5.095657764555914e-05, + "loss": 2.2506, + "step": 1724 + }, + { + "epoch": 0.5094506792675724, + "grad_norm": 0.22258958320407618, + "learning_rate": 5.090875416910654e-05, + "loss": 2.2842, + "step": 1725 + }, + { + "epoch": 0.509746012994684, + "grad_norm": 0.21686102802881513, + "learning_rate": 5.086092986100541e-05, + "loss": 2.3857, + "step": 1726 + }, + { + "epoch": 0.5100413467217956, + "grad_norm": 0.2599417297466746, + "learning_rate": 5.0813104765022256e-05, + "loss": 2.253, + "step": 1727 + }, + { + "epoch": 0.5103366804489072, + "grad_norm": 0.2216415288407214, + "learning_rate": 5.076527892492434e-05, + "loss": 2.3492, + "step": 1728 + }, + { + "epoch": 0.5106320141760189, + "grad_norm": 0.2151802289118242, + "learning_rate": 5.0717452384479604e-05, + "loss": 2.3062, + "step": 1729 + }, + { + "epoch": 0.5109273479031305, + "grad_norm": 0.24438993313229687, + "learning_rate": 5.066962518745659e-05, + "loss": 2.4949, + "step": 1730 + }, + { + "epoch": 0.5112226816302422, + "grad_norm": 0.217419563470328, + "learning_rate": 5.06217973776245e-05, + "loss": 2.2977, + "step": 1731 + }, + { + "epoch": 0.5115180153573539, + "grad_norm": 0.2086839209549855, + "learning_rate": 5.0573968998753055e-05, + "loss": 2.3278, + "step": 1732 + }, + { + "epoch": 0.5118133490844654, + "grad_norm": 0.20799055609962203, + "learning_rate": 5.052614009461251e-05, + "loss": 2.2762, + "step": 1733 + }, + { + "epoch": 0.5121086828115771, + "grad_norm": 0.21704222568307666, + "learning_rate": 5.047831070897362e-05, + "loss": 2.3251, + "step": 1734 + }, + { + "epoch": 0.5124040165386887, + "grad_norm": 0.22597522916540594, + "learning_rate": 5.043048088560752e-05, + "loss": 2.3466, + "step": 1735 + }, + { + "epoch": 0.5126993502658004, + "grad_norm": 0.215301009731122, + "learning_rate": 5.0382650668285825e-05, + "loss": 2.4101, + "step": 1736 + }, + { + "epoch": 0.512994683992912, + "grad_norm": 0.20661970836731353, + "learning_rate": 5.0334820100780454e-05, + "loss": 2.2501, + "step": 1737 + }, + { + "epoch": 0.5132900177200236, + "grad_norm": 0.22511233532224004, + "learning_rate": 5.0286989226863665e-05, + "loss": 2.2866, + "step": 1738 + }, + { + "epoch": 0.5135853514471352, + "grad_norm": 0.20006695796274448, + "learning_rate": 5.0239158090308e-05, + "loss": 2.2748, + "step": 1739 + }, + { + "epoch": 0.5138806851742469, + "grad_norm": 0.21492104440387844, + "learning_rate": 5.0191326734886245e-05, + "loss": 2.3375, + "step": 1740 + }, + { + "epoch": 0.5141760189013586, + "grad_norm": 0.224929887722295, + "learning_rate": 5.014349520437138e-05, + "loss": 2.4513, + "step": 1741 + }, + { + "epoch": 0.5144713526284702, + "grad_norm": 0.2204909444055596, + "learning_rate": 5.009566354253653e-05, + "loss": 2.3604, + "step": 1742 + }, + { + "epoch": 0.5147666863555818, + "grad_norm": 0.23552945643558604, + "learning_rate": 5.0047831793154954e-05, + "loss": 2.2832, + "step": 1743 + }, + { + "epoch": 0.5150620200826934, + "grad_norm": 0.21413440645214687, + "learning_rate": 5e-05, + "loss": 2.2775, + "step": 1744 + }, + { + "epoch": 0.5153573538098051, + "grad_norm": 0.36143910533687706, + "learning_rate": 4.995216820684506e-05, + "loss": 2.3153, + "step": 1745 + }, + { + "epoch": 0.5156526875369167, + "grad_norm": 0.21953994455462714, + "learning_rate": 4.990433645746349e-05, + "loss": 2.2894, + "step": 1746 + }, + { + "epoch": 0.5159480212640284, + "grad_norm": 0.21594463409751166, + "learning_rate": 4.9856504795628636e-05, + "loss": 2.3022, + "step": 1747 + }, + { + "epoch": 0.5162433549911399, + "grad_norm": 0.22813169717756104, + "learning_rate": 4.980867326511376e-05, + "loss": 2.3773, + "step": 1748 + }, + { + "epoch": 0.5165386887182516, + "grad_norm": 0.21843782356685745, + "learning_rate": 4.9760841909692005e-05, + "loss": 2.2671, + "step": 1749 + }, + { + "epoch": 0.5168340224453633, + "grad_norm": 0.21700879642021673, + "learning_rate": 4.971301077313634e-05, + "loss": 2.3052, + "step": 1750 + }, + { + "epoch": 0.5171293561724749, + "grad_norm": 0.2096008495398451, + "learning_rate": 4.966517989921956e-05, + "loss": 2.3305, + "step": 1751 + }, + { + "epoch": 0.5174246898995866, + "grad_norm": 0.21856859041912668, + "learning_rate": 4.96173493317142e-05, + "loss": 2.3135, + "step": 1752 + }, + { + "epoch": 0.5177200236266982, + "grad_norm": 0.20705154607361867, + "learning_rate": 4.9569519114392496e-05, + "loss": 2.3097, + "step": 1753 + }, + { + "epoch": 0.5180153573538098, + "grad_norm": 0.20864232315701386, + "learning_rate": 4.9521689291026406e-05, + "loss": 2.2751, + "step": 1754 + }, + { + "epoch": 0.5183106910809214, + "grad_norm": 0.21590218350435797, + "learning_rate": 4.94738599053875e-05, + "loss": 2.3112, + "step": 1755 + }, + { + "epoch": 0.5186060248080331, + "grad_norm": 0.2111348976928615, + "learning_rate": 4.942603100124696e-05, + "loss": 2.2453, + "step": 1756 + }, + { + "epoch": 0.5189013585351447, + "grad_norm": 0.2271627006743144, + "learning_rate": 4.9378202622375517e-05, + "loss": 2.322, + "step": 1757 + }, + { + "epoch": 0.5191966922622564, + "grad_norm": 0.2297765876933066, + "learning_rate": 4.933037481254342e-05, + "loss": 2.2488, + "step": 1758 + }, + { + "epoch": 0.519492025989368, + "grad_norm": 0.20953091787074404, + "learning_rate": 4.9282547615520414e-05, + "loss": 2.3331, + "step": 1759 + }, + { + "epoch": 0.5197873597164796, + "grad_norm": 0.21168326189536354, + "learning_rate": 4.923472107507566e-05, + "loss": 2.3218, + "step": 1760 + }, + { + "epoch": 0.5200826934435913, + "grad_norm": 0.2537150910516916, + "learning_rate": 4.918689523497775e-05, + "loss": 2.2584, + "step": 1761 + }, + { + "epoch": 0.5203780271707029, + "grad_norm": 0.20951451592468676, + "learning_rate": 4.91390701389946e-05, + "loss": 2.356, + "step": 1762 + }, + { + "epoch": 0.5206733608978146, + "grad_norm": 0.2079471087642052, + "learning_rate": 4.9091245830893465e-05, + "loss": 2.2943, + "step": 1763 + }, + { + "epoch": 0.5209686946249261, + "grad_norm": 0.20766271700236077, + "learning_rate": 4.904342235444087e-05, + "loss": 2.2693, + "step": 1764 + }, + { + "epoch": 0.5212640283520378, + "grad_norm": 0.2067480129137228, + "learning_rate": 4.8995599753402596e-05, + "loss": 2.3042, + "step": 1765 + }, + { + "epoch": 0.5215593620791494, + "grad_norm": 0.2207701927164485, + "learning_rate": 4.894777807154361e-05, + "loss": 2.4258, + "step": 1766 + }, + { + "epoch": 0.5218546958062611, + "grad_norm": 0.21068131820200953, + "learning_rate": 4.8899957352628014e-05, + "loss": 2.2186, + "step": 1767 + }, + { + "epoch": 0.5221500295333728, + "grad_norm": 0.21512067460516654, + "learning_rate": 4.8852137640419074e-05, + "loss": 2.2861, + "step": 1768 + }, + { + "epoch": 0.5224453632604843, + "grad_norm": 0.2355923192863371, + "learning_rate": 4.880431897867911e-05, + "loss": 2.3858, + "step": 1769 + }, + { + "epoch": 0.522740696987596, + "grad_norm": 0.21315093268638502, + "learning_rate": 4.8756501411169473e-05, + "loss": 2.244, + "step": 1770 + }, + { + "epoch": 0.5230360307147076, + "grad_norm": 0.27278717908862665, + "learning_rate": 4.870868498165055e-05, + "loss": 2.3718, + "step": 1771 + }, + { + "epoch": 0.5233313644418193, + "grad_norm": 0.21047360130728418, + "learning_rate": 4.866086973388163e-05, + "loss": 2.3926, + "step": 1772 + }, + { + "epoch": 0.5236266981689309, + "grad_norm": 0.2118620742804828, + "learning_rate": 4.8613055711620976e-05, + "loss": 2.3069, + "step": 1773 + }, + { + "epoch": 0.5239220318960425, + "grad_norm": 0.22426013332030517, + "learning_rate": 4.856524295862569e-05, + "loss": 2.3197, + "step": 1774 + }, + { + "epoch": 0.5242173656231541, + "grad_norm": 0.20494011367685436, + "learning_rate": 4.851743151865173e-05, + "loss": 2.3144, + "step": 1775 + }, + { + "epoch": 0.5245126993502658, + "grad_norm": 0.20604478893640052, + "learning_rate": 4.846962143545385e-05, + "loss": 2.1928, + "step": 1776 + }, + { + "epoch": 0.5248080330773774, + "grad_norm": 0.20603665277463917, + "learning_rate": 4.842181275278556e-05, + "loss": 2.261, + "step": 1777 + }, + { + "epoch": 0.5251033668044891, + "grad_norm": 0.2212243213542065, + "learning_rate": 4.8374005514399085e-05, + "loss": 2.3905, + "step": 1778 + }, + { + "epoch": 0.5253987005316008, + "grad_norm": 0.215266081383728, + "learning_rate": 4.832619976404535e-05, + "loss": 2.2912, + "step": 1779 + }, + { + "epoch": 0.5256940342587123, + "grad_norm": 0.21677951915105526, + "learning_rate": 4.8278395545473885e-05, + "loss": 2.3512, + "step": 1780 + }, + { + "epoch": 0.525989367985824, + "grad_norm": 0.21140483021171277, + "learning_rate": 4.823059290243287e-05, + "loss": 2.3578, + "step": 1781 + }, + { + "epoch": 0.5262847017129356, + "grad_norm": 0.2169601753306366, + "learning_rate": 4.8182791878668955e-05, + "loss": 2.3486, + "step": 1782 + }, + { + "epoch": 0.5265800354400473, + "grad_norm": 0.220439373508622, + "learning_rate": 4.8134992517927387e-05, + "loss": 2.2458, + "step": 1783 + }, + { + "epoch": 0.5268753691671589, + "grad_norm": 0.20148061005534826, + "learning_rate": 4.8087194863951854e-05, + "loss": 2.2833, + "step": 1784 + }, + { + "epoch": 0.5271707028942705, + "grad_norm": 0.20814957958999555, + "learning_rate": 4.803939896048451e-05, + "loss": 2.2881, + "step": 1785 + }, + { + "epoch": 0.5274660366213821, + "grad_norm": 0.22597985327062933, + "learning_rate": 4.7991604851265877e-05, + "loss": 2.3473, + "step": 1786 + }, + { + "epoch": 0.5277613703484938, + "grad_norm": 0.2114504740261403, + "learning_rate": 4.7943812580034846e-05, + "loss": 2.2821, + "step": 1787 + }, + { + "epoch": 0.5280567040756055, + "grad_norm": 0.20645731419203653, + "learning_rate": 4.789602219052865e-05, + "loss": 2.3678, + "step": 1788 + }, + { + "epoch": 0.5283520378027171, + "grad_norm": 0.21062012037120467, + "learning_rate": 4.784823372648275e-05, + "loss": 2.3422, + "step": 1789 + }, + { + "epoch": 0.5286473715298287, + "grad_norm": 0.21339787414185934, + "learning_rate": 4.7800447231630876e-05, + "loss": 2.253, + "step": 1790 + }, + { + "epoch": 0.5289427052569403, + "grad_norm": 0.2102048565977482, + "learning_rate": 4.775266274970497e-05, + "loss": 2.325, + "step": 1791 + }, + { + "epoch": 0.529238038984052, + "grad_norm": 0.215056955145761, + "learning_rate": 4.770488032443511e-05, + "loss": 2.3088, + "step": 1792 + }, + { + "epoch": 0.5295333727111636, + "grad_norm": 0.20239408743264506, + "learning_rate": 4.7657099999549495e-05, + "loss": 2.3155, + "step": 1793 + }, + { + "epoch": 0.5298287064382753, + "grad_norm": 0.22902756591126103, + "learning_rate": 4.760932181877439e-05, + "loss": 2.3389, + "step": 1794 + }, + { + "epoch": 0.5301240401653868, + "grad_norm": 0.20790324918598774, + "learning_rate": 4.756154582583414e-05, + "loss": 2.2756, + "step": 1795 + }, + { + "epoch": 0.5304193738924985, + "grad_norm": 0.20689498267515694, + "learning_rate": 4.7513772064451044e-05, + "loss": 2.3324, + "step": 1796 + }, + { + "epoch": 0.5307147076196101, + "grad_norm": 0.21125058498527294, + "learning_rate": 4.746600057834536e-05, + "loss": 2.3162, + "step": 1797 + }, + { + "epoch": 0.5310100413467218, + "grad_norm": 0.2248270058510922, + "learning_rate": 4.741823141123528e-05, + "loss": 2.3823, + "step": 1798 + }, + { + "epoch": 0.5313053750738335, + "grad_norm": 0.20554346241976254, + "learning_rate": 4.7370464606836885e-05, + "loss": 2.3444, + "step": 1799 + }, + { + "epoch": 0.5316007088009451, + "grad_norm": 0.21470643770685346, + "learning_rate": 4.7322700208864074e-05, + "loss": 2.3079, + "step": 1800 + }, + { + "epoch": 0.5318960425280567, + "grad_norm": 0.21032349975750972, + "learning_rate": 4.7274938261028545e-05, + "loss": 2.2503, + "step": 1801 + }, + { + "epoch": 0.5321913762551683, + "grad_norm": 0.2119660555434752, + "learning_rate": 4.722717880703975e-05, + "loss": 2.1996, + "step": 1802 + }, + { + "epoch": 0.53248670998228, + "grad_norm": 0.21691187823992428, + "learning_rate": 4.71794218906049e-05, + "loss": 2.2814, + "step": 1803 + }, + { + "epoch": 0.5327820437093916, + "grad_norm": 0.2143873742761177, + "learning_rate": 4.713166755542883e-05, + "loss": 2.2426, + "step": 1804 + }, + { + "epoch": 0.5330773774365033, + "grad_norm": 0.21141897106875754, + "learning_rate": 4.708391584521402e-05, + "loss": 2.2779, + "step": 1805 + }, + { + "epoch": 0.5333727111636148, + "grad_norm": 0.2076566624547624, + "learning_rate": 4.7036166803660594e-05, + "loss": 2.3015, + "step": 1806 + }, + { + "epoch": 0.5336680448907265, + "grad_norm": 0.21399842333491356, + "learning_rate": 4.698842047446617e-05, + "loss": 2.2686, + "step": 1807 + }, + { + "epoch": 0.5339633786178382, + "grad_norm": 0.22016358043197842, + "learning_rate": 4.694067690132593e-05, + "loss": 2.3684, + "step": 1808 + }, + { + "epoch": 0.5342587123449498, + "grad_norm": 0.20911243116158856, + "learning_rate": 4.689293612793253e-05, + "loss": 2.2886, + "step": 1809 + }, + { + "epoch": 0.5345540460720615, + "grad_norm": 0.2168957596443082, + "learning_rate": 4.6845198197976046e-05, + "loss": 2.3035, + "step": 1810 + }, + { + "epoch": 0.534849379799173, + "grad_norm": 0.22915280997403134, + "learning_rate": 4.6797463155143936e-05, + "loss": 2.3476, + "step": 1811 + }, + { + "epoch": 0.5351447135262847, + "grad_norm": 0.2245413635779458, + "learning_rate": 4.674973104312106e-05, + "loss": 2.3469, + "step": 1812 + }, + { + "epoch": 0.5354400472533963, + "grad_norm": 0.22281034056592158, + "learning_rate": 4.670200190558955e-05, + "loss": 2.2609, + "step": 1813 + }, + { + "epoch": 0.535735380980508, + "grad_norm": 0.21983661862365475, + "learning_rate": 4.665427578622887e-05, + "loss": 2.2687, + "step": 1814 + }, + { + "epoch": 0.5360307147076196, + "grad_norm": 0.20128969797588933, + "learning_rate": 4.660655272871568e-05, + "loss": 2.2751, + "step": 1815 + }, + { + "epoch": 0.5363260484347312, + "grad_norm": 0.21045946773345925, + "learning_rate": 4.655883277672385e-05, + "loss": 2.3, + "step": 1816 + }, + { + "epoch": 0.5366213821618429, + "grad_norm": 0.213341864681337, + "learning_rate": 4.6511115973924416e-05, + "loss": 2.412, + "step": 1817 + }, + { + "epoch": 0.5369167158889545, + "grad_norm": 0.20887705462670556, + "learning_rate": 4.6463402363985506e-05, + "loss": 2.3101, + "step": 1818 + }, + { + "epoch": 0.5372120496160662, + "grad_norm": 0.21828120953198374, + "learning_rate": 4.6415691990572356e-05, + "loss": 2.3846, + "step": 1819 + }, + { + "epoch": 0.5375073833431778, + "grad_norm": 0.21407522247264582, + "learning_rate": 4.636798489734723e-05, + "loss": 2.2634, + "step": 1820 + }, + { + "epoch": 0.5378027170702895, + "grad_norm": 0.2140887954321253, + "learning_rate": 4.632028112796938e-05, + "loss": 2.3655, + "step": 1821 + }, + { + "epoch": 0.538098050797401, + "grad_norm": 0.20343859930078378, + "learning_rate": 4.6272580726095026e-05, + "loss": 2.2864, + "step": 1822 + }, + { + "epoch": 0.5383933845245127, + "grad_norm": 0.21099677842401734, + "learning_rate": 4.6224883735377314e-05, + "loss": 2.3278, + "step": 1823 + }, + { + "epoch": 0.5386887182516243, + "grad_norm": 0.21574125956581494, + "learning_rate": 4.617719019946624e-05, + "loss": 2.3312, + "step": 1824 + }, + { + "epoch": 0.538984051978736, + "grad_norm": 0.24699400677147457, + "learning_rate": 4.6129500162008684e-05, + "loss": 2.3665, + "step": 1825 + }, + { + "epoch": 0.5392793857058477, + "grad_norm": 0.21869932166660658, + "learning_rate": 4.6081813666648274e-05, + "loss": 2.2991, + "step": 1826 + }, + { + "epoch": 0.5395747194329592, + "grad_norm": 0.20412703177228736, + "learning_rate": 4.6034130757025426e-05, + "loss": 2.2629, + "step": 1827 + }, + { + "epoch": 0.5398700531600709, + "grad_norm": 0.21790052563101459, + "learning_rate": 4.598645147677727e-05, + "loss": 2.2992, + "step": 1828 + }, + { + "epoch": 0.5401653868871825, + "grad_norm": 0.209680485274071, + "learning_rate": 4.5938775869537616e-05, + "loss": 2.2761, + "step": 1829 + }, + { + "epoch": 0.5404607206142942, + "grad_norm": 0.21628284516371182, + "learning_rate": 4.58911039789369e-05, + "loss": 2.3804, + "step": 1830 + }, + { + "epoch": 0.5407560543414058, + "grad_norm": 0.21391281094182696, + "learning_rate": 4.584343584860218e-05, + "loss": 2.3304, + "step": 1831 + }, + { + "epoch": 0.5410513880685174, + "grad_norm": 0.21490304302104235, + "learning_rate": 4.5795771522157045e-05, + "loss": 2.329, + "step": 1832 + }, + { + "epoch": 0.541346721795629, + "grad_norm": 0.223468720315668, + "learning_rate": 4.574811104322163e-05, + "loss": 2.3321, + "step": 1833 + }, + { + "epoch": 0.5416420555227407, + "grad_norm": 0.21767580726318753, + "learning_rate": 4.570045445541253e-05, + "loss": 2.2233, + "step": 1834 + }, + { + "epoch": 0.5419373892498524, + "grad_norm": 0.2110304804137862, + "learning_rate": 4.5652801802342785e-05, + "loss": 2.2551, + "step": 1835 + }, + { + "epoch": 0.542232722976964, + "grad_norm": 0.21309832843729534, + "learning_rate": 4.5605153127621824e-05, + "loss": 2.2642, + "step": 1836 + }, + { + "epoch": 0.5425280567040756, + "grad_norm": 0.22146988069448326, + "learning_rate": 4.5557508474855467e-05, + "loss": 2.3985, + "step": 1837 + }, + { + "epoch": 0.5428233904311872, + "grad_norm": 0.21400175694608956, + "learning_rate": 4.550986788764582e-05, + "loss": 2.3199, + "step": 1838 + }, + { + "epoch": 0.5431187241582989, + "grad_norm": 0.21850882586804393, + "learning_rate": 4.546223140959127e-05, + "loss": 2.2866, + "step": 1839 + }, + { + "epoch": 0.5434140578854105, + "grad_norm": 0.20827241642757227, + "learning_rate": 4.541459908428648e-05, + "loss": 2.3728, + "step": 1840 + }, + { + "epoch": 0.5437093916125222, + "grad_norm": 0.20356125933181526, + "learning_rate": 4.5366970955322267e-05, + "loss": 2.3022, + "step": 1841 + }, + { + "epoch": 0.5440047253396337, + "grad_norm": 0.24543648702627588, + "learning_rate": 4.5319347066285614e-05, + "loss": 2.2907, + "step": 1842 + }, + { + "epoch": 0.5443000590667454, + "grad_norm": 0.23694427027195303, + "learning_rate": 4.527172746075966e-05, + "loss": 2.2635, + "step": 1843 + }, + { + "epoch": 0.544595392793857, + "grad_norm": 0.2179384248072281, + "learning_rate": 4.522411218232359e-05, + "loss": 2.3433, + "step": 1844 + }, + { + "epoch": 0.5448907265209687, + "grad_norm": 0.5361547969347741, + "learning_rate": 4.517650127455264e-05, + "loss": 2.2983, + "step": 1845 + }, + { + "epoch": 0.5451860602480804, + "grad_norm": 0.2074692250031005, + "learning_rate": 4.512889478101806e-05, + "loss": 2.2796, + "step": 1846 + }, + { + "epoch": 0.545481393975192, + "grad_norm": 0.2133956134461942, + "learning_rate": 4.508129274528702e-05, + "loss": 2.2525, + "step": 1847 + }, + { + "epoch": 0.5457767277023036, + "grad_norm": 0.2129193382848307, + "learning_rate": 4.503369521092265e-05, + "loss": 2.1186, + "step": 1848 + }, + { + "epoch": 0.5460720614294152, + "grad_norm": 0.21931475726455607, + "learning_rate": 4.498610222148395e-05, + "loss": 2.3826, + "step": 1849 + }, + { + "epoch": 0.5463673951565269, + "grad_norm": 0.2303253228617676, + "learning_rate": 4.4938513820525755e-05, + "loss": 2.3478, + "step": 1850 + }, + { + "epoch": 0.5466627288836385, + "grad_norm": 0.22831468632327667, + "learning_rate": 4.489093005159871e-05, + "loss": 2.2054, + "step": 1851 + }, + { + "epoch": 0.5469580626107502, + "grad_norm": 0.22002901242243578, + "learning_rate": 4.4843350958249195e-05, + "loss": 2.3272, + "step": 1852 + }, + { + "epoch": 0.5472533963378617, + "grad_norm": 0.21681225352687028, + "learning_rate": 4.4795776584019354e-05, + "loss": 2.3409, + "step": 1853 + }, + { + "epoch": 0.5475487300649734, + "grad_norm": 0.21983425468267898, + "learning_rate": 4.4748206972446986e-05, + "loss": 2.2474, + "step": 1854 + }, + { + "epoch": 0.5478440637920851, + "grad_norm": 0.22309081017314816, + "learning_rate": 4.470064216706552e-05, + "loss": 2.3057, + "step": 1855 + }, + { + "epoch": 0.5481393975191967, + "grad_norm": 0.20809136087329624, + "learning_rate": 4.4653082211404e-05, + "loss": 2.3415, + "step": 1856 + }, + { + "epoch": 0.5484347312463084, + "grad_norm": 0.20761845643022678, + "learning_rate": 4.460552714898705e-05, + "loss": 2.2833, + "step": 1857 + }, + { + "epoch": 0.5487300649734199, + "grad_norm": 0.21965503800074465, + "learning_rate": 4.455797702333478e-05, + "loss": 2.3584, + "step": 1858 + }, + { + "epoch": 0.5490253987005316, + "grad_norm": 0.21980999528175343, + "learning_rate": 4.451043187796282e-05, + "loss": 2.3571, + "step": 1859 + }, + { + "epoch": 0.5493207324276432, + "grad_norm": 0.5456446294760221, + "learning_rate": 4.4462891756382214e-05, + "loss": 2.3821, + "step": 1860 + }, + { + "epoch": 0.5496160661547549, + "grad_norm": 0.2043215191566684, + "learning_rate": 4.441535670209941e-05, + "loss": 2.3945, + "step": 1861 + }, + { + "epoch": 0.5499113998818665, + "grad_norm": 0.21826684762275161, + "learning_rate": 4.4367826758616236e-05, + "loss": 2.3266, + "step": 1862 + }, + { + "epoch": 0.5502067336089781, + "grad_norm": 0.2170546176625933, + "learning_rate": 4.432030196942983e-05, + "loss": 2.2764, + "step": 1863 + }, + { + "epoch": 0.5505020673360898, + "grad_norm": 0.2317789465666471, + "learning_rate": 4.4272782378032625e-05, + "loss": 2.2957, + "step": 1864 + }, + { + "epoch": 0.5507974010632014, + "grad_norm": 0.22844174812948698, + "learning_rate": 4.422526802791227e-05, + "loss": 2.3273, + "step": 1865 + }, + { + "epoch": 0.5510927347903131, + "grad_norm": 0.24623118453346535, + "learning_rate": 4.417775896255165e-05, + "loss": 2.2759, + "step": 1866 + }, + { + "epoch": 0.5513880685174247, + "grad_norm": 0.21358545463041667, + "learning_rate": 4.4130255225428797e-05, + "loss": 2.2802, + "step": 1867 + }, + { + "epoch": 0.5516834022445364, + "grad_norm": 0.20843832032987344, + "learning_rate": 4.408275686001686e-05, + "loss": 2.304, + "step": 1868 + }, + { + "epoch": 0.5519787359716479, + "grad_norm": 0.20303121565832735, + "learning_rate": 4.403526390978411e-05, + "loss": 2.2489, + "step": 1869 + }, + { + "epoch": 0.5522740696987596, + "grad_norm": 0.2265294971763986, + "learning_rate": 4.398777641819379e-05, + "loss": 2.2881, + "step": 1870 + }, + { + "epoch": 0.5525694034258712, + "grad_norm": 0.2421909993441866, + "learning_rate": 4.394029442870422e-05, + "loss": 2.466, + "step": 1871 + }, + { + "epoch": 0.5528647371529829, + "grad_norm": 0.22001089560827136, + "learning_rate": 4.389281798476865e-05, + "loss": 2.3711, + "step": 1872 + }, + { + "epoch": 0.5531600708800946, + "grad_norm": 0.2136459338325724, + "learning_rate": 4.3845347129835264e-05, + "loss": 2.3483, + "step": 1873 + }, + { + "epoch": 0.5534554046072061, + "grad_norm": 0.21313738739393712, + "learning_rate": 4.379788190734712e-05, + "loss": 2.3051, + "step": 1874 + }, + { + "epoch": 0.5537507383343178, + "grad_norm": 0.21026403152033235, + "learning_rate": 4.375042236074213e-05, + "loss": 2.2832, + "step": 1875 + }, + { + "epoch": 0.5540460720614294, + "grad_norm": 0.23025359601320178, + "learning_rate": 4.3702968533453026e-05, + "loss": 2.3602, + "step": 1876 + }, + { + "epoch": 0.5543414057885411, + "grad_norm": 0.20855926342888612, + "learning_rate": 4.365552046890727e-05, + "loss": 2.3165, + "step": 1877 + }, + { + "epoch": 0.5546367395156527, + "grad_norm": 0.23181738100289673, + "learning_rate": 4.3608078210527085e-05, + "loss": 2.2934, + "step": 1878 + }, + { + "epoch": 0.5549320732427643, + "grad_norm": 0.20086346290134904, + "learning_rate": 4.3560641801729366e-05, + "loss": 2.3065, + "step": 1879 + }, + { + "epoch": 0.5552274069698759, + "grad_norm": 0.22409394043635186, + "learning_rate": 4.3513211285925653e-05, + "loss": 2.3651, + "step": 1880 + }, + { + "epoch": 0.5555227406969876, + "grad_norm": 0.21477268951164766, + "learning_rate": 4.346578670652208e-05, + "loss": 2.3995, + "step": 1881 + }, + { + "epoch": 0.5558180744240993, + "grad_norm": 0.2094261811458417, + "learning_rate": 4.34183681069194e-05, + "loss": 2.3682, + "step": 1882 + }, + { + "epoch": 0.5561134081512109, + "grad_norm": 0.20823280273253825, + "learning_rate": 4.337095553051283e-05, + "loss": 2.3314, + "step": 1883 + }, + { + "epoch": 0.5564087418783225, + "grad_norm": 0.2105538889012166, + "learning_rate": 4.3323549020692134e-05, + "loss": 2.2555, + "step": 1884 + }, + { + "epoch": 0.5567040756054341, + "grad_norm": 0.21207898837330946, + "learning_rate": 4.327614862084145e-05, + "loss": 2.2809, + "step": 1885 + }, + { + "epoch": 0.5569994093325458, + "grad_norm": 0.21519284063952945, + "learning_rate": 4.3228754374339377e-05, + "loss": 2.2817, + "step": 1886 + }, + { + "epoch": 0.5572947430596574, + "grad_norm": 0.23964588067541412, + "learning_rate": 4.318136632455888e-05, + "loss": 2.2597, + "step": 1887 + }, + { + "epoch": 0.5575900767867691, + "grad_norm": 0.2269058339128671, + "learning_rate": 4.3133984514867256e-05, + "loss": 2.4054, + "step": 1888 + }, + { + "epoch": 0.5578854105138806, + "grad_norm": 0.20940981547000545, + "learning_rate": 4.308660898862606e-05, + "loss": 2.2998, + "step": 1889 + }, + { + "epoch": 0.5581807442409923, + "grad_norm": 0.206353589977734, + "learning_rate": 4.3039239789191134e-05, + "loss": 2.065, + "step": 1890 + }, + { + "epoch": 0.558476077968104, + "grad_norm": 0.20900847394090907, + "learning_rate": 4.2991876959912514e-05, + "loss": 2.329, + "step": 1891 + }, + { + "epoch": 0.5587714116952156, + "grad_norm": 0.2076291411898037, + "learning_rate": 4.2944520544134396e-05, + "loss": 2.3456, + "step": 1892 + }, + { + "epoch": 0.5590667454223273, + "grad_norm": 0.21041064340323817, + "learning_rate": 4.289717058519511e-05, + "loss": 2.2781, + "step": 1893 + }, + { + "epoch": 0.5593620791494389, + "grad_norm": 0.2111621384628806, + "learning_rate": 4.2849827126427106e-05, + "loss": 2.3329, + "step": 1894 + }, + { + "epoch": 0.5596574128765505, + "grad_norm": 0.21672691041067224, + "learning_rate": 4.280249021115685e-05, + "loss": 2.3858, + "step": 1895 + }, + { + "epoch": 0.5599527466036621, + "grad_norm": 0.2138795718435133, + "learning_rate": 4.275515988270485e-05, + "loss": 2.3062, + "step": 1896 + }, + { + "epoch": 0.5602480803307738, + "grad_norm": 0.2070227747208054, + "learning_rate": 4.2707836184385544e-05, + "loss": 2.3009, + "step": 1897 + }, + { + "epoch": 0.5605434140578854, + "grad_norm": 0.2127500297286375, + "learning_rate": 4.2660519159507364e-05, + "loss": 2.3253, + "step": 1898 + }, + { + "epoch": 0.5608387477849971, + "grad_norm": 0.21282030396569823, + "learning_rate": 4.2613208851372543e-05, + "loss": 2.384, + "step": 1899 + }, + { + "epoch": 0.5611340815121086, + "grad_norm": 0.24031794265318193, + "learning_rate": 4.256590530327725e-05, + "loss": 2.3203, + "step": 1900 + }, + { + "epoch": 0.5614294152392203, + "grad_norm": 0.22217783644005148, + "learning_rate": 4.2518608558511453e-05, + "loss": 2.3369, + "step": 1901 + }, + { + "epoch": 0.561724748966332, + "grad_norm": 0.21427991747820815, + "learning_rate": 4.247131866035886e-05, + "loss": 2.2602, + "step": 1902 + }, + { + "epoch": 0.5620200826934436, + "grad_norm": 0.21677727653652915, + "learning_rate": 4.242403565209695e-05, + "loss": 2.3009, + "step": 1903 + }, + { + "epoch": 0.5623154164205553, + "grad_norm": 0.21626066792089493, + "learning_rate": 4.2376759576996857e-05, + "loss": 2.3543, + "step": 1904 + }, + { + "epoch": 0.5626107501476668, + "grad_norm": 0.21472016161045315, + "learning_rate": 4.232949047832342e-05, + "loss": 2.3185, + "step": 1905 + }, + { + "epoch": 0.5629060838747785, + "grad_norm": 0.22380087089047399, + "learning_rate": 4.228222839933507e-05, + "loss": 2.4492, + "step": 1906 + }, + { + "epoch": 0.5632014176018901, + "grad_norm": 0.21396179371152446, + "learning_rate": 4.223497338328378e-05, + "loss": 2.3876, + "step": 1907 + }, + { + "epoch": 0.5634967513290018, + "grad_norm": 0.20412634740128974, + "learning_rate": 4.218772547341512e-05, + "loss": 2.3336, + "step": 1908 + }, + { + "epoch": 0.5637920850561134, + "grad_norm": 0.2111888994501896, + "learning_rate": 4.214048471296812e-05, + "loss": 2.3728, + "step": 1909 + }, + { + "epoch": 0.564087418783225, + "grad_norm": 0.2062457044472016, + "learning_rate": 4.2093251145175265e-05, + "loss": 2.3182, + "step": 1910 + }, + { + "epoch": 0.5643827525103366, + "grad_norm": 0.2083034831755747, + "learning_rate": 4.204602481326248e-05, + "loss": 2.2827, + "step": 1911 + }, + { + "epoch": 0.5646780862374483, + "grad_norm": 0.21177619633323766, + "learning_rate": 4.1998805760449045e-05, + "loss": 2.2961, + "step": 1912 + }, + { + "epoch": 0.56497341996456, + "grad_norm": 0.2120212336671352, + "learning_rate": 4.195159402994762e-05, + "loss": 2.3579, + "step": 1913 + }, + { + "epoch": 0.5652687536916716, + "grad_norm": 0.21977278329234473, + "learning_rate": 4.190438966496407e-05, + "loss": 2.3553, + "step": 1914 + }, + { + "epoch": 0.5655640874187833, + "grad_norm": 0.20460882531289237, + "learning_rate": 4.185719270869763e-05, + "loss": 2.2887, + "step": 1915 + }, + { + "epoch": 0.5658594211458948, + "grad_norm": 0.21502373698796898, + "learning_rate": 4.1810003204340684e-05, + "loss": 2.2617, + "step": 1916 + }, + { + "epoch": 0.5661547548730065, + "grad_norm": 0.21266117329893025, + "learning_rate": 4.1762821195078833e-05, + "loss": 2.3532, + "step": 1917 + }, + { + "epoch": 0.5664500886001181, + "grad_norm": 0.20833459893477987, + "learning_rate": 4.171564672409081e-05, + "loss": 2.3261, + "step": 1918 + }, + { + "epoch": 0.5667454223272298, + "grad_norm": 0.2106128330989705, + "learning_rate": 4.166847983454843e-05, + "loss": 2.2381, + "step": 1919 + }, + { + "epoch": 0.5670407560543415, + "grad_norm": 0.21847805080660548, + "learning_rate": 4.1621320569616595e-05, + "loss": 2.3435, + "step": 1920 + }, + { + "epoch": 0.567336089781453, + "grad_norm": 0.2126329857870062, + "learning_rate": 4.1574168972453216e-05, + "loss": 2.2607, + "step": 1921 + }, + { + "epoch": 0.5676314235085647, + "grad_norm": 0.20510548000454448, + "learning_rate": 4.152702508620919e-05, + "loss": 2.2455, + "step": 1922 + }, + { + "epoch": 0.5679267572356763, + "grad_norm": 0.2162694615774181, + "learning_rate": 4.147988895402835e-05, + "loss": 2.3158, + "step": 1923 + }, + { + "epoch": 0.568222090962788, + "grad_norm": 0.20772284029197047, + "learning_rate": 4.143276061904745e-05, + "loss": 2.2309, + "step": 1924 + }, + { + "epoch": 0.5685174246898996, + "grad_norm": 0.21267716616732185, + "learning_rate": 4.138564012439609e-05, + "loss": 2.3289, + "step": 1925 + }, + { + "epoch": 0.5688127584170112, + "grad_norm": 0.21696586723945419, + "learning_rate": 4.133852751319671e-05, + "loss": 2.3748, + "step": 1926 + }, + { + "epoch": 0.5691080921441228, + "grad_norm": 0.20678096613490485, + "learning_rate": 4.129142282856452e-05, + "loss": 2.2331, + "step": 1927 + }, + { + "epoch": 0.5694034258712345, + "grad_norm": 0.20893914633492267, + "learning_rate": 4.12443261136075e-05, + "loss": 2.3693, + "step": 1928 + }, + { + "epoch": 0.5696987595983461, + "grad_norm": 0.21203744170861571, + "learning_rate": 4.1197237411426305e-05, + "loss": 2.2969, + "step": 1929 + }, + { + "epoch": 0.5699940933254578, + "grad_norm": 0.21766439196713414, + "learning_rate": 4.1150156765114274e-05, + "loss": 2.333, + "step": 1930 + }, + { + "epoch": 0.5702894270525694, + "grad_norm": 0.20750250353796587, + "learning_rate": 4.110308421775738e-05, + "loss": 2.2271, + "step": 1931 + }, + { + "epoch": 0.570584760779681, + "grad_norm": 0.22294704788761882, + "learning_rate": 4.1056019812434165e-05, + "loss": 2.3086, + "step": 1932 + }, + { + "epoch": 0.5708800945067927, + "grad_norm": 0.21496627088387973, + "learning_rate": 4.100896359221575e-05, + "loss": 2.3348, + "step": 1933 + }, + { + "epoch": 0.5711754282339043, + "grad_norm": 0.2188176493263548, + "learning_rate": 4.0961915600165736e-05, + "loss": 2.313, + "step": 1934 + }, + { + "epoch": 0.571470761961016, + "grad_norm": 0.21458638121159443, + "learning_rate": 4.09148758793402e-05, + "loss": 2.3526, + "step": 1935 + }, + { + "epoch": 0.5717660956881275, + "grad_norm": 0.20685084598113304, + "learning_rate": 4.086784447278767e-05, + "loss": 2.3044, + "step": 1936 + }, + { + "epoch": 0.5720614294152392, + "grad_norm": 0.2046993117527976, + "learning_rate": 4.082082142354904e-05, + "loss": 2.3646, + "step": 1937 + }, + { + "epoch": 0.5723567631423508, + "grad_norm": 0.5236539250933636, + "learning_rate": 4.077380677465756e-05, + "loss": 2.3392, + "step": 1938 + }, + { + "epoch": 0.5726520968694625, + "grad_norm": 0.20252125219393277, + "learning_rate": 4.072680056913881e-05, + "loss": 2.3157, + "step": 1939 + }, + { + "epoch": 0.5729474305965742, + "grad_norm": 0.21506209982003605, + "learning_rate": 4.067980285001063e-05, + "loss": 2.2279, + "step": 1940 + }, + { + "epoch": 0.5732427643236858, + "grad_norm": 0.21343762006798156, + "learning_rate": 4.0632813660283084e-05, + "loss": 2.3149, + "step": 1941 + }, + { + "epoch": 0.5735380980507974, + "grad_norm": 0.21730182883771923, + "learning_rate": 4.0585833042958466e-05, + "loss": 2.4289, + "step": 1942 + }, + { + "epoch": 0.573833431777909, + "grad_norm": 0.23511400540364935, + "learning_rate": 4.053886104103116e-05, + "loss": 2.3773, + "step": 1943 + }, + { + "epoch": 0.5741287655050207, + "grad_norm": 0.21041576889683783, + "learning_rate": 4.049189769748774e-05, + "loss": 2.2754, + "step": 1944 + }, + { + "epoch": 0.5744240992321323, + "grad_norm": 0.21615714978081405, + "learning_rate": 4.044494305530679e-05, + "loss": 2.3264, + "step": 1945 + }, + { + "epoch": 0.574719432959244, + "grad_norm": 0.20277352368524373, + "learning_rate": 4.0397997157459e-05, + "loss": 2.3284, + "step": 1946 + }, + { + "epoch": 0.5750147666863555, + "grad_norm": 0.21223907334124056, + "learning_rate": 4.0351060046906995e-05, + "loss": 2.3067, + "step": 1947 + }, + { + "epoch": 0.5753101004134672, + "grad_norm": 0.20799520391656853, + "learning_rate": 4.030413176660539e-05, + "loss": 2.2995, + "step": 1948 + }, + { + "epoch": 0.5756054341405789, + "grad_norm": 0.21437281232112423, + "learning_rate": 4.0257212359500716e-05, + "loss": 2.3723, + "step": 1949 + }, + { + "epoch": 0.5759007678676905, + "grad_norm": 0.20430747053923135, + "learning_rate": 4.0210301868531375e-05, + "loss": 2.2917, + "step": 1950 + }, + { + "epoch": 0.5761961015948022, + "grad_norm": 0.21361831223826852, + "learning_rate": 4.016340033662762e-05, + "loss": 2.3099, + "step": 1951 + }, + { + "epoch": 0.5764914353219137, + "grad_norm": 0.2111713473066087, + "learning_rate": 4.0116507806711496e-05, + "loss": 2.3102, + "step": 1952 + }, + { + "epoch": 0.5767867690490254, + "grad_norm": 0.20366707326820616, + "learning_rate": 4.006962432169681e-05, + "loss": 2.2896, + "step": 1953 + }, + { + "epoch": 0.577082102776137, + "grad_norm": 0.21864529215391468, + "learning_rate": 4.002274992448911e-05, + "loss": 2.1971, + "step": 1954 + }, + { + "epoch": 0.5773774365032487, + "grad_norm": 0.2092002217447973, + "learning_rate": 3.997588465798558e-05, + "loss": 2.2336, + "step": 1955 + }, + { + "epoch": 0.5776727702303603, + "grad_norm": 0.20515621151891086, + "learning_rate": 3.992902856507512e-05, + "loss": 2.2233, + "step": 1956 + }, + { + "epoch": 0.5779681039574719, + "grad_norm": 0.20109057403937516, + "learning_rate": 3.988218168863819e-05, + "loss": 2.1983, + "step": 1957 + }, + { + "epoch": 0.5782634376845835, + "grad_norm": 0.210614027026134, + "learning_rate": 3.9835344071546794e-05, + "loss": 2.3494, + "step": 1958 + }, + { + "epoch": 0.5785587714116952, + "grad_norm": 0.22621378699701372, + "learning_rate": 3.9788515756664494e-05, + "loss": 2.2267, + "step": 1959 + }, + { + "epoch": 0.5788541051388069, + "grad_norm": 0.21062250813614702, + "learning_rate": 3.974169678684635e-05, + "loss": 2.2941, + "step": 1960 + }, + { + "epoch": 0.5791494388659185, + "grad_norm": 0.20912933669692685, + "learning_rate": 3.9694887204938845e-05, + "loss": 2.3251, + "step": 1961 + }, + { + "epoch": 0.5794447725930302, + "grad_norm": 0.20560415107172186, + "learning_rate": 3.9648087053779886e-05, + "loss": 2.2259, + "step": 1962 + }, + { + "epoch": 0.5797401063201417, + "grad_norm": 0.20737190830105873, + "learning_rate": 3.960129637619874e-05, + "loss": 2.2841, + "step": 1963 + }, + { + "epoch": 0.5800354400472534, + "grad_norm": 0.22886143300989448, + "learning_rate": 3.9554515215015997e-05, + "loss": 2.3097, + "step": 1964 + }, + { + "epoch": 0.580330773774365, + "grad_norm": 0.20460320841034618, + "learning_rate": 3.950774361304356e-05, + "loss": 2.26, + "step": 1965 + }, + { + "epoch": 0.5806261075014767, + "grad_norm": 0.2130872273602769, + "learning_rate": 3.946098161308456e-05, + "loss": 2.2492, + "step": 1966 + }, + { + "epoch": 0.5809214412285884, + "grad_norm": 0.20380253766817813, + "learning_rate": 3.9414229257933376e-05, + "loss": 2.248, + "step": 1967 + }, + { + "epoch": 0.5812167749556999, + "grad_norm": 0.20578652600336703, + "learning_rate": 3.9367486590375514e-05, + "loss": 2.3306, + "step": 1968 + }, + { + "epoch": 0.5815121086828116, + "grad_norm": 0.21722442121021432, + "learning_rate": 3.932075365318765e-05, + "loss": 2.3022, + "step": 1969 + }, + { + "epoch": 0.5818074424099232, + "grad_norm": 0.1996434210690231, + "learning_rate": 3.927403048913754e-05, + "loss": 2.2934, + "step": 1970 + }, + { + "epoch": 0.5821027761370349, + "grad_norm": 0.2088398299841818, + "learning_rate": 3.922731714098399e-05, + "loss": 2.2562, + "step": 1971 + }, + { + "epoch": 0.5823981098641465, + "grad_norm": 0.19716660686625295, + "learning_rate": 3.918061365147685e-05, + "loss": 2.2258, + "step": 1972 + }, + { + "epoch": 0.5826934435912581, + "grad_norm": 0.20711892448808633, + "learning_rate": 3.9133920063356915e-05, + "loss": 2.2691, + "step": 1973 + }, + { + "epoch": 0.5829887773183697, + "grad_norm": 0.2088279138429133, + "learning_rate": 3.9087236419355935e-05, + "loss": 2.3082, + "step": 1974 + }, + { + "epoch": 0.5832841110454814, + "grad_norm": 0.21843168395544327, + "learning_rate": 3.904056276219656e-05, + "loss": 2.2952, + "step": 1975 + }, + { + "epoch": 0.583579444772593, + "grad_norm": 0.20108677628939964, + "learning_rate": 3.899389913459231e-05, + "loss": 2.3386, + "step": 1976 + }, + { + "epoch": 0.5838747784997047, + "grad_norm": 0.2263554909871159, + "learning_rate": 3.89472455792475e-05, + "loss": 2.3844, + "step": 1977 + }, + { + "epoch": 0.5841701122268163, + "grad_norm": 0.3256070277220377, + "learning_rate": 3.8900602138857245e-05, + "loss": 2.3935, + "step": 1978 + }, + { + "epoch": 0.5844654459539279, + "grad_norm": 0.2169908909797563, + "learning_rate": 3.8853968856107404e-05, + "loss": 2.2621, + "step": 1979 + }, + { + "epoch": 0.5847607796810396, + "grad_norm": 0.2140799905077454, + "learning_rate": 3.880734577367453e-05, + "loss": 2.2952, + "step": 1980 + }, + { + "epoch": 0.5850561134081512, + "grad_norm": 0.21128862791039627, + "learning_rate": 3.8760732934225866e-05, + "loss": 2.2547, + "step": 1981 + }, + { + "epoch": 0.5853514471352629, + "grad_norm": 0.22792874477423603, + "learning_rate": 3.871413038041924e-05, + "loss": 2.4029, + "step": 1982 + }, + { + "epoch": 0.5856467808623745, + "grad_norm": 0.20125634540304335, + "learning_rate": 3.8667538154903094e-05, + "loss": 2.3097, + "step": 1983 + }, + { + "epoch": 0.5859421145894861, + "grad_norm": 0.20774166111615633, + "learning_rate": 3.8620956300316434e-05, + "loss": 2.2919, + "step": 1984 + }, + { + "epoch": 0.5862374483165977, + "grad_norm": 0.21874467809197887, + "learning_rate": 3.8574384859288746e-05, + "loss": 2.3902, + "step": 1985 + }, + { + "epoch": 0.5865327820437094, + "grad_norm": 0.2124055007173793, + "learning_rate": 3.852782387444e-05, + "loss": 2.2731, + "step": 1986 + }, + { + "epoch": 0.5868281157708211, + "grad_norm": 0.22046892010677488, + "learning_rate": 3.848127338838058e-05, + "loss": 2.2776, + "step": 1987 + }, + { + "epoch": 0.5871234494979327, + "grad_norm": 0.21378108690866496, + "learning_rate": 3.843473344371127e-05, + "loss": 2.2811, + "step": 1988 + }, + { + "epoch": 0.5874187832250443, + "grad_norm": 0.24669202862794953, + "learning_rate": 3.838820408302322e-05, + "loss": 2.3811, + "step": 1989 + }, + { + "epoch": 0.5877141169521559, + "grad_norm": 0.2103323202443801, + "learning_rate": 3.8341685348897884e-05, + "loss": 2.3315, + "step": 1990 + }, + { + "epoch": 0.5880094506792676, + "grad_norm": 0.20719303251736418, + "learning_rate": 3.8295177283906995e-05, + "loss": 2.2562, + "step": 1991 + }, + { + "epoch": 0.5883047844063792, + "grad_norm": 0.21391978553318267, + "learning_rate": 3.824867993061252e-05, + "loss": 2.4671, + "step": 1992 + }, + { + "epoch": 0.5886001181334909, + "grad_norm": 0.20999438884311744, + "learning_rate": 3.8202193331566615e-05, + "loss": 2.2954, + "step": 1993 + }, + { + "epoch": 0.5888954518606024, + "grad_norm": 0.20750158793347365, + "learning_rate": 3.815571752931162e-05, + "loss": 2.2694, + "step": 1994 + }, + { + "epoch": 0.5891907855877141, + "grad_norm": 0.22857156704574885, + "learning_rate": 3.810925256637994e-05, + "loss": 2.3437, + "step": 1995 + }, + { + "epoch": 0.5894861193148258, + "grad_norm": 0.2135360802330655, + "learning_rate": 3.8062798485294136e-05, + "loss": 2.364, + "step": 1996 + }, + { + "epoch": 0.5897814530419374, + "grad_norm": 0.21082225052976658, + "learning_rate": 3.801635532856676e-05, + "loss": 2.32, + "step": 1997 + }, + { + "epoch": 0.5900767867690491, + "grad_norm": 0.21468619725194407, + "learning_rate": 3.796992313870037e-05, + "loss": 2.2939, + "step": 1998 + }, + { + "epoch": 0.5903721204961606, + "grad_norm": 0.19806896590623455, + "learning_rate": 3.79235019581875e-05, + "loss": 2.2509, + "step": 1999 + }, + { + "epoch": 0.5906674542232723, + "grad_norm": 0.2049776466905956, + "learning_rate": 3.78770918295106e-05, + "loss": 2.2951, + "step": 2000 + }, + { + "epoch": 0.5909627879503839, + "grad_norm": 0.2061605859334903, + "learning_rate": 3.783069279514204e-05, + "loss": 2.2101, + "step": 2001 + }, + { + "epoch": 0.5912581216774956, + "grad_norm": 0.21202663648529518, + "learning_rate": 3.7784304897543956e-05, + "loss": 2.2495, + "step": 2002 + }, + { + "epoch": 0.5915534554046072, + "grad_norm": 0.22777926982333455, + "learning_rate": 3.773792817916836e-05, + "loss": 2.3497, + "step": 2003 + }, + { + "epoch": 0.5918487891317188, + "grad_norm": 0.2064195449035772, + "learning_rate": 3.769156268245703e-05, + "loss": 2.3066, + "step": 2004 + }, + { + "epoch": 0.5921441228588304, + "grad_norm": 0.20422746343491657, + "learning_rate": 3.764520844984145e-05, + "loss": 2.2907, + "step": 2005 + }, + { + "epoch": 0.5924394565859421, + "grad_norm": 0.20865695474427634, + "learning_rate": 3.759886552374281e-05, + "loss": 2.2382, + "step": 2006 + }, + { + "epoch": 0.5927347903130538, + "grad_norm": 0.21923806885027208, + "learning_rate": 3.755253394657194e-05, + "loss": 2.4312, + "step": 2007 + }, + { + "epoch": 0.5930301240401654, + "grad_norm": 0.2031939004537118, + "learning_rate": 3.7506213760729315e-05, + "loss": 2.3205, + "step": 2008 + }, + { + "epoch": 0.5933254577672771, + "grad_norm": 0.20850019428589456, + "learning_rate": 3.745990500860495e-05, + "loss": 2.2447, + "step": 2009 + }, + { + "epoch": 0.5936207914943886, + "grad_norm": 0.2156107994600358, + "learning_rate": 3.7413607732578416e-05, + "loss": 2.3146, + "step": 2010 + }, + { + "epoch": 0.5939161252215003, + "grad_norm": 0.2061328468517906, + "learning_rate": 3.7367321975018766e-05, + "loss": 2.3773, + "step": 2011 + }, + { + "epoch": 0.5942114589486119, + "grad_norm": 0.1963346343018277, + "learning_rate": 3.732104777828453e-05, + "loss": 2.0472, + "step": 2012 + }, + { + "epoch": 0.5945067926757236, + "grad_norm": 0.21538652867907404, + "learning_rate": 3.727478518472364e-05, + "loss": 2.2911, + "step": 2013 + }, + { + "epoch": 0.5948021264028353, + "grad_norm": 0.22127024611819668, + "learning_rate": 3.722853423667342e-05, + "loss": 2.2264, + "step": 2014 + }, + { + "epoch": 0.5950974601299468, + "grad_norm": 0.20792424847177038, + "learning_rate": 3.7182294976460546e-05, + "loss": 2.285, + "step": 2015 + }, + { + "epoch": 0.5953927938570585, + "grad_norm": 0.21396425735483815, + "learning_rate": 3.7136067446401e-05, + "loss": 2.3225, + "step": 2016 + }, + { + "epoch": 0.5956881275841701, + "grad_norm": 0.23631013849082846, + "learning_rate": 3.708985168879998e-05, + "loss": 2.3464, + "step": 2017 + }, + { + "epoch": 0.5959834613112818, + "grad_norm": 0.20800379180604173, + "learning_rate": 3.7043647745951966e-05, + "loss": 2.2641, + "step": 2018 + }, + { + "epoch": 0.5962787950383934, + "grad_norm": 0.2314301417404614, + "learning_rate": 3.699745566014059e-05, + "loss": 2.3641, + "step": 2019 + }, + { + "epoch": 0.596574128765505, + "grad_norm": 0.21461333143088865, + "learning_rate": 3.695127547363867e-05, + "loss": 2.2935, + "step": 2020 + }, + { + "epoch": 0.5968694624926166, + "grad_norm": 0.20894899692735075, + "learning_rate": 3.690510722870811e-05, + "loss": 2.2425, + "step": 2021 + }, + { + "epoch": 0.5971647962197283, + "grad_norm": 0.20923291338613126, + "learning_rate": 3.685895096759989e-05, + "loss": 2.3, + "step": 2022 + }, + { + "epoch": 0.59746012994684, + "grad_norm": 0.21027113599826935, + "learning_rate": 3.681280673255403e-05, + "loss": 2.3003, + "step": 2023 + }, + { + "epoch": 0.5977554636739516, + "grad_norm": 0.2073864768331986, + "learning_rate": 3.6766674565799507e-05, + "loss": 2.4326, + "step": 2024 + }, + { + "epoch": 0.5980507974010632, + "grad_norm": 0.21583946281153024, + "learning_rate": 3.6720554509554316e-05, + "loss": 2.3239, + "step": 2025 + }, + { + "epoch": 0.5983461311281748, + "grad_norm": 0.2053261573957909, + "learning_rate": 3.6674446606025316e-05, + "loss": 2.3112, + "step": 2026 + }, + { + "epoch": 0.5986414648552865, + "grad_norm": 0.21215596433803752, + "learning_rate": 3.662835089740827e-05, + "loss": 2.2855, + "step": 2027 + }, + { + "epoch": 0.5989367985823981, + "grad_norm": 0.20616918700136586, + "learning_rate": 3.6582267425887787e-05, + "loss": 2.3134, + "step": 2028 + }, + { + "epoch": 0.5992321323095098, + "grad_norm": 0.21708996181167364, + "learning_rate": 3.653619623363725e-05, + "loss": 2.3415, + "step": 2029 + }, + { + "epoch": 0.5995274660366214, + "grad_norm": 0.21099889709423958, + "learning_rate": 3.649013736281884e-05, + "loss": 2.3176, + "step": 2030 + }, + { + "epoch": 0.599822799763733, + "grad_norm": 0.2038694732210426, + "learning_rate": 3.644409085558341e-05, + "loss": 2.3165, + "step": 2031 + }, + { + "epoch": 0.6001181334908446, + "grad_norm": 0.2072926616325334, + "learning_rate": 3.6398056754070566e-05, + "loss": 2.3198, + "step": 2032 + }, + { + "epoch": 0.6004134672179563, + "grad_norm": 0.21236028145164468, + "learning_rate": 3.63520351004085e-05, + "loss": 2.298, + "step": 2033 + }, + { + "epoch": 0.600708800945068, + "grad_norm": 0.2173546583480951, + "learning_rate": 3.630602593671405e-05, + "loss": 2.2977, + "step": 2034 + }, + { + "epoch": 0.6010041346721796, + "grad_norm": 0.20638025245706146, + "learning_rate": 3.62600293050926e-05, + "loss": 2.2665, + "step": 2035 + }, + { + "epoch": 0.6012994683992912, + "grad_norm": 0.2102523467010051, + "learning_rate": 3.62140452476381e-05, + "loss": 2.1955, + "step": 2036 + }, + { + "epoch": 0.6015948021264028, + "grad_norm": 0.2045729637010621, + "learning_rate": 3.616807380643295e-05, + "loss": 2.3196, + "step": 2037 + }, + { + "epoch": 0.6018901358535145, + "grad_norm": 0.23005196498729336, + "learning_rate": 3.612211502354803e-05, + "loss": 2.2518, + "step": 2038 + }, + { + "epoch": 0.6021854695806261, + "grad_norm": 0.2089437730511037, + "learning_rate": 3.607616894104263e-05, + "loss": 2.2818, + "step": 2039 + }, + { + "epoch": 0.6024808033077378, + "grad_norm": 0.217317385908512, + "learning_rate": 3.603023560096442e-05, + "loss": 2.2624, + "step": 2040 + }, + { + "epoch": 0.6027761370348493, + "grad_norm": 0.20412329964125198, + "learning_rate": 3.598431504534941e-05, + "loss": 2.3067, + "step": 2041 + }, + { + "epoch": 0.603071470761961, + "grad_norm": 0.21497793342216145, + "learning_rate": 3.593840731622188e-05, + "loss": 2.3847, + "step": 2042 + }, + { + "epoch": 0.6033668044890727, + "grad_norm": 0.2147871549797251, + "learning_rate": 3.5892512455594426e-05, + "loss": 2.3827, + "step": 2043 + }, + { + "epoch": 0.6036621382161843, + "grad_norm": 0.20645923585595016, + "learning_rate": 3.584663050546784e-05, + "loss": 2.3705, + "step": 2044 + }, + { + "epoch": 0.603957471943296, + "grad_norm": 0.20936036379031392, + "learning_rate": 3.5800761507831086e-05, + "loss": 2.2919, + "step": 2045 + }, + { + "epoch": 0.6042528056704075, + "grad_norm": 0.2251519077816801, + "learning_rate": 3.575490550466129e-05, + "loss": 2.3213, + "step": 2046 + }, + { + "epoch": 0.6045481393975192, + "grad_norm": 0.21770717485171343, + "learning_rate": 3.5709062537923665e-05, + "loss": 2.323, + "step": 2047 + }, + { + "epoch": 0.6048434731246308, + "grad_norm": 0.2236664925746079, + "learning_rate": 3.566323264957153e-05, + "loss": 2.2557, + "step": 2048 + }, + { + "epoch": 0.6051388068517425, + "grad_norm": 0.21427090308398897, + "learning_rate": 3.561741588154622e-05, + "loss": 2.3342, + "step": 2049 + }, + { + "epoch": 0.6054341405788541, + "grad_norm": 0.20515977051700243, + "learning_rate": 3.557161227577706e-05, + "loss": 2.3018, + "step": 2050 + }, + { + "epoch": 0.6057294743059657, + "grad_norm": 0.20824282985080847, + "learning_rate": 3.552582187418131e-05, + "loss": 2.2486, + "step": 2051 + }, + { + "epoch": 0.6060248080330773, + "grad_norm": 0.20394293539126485, + "learning_rate": 3.548004471866418e-05, + "loss": 2.3175, + "step": 2052 + }, + { + "epoch": 0.606320141760189, + "grad_norm": 0.21561835838538176, + "learning_rate": 3.543428085111874e-05, + "loss": 2.3578, + "step": 2053 + }, + { + "epoch": 0.6066154754873007, + "grad_norm": 0.21867923784143314, + "learning_rate": 3.5388530313425904e-05, + "loss": 2.291, + "step": 2054 + }, + { + "epoch": 0.6069108092144123, + "grad_norm": 0.21849717277437056, + "learning_rate": 3.5342793147454386e-05, + "loss": 2.2633, + "step": 2055 + }, + { + "epoch": 0.607206142941524, + "grad_norm": 0.21609689070321775, + "learning_rate": 3.529706939506066e-05, + "loss": 2.339, + "step": 2056 + }, + { + "epoch": 0.6075014766686355, + "grad_norm": 0.20630258410042385, + "learning_rate": 3.525135909808893e-05, + "loss": 2.2657, + "step": 2057 + }, + { + "epoch": 0.6077968103957472, + "grad_norm": 0.20787964933452424, + "learning_rate": 3.520566229837108e-05, + "loss": 2.2713, + "step": 2058 + }, + { + "epoch": 0.6080921441228588, + "grad_norm": 0.20329253000487987, + "learning_rate": 3.515997903772664e-05, + "loss": 2.3273, + "step": 2059 + }, + { + "epoch": 0.6083874778499705, + "grad_norm": 0.21367011723327117, + "learning_rate": 3.511430935796278e-05, + "loss": 2.4283, + "step": 2060 + }, + { + "epoch": 0.6086828115770822, + "grad_norm": 0.20906309623804303, + "learning_rate": 3.506865330087419e-05, + "loss": 2.2701, + "step": 2061 + }, + { + "epoch": 0.6089781453041937, + "grad_norm": 0.24316485148229103, + "learning_rate": 3.5023010908243127e-05, + "loss": 2.3499, + "step": 2062 + }, + { + "epoch": 0.6092734790313054, + "grad_norm": 0.21592229272040248, + "learning_rate": 3.497738222183933e-05, + "loss": 2.393, + "step": 2063 + }, + { + "epoch": 0.609568812758417, + "grad_norm": 0.21500382961976605, + "learning_rate": 3.493176728342e-05, + "loss": 2.2562, + "step": 2064 + }, + { + "epoch": 0.6098641464855287, + "grad_norm": 0.21391947207328305, + "learning_rate": 3.488616613472977e-05, + "loss": 2.2554, + "step": 2065 + }, + { + "epoch": 0.6101594802126403, + "grad_norm": 0.21751415743735902, + "learning_rate": 3.4840578817500635e-05, + "loss": 2.3011, + "step": 2066 + }, + { + "epoch": 0.6104548139397519, + "grad_norm": 0.222766246729768, + "learning_rate": 3.479500537345193e-05, + "loss": 2.2538, + "step": 2067 + }, + { + "epoch": 0.6107501476668635, + "grad_norm": 0.20568337816207724, + "learning_rate": 3.474944584429031e-05, + "loss": 2.2898, + "step": 2068 + }, + { + "epoch": 0.6110454813939752, + "grad_norm": 0.21466025405861291, + "learning_rate": 3.4703900271709675e-05, + "loss": 2.279, + "step": 2069 + }, + { + "epoch": 0.6113408151210868, + "grad_norm": 0.22619174349520654, + "learning_rate": 3.465836869739117e-05, + "loss": 2.3022, + "step": 2070 + }, + { + "epoch": 0.6116361488481985, + "grad_norm": 0.2122687083818206, + "learning_rate": 3.461285116300314e-05, + "loss": 2.2801, + "step": 2071 + }, + { + "epoch": 0.61193148257531, + "grad_norm": 0.21779698300196462, + "learning_rate": 3.4567347710201035e-05, + "loss": 2.3925, + "step": 2072 + }, + { + "epoch": 0.6122268163024217, + "grad_norm": 0.211717215317798, + "learning_rate": 3.452185838062747e-05, + "loss": 2.2896, + "step": 2073 + }, + { + "epoch": 0.6125221500295334, + "grad_norm": 0.20845041064894249, + "learning_rate": 3.4476383215912114e-05, + "loss": 2.2708, + "step": 2074 + }, + { + "epoch": 0.612817483756645, + "grad_norm": 0.21199436687799336, + "learning_rate": 3.443092225767166e-05, + "loss": 2.3443, + "step": 2075 + }, + { + "epoch": 0.6131128174837567, + "grad_norm": 0.2350964568404555, + "learning_rate": 3.4385475547509815e-05, + "loss": 2.3736, + "step": 2076 + }, + { + "epoch": 0.6134081512108683, + "grad_norm": 0.21152356641770764, + "learning_rate": 3.434004312701724e-05, + "loss": 2.2678, + "step": 2077 + }, + { + "epoch": 0.6137034849379799, + "grad_norm": 0.2153813052729538, + "learning_rate": 3.429462503777154e-05, + "loss": 2.3315, + "step": 2078 + }, + { + "epoch": 0.6139988186650915, + "grad_norm": 0.22114156536224988, + "learning_rate": 3.4249221321337176e-05, + "loss": 2.3282, + "step": 2079 + }, + { + "epoch": 0.6142941523922032, + "grad_norm": 0.20791182295604677, + "learning_rate": 3.4203832019265466e-05, + "loss": 2.3362, + "step": 2080 + }, + { + "epoch": 0.6145894861193149, + "grad_norm": 1.694939708593917, + "learning_rate": 3.415845717309454e-05, + "loss": 2.3513, + "step": 2081 + }, + { + "epoch": 0.6148848198464265, + "grad_norm": 0.2066243329322972, + "learning_rate": 3.41130968243493e-05, + "loss": 2.3018, + "step": 2082 + }, + { + "epoch": 0.6151801535735381, + "grad_norm": 0.21964295386019847, + "learning_rate": 3.406775101454137e-05, + "loss": 2.2232, + "step": 2083 + }, + { + "epoch": 0.6154754873006497, + "grad_norm": 0.2139843223685917, + "learning_rate": 3.402241978516908e-05, + "loss": 2.2903, + "step": 2084 + }, + { + "epoch": 0.6157708210277614, + "grad_norm": 0.22962070681141214, + "learning_rate": 3.3977103177717425e-05, + "loss": 2.3754, + "step": 2085 + }, + { + "epoch": 0.616066154754873, + "grad_norm": 0.2138572934444395, + "learning_rate": 3.393180123365797e-05, + "loss": 2.3093, + "step": 2086 + }, + { + "epoch": 0.6163614884819847, + "grad_norm": 0.22256454453122437, + "learning_rate": 3.3886513994448946e-05, + "loss": 2.3936, + "step": 2087 + }, + { + "epoch": 0.6166568222090962, + "grad_norm": 0.20980221271209443, + "learning_rate": 3.384124150153504e-05, + "loss": 2.3388, + "step": 2088 + }, + { + "epoch": 0.6169521559362079, + "grad_norm": 0.22153315133188142, + "learning_rate": 3.3795983796347526e-05, + "loss": 2.3372, + "step": 2089 + }, + { + "epoch": 0.6172474896633195, + "grad_norm": 0.21191957352323793, + "learning_rate": 3.3750740920304046e-05, + "loss": 2.3239, + "step": 2090 + }, + { + "epoch": 0.6175428233904312, + "grad_norm": 0.22158195424992722, + "learning_rate": 3.3705512914808747e-05, + "loss": 2.3146, + "step": 2091 + }, + { + "epoch": 0.6178381571175429, + "grad_norm": 0.21578624508246494, + "learning_rate": 3.366029982125215e-05, + "loss": 2.2377, + "step": 2092 + }, + { + "epoch": 0.6181334908446544, + "grad_norm": 0.21392332063247346, + "learning_rate": 3.361510168101113e-05, + "loss": 2.3726, + "step": 2093 + }, + { + "epoch": 0.6184288245717661, + "grad_norm": 0.22537052719450965, + "learning_rate": 3.356991853544886e-05, + "loss": 2.389, + "step": 2094 + }, + { + "epoch": 0.6187241582988777, + "grad_norm": 0.19999911300641304, + "learning_rate": 3.352475042591482e-05, + "loss": 2.3346, + "step": 2095 + }, + { + "epoch": 0.6190194920259894, + "grad_norm": 0.21196532618229055, + "learning_rate": 3.347959739374469e-05, + "loss": 2.2903, + "step": 2096 + }, + { + "epoch": 0.619314825753101, + "grad_norm": 0.21865916069004884, + "learning_rate": 3.3434459480260404e-05, + "loss": 2.317, + "step": 2097 + }, + { + "epoch": 0.6196101594802126, + "grad_norm": 0.21574951322440955, + "learning_rate": 3.338933672676999e-05, + "loss": 2.3085, + "step": 2098 + }, + { + "epoch": 0.6199054932073242, + "grad_norm": 0.21438367306150213, + "learning_rate": 3.33442291745677e-05, + "loss": 2.3123, + "step": 2099 + }, + { + "epoch": 0.6202008269344359, + "grad_norm": 0.22102613358715528, + "learning_rate": 3.329913686493377e-05, + "loss": 2.3194, + "step": 2100 + }, + { + "epoch": 0.6204961606615476, + "grad_norm": 0.21010211367842097, + "learning_rate": 3.325405983913455e-05, + "loss": 2.3461, + "step": 2101 + }, + { + "epoch": 0.6207914943886592, + "grad_norm": 0.2108462454180446, + "learning_rate": 3.32089981384224e-05, + "loss": 2.2973, + "step": 2102 + }, + { + "epoch": 0.6210868281157709, + "grad_norm": 0.25962714038897766, + "learning_rate": 3.316395180403563e-05, + "loss": 2.349, + "step": 2103 + }, + { + "epoch": 0.6213821618428824, + "grad_norm": 0.22984198837331293, + "learning_rate": 3.311892087719853e-05, + "loss": 2.3636, + "step": 2104 + }, + { + "epoch": 0.6216774955699941, + "grad_norm": 0.22851832298577157, + "learning_rate": 3.30739053991212e-05, + "loss": 2.3418, + "step": 2105 + }, + { + "epoch": 0.6219728292971057, + "grad_norm": 0.2146707878937667, + "learning_rate": 3.30289054109997e-05, + "loss": 2.3868, + "step": 2106 + }, + { + "epoch": 0.6222681630242174, + "grad_norm": 0.20748701850660126, + "learning_rate": 3.2983920954015874e-05, + "loss": 2.3501, + "step": 2107 + }, + { + "epoch": 0.622563496751329, + "grad_norm": 0.20815051918977584, + "learning_rate": 3.293895206933735e-05, + "loss": 2.286, + "step": 2108 + }, + { + "epoch": 0.6228588304784406, + "grad_norm": 0.24129222164398734, + "learning_rate": 3.289399879811751e-05, + "loss": 2.3969, + "step": 2109 + }, + { + "epoch": 0.6231541642055523, + "grad_norm": 0.20244988710311262, + "learning_rate": 3.284906118149545e-05, + "loss": 2.2669, + "step": 2110 + }, + { + "epoch": 0.6234494979326639, + "grad_norm": 0.20472740883553103, + "learning_rate": 3.280413926059593e-05, + "loss": 2.3379, + "step": 2111 + }, + { + "epoch": 0.6237448316597756, + "grad_norm": 0.21610134693398544, + "learning_rate": 3.2759233076529366e-05, + "loss": 2.3491, + "step": 2112 + }, + { + "epoch": 0.6240401653868872, + "grad_norm": 0.2095450340732284, + "learning_rate": 3.2714342670391756e-05, + "loss": 2.3265, + "step": 2113 + }, + { + "epoch": 0.6243354991139988, + "grad_norm": 0.21052597664073242, + "learning_rate": 3.266946808326466e-05, + "loss": 2.3697, + "step": 2114 + }, + { + "epoch": 0.6246308328411104, + "grad_norm": 0.21586662762138453, + "learning_rate": 3.2624609356215156e-05, + "loss": 2.3025, + "step": 2115 + }, + { + "epoch": 0.6249261665682221, + "grad_norm": 0.21069875204140381, + "learning_rate": 3.257976653029583e-05, + "loss": 2.297, + "step": 2116 + }, + { + "epoch": 0.6252215002953337, + "grad_norm": 0.20652898105304873, + "learning_rate": 3.2534939646544686e-05, + "loss": 2.3127, + "step": 2117 + }, + { + "epoch": 0.6255168340224454, + "grad_norm": 0.20859198408772345, + "learning_rate": 3.249012874598518e-05, + "loss": 2.2286, + "step": 2118 + }, + { + "epoch": 0.625812167749557, + "grad_norm": 0.22082680637832622, + "learning_rate": 3.2445333869626084e-05, + "loss": 2.2654, + "step": 2119 + }, + { + "epoch": 0.6261075014766686, + "grad_norm": 0.20626354028587288, + "learning_rate": 3.2400555058461536e-05, + "loss": 2.2802, + "step": 2120 + }, + { + "epoch": 0.6264028352037803, + "grad_norm": 0.20233474659864772, + "learning_rate": 3.235579235347098e-05, + "loss": 2.3472, + "step": 2121 + }, + { + "epoch": 0.6266981689308919, + "grad_norm": 0.20906603634841248, + "learning_rate": 3.2311045795619124e-05, + "loss": 2.3644, + "step": 2122 + }, + { + "epoch": 0.6269935026580036, + "grad_norm": 0.20795441779456317, + "learning_rate": 3.226631542585587e-05, + "loss": 2.3212, + "step": 2123 + }, + { + "epoch": 0.6272888363851152, + "grad_norm": 0.2117048540517045, + "learning_rate": 3.222160128511633e-05, + "loss": 2.3731, + "step": 2124 + }, + { + "epoch": 0.6275841701122268, + "grad_norm": 0.3282228165103756, + "learning_rate": 3.217690341432077e-05, + "loss": 2.3651, + "step": 2125 + }, + { + "epoch": 0.6278795038393384, + "grad_norm": 0.2825213414123331, + "learning_rate": 3.213222185437453e-05, + "loss": 2.2476, + "step": 2126 + }, + { + "epoch": 0.6281748375664501, + "grad_norm": 0.20613361492221283, + "learning_rate": 3.2087556646168056e-05, + "loss": 2.2521, + "step": 2127 + }, + { + "epoch": 0.6284701712935618, + "grad_norm": 0.20526737382028679, + "learning_rate": 3.2042907830576815e-05, + "loss": 2.338, + "step": 2128 + }, + { + "epoch": 0.6287655050206734, + "grad_norm": 0.2221630389326845, + "learning_rate": 3.199827544846129e-05, + "loss": 2.2889, + "step": 2129 + }, + { + "epoch": 0.629060838747785, + "grad_norm": 0.21083718139607177, + "learning_rate": 3.1953659540666903e-05, + "loss": 2.3355, + "step": 2130 + }, + { + "epoch": 0.6293561724748966, + "grad_norm": 0.21346344135302697, + "learning_rate": 3.190906014802401e-05, + "loss": 2.3265, + "step": 2131 + }, + { + "epoch": 0.6296515062020083, + "grad_norm": 0.2238495538516452, + "learning_rate": 3.186447731134784e-05, + "loss": 2.3107, + "step": 2132 + }, + { + "epoch": 0.6299468399291199, + "grad_norm": 0.20975548206435538, + "learning_rate": 3.181991107143851e-05, + "loss": 2.3106, + "step": 2133 + }, + { + "epoch": 0.6302421736562316, + "grad_norm": 0.2026126709422336, + "learning_rate": 3.177536146908089e-05, + "loss": 2.3275, + "step": 2134 + }, + { + "epoch": 0.6305375073833431, + "grad_norm": 0.21118934044003548, + "learning_rate": 3.173082854504467e-05, + "loss": 2.2564, + "step": 2135 + }, + { + "epoch": 0.6308328411104548, + "grad_norm": 0.3506806545830201, + "learning_rate": 3.168631234008426e-05, + "loss": 2.3184, + "step": 2136 + }, + { + "epoch": 0.6311281748375664, + "grad_norm": 0.21405370408802718, + "learning_rate": 3.164181289493877e-05, + "loss": 2.4538, + "step": 2137 + }, + { + "epoch": 0.6314235085646781, + "grad_norm": 0.2122257331510362, + "learning_rate": 3.1597330250331973e-05, + "loss": 2.3122, + "step": 2138 + }, + { + "epoch": 0.6317188422917898, + "grad_norm": 0.9073176836093152, + "learning_rate": 3.1552864446972275e-05, + "loss": 2.1975, + "step": 2139 + }, + { + "epoch": 0.6320141760189013, + "grad_norm": 0.22056815147131695, + "learning_rate": 3.150841552555265e-05, + "loss": 2.3383, + "step": 2140 + }, + { + "epoch": 0.632309509746013, + "grad_norm": 0.21274671152455688, + "learning_rate": 3.146398352675064e-05, + "loss": 2.3185, + "step": 2141 + }, + { + "epoch": 0.6326048434731246, + "grad_norm": 0.24613519349889174, + "learning_rate": 3.141956849122829e-05, + "loss": 2.222, + "step": 2142 + }, + { + "epoch": 0.6329001772002363, + "grad_norm": 0.20579041804555137, + "learning_rate": 3.137517045963214e-05, + "loss": 2.2544, + "step": 2143 + }, + { + "epoch": 0.6331955109273479, + "grad_norm": 0.21196087087333718, + "learning_rate": 3.1330789472593134e-05, + "loss": 2.3375, + "step": 2144 + }, + { + "epoch": 0.6334908446544595, + "grad_norm": 0.22023984509157093, + "learning_rate": 3.1286425570726654e-05, + "loss": 2.3463, + "step": 2145 + }, + { + "epoch": 0.6337861783815711, + "grad_norm": 0.25016024043696694, + "learning_rate": 3.124207879463242e-05, + "loss": 2.3532, + "step": 2146 + }, + { + "epoch": 0.6340815121086828, + "grad_norm": 0.21262602154701082, + "learning_rate": 3.11977491848945e-05, + "loss": 2.2774, + "step": 2147 + }, + { + "epoch": 0.6343768458357945, + "grad_norm": 0.2073370995565342, + "learning_rate": 3.1153436782081245e-05, + "loss": 2.2839, + "step": 2148 + }, + { + "epoch": 0.6346721795629061, + "grad_norm": 0.2086486972631962, + "learning_rate": 3.110914162674524e-05, + "loss": 2.3277, + "step": 2149 + }, + { + "epoch": 0.6349675132900178, + "grad_norm": 0.21951347190848572, + "learning_rate": 3.106486375942331e-05, + "loss": 2.2734, + "step": 2150 + }, + { + "epoch": 0.6352628470171293, + "grad_norm": 0.21842348901076056, + "learning_rate": 3.102060322063645e-05, + "loss": 2.3412, + "step": 2151 + }, + { + "epoch": 0.635558180744241, + "grad_norm": 0.2192457915721028, + "learning_rate": 3.0976360050889795e-05, + "loss": 2.3305, + "step": 2152 + }, + { + "epoch": 0.6358535144713526, + "grad_norm": 0.20782315649947014, + "learning_rate": 3.09321342906726e-05, + "loss": 2.2824, + "step": 2153 + }, + { + "epoch": 0.6361488481984643, + "grad_norm": 0.21840656944259182, + "learning_rate": 3.0887925980458154e-05, + "loss": 2.1931, + "step": 2154 + }, + { + "epoch": 0.636444181925576, + "grad_norm": 0.2085835454352069, + "learning_rate": 3.084373516070382e-05, + "loss": 2.3384, + "step": 2155 + }, + { + "epoch": 0.6367395156526875, + "grad_norm": 0.4560302793191468, + "learning_rate": 3.079956187185091e-05, + "loss": 2.4357, + "step": 2156 + }, + { + "epoch": 0.6370348493797992, + "grad_norm": 0.2143452583262143, + "learning_rate": 3.075540615432474e-05, + "loss": 2.3236, + "step": 2157 + }, + { + "epoch": 0.6373301831069108, + "grad_norm": 0.20781307327937165, + "learning_rate": 3.071126804853449e-05, + "loss": 2.2892, + "step": 2158 + }, + { + "epoch": 0.6376255168340225, + "grad_norm": 0.22493300963888505, + "learning_rate": 3.066714759487328e-05, + "loss": 2.346, + "step": 2159 + }, + { + "epoch": 0.6379208505611341, + "grad_norm": 0.2077394464626211, + "learning_rate": 3.062304483371803e-05, + "loss": 2.2608, + "step": 2160 + }, + { + "epoch": 0.6382161842882457, + "grad_norm": 0.20737263585376178, + "learning_rate": 3.0578959805429495e-05, + "loss": 2.3895, + "step": 2161 + }, + { + "epoch": 0.6385115180153573, + "grad_norm": 0.21942198094060708, + "learning_rate": 3.053489255035221e-05, + "loss": 2.3275, + "step": 2162 + }, + { + "epoch": 0.638806851742469, + "grad_norm": 0.2110303480173383, + "learning_rate": 3.0490843108814393e-05, + "loss": 2.3096, + "step": 2163 + }, + { + "epoch": 0.6391021854695806, + "grad_norm": 0.20484876977166117, + "learning_rate": 3.0446811521128017e-05, + "loss": 2.2798, + "step": 2164 + }, + { + "epoch": 0.6393975191966923, + "grad_norm": 0.21318848796963363, + "learning_rate": 3.040279782758869e-05, + "loss": 2.3186, + "step": 2165 + }, + { + "epoch": 0.6396928529238038, + "grad_norm": 0.38238871725817103, + "learning_rate": 3.0358802068475655e-05, + "loss": 2.265, + "step": 2166 + }, + { + "epoch": 0.6399881866509155, + "grad_norm": 0.20931191490389997, + "learning_rate": 3.0314824284051725e-05, + "loss": 2.2954, + "step": 2167 + }, + { + "epoch": 0.6402835203780272, + "grad_norm": 0.20673230017946487, + "learning_rate": 3.027086451456328e-05, + "loss": 2.3394, + "step": 2168 + }, + { + "epoch": 0.6405788541051388, + "grad_norm": 0.2050150456373461, + "learning_rate": 3.022692280024021e-05, + "loss": 2.2838, + "step": 2169 + }, + { + "epoch": 0.6408741878322505, + "grad_norm": 0.20222726842039712, + "learning_rate": 3.0182999181295872e-05, + "loss": 2.3053, + "step": 2170 + }, + { + "epoch": 0.6411695215593621, + "grad_norm": 0.21384196372194073, + "learning_rate": 3.013909369792708e-05, + "loss": 2.3129, + "step": 2171 + }, + { + "epoch": 0.6414648552864737, + "grad_norm": 0.22824065464333534, + "learning_rate": 3.0095206390314035e-05, + "loss": 2.4134, + "step": 2172 + }, + { + "epoch": 0.6417601890135853, + "grad_norm": 0.22303314266922405, + "learning_rate": 3.005133729862032e-05, + "loss": 2.2793, + "step": 2173 + }, + { + "epoch": 0.642055522740697, + "grad_norm": 0.3712365528201601, + "learning_rate": 3.000748646299283e-05, + "loss": 2.4325, + "step": 2174 + }, + { + "epoch": 0.6423508564678087, + "grad_norm": 0.20905429347856588, + "learning_rate": 2.9963653923561765e-05, + "loss": 2.2591, + "step": 2175 + }, + { + "epoch": 0.6426461901949203, + "grad_norm": 0.22436222078543067, + "learning_rate": 2.9919839720440568e-05, + "loss": 2.2003, + "step": 2176 + }, + { + "epoch": 0.6429415239220319, + "grad_norm": 0.20521852138001762, + "learning_rate": 2.987604389372594e-05, + "loss": 2.2948, + "step": 2177 + }, + { + "epoch": 0.6432368576491435, + "grad_norm": 0.204807711747545, + "learning_rate": 2.9832266483497705e-05, + "loss": 2.3005, + "step": 2178 + }, + { + "epoch": 0.6435321913762552, + "grad_norm": 0.2181519012340126, + "learning_rate": 2.9788507529818875e-05, + "loss": 2.2975, + "step": 2179 + }, + { + "epoch": 0.6438275251033668, + "grad_norm": 0.21525077411188984, + "learning_rate": 2.974476707273557e-05, + "loss": 2.3823, + "step": 2180 + }, + { + "epoch": 0.6441228588304785, + "grad_norm": 0.21068164532079542, + "learning_rate": 2.9701045152276956e-05, + "loss": 2.3412, + "step": 2181 + }, + { + "epoch": 0.64441819255759, + "grad_norm": 0.21265341290859455, + "learning_rate": 2.9657341808455262e-05, + "loss": 2.2766, + "step": 2182 + }, + { + "epoch": 0.6447135262847017, + "grad_norm": 0.21520731248702712, + "learning_rate": 2.9613657081265716e-05, + "loss": 2.2302, + "step": 2183 + }, + { + "epoch": 0.6450088600118133, + "grad_norm": 0.2177495216272401, + "learning_rate": 2.9569991010686488e-05, + "loss": 2.352, + "step": 2184 + }, + { + "epoch": 0.645304193738925, + "grad_norm": 0.22254261664186792, + "learning_rate": 2.9526343636678688e-05, + "loss": 2.3132, + "step": 2185 + }, + { + "epoch": 0.6455995274660367, + "grad_norm": 0.20914383624649927, + "learning_rate": 2.9482714999186322e-05, + "loss": 2.3413, + "step": 2186 + }, + { + "epoch": 0.6458948611931482, + "grad_norm": 0.20134026229918803, + "learning_rate": 2.943910513813623e-05, + "loss": 2.2664, + "step": 2187 + }, + { + "epoch": 0.6461901949202599, + "grad_norm": 0.20446913989165602, + "learning_rate": 2.9395514093438093e-05, + "loss": 2.3349, + "step": 2188 + }, + { + "epoch": 0.6464855286473715, + "grad_norm": 0.20473469634148417, + "learning_rate": 2.9351941904984338e-05, + "loss": 2.3417, + "step": 2189 + }, + { + "epoch": 0.6467808623744832, + "grad_norm": 0.2230296767998299, + "learning_rate": 2.9308388612650185e-05, + "loss": 2.1796, + "step": 2190 + }, + { + "epoch": 0.6470761961015948, + "grad_norm": 0.2047511900755423, + "learning_rate": 2.9264854256293524e-05, + "loss": 2.3202, + "step": 2191 + }, + { + "epoch": 0.6473715298287065, + "grad_norm": 0.20435384280675312, + "learning_rate": 2.9221338875754935e-05, + "loss": 2.2798, + "step": 2192 + }, + { + "epoch": 0.647666863555818, + "grad_norm": 0.20922444716852395, + "learning_rate": 2.9177842510857583e-05, + "loss": 2.2643, + "step": 2193 + }, + { + "epoch": 0.6479621972829297, + "grad_norm": 0.2113404839757015, + "learning_rate": 2.913436520140731e-05, + "loss": 2.3306, + "step": 2194 + }, + { + "epoch": 0.6482575310100414, + "grad_norm": 0.20452023872127661, + "learning_rate": 2.9090906987192457e-05, + "loss": 2.2291, + "step": 2195 + }, + { + "epoch": 0.648552864737153, + "grad_norm": 0.2307764924397734, + "learning_rate": 2.9047467907983926e-05, + "loss": 2.4213, + "step": 2196 + }, + { + "epoch": 0.6488481984642647, + "grad_norm": 0.21261271439626217, + "learning_rate": 2.900404800353509e-05, + "loss": 2.3421, + "step": 2197 + }, + { + "epoch": 0.6491435321913762, + "grad_norm": 0.21581095679532614, + "learning_rate": 2.8960647313581772e-05, + "loss": 2.3026, + "step": 2198 + }, + { + "epoch": 0.6494388659184879, + "grad_norm": 0.2620468787148432, + "learning_rate": 2.8917265877842225e-05, + "loss": 2.2308, + "step": 2199 + }, + { + "epoch": 0.6497341996455995, + "grad_norm": 0.21614318964770038, + "learning_rate": 2.8873903736017072e-05, + "loss": 2.34, + "step": 2200 + }, + { + "epoch": 0.6500295333727112, + "grad_norm": 0.21690761899385574, + "learning_rate": 2.8830560927789274e-05, + "loss": 2.3472, + "step": 2201 + }, + { + "epoch": 0.6503248670998228, + "grad_norm": 0.20957450791396395, + "learning_rate": 2.8787237492824103e-05, + "loss": 2.2994, + "step": 2202 + }, + { + "epoch": 0.6506202008269344, + "grad_norm": 0.20883511177303032, + "learning_rate": 2.8743933470769123e-05, + "loss": 2.241, + "step": 2203 + }, + { + "epoch": 0.650915534554046, + "grad_norm": 0.21531212643340417, + "learning_rate": 2.8700648901254097e-05, + "loss": 2.236, + "step": 2204 + }, + { + "epoch": 0.6512108682811577, + "grad_norm": 0.20469186829135366, + "learning_rate": 2.8657383823891006e-05, + "loss": 2.2834, + "step": 2205 + }, + { + "epoch": 0.6515062020082694, + "grad_norm": 0.22262605663168755, + "learning_rate": 2.8614138278274004e-05, + "loss": 2.22, + "step": 2206 + }, + { + "epoch": 0.651801535735381, + "grad_norm": 0.2004706745248589, + "learning_rate": 2.8570912303979326e-05, + "loss": 2.3498, + "step": 2207 + }, + { + "epoch": 0.6520968694624926, + "grad_norm": 0.20706908761946538, + "learning_rate": 2.852770594056533e-05, + "loss": 2.2888, + "step": 2208 + }, + { + "epoch": 0.6523922031896042, + "grad_norm": 0.20737965076800566, + "learning_rate": 2.8484519227572437e-05, + "loss": 2.3377, + "step": 2209 + }, + { + "epoch": 0.6526875369167159, + "grad_norm": 0.2079780808095994, + "learning_rate": 2.8441352204523065e-05, + "loss": 2.359, + "step": 2210 + }, + { + "epoch": 0.6529828706438275, + "grad_norm": 0.2173075975123472, + "learning_rate": 2.839820491092161e-05, + "loss": 2.3327, + "step": 2211 + }, + { + "epoch": 0.6532782043709392, + "grad_norm": 0.2058238956199615, + "learning_rate": 2.8355077386254437e-05, + "loss": 2.2888, + "step": 2212 + }, + { + "epoch": 0.6535735380980507, + "grad_norm": 0.20725832488097928, + "learning_rate": 2.831196966998979e-05, + "loss": 2.3788, + "step": 2213 + }, + { + "epoch": 0.6538688718251624, + "grad_norm": 0.217567029563942, + "learning_rate": 2.8268881801577806e-05, + "loss": 2.2743, + "step": 2214 + }, + { + "epoch": 0.6541642055522741, + "grad_norm": 0.20731088359134756, + "learning_rate": 2.8225813820450443e-05, + "loss": 2.2494, + "step": 2215 + }, + { + "epoch": 0.6544595392793857, + "grad_norm": 0.20862178899786413, + "learning_rate": 2.8182765766021474e-05, + "loss": 2.2679, + "step": 2216 + }, + { + "epoch": 0.6547548730064974, + "grad_norm": 0.3153797972348677, + "learning_rate": 2.813973767768642e-05, + "loss": 2.4669, + "step": 2217 + }, + { + "epoch": 0.655050206733609, + "grad_norm": 0.21529773949465394, + "learning_rate": 2.8096729594822552e-05, + "loss": 2.3396, + "step": 2218 + }, + { + "epoch": 0.6553455404607206, + "grad_norm": 0.2099417744252778, + "learning_rate": 2.8053741556788805e-05, + "loss": 2.3832, + "step": 2219 + }, + { + "epoch": 0.6556408741878322, + "grad_norm": 0.24194697772880047, + "learning_rate": 2.8010773602925793e-05, + "loss": 2.4004, + "step": 2220 + }, + { + "epoch": 0.6559362079149439, + "grad_norm": 0.22219069647602283, + "learning_rate": 2.7967825772555737e-05, + "loss": 2.4259, + "step": 2221 + }, + { + "epoch": 0.6562315416420555, + "grad_norm": 0.20761301066951177, + "learning_rate": 2.7924898104982455e-05, + "loss": 2.275, + "step": 2222 + }, + { + "epoch": 0.6565268753691672, + "grad_norm": 0.22172592957951578, + "learning_rate": 2.7881990639491296e-05, + "loss": 2.3404, + "step": 2223 + }, + { + "epoch": 0.6568222090962788, + "grad_norm": 0.20198975199549948, + "learning_rate": 2.783910341534913e-05, + "loss": 2.3058, + "step": 2224 + }, + { + "epoch": 0.6571175428233904, + "grad_norm": 0.2052609582332606, + "learning_rate": 2.77962364718043e-05, + "loss": 2.2663, + "step": 2225 + }, + { + "epoch": 0.6574128765505021, + "grad_norm": 0.20490688258950895, + "learning_rate": 2.7753389848086596e-05, + "loss": 2.2665, + "step": 2226 + }, + { + "epoch": 0.6577082102776137, + "grad_norm": 0.2097605791674914, + "learning_rate": 2.771056358340721e-05, + "loss": 2.3167, + "step": 2227 + }, + { + "epoch": 0.6580035440047254, + "grad_norm": 0.20582987738679748, + "learning_rate": 2.7667757716958697e-05, + "loss": 2.3357, + "step": 2228 + }, + { + "epoch": 0.6582988777318369, + "grad_norm": 0.20895515665458525, + "learning_rate": 2.7624972287914953e-05, + "loss": 2.2871, + "step": 2229 + }, + { + "epoch": 0.6585942114589486, + "grad_norm": 0.2014367757559932, + "learning_rate": 2.7582207335431153e-05, + "loss": 2.316, + "step": 2230 + }, + { + "epoch": 0.6588895451860602, + "grad_norm": 0.2972614132636687, + "learning_rate": 2.7539462898643755e-05, + "loss": 2.2026, + "step": 2231 + }, + { + "epoch": 0.6591848789131719, + "grad_norm": 0.2064215120366213, + "learning_rate": 2.7496739016670432e-05, + "loss": 2.2783, + "step": 2232 + }, + { + "epoch": 0.6594802126402836, + "grad_norm": 0.20797453421581708, + "learning_rate": 2.745403572861004e-05, + "loss": 2.3586, + "step": 2233 + }, + { + "epoch": 0.6597755463673951, + "grad_norm": 0.20740451093504725, + "learning_rate": 2.74113530735426e-05, + "loss": 2.2836, + "step": 2234 + }, + { + "epoch": 0.6600708800945068, + "grad_norm": 0.20823118367887208, + "learning_rate": 2.736869109052925e-05, + "loss": 2.4011, + "step": 2235 + }, + { + "epoch": 0.6603662138216184, + "grad_norm": 0.21333105567713392, + "learning_rate": 2.732604981861222e-05, + "loss": 2.3528, + "step": 2236 + }, + { + "epoch": 0.6606615475487301, + "grad_norm": 0.20216352986926273, + "learning_rate": 2.7283429296814727e-05, + "loss": 2.323, + "step": 2237 + }, + { + "epoch": 0.6609568812758417, + "grad_norm": 0.2067490940212525, + "learning_rate": 2.7240829564141068e-05, + "loss": 2.261, + "step": 2238 + }, + { + "epoch": 0.6612522150029534, + "grad_norm": 0.20525055856721636, + "learning_rate": 2.7198250659576496e-05, + "loss": 2.3344, + "step": 2239 + }, + { + "epoch": 0.6615475487300649, + "grad_norm": 0.22345197437327027, + "learning_rate": 2.7155692622087186e-05, + "loss": 2.2924, + "step": 2240 + }, + { + "epoch": 0.6618428824571766, + "grad_norm": 0.20106177418879878, + "learning_rate": 2.7113155490620233e-05, + "loss": 2.313, + "step": 2241 + }, + { + "epoch": 0.6621382161842883, + "grad_norm": 0.20530313228040045, + "learning_rate": 2.7070639304103596e-05, + "loss": 2.3271, + "step": 2242 + }, + { + "epoch": 0.6624335499113999, + "grad_norm": 0.2114818665997233, + "learning_rate": 2.7028144101446063e-05, + "loss": 2.3993, + "step": 2243 + }, + { + "epoch": 0.6627288836385116, + "grad_norm": 0.20531129751042004, + "learning_rate": 2.6985669921537217e-05, + "loss": 2.3125, + "step": 2244 + }, + { + "epoch": 0.6630242173656231, + "grad_norm": 0.2073063102818227, + "learning_rate": 2.6943216803247402e-05, + "loss": 2.3325, + "step": 2245 + }, + { + "epoch": 0.6633195510927348, + "grad_norm": 0.21365156582792627, + "learning_rate": 2.690078478542768e-05, + "loss": 2.3114, + "step": 2246 + }, + { + "epoch": 0.6636148848198464, + "grad_norm": 0.5454150466904601, + "learning_rate": 2.685837390690985e-05, + "loss": 2.3542, + "step": 2247 + }, + { + "epoch": 0.6639102185469581, + "grad_norm": 0.24669271572138127, + "learning_rate": 2.6815984206506306e-05, + "loss": 2.2829, + "step": 2248 + }, + { + "epoch": 0.6642055522740697, + "grad_norm": 0.2101283625708673, + "learning_rate": 2.677361572301008e-05, + "loss": 2.3641, + "step": 2249 + }, + { + "epoch": 0.6645008860011813, + "grad_norm": 0.20993933455644312, + "learning_rate": 2.673126849519483e-05, + "loss": 2.3198, + "step": 2250 + }, + { + "epoch": 0.664796219728293, + "grad_norm": 0.207896350493608, + "learning_rate": 2.6688942561814656e-05, + "loss": 2.2096, + "step": 2251 + }, + { + "epoch": 0.6650915534554046, + "grad_norm": 0.21564600019884905, + "learning_rate": 2.664663796160427e-05, + "loss": 2.395, + "step": 2252 + }, + { + "epoch": 0.6653868871825163, + "grad_norm": 0.21287239941617356, + "learning_rate": 2.6604354733278824e-05, + "loss": 2.3324, + "step": 2253 + }, + { + "epoch": 0.6656822209096279, + "grad_norm": 0.20402628591887367, + "learning_rate": 2.656209291553392e-05, + "loss": 2.2015, + "step": 2254 + }, + { + "epoch": 0.6659775546367395, + "grad_norm": 0.20758866897467954, + "learning_rate": 2.6519852547045565e-05, + "loss": 2.2803, + "step": 2255 + }, + { + "epoch": 0.6662728883638511, + "grad_norm": 0.20796736919537956, + "learning_rate": 2.6477633666470126e-05, + "loss": 2.3405, + "step": 2256 + }, + { + "epoch": 0.6665682220909628, + "grad_norm": 0.2191899252665723, + "learning_rate": 2.6435436312444308e-05, + "loss": 2.2821, + "step": 2257 + }, + { + "epoch": 0.6668635558180744, + "grad_norm": 0.3480140580655114, + "learning_rate": 2.639326052358513e-05, + "loss": 2.4016, + "step": 2258 + }, + { + "epoch": 0.6671588895451861, + "grad_norm": 0.21987770816030844, + "learning_rate": 2.6351106338489852e-05, + "loss": 2.3673, + "step": 2259 + }, + { + "epoch": 0.6674542232722976, + "grad_norm": 0.21605497867479775, + "learning_rate": 2.6308973795735993e-05, + "loss": 2.3582, + "step": 2260 + }, + { + "epoch": 0.6677495569994093, + "grad_norm": 0.1986632456013554, + "learning_rate": 2.626686293388124e-05, + "loss": 2.3456, + "step": 2261 + }, + { + "epoch": 0.668044890726521, + "grad_norm": 0.2109149285002063, + "learning_rate": 2.6224773791463452e-05, + "loss": 2.2603, + "step": 2262 + }, + { + "epoch": 0.6683402244536326, + "grad_norm": 0.20471081309741865, + "learning_rate": 2.618270640700061e-05, + "loss": 2.3598, + "step": 2263 + }, + { + "epoch": 0.6686355581807443, + "grad_norm": 0.2088213578914136, + "learning_rate": 2.6140660818990782e-05, + "loss": 2.356, + "step": 2264 + }, + { + "epoch": 0.6689308919078559, + "grad_norm": 0.20844414664656222, + "learning_rate": 2.6098637065912095e-05, + "loss": 2.3325, + "step": 2265 + }, + { + "epoch": 0.6692262256349675, + "grad_norm": 0.20841873127337837, + "learning_rate": 2.6056635186222666e-05, + "loss": 2.3046, + "step": 2266 + }, + { + "epoch": 0.6695215593620791, + "grad_norm": 0.20596179404976123, + "learning_rate": 2.601465521836062e-05, + "loss": 2.1947, + "step": 2267 + }, + { + "epoch": 0.6698168930891908, + "grad_norm": 0.21305645599755893, + "learning_rate": 2.5972697200744035e-05, + "loss": 2.3188, + "step": 2268 + }, + { + "epoch": 0.6701122268163024, + "grad_norm": 0.21100037132305086, + "learning_rate": 2.593076117177086e-05, + "loss": 2.3395, + "step": 2269 + }, + { + "epoch": 0.6704075605434141, + "grad_norm": 0.21216589383671028, + "learning_rate": 2.5888847169819003e-05, + "loss": 2.3555, + "step": 2270 + }, + { + "epoch": 0.6707028942705257, + "grad_norm": 0.2063889165699722, + "learning_rate": 2.584695523324614e-05, + "loss": 2.3109, + "step": 2271 + }, + { + "epoch": 0.6709982279976373, + "grad_norm": 0.21011015446770018, + "learning_rate": 2.580508540038977e-05, + "loss": 2.2092, + "step": 2272 + }, + { + "epoch": 0.671293561724749, + "grad_norm": 0.21348626689718936, + "learning_rate": 2.576323770956718e-05, + "loss": 2.2858, + "step": 2273 + }, + { + "epoch": 0.6715888954518606, + "grad_norm": 0.1990871555803215, + "learning_rate": 2.5721412199075372e-05, + "loss": 2.2429, + "step": 2274 + }, + { + "epoch": 0.6718842291789723, + "grad_norm": 0.2078506341521679, + "learning_rate": 2.5679608907191076e-05, + "loss": 2.2454, + "step": 2275 + }, + { + "epoch": 0.6721795629060838, + "grad_norm": 0.20548116951020842, + "learning_rate": 2.563782787217066e-05, + "loss": 2.2992, + "step": 2276 + }, + { + "epoch": 0.6724748966331955, + "grad_norm": 0.20718465375887127, + "learning_rate": 2.5596069132250157e-05, + "loss": 2.2627, + "step": 2277 + }, + { + "epoch": 0.6727702303603071, + "grad_norm": 0.20577084847406718, + "learning_rate": 2.555433272564515e-05, + "loss": 2.3378, + "step": 2278 + }, + { + "epoch": 0.6730655640874188, + "grad_norm": 0.20966939867915255, + "learning_rate": 2.5512618690550837e-05, + "loss": 2.3553, + "step": 2279 + }, + { + "epoch": 0.6733608978145305, + "grad_norm": 0.20688977896058805, + "learning_rate": 2.547092706514192e-05, + "loss": 2.3667, + "step": 2280 + }, + { + "epoch": 0.673656231541642, + "grad_norm": 0.2083782068244937, + "learning_rate": 2.5429257887572554e-05, + "loss": 2.2603, + "step": 2281 + }, + { + "epoch": 0.6739515652687537, + "grad_norm": 0.21297349104498417, + "learning_rate": 2.5387611195976415e-05, + "loss": 2.3275, + "step": 2282 + }, + { + "epoch": 0.6742468989958653, + "grad_norm": 0.2116444890319756, + "learning_rate": 2.534598702846656e-05, + "loss": 2.3417, + "step": 2283 + }, + { + "epoch": 0.674542232722977, + "grad_norm": 0.20434583985245483, + "learning_rate": 2.530438542313545e-05, + "loss": 2.3689, + "step": 2284 + }, + { + "epoch": 0.6748375664500886, + "grad_norm": 0.2181651608021957, + "learning_rate": 2.5262806418054898e-05, + "loss": 2.3103, + "step": 2285 + }, + { + "epoch": 0.6751329001772003, + "grad_norm": 0.20171650060142657, + "learning_rate": 2.5221250051276012e-05, + "loss": 2.3225, + "step": 2286 + }, + { + "epoch": 0.6754282339043118, + "grad_norm": 0.20675720214753662, + "learning_rate": 2.5179716360829225e-05, + "loss": 2.2958, + "step": 2287 + }, + { + "epoch": 0.6757235676314235, + "grad_norm": 0.2088919907769986, + "learning_rate": 2.5138205384724177e-05, + "loss": 2.3008, + "step": 2288 + }, + { + "epoch": 0.6760189013585352, + "grad_norm": 0.20584004870617273, + "learning_rate": 2.509671716094974e-05, + "loss": 2.3648, + "step": 2289 + }, + { + "epoch": 0.6763142350856468, + "grad_norm": 0.2084423773222097, + "learning_rate": 2.5055251727473962e-05, + "loss": 2.3239, + "step": 2290 + }, + { + "epoch": 0.6766095688127585, + "grad_norm": 0.21145805834923012, + "learning_rate": 2.5013809122244024e-05, + "loss": 2.3136, + "step": 2291 + }, + { + "epoch": 0.67690490253987, + "grad_norm": 0.2058552504529845, + "learning_rate": 2.4972389383186245e-05, + "loss": 2.3769, + "step": 2292 + }, + { + "epoch": 0.6772002362669817, + "grad_norm": 0.21096603765652572, + "learning_rate": 2.4930992548205985e-05, + "loss": 2.2808, + "step": 2293 + }, + { + "epoch": 0.6774955699940933, + "grad_norm": 0.2011941693739019, + "learning_rate": 2.488961865518765e-05, + "loss": 2.2412, + "step": 2294 + }, + { + "epoch": 0.677790903721205, + "grad_norm": 0.22183694177270263, + "learning_rate": 2.484826774199467e-05, + "loss": 2.379, + "step": 2295 + }, + { + "epoch": 0.6780862374483166, + "grad_norm": 0.20284731848185286, + "learning_rate": 2.4806939846469424e-05, + "loss": 2.2986, + "step": 2296 + }, + { + "epoch": 0.6783815711754282, + "grad_norm": 0.19749194089526373, + "learning_rate": 2.4765635006433237e-05, + "loss": 2.2702, + "step": 2297 + }, + { + "epoch": 0.6786769049025398, + "grad_norm": 0.20093094184697843, + "learning_rate": 2.4724353259686334e-05, + "loss": 2.3261, + "step": 2298 + }, + { + "epoch": 0.6789722386296515, + "grad_norm": 0.20565554754267365, + "learning_rate": 2.4683094644007797e-05, + "loss": 2.2768, + "step": 2299 + }, + { + "epoch": 0.6792675723567632, + "grad_norm": 0.20536758340038522, + "learning_rate": 2.4641859197155555e-05, + "loss": 2.3191, + "step": 2300 + }, + { + "epoch": 0.6795629060838748, + "grad_norm": 0.2106683744996848, + "learning_rate": 2.4600646956866312e-05, + "loss": 2.2624, + "step": 2301 + }, + { + "epoch": 0.6798582398109864, + "grad_norm": 0.426827483960569, + "learning_rate": 2.4559457960855564e-05, + "loss": 2.3685, + "step": 2302 + }, + { + "epoch": 0.680153573538098, + "grad_norm": 0.1997610452818496, + "learning_rate": 2.4518292246817513e-05, + "loss": 2.2895, + "step": 2303 + }, + { + "epoch": 0.6804489072652097, + "grad_norm": 0.3044332274683582, + "learning_rate": 2.4477149852425058e-05, + "loss": 2.1799, + "step": 2304 + }, + { + "epoch": 0.6807442409923213, + "grad_norm": 0.21644994969298, + "learning_rate": 2.4436030815329758e-05, + "loss": 2.3393, + "step": 2305 + }, + { + "epoch": 0.681039574719433, + "grad_norm": 0.21455651936695147, + "learning_rate": 2.4394935173161803e-05, + "loss": 2.3867, + "step": 2306 + }, + { + "epoch": 0.6813349084465445, + "grad_norm": 0.22907209702823125, + "learning_rate": 2.4353862963529966e-05, + "loss": 2.3343, + "step": 2307 + }, + { + "epoch": 0.6816302421736562, + "grad_norm": 0.20067406142119742, + "learning_rate": 2.4312814224021573e-05, + "loss": 2.295, + "step": 2308 + }, + { + "epoch": 0.6819255759007679, + "grad_norm": 0.20441517623111014, + "learning_rate": 2.4271788992202503e-05, + "loss": 2.3045, + "step": 2309 + }, + { + "epoch": 0.6822209096278795, + "grad_norm": 0.20761159313340777, + "learning_rate": 2.4230787305617053e-05, + "loss": 2.3197, + "step": 2310 + }, + { + "epoch": 0.6825162433549912, + "grad_norm": 0.2753667212318436, + "learning_rate": 2.4189809201788027e-05, + "loss": 2.4104, + "step": 2311 + }, + { + "epoch": 0.6828115770821028, + "grad_norm": 0.2117174440604047, + "learning_rate": 2.4148854718216647e-05, + "loss": 2.3313, + "step": 2312 + }, + { + "epoch": 0.6831069108092144, + "grad_norm": 0.20505439984369037, + "learning_rate": 2.4107923892382495e-05, + "loss": 2.3295, + "step": 2313 + }, + { + "epoch": 0.683402244536326, + "grad_norm": 0.20564328384854388, + "learning_rate": 2.4067016761743515e-05, + "loss": 2.3897, + "step": 2314 + }, + { + "epoch": 0.6836975782634377, + "grad_norm": 0.21073047757428479, + "learning_rate": 2.4026133363735975e-05, + "loss": 2.3706, + "step": 2315 + }, + { + "epoch": 0.6839929119905493, + "grad_norm": 0.23032001771652783, + "learning_rate": 2.3985273735774406e-05, + "loss": 2.3986, + "step": 2316 + }, + { + "epoch": 0.684288245717661, + "grad_norm": 0.2060818405701464, + "learning_rate": 2.3944437915251594e-05, + "loss": 2.2472, + "step": 2317 + }, + { + "epoch": 0.6845835794447725, + "grad_norm": 0.22528119165983596, + "learning_rate": 2.3903625939538538e-05, + "loss": 2.4183, + "step": 2318 + }, + { + "epoch": 0.6848789131718842, + "grad_norm": 1.501103789795375, + "learning_rate": 2.3862837845984414e-05, + "loss": 2.4114, + "step": 2319 + }, + { + "epoch": 0.6851742468989959, + "grad_norm": 0.21177924829668177, + "learning_rate": 2.382207367191655e-05, + "loss": 2.2495, + "step": 2320 + }, + { + "epoch": 0.6854695806261075, + "grad_norm": 0.2233433912068046, + "learning_rate": 2.378133345464037e-05, + "loss": 2.3177, + "step": 2321 + }, + { + "epoch": 0.6857649143532192, + "grad_norm": 0.21031579481267576, + "learning_rate": 2.3740617231439377e-05, + "loss": 2.3547, + "step": 2322 + }, + { + "epoch": 0.6860602480803307, + "grad_norm": 0.20416733261570696, + "learning_rate": 2.3699925039575133e-05, + "loss": 2.2243, + "step": 2323 + }, + { + "epoch": 0.6863555818074424, + "grad_norm": 0.1999252172073705, + "learning_rate": 2.3659256916287183e-05, + "loss": 2.26, + "step": 2324 + }, + { + "epoch": 0.686650915534554, + "grad_norm": 0.2037289608729619, + "learning_rate": 2.3618612898793065e-05, + "loss": 2.2755, + "step": 2325 + }, + { + "epoch": 0.6869462492616657, + "grad_norm": 0.20817024980549298, + "learning_rate": 2.3577993024288237e-05, + "loss": 2.3121, + "step": 2326 + }, + { + "epoch": 0.6872415829887774, + "grad_norm": 0.2079542834397052, + "learning_rate": 2.3537397329946087e-05, + "loss": 2.363, + "step": 2327 + }, + { + "epoch": 0.6875369167158889, + "grad_norm": 0.20840634417706924, + "learning_rate": 2.3496825852917854e-05, + "loss": 2.3343, + "step": 2328 + }, + { + "epoch": 0.6878322504430006, + "grad_norm": 0.20715524451149456, + "learning_rate": 2.3456278630332623e-05, + "loss": 2.3755, + "step": 2329 + }, + { + "epoch": 0.6881275841701122, + "grad_norm": 0.20317231140817407, + "learning_rate": 2.3415755699297286e-05, + "loss": 2.3262, + "step": 2330 + }, + { + "epoch": 0.6884229178972239, + "grad_norm": 0.20524740044914197, + "learning_rate": 2.3375257096896492e-05, + "loss": 2.3497, + "step": 2331 + }, + { + "epoch": 0.6887182516243355, + "grad_norm": 0.199997175837485, + "learning_rate": 2.3334782860192637e-05, + "loss": 2.281, + "step": 2332 + }, + { + "epoch": 0.6890135853514472, + "grad_norm": 0.20251658911807002, + "learning_rate": 2.3294333026225822e-05, + "loss": 2.3581, + "step": 2333 + }, + { + "epoch": 0.6893089190785587, + "grad_norm": 0.20327876763143932, + "learning_rate": 2.32539076320138e-05, + "loss": 2.2661, + "step": 2334 + }, + { + "epoch": 0.6896042528056704, + "grad_norm": 0.21509041064861265, + "learning_rate": 2.321350671455197e-05, + "loss": 2.17, + "step": 2335 + }, + { + "epoch": 0.689899586532782, + "grad_norm": 0.20154813190378512, + "learning_rate": 2.3173130310813325e-05, + "loss": 2.3747, + "step": 2336 + }, + { + "epoch": 0.6901949202598937, + "grad_norm": 1.378693820918992, + "learning_rate": 2.313277845774843e-05, + "loss": 2.2918, + "step": 2337 + }, + { + "epoch": 0.6904902539870054, + "grad_norm": 0.20664786883170627, + "learning_rate": 2.3092451192285395e-05, + "loss": 2.2949, + "step": 2338 + }, + { + "epoch": 0.6907855877141169, + "grad_norm": 0.20422817482831174, + "learning_rate": 2.305214855132979e-05, + "loss": 2.3029, + "step": 2339 + }, + { + "epoch": 0.6910809214412286, + "grad_norm": 0.20300187450692112, + "learning_rate": 2.301187057176467e-05, + "loss": 2.2589, + "step": 2340 + }, + { + "epoch": 0.6913762551683402, + "grad_norm": 0.21167108344053046, + "learning_rate": 2.2971617290450537e-05, + "loss": 2.271, + "step": 2341 + }, + { + "epoch": 0.6916715888954519, + "grad_norm": 0.21596165167285386, + "learning_rate": 2.2931388744225278e-05, + "loss": 2.4431, + "step": 2342 + }, + { + "epoch": 0.6919669226225635, + "grad_norm": 0.2121653670845275, + "learning_rate": 2.2891184969904134e-05, + "loss": 2.348, + "step": 2343 + }, + { + "epoch": 0.6922622563496751, + "grad_norm": 0.20334224820787916, + "learning_rate": 2.2851006004279697e-05, + "loss": 2.3458, + "step": 2344 + }, + { + "epoch": 0.6925575900767867, + "grad_norm": 0.2059227941642918, + "learning_rate": 2.281085188412183e-05, + "loss": 2.2775, + "step": 2345 + }, + { + "epoch": 0.6928529238038984, + "grad_norm": 0.1969943211016523, + "learning_rate": 2.2770722646177684e-05, + "loss": 2.2256, + "step": 2346 + }, + { + "epoch": 0.6931482575310101, + "grad_norm": 0.21956412350928228, + "learning_rate": 2.273061832717162e-05, + "loss": 2.3779, + "step": 2347 + }, + { + "epoch": 0.6934435912581217, + "grad_norm": 0.20701518986146825, + "learning_rate": 2.2690538963805204e-05, + "loss": 2.2561, + "step": 2348 + }, + { + "epoch": 0.6937389249852333, + "grad_norm": 0.20150923912399063, + "learning_rate": 2.265048459275714e-05, + "loss": 2.3274, + "step": 2349 + }, + { + "epoch": 0.6940342587123449, + "grad_norm": 0.20322613765845335, + "learning_rate": 2.2610455250683314e-05, + "loss": 2.2776, + "step": 2350 + }, + { + "epoch": 0.6943295924394566, + "grad_norm": 0.2194696832170163, + "learning_rate": 2.2570450974216665e-05, + "loss": 2.34, + "step": 2351 + }, + { + "epoch": 0.6946249261665682, + "grad_norm": 0.20417786043647618, + "learning_rate": 2.25304717999672e-05, + "loss": 2.3352, + "step": 2352 + }, + { + "epoch": 0.6949202598936799, + "grad_norm": 0.20727938051730774, + "learning_rate": 2.2490517764521968e-05, + "loss": 2.3058, + "step": 2353 + }, + { + "epoch": 0.6952155936207915, + "grad_norm": 0.2067300657028347, + "learning_rate": 2.2450588904444968e-05, + "loss": 2.3294, + "step": 2354 + }, + { + "epoch": 0.6955109273479031, + "grad_norm": 0.20826036943823, + "learning_rate": 2.241068525627721e-05, + "loss": 2.227, + "step": 2355 + }, + { + "epoch": 0.6958062610750148, + "grad_norm": 0.20776657658304756, + "learning_rate": 2.237080685653661e-05, + "loss": 2.3201, + "step": 2356 + }, + { + "epoch": 0.6961015948021264, + "grad_norm": 0.2124828230002198, + "learning_rate": 2.2330953741717975e-05, + "loss": 2.3416, + "step": 2357 + }, + { + "epoch": 0.6963969285292381, + "grad_norm": 0.19517910216518214, + "learning_rate": 2.229112594829299e-05, + "loss": 2.3018, + "step": 2358 + }, + { + "epoch": 0.6966922622563497, + "grad_norm": 0.20945375975271113, + "learning_rate": 2.2251323512710155e-05, + "loss": 2.3683, + "step": 2359 + }, + { + "epoch": 0.6969875959834613, + "grad_norm": 0.20689530677761694, + "learning_rate": 2.221154647139475e-05, + "loss": 2.3096, + "step": 2360 + }, + { + "epoch": 0.6972829297105729, + "grad_norm": 0.2098972933807581, + "learning_rate": 2.217179486074884e-05, + "loss": 2.354, + "step": 2361 + }, + { + "epoch": 0.6975782634376846, + "grad_norm": 0.20650952656871976, + "learning_rate": 2.2132068717151206e-05, + "loss": 2.292, + "step": 2362 + }, + { + "epoch": 0.6978735971647962, + "grad_norm": 0.2005309570944063, + "learning_rate": 2.2092368076957322e-05, + "loss": 2.2858, + "step": 2363 + }, + { + "epoch": 0.6981689308919079, + "grad_norm": 0.20413828250257818, + "learning_rate": 2.205269297649932e-05, + "loss": 2.3032, + "step": 2364 + }, + { + "epoch": 0.6984642646190194, + "grad_norm": 0.21361936729531922, + "learning_rate": 2.201304345208597e-05, + "loss": 2.3776, + "step": 2365 + }, + { + "epoch": 0.6987595983461311, + "grad_norm": 0.2056821103341812, + "learning_rate": 2.1973419540002617e-05, + "loss": 2.3223, + "step": 2366 + }, + { + "epoch": 0.6990549320732428, + "grad_norm": 0.20690992687169202, + "learning_rate": 2.193382127651119e-05, + "loss": 2.3576, + "step": 2367 + }, + { + "epoch": 0.6993502658003544, + "grad_norm": 0.20598427532187558, + "learning_rate": 2.1894248697850146e-05, + "loss": 2.3677, + "step": 2368 + }, + { + "epoch": 0.6996455995274661, + "grad_norm": 0.2338042382562836, + "learning_rate": 2.1854701840234392e-05, + "loss": 2.2614, + "step": 2369 + }, + { + "epoch": 0.6999409332545776, + "grad_norm": 0.20456730939268064, + "learning_rate": 2.1815180739855347e-05, + "loss": 2.3653, + "step": 2370 + }, + { + "epoch": 0.7002362669816893, + "grad_norm": 0.19688013387979894, + "learning_rate": 2.1775685432880837e-05, + "loss": 2.321, + "step": 2371 + }, + { + "epoch": 0.7005316007088009, + "grad_norm": 0.2206043427444098, + "learning_rate": 2.173621595545507e-05, + "loss": 2.2903, + "step": 2372 + }, + { + "epoch": 0.7008269344359126, + "grad_norm": 0.2095996320680308, + "learning_rate": 2.1696772343698667e-05, + "loss": 2.202, + "step": 2373 + }, + { + "epoch": 0.7011222681630243, + "grad_norm": 0.2009370600041486, + "learning_rate": 2.165735463370853e-05, + "loss": 2.1178, + "step": 2374 + }, + { + "epoch": 0.7014176018901358, + "grad_norm": 0.21053473556145003, + "learning_rate": 2.1617962861557866e-05, + "loss": 2.2123, + "step": 2375 + }, + { + "epoch": 0.7017129356172475, + "grad_norm": 0.20975807503125418, + "learning_rate": 2.1578597063296152e-05, + "loss": 2.3614, + "step": 2376 + }, + { + "epoch": 0.7020082693443591, + "grad_norm": 0.2489559530931807, + "learning_rate": 2.1539257274949088e-05, + "loss": 2.2791, + "step": 2377 + }, + { + "epoch": 0.7023036030714708, + "grad_norm": 0.2072889056253499, + "learning_rate": 2.149994353251858e-05, + "loss": 2.4095, + "step": 2378 + }, + { + "epoch": 0.7025989367985824, + "grad_norm": 0.20471700936097012, + "learning_rate": 2.146065587198269e-05, + "loss": 2.369, + "step": 2379 + }, + { + "epoch": 0.7028942705256941, + "grad_norm": 0.2115042631100663, + "learning_rate": 2.1421394329295613e-05, + "loss": 2.3646, + "step": 2380 + }, + { + "epoch": 0.7031896042528056, + "grad_norm": 0.22503948494372109, + "learning_rate": 2.138215894038765e-05, + "loss": 2.2866, + "step": 2381 + }, + { + "epoch": 0.7034849379799173, + "grad_norm": 0.2173428662417206, + "learning_rate": 2.1342949741165168e-05, + "loss": 2.2609, + "step": 2382 + }, + { + "epoch": 0.703780271707029, + "grad_norm": 0.2027426661006942, + "learning_rate": 2.1303766767510534e-05, + "loss": 2.3269, + "step": 2383 + }, + { + "epoch": 0.7040756054341406, + "grad_norm": 0.2060915565952426, + "learning_rate": 2.126461005528215e-05, + "loss": 2.3079, + "step": 2384 + }, + { + "epoch": 0.7043709391612523, + "grad_norm": 0.20687897498296637, + "learning_rate": 2.1225479640314378e-05, + "loss": 2.2408, + "step": 2385 + }, + { + "epoch": 0.7046662728883638, + "grad_norm": 0.1931654410317875, + "learning_rate": 2.1186375558417508e-05, + "loss": 2.2113, + "step": 2386 + }, + { + "epoch": 0.7049616066154755, + "grad_norm": 0.1998664760418134, + "learning_rate": 2.1147297845377745e-05, + "loss": 2.3252, + "step": 2387 + }, + { + "epoch": 0.7052569403425871, + "grad_norm": 0.20693415038181132, + "learning_rate": 2.110824653695713e-05, + "loss": 2.3445, + "step": 2388 + }, + { + "epoch": 0.7055522740696988, + "grad_norm": 0.22620919237631293, + "learning_rate": 2.106922166889358e-05, + "loss": 2.4404, + "step": 2389 + }, + { + "epoch": 0.7058476077968104, + "grad_norm": 0.21170310318057245, + "learning_rate": 2.103022327690079e-05, + "loss": 2.325, + "step": 2390 + }, + { + "epoch": 0.706142941523922, + "grad_norm": 0.20316535292121915, + "learning_rate": 2.0991251396668226e-05, + "loss": 2.3124, + "step": 2391 + }, + { + "epoch": 0.7064382752510336, + "grad_norm": 0.2054893639235686, + "learning_rate": 2.0952306063861106e-05, + "loss": 2.3391, + "step": 2392 + }, + { + "epoch": 0.7067336089781453, + "grad_norm": 0.20676360734249033, + "learning_rate": 2.0913387314120335e-05, + "loss": 2.3004, + "step": 2393 + }, + { + "epoch": 0.707028942705257, + "grad_norm": 0.20686973713464374, + "learning_rate": 2.0874495183062503e-05, + "loss": 2.3862, + "step": 2394 + }, + { + "epoch": 0.7073242764323686, + "grad_norm": 0.24156201253992324, + "learning_rate": 2.0835629706279843e-05, + "loss": 2.3243, + "step": 2395 + }, + { + "epoch": 0.7076196101594802, + "grad_norm": 0.2078478291962534, + "learning_rate": 2.0796790919340182e-05, + "loss": 2.272, + "step": 2396 + }, + { + "epoch": 0.7079149438865918, + "grad_norm": 0.19623625695041236, + "learning_rate": 2.0757978857786932e-05, + "loss": 2.2929, + "step": 2397 + }, + { + "epoch": 0.7082102776137035, + "grad_norm": 0.21595735643570568, + "learning_rate": 2.071919355713904e-05, + "loss": 2.3189, + "step": 2398 + }, + { + "epoch": 0.7085056113408151, + "grad_norm": 0.2086100389785606, + "learning_rate": 2.068043505289096e-05, + "loss": 2.3645, + "step": 2399 + }, + { + "epoch": 0.7088009450679268, + "grad_norm": 0.2159108796114043, + "learning_rate": 2.064170338051265e-05, + "loss": 2.3415, + "step": 2400 + }, + { + "epoch": 0.7090962787950384, + "grad_norm": 0.20535055865963442, + "learning_rate": 2.060299857544947e-05, + "loss": 2.3404, + "step": 2401 + }, + { + "epoch": 0.70939161252215, + "grad_norm": 0.2054830821735356, + "learning_rate": 2.0564320673122233e-05, + "loss": 2.2849, + "step": 2402 + }, + { + "epoch": 0.7096869462492617, + "grad_norm": 0.20930504484275347, + "learning_rate": 2.05256697089271e-05, + "loss": 2.2851, + "step": 2403 + }, + { + "epoch": 0.7099822799763733, + "grad_norm": 0.20313730569390875, + "learning_rate": 2.04870457182356e-05, + "loss": 2.3441, + "step": 2404 + }, + { + "epoch": 0.710277613703485, + "grad_norm": 0.20446545044447662, + "learning_rate": 2.0448448736394564e-05, + "loss": 2.2637, + "step": 2405 + }, + { + "epoch": 0.7105729474305966, + "grad_norm": 0.20306699181270754, + "learning_rate": 2.0409878798726122e-05, + "loss": 2.3608, + "step": 2406 + }, + { + "epoch": 0.7108682811577082, + "grad_norm": 0.2086567762076013, + "learning_rate": 2.0371335940527636e-05, + "loss": 2.3539, + "step": 2407 + }, + { + "epoch": 0.7111636148848198, + "grad_norm": 0.2044398228913661, + "learning_rate": 2.0332820197071696e-05, + "loss": 2.2933, + "step": 2408 + }, + { + "epoch": 0.7114589486119315, + "grad_norm": 0.20943679146858837, + "learning_rate": 2.0294331603606074e-05, + "loss": 2.3213, + "step": 2409 + }, + { + "epoch": 0.7117542823390431, + "grad_norm": 0.20364287273437076, + "learning_rate": 2.025587019535371e-05, + "loss": 2.3018, + "step": 2410 + }, + { + "epoch": 0.7120496160661548, + "grad_norm": 0.2042414537168643, + "learning_rate": 2.0217436007512643e-05, + "loss": 2.2433, + "step": 2411 + }, + { + "epoch": 0.7123449497932663, + "grad_norm": 0.19888290445071116, + "learning_rate": 2.017902907525603e-05, + "loss": 2.3293, + "step": 2412 + }, + { + "epoch": 0.712640283520378, + "grad_norm": 0.19516628514458068, + "learning_rate": 2.014064943373203e-05, + "loss": 2.3285, + "step": 2413 + }, + { + "epoch": 0.7129356172474897, + "grad_norm": 0.22236503831573795, + "learning_rate": 2.0102297118063885e-05, + "loss": 2.343, + "step": 2414 + }, + { + "epoch": 0.7132309509746013, + "grad_norm": 0.22749148022764334, + "learning_rate": 2.006397216334981e-05, + "loss": 2.3709, + "step": 2415 + }, + { + "epoch": 0.713526284701713, + "grad_norm": 0.2161615358340313, + "learning_rate": 2.002567460466297e-05, + "loss": 2.3137, + "step": 2416 + }, + { + "epoch": 0.7138216184288245, + "grad_norm": 0.21074277225445076, + "learning_rate": 1.9987404477051475e-05, + "loss": 2.3525, + "step": 2417 + }, + { + "epoch": 0.7141169521559362, + "grad_norm": 0.19817146204738698, + "learning_rate": 1.994916181553832e-05, + "loss": 2.3155, + "step": 2418 + }, + { + "epoch": 0.7144122858830478, + "grad_norm": 0.21471758519227174, + "learning_rate": 1.9910946655121372e-05, + "loss": 2.4104, + "step": 2419 + }, + { + "epoch": 0.7147076196101595, + "grad_norm": 0.19779455450052702, + "learning_rate": 1.9872759030773325e-05, + "loss": 2.2817, + "step": 2420 + }, + { + "epoch": 0.7150029533372712, + "grad_norm": 0.21605977198815438, + "learning_rate": 1.9834598977441665e-05, + "loss": 2.4002, + "step": 2421 + }, + { + "epoch": 0.7152982870643827, + "grad_norm": 0.1969481453423045, + "learning_rate": 1.9796466530048668e-05, + "loss": 2.1323, + "step": 2422 + }, + { + "epoch": 0.7155936207914944, + "grad_norm": 0.2709334154932119, + "learning_rate": 1.9758361723491332e-05, + "loss": 2.3079, + "step": 2423 + }, + { + "epoch": 0.715888954518606, + "grad_norm": 0.20194773454684653, + "learning_rate": 1.972028459264135e-05, + "loss": 2.2766, + "step": 2424 + }, + { + "epoch": 0.7161842882457177, + "grad_norm": 0.22744244664938218, + "learning_rate": 1.968223517234512e-05, + "loss": 2.3191, + "step": 2425 + }, + { + "epoch": 0.7164796219728293, + "grad_norm": 0.2067001141764229, + "learning_rate": 1.9644213497423642e-05, + "loss": 2.2317, + "step": 2426 + }, + { + "epoch": 0.716774955699941, + "grad_norm": 0.2031625382876526, + "learning_rate": 1.960621960267255e-05, + "loss": 2.3124, + "step": 2427 + }, + { + "epoch": 0.7170702894270525, + "grad_norm": 0.20814314545467816, + "learning_rate": 1.9568253522862046e-05, + "loss": 2.2992, + "step": 2428 + }, + { + "epoch": 0.7173656231541642, + "grad_norm": 0.21064772835587983, + "learning_rate": 1.9530315292736873e-05, + "loss": 2.3283, + "step": 2429 + }, + { + "epoch": 0.7176609568812758, + "grad_norm": 0.2032124074526591, + "learning_rate": 1.94924049470163e-05, + "loss": 2.2403, + "step": 2430 + }, + { + "epoch": 0.7179562906083875, + "grad_norm": 0.21532507645151555, + "learning_rate": 1.945452252039407e-05, + "loss": 2.3499, + "step": 2431 + }, + { + "epoch": 0.7182516243354992, + "grad_norm": 0.20691197036166536, + "learning_rate": 1.9416668047538378e-05, + "loss": 2.2544, + "step": 2432 + }, + { + "epoch": 0.7185469580626107, + "grad_norm": 0.22484971332130962, + "learning_rate": 1.9378841563091827e-05, + "loss": 2.2478, + "step": 2433 + }, + { + "epoch": 0.7188422917897224, + "grad_norm": 0.22611850774332334, + "learning_rate": 1.9341043101671412e-05, + "loss": 2.3256, + "step": 2434 + }, + { + "epoch": 0.719137625516834, + "grad_norm": 0.2166379225597076, + "learning_rate": 1.9303272697868503e-05, + "loss": 2.3152, + "step": 2435 + }, + { + "epoch": 0.7194329592439457, + "grad_norm": 0.20881283038666115, + "learning_rate": 1.9265530386248758e-05, + "loss": 2.2381, + "step": 2436 + }, + { + "epoch": 0.7197282929710573, + "grad_norm": 0.2062440517319174, + "learning_rate": 1.922781620135215e-05, + "loss": 2.3577, + "step": 2437 + }, + { + "epoch": 0.7200236266981689, + "grad_norm": 0.20128601502758695, + "learning_rate": 1.9190130177692906e-05, + "loss": 2.2555, + "step": 2438 + }, + { + "epoch": 0.7203189604252805, + "grad_norm": 0.22978161356375942, + "learning_rate": 1.9152472349759477e-05, + "loss": 2.4257, + "step": 2439 + }, + { + "epoch": 0.7206142941523922, + "grad_norm": 0.20621823875478332, + "learning_rate": 1.9114842752014516e-05, + "loss": 2.321, + "step": 2440 + }, + { + "epoch": 0.7209096278795039, + "grad_norm": 0.22270420461532284, + "learning_rate": 1.9077241418894858e-05, + "loss": 2.3322, + "step": 2441 + }, + { + "epoch": 0.7212049616066155, + "grad_norm": 0.21267737636130882, + "learning_rate": 1.9039668384811414e-05, + "loss": 2.353, + "step": 2442 + }, + { + "epoch": 0.7215002953337271, + "grad_norm": 0.2051051076667579, + "learning_rate": 1.900212368414926e-05, + "loss": 2.2813, + "step": 2443 + }, + { + "epoch": 0.7217956290608387, + "grad_norm": 0.20641506915487512, + "learning_rate": 1.8964607351267505e-05, + "loss": 2.2948, + "step": 2444 + }, + { + "epoch": 0.7220909627879504, + "grad_norm": 0.21468044490132948, + "learning_rate": 1.8927119420499324e-05, + "loss": 2.259, + "step": 2445 + }, + { + "epoch": 0.722386296515062, + "grad_norm": 0.1982470168854971, + "learning_rate": 1.888965992615188e-05, + "loss": 2.337, + "step": 2446 + }, + { + "epoch": 0.7226816302421737, + "grad_norm": 0.2047254254244587, + "learning_rate": 1.8852228902506326e-05, + "loss": 2.3489, + "step": 2447 + }, + { + "epoch": 0.7229769639692853, + "grad_norm": 0.20380824975065584, + "learning_rate": 1.881482638381774e-05, + "loss": 2.3569, + "step": 2448 + }, + { + "epoch": 0.7232722976963969, + "grad_norm": 0.2058783495299905, + "learning_rate": 1.877745240431513e-05, + "loss": 2.2837, + "step": 2449 + }, + { + "epoch": 0.7235676314235086, + "grad_norm": 0.20975388364413278, + "learning_rate": 1.8740106998201396e-05, + "loss": 2.3414, + "step": 2450 + }, + { + "epoch": 0.7238629651506202, + "grad_norm": 0.21500642859750713, + "learning_rate": 1.8702790199653237e-05, + "loss": 2.3182, + "step": 2451 + }, + { + "epoch": 0.7241582988777319, + "grad_norm": 0.2124976634850794, + "learning_rate": 1.8665502042821258e-05, + "loss": 2.351, + "step": 2452 + }, + { + "epoch": 0.7244536326048435, + "grad_norm": 0.19988242098061912, + "learning_rate": 1.8628242561829785e-05, + "loss": 2.25, + "step": 2453 + }, + { + "epoch": 0.7247489663319551, + "grad_norm": 0.2037760497916873, + "learning_rate": 1.8591011790776917e-05, + "loss": 2.366, + "step": 2454 + }, + { + "epoch": 0.7250443000590667, + "grad_norm": 0.2010818391072003, + "learning_rate": 1.8553809763734494e-05, + "loss": 2.285, + "step": 2455 + }, + { + "epoch": 0.7253396337861784, + "grad_norm": 0.21461525817683774, + "learning_rate": 1.8516636514748038e-05, + "loss": 2.3686, + "step": 2456 + }, + { + "epoch": 0.72563496751329, + "grad_norm": 0.2085746867152027, + "learning_rate": 1.847949207783672e-05, + "loss": 2.3614, + "step": 2457 + }, + { + "epoch": 0.7259303012404017, + "grad_norm": 0.20611363512404807, + "learning_rate": 1.844237648699337e-05, + "loss": 2.3062, + "step": 2458 + }, + { + "epoch": 0.7262256349675132, + "grad_norm": 0.2055012289275018, + "learning_rate": 1.8405289776184402e-05, + "loss": 2.3208, + "step": 2459 + }, + { + "epoch": 0.7265209686946249, + "grad_norm": 0.2045507006652784, + "learning_rate": 1.8368231979349827e-05, + "loss": 2.3218, + "step": 2460 + }, + { + "epoch": 0.7268163024217366, + "grad_norm": 0.20670126762481275, + "learning_rate": 1.8331203130403158e-05, + "loss": 2.2433, + "step": 2461 + }, + { + "epoch": 0.7271116361488482, + "grad_norm": 0.21030516683935604, + "learning_rate": 1.8294203263231436e-05, + "loss": 2.345, + "step": 2462 + }, + { + "epoch": 0.7274069698759599, + "grad_norm": 0.22020518414950196, + "learning_rate": 1.825723241169519e-05, + "loss": 2.3222, + "step": 2463 + }, + { + "epoch": 0.7277023036030714, + "grad_norm": 0.20664804910400691, + "learning_rate": 1.822029060962837e-05, + "loss": 2.2902, + "step": 2464 + }, + { + "epoch": 0.7279976373301831, + "grad_norm": 0.20414838299497456, + "learning_rate": 1.8183377890838367e-05, + "loss": 2.2941, + "step": 2465 + }, + { + "epoch": 0.7282929710572947, + "grad_norm": 0.20571008223530085, + "learning_rate": 1.814649428910593e-05, + "loss": 2.386, + "step": 2466 + }, + { + "epoch": 0.7285883047844064, + "grad_norm": 0.19876355169987353, + "learning_rate": 1.8109639838185184e-05, + "loss": 2.2761, + "step": 2467 + }, + { + "epoch": 0.728883638511518, + "grad_norm": 0.2178858394240885, + "learning_rate": 1.8072814571803566e-05, + "loss": 2.3201, + "step": 2468 + }, + { + "epoch": 0.7291789722386296, + "grad_norm": 0.20620484327389646, + "learning_rate": 1.8036018523661803e-05, + "loss": 2.2742, + "step": 2469 + }, + { + "epoch": 0.7294743059657413, + "grad_norm": 0.20997470544430374, + "learning_rate": 1.7999251727433902e-05, + "loss": 2.3781, + "step": 2470 + }, + { + "epoch": 0.7297696396928529, + "grad_norm": 0.19382926411661092, + "learning_rate": 1.7962514216767056e-05, + "loss": 2.3193, + "step": 2471 + }, + { + "epoch": 0.7300649734199646, + "grad_norm": 0.20689019310183532, + "learning_rate": 1.7925806025281704e-05, + "loss": 2.3317, + "step": 2472 + }, + { + "epoch": 0.7303603071470762, + "grad_norm": 0.1972481276050994, + "learning_rate": 1.7889127186571426e-05, + "loss": 2.2075, + "step": 2473 + }, + { + "epoch": 0.7306556408741879, + "grad_norm": 0.20977568327019963, + "learning_rate": 1.7852477734202954e-05, + "loss": 2.3111, + "step": 2474 + }, + { + "epoch": 0.7309509746012994, + "grad_norm": 0.19991679472466656, + "learning_rate": 1.781585770171611e-05, + "loss": 2.3142, + "step": 2475 + }, + { + "epoch": 0.7312463083284111, + "grad_norm": 0.30425369730076585, + "learning_rate": 1.777926712262383e-05, + "loss": 2.4828, + "step": 2476 + }, + { + "epoch": 0.7315416420555227, + "grad_norm": 0.20627176682891868, + "learning_rate": 1.7742706030412056e-05, + "loss": 2.2783, + "step": 2477 + }, + { + "epoch": 0.7318369757826344, + "grad_norm": 0.2027130718468008, + "learning_rate": 1.7706174458539753e-05, + "loss": 2.3569, + "step": 2478 + }, + { + "epoch": 0.7321323095097461, + "grad_norm": 0.2222599216527242, + "learning_rate": 1.7669672440438882e-05, + "loss": 2.2677, + "step": 2479 + }, + { + "epoch": 0.7324276432368576, + "grad_norm": 0.20746478087783818, + "learning_rate": 1.763320000951435e-05, + "loss": 2.324, + "step": 2480 + }, + { + "epoch": 0.7327229769639693, + "grad_norm": 0.20466429418684767, + "learning_rate": 1.759675719914399e-05, + "loss": 2.3219, + "step": 2481 + }, + { + "epoch": 0.7330183106910809, + "grad_norm": 0.22508803241537756, + "learning_rate": 1.7560344042678533e-05, + "loss": 2.2213, + "step": 2482 + }, + { + "epoch": 0.7333136444181926, + "grad_norm": 0.199362160918019, + "learning_rate": 1.7523960573441557e-05, + "loss": 2.2751, + "step": 2483 + }, + { + "epoch": 0.7336089781453042, + "grad_norm": 0.20067201938340298, + "learning_rate": 1.748760682472948e-05, + "loss": 2.0422, + "step": 2484 + }, + { + "epoch": 0.7339043118724158, + "grad_norm": 0.21762390305759197, + "learning_rate": 1.7451282829811545e-05, + "loss": 2.2902, + "step": 2485 + }, + { + "epoch": 0.7341996455995274, + "grad_norm": 0.19319255452869877, + "learning_rate": 1.7414988621929705e-05, + "loss": 2.2939, + "step": 2486 + }, + { + "epoch": 0.7344949793266391, + "grad_norm": 0.209814908011842, + "learning_rate": 1.737872423429871e-05, + "loss": 2.2981, + "step": 2487 + }, + { + "epoch": 0.7347903130537508, + "grad_norm": 0.20961660494227063, + "learning_rate": 1.7342489700105998e-05, + "loss": 2.327, + "step": 2488 + }, + { + "epoch": 0.7350856467808624, + "grad_norm": 0.19916318604754504, + "learning_rate": 1.730628505251169e-05, + "loss": 2.3306, + "step": 2489 + }, + { + "epoch": 0.735380980507974, + "grad_norm": 0.21553363142707035, + "learning_rate": 1.7270110324648552e-05, + "loss": 2.2961, + "step": 2490 + }, + { + "epoch": 0.7356763142350856, + "grad_norm": 0.19901410044442341, + "learning_rate": 1.7233965549621982e-05, + "loss": 2.2807, + "step": 2491 + }, + { + "epoch": 0.7359716479621973, + "grad_norm": 0.20154786389462656, + "learning_rate": 1.7197850760509948e-05, + "loss": 2.3281, + "step": 2492 + }, + { + "epoch": 0.7362669816893089, + "grad_norm": 0.19615479047389375, + "learning_rate": 1.7161765990362994e-05, + "loss": 2.3254, + "step": 2493 + }, + { + "epoch": 0.7365623154164206, + "grad_norm": 0.20585670722810814, + "learning_rate": 1.7125711272204176e-05, + "loss": 2.2834, + "step": 2494 + }, + { + "epoch": 0.7368576491435322, + "grad_norm": 0.20528362513849444, + "learning_rate": 1.7089686639029057e-05, + "loss": 2.1915, + "step": 2495 + }, + { + "epoch": 0.7371529828706438, + "grad_norm": 0.20957041951058564, + "learning_rate": 1.7053692123805663e-05, + "loss": 2.3601, + "step": 2496 + }, + { + "epoch": 0.7374483165977554, + "grad_norm": 0.2085992080174432, + "learning_rate": 1.7017727759474472e-05, + "loss": 2.2736, + "step": 2497 + }, + { + "epoch": 0.7377436503248671, + "grad_norm": 0.2121516798484919, + "learning_rate": 1.6981793578948346e-05, + "loss": 2.2882, + "step": 2498 + }, + { + "epoch": 0.7380389840519788, + "grad_norm": 0.20408955223849318, + "learning_rate": 1.6945889615112538e-05, + "loss": 2.3214, + "step": 2499 + }, + { + "epoch": 0.7383343177790904, + "grad_norm": 0.2028784631250213, + "learning_rate": 1.691001590082465e-05, + "loss": 2.2205, + "step": 2500 + }, + { + "epoch": 0.738629651506202, + "grad_norm": 0.2085554319898225, + "learning_rate": 1.6874172468914595e-05, + "loss": 2.2313, + "step": 2501 + }, + { + "epoch": 0.7389249852333136, + "grad_norm": 0.20450940975036602, + "learning_rate": 1.6838359352184576e-05, + "loss": 2.3378, + "step": 2502 + }, + { + "epoch": 0.7392203189604253, + "grad_norm": 0.21666444565631118, + "learning_rate": 1.6802576583409045e-05, + "loss": 2.3471, + "step": 2503 + }, + { + "epoch": 0.7395156526875369, + "grad_norm": 0.2072465245216616, + "learning_rate": 1.67668241953347e-05, + "loss": 2.2734, + "step": 2504 + }, + { + "epoch": 0.7398109864146486, + "grad_norm": 0.21024872330063643, + "learning_rate": 1.6731102220680416e-05, + "loss": 2.3596, + "step": 2505 + }, + { + "epoch": 0.7401063201417601, + "grad_norm": 0.20252528976066062, + "learning_rate": 1.669541069213725e-05, + "loss": 2.2293, + "step": 2506 + }, + { + "epoch": 0.7404016538688718, + "grad_norm": 0.1949160557797522, + "learning_rate": 1.6659749642368384e-05, + "loss": 2.3606, + "step": 2507 + }, + { + "epoch": 0.7406969875959835, + "grad_norm": 0.20515662008050223, + "learning_rate": 1.662411910400911e-05, + "loss": 2.3324, + "step": 2508 + }, + { + "epoch": 0.7409923213230951, + "grad_norm": 0.1988405770859961, + "learning_rate": 1.658851910966681e-05, + "loss": 2.2937, + "step": 2509 + }, + { + "epoch": 0.7412876550502068, + "grad_norm": 0.20199535857960907, + "learning_rate": 1.6552949691920895e-05, + "loss": 2.3489, + "step": 2510 + }, + { + "epoch": 0.7415829887773183, + "grad_norm": 0.2063415212301434, + "learning_rate": 1.6517410883322803e-05, + "loss": 2.3233, + "step": 2511 + }, + { + "epoch": 0.74187832250443, + "grad_norm": 0.20629688690897427, + "learning_rate": 1.6481902716395968e-05, + "loss": 2.3384, + "step": 2512 + }, + { + "epoch": 0.7421736562315416, + "grad_norm": 0.20679760290406154, + "learning_rate": 1.6446425223635764e-05, + "loss": 2.2923, + "step": 2513 + }, + { + "epoch": 0.7424689899586533, + "grad_norm": 0.2136613996682166, + "learning_rate": 1.641097843750952e-05, + "loss": 2.3257, + "step": 2514 + }, + { + "epoch": 0.742764323685765, + "grad_norm": 0.20104407908584984, + "learning_rate": 1.6375562390456417e-05, + "loss": 2.1946, + "step": 2515 + }, + { + "epoch": 0.7430596574128765, + "grad_norm": 0.19995788823772614, + "learning_rate": 1.6340177114887555e-05, + "loss": 2.3205, + "step": 2516 + }, + { + "epoch": 0.7433549911399882, + "grad_norm": 0.21771244512139945, + "learning_rate": 1.6304822643185846e-05, + "loss": 2.2674, + "step": 2517 + }, + { + "epoch": 0.7436503248670998, + "grad_norm": 0.2051529611554882, + "learning_rate": 1.626949900770603e-05, + "loss": 2.2664, + "step": 2518 + }, + { + "epoch": 0.7439456585942115, + "grad_norm": 0.20690058237404538, + "learning_rate": 1.62342062407746e-05, + "loss": 2.3279, + "step": 2519 + }, + { + "epoch": 0.7442409923213231, + "grad_norm": 0.2092584702049004, + "learning_rate": 1.6198944374689827e-05, + "loss": 2.2843, + "step": 2520 + }, + { + "epoch": 0.7445363260484348, + "grad_norm": 0.2170951698463259, + "learning_rate": 1.616371344172169e-05, + "loss": 2.3927, + "step": 2521 + }, + { + "epoch": 0.7448316597755463, + "grad_norm": 0.20752797726665814, + "learning_rate": 1.612851347411186e-05, + "loss": 2.3959, + "step": 2522 + }, + { + "epoch": 0.745126993502658, + "grad_norm": 0.2080638523458922, + "learning_rate": 1.6093344504073672e-05, + "loss": 2.3546, + "step": 2523 + }, + { + "epoch": 0.7454223272297696, + "grad_norm": 0.20446620655882844, + "learning_rate": 1.6058206563792094e-05, + "loss": 2.3309, + "step": 2524 + }, + { + "epoch": 0.7457176609568813, + "grad_norm": 0.2085821791450683, + "learning_rate": 1.6023099685423692e-05, + "loss": 2.3552, + "step": 2525 + }, + { + "epoch": 0.746012994683993, + "grad_norm": 0.2011521338989455, + "learning_rate": 1.5988023901096617e-05, + "loss": 2.3433, + "step": 2526 + }, + { + "epoch": 0.7463083284111045, + "grad_norm": 0.2026165687929602, + "learning_rate": 1.5952979242910544e-05, + "loss": 2.3058, + "step": 2527 + }, + { + "epoch": 0.7466036621382162, + "grad_norm": 0.20650648692185483, + "learning_rate": 1.5917965742936684e-05, + "loss": 2.3647, + "step": 2528 + }, + { + "epoch": 0.7468989958653278, + "grad_norm": 0.2150475056001586, + "learning_rate": 1.588298343321773e-05, + "loss": 2.2846, + "step": 2529 + }, + { + "epoch": 0.7471943295924395, + "grad_norm": 0.20732529469667707, + "learning_rate": 1.5848032345767816e-05, + "loss": 2.2782, + "step": 2530 + }, + { + "epoch": 0.7474896633195511, + "grad_norm": 0.20882651347328088, + "learning_rate": 1.581311251257252e-05, + "loss": 2.2579, + "step": 2531 + }, + { + "epoch": 0.7477849970466627, + "grad_norm": 0.2291766818343461, + "learning_rate": 1.5778223965588802e-05, + "loss": 2.2261, + "step": 2532 + }, + { + "epoch": 0.7480803307737743, + "grad_norm": 0.25591018405228405, + "learning_rate": 1.5743366736745008e-05, + "loss": 2.4818, + "step": 2533 + }, + { + "epoch": 0.748375664500886, + "grad_norm": 0.20746303911389014, + "learning_rate": 1.5708540857940814e-05, + "loss": 2.0999, + "step": 2534 + }, + { + "epoch": 0.7486709982279977, + "grad_norm": 0.20877657568394872, + "learning_rate": 1.56737463610472e-05, + "loss": 2.27, + "step": 2535 + }, + { + "epoch": 0.7489663319551093, + "grad_norm": 0.20989774595829574, + "learning_rate": 1.5638983277906432e-05, + "loss": 2.2637, + "step": 2536 + }, + { + "epoch": 0.7492616656822209, + "grad_norm": 0.19364945971735906, + "learning_rate": 1.5604251640332034e-05, + "loss": 2.3233, + "step": 2537 + }, + { + "epoch": 0.7495569994093325, + "grad_norm": 0.19845801382588385, + "learning_rate": 1.5569551480108747e-05, + "loss": 2.2303, + "step": 2538 + }, + { + "epoch": 0.7498523331364442, + "grad_norm": 0.20431990557482643, + "learning_rate": 1.5534882828992504e-05, + "loss": 2.2724, + "step": 2539 + }, + { + "epoch": 0.7501476668635558, + "grad_norm": 0.20516746028420757, + "learning_rate": 1.5500245718710398e-05, + "loss": 2.2611, + "step": 2540 + }, + { + "epoch": 0.7504430005906675, + "grad_norm": 0.20852616247404884, + "learning_rate": 1.5465640180960674e-05, + "loss": 2.2737, + "step": 2541 + }, + { + "epoch": 0.7507383343177791, + "grad_norm": 0.21661401030096333, + "learning_rate": 1.543106624741266e-05, + "loss": 2.3751, + "step": 2542 + }, + { + "epoch": 0.7510336680448907, + "grad_norm": 0.22478914115967227, + "learning_rate": 1.5396523949706788e-05, + "loss": 2.305, + "step": 2543 + }, + { + "epoch": 0.7513290017720023, + "grad_norm": 0.19765113703330464, + "learning_rate": 1.5362013319454527e-05, + "loss": 2.2916, + "step": 2544 + }, + { + "epoch": 0.751624335499114, + "grad_norm": 0.20445887304264207, + "learning_rate": 1.5327534388238336e-05, + "loss": 2.334, + "step": 2545 + }, + { + "epoch": 0.7519196692262257, + "grad_norm": 0.3223945244293676, + "learning_rate": 1.529308718761171e-05, + "loss": 2.2833, + "step": 2546 + }, + { + "epoch": 0.7522150029533373, + "grad_norm": 0.2005438677863611, + "learning_rate": 1.525867174909908e-05, + "loss": 2.3105, + "step": 2547 + }, + { + "epoch": 0.7525103366804489, + "grad_norm": 0.2039404785541981, + "learning_rate": 1.5224288104195821e-05, + "loss": 2.3623, + "step": 2548 + }, + { + "epoch": 0.7528056704075605, + "grad_norm": 0.2024150509201158, + "learning_rate": 1.5189936284368205e-05, + "loss": 2.2912, + "step": 2549 + }, + { + "epoch": 0.7531010041346722, + "grad_norm": 0.21116309994139432, + "learning_rate": 1.5155616321053385e-05, + "loss": 2.288, + "step": 2550 + }, + { + "epoch": 0.7533963378617838, + "grad_norm": 0.20538799853004072, + "learning_rate": 1.5121328245659355e-05, + "loss": 2.2895, + "step": 2551 + }, + { + "epoch": 0.7536916715888955, + "grad_norm": 0.2062368265915645, + "learning_rate": 1.508707208956493e-05, + "loss": 2.3206, + "step": 2552 + }, + { + "epoch": 0.753987005316007, + "grad_norm": 0.2124156377731772, + "learning_rate": 1.5052847884119713e-05, + "loss": 2.3301, + "step": 2553 + }, + { + "epoch": 0.7542823390431187, + "grad_norm": 0.2072019413191568, + "learning_rate": 1.5018655660644055e-05, + "loss": 2.2994, + "step": 2554 + }, + { + "epoch": 0.7545776727702304, + "grad_norm": 0.2031213319887417, + "learning_rate": 1.4984495450429075e-05, + "loss": 2.3028, + "step": 2555 + }, + { + "epoch": 0.754873006497342, + "grad_norm": 0.21518495515032648, + "learning_rate": 1.4950367284736556e-05, + "loss": 2.3596, + "step": 2556 + }, + { + "epoch": 0.7551683402244537, + "grad_norm": 0.22131709555072462, + "learning_rate": 1.4916271194798976e-05, + "loss": 2.3012, + "step": 2557 + }, + { + "epoch": 0.7554636739515652, + "grad_norm": 0.20778332268812358, + "learning_rate": 1.4882207211819466e-05, + "loss": 2.3197, + "step": 2558 + }, + { + "epoch": 0.7557590076786769, + "grad_norm": 0.21933953850389598, + "learning_rate": 1.4848175366971728e-05, + "loss": 2.279, + "step": 2559 + }, + { + "epoch": 0.7560543414057885, + "grad_norm": 0.20480922144633779, + "learning_rate": 1.48141756914001e-05, + "loss": 2.3284, + "step": 2560 + }, + { + "epoch": 0.7563496751329002, + "grad_norm": 0.2167481290297622, + "learning_rate": 1.4780208216219476e-05, + "loss": 2.2892, + "step": 2561 + }, + { + "epoch": 0.7566450088600118, + "grad_norm": 0.38446131644217013, + "learning_rate": 1.4746272972515257e-05, + "loss": 2.3675, + "step": 2562 + }, + { + "epoch": 0.7569403425871235, + "grad_norm": 0.19582019624035082, + "learning_rate": 1.4712369991343377e-05, + "loss": 2.2331, + "step": 2563 + }, + { + "epoch": 0.757235676314235, + "grad_norm": 0.2040353695066053, + "learning_rate": 1.467849930373022e-05, + "loss": 2.3754, + "step": 2564 + }, + { + "epoch": 0.7575310100413467, + "grad_norm": 0.2082560226919332, + "learning_rate": 1.4644660940672627e-05, + "loss": 2.2935, + "step": 2565 + }, + { + "epoch": 0.7578263437684584, + "grad_norm": 0.2117922472333884, + "learning_rate": 1.4610854933137863e-05, + "loss": 2.2564, + "step": 2566 + }, + { + "epoch": 0.75812167749557, + "grad_norm": 0.1943687238783929, + "learning_rate": 1.4577081312063573e-05, + "loss": 2.2755, + "step": 2567 + }, + { + "epoch": 0.7584170112226817, + "grad_norm": 0.20716907304203525, + "learning_rate": 1.4543340108357762e-05, + "loss": 2.3206, + "step": 2568 + }, + { + "epoch": 0.7587123449497932, + "grad_norm": 0.2005822512873884, + "learning_rate": 1.4509631352898783e-05, + "loss": 2.3488, + "step": 2569 + }, + { + "epoch": 0.7590076786769049, + "grad_norm": 0.21300904016719854, + "learning_rate": 1.4475955076535275e-05, + "loss": 2.2412, + "step": 2570 + }, + { + "epoch": 0.7593030124040165, + "grad_norm": 0.2056759431455926, + "learning_rate": 1.4442311310086166e-05, + "loss": 2.2342, + "step": 2571 + }, + { + "epoch": 0.7595983461311282, + "grad_norm": 0.21269131876521183, + "learning_rate": 1.4408700084340632e-05, + "loss": 2.3209, + "step": 2572 + }, + { + "epoch": 0.7598936798582399, + "grad_norm": 0.2015856496691104, + "learning_rate": 1.4375121430058075e-05, + "loss": 2.3155, + "step": 2573 + }, + { + "epoch": 0.7601890135853514, + "grad_norm": 0.20165288695009928, + "learning_rate": 1.4341575377968053e-05, + "loss": 2.3788, + "step": 2574 + }, + { + "epoch": 0.7604843473124631, + "grad_norm": 0.20023419072458612, + "learning_rate": 1.4308061958770336e-05, + "loss": 2.21, + "step": 2575 + }, + { + "epoch": 0.7607796810395747, + "grad_norm": 0.2011559304149934, + "learning_rate": 1.4274581203134801e-05, + "loss": 2.317, + "step": 2576 + }, + { + "epoch": 0.7610750147666864, + "grad_norm": 0.2005390937136699, + "learning_rate": 1.4241133141701434e-05, + "loss": 2.2508, + "step": 2577 + }, + { + "epoch": 0.761370348493798, + "grad_norm": 0.20739050051374228, + "learning_rate": 1.4207717805080333e-05, + "loss": 2.3396, + "step": 2578 + }, + { + "epoch": 0.7616656822209096, + "grad_norm": 0.22379467033435416, + "learning_rate": 1.4174335223851609e-05, + "loss": 2.2983, + "step": 2579 + }, + { + "epoch": 0.7619610159480212, + "grad_norm": 0.2080910475848011, + "learning_rate": 1.4140985428565411e-05, + "loss": 2.307, + "step": 2580 + }, + { + "epoch": 0.7622563496751329, + "grad_norm": 0.19905946010621975, + "learning_rate": 1.4107668449741884e-05, + "loss": 2.304, + "step": 2581 + }, + { + "epoch": 0.7625516834022446, + "grad_norm": 0.3513143935519841, + "learning_rate": 1.4074384317871141e-05, + "loss": 2.2843, + "step": 2582 + }, + { + "epoch": 0.7628470171293562, + "grad_norm": 0.20375095716789884, + "learning_rate": 1.404113306341323e-05, + "loss": 2.3757, + "step": 2583 + }, + { + "epoch": 0.7631423508564678, + "grad_norm": 0.20150456187252588, + "learning_rate": 1.4007914716798127e-05, + "loss": 2.3417, + "step": 2584 + }, + { + "epoch": 0.7634376845835794, + "grad_norm": 0.20154879730429318, + "learning_rate": 1.3974729308425672e-05, + "loss": 2.2766, + "step": 2585 + }, + { + "epoch": 0.7637330183106911, + "grad_norm": 0.2159204207703394, + "learning_rate": 1.3941576868665567e-05, + "loss": 2.3611, + "step": 2586 + }, + { + "epoch": 0.7640283520378027, + "grad_norm": 0.2025246183713174, + "learning_rate": 1.3908457427857375e-05, + "loss": 2.2258, + "step": 2587 + }, + { + "epoch": 0.7643236857649144, + "grad_norm": 0.2145438615519117, + "learning_rate": 1.387537101631039e-05, + "loss": 2.4265, + "step": 2588 + }, + { + "epoch": 0.764619019492026, + "grad_norm": 0.2042667140542109, + "learning_rate": 1.3842317664303739e-05, + "loss": 2.2967, + "step": 2589 + }, + { + "epoch": 0.7649143532191376, + "grad_norm": 0.20530265142895204, + "learning_rate": 1.3809297402086274e-05, + "loss": 2.3209, + "step": 2590 + }, + { + "epoch": 0.7652096869462492, + "grad_norm": 0.20077661924551155, + "learning_rate": 1.3776310259876569e-05, + "loss": 2.3505, + "step": 2591 + }, + { + "epoch": 0.7655050206733609, + "grad_norm": 0.19545492701210943, + "learning_rate": 1.3743356267862883e-05, + "loss": 2.1064, + "step": 2592 + }, + { + "epoch": 0.7658003544004726, + "grad_norm": 0.2066189781633805, + "learning_rate": 1.371043545620314e-05, + "loss": 2.2493, + "step": 2593 + }, + { + "epoch": 0.7660956881275842, + "grad_norm": 0.20375880980021777, + "learning_rate": 1.3677547855024907e-05, + "loss": 2.264, + "step": 2594 + }, + { + "epoch": 0.7663910218546958, + "grad_norm": 0.20705016067796198, + "learning_rate": 1.3644693494425343e-05, + "loss": 2.3405, + "step": 2595 + }, + { + "epoch": 0.7666863555818074, + "grad_norm": 0.20497208533152014, + "learning_rate": 1.3611872404471198e-05, + "loss": 2.4009, + "step": 2596 + }, + { + "epoch": 0.7669816893089191, + "grad_norm": 0.20291959941266893, + "learning_rate": 1.3579084615198767e-05, + "loss": 2.3075, + "step": 2597 + }, + { + "epoch": 0.7672770230360307, + "grad_norm": 0.22148737358741116, + "learning_rate": 1.3546330156613873e-05, + "loss": 2.3205, + "step": 2598 + }, + { + "epoch": 0.7675723567631424, + "grad_norm": 0.19807093133733047, + "learning_rate": 1.3513609058691845e-05, + "loss": 2.3292, + "step": 2599 + }, + { + "epoch": 0.7678676904902539, + "grad_norm": 0.21499301753515837, + "learning_rate": 1.3480921351377468e-05, + "loss": 2.3227, + "step": 2600 + }, + { + "epoch": 0.7681630242173656, + "grad_norm": 0.20771587283691528, + "learning_rate": 1.3448267064584974e-05, + "loss": 2.3116, + "step": 2601 + }, + { + "epoch": 0.7684583579444773, + "grad_norm": 0.20039326255207712, + "learning_rate": 1.3415646228198015e-05, + "loss": 2.2748, + "step": 2602 + }, + { + "epoch": 0.7687536916715889, + "grad_norm": 0.20078003322663077, + "learning_rate": 1.338305887206962e-05, + "loss": 2.2917, + "step": 2603 + }, + { + "epoch": 0.7690490253987006, + "grad_norm": 0.21160406300273446, + "learning_rate": 1.3350505026022198e-05, + "loss": 2.212, + "step": 2604 + }, + { + "epoch": 0.7693443591258121, + "grad_norm": 0.21239488084332253, + "learning_rate": 1.331798471984747e-05, + "loss": 2.2525, + "step": 2605 + }, + { + "epoch": 0.7696396928529238, + "grad_norm": 0.20638442210680125, + "learning_rate": 1.3285497983306473e-05, + "loss": 2.3251, + "step": 2606 + }, + { + "epoch": 0.7699350265800354, + "grad_norm": 0.19703891681756355, + "learning_rate": 1.325304484612952e-05, + "loss": 1.9863, + "step": 2607 + }, + { + "epoch": 0.7702303603071471, + "grad_norm": 0.1972731503854706, + "learning_rate": 1.3220625338016179e-05, + "loss": 2.2337, + "step": 2608 + }, + { + "epoch": 0.7705256940342587, + "grad_norm": 0.2064717578100141, + "learning_rate": 1.318823948863524e-05, + "loss": 2.3318, + "step": 2609 + }, + { + "epoch": 0.7708210277613704, + "grad_norm": 0.20137039130221684, + "learning_rate": 1.3155887327624688e-05, + "loss": 2.3612, + "step": 2610 + }, + { + "epoch": 0.771116361488482, + "grad_norm": 0.221455422784825, + "learning_rate": 1.3123568884591686e-05, + "loss": 2.2829, + "step": 2611 + }, + { + "epoch": 0.7714116952155936, + "grad_norm": 0.20702360247036763, + "learning_rate": 1.3091284189112529e-05, + "loss": 2.3411, + "step": 2612 + }, + { + "epoch": 0.7717070289427053, + "grad_norm": 0.21421296546633037, + "learning_rate": 1.3059033270732635e-05, + "loss": 2.3098, + "step": 2613 + }, + { + "epoch": 0.7720023626698169, + "grad_norm": 0.22103689493882725, + "learning_rate": 1.302681615896651e-05, + "loss": 2.2956, + "step": 2614 + }, + { + "epoch": 0.7722976963969286, + "grad_norm": 0.20813138631238473, + "learning_rate": 1.299463288329772e-05, + "loss": 2.3512, + "step": 2615 + }, + { + "epoch": 0.7725930301240401, + "grad_norm": 0.21031304044781604, + "learning_rate": 1.296248347317886e-05, + "loss": 2.2591, + "step": 2616 + }, + { + "epoch": 0.7728883638511518, + "grad_norm": 0.19858600136131455, + "learning_rate": 1.2930367958031564e-05, + "loss": 2.2687, + "step": 2617 + }, + { + "epoch": 0.7731836975782634, + "grad_norm": 0.2045272421826674, + "learning_rate": 1.2898286367246388e-05, + "loss": 2.2755, + "step": 2618 + }, + { + "epoch": 0.7734790313053751, + "grad_norm": 0.20555775277596564, + "learning_rate": 1.2866238730182888e-05, + "loss": 2.3358, + "step": 2619 + }, + { + "epoch": 0.7737743650324868, + "grad_norm": 0.22186209614262561, + "learning_rate": 1.2834225076169543e-05, + "loss": 2.2641, + "step": 2620 + }, + { + "epoch": 0.7740696987595983, + "grad_norm": 0.2129413211900645, + "learning_rate": 1.2802245434503718e-05, + "loss": 2.3891, + "step": 2621 + }, + { + "epoch": 0.77436503248671, + "grad_norm": 0.21016781395787587, + "learning_rate": 1.277029983445166e-05, + "loss": 2.3439, + "step": 2622 + }, + { + "epoch": 0.7746603662138216, + "grad_norm": 0.19903197602438938, + "learning_rate": 1.2738388305248467e-05, + "loss": 2.3154, + "step": 2623 + }, + { + "epoch": 0.7749556999409333, + "grad_norm": 0.20555478060869106, + "learning_rate": 1.2706510876098043e-05, + "loss": 2.2727, + "step": 2624 + }, + { + "epoch": 0.7752510336680449, + "grad_norm": 0.20362377368412543, + "learning_rate": 1.2674667576173094e-05, + "loss": 2.3398, + "step": 2625 + }, + { + "epoch": 0.7755463673951565, + "grad_norm": 0.19571312823589765, + "learning_rate": 1.2642858434615096e-05, + "loss": 2.2892, + "step": 2626 + }, + { + "epoch": 0.7758417011222681, + "grad_norm": 0.19819795650649452, + "learning_rate": 1.2611083480534258e-05, + "loss": 2.2869, + "step": 2627 + }, + { + "epoch": 0.7761370348493798, + "grad_norm": 0.2034800159690752, + "learning_rate": 1.2579342743009509e-05, + "loss": 2.3155, + "step": 2628 + }, + { + "epoch": 0.7764323685764914, + "grad_norm": 0.22615326720860543, + "learning_rate": 1.2547636251088457e-05, + "loss": 2.3838, + "step": 2629 + }, + { + "epoch": 0.7767277023036031, + "grad_norm": 0.2029405361502941, + "learning_rate": 1.251596403378737e-05, + "loss": 2.2874, + "step": 2630 + }, + { + "epoch": 0.7770230360307147, + "grad_norm": 0.19993518583789505, + "learning_rate": 1.248432612009116e-05, + "loss": 2.3625, + "step": 2631 + }, + { + "epoch": 0.7773183697578263, + "grad_norm": 0.2622907139172772, + "learning_rate": 1.2452722538953338e-05, + "loss": 2.3371, + "step": 2632 + }, + { + "epoch": 0.777613703484938, + "grad_norm": 0.2119095006355048, + "learning_rate": 1.2421153319295992e-05, + "loss": 2.3659, + "step": 2633 + }, + { + "epoch": 0.7779090372120496, + "grad_norm": 0.20048087617825203, + "learning_rate": 1.2389618490009775e-05, + "loss": 2.1738, + "step": 2634 + }, + { + "epoch": 0.7782043709391613, + "grad_norm": 0.28118038640517434, + "learning_rate": 1.2358118079953856e-05, + "loss": 2.324, + "step": 2635 + }, + { + "epoch": 0.7784997046662729, + "grad_norm": 0.20563772237546674, + "learning_rate": 1.2326652117955911e-05, + "loss": 2.3314, + "step": 2636 + }, + { + "epoch": 0.7787950383933845, + "grad_norm": 0.21093608714442924, + "learning_rate": 1.2295220632812093e-05, + "loss": 2.2978, + "step": 2637 + }, + { + "epoch": 0.7790903721204961, + "grad_norm": 0.1984059545268726, + "learning_rate": 1.2263823653286999e-05, + "loss": 2.2174, + "step": 2638 + }, + { + "epoch": 0.7793857058476078, + "grad_norm": 0.20611710529580954, + "learning_rate": 1.2232461208113649e-05, + "loss": 2.2587, + "step": 2639 + }, + { + "epoch": 0.7796810395747195, + "grad_norm": 0.20294229837248354, + "learning_rate": 1.2201133325993463e-05, + "loss": 2.38, + "step": 2640 + }, + { + "epoch": 0.7799763733018311, + "grad_norm": 0.20744803599524092, + "learning_rate": 1.216984003559622e-05, + "loss": 2.3506, + "step": 2641 + }, + { + "epoch": 0.7802717070289427, + "grad_norm": 0.19665956403715304, + "learning_rate": 1.2138581365560054e-05, + "loss": 2.2593, + "step": 2642 + }, + { + "epoch": 0.7805670407560543, + "grad_norm": 0.20738276743353956, + "learning_rate": 1.210735734449141e-05, + "loss": 2.2273, + "step": 2643 + }, + { + "epoch": 0.780862374483166, + "grad_norm": 0.6343985802162537, + "learning_rate": 1.2076168000965027e-05, + "loss": 2.3183, + "step": 2644 + }, + { + "epoch": 0.7811577082102776, + "grad_norm": 0.21300764105191966, + "learning_rate": 1.2045013363523911e-05, + "loss": 2.2982, + "step": 2645 + }, + { + "epoch": 0.7814530419373893, + "grad_norm": 0.1965693091166819, + "learning_rate": 1.2013893460679303e-05, + "loss": 2.2756, + "step": 2646 + }, + { + "epoch": 0.7817483756645008, + "grad_norm": 0.21316137508231292, + "learning_rate": 1.1982808320910643e-05, + "loss": 2.2956, + "step": 2647 + }, + { + "epoch": 0.7820437093916125, + "grad_norm": 0.20095081590755914, + "learning_rate": 1.1951757972665577e-05, + "loss": 2.2212, + "step": 2648 + }, + { + "epoch": 0.7823390431187242, + "grad_norm": 0.2074891371066667, + "learning_rate": 1.1920742444359912e-05, + "loss": 2.2914, + "step": 2649 + }, + { + "epoch": 0.7826343768458358, + "grad_norm": 0.21482290687025207, + "learning_rate": 1.1889761764377572e-05, + "loss": 2.3939, + "step": 2650 + }, + { + "epoch": 0.7829297105729475, + "grad_norm": 0.19897976734052206, + "learning_rate": 1.1858815961070607e-05, + "loss": 2.325, + "step": 2651 + }, + { + "epoch": 0.783225044300059, + "grad_norm": 0.20321883271784816, + "learning_rate": 1.1827905062759142e-05, + "loss": 2.3949, + "step": 2652 + }, + { + "epoch": 0.7835203780271707, + "grad_norm": 0.21038552756496326, + "learning_rate": 1.179702909773136e-05, + "loss": 2.2629, + "step": 2653 + }, + { + "epoch": 0.7838157117542823, + "grad_norm": 0.20089143816885796, + "learning_rate": 1.1766188094243474e-05, + "loss": 2.3279, + "step": 2654 + }, + { + "epoch": 0.784111045481394, + "grad_norm": 0.21464340231823717, + "learning_rate": 1.1735382080519698e-05, + "loss": 2.3258, + "step": 2655 + }, + { + "epoch": 0.7844063792085056, + "grad_norm": 0.2270212297794642, + "learning_rate": 1.1704611084752216e-05, + "loss": 2.3354, + "step": 2656 + }, + { + "epoch": 0.7847017129356173, + "grad_norm": 0.1987183400794917, + "learning_rate": 1.1673875135101208e-05, + "loss": 2.3102, + "step": 2657 + }, + { + "epoch": 0.7849970466627288, + "grad_norm": 0.19945870702084428, + "learning_rate": 1.1643174259694745e-05, + "loss": 2.2946, + "step": 2658 + }, + { + "epoch": 0.7852923803898405, + "grad_norm": 0.20411157355328682, + "learning_rate": 1.1612508486628793e-05, + "loss": 2.3743, + "step": 2659 + }, + { + "epoch": 0.7855877141169522, + "grad_norm": 0.19766800509994634, + "learning_rate": 1.1581877843967215e-05, + "loss": 2.317, + "step": 2660 + }, + { + "epoch": 0.7858830478440638, + "grad_norm": 0.20120498236654807, + "learning_rate": 1.1551282359741728e-05, + "loss": 2.3221, + "step": 2661 + }, + { + "epoch": 0.7861783815711755, + "grad_norm": 0.20020273706578245, + "learning_rate": 1.1520722061951834e-05, + "loss": 2.265, + "step": 2662 + }, + { + "epoch": 0.786473715298287, + "grad_norm": 0.20467087408785434, + "learning_rate": 1.1490196978564876e-05, + "loss": 2.2363, + "step": 2663 + }, + { + "epoch": 0.7867690490253987, + "grad_norm": 0.21987753873468704, + "learning_rate": 1.1459707137515957e-05, + "loss": 2.3103, + "step": 2664 + }, + { + "epoch": 0.7870643827525103, + "grad_norm": 0.23669528813438645, + "learning_rate": 1.1429252566707932e-05, + "loss": 2.2844, + "step": 2665 + }, + { + "epoch": 0.787359716479622, + "grad_norm": 0.19794214967169457, + "learning_rate": 1.1398833294011374e-05, + "loss": 2.064, + "step": 2666 + }, + { + "epoch": 0.7876550502067337, + "grad_norm": 0.1965988997126173, + "learning_rate": 1.1368449347264549e-05, + "loss": 2.2609, + "step": 2667 + }, + { + "epoch": 0.7879503839338452, + "grad_norm": 0.19780157950993366, + "learning_rate": 1.1338100754273396e-05, + "loss": 2.3022, + "step": 2668 + }, + { + "epoch": 0.7882457176609569, + "grad_norm": 0.20727990171378452, + "learning_rate": 1.1307787542811504e-05, + "loss": 2.365, + "step": 2669 + }, + { + "epoch": 0.7885410513880685, + "grad_norm": 0.21103468175125972, + "learning_rate": 1.1277509740620084e-05, + "loss": 2.3901, + "step": 2670 + }, + { + "epoch": 0.7888363851151802, + "grad_norm": 0.20218635977374902, + "learning_rate": 1.1247267375407938e-05, + "loss": 2.4191, + "step": 2671 + }, + { + "epoch": 0.7891317188422918, + "grad_norm": 0.20609698833151216, + "learning_rate": 1.1217060474851443e-05, + "loss": 2.232, + "step": 2672 + }, + { + "epoch": 0.7894270525694034, + "grad_norm": 0.2189289791334077, + "learning_rate": 1.1186889066594502e-05, + "loss": 2.2378, + "step": 2673 + }, + { + "epoch": 0.789722386296515, + "grad_norm": 0.20039082726064963, + "learning_rate": 1.1156753178248564e-05, + "loss": 2.3252, + "step": 2674 + }, + { + "epoch": 0.7900177200236267, + "grad_norm": 0.1989453908611917, + "learning_rate": 1.112665283739257e-05, + "loss": 2.3116, + "step": 2675 + }, + { + "epoch": 0.7903130537507383, + "grad_norm": 0.20808792362514889, + "learning_rate": 1.1096588071572888e-05, + "loss": 2.2346, + "step": 2676 + }, + { + "epoch": 0.79060838747785, + "grad_norm": 0.19982136482450574, + "learning_rate": 1.1066558908303376e-05, + "loss": 2.3526, + "step": 2677 + }, + { + "epoch": 0.7909037212049616, + "grad_norm": 0.2025828921974141, + "learning_rate": 1.1036565375065294e-05, + "loss": 2.2767, + "step": 2678 + }, + { + "epoch": 0.7911990549320732, + "grad_norm": 0.20214498212874216, + "learning_rate": 1.1006607499307298e-05, + "loss": 2.2981, + "step": 2679 + }, + { + "epoch": 0.7914943886591849, + "grad_norm": 0.20204623490570608, + "learning_rate": 1.097668530844539e-05, + "loss": 2.3255, + "step": 2680 + }, + { + "epoch": 0.7917897223862965, + "grad_norm": 0.20379157002101683, + "learning_rate": 1.0946798829862964e-05, + "loss": 2.372, + "step": 2681 + }, + { + "epoch": 0.7920850561134082, + "grad_norm": 0.1990740755629067, + "learning_rate": 1.0916948090910694e-05, + "loss": 2.3671, + "step": 2682 + }, + { + "epoch": 0.7923803898405198, + "grad_norm": 0.21025704196801076, + "learning_rate": 1.0887133118906546e-05, + "loss": 2.3254, + "step": 2683 + }, + { + "epoch": 0.7926757235676314, + "grad_norm": 0.19880793793129115, + "learning_rate": 1.0857353941135774e-05, + "loss": 2.2724, + "step": 2684 + }, + { + "epoch": 0.792971057294743, + "grad_norm": 0.2020623212554547, + "learning_rate": 1.082761058485085e-05, + "loss": 2.3424, + "step": 2685 + }, + { + "epoch": 0.7932663910218547, + "grad_norm": 0.2092139151073642, + "learning_rate": 1.0797903077271482e-05, + "loss": 2.3749, + "step": 2686 + }, + { + "epoch": 0.7935617247489664, + "grad_norm": 0.2010875636337027, + "learning_rate": 1.0768231445584575e-05, + "loss": 2.3068, + "step": 2687 + }, + { + "epoch": 0.793857058476078, + "grad_norm": 0.20416261590293266, + "learning_rate": 1.0738595716944183e-05, + "loss": 2.2892, + "step": 2688 + }, + { + "epoch": 0.7941523922031896, + "grad_norm": 0.22193361434929715, + "learning_rate": 1.0708995918471521e-05, + "loss": 2.1912, + "step": 2689 + }, + { + "epoch": 0.7944477259303012, + "grad_norm": 0.21185853995468018, + "learning_rate": 1.067943207725492e-05, + "loss": 2.3498, + "step": 2690 + }, + { + "epoch": 0.7947430596574129, + "grad_norm": 0.2025069407168409, + "learning_rate": 1.0649904220349782e-05, + "loss": 2.3046, + "step": 2691 + }, + { + "epoch": 0.7950383933845245, + "grad_norm": 0.19897130483679923, + "learning_rate": 1.0620412374778605e-05, + "loss": 2.3376, + "step": 2692 + }, + { + "epoch": 0.7953337271116362, + "grad_norm": 0.21815588004290298, + "learning_rate": 1.0590956567530918e-05, + "loss": 2.3061, + "step": 2693 + }, + { + "epoch": 0.7956290608387477, + "grad_norm": 0.20060053496467725, + "learning_rate": 1.0561536825563283e-05, + "loss": 2.2616, + "step": 2694 + }, + { + "epoch": 0.7959243945658594, + "grad_norm": 0.19899185199566238, + "learning_rate": 1.0532153175799237e-05, + "loss": 2.3256, + "step": 2695 + }, + { + "epoch": 0.796219728292971, + "grad_norm": 0.20184401842776586, + "learning_rate": 1.0502805645129304e-05, + "loss": 2.0862, + "step": 2696 + }, + { + "epoch": 0.7965150620200827, + "grad_norm": 0.22745910352923693, + "learning_rate": 1.0473494260410943e-05, + "loss": 2.3586, + "step": 2697 + }, + { + "epoch": 0.7968103957471944, + "grad_norm": 0.2081713861604165, + "learning_rate": 1.044421904846854e-05, + "loss": 2.2951, + "step": 2698 + }, + { + "epoch": 0.7971057294743059, + "grad_norm": 0.21181654554180457, + "learning_rate": 1.041498003609337e-05, + "loss": 2.2658, + "step": 2699 + }, + { + "epoch": 0.7974010632014176, + "grad_norm": 0.224387199978694, + "learning_rate": 1.0385777250043587e-05, + "loss": 2.3646, + "step": 2700 + }, + { + "epoch": 0.7976963969285292, + "grad_norm": 0.1991017502934837, + "learning_rate": 1.0356610717044196e-05, + "loss": 2.3388, + "step": 2701 + }, + { + "epoch": 0.7979917306556409, + "grad_norm": 0.20217593093625663, + "learning_rate": 1.0327480463787014e-05, + "loss": 2.2697, + "step": 2702 + }, + { + "epoch": 0.7982870643827525, + "grad_norm": 0.20474950710232645, + "learning_rate": 1.0298386516930663e-05, + "loss": 2.3525, + "step": 2703 + }, + { + "epoch": 0.7985823981098642, + "grad_norm": 0.22782913023029225, + "learning_rate": 1.0269328903100533e-05, + "loss": 2.2794, + "step": 2704 + }, + { + "epoch": 0.7988777318369757, + "grad_norm": 0.20493443314251866, + "learning_rate": 1.0240307648888781e-05, + "loss": 2.3486, + "step": 2705 + }, + { + "epoch": 0.7991730655640874, + "grad_norm": 0.21054780248604002, + "learning_rate": 1.0211322780854265e-05, + "loss": 2.3083, + "step": 2706 + }, + { + "epoch": 0.7994683992911991, + "grad_norm": 0.20220220468104008, + "learning_rate": 1.018237432552256e-05, + "loss": 2.2662, + "step": 2707 + }, + { + "epoch": 0.7997637330183107, + "grad_norm": 0.2035815719686267, + "learning_rate": 1.0153462309385909e-05, + "loss": 2.3454, + "step": 2708 + }, + { + "epoch": 0.8000590667454224, + "grad_norm": 0.22540798847612775, + "learning_rate": 1.0124586758903226e-05, + "loss": 2.2517, + "step": 2709 + }, + { + "epoch": 0.8003544004725339, + "grad_norm": 0.20474345738166372, + "learning_rate": 1.0095747700500024e-05, + "loss": 2.2989, + "step": 2710 + }, + { + "epoch": 0.8006497341996456, + "grad_norm": 0.25995448424757844, + "learning_rate": 1.0066945160568448e-05, + "loss": 2.3339, + "step": 2711 + }, + { + "epoch": 0.8009450679267572, + "grad_norm": 0.20580403056298893, + "learning_rate": 1.0038179165467204e-05, + "loss": 2.3131, + "step": 2712 + }, + { + "epoch": 0.8012404016538689, + "grad_norm": 0.2014258642389753, + "learning_rate": 1.0009449741521565e-05, + "loss": 2.3442, + "step": 2713 + }, + { + "epoch": 0.8015357353809806, + "grad_norm": 0.20746518489315946, + "learning_rate": 9.980756915023332e-06, + "loss": 2.2716, + "step": 2714 + }, + { + "epoch": 0.8018310691080921, + "grad_norm": 0.20513026165127146, + "learning_rate": 9.952100712230805e-06, + "loss": 2.3331, + "step": 2715 + }, + { + "epoch": 0.8021264028352038, + "grad_norm": 0.20566965803774243, + "learning_rate": 9.923481159368792e-06, + "loss": 2.2771, + "step": 2716 + }, + { + "epoch": 0.8024217365623154, + "grad_norm": 0.213920917467885, + "learning_rate": 9.894898282628528e-06, + "loss": 2.2334, + "step": 2717 + }, + { + "epoch": 0.8027170702894271, + "grad_norm": 0.21027860341395718, + "learning_rate": 9.866352108167715e-06, + "loss": 2.2193, + "step": 2718 + }, + { + "epoch": 0.8030124040165387, + "grad_norm": 0.20102743870949907, + "learning_rate": 9.837842662110458e-06, + "loss": 2.2912, + "step": 2719 + }, + { + "epoch": 0.8033077377436503, + "grad_norm": 0.20077438342750598, + "learning_rate": 9.809369970547217e-06, + "loss": 2.3148, + "step": 2720 + }, + { + "epoch": 0.8036030714707619, + "grad_norm": 0.21547267066536574, + "learning_rate": 9.780934059534864e-06, + "loss": 2.3223, + "step": 2721 + }, + { + "epoch": 0.8038984051978736, + "grad_norm": 0.20789082241514883, + "learning_rate": 9.752534955096582e-06, + "loss": 2.3537, + "step": 2722 + }, + { + "epoch": 0.8041937389249852, + "grad_norm": 0.1961081890854194, + "learning_rate": 9.724172683221889e-06, + "loss": 2.2867, + "step": 2723 + }, + { + "epoch": 0.8044890726520969, + "grad_norm": 0.46967474003233994, + "learning_rate": 9.695847269866576e-06, + "loss": 2.2346, + "step": 2724 + }, + { + "epoch": 0.8047844063792085, + "grad_norm": 0.20137682909226587, + "learning_rate": 9.66755874095272e-06, + "loss": 2.3635, + "step": 2725 + }, + { + "epoch": 0.8050797401063201, + "grad_norm": 0.1987123223345168, + "learning_rate": 9.639307122368629e-06, + "loss": 2.3858, + "step": 2726 + }, + { + "epoch": 0.8053750738334318, + "grad_norm": 0.19841268564500464, + "learning_rate": 9.611092439968843e-06, + "loss": 2.1672, + "step": 2727 + }, + { + "epoch": 0.8056704075605434, + "grad_norm": 0.20318350499583987, + "learning_rate": 9.582914719574099e-06, + "loss": 2.2705, + "step": 2728 + }, + { + "epoch": 0.8059657412876551, + "grad_norm": 0.20407971351892595, + "learning_rate": 9.554773986971294e-06, + "loss": 2.2813, + "step": 2729 + }, + { + "epoch": 0.8062610750147667, + "grad_norm": 0.21245732965928668, + "learning_rate": 9.526670267913502e-06, + "loss": 2.3247, + "step": 2730 + }, + { + "epoch": 0.8065564087418783, + "grad_norm": 0.2045877679614824, + "learning_rate": 9.498603588119897e-06, + "loss": 2.1904, + "step": 2731 + }, + { + "epoch": 0.8068517424689899, + "grad_norm": 0.22087941682314488, + "learning_rate": 9.470573973275771e-06, + "loss": 2.3243, + "step": 2732 + }, + { + "epoch": 0.8071470761961016, + "grad_norm": 0.20639515270113098, + "learning_rate": 9.442581449032496e-06, + "loss": 2.3318, + "step": 2733 + }, + { + "epoch": 0.8074424099232133, + "grad_norm": 0.20656810995266328, + "learning_rate": 9.41462604100749e-06, + "loss": 2.2874, + "step": 2734 + }, + { + "epoch": 0.8077377436503249, + "grad_norm": 0.21233915945765408, + "learning_rate": 9.386707774784215e-06, + "loss": 2.3716, + "step": 2735 + }, + { + "epoch": 0.8080330773774365, + "grad_norm": 0.2045549258203156, + "learning_rate": 9.358826675912146e-06, + "loss": 2.3311, + "step": 2736 + }, + { + "epoch": 0.8083284111045481, + "grad_norm": 0.21076992230697475, + "learning_rate": 9.330982769906727e-06, + "loss": 2.1959, + "step": 2737 + }, + { + "epoch": 0.8086237448316598, + "grad_norm": 0.20183667269351896, + "learning_rate": 9.303176082249376e-06, + "loss": 2.2255, + "step": 2738 + }, + { + "epoch": 0.8089190785587714, + "grad_norm": 0.20217395934644536, + "learning_rate": 9.275406638387457e-06, + "loss": 2.2943, + "step": 2739 + }, + { + "epoch": 0.8092144122858831, + "grad_norm": 0.20258351742135736, + "learning_rate": 9.247674463734235e-06, + "loss": 2.3365, + "step": 2740 + }, + { + "epoch": 0.8095097460129946, + "grad_norm": 0.2015302669272252, + "learning_rate": 9.21997958366888e-06, + "loss": 2.2888, + "step": 2741 + }, + { + "epoch": 0.8098050797401063, + "grad_norm": 0.2091015460525469, + "learning_rate": 9.192322023536432e-06, + "loss": 2.3958, + "step": 2742 + }, + { + "epoch": 0.810100413467218, + "grad_norm": 0.20654216612892126, + "learning_rate": 9.164701808647757e-06, + "loss": 2.3304, + "step": 2743 + }, + { + "epoch": 0.8103957471943296, + "grad_norm": 0.21559989686323858, + "learning_rate": 9.137118964279578e-06, + "loss": 2.484, + "step": 2744 + }, + { + "epoch": 0.8106910809214413, + "grad_norm": 0.19616164631612534, + "learning_rate": 9.109573515674391e-06, + "loss": 2.3195, + "step": 2745 + }, + { + "epoch": 0.8109864146485528, + "grad_norm": 0.20303945390585112, + "learning_rate": 9.08206548804048e-06, + "loss": 2.3431, + "step": 2746 + }, + { + "epoch": 0.8112817483756645, + "grad_norm": 0.20299671780641185, + "learning_rate": 9.054594906551889e-06, + "loss": 2.2782, + "step": 2747 + }, + { + "epoch": 0.8115770821027761, + "grad_norm": 0.20262615681894, + "learning_rate": 9.027161796348377e-06, + "loss": 2.2601, + "step": 2748 + }, + { + "epoch": 0.8118724158298878, + "grad_norm": 0.20236489321734255, + "learning_rate": 8.999766182535436e-06, + "loss": 2.2945, + "step": 2749 + }, + { + "epoch": 0.8121677495569994, + "grad_norm": 0.21816518641552046, + "learning_rate": 8.972408090184204e-06, + "loss": 2.3924, + "step": 2750 + }, + { + "epoch": 0.8124630832841111, + "grad_norm": 0.20950588327365896, + "learning_rate": 8.945087544331515e-06, + "loss": 2.2952, + "step": 2751 + }, + { + "epoch": 0.8127584170112226, + "grad_norm": 0.22217479318051345, + "learning_rate": 8.917804569979831e-06, + "loss": 2.3381, + "step": 2752 + }, + { + "epoch": 0.8130537507383343, + "grad_norm": 0.20423430365338735, + "learning_rate": 8.890559192097237e-06, + "loss": 2.2914, + "step": 2753 + }, + { + "epoch": 0.813349084465446, + "grad_norm": 0.19983972743730458, + "learning_rate": 8.863351435617395e-06, + "loss": 2.2923, + "step": 2754 + }, + { + "epoch": 0.8136444181925576, + "grad_norm": 0.20415042675559036, + "learning_rate": 8.836181325439557e-06, + "loss": 2.3071, + "step": 2755 + }, + { + "epoch": 0.8139397519196693, + "grad_norm": 0.199401005421144, + "learning_rate": 8.809048886428512e-06, + "loss": 2.2763, + "step": 2756 + }, + { + "epoch": 0.8142350856467808, + "grad_norm": 0.19808897665675448, + "learning_rate": 8.781954143414573e-06, + "loss": 2.3158, + "step": 2757 + }, + { + "epoch": 0.8145304193738925, + "grad_norm": 0.2026256328445803, + "learning_rate": 8.754897121193567e-06, + "loss": 2.2964, + "step": 2758 + }, + { + "epoch": 0.8148257531010041, + "grad_norm": 0.20995933177245274, + "learning_rate": 8.727877844526767e-06, + "loss": 2.3254, + "step": 2759 + }, + { + "epoch": 0.8151210868281158, + "grad_norm": 0.1987507221670646, + "learning_rate": 8.700896338140962e-06, + "loss": 2.3263, + "step": 2760 + }, + { + "epoch": 0.8154164205552275, + "grad_norm": 0.2003453785489906, + "learning_rate": 8.673952626728332e-06, + "loss": 2.3144, + "step": 2761 + }, + { + "epoch": 0.815711754282339, + "grad_norm": 0.2053822306226763, + "learning_rate": 8.647046734946473e-06, + "loss": 2.2174, + "step": 2762 + }, + { + "epoch": 0.8160070880094507, + "grad_norm": 0.20512577593201606, + "learning_rate": 8.62017868741839e-06, + "loss": 2.3086, + "step": 2763 + }, + { + "epoch": 0.8163024217365623, + "grad_norm": 0.20940793867217344, + "learning_rate": 8.593348508732412e-06, + "loss": 2.2058, + "step": 2764 + }, + { + "epoch": 0.816597755463674, + "grad_norm": 0.20233664297635384, + "learning_rate": 8.566556223442262e-06, + "loss": 2.3318, + "step": 2765 + }, + { + "epoch": 0.8168930891907856, + "grad_norm": 0.20277999041350264, + "learning_rate": 8.539801856066954e-06, + "loss": 2.3309, + "step": 2766 + }, + { + "epoch": 0.8171884229178972, + "grad_norm": 0.21017378730349198, + "learning_rate": 8.513085431090822e-06, + "loss": 2.246, + "step": 2767 + }, + { + "epoch": 0.8174837566450088, + "grad_norm": 0.21191826389911883, + "learning_rate": 8.486406972963457e-06, + "loss": 2.2864, + "step": 2768 + }, + { + "epoch": 0.8177790903721205, + "grad_norm": 0.20077287122113022, + "learning_rate": 8.45976650609972e-06, + "loss": 2.3277, + "step": 2769 + }, + { + "epoch": 0.8180744240992321, + "grad_norm": 0.20440002229899454, + "learning_rate": 8.433164054879689e-06, + "loss": 2.2983, + "step": 2770 + }, + { + "epoch": 0.8183697578263438, + "grad_norm": 0.2117649643771672, + "learning_rate": 8.406599643648672e-06, + "loss": 2.3046, + "step": 2771 + }, + { + "epoch": 0.8186650915534555, + "grad_norm": 0.20272691856299205, + "learning_rate": 8.38007329671714e-06, + "loss": 2.2914, + "step": 2772 + }, + { + "epoch": 0.818960425280567, + "grad_norm": 0.20304332521630009, + "learning_rate": 8.353585038360757e-06, + "loss": 2.4321, + "step": 2773 + }, + { + "epoch": 0.8192557590076787, + "grad_norm": 0.19961942322703877, + "learning_rate": 8.327134892820304e-06, + "loss": 2.2908, + "step": 2774 + }, + { + "epoch": 0.8195510927347903, + "grad_norm": 0.2022663178179719, + "learning_rate": 8.300722884301704e-06, + "loss": 2.3661, + "step": 2775 + }, + { + "epoch": 0.819846426461902, + "grad_norm": 0.20520654018380305, + "learning_rate": 8.274349036975965e-06, + "loss": 2.2864, + "step": 2776 + }, + { + "epoch": 0.8201417601890136, + "grad_norm": 0.20207272830317688, + "learning_rate": 8.248013374979174e-06, + "loss": 2.3357, + "step": 2777 + }, + { + "epoch": 0.8204370939161252, + "grad_norm": 0.19960063907744588, + "learning_rate": 8.221715922412487e-06, + "loss": 2.3483, + "step": 2778 + }, + { + "epoch": 0.8207324276432368, + "grad_norm": 0.2130735818583206, + "learning_rate": 8.195456703342064e-06, + "loss": 2.2983, + "step": 2779 + }, + { + "epoch": 0.8210277613703485, + "grad_norm": 0.1996763061167883, + "learning_rate": 8.169235741799092e-06, + "loss": 2.3015, + "step": 2780 + }, + { + "epoch": 0.8213230950974602, + "grad_norm": 0.20804666326485552, + "learning_rate": 8.143053061779759e-06, + "loss": 2.4057, + "step": 2781 + }, + { + "epoch": 0.8216184288245718, + "grad_norm": 0.20725445141745671, + "learning_rate": 8.11690868724519e-06, + "loss": 2.296, + "step": 2782 + }, + { + "epoch": 0.8219137625516834, + "grad_norm": 0.20257496639721265, + "learning_rate": 8.090802642121487e-06, + "loss": 2.314, + "step": 2783 + }, + { + "epoch": 0.822209096278795, + "grad_norm": 0.21199796922853675, + "learning_rate": 8.064734950299662e-06, + "loss": 2.2446, + "step": 2784 + }, + { + "epoch": 0.8225044300059067, + "grad_norm": 0.20987304799681317, + "learning_rate": 8.038705635635618e-06, + "loss": 2.3866, + "step": 2785 + }, + { + "epoch": 0.8227997637330183, + "grad_norm": 0.19484016525286998, + "learning_rate": 8.012714721950144e-06, + "loss": 2.3271, + "step": 2786 + }, + { + "epoch": 0.82309509746013, + "grad_norm": 0.21035423702144238, + "learning_rate": 7.986762233028883e-06, + "loss": 2.3816, + "step": 2787 + }, + { + "epoch": 0.8233904311872415, + "grad_norm": 0.2733364054581852, + "learning_rate": 7.960848192622327e-06, + "loss": 2.4093, + "step": 2788 + }, + { + "epoch": 0.8236857649143532, + "grad_norm": 0.20267718726820425, + "learning_rate": 7.934972624445763e-06, + "loss": 2.3371, + "step": 2789 + }, + { + "epoch": 0.8239810986414648, + "grad_norm": 0.20268589020665295, + "learning_rate": 7.909135552179287e-06, + "loss": 2.323, + "step": 2790 + }, + { + "epoch": 0.8242764323685765, + "grad_norm": 0.2015641748373547, + "learning_rate": 7.883336999467749e-06, + "loss": 2.3773, + "step": 2791 + }, + { + "epoch": 0.8245717660956882, + "grad_norm": 0.2114276401358768, + "learning_rate": 7.857576989920761e-06, + "loss": 2.3746, + "step": 2792 + }, + { + "epoch": 0.8248670998227997, + "grad_norm": 0.22050603151235712, + "learning_rate": 7.831855547112666e-06, + "loss": 2.4074, + "step": 2793 + }, + { + "epoch": 0.8251624335499114, + "grad_norm": 0.20547162405694544, + "learning_rate": 7.806172694582487e-06, + "loss": 2.3425, + "step": 2794 + }, + { + "epoch": 0.825457767277023, + "grad_norm": 0.19852856213698458, + "learning_rate": 7.780528455833946e-06, + "loss": 2.283, + "step": 2795 + }, + { + "epoch": 0.8257531010041347, + "grad_norm": 0.2021889853238671, + "learning_rate": 7.754922854335445e-06, + "loss": 2.3561, + "step": 2796 + }, + { + "epoch": 0.8260484347312463, + "grad_norm": 0.20154628355751408, + "learning_rate": 7.729355913519998e-06, + "loss": 2.2394, + "step": 2797 + }, + { + "epoch": 0.826343768458358, + "grad_norm": 0.21020249953333214, + "learning_rate": 7.703827656785262e-06, + "loss": 2.3031, + "step": 2798 + }, + { + "epoch": 0.8266391021854695, + "grad_norm": 0.2049858157990278, + "learning_rate": 7.678338107493476e-06, + "loss": 2.2585, + "step": 2799 + }, + { + "epoch": 0.8269344359125812, + "grad_norm": 0.20699198368463453, + "learning_rate": 7.652887288971462e-06, + "loss": 2.3701, + "step": 2800 + }, + { + "epoch": 0.8272297696396929, + "grad_norm": 0.2052680271644341, + "learning_rate": 7.627475224510599e-06, + "loss": 2.3461, + "step": 2801 + }, + { + "epoch": 0.8275251033668045, + "grad_norm": 0.23152281674681102, + "learning_rate": 7.602101937366801e-06, + "loss": 2.3493, + "step": 2802 + }, + { + "epoch": 0.8278204370939162, + "grad_norm": 0.19783481779347684, + "learning_rate": 7.576767450760486e-06, + "loss": 2.4095, + "step": 2803 + }, + { + "epoch": 0.8281157708210277, + "grad_norm": 0.22016026397792868, + "learning_rate": 7.551471787876574e-06, + "loss": 2.4839, + "step": 2804 + }, + { + "epoch": 0.8284111045481394, + "grad_norm": 0.23791647177157685, + "learning_rate": 7.526214971864448e-06, + "loss": 2.2877, + "step": 2805 + }, + { + "epoch": 0.828706438275251, + "grad_norm": 0.2756027324703693, + "learning_rate": 7.500997025837947e-06, + "loss": 2.2911, + "step": 2806 + }, + { + "epoch": 0.8290017720023627, + "grad_norm": 0.20167111050685566, + "learning_rate": 7.4758179728753374e-06, + "loss": 2.283, + "step": 2807 + }, + { + "epoch": 0.8292971057294743, + "grad_norm": 0.2291226109870777, + "learning_rate": 7.450677836019276e-06, + "loss": 2.3402, + "step": 2808 + }, + { + "epoch": 0.8295924394565859, + "grad_norm": 0.1929170964100983, + "learning_rate": 7.425576638276837e-06, + "loss": 2.3512, + "step": 2809 + }, + { + "epoch": 0.8298877731836976, + "grad_norm": 0.19837832356981713, + "learning_rate": 7.400514402619429e-06, + "loss": 2.296, + "step": 2810 + }, + { + "epoch": 0.8301831069108092, + "grad_norm": 0.2077125280167767, + "learning_rate": 7.375491151982822e-06, + "loss": 2.3584, + "step": 2811 + }, + { + "epoch": 0.8304784406379209, + "grad_norm": 0.21163946370383202, + "learning_rate": 7.350506909267096e-06, + "loss": 2.2595, + "step": 2812 + }, + { + "epoch": 0.8307737743650325, + "grad_norm": 0.21137444839879302, + "learning_rate": 7.3255616973366525e-06, + "loss": 2.3142, + "step": 2813 + }, + { + "epoch": 0.8310691080921441, + "grad_norm": 0.1964353519204519, + "learning_rate": 7.300655539020152e-06, + "loss": 2.3806, + "step": 2814 + }, + { + "epoch": 0.8313644418192557, + "grad_norm": 0.19654953219419466, + "learning_rate": 7.2757884571105274e-06, + "loss": 2.3407, + "step": 2815 + }, + { + "epoch": 0.8316597755463674, + "grad_norm": 0.21135633883167704, + "learning_rate": 7.2509604743649485e-06, + "loss": 2.3885, + "step": 2816 + }, + { + "epoch": 0.831955109273479, + "grad_norm": 0.2042561452173443, + "learning_rate": 7.226171613504806e-06, + "loss": 2.3764, + "step": 2817 + }, + { + "epoch": 0.8322504430005907, + "grad_norm": 0.2052329523467791, + "learning_rate": 7.201421897215677e-06, + "loss": 2.2922, + "step": 2818 + }, + { + "epoch": 0.8325457767277024, + "grad_norm": 0.2105833276817971, + "learning_rate": 7.17671134814733e-06, + "loss": 2.2407, + "step": 2819 + }, + { + "epoch": 0.8328411104548139, + "grad_norm": 0.21024579972914817, + "learning_rate": 7.1520399889136835e-06, + "loss": 2.3805, + "step": 2820 + }, + { + "epoch": 0.8331364441819256, + "grad_norm": 0.2150666987428589, + "learning_rate": 7.1274078420927815e-06, + "loss": 2.2789, + "step": 2821 + }, + { + "epoch": 0.8334317779090372, + "grad_norm": 0.20013212742047715, + "learning_rate": 7.10281493022682e-06, + "loss": 2.3409, + "step": 2822 + }, + { + "epoch": 0.8337271116361489, + "grad_norm": 0.20106797173880572, + "learning_rate": 7.078261275822018e-06, + "loss": 2.301, + "step": 2823 + }, + { + "epoch": 0.8340224453632605, + "grad_norm": 0.20265022140924258, + "learning_rate": 7.05374690134874e-06, + "loss": 2.2405, + "step": 2824 + }, + { + "epoch": 0.8343177790903721, + "grad_norm": 0.20450139466756811, + "learning_rate": 7.029271829241363e-06, + "loss": 2.2645, + "step": 2825 + }, + { + "epoch": 0.8346131128174837, + "grad_norm": 0.20548755102840932, + "learning_rate": 7.004836081898314e-06, + "loss": 2.3216, + "step": 2826 + }, + { + "epoch": 0.8349084465445954, + "grad_norm": 0.20164698449222793, + "learning_rate": 6.980439681682016e-06, + "loss": 2.3916, + "step": 2827 + }, + { + "epoch": 0.835203780271707, + "grad_norm": 0.2057380865473075, + "learning_rate": 6.956082650918905e-06, + "loss": 2.3586, + "step": 2828 + }, + { + "epoch": 0.8354991139988187, + "grad_norm": 0.2020021268471793, + "learning_rate": 6.931765011899372e-06, + "loss": 2.3865, + "step": 2829 + }, + { + "epoch": 0.8357944477259303, + "grad_norm": 0.20094024878797548, + "learning_rate": 6.9074867868777596e-06, + "loss": 2.3087, + "step": 2830 + }, + { + "epoch": 0.8360897814530419, + "grad_norm": 0.19473755134943257, + "learning_rate": 6.8832479980723394e-06, + "loss": 2.1433, + "step": 2831 + }, + { + "epoch": 0.8363851151801536, + "grad_norm": 0.2028653429251387, + "learning_rate": 6.8590486676653075e-06, + "loss": 2.3296, + "step": 2832 + }, + { + "epoch": 0.8366804489072652, + "grad_norm": 0.20252014851703695, + "learning_rate": 6.834888817802731e-06, + "loss": 2.3665, + "step": 2833 + }, + { + "epoch": 0.8369757826343769, + "grad_norm": 0.20112140726160657, + "learning_rate": 6.810768470594553e-06, + "loss": 2.2778, + "step": 2834 + }, + { + "epoch": 0.8372711163614884, + "grad_norm": 0.19679734386344322, + "learning_rate": 6.7866876481145754e-06, + "loss": 2.2746, + "step": 2835 + }, + { + "epoch": 0.8375664500886001, + "grad_norm": 0.19518175517375996, + "learning_rate": 6.762646372400411e-06, + "loss": 2.3117, + "step": 2836 + }, + { + "epoch": 0.8378617838157117, + "grad_norm": 0.2077930129386682, + "learning_rate": 6.738644665453492e-06, + "loss": 2.3147, + "step": 2837 + }, + { + "epoch": 0.8381571175428234, + "grad_norm": 0.19441053022839871, + "learning_rate": 6.714682549239043e-06, + "loss": 2.2834, + "step": 2838 + }, + { + "epoch": 0.8384524512699351, + "grad_norm": 0.20146199019570465, + "learning_rate": 6.690760045686051e-06, + "loss": 2.3027, + "step": 2839 + }, + { + "epoch": 0.8387477849970466, + "grad_norm": 0.20699159674300108, + "learning_rate": 6.666877176687248e-06, + "loss": 2.324, + "step": 2840 + }, + { + "epoch": 0.8390431187241583, + "grad_norm": 0.19871048510503517, + "learning_rate": 6.643033964099099e-06, + "loss": 2.3838, + "step": 2841 + }, + { + "epoch": 0.8393384524512699, + "grad_norm": 0.20268741570964122, + "learning_rate": 6.619230429741779e-06, + "loss": 2.2684, + "step": 2842 + }, + { + "epoch": 0.8396337861783816, + "grad_norm": 0.20764890340606895, + "learning_rate": 6.595466595399152e-06, + "loss": 2.345, + "step": 2843 + }, + { + "epoch": 0.8399291199054932, + "grad_norm": 0.20915681195664498, + "learning_rate": 6.571742482818738e-06, + "loss": 2.3877, + "step": 2844 + }, + { + "epoch": 0.8402244536326049, + "grad_norm": 0.20040618211467573, + "learning_rate": 6.5480581137117245e-06, + "loss": 2.3786, + "step": 2845 + }, + { + "epoch": 0.8405197873597164, + "grad_norm": 0.19205644682789993, + "learning_rate": 6.5244135097529135e-06, + "loss": 2.2678, + "step": 2846 + }, + { + "epoch": 0.8408151210868281, + "grad_norm": 0.21361699558951605, + "learning_rate": 6.500808692580729e-06, + "loss": 2.2592, + "step": 2847 + }, + { + "epoch": 0.8411104548139398, + "grad_norm": 0.20245872861410513, + "learning_rate": 6.477243683797163e-06, + "loss": 2.3225, + "step": 2848 + }, + { + "epoch": 0.8414057885410514, + "grad_norm": 0.20029300074063272, + "learning_rate": 6.453718504967799e-06, + "loss": 2.331, + "step": 2849 + }, + { + "epoch": 0.8417011222681631, + "grad_norm": 0.19572321623046482, + "learning_rate": 6.430233177621753e-06, + "loss": 2.31, + "step": 2850 + }, + { + "epoch": 0.8419964559952746, + "grad_norm": 0.2068173545147758, + "learning_rate": 6.406787723251695e-06, + "loss": 2.2348, + "step": 2851 + }, + { + "epoch": 0.8422917897223863, + "grad_norm": 0.21478246334892653, + "learning_rate": 6.383382163313762e-06, + "loss": 2.3266, + "step": 2852 + }, + { + "epoch": 0.8425871234494979, + "grad_norm": 0.21086628912790564, + "learning_rate": 6.3600165192276236e-06, + "loss": 2.3195, + "step": 2853 + }, + { + "epoch": 0.8428824571766096, + "grad_norm": 0.20048398860120875, + "learning_rate": 6.336690812376405e-06, + "loss": 2.2735, + "step": 2854 + }, + { + "epoch": 0.8431777909037212, + "grad_norm": 0.20180353974055865, + "learning_rate": 6.313405064106681e-06, + "loss": 2.3208, + "step": 2855 + }, + { + "epoch": 0.8434731246308328, + "grad_norm": 0.19807422268544167, + "learning_rate": 6.290159295728459e-06, + "loss": 2.2728, + "step": 2856 + }, + { + "epoch": 0.8437684583579445, + "grad_norm": 0.19585179261365213, + "learning_rate": 6.266953528515157e-06, + "loss": 2.3262, + "step": 2857 + }, + { + "epoch": 0.8440637920850561, + "grad_norm": 0.22276599588662296, + "learning_rate": 6.243787783703592e-06, + "loss": 2.4418, + "step": 2858 + }, + { + "epoch": 0.8443591258121678, + "grad_norm": 0.20096815775655144, + "learning_rate": 6.220662082493945e-06, + "loss": 2.2998, + "step": 2859 + }, + { + "epoch": 0.8446544595392794, + "grad_norm": 0.20937378868777476, + "learning_rate": 6.197576446049763e-06, + "loss": 2.2901, + "step": 2860 + }, + { + "epoch": 0.844949793266391, + "grad_norm": 0.19996123236194593, + "learning_rate": 6.1745308954979206e-06, + "loss": 2.3949, + "step": 2861 + }, + { + "epoch": 0.8452451269935026, + "grad_norm": 0.2075057350509402, + "learning_rate": 6.151525451928597e-06, + "loss": 2.3202, + "step": 2862 + }, + { + "epoch": 0.8455404607206143, + "grad_norm": 0.20568266274586805, + "learning_rate": 6.128560136395295e-06, + "loss": 2.3225, + "step": 2863 + }, + { + "epoch": 0.8458357944477259, + "grad_norm": 0.2139244264273997, + "learning_rate": 6.105634969914775e-06, + "loss": 2.3223, + "step": 2864 + }, + { + "epoch": 0.8461311281748376, + "grad_norm": 0.1993169075241402, + "learning_rate": 6.08274997346705e-06, + "loss": 2.3005, + "step": 2865 + }, + { + "epoch": 0.8464264619019493, + "grad_norm": 0.20274540387916692, + "learning_rate": 6.059905167995394e-06, + "loss": 2.3095, + "step": 2866 + }, + { + "epoch": 0.8467217956290608, + "grad_norm": 0.2051717816797767, + "learning_rate": 6.037100574406268e-06, + "loss": 2.3984, + "step": 2867 + }, + { + "epoch": 0.8470171293561725, + "grad_norm": 0.21101421268164222, + "learning_rate": 6.0143362135693545e-06, + "loss": 2.4017, + "step": 2868 + }, + { + "epoch": 0.8473124630832841, + "grad_norm": 0.20450535670798525, + "learning_rate": 5.991612106317512e-06, + "loss": 2.3571, + "step": 2869 + }, + { + "epoch": 0.8476077968103958, + "grad_norm": 0.20323518368898927, + "learning_rate": 5.968928273446767e-06, + "loss": 2.3297, + "step": 2870 + }, + { + "epoch": 0.8479031305375074, + "grad_norm": 0.2106224285444184, + "learning_rate": 5.9462847357162785e-06, + "loss": 2.3161, + "step": 2871 + }, + { + "epoch": 0.848198464264619, + "grad_norm": 0.1996005924181422, + "learning_rate": 5.923681513848334e-06, + "loss": 2.3174, + "step": 2872 + }, + { + "epoch": 0.8484937979917306, + "grad_norm": 0.20144532886243943, + "learning_rate": 5.9011186285283295e-06, + "loss": 2.3658, + "step": 2873 + }, + { + "epoch": 0.8487891317188423, + "grad_norm": 0.20533638995165412, + "learning_rate": 5.878596100404743e-06, + "loss": 2.3064, + "step": 2874 + }, + { + "epoch": 0.849084465445954, + "grad_norm": 0.19774882934081536, + "learning_rate": 5.856113950089115e-06, + "loss": 2.3031, + "step": 2875 + }, + { + "epoch": 0.8493797991730656, + "grad_norm": 0.20771041169524893, + "learning_rate": 5.833672198156043e-06, + "loss": 2.2644, + "step": 2876 + }, + { + "epoch": 0.8496751329001772, + "grad_norm": 0.19678593846333806, + "learning_rate": 5.811270865143148e-06, + "loss": 2.28, + "step": 2877 + }, + { + "epoch": 0.8499704666272888, + "grad_norm": 0.19571477470825382, + "learning_rate": 5.788909971551066e-06, + "loss": 2.2922, + "step": 2878 + }, + { + "epoch": 0.8502658003544005, + "grad_norm": 0.19889938711236893, + "learning_rate": 5.76658953784342e-06, + "loss": 2.2718, + "step": 2879 + }, + { + "epoch": 0.8505611340815121, + "grad_norm": 0.20382973488654052, + "learning_rate": 5.744309584446811e-06, + "loss": 2.246, + "step": 2880 + }, + { + "epoch": 0.8508564678086238, + "grad_norm": 0.20062090597699314, + "learning_rate": 5.722070131750796e-06, + "loss": 2.2434, + "step": 2881 + }, + { + "epoch": 0.8511518015357353, + "grad_norm": 0.259489242259063, + "learning_rate": 5.6998712001078425e-06, + "loss": 2.3658, + "step": 2882 + }, + { + "epoch": 0.851447135262847, + "grad_norm": 0.1963838376934582, + "learning_rate": 5.677712809833375e-06, + "loss": 2.3131, + "step": 2883 + }, + { + "epoch": 0.8517424689899586, + "grad_norm": 0.20898857290739895, + "learning_rate": 5.655594981205687e-06, + "loss": 2.2829, + "step": 2884 + }, + { + "epoch": 0.8520378027170703, + "grad_norm": 0.19524358487244495, + "learning_rate": 5.633517734465959e-06, + "loss": 2.2703, + "step": 2885 + }, + { + "epoch": 0.852333136444182, + "grad_norm": 0.20550865213773348, + "learning_rate": 5.611481089818255e-06, + "loss": 2.312, + "step": 2886 + }, + { + "epoch": 0.8526284701712935, + "grad_norm": 0.1968184586728648, + "learning_rate": 5.5894850674294454e-06, + "loss": 2.2478, + "step": 2887 + }, + { + "epoch": 0.8529238038984052, + "grad_norm": 0.21230263836248137, + "learning_rate": 5.567529687429257e-06, + "loss": 2.4372, + "step": 2888 + }, + { + "epoch": 0.8532191376255168, + "grad_norm": 0.2077306234789241, + "learning_rate": 5.54561496991019e-06, + "loss": 2.346, + "step": 2889 + }, + { + "epoch": 0.8535144713526285, + "grad_norm": 0.2060955332548025, + "learning_rate": 5.523740934927568e-06, + "loss": 2.3012, + "step": 2890 + }, + { + "epoch": 0.8538098050797401, + "grad_norm": 0.19893375281502973, + "learning_rate": 5.501907602499456e-06, + "loss": 2.3079, + "step": 2891 + }, + { + "epoch": 0.8541051388068518, + "grad_norm": 0.20310638415789425, + "learning_rate": 5.480114992606683e-06, + "loss": 2.2958, + "step": 2892 + }, + { + "epoch": 0.8544004725339633, + "grad_norm": 0.2070883546203821, + "learning_rate": 5.458363125192806e-06, + "loss": 2.3554, + "step": 2893 + }, + { + "epoch": 0.854695806261075, + "grad_norm": 0.2135119165460688, + "learning_rate": 5.436652020164096e-06, + "loss": 2.2879, + "step": 2894 + }, + { + "epoch": 0.8549911399881867, + "grad_norm": 0.20215627867008087, + "learning_rate": 5.414981697389543e-06, + "loss": 2.2702, + "step": 2895 + }, + { + "epoch": 0.8552864737152983, + "grad_norm": 0.21019592442447366, + "learning_rate": 5.3933521767007665e-06, + "loss": 2.3291, + "step": 2896 + }, + { + "epoch": 0.85558180744241, + "grad_norm": 0.20033108828823748, + "learning_rate": 5.371763477892084e-06, + "loss": 2.2796, + "step": 2897 + }, + { + "epoch": 0.8558771411695215, + "grad_norm": 0.20060728529682487, + "learning_rate": 5.350215620720444e-06, + "loss": 2.3443, + "step": 2898 + }, + { + "epoch": 0.8561724748966332, + "grad_norm": 0.20791308614620346, + "learning_rate": 5.328708624905421e-06, + "loss": 2.2713, + "step": 2899 + }, + { + "epoch": 0.8564678086237448, + "grad_norm": 0.20009262428640248, + "learning_rate": 5.307242510129195e-06, + "loss": 2.3364, + "step": 2900 + }, + { + "epoch": 0.8567631423508565, + "grad_norm": 0.20132367415979846, + "learning_rate": 5.285817296036533e-06, + "loss": 2.245, + "step": 2901 + }, + { + "epoch": 0.8570584760779681, + "grad_norm": 0.20488520130841315, + "learning_rate": 5.26443300223477e-06, + "loss": 2.3236, + "step": 2902 + }, + { + "epoch": 0.8573538098050797, + "grad_norm": 0.21681332740559833, + "learning_rate": 5.243089648293786e-06, + "loss": 2.4268, + "step": 2903 + }, + { + "epoch": 0.8576491435321913, + "grad_norm": 0.19676138384820954, + "learning_rate": 5.221787253746013e-06, + "loss": 2.2512, + "step": 2904 + }, + { + "epoch": 0.857944477259303, + "grad_norm": 0.20164436112215994, + "learning_rate": 5.200525838086378e-06, + "loss": 2.2512, + "step": 2905 + }, + { + "epoch": 0.8582398109864147, + "grad_norm": 0.19816982658691257, + "learning_rate": 5.179305420772329e-06, + "loss": 2.2601, + "step": 2906 + }, + { + "epoch": 0.8585351447135263, + "grad_norm": 0.20383079491968825, + "learning_rate": 5.158126021223769e-06, + "loss": 2.2782, + "step": 2907 + }, + { + "epoch": 0.8588304784406379, + "grad_norm": 0.20444718442280957, + "learning_rate": 5.136987658823084e-06, + "loss": 2.3211, + "step": 2908 + }, + { + "epoch": 0.8591258121677495, + "grad_norm": 0.20746617248285482, + "learning_rate": 5.115890352915092e-06, + "loss": 2.2259, + "step": 2909 + }, + { + "epoch": 0.8594211458948612, + "grad_norm": 0.2076441222525065, + "learning_rate": 5.094834122807046e-06, + "loss": 2.3364, + "step": 2910 + }, + { + "epoch": 0.8597164796219728, + "grad_norm": 0.20178687891642139, + "learning_rate": 5.073818987768603e-06, + "loss": 2.2969, + "step": 2911 + }, + { + "epoch": 0.8600118133490845, + "grad_norm": 0.2028773154856977, + "learning_rate": 5.0528449670318166e-06, + "loss": 2.3351, + "step": 2912 + }, + { + "epoch": 0.8603071470761962, + "grad_norm": 0.2103356145482155, + "learning_rate": 5.031912079791107e-06, + "loss": 2.3647, + "step": 2913 + }, + { + "epoch": 0.8606024808033077, + "grad_norm": 0.1961994346088707, + "learning_rate": 5.01102034520326e-06, + "loss": 2.2798, + "step": 2914 + }, + { + "epoch": 0.8608978145304194, + "grad_norm": 0.20086741324164248, + "learning_rate": 4.990169782387394e-06, + "loss": 2.3078, + "step": 2915 + }, + { + "epoch": 0.861193148257531, + "grad_norm": 0.1966128928172885, + "learning_rate": 4.969360410424956e-06, + "loss": 2.3512, + "step": 2916 + }, + { + "epoch": 0.8614884819846427, + "grad_norm": 0.19650708137497222, + "learning_rate": 4.94859224835969e-06, + "loss": 2.2913, + "step": 2917 + }, + { + "epoch": 0.8617838157117543, + "grad_norm": 0.20135156698533677, + "learning_rate": 4.927865315197622e-06, + "loss": 2.3219, + "step": 2918 + }, + { + "epoch": 0.8620791494388659, + "grad_norm": 0.2078421833235662, + "learning_rate": 4.907179629907066e-06, + "loss": 2.3208, + "step": 2919 + }, + { + "epoch": 0.8623744831659775, + "grad_norm": 0.2014561006365756, + "learning_rate": 4.8865352114185635e-06, + "loss": 2.3229, + "step": 2920 + }, + { + "epoch": 0.8626698168930892, + "grad_norm": 0.20152438860626476, + "learning_rate": 4.865932078624919e-06, + "loss": 2.2594, + "step": 2921 + }, + { + "epoch": 0.8629651506202008, + "grad_norm": 0.2052217253237973, + "learning_rate": 4.8453702503811205e-06, + "loss": 2.3027, + "step": 2922 + }, + { + "epoch": 0.8632604843473125, + "grad_norm": 0.20458920644874823, + "learning_rate": 4.82484974550439e-06, + "loss": 2.3849, + "step": 2923 + }, + { + "epoch": 0.863555818074424, + "grad_norm": 0.19519459026356498, + "learning_rate": 4.804370582774109e-06, + "loss": 2.3255, + "step": 2924 + }, + { + "epoch": 0.8638511518015357, + "grad_norm": 0.1955494016148057, + "learning_rate": 4.783932780931844e-06, + "loss": 2.2618, + "step": 2925 + }, + { + "epoch": 0.8641464855286474, + "grad_norm": 0.19707662196120396, + "learning_rate": 4.76353635868127e-06, + "loss": 2.2476, + "step": 2926 + }, + { + "epoch": 0.864441819255759, + "grad_norm": 0.21996031301603175, + "learning_rate": 4.743181334688246e-06, + "loss": 2.3456, + "step": 2927 + }, + { + "epoch": 0.8647371529828707, + "grad_norm": 0.20970179511915463, + "learning_rate": 4.72286772758071e-06, + "loss": 2.4546, + "step": 2928 + }, + { + "epoch": 0.8650324867099822, + "grad_norm": 0.20180209062654025, + "learning_rate": 4.702595555948713e-06, + "loss": 2.2711, + "step": 2929 + }, + { + "epoch": 0.8653278204370939, + "grad_norm": 0.2066472288637689, + "learning_rate": 4.682364838344377e-06, + "loss": 2.3366, + "step": 2930 + }, + { + "epoch": 0.8656231541642055, + "grad_norm": 0.20383121527613415, + "learning_rate": 4.6621755932818975e-06, + "loss": 2.305, + "step": 2931 + }, + { + "epoch": 0.8659184878913172, + "grad_norm": 0.20255744026700911, + "learning_rate": 4.642027839237501e-06, + "loss": 2.3303, + "step": 2932 + }, + { + "epoch": 0.8662138216184289, + "grad_norm": 0.2081797252051876, + "learning_rate": 4.621921594649464e-06, + "loss": 2.2424, + "step": 2933 + }, + { + "epoch": 0.8665091553455405, + "grad_norm": 0.19835709309476746, + "learning_rate": 4.601856877918053e-06, + "loss": 2.3095, + "step": 2934 + }, + { + "epoch": 0.8668044890726521, + "grad_norm": 0.19833830208792003, + "learning_rate": 4.581833707405553e-06, + "loss": 2.3309, + "step": 2935 + }, + { + "epoch": 0.8670998227997637, + "grad_norm": 0.2053970128281633, + "learning_rate": 4.561852101436209e-06, + "loss": 2.2693, + "step": 2936 + }, + { + "epoch": 0.8673951565268754, + "grad_norm": 0.2287421937298426, + "learning_rate": 4.541912078296234e-06, + "loss": 2.1972, + "step": 2937 + }, + { + "epoch": 0.867690490253987, + "grad_norm": 0.20397663399812968, + "learning_rate": 4.522013656233787e-06, + "loss": 2.3134, + "step": 2938 + }, + { + "epoch": 0.8679858239810987, + "grad_norm": 0.19799863953977942, + "learning_rate": 4.5021568534589664e-06, + "loss": 2.3059, + "step": 2939 + }, + { + "epoch": 0.8682811577082102, + "grad_norm": 0.20486986393917353, + "learning_rate": 4.48234168814376e-06, + "loss": 2.3065, + "step": 2940 + }, + { + "epoch": 0.8685764914353219, + "grad_norm": 0.20828674312024328, + "learning_rate": 4.462568178422072e-06, + "loss": 2.3181, + "step": 2941 + }, + { + "epoch": 0.8688718251624336, + "grad_norm": 0.2007210689555041, + "learning_rate": 4.442836342389672e-06, + "loss": 2.2951, + "step": 2942 + }, + { + "epoch": 0.8691671588895452, + "grad_norm": 0.20180771165402508, + "learning_rate": 4.4231461981042e-06, + "loss": 2.3493, + "step": 2943 + }, + { + "epoch": 0.8694624926166569, + "grad_norm": 0.1955599361908739, + "learning_rate": 4.4034977635851345e-06, + "loss": 2.3153, + "step": 2944 + }, + { + "epoch": 0.8697578263437684, + "grad_norm": 0.19919593530801338, + "learning_rate": 4.383891056813794e-06, + "loss": 2.2864, + "step": 2945 + }, + { + "epoch": 0.8700531600708801, + "grad_norm": 0.20162306807066024, + "learning_rate": 4.364326095733301e-06, + "loss": 2.3646, + "step": 2946 + }, + { + "epoch": 0.8703484937979917, + "grad_norm": 0.19537937926609197, + "learning_rate": 4.344802898248573e-06, + "loss": 2.2361, + "step": 2947 + }, + { + "epoch": 0.8706438275251034, + "grad_norm": 0.2008504892889088, + "learning_rate": 4.325321482226307e-06, + "loss": 2.3462, + "step": 2948 + }, + { + "epoch": 0.870939161252215, + "grad_norm": 0.19537472687309831, + "learning_rate": 4.305881865494976e-06, + "loss": 2.161, + "step": 2949 + }, + { + "epoch": 0.8712344949793266, + "grad_norm": 0.20136924431802122, + "learning_rate": 4.286484065844792e-06, + "loss": 2.3456, + "step": 2950 + }, + { + "epoch": 0.8715298287064382, + "grad_norm": 0.19218438463864973, + "learning_rate": 4.26712810102769e-06, + "loss": 2.2822, + "step": 2951 + }, + { + "epoch": 0.8718251624335499, + "grad_norm": 0.19753310510071756, + "learning_rate": 4.247813988757327e-06, + "loss": 2.299, + "step": 2952 + }, + { + "epoch": 0.8721204961606616, + "grad_norm": 0.20435978557231302, + "learning_rate": 4.228541746709069e-06, + "loss": 2.3307, + "step": 2953 + }, + { + "epoch": 0.8724158298877732, + "grad_norm": 0.20380115760503936, + "learning_rate": 4.209311392519955e-06, + "loss": 2.2836, + "step": 2954 + }, + { + "epoch": 0.8727111636148848, + "grad_norm": 0.20117701366490973, + "learning_rate": 4.190122943788671e-06, + "loss": 2.2996, + "step": 2955 + }, + { + "epoch": 0.8730064973419964, + "grad_norm": 0.21840925585268978, + "learning_rate": 4.170976418075584e-06, + "loss": 2.3926, + "step": 2956 + }, + { + "epoch": 0.8733018310691081, + "grad_norm": 0.20111962908466122, + "learning_rate": 4.151871832902682e-06, + "loss": 2.2764, + "step": 2957 + }, + { + "epoch": 0.8735971647962197, + "grad_norm": 0.21517319995487985, + "learning_rate": 4.132809205753574e-06, + "loss": 2.3116, + "step": 2958 + }, + { + "epoch": 0.8738924985233314, + "grad_norm": 0.20109640020418854, + "learning_rate": 4.113788554073467e-06, + "loss": 2.2785, + "step": 2959 + }, + { + "epoch": 0.874187832250443, + "grad_norm": 0.20056835866414854, + "learning_rate": 4.0948098952691474e-06, + "loss": 2.3148, + "step": 2960 + }, + { + "epoch": 0.8744831659775546, + "grad_norm": 0.7728238609580261, + "learning_rate": 4.0758732467089945e-06, + "loss": 2.3264, + "step": 2961 + }, + { + "epoch": 0.8747784997046663, + "grad_norm": 0.20907025043450564, + "learning_rate": 4.056978625722918e-06, + "loss": 2.2241, + "step": 2962 + }, + { + "epoch": 0.8750738334317779, + "grad_norm": 0.20318554012734455, + "learning_rate": 4.038126049602381e-06, + "loss": 2.3657, + "step": 2963 + }, + { + "epoch": 0.8753691671588896, + "grad_norm": 0.20436440659112604, + "learning_rate": 4.019315535600349e-06, + "loss": 2.3695, + "step": 2964 + }, + { + "epoch": 0.8756645008860012, + "grad_norm": 0.20116277322271195, + "learning_rate": 4.000547100931323e-06, + "loss": 2.2737, + "step": 2965 + }, + { + "epoch": 0.8759598346131128, + "grad_norm": 0.2033408008992971, + "learning_rate": 3.981820762771282e-06, + "loss": 2.307, + "step": 2966 + }, + { + "epoch": 0.8762551683402244, + "grad_norm": 0.22117687960790045, + "learning_rate": 3.963136538257678e-06, + "loss": 2.4374, + "step": 2967 + }, + { + "epoch": 0.8765505020673361, + "grad_norm": 0.22119106099015462, + "learning_rate": 3.944494444489416e-06, + "loss": 2.4215, + "step": 2968 + }, + { + "epoch": 0.8768458357944477, + "grad_norm": 0.20092846672137857, + "learning_rate": 3.9258944985268716e-06, + "loss": 2.3367, + "step": 2969 + }, + { + "epoch": 0.8771411695215594, + "grad_norm": 0.19440935176318144, + "learning_rate": 3.907336717391802e-06, + "loss": 2.2513, + "step": 2970 + }, + { + "epoch": 0.877436503248671, + "grad_norm": 0.21165079032562045, + "learning_rate": 3.888821118067415e-06, + "loss": 2.4354, + "step": 2971 + }, + { + "epoch": 0.8777318369757826, + "grad_norm": 0.1933040347041837, + "learning_rate": 3.870347717498313e-06, + "loss": 2.2817, + "step": 2972 + }, + { + "epoch": 0.8780271707028943, + "grad_norm": 0.21483662333372577, + "learning_rate": 3.851916532590466e-06, + "loss": 2.3554, + "step": 2973 + }, + { + "epoch": 0.8783225044300059, + "grad_norm": 0.2826333068384435, + "learning_rate": 3.833527580211216e-06, + "loss": 2.2779, + "step": 2974 + }, + { + "epoch": 0.8786178381571176, + "grad_norm": 0.20125600296450302, + "learning_rate": 3.815180877189261e-06, + "loss": 2.2787, + "step": 2975 + }, + { + "epoch": 0.8789131718842291, + "grad_norm": 0.19657690853563908, + "learning_rate": 3.7968764403146284e-06, + "loss": 2.3011, + "step": 2976 + }, + { + "epoch": 0.8792085056113408, + "grad_norm": 0.21489855327416557, + "learning_rate": 3.7786142863386663e-06, + "loss": 2.2914, + "step": 2977 + }, + { + "epoch": 0.8795038393384524, + "grad_norm": 0.20386349788907823, + "learning_rate": 3.760394431974035e-06, + "loss": 2.3879, + "step": 2978 + }, + { + "epoch": 0.8797991730655641, + "grad_norm": 0.20273346946675777, + "learning_rate": 3.742216893894662e-06, + "loss": 2.3126, + "step": 2979 + }, + { + "epoch": 0.8800945067926758, + "grad_norm": 0.19725734391157373, + "learning_rate": 3.724081688735781e-06, + "loss": 2.3226, + "step": 2980 + }, + { + "epoch": 0.8803898405197874, + "grad_norm": 0.20220274817618292, + "learning_rate": 3.7059888330938595e-06, + "loss": 2.2905, + "step": 2981 + }, + { + "epoch": 0.880685174246899, + "grad_norm": 0.19935176186279346, + "learning_rate": 3.6879383435266255e-06, + "loss": 2.3246, + "step": 2982 + }, + { + "epoch": 0.8809805079740106, + "grad_norm": 0.2063397429032539, + "learning_rate": 3.669930236553021e-06, + "loss": 2.2254, + "step": 2983 + }, + { + "epoch": 0.8812758417011223, + "grad_norm": 0.21890330051937384, + "learning_rate": 3.651964528653207e-06, + "loss": 2.3215, + "step": 2984 + }, + { + "epoch": 0.8815711754282339, + "grad_norm": 0.21080167247280215, + "learning_rate": 3.634041236268543e-06, + "loss": 2.2224, + "step": 2985 + }, + { + "epoch": 0.8818665091553456, + "grad_norm": 0.19502522322247448, + "learning_rate": 3.616160375801575e-06, + "loss": 2.2724, + "step": 2986 + }, + { + "epoch": 0.8821618428824571, + "grad_norm": 0.20607488994183576, + "learning_rate": 3.598321963616019e-06, + "loss": 2.3261, + "step": 2987 + }, + { + "epoch": 0.8824571766095688, + "grad_norm": 0.2045883415367312, + "learning_rate": 3.5805260160367225e-06, + "loss": 2.296, + "step": 2988 + }, + { + "epoch": 0.8827525103366805, + "grad_norm": 0.21091227027337053, + "learning_rate": 3.562772549349719e-06, + "loss": 2.2976, + "step": 2989 + }, + { + "epoch": 0.8830478440637921, + "grad_norm": 0.20797960331867882, + "learning_rate": 3.545061579802117e-06, + "loss": 2.3893, + "step": 2990 + }, + { + "epoch": 0.8833431777909038, + "grad_norm": 0.20439787168001292, + "learning_rate": 3.527393123602152e-06, + "loss": 2.3503, + "step": 2991 + }, + { + "epoch": 0.8836385115180153, + "grad_norm": 0.2103817293855383, + "learning_rate": 3.509767196919167e-06, + "loss": 2.3123, + "step": 2992 + }, + { + "epoch": 0.883933845245127, + "grad_norm": 0.2066265234795846, + "learning_rate": 3.492183815883554e-06, + "loss": 2.3796, + "step": 2993 + }, + { + "epoch": 0.8842291789722386, + "grad_norm": 0.19995869865736512, + "learning_rate": 3.4746429965867967e-06, + "loss": 2.3492, + "step": 2994 + }, + { + "epoch": 0.8845245126993503, + "grad_norm": 0.2033125946456309, + "learning_rate": 3.4571447550814153e-06, + "loss": 2.2628, + "step": 2995 + }, + { + "epoch": 0.8848198464264619, + "grad_norm": 0.2065840696018061, + "learning_rate": 3.439689107380967e-06, + "loss": 2.344, + "step": 2996 + }, + { + "epoch": 0.8851151801535735, + "grad_norm": 0.22150544532245892, + "learning_rate": 3.422276069460029e-06, + "loss": 2.316, + "step": 2997 + }, + { + "epoch": 0.8854105138806851, + "grad_norm": 0.19749492122662965, + "learning_rate": 3.4049056572541928e-06, + "loss": 2.1275, + "step": 2998 + }, + { + "epoch": 0.8857058476077968, + "grad_norm": 0.19372821077757152, + "learning_rate": 3.387577886660015e-06, + "loss": 2.3008, + "step": 2999 + }, + { + "epoch": 0.8860011813349085, + "grad_norm": 0.1973537399348856, + "learning_rate": 3.370292773535061e-06, + "loss": 2.2481, + "step": 3000 + }, + { + "epoch": 0.8862965150620201, + "grad_norm": 0.22379966408429064, + "learning_rate": 3.3530503336978326e-06, + "loss": 2.2312, + "step": 3001 + }, + { + "epoch": 0.8865918487891317, + "grad_norm": 0.21260709868643904, + "learning_rate": 3.3358505829277965e-06, + "loss": 2.3025, + "step": 3002 + }, + { + "epoch": 0.8868871825162433, + "grad_norm": 0.19568027203009156, + "learning_rate": 3.3186935369653392e-06, + "loss": 2.3797, + "step": 3003 + }, + { + "epoch": 0.887182516243355, + "grad_norm": 0.19955235952854797, + "learning_rate": 3.301579211511785e-06, + "loss": 2.3076, + "step": 3004 + }, + { + "epoch": 0.8874778499704666, + "grad_norm": 0.20362538363489083, + "learning_rate": 3.284507622229338e-06, + "loss": 2.3233, + "step": 3005 + }, + { + "epoch": 0.8877731836975783, + "grad_norm": 0.20854504477413113, + "learning_rate": 3.267478784741107e-06, + "loss": 2.3394, + "step": 3006 + }, + { + "epoch": 0.88806851742469, + "grad_norm": 0.20288567015157802, + "learning_rate": 3.250492714631076e-06, + "loss": 2.3469, + "step": 3007 + }, + { + "epoch": 0.8883638511518015, + "grad_norm": 0.19626259506713575, + "learning_rate": 3.2335494274440823e-06, + "loss": 2.3792, + "step": 3008 + }, + { + "epoch": 0.8886591848789132, + "grad_norm": 0.21139858326398855, + "learning_rate": 3.216648938685823e-06, + "loss": 2.2783, + "step": 3009 + }, + { + "epoch": 0.8889545186060248, + "grad_norm": 0.19588643623931265, + "learning_rate": 3.19979126382281e-06, + "loss": 2.3251, + "step": 3010 + }, + { + "epoch": 0.8892498523331365, + "grad_norm": 0.1963952379925473, + "learning_rate": 3.182976418282396e-06, + "loss": 2.2959, + "step": 3011 + }, + { + "epoch": 0.8895451860602481, + "grad_norm": 0.20347529247793475, + "learning_rate": 3.1662044174527227e-06, + "loss": 2.384, + "step": 3012 + }, + { + "epoch": 0.8898405197873597, + "grad_norm": 0.19875000053981565, + "learning_rate": 3.1494752766827227e-06, + "loss": 2.2595, + "step": 3013 + }, + { + "epoch": 0.8901358535144713, + "grad_norm": 0.1984832125298411, + "learning_rate": 3.132789011282117e-06, + "loss": 2.3005, + "step": 3014 + }, + { + "epoch": 0.890431187241583, + "grad_norm": 0.19647748643088386, + "learning_rate": 3.1161456365213726e-06, + "loss": 2.296, + "step": 3015 + }, + { + "epoch": 0.8907265209686946, + "grad_norm": 0.20050739990584318, + "learning_rate": 3.0995451676317123e-06, + "loss": 2.2411, + "step": 3016 + }, + { + "epoch": 0.8910218546958063, + "grad_norm": 0.2012054359811075, + "learning_rate": 3.0829876198051057e-06, + "loss": 2.3267, + "step": 3017 + }, + { + "epoch": 0.8913171884229178, + "grad_norm": 0.20157676986334636, + "learning_rate": 3.066473008194226e-06, + "loss": 2.2874, + "step": 3018 + }, + { + "epoch": 0.8916125221500295, + "grad_norm": 0.1983678447692518, + "learning_rate": 3.0500013479124624e-06, + "loss": 2.3278, + "step": 3019 + }, + { + "epoch": 0.8919078558771412, + "grad_norm": 0.2704517292592977, + "learning_rate": 3.0335726540338895e-06, + "loss": 2.3922, + "step": 3020 + }, + { + "epoch": 0.8922031896042528, + "grad_norm": 0.2050168990520147, + "learning_rate": 3.017186941593264e-06, + "loss": 2.2804, + "step": 3021 + }, + { + "epoch": 0.8924985233313645, + "grad_norm": 0.20676770620279494, + "learning_rate": 3.0008442255860138e-06, + "loss": 2.3122, + "step": 3022 + }, + { + "epoch": 0.892793857058476, + "grad_norm": 0.35342651635126876, + "learning_rate": 2.984544520968219e-06, + "loss": 2.62, + "step": 3023 + }, + { + "epoch": 0.8930891907855877, + "grad_norm": 0.19768680328827679, + "learning_rate": 2.9682878426565862e-06, + "loss": 2.2851, + "step": 3024 + }, + { + "epoch": 0.8933845245126993, + "grad_norm": 0.20453001982200197, + "learning_rate": 2.952074205528449e-06, + "loss": 2.3702, + "step": 3025 + }, + { + "epoch": 0.893679858239811, + "grad_norm": 0.21144148876172578, + "learning_rate": 2.935903624421771e-06, + "loss": 2.3039, + "step": 3026 + }, + { + "epoch": 0.8939751919669227, + "grad_norm": 0.19682198862841394, + "learning_rate": 2.919776114135092e-06, + "loss": 2.2984, + "step": 3027 + }, + { + "epoch": 0.8942705256940343, + "grad_norm": 0.3012186931828116, + "learning_rate": 2.903691689427529e-06, + "loss": 2.3552, + "step": 3028 + }, + { + "epoch": 0.8945658594211459, + "grad_norm": 0.20251740769339746, + "learning_rate": 2.887650365018796e-06, + "loss": 2.2735, + "step": 3029 + }, + { + "epoch": 0.8948611931482575, + "grad_norm": 0.20533530140611952, + "learning_rate": 2.871652155589144e-06, + "loss": 2.3387, + "step": 3030 + }, + { + "epoch": 0.8951565268753692, + "grad_norm": 0.20239596484839825, + "learning_rate": 2.8556970757793733e-06, + "loss": 2.2942, + "step": 3031 + }, + { + "epoch": 0.8954518606024808, + "grad_norm": 0.20567457565288416, + "learning_rate": 2.839785140190809e-06, + "loss": 2.2935, + "step": 3032 + }, + { + "epoch": 0.8957471943295925, + "grad_norm": 0.20542157795653118, + "learning_rate": 2.8239163633853038e-06, + "loss": 2.3878, + "step": 3033 + }, + { + "epoch": 0.896042528056704, + "grad_norm": 0.20995705179566648, + "learning_rate": 2.808090759885207e-06, + "loss": 2.2859, + "step": 3034 + }, + { + "epoch": 0.8963378617838157, + "grad_norm": 0.20276365785305203, + "learning_rate": 2.79230834417335e-06, + "loss": 2.3153, + "step": 3035 + }, + { + "epoch": 0.8966331955109273, + "grad_norm": 0.20685840985311943, + "learning_rate": 2.776569130693052e-06, + "loss": 2.3303, + "step": 3036 + }, + { + "epoch": 0.896928529238039, + "grad_norm": 0.19619608332829983, + "learning_rate": 2.760873133848091e-06, + "loss": 2.2533, + "step": 3037 + }, + { + "epoch": 0.8972238629651507, + "grad_norm": 0.2130639744111999, + "learning_rate": 2.745220368002699e-06, + "loss": 2.3022, + "step": 3038 + }, + { + "epoch": 0.8975191966922622, + "grad_norm": 0.19629791581414555, + "learning_rate": 2.7296108474815397e-06, + "loss": 2.3864, + "step": 3039 + }, + { + "epoch": 0.8978145304193739, + "grad_norm": 0.2031874102302149, + "learning_rate": 2.714044586569703e-06, + "loss": 2.2998, + "step": 3040 + }, + { + "epoch": 0.8981098641464855, + "grad_norm": 0.19315049419828664, + "learning_rate": 2.698521599512693e-06, + "loss": 2.2593, + "step": 3041 + }, + { + "epoch": 0.8984051978735972, + "grad_norm": 0.20539699469113973, + "learning_rate": 2.6830419005164033e-06, + "loss": 2.337, + "step": 3042 + }, + { + "epoch": 0.8987005316007088, + "grad_norm": 0.20854714369433233, + "learning_rate": 2.667605503747117e-06, + "loss": 2.3183, + "step": 3043 + }, + { + "epoch": 0.8989958653278204, + "grad_norm": 0.21711165970200386, + "learning_rate": 2.652212423331496e-06, + "loss": 2.3886, + "step": 3044 + }, + { + "epoch": 0.899291199054932, + "grad_norm": 0.20324036183326344, + "learning_rate": 2.636862673356544e-06, + "loss": 2.3003, + "step": 3045 + }, + { + "epoch": 0.8995865327820437, + "grad_norm": 0.19972637427933118, + "learning_rate": 2.6215562678696346e-06, + "loss": 2.2852, + "step": 3046 + }, + { + "epoch": 0.8998818665091554, + "grad_norm": 0.20226705452659535, + "learning_rate": 2.6062932208784476e-06, + "loss": 2.3216, + "step": 3047 + }, + { + "epoch": 0.900177200236267, + "grad_norm": 0.20077209192014578, + "learning_rate": 2.5910735463510084e-06, + "loss": 2.2952, + "step": 3048 + }, + { + "epoch": 0.9004725339633786, + "grad_norm": 0.20287217398183563, + "learning_rate": 2.5758972582156315e-06, + "loss": 2.3047, + "step": 3049 + }, + { + "epoch": 0.9007678676904902, + "grad_norm": 0.19478211868229517, + "learning_rate": 2.5607643703609354e-06, + "loss": 2.266, + "step": 3050 + }, + { + "epoch": 0.9010632014176019, + "grad_norm": 0.20333403138883815, + "learning_rate": 2.5456748966358135e-06, + "loss": 2.3194, + "step": 3051 + }, + { + "epoch": 0.9013585351447135, + "grad_norm": 0.19636459225938613, + "learning_rate": 2.530628850849448e-06, + "loss": 2.3742, + "step": 3052 + }, + { + "epoch": 0.9016538688718252, + "grad_norm": 0.2020388135504698, + "learning_rate": 2.5156262467712466e-06, + "loss": 2.1944, + "step": 3053 + }, + { + "epoch": 0.9019492025989368, + "grad_norm": 0.19863352179986724, + "learning_rate": 2.5006670981308877e-06, + "loss": 2.1666, + "step": 3054 + }, + { + "epoch": 0.9022445363260484, + "grad_norm": 0.20336751948668744, + "learning_rate": 2.4857514186182707e-06, + "loss": 2.3019, + "step": 3055 + }, + { + "epoch": 0.90253987005316, + "grad_norm": 0.20402126466666615, + "learning_rate": 2.47087922188351e-06, + "loss": 2.2078, + "step": 3056 + }, + { + "epoch": 0.9028352037802717, + "grad_norm": 0.20653511375118158, + "learning_rate": 2.4560505215369457e-06, + "loss": 2.3468, + "step": 3057 + }, + { + "epoch": 0.9031305375073834, + "grad_norm": 0.2044584448664892, + "learning_rate": 2.441265331149084e-06, + "loss": 2.3107, + "step": 3058 + }, + { + "epoch": 0.903425871234495, + "grad_norm": 0.19699763043728558, + "learning_rate": 2.4265236642506297e-06, + "loss": 2.3595, + "step": 3059 + }, + { + "epoch": 0.9037212049616066, + "grad_norm": 0.20250144947675294, + "learning_rate": 2.411825534332457e-06, + "loss": 2.1545, + "step": 3060 + }, + { + "epoch": 0.9040165386887182, + "grad_norm": 0.19594660819936963, + "learning_rate": 2.3971709548455955e-06, + "loss": 2.3357, + "step": 3061 + }, + { + "epoch": 0.9043118724158299, + "grad_norm": 0.19879243151852677, + "learning_rate": 2.382559939201212e-06, + "loss": 2.3483, + "step": 3062 + }, + { + "epoch": 0.9046072061429415, + "grad_norm": 0.20345975727731666, + "learning_rate": 2.3679925007706216e-06, + "loss": 2.3858, + "step": 3063 + }, + { + "epoch": 0.9049025398700532, + "grad_norm": 0.20739454207855818, + "learning_rate": 2.353468652885249e-06, + "loss": 2.2611, + "step": 3064 + }, + { + "epoch": 0.9051978735971647, + "grad_norm": 0.19201082001437833, + "learning_rate": 2.3389884088366177e-06, + "loss": 2.2602, + "step": 3065 + }, + { + "epoch": 0.9054932073242764, + "grad_norm": 0.19862845799597087, + "learning_rate": 2.3245517818763717e-06, + "loss": 2.3537, + "step": 3066 + }, + { + "epoch": 0.9057885410513881, + "grad_norm": 0.20143002548044422, + "learning_rate": 2.31015878521621e-06, + "loss": 2.3416, + "step": 3067 + }, + { + "epoch": 0.9060838747784997, + "grad_norm": 0.20140680465444302, + "learning_rate": 2.295809432027929e-06, + "loss": 2.253, + "step": 3068 + }, + { + "epoch": 0.9063792085056114, + "grad_norm": 0.1983543229182772, + "learning_rate": 2.2815037354433745e-06, + "loss": 2.346, + "step": 3069 + }, + { + "epoch": 0.9066745422327229, + "grad_norm": 0.20606098798186104, + "learning_rate": 2.2672417085544294e-06, + "loss": 2.2748, + "step": 3070 + }, + { + "epoch": 0.9069698759598346, + "grad_norm": 0.2183108007898392, + "learning_rate": 2.2530233644130373e-06, + "loss": 2.3399, + "step": 3071 + }, + { + "epoch": 0.9072652096869462, + "grad_norm": 0.20539472755058116, + "learning_rate": 2.2388487160311333e-06, + "loss": 2.3366, + "step": 3072 + }, + { + "epoch": 0.9075605434140579, + "grad_norm": 0.2016936267481351, + "learning_rate": 2.224717776380686e-06, + "loss": 2.2984, + "step": 3073 + }, + { + "epoch": 0.9078558771411696, + "grad_norm": 0.207304414440239, + "learning_rate": 2.2106305583936617e-06, + "loss": 2.3193, + "step": 3074 + }, + { + "epoch": 0.9081512108682812, + "grad_norm": 0.1993092723954729, + "learning_rate": 2.196587074962003e-06, + "loss": 2.3078, + "step": 3075 + }, + { + "epoch": 0.9084465445953928, + "grad_norm": 0.1995464696135528, + "learning_rate": 2.1825873389376474e-06, + "loss": 2.2782, + "step": 3076 + }, + { + "epoch": 0.9087418783225044, + "grad_norm": 0.19865987025153573, + "learning_rate": 2.168631363132484e-06, + "loss": 2.3186, + "step": 3077 + }, + { + "epoch": 0.9090372120496161, + "grad_norm": 0.21807160941449544, + "learning_rate": 2.1547191603183526e-06, + "loss": 2.4045, + "step": 3078 + }, + { + "epoch": 0.9093325457767277, + "grad_norm": 0.19706438204988436, + "learning_rate": 2.140850743227046e-06, + "loss": 2.2464, + "step": 3079 + }, + { + "epoch": 0.9096278795038394, + "grad_norm": 0.20697936721495108, + "learning_rate": 2.1270261245502686e-06, + "loss": 2.3401, + "step": 3080 + }, + { + "epoch": 0.9099232132309509, + "grad_norm": 0.23527008640740824, + "learning_rate": 2.113245316939666e-06, + "loss": 2.4847, + "step": 3081 + }, + { + "epoch": 0.9102185469580626, + "grad_norm": 0.20487435526161618, + "learning_rate": 2.0995083330067665e-06, + "loss": 2.401, + "step": 3082 + }, + { + "epoch": 0.9105138806851742, + "grad_norm": 0.1949360079792306, + "learning_rate": 2.0858151853230147e-06, + "loss": 2.3105, + "step": 3083 + }, + { + "epoch": 0.9108092144122859, + "grad_norm": 0.23204241960145, + "learning_rate": 2.0721658864197137e-06, + "loss": 2.2557, + "step": 3084 + }, + { + "epoch": 0.9111045481393976, + "grad_norm": 0.19780194676227594, + "learning_rate": 2.0585604487880606e-06, + "loss": 2.2495, + "step": 3085 + }, + { + "epoch": 0.9113998818665091, + "grad_norm": 0.19577660732442465, + "learning_rate": 2.0449988848791123e-06, + "loss": 2.343, + "step": 3086 + }, + { + "epoch": 0.9116952155936208, + "grad_norm": 0.20994105577594882, + "learning_rate": 2.031481207103747e-06, + "loss": 2.3535, + "step": 3087 + }, + { + "epoch": 0.9119905493207324, + "grad_norm": 0.22121168300496977, + "learning_rate": 2.018007427832708e-06, + "loss": 2.3113, + "step": 3088 + }, + { + "epoch": 0.9122858830478441, + "grad_norm": 0.2019419698382856, + "learning_rate": 2.0045775593965587e-06, + "loss": 2.3403, + "step": 3089 + }, + { + "epoch": 0.9125812167749557, + "grad_norm": 0.21454121356062297, + "learning_rate": 1.991191614085669e-06, + "loss": 2.3429, + "step": 3090 + }, + { + "epoch": 0.9128765505020673, + "grad_norm": 0.20613658777957716, + "learning_rate": 1.977849604150234e-06, + "loss": 2.2292, + "step": 3091 + }, + { + "epoch": 0.9131718842291789, + "grad_norm": 0.19873615790994323, + "learning_rate": 1.964551541800208e-06, + "loss": 2.3105, + "step": 3092 + }, + { + "epoch": 0.9134672179562906, + "grad_norm": 0.19871778705445797, + "learning_rate": 1.951297439205357e-06, + "loss": 2.3311, + "step": 3093 + }, + { + "epoch": 0.9137625516834023, + "grad_norm": 0.19926556803767478, + "learning_rate": 1.938087308495201e-06, + "loss": 2.325, + "step": 3094 + }, + { + "epoch": 0.9140578854105139, + "grad_norm": 0.19863405630496836, + "learning_rate": 1.924921161759019e-06, + "loss": 2.3863, + "step": 3095 + }, + { + "epoch": 0.9143532191376255, + "grad_norm": 0.25500964520482644, + "learning_rate": 1.911799011045845e-06, + "loss": 2.2746, + "step": 3096 + }, + { + "epoch": 0.9146485528647371, + "grad_norm": 0.2062578329853033, + "learning_rate": 1.8987208683644453e-06, + "loss": 2.3195, + "step": 3097 + }, + { + "epoch": 0.9149438865918488, + "grad_norm": 0.19625968143855954, + "learning_rate": 1.8856867456833128e-06, + "loss": 2.3549, + "step": 3098 + }, + { + "epoch": 0.9152392203189604, + "grad_norm": 0.194641773585207, + "learning_rate": 1.8726966549306557e-06, + "loss": 2.3332, + "step": 3099 + }, + { + "epoch": 0.9155345540460721, + "grad_norm": 0.204116136313492, + "learning_rate": 1.8597506079943928e-06, + "loss": 2.3413, + "step": 3100 + }, + { + "epoch": 0.9158298877731837, + "grad_norm": 0.19986038103015208, + "learning_rate": 1.8468486167221244e-06, + "loss": 2.2134, + "step": 3101 + }, + { + "epoch": 0.9161252215002953, + "grad_norm": 0.20324224059635593, + "learning_rate": 1.8339906929211338e-06, + "loss": 2.3053, + "step": 3102 + }, + { + "epoch": 0.916420555227407, + "grad_norm": 0.19702756321639436, + "learning_rate": 1.8211768483583914e-06, + "loss": 2.2866, + "step": 3103 + }, + { + "epoch": 0.9167158889545186, + "grad_norm": 0.20070945962877995, + "learning_rate": 1.8084070947605058e-06, + "loss": 2.2851, + "step": 3104 + }, + { + "epoch": 0.9170112226816303, + "grad_norm": 0.21302611737743354, + "learning_rate": 1.795681443813757e-06, + "loss": 2.3134, + "step": 3105 + }, + { + "epoch": 0.9173065564087419, + "grad_norm": 0.20800444550990474, + "learning_rate": 1.7829999071640513e-06, + "loss": 2.3048, + "step": 3106 + }, + { + "epoch": 0.9176018901358535, + "grad_norm": 0.20253204951564804, + "learning_rate": 1.7703624964169218e-06, + "loss": 2.2998, + "step": 3107 + }, + { + "epoch": 0.9178972238629651, + "grad_norm": 0.2047032608632763, + "learning_rate": 1.7577692231375342e-06, + "loss": 2.3096, + "step": 3108 + }, + { + "epoch": 0.9181925575900768, + "grad_norm": 0.1986805246190087, + "learning_rate": 1.7452200988506528e-06, + "loss": 2.3046, + "step": 3109 + }, + { + "epoch": 0.9184878913171884, + "grad_norm": 0.2156305125817236, + "learning_rate": 1.7327151350406302e-06, + "loss": 2.2773, + "step": 3110 + }, + { + "epoch": 0.9187832250443001, + "grad_norm": 0.1983541069367895, + "learning_rate": 1.7202543431514172e-06, + "loss": 2.3525, + "step": 3111 + }, + { + "epoch": 0.9190785587714116, + "grad_norm": 0.19713314253294498, + "learning_rate": 1.7078377345865371e-06, + "loss": 2.2716, + "step": 3112 + }, + { + "epoch": 0.9193738924985233, + "grad_norm": 0.19880196037585166, + "learning_rate": 1.6954653207090831e-06, + "loss": 2.339, + "step": 3113 + }, + { + "epoch": 0.919669226225635, + "grad_norm": 0.20068004818496787, + "learning_rate": 1.6831371128416983e-06, + "loss": 2.2478, + "step": 3114 + }, + { + "epoch": 0.9199645599527466, + "grad_norm": 0.19916847005896207, + "learning_rate": 1.6708531222665636e-06, + "loss": 2.3192, + "step": 3115 + }, + { + "epoch": 0.9202598936798583, + "grad_norm": 0.21560347352865789, + "learning_rate": 1.658613360225414e-06, + "loss": 2.3448, + "step": 3116 + }, + { + "epoch": 0.9205552274069698, + "grad_norm": 0.19884574789554013, + "learning_rate": 1.646417837919484e-06, + "loss": 2.2314, + "step": 3117 + }, + { + "epoch": 0.9208505611340815, + "grad_norm": 0.19646446020929095, + "learning_rate": 1.6342665665095458e-06, + "loss": 2.2848, + "step": 3118 + }, + { + "epoch": 0.9211458948611931, + "grad_norm": 0.20549155750532683, + "learning_rate": 1.62215955711586e-06, + "loss": 2.2611, + "step": 3119 + }, + { + "epoch": 0.9214412285883048, + "grad_norm": 0.19338478537930012, + "learning_rate": 1.6100968208181855e-06, + "loss": 2.289, + "step": 3120 + }, + { + "epoch": 0.9217365623154165, + "grad_norm": 0.20137410045638346, + "learning_rate": 1.5980783686557643e-06, + "loss": 2.2705, + "step": 3121 + }, + { + "epoch": 0.9220318960425281, + "grad_norm": 0.19663842269337123, + "learning_rate": 1.586104211627315e-06, + "loss": 2.291, + "step": 3122 + }, + { + "epoch": 0.9223272297696397, + "grad_norm": 0.20404882525414572, + "learning_rate": 1.5741743606910108e-06, + "loss": 2.3398, + "step": 3123 + }, + { + "epoch": 0.9226225634967513, + "grad_norm": 0.20263426446002886, + "learning_rate": 1.5622888267644798e-06, + "loss": 2.2977, + "step": 3124 + }, + { + "epoch": 0.922917897223863, + "grad_norm": 0.2002468601015168, + "learning_rate": 1.5504476207248041e-06, + "loss": 2.355, + "step": 3125 + }, + { + "epoch": 0.9232132309509746, + "grad_norm": 0.20465298135907903, + "learning_rate": 1.5386507534084937e-06, + "loss": 2.3545, + "step": 3126 + }, + { + "epoch": 0.9235085646780863, + "grad_norm": 0.21279399323621662, + "learning_rate": 1.526898235611468e-06, + "loss": 2.32, + "step": 3127 + }, + { + "epoch": 0.9238038984051978, + "grad_norm": 0.20158902222957542, + "learning_rate": 1.515190078089085e-06, + "loss": 2.2698, + "step": 3128 + }, + { + "epoch": 0.9240992321323095, + "grad_norm": 0.24607117110320378, + "learning_rate": 1.5035262915560844e-06, + "loss": 2.3789, + "step": 3129 + }, + { + "epoch": 0.9243945658594211, + "grad_norm": 0.19902744644814357, + "learning_rate": 1.491906886686617e-06, + "loss": 2.2321, + "step": 3130 + }, + { + "epoch": 0.9246898995865328, + "grad_norm": 0.20352532295946, + "learning_rate": 1.4803318741141991e-06, + "loss": 2.2627, + "step": 3131 + }, + { + "epoch": 0.9249852333136445, + "grad_norm": 0.22674926919146432, + "learning_rate": 1.468801264431735e-06, + "loss": 2.2666, + "step": 3132 + }, + { + "epoch": 0.925280567040756, + "grad_norm": 0.21892312555934038, + "learning_rate": 1.4573150681914837e-06, + "loss": 2.3429, + "step": 3133 + }, + { + "epoch": 0.9255759007678677, + "grad_norm": 0.26177096913480036, + "learning_rate": 1.4458732959050759e-06, + "loss": 2.2816, + "step": 3134 + }, + { + "epoch": 0.9258712344949793, + "grad_norm": 0.20932540519686071, + "learning_rate": 1.4344759580434685e-06, + "loss": 2.271, + "step": 3135 + }, + { + "epoch": 0.926166568222091, + "grad_norm": 0.19393953312832496, + "learning_rate": 1.4231230650369686e-06, + "loss": 2.3043, + "step": 3136 + }, + { + "epoch": 0.9264619019492026, + "grad_norm": 0.19909519216050778, + "learning_rate": 1.4118146272751987e-06, + "loss": 2.2627, + "step": 3137 + }, + { + "epoch": 0.9267572356763142, + "grad_norm": 0.2050558623082844, + "learning_rate": 1.4005506551071025e-06, + "loss": 2.3368, + "step": 3138 + }, + { + "epoch": 0.9270525694034258, + "grad_norm": 0.19863629745982148, + "learning_rate": 1.389331158840934e-06, + "loss": 2.2958, + "step": 3139 + }, + { + "epoch": 0.9273479031305375, + "grad_norm": 0.19876923054202622, + "learning_rate": 1.3781561487442418e-06, + "loss": 2.3256, + "step": 3140 + }, + { + "epoch": 0.9276432368576492, + "grad_norm": 0.19824547600485015, + "learning_rate": 1.3670256350438614e-06, + "loss": 2.351, + "step": 3141 + }, + { + "epoch": 0.9279385705847608, + "grad_norm": 0.1985923793454295, + "learning_rate": 1.3559396279259117e-06, + "loss": 2.327, + "step": 3142 + }, + { + "epoch": 0.9282339043118725, + "grad_norm": 0.19480204635307535, + "learning_rate": 1.3448981375357716e-06, + "loss": 2.3549, + "step": 3143 + }, + { + "epoch": 0.928529238038984, + "grad_norm": 0.201628490852933, + "learning_rate": 1.3339011739780916e-06, + "loss": 2.2872, + "step": 3144 + }, + { + "epoch": 0.9288245717660957, + "grad_norm": 0.20432405345722707, + "learning_rate": 1.3229487473167712e-06, + "loss": 2.2598, + "step": 3145 + }, + { + "epoch": 0.9291199054932073, + "grad_norm": 0.20298740421642686, + "learning_rate": 1.3120408675749484e-06, + "loss": 2.2582, + "step": 3146 + }, + { + "epoch": 0.929415239220319, + "grad_norm": 0.1948514455638394, + "learning_rate": 1.3011775447349995e-06, + "loss": 2.3232, + "step": 3147 + }, + { + "epoch": 0.9297105729474306, + "grad_norm": 0.19332050163921138, + "learning_rate": 1.2903587887385104e-06, + "loss": 2.334, + "step": 3148 + }, + { + "epoch": 0.9300059066745422, + "grad_norm": 0.19760474713859771, + "learning_rate": 1.2795846094862951e-06, + "loss": 2.3374, + "step": 3149 + }, + { + "epoch": 0.9303012404016539, + "grad_norm": 0.19995312812043844, + "learning_rate": 1.2688550168383772e-06, + "loss": 2.3443, + "step": 3150 + }, + { + "epoch": 0.9305965741287655, + "grad_norm": 0.2027395791651874, + "learning_rate": 1.258170020613958e-06, + "loss": 2.2739, + "step": 3151 + }, + { + "epoch": 0.9308919078558772, + "grad_norm": 0.20810917278629104, + "learning_rate": 1.247529630591443e-06, + "loss": 2.3442, + "step": 3152 + }, + { + "epoch": 0.9311872415829888, + "grad_norm": 0.19838732776003434, + "learning_rate": 1.2369338565084099e-06, + "loss": 2.2853, + "step": 3153 + }, + { + "epoch": 0.9314825753101004, + "grad_norm": 0.20137595038305173, + "learning_rate": 1.2263827080616074e-06, + "loss": 2.3605, + "step": 3154 + }, + { + "epoch": 0.931777909037212, + "grad_norm": 0.2187725464237008, + "learning_rate": 1.2158761949069397e-06, + "loss": 2.3768, + "step": 3155 + }, + { + "epoch": 0.9320732427643237, + "grad_norm": 0.20365108623868214, + "learning_rate": 1.2054143266594764e-06, + "loss": 2.3012, + "step": 3156 + }, + { + "epoch": 0.9323685764914353, + "grad_norm": 0.19462248987319936, + "learning_rate": 1.1949971128934145e-06, + "loss": 2.3647, + "step": 3157 + }, + { + "epoch": 0.932663910218547, + "grad_norm": 0.2117434399274333, + "learning_rate": 1.1846245631420893e-06, + "loss": 2.3583, + "step": 3158 + }, + { + "epoch": 0.9329592439456585, + "grad_norm": 0.20690186074064562, + "learning_rate": 1.1742966868979743e-06, + "loss": 2.3337, + "step": 3159 + }, + { + "epoch": 0.9332545776727702, + "grad_norm": 0.2004154204853939, + "learning_rate": 1.1640134936126423e-06, + "loss": 2.2502, + "step": 3160 + }, + { + "epoch": 0.9335499113998819, + "grad_norm": 0.19898642288430257, + "learning_rate": 1.1537749926967767e-06, + "loss": 2.3016, + "step": 3161 + }, + { + "epoch": 0.9338452451269935, + "grad_norm": 0.210476203126637, + "learning_rate": 1.1435811935201824e-06, + "loss": 2.3543, + "step": 3162 + }, + { + "epoch": 0.9341405788541052, + "grad_norm": 0.199764756345147, + "learning_rate": 1.1334321054117246e-06, + "loss": 2.41, + "step": 3163 + }, + { + "epoch": 0.9344359125812167, + "grad_norm": 0.2042500352865303, + "learning_rate": 1.1233277376593688e-06, + "loss": 2.3165, + "step": 3164 + }, + { + "epoch": 0.9347312463083284, + "grad_norm": 0.19981759888219966, + "learning_rate": 1.1132680995101563e-06, + "loss": 2.322, + "step": 3165 + }, + { + "epoch": 0.93502658003544, + "grad_norm": 0.20431513768509638, + "learning_rate": 1.1032532001701845e-06, + "loss": 2.3501, + "step": 3166 + }, + { + "epoch": 0.9353219137625517, + "grad_norm": 0.1963251659536152, + "learning_rate": 1.0932830488046164e-06, + "loss": 2.3699, + "step": 3167 + }, + { + "epoch": 0.9356172474896634, + "grad_norm": 0.20117209721525198, + "learning_rate": 1.0833576545376533e-06, + "loss": 2.3679, + "step": 3168 + }, + { + "epoch": 0.935912581216775, + "grad_norm": 0.20087820673307538, + "learning_rate": 1.0734770264525463e-06, + "loss": 2.3001, + "step": 3169 + }, + { + "epoch": 0.9362079149438866, + "grad_norm": 0.22051639590861455, + "learning_rate": 1.0636411735915786e-06, + "loss": 2.3766, + "step": 3170 + }, + { + "epoch": 0.9365032486709982, + "grad_norm": 0.28446247673220637, + "learning_rate": 1.0538501049560501e-06, + "loss": 2.3271, + "step": 3171 + }, + { + "epoch": 0.9367985823981099, + "grad_norm": 0.21379311032037968, + "learning_rate": 1.0441038295062878e-06, + "loss": 2.3317, + "step": 3172 + }, + { + "epoch": 0.9370939161252215, + "grad_norm": 0.2048135504004486, + "learning_rate": 1.0344023561616123e-06, + "loss": 2.2762, + "step": 3173 + }, + { + "epoch": 0.9373892498523332, + "grad_norm": 0.20296004823645306, + "learning_rate": 1.0247456938003552e-06, + "loss": 2.269, + "step": 3174 + }, + { + "epoch": 0.9376845835794447, + "grad_norm": 0.21346591074790291, + "learning_rate": 1.0151338512598251e-06, + "loss": 2.3412, + "step": 3175 + }, + { + "epoch": 0.9379799173065564, + "grad_norm": 0.1942828791278375, + "learning_rate": 1.0055668373363304e-06, + "loss": 2.3551, + "step": 3176 + }, + { + "epoch": 0.938275251033668, + "grad_norm": 0.1992420575588311, + "learning_rate": 9.960446607851448e-07, + "loss": 2.2856, + "step": 3177 + }, + { + "epoch": 0.9385705847607797, + "grad_norm": 0.2050807135282367, + "learning_rate": 9.86567330320509e-07, + "loss": 2.2501, + "step": 3178 + }, + { + "epoch": 0.9388659184878914, + "grad_norm": 0.20509768801431796, + "learning_rate": 9.771348546156245e-07, + "loss": 2.2855, + "step": 3179 + }, + { + "epoch": 0.9391612522150029, + "grad_norm": 0.20213585044999738, + "learning_rate": 9.677472423026467e-07, + "loss": 2.2781, + "step": 3180 + }, + { + "epoch": 0.9394565859421146, + "grad_norm": 0.19725218812105966, + "learning_rate": 9.584045019726707e-07, + "loss": 2.3045, + "step": 3181 + }, + { + "epoch": 0.9397519196692262, + "grad_norm": 0.21237928769608985, + "learning_rate": 9.491066421757288e-07, + "loss": 2.2934, + "step": 3182 + }, + { + "epoch": 0.9400472533963379, + "grad_norm": 0.20550870651923664, + "learning_rate": 9.398536714207762e-07, + "loss": 2.349, + "step": 3183 + }, + { + "epoch": 0.9403425871234495, + "grad_norm": 0.19521828286298415, + "learning_rate": 9.306455981756945e-07, + "loss": 2.2531, + "step": 3184 + }, + { + "epoch": 0.9406379208505611, + "grad_norm": 0.19966429087481047, + "learning_rate": 9.214824308672709e-07, + "loss": 2.3312, + "step": 3185 + }, + { + "epoch": 0.9409332545776727, + "grad_norm": 0.21376705002684426, + "learning_rate": 9.123641778812031e-07, + "loss": 2.365, + "step": 3186 + }, + { + "epoch": 0.9412285883047844, + "grad_norm": 0.2010682496513712, + "learning_rate": 9.032908475620771e-07, + "loss": 2.2653, + "step": 3187 + }, + { + "epoch": 0.941523922031896, + "grad_norm": 0.195761412660553, + "learning_rate": 8.942624482133732e-07, + "loss": 2.3113, + "step": 3188 + }, + { + "epoch": 0.9418192557590077, + "grad_norm": 0.2028917936277463, + "learning_rate": 8.852789880974655e-07, + "loss": 2.1787, + "step": 3189 + }, + { + "epoch": 0.9421145894861194, + "grad_norm": 0.19379472370179285, + "learning_rate": 8.763404754355719e-07, + "loss": 2.2891, + "step": 3190 + }, + { + "epoch": 0.9424099232132309, + "grad_norm": 0.20246876569919492, + "learning_rate": 8.674469184077993e-07, + "loss": 2.2796, + "step": 3191 + }, + { + "epoch": 0.9427052569403426, + "grad_norm": 0.19592724376282972, + "learning_rate": 8.585983251531093e-07, + "loss": 2.3474, + "step": 3192 + }, + { + "epoch": 0.9430005906674542, + "grad_norm": 0.19438248465071006, + "learning_rate": 8.497947037693077e-07, + "loss": 2.2683, + "step": 3193 + }, + { + "epoch": 0.9432959243945659, + "grad_norm": 0.19864002015906856, + "learning_rate": 8.410360623130554e-07, + "loss": 2.3812, + "step": 3194 + }, + { + "epoch": 0.9435912581216775, + "grad_norm": 0.20851932905386755, + "learning_rate": 8.32322408799846e-07, + "loss": 2.2559, + "step": 3195 + }, + { + "epoch": 0.9438865918487891, + "grad_norm": 0.1973455663674459, + "learning_rate": 8.23653751204001e-07, + "loss": 2.3194, + "step": 3196 + }, + { + "epoch": 0.9441819255759007, + "grad_norm": 0.19128834732666683, + "learning_rate": 8.150300974586522e-07, + "loss": 2.2354, + "step": 3197 + }, + { + "epoch": 0.9444772593030124, + "grad_norm": 0.2002180226168244, + "learning_rate": 8.0645145545577e-07, + "loss": 2.351, + "step": 3198 + }, + { + "epoch": 0.9447725930301241, + "grad_norm": 0.2064627814142543, + "learning_rate": 7.979178330461079e-07, + "loss": 2.3793, + "step": 3199 + }, + { + "epoch": 0.9450679267572357, + "grad_norm": 0.2209959619296663, + "learning_rate": 7.89429238039241e-07, + "loss": 2.3138, + "step": 3200 + }, + { + "epoch": 0.9453632604843473, + "grad_norm": 0.20066495562876874, + "learning_rate": 7.809856782035163e-07, + "loss": 2.2969, + "step": 3201 + }, + { + "epoch": 0.9456585942114589, + "grad_norm": 0.2036281369870979, + "learning_rate": 7.725871612660862e-07, + "loss": 2.305, + "step": 3202 + }, + { + "epoch": 0.9459539279385706, + "grad_norm": 0.20147043128870473, + "learning_rate": 7.642336949128748e-07, + "loss": 2.321, + "step": 3203 + }, + { + "epoch": 0.9462492616656822, + "grad_norm": 0.19469024717495995, + "learning_rate": 7.559252867885724e-07, + "loss": 2.3252, + "step": 3204 + }, + { + "epoch": 0.9465445953927939, + "grad_norm": 0.19893229215937966, + "learning_rate": 7.476619444966359e-07, + "loss": 2.3219, + "step": 3205 + }, + { + "epoch": 0.9468399291199054, + "grad_norm": 0.19943291241573605, + "learning_rate": 7.394436755992828e-07, + "loss": 2.3184, + "step": 3206 + }, + { + "epoch": 0.9471352628470171, + "grad_norm": 0.20955935375788232, + "learning_rate": 7.312704876174858e-07, + "loss": 2.3211, + "step": 3207 + }, + { + "epoch": 0.9474305965741288, + "grad_norm": 0.2251098281978648, + "learning_rate": 7.231423880309562e-07, + "loss": 2.4168, + "step": 3208 + }, + { + "epoch": 0.9477259303012404, + "grad_norm": 0.20376649759060572, + "learning_rate": 7.150593842781439e-07, + "loss": 2.3004, + "step": 3209 + }, + { + "epoch": 0.9480212640283521, + "grad_norm": 0.20549521946567423, + "learning_rate": 7.070214837562317e-07, + "loss": 2.2773, + "step": 3210 + }, + { + "epoch": 0.9483165977554636, + "grad_norm": 0.2069122044164975, + "learning_rate": 6.990286938211132e-07, + "loss": 2.2979, + "step": 3211 + }, + { + "epoch": 0.9486119314825753, + "grad_norm": 0.20010722944036208, + "learning_rate": 6.910810217874208e-07, + "loss": 2.2959, + "step": 3212 + }, + { + "epoch": 0.9489072652096869, + "grad_norm": 0.19820338841993293, + "learning_rate": 6.831784749284809e-07, + "loss": 2.263, + "step": 3213 + }, + { + "epoch": 0.9492025989367986, + "grad_norm": 0.19803583276125464, + "learning_rate": 6.753210604763304e-07, + "loss": 2.2551, + "step": 3214 + }, + { + "epoch": 0.9494979326639102, + "grad_norm": 0.19508184164384848, + "learning_rate": 6.675087856216955e-07, + "loss": 2.3318, + "step": 3215 + }, + { + "epoch": 0.9497932663910219, + "grad_norm": 0.20112238895672616, + "learning_rate": 6.597416575140014e-07, + "loss": 2.3276, + "step": 3216 + }, + { + "epoch": 0.9500886001181335, + "grad_norm": 0.194263531025202, + "learning_rate": 6.520196832613568e-07, + "loss": 2.2521, + "step": 3217 + }, + { + "epoch": 0.9503839338452451, + "grad_norm": 0.20985010459861847, + "learning_rate": 6.443428699305365e-07, + "loss": 2.335, + "step": 3218 + }, + { + "epoch": 0.9506792675723568, + "grad_norm": 0.19365355721621078, + "learning_rate": 6.367112245470042e-07, + "loss": 2.3368, + "step": 3219 + }, + { + "epoch": 0.9509746012994684, + "grad_norm": 0.19849069175287276, + "learning_rate": 6.29124754094873e-07, + "loss": 2.3201, + "step": 3220 + }, + { + "epoch": 0.9512699350265801, + "grad_norm": 0.2047150986389056, + "learning_rate": 6.215834655169173e-07, + "loss": 2.3902, + "step": 3221 + }, + { + "epoch": 0.9515652687536916, + "grad_norm": 0.20025545472908401, + "learning_rate": 6.140873657145718e-07, + "loss": 2.2959, + "step": 3222 + }, + { + "epoch": 0.9518606024808033, + "grad_norm": 0.24449245279166398, + "learning_rate": 6.066364615479048e-07, + "loss": 2.4983, + "step": 3223 + }, + { + "epoch": 0.9521559362079149, + "grad_norm": 0.19758073736404305, + "learning_rate": 5.992307598356339e-07, + "loss": 2.2354, + "step": 3224 + }, + { + "epoch": 0.9524512699350266, + "grad_norm": 0.20027689318263894, + "learning_rate": 5.918702673550991e-07, + "loss": 2.3535, + "step": 3225 + }, + { + "epoch": 0.9527466036621383, + "grad_norm": 0.1943382665600021, + "learning_rate": 5.845549908422787e-07, + "loss": 2.2915, + "step": 3226 + }, + { + "epoch": 0.9530419373892498, + "grad_norm": 0.19473885217144904, + "learning_rate": 5.772849369917565e-07, + "loss": 2.1856, + "step": 3227 + }, + { + "epoch": 0.9533372711163615, + "grad_norm": 0.22013120056606247, + "learning_rate": 5.700601124567495e-07, + "loss": 2.2995, + "step": 3228 + }, + { + "epoch": 0.9536326048434731, + "grad_norm": 0.24409243298772196, + "learning_rate": 5.628805238490686e-07, + "loss": 2.3551, + "step": 3229 + }, + { + "epoch": 0.9539279385705848, + "grad_norm": 0.1945840315853908, + "learning_rate": 5.557461777391304e-07, + "loss": 2.3228, + "step": 3230 + }, + { + "epoch": 0.9542232722976964, + "grad_norm": 0.20317276063036768, + "learning_rate": 5.48657080655951e-07, + "loss": 2.2679, + "step": 3231 + }, + { + "epoch": 0.954518606024808, + "grad_norm": 0.19912442610993814, + "learning_rate": 5.416132390871298e-07, + "loss": 2.3093, + "step": 3232 + }, + { + "epoch": 0.9548139397519196, + "grad_norm": 0.2011794764739949, + "learning_rate": 5.346146594788659e-07, + "loss": 2.3427, + "step": 3233 + }, + { + "epoch": 0.9551092734790313, + "grad_norm": 0.2022260249784509, + "learning_rate": 5.276613482359138e-07, + "loss": 2.3238, + "step": 3234 + }, + { + "epoch": 0.955404607206143, + "grad_norm": 0.19765824252846934, + "learning_rate": 5.207533117216223e-07, + "loss": 2.378, + "step": 3235 + }, + { + "epoch": 0.9556999409332546, + "grad_norm": 0.19678235106806086, + "learning_rate": 5.1389055625789e-07, + "loss": 2.2348, + "step": 3236 + }, + { + "epoch": 0.9559952746603663, + "grad_norm": 0.19289775615811436, + "learning_rate": 5.070730881251873e-07, + "loss": 2.3302, + "step": 3237 + }, + { + "epoch": 0.9562906083874778, + "grad_norm": 0.20118424460890164, + "learning_rate": 5.003009135625347e-07, + "loss": 2.3175, + "step": 3238 + }, + { + "epoch": 0.9565859421145895, + "grad_norm": 0.20011731931290017, + "learning_rate": 4.935740387675081e-07, + "loss": 2.3319, + "step": 3239 + }, + { + "epoch": 0.9568812758417011, + "grad_norm": 0.2040710426400163, + "learning_rate": 4.868924698962163e-07, + "loss": 2.3585, + "step": 3240 + }, + { + "epoch": 0.9571766095688128, + "grad_norm": 0.2111283427990741, + "learning_rate": 4.802562130633237e-07, + "loss": 2.4165, + "step": 3241 + }, + { + "epoch": 0.9574719432959244, + "grad_norm": 0.2132881460367495, + "learning_rate": 4.736652743420056e-07, + "loss": 2.2975, + "step": 3242 + }, + { + "epoch": 0.957767277023036, + "grad_norm": 0.20190362342067156, + "learning_rate": 4.671196597639815e-07, + "loss": 2.3027, + "step": 3243 + }, + { + "epoch": 0.9580626107501476, + "grad_norm": 0.20018811096636496, + "learning_rate": 4.6061937531948205e-07, + "loss": 2.3424, + "step": 3244 + }, + { + "epoch": 0.9583579444772593, + "grad_norm": 0.19792263226971424, + "learning_rate": 4.541644269572598e-07, + "loss": 2.3025, + "step": 3245 + }, + { + "epoch": 0.958653278204371, + "grad_norm": 0.20760895541388358, + "learning_rate": 4.477548205845783e-07, + "loss": 2.3102, + "step": 3246 + }, + { + "epoch": 0.9589486119314826, + "grad_norm": 0.2149550583077201, + "learning_rate": 4.4139056206720654e-07, + "loss": 2.2444, + "step": 3247 + }, + { + "epoch": 0.9592439456585942, + "grad_norm": 0.20485384348102817, + "learning_rate": 4.350716572294022e-07, + "loss": 2.3501, + "step": 3248 + }, + { + "epoch": 0.9595392793857058, + "grad_norm": 0.198019710912505, + "learning_rate": 4.287981118539286e-07, + "loss": 2.264, + "step": 3249 + }, + { + "epoch": 0.9598346131128175, + "grad_norm": 0.2014519399337216, + "learning_rate": 4.2256993168204307e-07, + "loss": 2.3084, + "step": 3250 + }, + { + "epoch": 0.9601299468399291, + "grad_norm": 0.2318753139154412, + "learning_rate": 4.163871224134697e-07, + "loss": 2.2206, + "step": 3251 + }, + { + "epoch": 0.9604252805670408, + "grad_norm": 0.1931626349725229, + "learning_rate": 4.102496897064323e-07, + "loss": 2.2527, + "step": 3252 + }, + { + "epoch": 0.9607206142941523, + "grad_norm": 0.22218372163363315, + "learning_rate": 4.0415763917760476e-07, + "loss": 2.2752, + "step": 3253 + }, + { + "epoch": 0.961015948021264, + "grad_norm": 0.20785967606866296, + "learning_rate": 3.9811097640214954e-07, + "loss": 2.3385, + "step": 3254 + }, + { + "epoch": 0.9613112817483757, + "grad_norm": 0.2068307373430602, + "learning_rate": 3.921097069136792e-07, + "loss": 2.3667, + "step": 3255 + }, + { + "epoch": 0.9616066154754873, + "grad_norm": 0.20323175595985385, + "learning_rate": 3.8615383620427824e-07, + "loss": 2.2973, + "step": 3256 + }, + { + "epoch": 0.961901949202599, + "grad_norm": 0.20856961421418851, + "learning_rate": 3.802433697244645e-07, + "loss": 2.365, + "step": 3257 + }, + { + "epoch": 0.9621972829297105, + "grad_norm": 0.1929150977819294, + "learning_rate": 3.743783128832279e-07, + "loss": 2.3526, + "step": 3258 + }, + { + "epoch": 0.9624926166568222, + "grad_norm": 0.2047092571779919, + "learning_rate": 3.6855867104798046e-07, + "loss": 2.2064, + "step": 3259 + }, + { + "epoch": 0.9627879503839338, + "grad_norm": 0.21866339433694645, + "learning_rate": 3.6278444954458423e-07, + "loss": 2.3536, + "step": 3260 + }, + { + "epoch": 0.9630832841110455, + "grad_norm": 0.2352362860909541, + "learning_rate": 3.5705565365732886e-07, + "loss": 2.2868, + "step": 3261 + }, + { + "epoch": 0.9633786178381571, + "grad_norm": 1.6564250349727194, + "learning_rate": 3.5137228862894276e-07, + "loss": 2.2558, + "step": 3262 + }, + { + "epoch": 0.9636739515652688, + "grad_norm": 0.21819294949142926, + "learning_rate": 3.4573435966056e-07, + "loss": 2.2808, + "step": 3263 + }, + { + "epoch": 0.9639692852923804, + "grad_norm": 0.20606684535891281, + "learning_rate": 3.401418719117533e-07, + "loss": 2.3618, + "step": 3264 + }, + { + "epoch": 0.964264619019492, + "grad_norm": 0.293490085866521, + "learning_rate": 3.345948305005009e-07, + "loss": 2.322, + "step": 3265 + }, + { + "epoch": 0.9645599527466037, + "grad_norm": 0.22330960401370267, + "learning_rate": 3.290932405031866e-07, + "loss": 2.3422, + "step": 3266 + }, + { + "epoch": 0.9648552864737153, + "grad_norm": 0.2013625010288245, + "learning_rate": 3.2363710695461067e-07, + "loss": 2.3598, + "step": 3267 + }, + { + "epoch": 0.965150620200827, + "grad_norm": 0.1962333429307737, + "learning_rate": 3.182264348479569e-07, + "loss": 2.2419, + "step": 3268 + }, + { + "epoch": 0.9654459539279385, + "grad_norm": 0.19873881029622864, + "learning_rate": 3.1286122913481987e-07, + "loss": 2.2681, + "step": 3269 + }, + { + "epoch": 0.9657412876550502, + "grad_norm": 0.315672105163451, + "learning_rate": 3.075414947251831e-07, + "loss": 2.252, + "step": 3270 + }, + { + "epoch": 0.9660366213821618, + "grad_norm": 0.24166328856694166, + "learning_rate": 3.0226723648740796e-07, + "loss": 2.34, + "step": 3271 + }, + { + "epoch": 0.9663319551092735, + "grad_norm": 0.20842168047351312, + "learning_rate": 2.970384592482445e-07, + "loss": 2.3347, + "step": 3272 + }, + { + "epoch": 0.9666272888363852, + "grad_norm": 0.19553565628296005, + "learning_rate": 2.9185516779283164e-07, + "loss": 2.3158, + "step": 3273 + }, + { + "epoch": 0.9669226225634967, + "grad_norm": 0.19625312324334773, + "learning_rate": 2.867173668646583e-07, + "loss": 2.2487, + "step": 3274 + }, + { + "epoch": 0.9672179562906084, + "grad_norm": 0.21044281066909667, + "learning_rate": 2.8162506116560796e-07, + "loss": 2.3225, + "step": 3275 + }, + { + "epoch": 0.96751329001772, + "grad_norm": 0.20112864946571488, + "learning_rate": 2.765782553559026e-07, + "loss": 2.3055, + "step": 3276 + }, + { + "epoch": 0.9678086237448317, + "grad_norm": 0.19724598399668325, + "learning_rate": 2.7157695405414797e-07, + "loss": 2.3103, + "step": 3277 + }, + { + "epoch": 0.9681039574719433, + "grad_norm": 0.216578794887435, + "learning_rate": 2.666211618372882e-07, + "loss": 2.402, + "step": 3278 + }, + { + "epoch": 0.9683992911990549, + "grad_norm": 0.20862820837172028, + "learning_rate": 2.617108832406401e-07, + "loss": 2.2732, + "step": 3279 + }, + { + "epoch": 0.9686946249261665, + "grad_norm": 0.20259868770503392, + "learning_rate": 2.5684612275784783e-07, + "loss": 2.4236, + "step": 3280 + }, + { + "epoch": 0.9689899586532782, + "grad_norm": 0.20252933644381216, + "learning_rate": 2.520268848409113e-07, + "loss": 2.2943, + "step": 3281 + }, + { + "epoch": 0.9692852923803899, + "grad_norm": 0.2053546131612031, + "learning_rate": 2.472531739001638e-07, + "loss": 2.3561, + "step": 3282 + }, + { + "epoch": 0.9695806261075015, + "grad_norm": 0.19633023224081902, + "learning_rate": 2.4252499430428286e-07, + "loss": 2.2781, + "step": 3283 + }, + { + "epoch": 0.9698759598346132, + "grad_norm": 0.20108584999803744, + "learning_rate": 2.378423503802796e-07, + "loss": 2.2706, + "step": 3284 + }, + { + "epoch": 0.9701712935617247, + "grad_norm": 0.21178261683927674, + "learning_rate": 2.3320524641347042e-07, + "loss": 2.2989, + "step": 3285 + }, + { + "epoch": 0.9704666272888364, + "grad_norm": 0.1999049523373924, + "learning_rate": 2.2861368664752748e-07, + "loss": 2.337, + "step": 3286 + }, + { + "epoch": 0.970761961015948, + "grad_norm": 0.20002112835802507, + "learning_rate": 2.240676752844173e-07, + "loss": 2.2822, + "step": 3287 + }, + { + "epoch": 0.9710572947430597, + "grad_norm": 0.19762508764874265, + "learning_rate": 2.1956721648443978e-07, + "loss": 2.3502, + "step": 3288 + }, + { + "epoch": 0.9713526284701713, + "grad_norm": 0.20309318696833392, + "learning_rate": 2.1511231436619484e-07, + "loss": 2.2992, + "step": 3289 + }, + { + "epoch": 0.9716479621972829, + "grad_norm": 0.19385014491561636, + "learning_rate": 2.1070297300660457e-07, + "loss": 2.2861, + "step": 3290 + }, + { + "epoch": 0.9719432959243945, + "grad_norm": 0.19578875649835742, + "learning_rate": 2.0633919644088007e-07, + "loss": 2.2808, + "step": 3291 + }, + { + "epoch": 0.9722386296515062, + "grad_norm": 0.21163822275504382, + "learning_rate": 2.0202098866254903e-07, + "loss": 2.3062, + "step": 3292 + }, + { + "epoch": 0.9725339633786179, + "grad_norm": 0.19528113113267845, + "learning_rate": 1.9774835362341703e-07, + "loss": 2.2718, + "step": 3293 + }, + { + "epoch": 0.9728292971057295, + "grad_norm": 0.20018382155772244, + "learning_rate": 1.9352129523361183e-07, + "loss": 2.3587, + "step": 3294 + }, + { + "epoch": 0.9731246308328411, + "grad_norm": 0.2089266652793472, + "learning_rate": 1.8933981736152795e-07, + "loss": 2.2533, + "step": 3295 + }, + { + "epoch": 0.9734199645599527, + "grad_norm": 0.2031212586278409, + "learning_rate": 1.852039238338543e-07, + "loss": 2.3329, + "step": 3296 + }, + { + "epoch": 0.9737152982870644, + "grad_norm": 0.19586795976751376, + "learning_rate": 1.8111361843556883e-07, + "loss": 2.32, + "step": 3297 + }, + { + "epoch": 0.974010632014176, + "grad_norm": 0.20340088302346548, + "learning_rate": 1.7706890490991612e-07, + "loss": 2.3499, + "step": 3298 + }, + { + "epoch": 0.9743059657412877, + "grad_norm": 0.20399623872875144, + "learning_rate": 1.7306978695843524e-07, + "loss": 2.3005, + "step": 3299 + }, + { + "epoch": 0.9746012994683992, + "grad_norm": 0.20455931478981512, + "learning_rate": 1.6911626824092085e-07, + "loss": 2.2826, + "step": 3300 + }, + { + "epoch": 0.9748966331955109, + "grad_norm": 0.20067243707464233, + "learning_rate": 1.6520835237544551e-07, + "loss": 2.3456, + "step": 3301 + }, + { + "epoch": 0.9751919669226226, + "grad_norm": 0.1990328276480799, + "learning_rate": 1.61346042938354e-07, + "loss": 2.3157, + "step": 3302 + }, + { + "epoch": 0.9754873006497342, + "grad_norm": 0.20282935672001343, + "learning_rate": 1.5752934346424108e-07, + "loss": 2.2588, + "step": 3303 + }, + { + "epoch": 0.9757826343768459, + "grad_norm": 0.19833482729391877, + "learning_rate": 1.5375825744596838e-07, + "loss": 2.3449, + "step": 3304 + }, + { + "epoch": 0.9760779681039575, + "grad_norm": 0.20700281775671403, + "learning_rate": 1.5003278833466418e-07, + "loss": 2.2693, + "step": 3305 + }, + { + "epoch": 0.9763733018310691, + "grad_norm": 0.20069525097776955, + "learning_rate": 1.463529395396901e-07, + "loss": 2.2871, + "step": 3306 + }, + { + "epoch": 0.9766686355581807, + "grad_norm": 0.1994140172915356, + "learning_rate": 1.4271871442866902e-07, + "loss": 2.3102, + "step": 3307 + }, + { + "epoch": 0.9769639692852924, + "grad_norm": 0.1997257214593218, + "learning_rate": 1.391301163274794e-07, + "loss": 2.3194, + "step": 3308 + }, + { + "epoch": 0.977259303012404, + "grad_norm": 0.1964643559201892, + "learning_rate": 1.3558714852022203e-07, + "loss": 2.3078, + "step": 3309 + }, + { + "epoch": 0.9775546367395157, + "grad_norm": 0.20522325012933168, + "learning_rate": 1.3208981424926435e-07, + "loss": 2.2478, + "step": 3310 + }, + { + "epoch": 0.9778499704666272, + "grad_norm": 0.20286133967211198, + "learning_rate": 1.2863811671519067e-07, + "loss": 2.2761, + "step": 3311 + }, + { + "epoch": 0.9781453041937389, + "grad_norm": 0.2034030075541528, + "learning_rate": 1.2523205907684076e-07, + "loss": 2.2949, + "step": 3312 + }, + { + "epoch": 0.9784406379208506, + "grad_norm": 0.19432671683208355, + "learning_rate": 1.2187164445126575e-07, + "loss": 2.3003, + "step": 3313 + }, + { + "epoch": 0.9787359716479622, + "grad_norm": 0.20012441170479098, + "learning_rate": 1.1855687591376675e-07, + "loss": 2.324, + "step": 3314 + }, + { + "epoch": 0.9790313053750739, + "grad_norm": 0.19816699739640595, + "learning_rate": 1.1528775649785606e-07, + "loss": 2.3294, + "step": 3315 + }, + { + "epoch": 0.9793266391021854, + "grad_norm": 0.2034253961944144, + "learning_rate": 1.120642891952739e-07, + "loss": 2.3061, + "step": 3316 + }, + { + "epoch": 0.9796219728292971, + "grad_norm": 0.2019813694931344, + "learning_rate": 1.0888647695598831e-07, + "loss": 2.2783, + "step": 3317 + }, + { + "epoch": 0.9799173065564087, + "grad_norm": 0.21010273425746037, + "learning_rate": 1.0575432268818408e-07, + "loss": 2.337, + "step": 3318 + }, + { + "epoch": 0.9802126402835204, + "grad_norm": 0.2825915210935061, + "learning_rate": 1.026678292582517e-07, + "loss": 2.2868, + "step": 3319 + }, + { + "epoch": 0.980507974010632, + "grad_norm": 0.20912154321897455, + "learning_rate": 9.9626999490815e-08, + "loss": 2.3148, + "step": 3320 + }, + { + "epoch": 0.9808033077377436, + "grad_norm": 0.19880709471335586, + "learning_rate": 9.663183616868132e-08, + "loss": 2.3325, + "step": 3321 + }, + { + "epoch": 0.9810986414648553, + "grad_norm": 0.1975070212872585, + "learning_rate": 9.368234203289695e-08, + "loss": 2.3202, + "step": 3322 + }, + { + "epoch": 0.9813939751919669, + "grad_norm": 0.20281616905608355, + "learning_rate": 9.077851978268604e-08, + "loss": 2.1774, + "step": 3323 + }, + { + "epoch": 0.9816893089190786, + "grad_norm": 0.19742993791582508, + "learning_rate": 8.792037207549509e-08, + "loss": 2.3374, + "step": 3324 + }, + { + "epoch": 0.9819846426461902, + "grad_norm": 0.21444853063779512, + "learning_rate": 8.510790152695957e-08, + "loss": 2.3079, + "step": 3325 + }, + { + "epoch": 0.9822799763733018, + "grad_norm": 0.21122078946141054, + "learning_rate": 8.234111071092621e-08, + "loss": 2.3246, + "step": 3326 + }, + { + "epoch": 0.9825753101004134, + "grad_norm": 0.20006475145478733, + "learning_rate": 7.962000215942512e-08, + "loss": 2.3602, + "step": 3327 + }, + { + "epoch": 0.9828706438275251, + "grad_norm": 0.2657527616326859, + "learning_rate": 7.694457836268653e-08, + "loss": 2.2105, + "step": 3328 + }, + { + "epoch": 0.9831659775546367, + "grad_norm": 0.20014101548178756, + "learning_rate": 7.431484176913528e-08, + "loss": 2.2411, + "step": 3329 + }, + { + "epoch": 0.9834613112817484, + "grad_norm": 0.20613786184818014, + "learning_rate": 7.173079478536849e-08, + "loss": 2.3306, + "step": 3330 + }, + { + "epoch": 0.9837566450088601, + "grad_norm": 0.1973859967386912, + "learning_rate": 6.919243977620005e-08, + "loss": 2.3094, + "step": 3331 + }, + { + "epoch": 0.9840519787359716, + "grad_norm": 0.19630111353685173, + "learning_rate": 6.669977906459402e-08, + "loss": 2.2863, + "step": 3332 + }, + { + "epoch": 0.9843473124630833, + "grad_norm": 0.19774305525785213, + "learning_rate": 6.425281493172564e-08, + "loss": 2.3701, + "step": 3333 + }, + { + "epoch": 0.9846426461901949, + "grad_norm": 0.1942818436984004, + "learning_rate": 6.18515496169425e-08, + "loss": 2.3049, + "step": 3334 + }, + { + "epoch": 0.9849379799173066, + "grad_norm": 0.20323355231709178, + "learning_rate": 5.9495985317759014e-08, + "loss": 2.2955, + "step": 3335 + }, + { + "epoch": 0.9852333136444182, + "grad_norm": 0.19547465268987393, + "learning_rate": 5.718612418987856e-08, + "loss": 2.2961, + "step": 3336 + }, + { + "epoch": 0.9855286473715298, + "grad_norm": 0.19939370046706517, + "learning_rate": 5.492196834718244e-08, + "loss": 2.2505, + "step": 3337 + }, + { + "epoch": 0.9858239810986414, + "grad_norm": 0.2589356349230607, + "learning_rate": 5.270351986170763e-08, + "loss": 2.231, + "step": 3338 + }, + { + "epoch": 0.9861193148257531, + "grad_norm": 0.19818477081012445, + "learning_rate": 5.053078076368012e-08, + "loss": 2.2725, + "step": 3339 + }, + { + "epoch": 0.9864146485528648, + "grad_norm": 0.19981418710091234, + "learning_rate": 4.840375304148714e-08, + "loss": 2.3083, + "step": 3340 + }, + { + "epoch": 0.9867099822799764, + "grad_norm": 0.20284281900780218, + "learning_rate": 4.632243864168273e-08, + "loss": 2.2649, + "step": 3341 + }, + { + "epoch": 0.987005316007088, + "grad_norm": 0.19638172259014836, + "learning_rate": 4.4286839468982155e-08, + "loss": 2.2948, + "step": 3342 + }, + { + "epoch": 0.9873006497341996, + "grad_norm": 0.20172264110142374, + "learning_rate": 4.229695738627304e-08, + "loss": 2.3776, + "step": 3343 + }, + { + "epoch": 0.9875959834613113, + "grad_norm": 0.19771365474263142, + "learning_rate": 4.03527942145987e-08, + "loss": 2.3372, + "step": 3344 + }, + { + "epoch": 0.9878913171884229, + "grad_norm": 0.19799573264315895, + "learning_rate": 3.84543517331637e-08, + "loss": 2.2951, + "step": 3345 + }, + { + "epoch": 0.9881866509155346, + "grad_norm": 0.19662942367192504, + "learning_rate": 3.6601631679339387e-08, + "loss": 2.3153, + "step": 3346 + }, + { + "epoch": 0.9884819846426461, + "grad_norm": 0.19927025110592078, + "learning_rate": 3.4794635748636175e-08, + "loss": 2.3097, + "step": 3347 + }, + { + "epoch": 0.9887773183697578, + "grad_norm": 0.19355748364135078, + "learning_rate": 3.303336559474235e-08, + "loss": 2.3463, + "step": 3348 + }, + { + "epoch": 0.9890726520968695, + "grad_norm": 0.1996205616035383, + "learning_rate": 3.1317822829479704e-08, + "loss": 2.3756, + "step": 3349 + }, + { + "epoch": 0.9893679858239811, + "grad_norm": 0.1939632733989877, + "learning_rate": 2.9648009022831268e-08, + "loss": 2.2859, + "step": 3350 + }, + { + "epoch": 0.9896633195510928, + "grad_norm": 0.1911438973473431, + "learning_rate": 2.802392570293577e-08, + "loss": 2.308, + "step": 3351 + }, + { + "epoch": 0.9899586532782044, + "grad_norm": 0.19852295147055432, + "learning_rate": 2.644557435607653e-08, + "loss": 2.3319, + "step": 3352 + }, + { + "epoch": 0.990253987005316, + "grad_norm": 0.19972594528356538, + "learning_rate": 2.4912956426681454e-08, + "loss": 2.3447, + "step": 3353 + }, + { + "epoch": 0.9905493207324276, + "grad_norm": 0.19794490679658566, + "learning_rate": 2.342607331733415e-08, + "loss": 2.248, + "step": 3354 + }, + { + "epoch": 0.9908446544595393, + "grad_norm": 0.20019705236655824, + "learning_rate": 2.1984926388757267e-08, + "loss": 2.2969, + "step": 3355 + }, + { + "epoch": 0.9911399881866509, + "grad_norm": 0.19544282379926353, + "learning_rate": 2.0589516959818033e-08, + "loss": 2.2909, + "step": 3356 + }, + { + "epoch": 0.9914353219137626, + "grad_norm": 0.20050453867079868, + "learning_rate": 1.9239846307533836e-08, + "loss": 2.2921, + "step": 3357 + }, + { + "epoch": 0.9917306556408741, + "grad_norm": 0.20018714329869092, + "learning_rate": 1.7935915667049995e-08, + "loss": 2.2718, + "step": 3358 + }, + { + "epoch": 0.9920259893679858, + "grad_norm": 0.19626902008217842, + "learning_rate": 1.6677726231673074e-08, + "loss": 2.3681, + "step": 3359 + }, + { + "epoch": 0.9923213230950975, + "grad_norm": 0.19973058958002, + "learning_rate": 1.546527915282647e-08, + "loss": 2.2424, + "step": 3360 + }, + { + "epoch": 0.9926166568222091, + "grad_norm": 0.19733387172558137, + "learning_rate": 1.4298575540094839e-08, + "loss": 2.2925, + "step": 3361 + }, + { + "epoch": 0.9929119905493208, + "grad_norm": 0.1905699091397115, + "learning_rate": 1.3177616461185206e-08, + "loss": 2.2309, + "step": 3362 + }, + { + "epoch": 0.9932073242764323, + "grad_norm": 0.19264625288247025, + "learning_rate": 1.2102402941943646e-08, + "loss": 2.3195, + "step": 3363 + }, + { + "epoch": 0.993502658003544, + "grad_norm": 0.21406769606973544, + "learning_rate": 1.107293596635528e-08, + "loss": 2.3802, + "step": 3364 + }, + { + "epoch": 0.9937979917306556, + "grad_norm": 0.197989213137293, + "learning_rate": 1.0089216476538709e-08, + "loss": 2.3931, + "step": 3365 + }, + { + "epoch": 0.9940933254577673, + "grad_norm": 0.19997583779771025, + "learning_rate": 9.151245372751583e-09, + "loss": 2.2966, + "step": 3366 + }, + { + "epoch": 0.994388659184879, + "grad_norm": 0.19331494215795664, + "learning_rate": 8.259023513379483e-09, + "loss": 2.3181, + "step": 3367 + }, + { + "epoch": 0.9946839929119905, + "grad_norm": 0.20393173781954518, + "learning_rate": 7.412551714935934e-09, + "loss": 2.3149, + "step": 3368 + }, + { + "epoch": 0.9949793266391022, + "grad_norm": 0.2078829287452291, + "learning_rate": 6.611830752079051e-09, + "loss": 2.2951, + "step": 3369 + }, + { + "epoch": 0.9952746603662138, + "grad_norm": 0.21082958357257028, + "learning_rate": 5.8568613575837875e-09, + "loss": 2.2569, + "step": 3370 + }, + { + "epoch": 0.9955699940933255, + "grad_norm": 0.21129444615477916, + "learning_rate": 5.147644222364134e-09, + "loss": 2.2745, + "step": 3371 + }, + { + "epoch": 0.9958653278204371, + "grad_norm": 0.19653679506798627, + "learning_rate": 4.484179995467575e-09, + "loss": 2.2815, + "step": 3372 + }, + { + "epoch": 0.9961606615475487, + "grad_norm": 0.21027973723324483, + "learning_rate": 3.866469284052876e-09, + "loss": 2.2725, + "step": 3373 + }, + { + "epoch": 0.9964559952746603, + "grad_norm": 0.2154067119752265, + "learning_rate": 3.2945126534345007e-09, + "loss": 2.2738, + "step": 3374 + }, + { + "epoch": 0.996751329001772, + "grad_norm": 0.19635351271145204, + "learning_rate": 2.768310627027093e-09, + "loss": 2.337, + "step": 3375 + }, + { + "epoch": 0.9970466627288836, + "grad_norm": 0.20090570711485875, + "learning_rate": 2.2878636863898907e-09, + "loss": 2.3521, + "step": 3376 + }, + { + "epoch": 0.9973419964559953, + "grad_norm": 0.19755178412459953, + "learning_rate": 1.8531722712100686e-09, + "loss": 2.3246, + "step": 3377 + }, + { + "epoch": 0.997637330183107, + "grad_norm": 0.21671953626076138, + "learning_rate": 1.464236779291639e-09, + "loss": 2.2896, + "step": 3378 + }, + { + "epoch": 0.9979326639102185, + "grad_norm": 0.20289922968465343, + "learning_rate": 1.1210575665665523e-09, + "loss": 2.3367, + "step": 3379 + }, + { + "epoch": 0.9982279976373302, + "grad_norm": 0.20316175898482108, + "learning_rate": 8.236349471058002e-10, + "loss": 2.2826, + "step": 3380 + }, + { + "epoch": 0.9985233313644418, + "grad_norm": 0.19848370990222824, + "learning_rate": 5.719691930861081e-10, + "loss": 2.3616, + "step": 3381 + }, + { + "epoch": 0.9988186650915535, + "grad_norm": 0.2001613215921407, + "learning_rate": 3.6606053482324265e-10, + "loss": 2.2905, + "step": 3382 + }, + { + "epoch": 0.9991139988186651, + "grad_norm": 0.20481899784083965, + "learning_rate": 2.059091607609087e-10, + "loss": 2.3647, + "step": 3383 + }, + { + "epoch": 0.9994093325457767, + "grad_norm": 0.20511726732457702, + "learning_rate": 9.151521745409674e-11, + "loss": 2.3038, + "step": 3384 + }, + { + "epoch": 0.9997046662728883, + "grad_norm": 0.20300248008948915, + "learning_rate": 2.287880959683797e-11, + "loss": 2.3683, + "step": 3385 + }, + { + "epoch": 1.0, + "grad_norm": 0.19207395168826688, + "learning_rate": 0.0, + "loss": 2.3321, + "step": 3386 + } + ], + "logging_steps": 1, + "max_steps": 3386, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 4000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6576962877259776.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}