diff --git "a/LLama-3-6.6B-R-Pruned-checkpoint-24880/trainer_state.json" "b/LLama-3-6.6B-R-Pruned-checkpoint-24880/trainer_state.json" new file mode 100644--- /dev/null +++ "b/LLama-3-6.6B-R-Pruned-checkpoint-24880/trainer_state.json" @@ -0,0 +1,17552 @@ +{ + "best_metric": 1.203926920890808, + "best_model_checkpoint": "finetune-checkpoints/checkpoint-24000", + "epoch": 2.0, + "eval_steps": 2000, + "global_step": 24880, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.038585209003216e-05, + "grad_norm": 1.276573896408081, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.6908, + "step": 1 + }, + { + "epoch": 0.0008038585209003215, + "grad_norm": 1.557997465133667, + "learning_rate": 1e-05, + "loss": 1.8209, + "step": 10 + }, + { + "epoch": 0.001607717041800643, + "grad_norm": 1.4084455966949463, + "learning_rate": 2e-05, + "loss": 1.7609, + "step": 20 + }, + { + "epoch": 0.002411575562700965, + "grad_norm": 1.5869466066360474, + "learning_rate": 3e-05, + "loss": 1.6485, + "step": 30 + }, + { + "epoch": 0.003215434083601286, + "grad_norm": 2.227403163909912, + "learning_rate": 4e-05, + "loss": 1.7366, + "step": 40 + }, + { + "epoch": 0.0040192926045016075, + "grad_norm": 1.7820078134536743, + "learning_rate": 5e-05, + "loss": 1.5889, + "step": 50 + }, + { + "epoch": 0.00482315112540193, + "grad_norm": 1.197980284690857, + "learning_rate": 5.9e-05, + "loss": 1.6265, + "step": 60 + }, + { + "epoch": 0.005627009646302251, + "grad_norm": 1.1902364492416382, + "learning_rate": 6.9e-05, + "loss": 1.5198, + "step": 70 + }, + { + "epoch": 0.006430868167202572, + "grad_norm": 2.68631649017334, + "learning_rate": 7.900000000000001e-05, + "loss": 1.3046, + "step": 80 + }, + { + "epoch": 0.007234726688102894, + "grad_norm": 1.3876135349273682, + "learning_rate": 8.900000000000001e-05, + "loss": 1.4239, + "step": 90 + }, + { + "epoch": 0.008038585209003215, + "grad_norm": 1.8347593545913696, + "learning_rate": 9.900000000000001e-05, + "loss": 1.4361, + "step": 100 + }, + { + "epoch": 0.008842443729903537, + "grad_norm": 1.8065059185028076, + "learning_rate": 9.99636803874092e-05, + "loss": 1.4075, + "step": 110 + }, + { + "epoch": 0.00964630225080386, + "grad_norm": 1.6402571201324463, + "learning_rate": 9.992332526230832e-05, + "loss": 1.4785, + "step": 120 + }, + { + "epoch": 0.01045016077170418, + "grad_norm": 1.8013700246810913, + "learning_rate": 9.988297013720743e-05, + "loss": 1.3207, + "step": 130 + }, + { + "epoch": 0.011254019292604502, + "grad_norm": 3.1454508304595947, + "learning_rate": 9.984261501210655e-05, + "loss": 1.4448, + "step": 140 + }, + { + "epoch": 0.012057877813504822, + "grad_norm": 2.474430799484253, + "learning_rate": 9.980225988700565e-05, + "loss": 1.3956, + "step": 150 + }, + { + "epoch": 0.012861736334405145, + "grad_norm": 1.4373944997787476, + "learning_rate": 9.976190476190477e-05, + "loss": 1.2823, + "step": 160 + }, + { + "epoch": 0.013665594855305467, + "grad_norm": 2.847407817840576, + "learning_rate": 9.972154963680387e-05, + "loss": 1.3352, + "step": 170 + }, + { + "epoch": 0.014469453376205787, + "grad_norm": 3.7937076091766357, + "learning_rate": 9.968119451170299e-05, + "loss": 1.2526, + "step": 180 + }, + { + "epoch": 0.01527331189710611, + "grad_norm": 1.221319556236267, + "learning_rate": 9.96408393866021e-05, + "loss": 1.4984, + "step": 190 + }, + { + "epoch": 0.01607717041800643, + "grad_norm": 1.652315378189087, + "learning_rate": 9.960048426150121e-05, + "loss": 
1.3169, + "step": 200 + }, + { + "epoch": 0.016881028938906754, + "grad_norm": 1.7962154150009155, + "learning_rate": 9.956012913640033e-05, + "loss": 1.3491, + "step": 210 + }, + { + "epoch": 0.017684887459807074, + "grad_norm": 1.4589283466339111, + "learning_rate": 9.951977401129944e-05, + "loss": 1.3256, + "step": 220 + }, + { + "epoch": 0.018488745980707395, + "grad_norm": 1.7330913543701172, + "learning_rate": 9.947941888619856e-05, + "loss": 1.2179, + "step": 230 + }, + { + "epoch": 0.01929260450160772, + "grad_norm": 1.4680728912353516, + "learning_rate": 9.943906376109766e-05, + "loss": 1.4964, + "step": 240 + }, + { + "epoch": 0.02009646302250804, + "grad_norm": 1.7182399034500122, + "learning_rate": 9.939870863599678e-05, + "loss": 1.4423, + "step": 250 + }, + { + "epoch": 0.02090032154340836, + "grad_norm": 1.5316752195358276, + "learning_rate": 9.935835351089588e-05, + "loss": 1.2597, + "step": 260 + }, + { + "epoch": 0.021704180064308683, + "grad_norm": 1.4161688089370728, + "learning_rate": 9.9317998385795e-05, + "loss": 1.3918, + "step": 270 + }, + { + "epoch": 0.022508038585209004, + "grad_norm": 2.1475634574890137, + "learning_rate": 9.92776432606941e-05, + "loss": 1.3015, + "step": 280 + }, + { + "epoch": 0.023311897106109324, + "grad_norm": 1.2509418725967407, + "learning_rate": 9.923728813559322e-05, + "loss": 1.3341, + "step": 290 + }, + { + "epoch": 0.024115755627009645, + "grad_norm": 1.3515390157699585, + "learning_rate": 9.919693301049233e-05, + "loss": 1.3216, + "step": 300 + }, + { + "epoch": 0.02491961414790997, + "grad_norm": 1.5757758617401123, + "learning_rate": 9.915657788539145e-05, + "loss": 1.3335, + "step": 310 + }, + { + "epoch": 0.02572347266881029, + "grad_norm": 1.8854318857192993, + "learning_rate": 9.911622276029056e-05, + "loss": 1.3985, + "step": 320 + }, + { + "epoch": 0.02652733118971061, + "grad_norm": 1.7211475372314453, + "learning_rate": 9.907586763518968e-05, + "loss": 1.3353, + "step": 330 + }, + { + "epoch": 0.027331189710610933, + "grad_norm": 1.5294115543365479, + "learning_rate": 9.903551251008879e-05, + "loss": 1.3076, + "step": 340 + }, + { + "epoch": 0.028135048231511254, + "grad_norm": 1.2507871389389038, + "learning_rate": 9.89951573849879e-05, + "loss": 1.3118, + "step": 350 + }, + { + "epoch": 0.028938906752411574, + "grad_norm": 1.4744722843170166, + "learning_rate": 9.895480225988701e-05, + "loss": 1.3776, + "step": 360 + }, + { + "epoch": 0.0297427652733119, + "grad_norm": 1.4481561183929443, + "learning_rate": 9.891444713478613e-05, + "loss": 1.3754, + "step": 370 + }, + { + "epoch": 0.03054662379421222, + "grad_norm": 2.999114751815796, + "learning_rate": 9.887409200968523e-05, + "loss": 1.3131, + "step": 380 + }, + { + "epoch": 0.03135048231511254, + "grad_norm": 1.4064487218856812, + "learning_rate": 9.883373688458435e-05, + "loss": 1.4513, + "step": 390 + }, + { + "epoch": 0.03215434083601286, + "grad_norm": 1.2479560375213623, + "learning_rate": 9.879338175948346e-05, + "loss": 1.411, + "step": 400 + }, + { + "epoch": 0.03295819935691318, + "grad_norm": 3.228004217147827, + "learning_rate": 9.875302663438257e-05, + "loss": 1.355, + "step": 410 + }, + { + "epoch": 0.03376205787781351, + "grad_norm": 2.090229034423828, + "learning_rate": 9.871267150928168e-05, + "loss": 1.3551, + "step": 420 + }, + { + "epoch": 0.03456591639871383, + "grad_norm": 1.4778763055801392, + "learning_rate": 9.86723163841808e-05, + "loss": 1.3192, + "step": 430 + }, + { + "epoch": 0.03536977491961415, + "grad_norm": 1.4477226734161377, + 
"learning_rate": 9.863196125907991e-05, + "loss": 1.3218, + "step": 440 + }, + { + "epoch": 0.03617363344051447, + "grad_norm": 2.188595771789551, + "learning_rate": 9.859160613397902e-05, + "loss": 1.2708, + "step": 450 + }, + { + "epoch": 0.03697749196141479, + "grad_norm": 1.2476807832717896, + "learning_rate": 9.855125100887814e-05, + "loss": 1.3206, + "step": 460 + }, + { + "epoch": 0.03778135048231511, + "grad_norm": 1.279171109199524, + "learning_rate": 9.851089588377724e-05, + "loss": 1.2349, + "step": 470 + }, + { + "epoch": 0.03858520900321544, + "grad_norm": 1.156655192375183, + "learning_rate": 9.847054075867636e-05, + "loss": 1.2511, + "step": 480 + }, + { + "epoch": 0.03938906752411576, + "grad_norm": 1.1249313354492188, + "learning_rate": 9.843018563357546e-05, + "loss": 1.2895, + "step": 490 + }, + { + "epoch": 0.04019292604501608, + "grad_norm": 2.300807237625122, + "learning_rate": 9.838983050847458e-05, + "loss": 1.2176, + "step": 500 + }, + { + "epoch": 0.0409967845659164, + "grad_norm": 1.7253382205963135, + "learning_rate": 9.834947538337369e-05, + "loss": 1.2453, + "step": 510 + }, + { + "epoch": 0.04180064308681672, + "grad_norm": 1.288348913192749, + "learning_rate": 9.83091202582728e-05, + "loss": 1.3414, + "step": 520 + }, + { + "epoch": 0.04260450160771704, + "grad_norm": 1.2001973390579224, + "learning_rate": 9.826876513317191e-05, + "loss": 1.228, + "step": 530 + }, + { + "epoch": 0.04340836012861737, + "grad_norm": 1.375143051147461, + "learning_rate": 9.822841000807103e-05, + "loss": 1.4148, + "step": 540 + }, + { + "epoch": 0.04421221864951769, + "grad_norm": 2.0278618335723877, + "learning_rate": 9.818805488297013e-05, + "loss": 1.3985, + "step": 550 + }, + { + "epoch": 0.04501607717041801, + "grad_norm": 1.3718616962432861, + "learning_rate": 9.814769975786925e-05, + "loss": 1.2803, + "step": 560 + }, + { + "epoch": 0.04581993569131833, + "grad_norm": 1.0574828386306763, + "learning_rate": 9.810734463276836e-05, + "loss": 1.157, + "step": 570 + }, + { + "epoch": 0.04662379421221865, + "grad_norm": 1.2841317653656006, + "learning_rate": 9.806698950766749e-05, + "loss": 1.3258, + "step": 580 + }, + { + "epoch": 0.04742765273311897, + "grad_norm": 1.2227920293807983, + "learning_rate": 9.802663438256659e-05, + "loss": 1.3772, + "step": 590 + }, + { + "epoch": 0.04823151125401929, + "grad_norm": 1.8267626762390137, + "learning_rate": 9.798627925746571e-05, + "loss": 1.4531, + "step": 600 + }, + { + "epoch": 0.04903536977491962, + "grad_norm": 1.2855032682418823, + "learning_rate": 9.794592413236481e-05, + "loss": 1.3173, + "step": 610 + }, + { + "epoch": 0.04983922829581994, + "grad_norm": 1.6092931032180786, + "learning_rate": 9.790556900726393e-05, + "loss": 1.2729, + "step": 620 + }, + { + "epoch": 0.05064308681672026, + "grad_norm": 1.3694730997085571, + "learning_rate": 9.786521388216304e-05, + "loss": 1.3944, + "step": 630 + }, + { + "epoch": 0.05144694533762058, + "grad_norm": 1.0532931089401245, + "learning_rate": 9.782485875706216e-05, + "loss": 1.404, + "step": 640 + }, + { + "epoch": 0.0522508038585209, + "grad_norm": 2.206791877746582, + "learning_rate": 9.778450363196126e-05, + "loss": 1.2867, + "step": 650 + }, + { + "epoch": 0.05305466237942122, + "grad_norm": 1.2418971061706543, + "learning_rate": 9.774414850686038e-05, + "loss": 1.3017, + "step": 660 + }, + { + "epoch": 0.053858520900321546, + "grad_norm": 1.8438310623168945, + "learning_rate": 9.77037933817595e-05, + "loss": 1.2832, + "step": 670 + }, + { + "epoch": 0.05466237942122187, + 
"grad_norm": 1.0188833475112915, + "learning_rate": 9.76634382566586e-05, + "loss": 1.3811, + "step": 680 + }, + { + "epoch": 0.05546623794212219, + "grad_norm": 1.3860092163085938, + "learning_rate": 9.762308313155772e-05, + "loss": 1.2406, + "step": 690 + }, + { + "epoch": 0.05627009646302251, + "grad_norm": 1.2227293252944946, + "learning_rate": 9.758272800645682e-05, + "loss": 1.3276, + "step": 700 + }, + { + "epoch": 0.05707395498392283, + "grad_norm": 1.1025124788284302, + "learning_rate": 9.754237288135594e-05, + "loss": 1.3964, + "step": 710 + }, + { + "epoch": 0.05787781350482315, + "grad_norm": 1.0877604484558105, + "learning_rate": 9.750201775625505e-05, + "loss": 1.3527, + "step": 720 + }, + { + "epoch": 0.058681672025723476, + "grad_norm": 1.8921070098876953, + "learning_rate": 9.746166263115416e-05, + "loss": 1.2887, + "step": 730 + }, + { + "epoch": 0.0594855305466238, + "grad_norm": 1.6759284734725952, + "learning_rate": 9.742130750605327e-05, + "loss": 1.3984, + "step": 740 + }, + { + "epoch": 0.06028938906752412, + "grad_norm": 0.9962583780288696, + "learning_rate": 9.738095238095239e-05, + "loss": 1.2688, + "step": 750 + }, + { + "epoch": 0.06109324758842444, + "grad_norm": 2.284313917160034, + "learning_rate": 9.734059725585149e-05, + "loss": 1.1844, + "step": 760 + }, + { + "epoch": 0.06189710610932476, + "grad_norm": 1.3981996774673462, + "learning_rate": 9.730024213075061e-05, + "loss": 1.3251, + "step": 770 + }, + { + "epoch": 0.06270096463022508, + "grad_norm": 1.0155847072601318, + "learning_rate": 9.725988700564971e-05, + "loss": 1.1357, + "step": 780 + }, + { + "epoch": 0.0635048231511254, + "grad_norm": 1.4152178764343262, + "learning_rate": 9.721953188054883e-05, + "loss": 1.2166, + "step": 790 + }, + { + "epoch": 0.06430868167202572, + "grad_norm": 1.5149229764938354, + "learning_rate": 9.717917675544794e-05, + "loss": 1.176, + "step": 800 + }, + { + "epoch": 0.06511254019292605, + "grad_norm": 1.0455445051193237, + "learning_rate": 9.713882163034706e-05, + "loss": 1.2231, + "step": 810 + }, + { + "epoch": 0.06591639871382636, + "grad_norm": 1.8290674686431885, + "learning_rate": 9.709846650524616e-05, + "loss": 1.3406, + "step": 820 + }, + { + "epoch": 0.06672025723472669, + "grad_norm": 1.7942872047424316, + "learning_rate": 9.705811138014528e-05, + "loss": 1.4025, + "step": 830 + }, + { + "epoch": 0.06752411575562701, + "grad_norm": 1.2038955688476562, + "learning_rate": 9.70177562550444e-05, + "loss": 1.2591, + "step": 840 + }, + { + "epoch": 0.06832797427652733, + "grad_norm": 1.5919041633605957, + "learning_rate": 9.697740112994351e-05, + "loss": 1.3773, + "step": 850 + }, + { + "epoch": 0.06913183279742766, + "grad_norm": 1.2200759649276733, + "learning_rate": 9.693704600484262e-05, + "loss": 1.326, + "step": 860 + }, + { + "epoch": 0.06993569131832797, + "grad_norm": 1.3108922243118286, + "learning_rate": 9.689669087974174e-05, + "loss": 1.3047, + "step": 870 + }, + { + "epoch": 0.0707395498392283, + "grad_norm": 1.160473346710205, + "learning_rate": 9.685633575464084e-05, + "loss": 1.2789, + "step": 880 + }, + { + "epoch": 0.07154340836012862, + "grad_norm": 1.5282626152038574, + "learning_rate": 9.681598062953996e-05, + "loss": 1.1021, + "step": 890 + }, + { + "epoch": 0.07234726688102894, + "grad_norm": 2.892331838607788, + "learning_rate": 9.677562550443908e-05, + "loss": 1.3719, + "step": 900 + }, + { + "epoch": 0.07315112540192927, + "grad_norm": 1.1138297319412231, + "learning_rate": 9.673527037933818e-05, + "loss": 1.1974, + "step": 910 + }, + 
{ + "epoch": 0.07395498392282958, + "grad_norm": 1.3960531949996948, + "learning_rate": 9.66949152542373e-05, + "loss": 1.3427, + "step": 920 + }, + { + "epoch": 0.0747588424437299, + "grad_norm": 1.2433034181594849, + "learning_rate": 9.66545601291364e-05, + "loss": 1.2951, + "step": 930 + }, + { + "epoch": 0.07556270096463022, + "grad_norm": 1.0800739526748657, + "learning_rate": 9.661420500403552e-05, + "loss": 1.2975, + "step": 940 + }, + { + "epoch": 0.07636655948553055, + "grad_norm": 2.4013397693634033, + "learning_rate": 9.657384987893463e-05, + "loss": 1.382, + "step": 950 + }, + { + "epoch": 0.07717041800643087, + "grad_norm": 1.9743378162384033, + "learning_rate": 9.653349475383375e-05, + "loss": 1.3482, + "step": 960 + }, + { + "epoch": 0.07797427652733119, + "grad_norm": 1.1982871294021606, + "learning_rate": 9.649313962873285e-05, + "loss": 1.432, + "step": 970 + }, + { + "epoch": 0.07877813504823152, + "grad_norm": 1.0801359415054321, + "learning_rate": 9.645278450363197e-05, + "loss": 1.2414, + "step": 980 + }, + { + "epoch": 0.07958199356913183, + "grad_norm": 1.329847812652588, + "learning_rate": 9.641242937853107e-05, + "loss": 1.2838, + "step": 990 + }, + { + "epoch": 0.08038585209003216, + "grad_norm": 2.138526678085327, + "learning_rate": 9.637207425343019e-05, + "loss": 1.3736, + "step": 1000 + }, + { + "epoch": 0.08118971061093247, + "grad_norm": 1.342236876487732, + "learning_rate": 9.63317191283293e-05, + "loss": 1.3163, + "step": 1010 + }, + { + "epoch": 0.0819935691318328, + "grad_norm": 2.2968924045562744, + "learning_rate": 9.629136400322841e-05, + "loss": 1.3696, + "step": 1020 + }, + { + "epoch": 0.08279742765273312, + "grad_norm": 1.244811773300171, + "learning_rate": 9.625100887812752e-05, + "loss": 1.196, + "step": 1030 + }, + { + "epoch": 0.08360128617363344, + "grad_norm": 1.9975718259811401, + "learning_rate": 9.621065375302664e-05, + "loss": 1.2744, + "step": 1040 + }, + { + "epoch": 0.08440514469453377, + "grad_norm": 1.5397216081619263, + "learning_rate": 9.617029862792574e-05, + "loss": 1.2667, + "step": 1050 + }, + { + "epoch": 0.08520900321543408, + "grad_norm": 1.3428692817687988, + "learning_rate": 9.612994350282486e-05, + "loss": 1.1593, + "step": 1060 + }, + { + "epoch": 0.0860128617363344, + "grad_norm": 1.1173874139785767, + "learning_rate": 9.608958837772398e-05, + "loss": 1.3026, + "step": 1070 + }, + { + "epoch": 0.08681672025723473, + "grad_norm": 2.408698081970215, + "learning_rate": 9.604923325262308e-05, + "loss": 1.2274, + "step": 1080 + }, + { + "epoch": 0.08762057877813505, + "grad_norm": 2.4228882789611816, + "learning_rate": 9.60088781275222e-05, + "loss": 1.1629, + "step": 1090 + }, + { + "epoch": 0.08842443729903537, + "grad_norm": 1.572268009185791, + "learning_rate": 9.59685230024213e-05, + "loss": 1.214, + "step": 1100 + }, + { + "epoch": 0.08922829581993569, + "grad_norm": 1.5269432067871094, + "learning_rate": 9.592816787732042e-05, + "loss": 1.2515, + "step": 1110 + }, + { + "epoch": 0.09003215434083602, + "grad_norm": 1.0226801633834839, + "learning_rate": 9.588781275221954e-05, + "loss": 1.2765, + "step": 1120 + }, + { + "epoch": 0.09083601286173633, + "grad_norm": 1.3933486938476562, + "learning_rate": 9.584745762711866e-05, + "loss": 1.1333, + "step": 1130 + }, + { + "epoch": 0.09163987138263666, + "grad_norm": 1.4505245685577393, + "learning_rate": 9.580710250201776e-05, + "loss": 1.2681, + "step": 1140 + }, + { + "epoch": 0.09244372990353698, + "grad_norm": 0.9488227963447571, + "learning_rate": 
9.576674737691688e-05, + "loss": 1.2356, + "step": 1150 + }, + { + "epoch": 0.0932475884244373, + "grad_norm": 1.6163320541381836, + "learning_rate": 9.572639225181599e-05, + "loss": 1.3831, + "step": 1160 + }, + { + "epoch": 0.09405144694533762, + "grad_norm": 2.110747814178467, + "learning_rate": 9.56860371267151e-05, + "loss": 1.3194, + "step": 1170 + }, + { + "epoch": 0.09485530546623794, + "grad_norm": 1.251051902770996, + "learning_rate": 9.564568200161421e-05, + "loss": 1.2746, + "step": 1180 + }, + { + "epoch": 0.09565916398713827, + "grad_norm": 0.9723356366157532, + "learning_rate": 9.560532687651333e-05, + "loss": 1.323, + "step": 1190 + }, + { + "epoch": 0.09646302250803858, + "grad_norm": 1.8033671379089355, + "learning_rate": 9.556497175141243e-05, + "loss": 1.1776, + "step": 1200 + }, + { + "epoch": 0.0972668810289389, + "grad_norm": 1.8983824253082275, + "learning_rate": 9.552461662631155e-05, + "loss": 1.251, + "step": 1210 + }, + { + "epoch": 0.09807073954983923, + "grad_norm": 1.7401349544525146, + "learning_rate": 9.548426150121066e-05, + "loss": 1.2079, + "step": 1220 + }, + { + "epoch": 0.09887459807073955, + "grad_norm": 1.1335679292678833, + "learning_rate": 9.544390637610977e-05, + "loss": 1.1727, + "step": 1230 + }, + { + "epoch": 0.09967845659163987, + "grad_norm": 1.8288633823394775, + "learning_rate": 9.540355125100888e-05, + "loss": 1.2381, + "step": 1240 + }, + { + "epoch": 0.10048231511254019, + "grad_norm": 1.2624421119689941, + "learning_rate": 9.5363196125908e-05, + "loss": 1.3436, + "step": 1250 + }, + { + "epoch": 0.10128617363344052, + "grad_norm": 1.6780205965042114, + "learning_rate": 9.53228410008071e-05, + "loss": 1.2763, + "step": 1260 + }, + { + "epoch": 0.10209003215434084, + "grad_norm": 1.6315609216690063, + "learning_rate": 9.528248587570622e-05, + "loss": 1.2907, + "step": 1270 + }, + { + "epoch": 0.10289389067524116, + "grad_norm": 1.821751594543457, + "learning_rate": 9.524213075060532e-05, + "loss": 1.3109, + "step": 1280 + }, + { + "epoch": 0.10369774919614148, + "grad_norm": 1.9027079343795776, + "learning_rate": 9.520177562550444e-05, + "loss": 1.2388, + "step": 1290 + }, + { + "epoch": 0.1045016077170418, + "grad_norm": 0.9719260334968567, + "learning_rate": 9.516142050040356e-05, + "loss": 1.325, + "step": 1300 + }, + { + "epoch": 0.10530546623794212, + "grad_norm": 1.5324716567993164, + "learning_rate": 9.512106537530266e-05, + "loss": 1.2834, + "step": 1310 + }, + { + "epoch": 0.10610932475884244, + "grad_norm": 1.4811768531799316, + "learning_rate": 9.508071025020178e-05, + "loss": 1.2229, + "step": 1320 + }, + { + "epoch": 0.10691318327974277, + "grad_norm": 1.5108731985092163, + "learning_rate": 9.504035512510089e-05, + "loss": 1.326, + "step": 1330 + }, + { + "epoch": 0.10771704180064309, + "grad_norm": 1.3529566526412964, + "learning_rate": 9.5e-05, + "loss": 1.1673, + "step": 1340 + }, + { + "epoch": 0.1085209003215434, + "grad_norm": 1.7689844369888306, + "learning_rate": 9.495964487489911e-05, + "loss": 1.1217, + "step": 1350 + }, + { + "epoch": 0.10932475884244373, + "grad_norm": 1.4722760915756226, + "learning_rate": 9.491928974979823e-05, + "loss": 1.3527, + "step": 1360 + }, + { + "epoch": 0.11012861736334405, + "grad_norm": 1.8713065385818481, + "learning_rate": 9.487893462469735e-05, + "loss": 1.2872, + "step": 1370 + }, + { + "epoch": 0.11093247588424437, + "grad_norm": 1.243556261062622, + "learning_rate": 9.483857949959646e-05, + "loss": 1.2113, + "step": 1380 + }, + { + "epoch": 0.11173633440514469, + "grad_norm": 
1.129428505897522, + "learning_rate": 9.479822437449557e-05, + "loss": 1.2766, + "step": 1390 + }, + { + "epoch": 0.11254019292604502, + "grad_norm": 2.1960277557373047, + "learning_rate": 9.475786924939469e-05, + "loss": 1.3027, + "step": 1400 + }, + { + "epoch": 0.11334405144694534, + "grad_norm": 1.2851146459579468, + "learning_rate": 9.471751412429379e-05, + "loss": 1.2294, + "step": 1410 + }, + { + "epoch": 0.11414790996784566, + "grad_norm": 1.365645170211792, + "learning_rate": 9.467715899919291e-05, + "loss": 1.4006, + "step": 1420 + }, + { + "epoch": 0.11495176848874598, + "grad_norm": 1.5098659992218018, + "learning_rate": 9.463680387409201e-05, + "loss": 1.3402, + "step": 1430 + }, + { + "epoch": 0.1157556270096463, + "grad_norm": 1.6049816608428955, + "learning_rate": 9.459644874899113e-05, + "loss": 1.3056, + "step": 1440 + }, + { + "epoch": 0.11655948553054662, + "grad_norm": 1.306289553642273, + "learning_rate": 9.455609362389024e-05, + "loss": 1.3096, + "step": 1450 + }, + { + "epoch": 0.11736334405144695, + "grad_norm": 1.9932974576950073, + "learning_rate": 9.451573849878936e-05, + "loss": 1.3128, + "step": 1460 + }, + { + "epoch": 0.11816720257234727, + "grad_norm": 1.2543416023254395, + "learning_rate": 9.447538337368846e-05, + "loss": 1.2531, + "step": 1470 + }, + { + "epoch": 0.1189710610932476, + "grad_norm": 1.874577522277832, + "learning_rate": 9.443502824858758e-05, + "loss": 1.4227, + "step": 1480 + }, + { + "epoch": 0.1197749196141479, + "grad_norm": 1.5035979747772217, + "learning_rate": 9.439467312348668e-05, + "loss": 1.3016, + "step": 1490 + }, + { + "epoch": 0.12057877813504823, + "grad_norm": 0.9851014614105225, + "learning_rate": 9.43543179983858e-05, + "loss": 1.2965, + "step": 1500 + }, + { + "epoch": 0.12138263665594855, + "grad_norm": 1.2649171352386475, + "learning_rate": 9.43139628732849e-05, + "loss": 1.2061, + "step": 1510 + }, + { + "epoch": 0.12218649517684887, + "grad_norm": 2.586120843887329, + "learning_rate": 9.427360774818402e-05, + "loss": 1.2258, + "step": 1520 + }, + { + "epoch": 0.1229903536977492, + "grad_norm": 1.7649742364883423, + "learning_rate": 9.423325262308314e-05, + "loss": 1.2618, + "step": 1530 + }, + { + "epoch": 0.12379421221864952, + "grad_norm": 1.172960877418518, + "learning_rate": 9.419289749798225e-05, + "loss": 1.2505, + "step": 1540 + }, + { + "epoch": 0.12459807073954984, + "grad_norm": 1.0227694511413574, + "learning_rate": 9.415254237288136e-05, + "loss": 1.1528, + "step": 1550 + }, + { + "epoch": 0.12540192926045016, + "grad_norm": 1.686591386795044, + "learning_rate": 9.411218724778047e-05, + "loss": 1.2418, + "step": 1560 + }, + { + "epoch": 0.12620578778135047, + "grad_norm": 1.7634520530700684, + "learning_rate": 9.407183212267959e-05, + "loss": 1.2923, + "step": 1570 + }, + { + "epoch": 0.1270096463022508, + "grad_norm": 1.1225353479385376, + "learning_rate": 9.403147699757869e-05, + "loss": 1.3068, + "step": 1580 + }, + { + "epoch": 0.12781350482315113, + "grad_norm": 1.3780168294906616, + "learning_rate": 9.399112187247781e-05, + "loss": 1.2754, + "step": 1590 + }, + { + "epoch": 0.12861736334405144, + "grad_norm": 3.1708788871765137, + "learning_rate": 9.395076674737691e-05, + "loss": 1.2333, + "step": 1600 + }, + { + "epoch": 0.12942122186495178, + "grad_norm": 1.2919507026672363, + "learning_rate": 9.391041162227603e-05, + "loss": 1.269, + "step": 1610 + }, + { + "epoch": 0.1302250803858521, + "grad_norm": 1.1988024711608887, + "learning_rate": 9.387005649717514e-05, + "loss": 1.3041, + "step": 1620 + 
}, + { + "epoch": 0.1310289389067524, + "grad_norm": 1.3553966283798218, + "learning_rate": 9.382970137207427e-05, + "loss": 1.36, + "step": 1630 + }, + { + "epoch": 0.13183279742765272, + "grad_norm": 1.6519627571105957, + "learning_rate": 9.378934624697337e-05, + "loss": 1.2687, + "step": 1640 + }, + { + "epoch": 0.13263665594855306, + "grad_norm": 1.3935558795928955, + "learning_rate": 9.374899112187249e-05, + "loss": 1.2566, + "step": 1650 + }, + { + "epoch": 0.13344051446945338, + "grad_norm": 1.8146889209747314, + "learning_rate": 9.37086359967716e-05, + "loss": 1.3103, + "step": 1660 + }, + { + "epoch": 0.1342443729903537, + "grad_norm": 1.1725800037384033, + "learning_rate": 9.366828087167071e-05, + "loss": 1.28, + "step": 1670 + }, + { + "epoch": 0.13504823151125403, + "grad_norm": 1.4799379110336304, + "learning_rate": 9.362792574656982e-05, + "loss": 1.2874, + "step": 1680 + }, + { + "epoch": 0.13585209003215434, + "grad_norm": 1.1668813228607178, + "learning_rate": 9.358757062146894e-05, + "loss": 1.2449, + "step": 1690 + }, + { + "epoch": 0.13665594855305466, + "grad_norm": 1.3387470245361328, + "learning_rate": 9.354721549636804e-05, + "loss": 1.336, + "step": 1700 + }, + { + "epoch": 0.13745980707395497, + "grad_norm": 1.42355477809906, + "learning_rate": 9.350686037126716e-05, + "loss": 1.3606, + "step": 1710 + }, + { + "epoch": 0.1382636655948553, + "grad_norm": 1.2502082586288452, + "learning_rate": 9.346650524616626e-05, + "loss": 1.2905, + "step": 1720 + }, + { + "epoch": 0.13906752411575563, + "grad_norm": 1.7439844608306885, + "learning_rate": 9.342615012106538e-05, + "loss": 1.2083, + "step": 1730 + }, + { + "epoch": 0.13987138263665594, + "grad_norm": 0.9318333268165588, + "learning_rate": 9.338579499596449e-05, + "loss": 1.2035, + "step": 1740 + }, + { + "epoch": 0.14067524115755628, + "grad_norm": 1.819989800453186, + "learning_rate": 9.33454398708636e-05, + "loss": 1.2072, + "step": 1750 + }, + { + "epoch": 0.1414790996784566, + "grad_norm": 1.5602185726165771, + "learning_rate": 9.330508474576271e-05, + "loss": 1.1104, + "step": 1760 + }, + { + "epoch": 0.1422829581993569, + "grad_norm": 1.0436007976531982, + "learning_rate": 9.326472962066183e-05, + "loss": 1.188, + "step": 1770 + }, + { + "epoch": 0.14308681672025725, + "grad_norm": 3.7814908027648926, + "learning_rate": 9.322437449556095e-05, + "loss": 1.2902, + "step": 1780 + }, + { + "epoch": 0.14389067524115756, + "grad_norm": 1.4683253765106201, + "learning_rate": 9.318401937046005e-05, + "loss": 1.25, + "step": 1790 + }, + { + "epoch": 0.14469453376205788, + "grad_norm": 1.266164779663086, + "learning_rate": 9.314366424535917e-05, + "loss": 1.309, + "step": 1800 + }, + { + "epoch": 0.1454983922829582, + "grad_norm": 8.560582160949707, + "learning_rate": 9.310330912025827e-05, + "loss": 1.373, + "step": 1810 + }, + { + "epoch": 0.14630225080385853, + "grad_norm": 1.4244352579116821, + "learning_rate": 9.306295399515739e-05, + "loss": 1.2828, + "step": 1820 + }, + { + "epoch": 0.14710610932475884, + "grad_norm": 1.052838683128357, + "learning_rate": 9.30225988700565e-05, + "loss": 1.2791, + "step": 1830 + }, + { + "epoch": 0.14790996784565916, + "grad_norm": 1.3596222400665283, + "learning_rate": 9.298224374495561e-05, + "loss": 1.0957, + "step": 1840 + }, + { + "epoch": 0.1487138263665595, + "grad_norm": 1.2217438220977783, + "learning_rate": 9.294188861985472e-05, + "loss": 1.2423, + "step": 1850 + }, + { + "epoch": 0.1495176848874598, + "grad_norm": 1.2153493165969849, + "learning_rate": 
9.290153349475384e-05, + "loss": 1.3722, + "step": 1860 + }, + { + "epoch": 0.15032154340836013, + "grad_norm": 1.7717721462249756, + "learning_rate": 9.286117836965294e-05, + "loss": 1.2418, + "step": 1870 + }, + { + "epoch": 0.15112540192926044, + "grad_norm": 1.3134626150131226, + "learning_rate": 9.282082324455206e-05, + "loss": 1.221, + "step": 1880 + }, + { + "epoch": 0.15192926045016078, + "grad_norm": 1.7526469230651855, + "learning_rate": 9.278046811945116e-05, + "loss": 1.4151, + "step": 1890 + }, + { + "epoch": 0.1527331189710611, + "grad_norm": 1.1325948238372803, + "learning_rate": 9.27401129943503e-05, + "loss": 1.2002, + "step": 1900 + }, + { + "epoch": 0.1535369774919614, + "grad_norm": 1.3506640195846558, + "learning_rate": 9.26997578692494e-05, + "loss": 1.2874, + "step": 1910 + }, + { + "epoch": 0.15434083601286175, + "grad_norm": 1.7926762104034424, + "learning_rate": 9.265940274414852e-05, + "loss": 1.2386, + "step": 1920 + }, + { + "epoch": 0.15514469453376206, + "grad_norm": 1.4644490480422974, + "learning_rate": 9.261904761904762e-05, + "loss": 1.1797, + "step": 1930 + }, + { + "epoch": 0.15594855305466238, + "grad_norm": 1.57390296459198, + "learning_rate": 9.257869249394674e-05, + "loss": 1.2558, + "step": 1940 + }, + { + "epoch": 0.1567524115755627, + "grad_norm": 1.5351284742355347, + "learning_rate": 9.253833736884585e-05, + "loss": 1.2979, + "step": 1950 + }, + { + "epoch": 0.15755627009646303, + "grad_norm": 1.4718937873840332, + "learning_rate": 9.249798224374496e-05, + "loss": 1.3425, + "step": 1960 + }, + { + "epoch": 0.15836012861736334, + "grad_norm": 0.9524021744728088, + "learning_rate": 9.245762711864407e-05, + "loss": 1.1977, + "step": 1970 + }, + { + "epoch": 0.15916398713826366, + "grad_norm": 1.8549355268478394, + "learning_rate": 9.241727199354319e-05, + "loss": 1.2554, + "step": 1980 + }, + { + "epoch": 0.159967845659164, + "grad_norm": 1.3382093906402588, + "learning_rate": 9.237691686844229e-05, + "loss": 1.2411, + "step": 1990 + }, + { + "epoch": 0.1607717041800643, + "grad_norm": 1.112624168395996, + "learning_rate": 9.233656174334141e-05, + "loss": 1.2603, + "step": 2000 + }, + { + "epoch": 0.1607717041800643, + "eval_yahma/alpaca-cleaned_loss": 1.284648060798645, + "eval_yahma/alpaca-cleaned_runtime": 115.7666, + "eval_yahma/alpaca-cleaned_samples_per_second": 17.276, + "eval_yahma/alpaca-cleaned_steps_per_second": 2.16, + "step": 2000 + }, + { + "epoch": 0.16157556270096463, + "grad_norm": 1.0333826541900635, + "learning_rate": 9.229620661824053e-05, + "loss": 1.423, + "step": 2010 + }, + { + "epoch": 0.16237942122186494, + "grad_norm": 0.9243280291557312, + "learning_rate": 9.225585149313963e-05, + "loss": 1.3014, + "step": 2020 + }, + { + "epoch": 0.16318327974276528, + "grad_norm": 1.417212963104248, + "learning_rate": 9.221549636803875e-05, + "loss": 1.3641, + "step": 2030 + }, + { + "epoch": 0.1639871382636656, + "grad_norm": 1.8156367540359497, + "learning_rate": 9.217514124293785e-05, + "loss": 1.3503, + "step": 2040 + }, + { + "epoch": 0.1647909967845659, + "grad_norm": 1.642484188079834, + "learning_rate": 9.213478611783697e-05, + "loss": 1.2603, + "step": 2050 + }, + { + "epoch": 0.16559485530546625, + "grad_norm": 1.0917778015136719, + "learning_rate": 9.209443099273608e-05, + "loss": 1.2754, + "step": 2060 + }, + { + "epoch": 0.16639871382636656, + "grad_norm": 2.1152048110961914, + "learning_rate": 9.20540758676352e-05, + "loss": 1.2619, + "step": 2070 + }, + { + "epoch": 0.16720257234726688, + "grad_norm": 1.556593418121338, 
+ "learning_rate": 9.20137207425343e-05, + "loss": 1.1498, + "step": 2080 + }, + { + "epoch": 0.1680064308681672, + "grad_norm": 1.4882763624191284, + "learning_rate": 9.197336561743342e-05, + "loss": 1.2363, + "step": 2090 + }, + { + "epoch": 0.16881028938906753, + "grad_norm": 1.1114956140518188, + "learning_rate": 9.193301049233252e-05, + "loss": 1.2216, + "step": 2100 + }, + { + "epoch": 0.16961414790996784, + "grad_norm": 1.2593668699264526, + "learning_rate": 9.189265536723164e-05, + "loss": 1.3039, + "step": 2110 + }, + { + "epoch": 0.17041800643086816, + "grad_norm": 1.329266905784607, + "learning_rate": 9.185230024213075e-05, + "loss": 1.2402, + "step": 2120 + }, + { + "epoch": 0.1712218649517685, + "grad_norm": 1.7473903894424438, + "learning_rate": 9.181194511702986e-05, + "loss": 1.2572, + "step": 2130 + }, + { + "epoch": 0.1720257234726688, + "grad_norm": 1.2216113805770874, + "learning_rate": 9.177158999192897e-05, + "loss": 1.2478, + "step": 2140 + }, + { + "epoch": 0.17282958199356913, + "grad_norm": 1.2641884088516235, + "learning_rate": 9.173123486682809e-05, + "loss": 1.2347, + "step": 2150 + }, + { + "epoch": 0.17363344051446947, + "grad_norm": 2.2486350536346436, + "learning_rate": 9.16908797417272e-05, + "loss": 1.2163, + "step": 2160 + }, + { + "epoch": 0.17443729903536978, + "grad_norm": 1.4562492370605469, + "learning_rate": 9.165052461662632e-05, + "loss": 1.1854, + "step": 2170 + }, + { + "epoch": 0.1752411575562701, + "grad_norm": 1.8461923599243164, + "learning_rate": 9.161016949152543e-05, + "loss": 1.2042, + "step": 2180 + }, + { + "epoch": 0.1760450160771704, + "grad_norm": 1.1238946914672852, + "learning_rate": 9.156981436642455e-05, + "loss": 1.3557, + "step": 2190 + }, + { + "epoch": 0.17684887459807075, + "grad_norm": 1.1480857133865356, + "learning_rate": 9.152945924132365e-05, + "loss": 1.24, + "step": 2200 + }, + { + "epoch": 0.17765273311897106, + "grad_norm": 1.1996605396270752, + "learning_rate": 9.148910411622277e-05, + "loss": 1.2744, + "step": 2210 + }, + { + "epoch": 0.17845659163987138, + "grad_norm": 1.267026662826538, + "learning_rate": 9.144874899112187e-05, + "loss": 1.2034, + "step": 2220 + }, + { + "epoch": 0.17926045016077172, + "grad_norm": 0.9613250494003296, + "learning_rate": 9.140839386602099e-05, + "loss": 1.2365, + "step": 2230 + }, + { + "epoch": 0.18006430868167203, + "grad_norm": 1.1929056644439697, + "learning_rate": 9.137207425343019e-05, + "loss": 1.1804, + "step": 2240 + }, + { + "epoch": 0.18086816720257234, + "grad_norm": 1.337989330291748, + "learning_rate": 9.133171912832931e-05, + "loss": 1.1404, + "step": 2250 + }, + { + "epoch": 0.18167202572347266, + "grad_norm": 1.1448936462402344, + "learning_rate": 9.129136400322841e-05, + "loss": 1.2866, + "step": 2260 + }, + { + "epoch": 0.182475884244373, + "grad_norm": 1.0559418201446533, + "learning_rate": 9.125100887812753e-05, + "loss": 1.2936, + "step": 2270 + }, + { + "epoch": 0.1832797427652733, + "grad_norm": 1.2513796091079712, + "learning_rate": 9.121065375302664e-05, + "loss": 1.1852, + "step": 2280 + }, + { + "epoch": 0.18408360128617363, + "grad_norm": 1.0850704908370972, + "learning_rate": 9.117029862792576e-05, + "loss": 1.3189, + "step": 2290 + }, + { + "epoch": 0.18488745980707397, + "grad_norm": 1.1448137760162354, + "learning_rate": 9.112994350282486e-05, + "loss": 1.1907, + "step": 2300 + }, + { + "epoch": 0.18569131832797428, + "grad_norm": 1.3059879541397095, + "learning_rate": 9.108958837772398e-05, + "loss": 1.2661, + "step": 2310 + }, + { + "epoch": 
0.1864951768488746, + "grad_norm": 1.2946072816848755, + "learning_rate": 9.104923325262308e-05, + "loss": 1.4031, + "step": 2320 + }, + { + "epoch": 0.1872990353697749, + "grad_norm": 1.630731463432312, + "learning_rate": 9.10088781275222e-05, + "loss": 1.2734, + "step": 2330 + }, + { + "epoch": 0.18810289389067525, + "grad_norm": 1.1476322412490845, + "learning_rate": 9.096852300242132e-05, + "loss": 1.1943, + "step": 2340 + }, + { + "epoch": 0.18890675241157556, + "grad_norm": 1.2856926918029785, + "learning_rate": 9.092816787732042e-05, + "loss": 1.3245, + "step": 2350 + }, + { + "epoch": 0.18971061093247588, + "grad_norm": 1.6066474914550781, + "learning_rate": 9.088781275221954e-05, + "loss": 1.3207, + "step": 2360 + }, + { + "epoch": 0.19051446945337622, + "grad_norm": 1.4631816148757935, + "learning_rate": 9.084745762711865e-05, + "loss": 1.2339, + "step": 2370 + }, + { + "epoch": 0.19131832797427653, + "grad_norm": 1.5194926261901855, + "learning_rate": 9.080710250201776e-05, + "loss": 1.2414, + "step": 2380 + }, + { + "epoch": 0.19212218649517684, + "grad_norm": 3.205920934677124, + "learning_rate": 9.076674737691687e-05, + "loss": 1.2412, + "step": 2390 + }, + { + "epoch": 0.19292604501607716, + "grad_norm": 1.5582493543624878, + "learning_rate": 9.072639225181599e-05, + "loss": 1.245, + "step": 2400 + }, + { + "epoch": 0.1937299035369775, + "grad_norm": 1.579376220703125, + "learning_rate": 9.068603712671509e-05, + "loss": 1.1683, + "step": 2410 + }, + { + "epoch": 0.1945337620578778, + "grad_norm": 1.1779841184616089, + "learning_rate": 9.064568200161421e-05, + "loss": 1.2371, + "step": 2420 + }, + { + "epoch": 0.19533762057877813, + "grad_norm": 1.2082445621490479, + "learning_rate": 9.060532687651331e-05, + "loss": 1.2501, + "step": 2430 + }, + { + "epoch": 0.19614147909967847, + "grad_norm": 1.2248579263687134, + "learning_rate": 9.056497175141243e-05, + "loss": 1.2724, + "step": 2440 + }, + { + "epoch": 0.19694533762057878, + "grad_norm": 1.2531076669692993, + "learning_rate": 9.052461662631154e-05, + "loss": 1.2786, + "step": 2450 + }, + { + "epoch": 0.1977491961414791, + "grad_norm": 1.2422891855239868, + "learning_rate": 9.048426150121066e-05, + "loss": 1.4194, + "step": 2460 + }, + { + "epoch": 0.1985530546623794, + "grad_norm": 1.63975191116333, + "learning_rate": 9.044390637610976e-05, + "loss": 1.2917, + "step": 2470 + }, + { + "epoch": 0.19935691318327975, + "grad_norm": 1.3208664655685425, + "learning_rate": 9.040355125100888e-05, + "loss": 1.2646, + "step": 2480 + }, + { + "epoch": 0.20016077170418006, + "grad_norm": 3.9294943809509277, + "learning_rate": 9.036319612590798e-05, + "loss": 1.3366, + "step": 2490 + }, + { + "epoch": 0.20096463022508038, + "grad_norm": 1.5396499633789062, + "learning_rate": 9.03228410008071e-05, + "loss": 1.2358, + "step": 2500 + }, + { + "epoch": 0.20176848874598072, + "grad_norm": 1.4243323802947998, + "learning_rate": 9.028248587570622e-05, + "loss": 1.2169, + "step": 2510 + }, + { + "epoch": 0.20257234726688103, + "grad_norm": 1.5669913291931152, + "learning_rate": 9.024213075060534e-05, + "loss": 1.2263, + "step": 2520 + }, + { + "epoch": 0.20337620578778134, + "grad_norm": 0.9774581789970398, + "learning_rate": 9.020177562550444e-05, + "loss": 1.239, + "step": 2530 + }, + { + "epoch": 0.20418006430868169, + "grad_norm": 1.010597586631775, + "learning_rate": 9.016142050040356e-05, + "loss": 1.2691, + "step": 2540 + }, + { + "epoch": 0.204983922829582, + "grad_norm": 1.1401910781860352, + "learning_rate": 9.012106537530266e-05, + 
"loss": 1.3039, + "step": 2550 + }, + { + "epoch": 0.2057877813504823, + "grad_norm": 1.491416573524475, + "learning_rate": 9.008071025020178e-05, + "loss": 1.3013, + "step": 2560 + }, + { + "epoch": 0.20659163987138263, + "grad_norm": 1.2823078632354736, + "learning_rate": 9.00403551251009e-05, + "loss": 1.279, + "step": 2570 + }, + { + "epoch": 0.20739549839228297, + "grad_norm": 1.1345791816711426, + "learning_rate": 9e-05, + "loss": 1.2173, + "step": 2580 + }, + { + "epoch": 0.20819935691318328, + "grad_norm": 2.610827922821045, + "learning_rate": 8.995964487489912e-05, + "loss": 1.3519, + "step": 2590 + }, + { + "epoch": 0.2090032154340836, + "grad_norm": 1.0789318084716797, + "learning_rate": 8.991928974979823e-05, + "loss": 1.3307, + "step": 2600 + }, + { + "epoch": 0.20980707395498394, + "grad_norm": 1.324279546737671, + "learning_rate": 8.987893462469735e-05, + "loss": 1.2548, + "step": 2610 + }, + { + "epoch": 0.21061093247588425, + "grad_norm": 1.199806571006775, + "learning_rate": 8.983857949959645e-05, + "loss": 1.208, + "step": 2620 + }, + { + "epoch": 0.21141479099678456, + "grad_norm": 1.218712568283081, + "learning_rate": 8.979822437449557e-05, + "loss": 1.2557, + "step": 2630 + }, + { + "epoch": 0.21221864951768488, + "grad_norm": 1.133314847946167, + "learning_rate": 8.975786924939467e-05, + "loss": 1.2158, + "step": 2640 + }, + { + "epoch": 0.21302250803858522, + "grad_norm": 2.35660719871521, + "learning_rate": 8.971751412429379e-05, + "loss": 1.148, + "step": 2650 + }, + { + "epoch": 0.21382636655948553, + "grad_norm": 2.8322184085845947, + "learning_rate": 8.96771589991929e-05, + "loss": 1.1055, + "step": 2660 + }, + { + "epoch": 0.21463022508038584, + "grad_norm": 1.280400276184082, + "learning_rate": 8.963680387409201e-05, + "loss": 1.3694, + "step": 2670 + }, + { + "epoch": 0.21543408360128619, + "grad_norm": 1.163424015045166, + "learning_rate": 8.959644874899112e-05, + "loss": 1.2949, + "step": 2680 + }, + { + "epoch": 0.2162379421221865, + "grad_norm": 1.504102349281311, + "learning_rate": 8.955609362389024e-05, + "loss": 1.3514, + "step": 2690 + }, + { + "epoch": 0.2170418006430868, + "grad_norm": 1.1156712770462036, + "learning_rate": 8.951573849878934e-05, + "loss": 1.2545, + "step": 2700 + }, + { + "epoch": 0.21784565916398713, + "grad_norm": 1.1746556758880615, + "learning_rate": 8.947538337368846e-05, + "loss": 1.2149, + "step": 2710 + }, + { + "epoch": 0.21864951768488747, + "grad_norm": 1.2077722549438477, + "learning_rate": 8.943502824858756e-05, + "loss": 1.2764, + "step": 2720 + }, + { + "epoch": 0.21945337620578778, + "grad_norm": 1.1626636981964111, + "learning_rate": 8.939467312348668e-05, + "loss": 1.3789, + "step": 2730 + }, + { + "epoch": 0.2202572347266881, + "grad_norm": 2.6447031497955322, + "learning_rate": 8.93543179983858e-05, + "loss": 1.2829, + "step": 2740 + }, + { + "epoch": 0.22106109324758844, + "grad_norm": 1.1910282373428345, + "learning_rate": 8.93139628732849e-05, + "loss": 1.2785, + "step": 2750 + }, + { + "epoch": 0.22186495176848875, + "grad_norm": 1.3396004438400269, + "learning_rate": 8.927360774818402e-05, + "loss": 1.1879, + "step": 2760 + }, + { + "epoch": 0.22266881028938906, + "grad_norm": 1.88126802444458, + "learning_rate": 8.923325262308314e-05, + "loss": 1.1849, + "step": 2770 + }, + { + "epoch": 0.22347266881028938, + "grad_norm": 1.8967148065567017, + "learning_rate": 8.919289749798225e-05, + "loss": 1.2796, + "step": 2780 + }, + { + "epoch": 0.22427652733118972, + "grad_norm": 1.174562692642212, + 
"learning_rate": 8.915254237288136e-05, + "loss": 1.2017, + "step": 2790 + }, + { + "epoch": 0.22508038585209003, + "grad_norm": 1.0899947881698608, + "learning_rate": 8.911218724778048e-05, + "loss": 1.2023, + "step": 2800 + }, + { + "epoch": 0.22588424437299034, + "grad_norm": 3.6643030643463135, + "learning_rate": 8.907183212267959e-05, + "loss": 1.1986, + "step": 2810 + }, + { + "epoch": 0.2266881028938907, + "grad_norm": 2.480713367462158, + "learning_rate": 8.90314769975787e-05, + "loss": 1.3729, + "step": 2820 + }, + { + "epoch": 0.227491961414791, + "grad_norm": 1.0885637998580933, + "learning_rate": 8.899112187247781e-05, + "loss": 1.1907, + "step": 2830 + }, + { + "epoch": 0.2282958199356913, + "grad_norm": 1.0519663095474243, + "learning_rate": 8.895076674737693e-05, + "loss": 1.1776, + "step": 2840 + }, + { + "epoch": 0.22909967845659163, + "grad_norm": 2.0941977500915527, + "learning_rate": 8.891041162227603e-05, + "loss": 1.2108, + "step": 2850 + }, + { + "epoch": 0.22990353697749197, + "grad_norm": 1.2095335721969604, + "learning_rate": 8.887005649717515e-05, + "loss": 1.1837, + "step": 2860 + }, + { + "epoch": 0.23070739549839228, + "grad_norm": 1.6254414319992065, + "learning_rate": 8.882970137207426e-05, + "loss": 1.302, + "step": 2870 + }, + { + "epoch": 0.2315112540192926, + "grad_norm": 1.7986174821853638, + "learning_rate": 8.878934624697337e-05, + "loss": 1.3875, + "step": 2880 + }, + { + "epoch": 0.23231511254019294, + "grad_norm": 1.1640440225601196, + "learning_rate": 8.874899112187248e-05, + "loss": 1.2783, + "step": 2890 + }, + { + "epoch": 0.23311897106109325, + "grad_norm": 1.3544087409973145, + "learning_rate": 8.87086359967716e-05, + "loss": 1.1829, + "step": 2900 + }, + { + "epoch": 0.23392282958199356, + "grad_norm": 1.7265479564666748, + "learning_rate": 8.86682808716707e-05, + "loss": 1.3421, + "step": 2910 + }, + { + "epoch": 0.2347266881028939, + "grad_norm": 1.130656123161316, + "learning_rate": 8.862792574656982e-05, + "loss": 1.35, + "step": 2920 + }, + { + "epoch": 0.23553054662379422, + "grad_norm": 1.722943663597107, + "learning_rate": 8.858757062146892e-05, + "loss": 1.2528, + "step": 2930 + }, + { + "epoch": 0.23633440514469453, + "grad_norm": 0.9455968737602234, + "learning_rate": 8.854721549636804e-05, + "loss": 1.2752, + "step": 2940 + }, + { + "epoch": 0.23713826366559485, + "grad_norm": 1.1642229557037354, + "learning_rate": 8.850686037126715e-05, + "loss": 1.2127, + "step": 2950 + }, + { + "epoch": 0.2379421221864952, + "grad_norm": 1.1500279903411865, + "learning_rate": 8.846650524616626e-05, + "loss": 1.1981, + "step": 2960 + }, + { + "epoch": 0.2387459807073955, + "grad_norm": 1.352089762687683, + "learning_rate": 8.842615012106538e-05, + "loss": 1.3122, + "step": 2970 + }, + { + "epoch": 0.2395498392282958, + "grad_norm": 1.4196809530258179, + "learning_rate": 8.838579499596449e-05, + "loss": 1.3015, + "step": 2980 + }, + { + "epoch": 0.24035369774919615, + "grad_norm": 1.7181886434555054, + "learning_rate": 8.83454398708636e-05, + "loss": 1.3678, + "step": 2990 + }, + { + "epoch": 0.24115755627009647, + "grad_norm": 2.6380298137664795, + "learning_rate": 8.830508474576271e-05, + "loss": 1.2444, + "step": 3000 + }, + { + "epoch": 0.24196141479099678, + "grad_norm": 1.2690985202789307, + "learning_rate": 8.826472962066183e-05, + "loss": 1.2668, + "step": 3010 + }, + { + "epoch": 0.2427652733118971, + "grad_norm": 1.4184331893920898, + "learning_rate": 8.822437449556093e-05, + "loss": 1.1698, + "step": 3020 + }, + { + "epoch": 
0.24356913183279744, + "grad_norm": 4.190485954284668, + "learning_rate": 8.818401937046006e-05, + "loss": 1.2451, + "step": 3030 + }, + { + "epoch": 0.24437299035369775, + "grad_norm": 1.2775496244430542, + "learning_rate": 8.814366424535917e-05, + "loss": 1.1782, + "step": 3040 + }, + { + "epoch": 0.24517684887459806, + "grad_norm": 1.435091257095337, + "learning_rate": 8.810330912025829e-05, + "loss": 1.3203, + "step": 3050 + }, + { + "epoch": 0.2459807073954984, + "grad_norm": 1.2115564346313477, + "learning_rate": 8.806295399515739e-05, + "loss": 1.1335, + "step": 3060 + }, + { + "epoch": 0.24678456591639872, + "grad_norm": 1.3692493438720703, + "learning_rate": 8.802259887005651e-05, + "loss": 1.2182, + "step": 3070 + }, + { + "epoch": 0.24758842443729903, + "grad_norm": 1.2258083820343018, + "learning_rate": 8.798224374495561e-05, + "loss": 1.1362, + "step": 3080 + }, + { + "epoch": 0.24839228295819935, + "grad_norm": 1.7599858045578003, + "learning_rate": 8.794188861985473e-05, + "loss": 1.3197, + "step": 3090 + }, + { + "epoch": 0.2491961414790997, + "grad_norm": 1.6278449296951294, + "learning_rate": 8.790153349475384e-05, + "loss": 1.2382, + "step": 3100 + }, + { + "epoch": 0.25, + "grad_norm": 2.995894432067871, + "learning_rate": 8.786117836965296e-05, + "loss": 1.1695, + "step": 3110 + }, + { + "epoch": 0.2508038585209003, + "grad_norm": 1.095329999923706, + "learning_rate": 8.782082324455206e-05, + "loss": 1.2429, + "step": 3120 + }, + { + "epoch": 0.2516077170418006, + "grad_norm": 1.7307472229003906, + "learning_rate": 8.778046811945118e-05, + "loss": 1.2102, + "step": 3130 + }, + { + "epoch": 0.25241157556270094, + "grad_norm": 1.4965832233428955, + "learning_rate": 8.774011299435028e-05, + "loss": 1.2401, + "step": 3140 + }, + { + "epoch": 0.2532154340836013, + "grad_norm": 1.0890928506851196, + "learning_rate": 8.76997578692494e-05, + "loss": 1.2583, + "step": 3150 + }, + { + "epoch": 0.2540192926045016, + "grad_norm": 1.1239075660705566, + "learning_rate": 8.76594027441485e-05, + "loss": 1.2871, + "step": 3160 + }, + { + "epoch": 0.25482315112540194, + "grad_norm": 1.455148458480835, + "learning_rate": 8.761904761904762e-05, + "loss": 1.2707, + "step": 3170 + }, + { + "epoch": 0.25562700964630225, + "grad_norm": 1.3527302742004395, + "learning_rate": 8.757869249394673e-05, + "loss": 1.2797, + "step": 3180 + }, + { + "epoch": 0.25643086816720256, + "grad_norm": 1.3919832706451416, + "learning_rate": 8.753833736884585e-05, + "loss": 1.1467, + "step": 3190 + }, + { + "epoch": 0.2572347266881029, + "grad_norm": 1.2856230735778809, + "learning_rate": 8.749798224374496e-05, + "loss": 1.2301, + "step": 3200 + }, + { + "epoch": 0.2580385852090032, + "grad_norm": 1.550368070602417, + "learning_rate": 8.745762711864407e-05, + "loss": 1.1609, + "step": 3210 + }, + { + "epoch": 0.25884244372990356, + "grad_norm": 0.9304008483886719, + "learning_rate": 8.741727199354319e-05, + "loss": 1.2375, + "step": 3220 + }, + { + "epoch": 0.2596463022508039, + "grad_norm": 1.2073613405227661, + "learning_rate": 8.737691686844229e-05, + "loss": 1.2715, + "step": 3230 + }, + { + "epoch": 0.2604501607717042, + "grad_norm": 1.894003987312317, + "learning_rate": 8.733656174334141e-05, + "loss": 1.205, + "step": 3240 + }, + { + "epoch": 0.2612540192926045, + "grad_norm": 1.4571211338043213, + "learning_rate": 8.729620661824051e-05, + "loss": 1.2137, + "step": 3250 + }, + { + "epoch": 0.2620578778135048, + "grad_norm": 1.2148561477661133, + "learning_rate": 8.725585149313963e-05, + "loss": 1.3079, + 
"step": 3260 + }, + { + "epoch": 0.2628617363344051, + "grad_norm": 1.1078846454620361, + "learning_rate": 8.721549636803874e-05, + "loss": 1.1524, + "step": 3270 + }, + { + "epoch": 0.26366559485530544, + "grad_norm": 1.6308702230453491, + "learning_rate": 8.717514124293786e-05, + "loss": 1.1884, + "step": 3280 + }, + { + "epoch": 0.2644694533762058, + "grad_norm": 1.2123280763626099, + "learning_rate": 8.713478611783696e-05, + "loss": 1.2499, + "step": 3290 + }, + { + "epoch": 0.2652733118971061, + "grad_norm": 1.2395896911621094, + "learning_rate": 8.709443099273609e-05, + "loss": 1.1591, + "step": 3300 + }, + { + "epoch": 0.26607717041800644, + "grad_norm": 1.0542362928390503, + "learning_rate": 8.70540758676352e-05, + "loss": 1.203, + "step": 3310 + }, + { + "epoch": 0.26688102893890675, + "grad_norm": 1.354406476020813, + "learning_rate": 8.701372074253431e-05, + "loss": 1.2953, + "step": 3320 + }, + { + "epoch": 0.26768488745980706, + "grad_norm": 1.0819886922836304, + "learning_rate": 8.697336561743342e-05, + "loss": 1.2686, + "step": 3330 + }, + { + "epoch": 0.2684887459807074, + "grad_norm": 1.5589094161987305, + "learning_rate": 8.693301049233254e-05, + "loss": 1.2366, + "step": 3340 + }, + { + "epoch": 0.2692926045016077, + "grad_norm": 2.0046586990356445, + "learning_rate": 8.689265536723164e-05, + "loss": 1.2515, + "step": 3350 + }, + { + "epoch": 0.27009646302250806, + "grad_norm": 2.3307278156280518, + "learning_rate": 8.685230024213076e-05, + "loss": 1.2391, + "step": 3360 + }, + { + "epoch": 0.2709003215434084, + "grad_norm": 2.4583561420440674, + "learning_rate": 8.681194511702986e-05, + "loss": 1.1907, + "step": 3370 + }, + { + "epoch": 0.2717041800643087, + "grad_norm": 1.040289044380188, + "learning_rate": 8.677158999192898e-05, + "loss": 1.1758, + "step": 3380 + }, + { + "epoch": 0.272508038585209, + "grad_norm": 1.7027485370635986, + "learning_rate": 8.673123486682809e-05, + "loss": 1.3229, + "step": 3390 + }, + { + "epoch": 0.2733118971061093, + "grad_norm": 1.3173311948776245, + "learning_rate": 8.66908797417272e-05, + "loss": 1.3225, + "step": 3400 + }, + { + "epoch": 0.2741157556270096, + "grad_norm": 1.4691309928894043, + "learning_rate": 8.665052461662631e-05, + "loss": 1.0976, + "step": 3410 + }, + { + "epoch": 0.27491961414790994, + "grad_norm": 1.385856032371521, + "learning_rate": 8.661016949152543e-05, + "loss": 1.2308, + "step": 3420 + }, + { + "epoch": 0.2757234726688103, + "grad_norm": 1.038273811340332, + "learning_rate": 8.656981436642455e-05, + "loss": 1.1296, + "step": 3430 + }, + { + "epoch": 0.2765273311897106, + "grad_norm": 1.1960220336914062, + "learning_rate": 8.652945924132365e-05, + "loss": 1.2165, + "step": 3440 + }, + { + "epoch": 0.27733118971061094, + "grad_norm": 1.353964924812317, + "learning_rate": 8.648910411622277e-05, + "loss": 1.2996, + "step": 3450 + }, + { + "epoch": 0.27813504823151125, + "grad_norm": 1.173205018043518, + "learning_rate": 8.644874899112187e-05, + "loss": 1.295, + "step": 3460 + }, + { + "epoch": 0.27893890675241156, + "grad_norm": 2.594675302505493, + "learning_rate": 8.640839386602099e-05, + "loss": 1.1145, + "step": 3470 + }, + { + "epoch": 0.2797427652733119, + "grad_norm": 1.1430559158325195, + "learning_rate": 8.63680387409201e-05, + "loss": 1.1369, + "step": 3480 + }, + { + "epoch": 0.2805466237942122, + "grad_norm": 1.6096569299697876, + "learning_rate": 8.632768361581921e-05, + "loss": 1.1668, + "step": 3490 + }, + { + "epoch": 0.28135048231511256, + "grad_norm": 2.040187120437622, + "learning_rate": 
8.628732849071832e-05, + "loss": 1.2364, + "step": 3500 + }, + { + "epoch": 0.2821543408360129, + "grad_norm": 1.2003670930862427, + "learning_rate": 8.624697336561744e-05, + "loss": 1.2538, + "step": 3510 + }, + { + "epoch": 0.2829581993569132, + "grad_norm": 1.2703702449798584, + "learning_rate": 8.620661824051654e-05, + "loss": 1.3134, + "step": 3520 + }, + { + "epoch": 0.2837620578778135, + "grad_norm": 1.3621017932891846, + "learning_rate": 8.616626311541566e-05, + "loss": 1.2114, + "step": 3530 + }, + { + "epoch": 0.2845659163987138, + "grad_norm": 2.1689460277557373, + "learning_rate": 8.612590799031476e-05, + "loss": 1.393, + "step": 3540 + }, + { + "epoch": 0.2853697749196141, + "grad_norm": 1.1933314800262451, + "learning_rate": 8.608555286521388e-05, + "loss": 1.1932, + "step": 3550 + }, + { + "epoch": 0.2861736334405145, + "grad_norm": 1.2700861692428589, + "learning_rate": 8.6045197740113e-05, + "loss": 1.4143, + "step": 3560 + }, + { + "epoch": 0.2869774919614148, + "grad_norm": 1.3436776399612427, + "learning_rate": 8.600484261501212e-05, + "loss": 1.2522, + "step": 3570 + }, + { + "epoch": 0.2877813504823151, + "grad_norm": 1.207179307937622, + "learning_rate": 8.596448748991122e-05, + "loss": 1.2819, + "step": 3580 + }, + { + "epoch": 0.28858520900321544, + "grad_norm": 1.681864619255066, + "learning_rate": 8.592413236481034e-05, + "loss": 1.1955, + "step": 3590 + }, + { + "epoch": 0.28938906752411575, + "grad_norm": 1.4330998659133911, + "learning_rate": 8.588377723970945e-05, + "loss": 1.2732, + "step": 3600 + }, + { + "epoch": 0.29019292604501606, + "grad_norm": 1.481811285018921, + "learning_rate": 8.584342211460856e-05, + "loss": 1.3336, + "step": 3610 + }, + { + "epoch": 0.2909967845659164, + "grad_norm": 1.147080898284912, + "learning_rate": 8.580306698950767e-05, + "loss": 1.2514, + "step": 3620 + }, + { + "epoch": 0.29180064308681675, + "grad_norm": 1.0554629564285278, + "learning_rate": 8.576271186440679e-05, + "loss": 1.2695, + "step": 3630 + }, + { + "epoch": 0.29260450160771706, + "grad_norm": 1.670669674873352, + "learning_rate": 8.572235673930589e-05, + "loss": 1.3654, + "step": 3640 + }, + { + "epoch": 0.2934083601286174, + "grad_norm": 1.398838996887207, + "learning_rate": 8.568200161420501e-05, + "loss": 1.2102, + "step": 3650 + }, + { + "epoch": 0.2942122186495177, + "grad_norm": 1.1837551593780518, + "learning_rate": 8.564164648910413e-05, + "loss": 1.3595, + "step": 3660 + }, + { + "epoch": 0.295016077170418, + "grad_norm": 1.2118239402770996, + "learning_rate": 8.560129136400323e-05, + "loss": 1.2297, + "step": 3670 + }, + { + "epoch": 0.2958199356913183, + "grad_norm": 1.3268067836761475, + "learning_rate": 8.556093623890235e-05, + "loss": 1.3314, + "step": 3680 + }, + { + "epoch": 0.2966237942122186, + "grad_norm": 1.6145943403244019, + "learning_rate": 8.552058111380146e-05, + "loss": 1.183, + "step": 3690 + }, + { + "epoch": 0.297427652733119, + "grad_norm": 1.7863456010818481, + "learning_rate": 8.548022598870057e-05, + "loss": 1.2523, + "step": 3700 + }, + { + "epoch": 0.2982315112540193, + "grad_norm": 1.243323802947998, + "learning_rate": 8.543987086359968e-05, + "loss": 1.2971, + "step": 3710 + }, + { + "epoch": 0.2990353697749196, + "grad_norm": 1.4287739992141724, + "learning_rate": 8.53995157384988e-05, + "loss": 1.1036, + "step": 3720 + }, + { + "epoch": 0.29983922829581994, + "grad_norm": 1.366148591041565, + "learning_rate": 8.53591606133979e-05, + "loss": 1.2176, + "step": 3730 + }, + { + "epoch": 0.30064308681672025, + "grad_norm": 
1.3919678926467896, + "learning_rate": 8.531880548829702e-05, + "loss": 1.1227, + "step": 3740 + }, + { + "epoch": 0.30144694533762056, + "grad_norm": 1.0967152118682861, + "learning_rate": 8.527845036319612e-05, + "loss": 1.2714, + "step": 3750 + }, + { + "epoch": 0.3022508038585209, + "grad_norm": 1.183366060256958, + "learning_rate": 8.523809523809524e-05, + "loss": 1.2636, + "step": 3760 + }, + { + "epoch": 0.30305466237942125, + "grad_norm": 1.0084779262542725, + "learning_rate": 8.519774011299435e-05, + "loss": 1.4423, + "step": 3770 + }, + { + "epoch": 0.30385852090032156, + "grad_norm": 1.0512844324111938, + "learning_rate": 8.515738498789346e-05, + "loss": 1.2682, + "step": 3780 + }, + { + "epoch": 0.3046623794212219, + "grad_norm": 1.1441655158996582, + "learning_rate": 8.511702986279257e-05, + "loss": 1.2442, + "step": 3790 + }, + { + "epoch": 0.3054662379421222, + "grad_norm": 1.0703250169754028, + "learning_rate": 8.507667473769169e-05, + "loss": 1.263, + "step": 3800 + }, + { + "epoch": 0.3062700964630225, + "grad_norm": 1.2896406650543213, + "learning_rate": 8.503631961259079e-05, + "loss": 1.1945, + "step": 3810 + }, + { + "epoch": 0.3070739549839228, + "grad_norm": 1.516065239906311, + "learning_rate": 8.499596448748992e-05, + "loss": 1.1396, + "step": 3820 + }, + { + "epoch": 0.30787781350482313, + "grad_norm": 1.256030559539795, + "learning_rate": 8.495560936238903e-05, + "loss": 1.2744, + "step": 3830 + }, + { + "epoch": 0.3086816720257235, + "grad_norm": 1.2016384601593018, + "learning_rate": 8.491525423728815e-05, + "loss": 1.1509, + "step": 3840 + }, + { + "epoch": 0.3094855305466238, + "grad_norm": 1.8608025312423706, + "learning_rate": 8.487489911218725e-05, + "loss": 1.1685, + "step": 3850 + }, + { + "epoch": 0.3102893890675241, + "grad_norm": 1.5279998779296875, + "learning_rate": 8.483454398708637e-05, + "loss": 1.2004, + "step": 3860 + }, + { + "epoch": 0.31109324758842444, + "grad_norm": 1.5688719749450684, + "learning_rate": 8.479418886198547e-05, + "loss": 1.2176, + "step": 3870 + }, + { + "epoch": 0.31189710610932475, + "grad_norm": 1.5599457025527954, + "learning_rate": 8.475383373688459e-05, + "loss": 1.2146, + "step": 3880 + }, + { + "epoch": 0.31270096463022506, + "grad_norm": 1.1148357391357422, + "learning_rate": 8.471347861178371e-05, + "loss": 1.3145, + "step": 3890 + }, + { + "epoch": 0.3135048231511254, + "grad_norm": 1.0745151042938232, + "learning_rate": 8.467312348668281e-05, + "loss": 1.0896, + "step": 3900 + }, + { + "epoch": 0.31430868167202575, + "grad_norm": 1.194466233253479, + "learning_rate": 8.463276836158193e-05, + "loss": 1.1872, + "step": 3910 + }, + { + "epoch": 0.31511254019292606, + "grad_norm": 2.6574392318725586, + "learning_rate": 8.459241323648104e-05, + "loss": 1.3583, + "step": 3920 + }, + { + "epoch": 0.3159163987138264, + "grad_norm": 1.4396333694458008, + "learning_rate": 8.455205811138016e-05, + "loss": 1.4009, + "step": 3930 + }, + { + "epoch": 0.3167202572347267, + "grad_norm": 1.4764877557754517, + "learning_rate": 8.451170298627926e-05, + "loss": 1.312, + "step": 3940 + }, + { + "epoch": 0.317524115755627, + "grad_norm": 1.3817880153656006, + "learning_rate": 8.447134786117838e-05, + "loss": 1.2372, + "step": 3950 + }, + { + "epoch": 0.3183279742765273, + "grad_norm": 2.030385971069336, + "learning_rate": 8.443099273607748e-05, + "loss": 1.2477, + "step": 3960 + }, + { + "epoch": 0.31913183279742763, + "grad_norm": 2.089034080505371, + "learning_rate": 8.43906376109766e-05, + "loss": 1.1984, + "step": 3970 + }, + { 
+ "epoch": 0.319935691318328, + "grad_norm": 1.7432914972305298, + "learning_rate": 8.43502824858757e-05, + "loss": 1.1436, + "step": 3980 + }, + { + "epoch": 0.3207395498392283, + "grad_norm": 1.3305381536483765, + "learning_rate": 8.430992736077482e-05, + "loss": 1.2557, + "step": 3990 + }, + { + "epoch": 0.3215434083601286, + "grad_norm": 1.6194047927856445, + "learning_rate": 8.426957223567393e-05, + "loss": 1.3163, + "step": 4000 + }, + { + "epoch": 0.3215434083601286, + "eval_yahma/alpaca-cleaned_loss": 1.2568495273590088, + "eval_yahma/alpaca-cleaned_runtime": 115.5517, + "eval_yahma/alpaca-cleaned_samples_per_second": 17.308, + "eval_yahma/alpaca-cleaned_steps_per_second": 2.164, + "step": 4000 + }, + { + "epoch": 0.32234726688102894, + "grad_norm": 0.9996694922447205, + "learning_rate": 8.422921711057305e-05, + "loss": 1.1647, + "step": 4010 + }, + { + "epoch": 0.32315112540192925, + "grad_norm": 1.4061142206192017, + "learning_rate": 8.418886198547215e-05, + "loss": 1.2166, + "step": 4020 + }, + { + "epoch": 0.32395498392282956, + "grad_norm": 1.2130415439605713, + "learning_rate": 8.414850686037127e-05, + "loss": 1.1794, + "step": 4030 + }, + { + "epoch": 0.3247588424437299, + "grad_norm": 1.2440122365951538, + "learning_rate": 8.410815173527037e-05, + "loss": 1.2449, + "step": 4040 + }, + { + "epoch": 0.32556270096463025, + "grad_norm": 1.5796377658843994, + "learning_rate": 8.406779661016949e-05, + "loss": 1.2174, + "step": 4050 + }, + { + "epoch": 0.32636655948553056, + "grad_norm": 1.8361690044403076, + "learning_rate": 8.402744148506861e-05, + "loss": 1.2374, + "step": 4060 + }, + { + "epoch": 0.3271704180064309, + "grad_norm": 1.2278058528900146, + "learning_rate": 8.398708635996771e-05, + "loss": 1.1239, + "step": 4070 + }, + { + "epoch": 0.3279742765273312, + "grad_norm": 1.4516998529434204, + "learning_rate": 8.394673123486683e-05, + "loss": 1.1387, + "step": 4080 + }, + { + "epoch": 0.3287781350482315, + "grad_norm": 1.1614266633987427, + "learning_rate": 8.390637610976595e-05, + "loss": 1.2378, + "step": 4090 + }, + { + "epoch": 0.3295819935691318, + "grad_norm": 1.7242281436920166, + "learning_rate": 8.386602098466506e-05, + "loss": 1.2562, + "step": 4100 + }, + { + "epoch": 0.33038585209003213, + "grad_norm": 1.3742716312408447, + "learning_rate": 8.382566585956417e-05, + "loss": 1.1921, + "step": 4110 + }, + { + "epoch": 0.3311897106109325, + "grad_norm": 1.2327730655670166, + "learning_rate": 8.378531073446329e-05, + "loss": 1.1925, + "step": 4120 + }, + { + "epoch": 0.3319935691318328, + "grad_norm": 1.3033393621444702, + "learning_rate": 8.37449556093624e-05, + "loss": 1.2141, + "step": 4130 + }, + { + "epoch": 0.3327974276527331, + "grad_norm": 1.1770600080490112, + "learning_rate": 8.370460048426151e-05, + "loss": 1.2859, + "step": 4140 + }, + { + "epoch": 0.33360128617363344, + "grad_norm": 1.2515443563461304, + "learning_rate": 8.366424535916062e-05, + "loss": 1.1253, + "step": 4150 + }, + { + "epoch": 0.33440514469453375, + "grad_norm": 1.2094742059707642, + "learning_rate": 8.362389023405974e-05, + "loss": 1.2391, + "step": 4160 + }, + { + "epoch": 0.33520900321543406, + "grad_norm": 1.5006566047668457, + "learning_rate": 8.358353510895884e-05, + "loss": 1.2237, + "step": 4170 + }, + { + "epoch": 0.3360128617363344, + "grad_norm": 1.1610201597213745, + "learning_rate": 8.354317998385796e-05, + "loss": 1.2401, + "step": 4180 + }, + { + "epoch": 0.33681672025723475, + "grad_norm": 1.9824213981628418, + "learning_rate": 8.350282485875706e-05, + "loss": 
1.3003, + "step": 4190 + }, + { + "epoch": 0.33762057877813506, + "grad_norm": 1.5451104640960693, + "learning_rate": 8.346246973365618e-05, + "loss": 1.2139, + "step": 4200 + }, + { + "epoch": 0.3384244372990354, + "grad_norm": 1.7514973878860474, + "learning_rate": 8.342211460855529e-05, + "loss": 1.1285, + "step": 4210 + }, + { + "epoch": 0.3392282958199357, + "grad_norm": 1.2136577367782593, + "learning_rate": 8.33817594834544e-05, + "loss": 1.2802, + "step": 4220 + }, + { + "epoch": 0.340032154340836, + "grad_norm": 1.0182433128356934, + "learning_rate": 8.334140435835351e-05, + "loss": 1.2155, + "step": 4230 + }, + { + "epoch": 0.3408360128617363, + "grad_norm": 0.9786971807479858, + "learning_rate": 8.330104923325263e-05, + "loss": 1.0544, + "step": 4240 + }, + { + "epoch": 0.34163987138263663, + "grad_norm": 1.7164803743362427, + "learning_rate": 8.326069410815173e-05, + "loss": 1.1538, + "step": 4250 + }, + { + "epoch": 0.342443729903537, + "grad_norm": 1.1870179176330566, + "learning_rate": 8.322033898305085e-05, + "loss": 1.2086, + "step": 4260 + }, + { + "epoch": 0.3432475884244373, + "grad_norm": 1.4695736169815063, + "learning_rate": 8.317998385794996e-05, + "loss": 0.9652, + "step": 4270 + }, + { + "epoch": 0.3440514469453376, + "grad_norm": 1.3140954971313477, + "learning_rate": 8.313962873284907e-05, + "loss": 1.1532, + "step": 4280 + }, + { + "epoch": 0.34485530546623794, + "grad_norm": 1.1161473989486694, + "learning_rate": 8.309927360774819e-05, + "loss": 1.1443, + "step": 4290 + }, + { + "epoch": 0.34565916398713825, + "grad_norm": 1.8414597511291504, + "learning_rate": 8.30589184826473e-05, + "loss": 1.2339, + "step": 4300 + }, + { + "epoch": 0.34646302250803857, + "grad_norm": 1.5777239799499512, + "learning_rate": 8.301856335754641e-05, + "loss": 1.2191, + "step": 4310 + }, + { + "epoch": 0.34726688102893893, + "grad_norm": 1.560793161392212, + "learning_rate": 8.297820823244552e-05, + "loss": 1.2505, + "step": 4320 + }, + { + "epoch": 0.34807073954983925, + "grad_norm": 1.2204498052597046, + "learning_rate": 8.293785310734464e-05, + "loss": 1.1672, + "step": 4330 + }, + { + "epoch": 0.34887459807073956, + "grad_norm": 1.1328129768371582, + "learning_rate": 8.289749798224374e-05, + "loss": 1.3139, + "step": 4340 + }, + { + "epoch": 0.3496784565916399, + "grad_norm": 2.0714643001556396, + "learning_rate": 8.285714285714287e-05, + "loss": 1.2314, + "step": 4350 + }, + { + "epoch": 0.3504823151125402, + "grad_norm": 1.0736972093582153, + "learning_rate": 8.281678773204198e-05, + "loss": 1.3055, + "step": 4360 + }, + { + "epoch": 0.3512861736334405, + "grad_norm": 1.7276315689086914, + "learning_rate": 8.27764326069411e-05, + "loss": 1.157, + "step": 4370 + }, + { + "epoch": 0.3520900321543408, + "grad_norm": 1.200021505355835, + "learning_rate": 8.27360774818402e-05, + "loss": 1.1639, + "step": 4380 + }, + { + "epoch": 0.3528938906752412, + "grad_norm": 1.3981629610061646, + "learning_rate": 8.269572235673932e-05, + "loss": 1.2204, + "step": 4390 + }, + { + "epoch": 0.3536977491961415, + "grad_norm": 1.264017105102539, + "learning_rate": 8.265536723163842e-05, + "loss": 1.3322, + "step": 4400 + }, + { + "epoch": 0.3545016077170418, + "grad_norm": 1.5476160049438477, + "learning_rate": 8.261501210653754e-05, + "loss": 1.1365, + "step": 4410 + }, + { + "epoch": 0.3553054662379421, + "grad_norm": 2.6459197998046875, + "learning_rate": 8.257465698143665e-05, + "loss": 1.2962, + "step": 4420 + }, + { + "epoch": 0.35610932475884244, + "grad_norm": 1.1030009984970093, + 
"learning_rate": 8.253430185633576e-05, + "loss": 1.1737, + "step": 4430 + }, + { + "epoch": 0.35691318327974275, + "grad_norm": 1.3585678339004517, + "learning_rate": 8.249394673123487e-05, + "loss": 1.1757, + "step": 4440 + }, + { + "epoch": 0.35771704180064307, + "grad_norm": 1.4126050472259521, + "learning_rate": 8.245359160613399e-05, + "loss": 1.2081, + "step": 4450 + }, + { + "epoch": 0.35852090032154343, + "grad_norm": 1.3890093564987183, + "learning_rate": 8.241323648103309e-05, + "loss": 1.1337, + "step": 4460 + }, + { + "epoch": 0.35932475884244375, + "grad_norm": 1.015844702720642, + "learning_rate": 8.237288135593221e-05, + "loss": 1.1043, + "step": 4470 + }, + { + "epoch": 0.36012861736334406, + "grad_norm": 1.4223793745040894, + "learning_rate": 8.233252623083131e-05, + "loss": 1.1537, + "step": 4480 + }, + { + "epoch": 0.3609324758842444, + "grad_norm": 1.3807647228240967, + "learning_rate": 8.229217110573043e-05, + "loss": 1.1781, + "step": 4490 + }, + { + "epoch": 0.3617363344051447, + "grad_norm": 1.4933511018753052, + "learning_rate": 8.225181598062954e-05, + "loss": 1.1658, + "step": 4500 + }, + { + "epoch": 0.362540192926045, + "grad_norm": 1.1804993152618408, + "learning_rate": 8.221146085552865e-05, + "loss": 1.2211, + "step": 4510 + }, + { + "epoch": 0.3633440514469453, + "grad_norm": 1.9485111236572266, + "learning_rate": 8.217110573042777e-05, + "loss": 1.216, + "step": 4520 + }, + { + "epoch": 0.3641479099678457, + "grad_norm": 1.215474009513855, + "learning_rate": 8.213075060532688e-05, + "loss": 1.2775, + "step": 4530 + }, + { + "epoch": 0.364951768488746, + "grad_norm": 1.2956700325012207, + "learning_rate": 8.2090395480226e-05, + "loss": 1.2052, + "step": 4540 + }, + { + "epoch": 0.3657556270096463, + "grad_norm": 1.3250538110733032, + "learning_rate": 8.20500403551251e-05, + "loss": 1.345, + "step": 4550 + }, + { + "epoch": 0.3665594855305466, + "grad_norm": 1.5776617527008057, + "learning_rate": 8.200968523002422e-05, + "loss": 1.2108, + "step": 4560 + }, + { + "epoch": 0.36736334405144694, + "grad_norm": 1.3848649263381958, + "learning_rate": 8.196933010492332e-05, + "loss": 1.1932, + "step": 4570 + }, + { + "epoch": 0.36816720257234725, + "grad_norm": 1.2446579933166504, + "learning_rate": 8.192897497982244e-05, + "loss": 1.299, + "step": 4580 + }, + { + "epoch": 0.36897106109324757, + "grad_norm": 1.8784652948379517, + "learning_rate": 8.188861985472155e-05, + "loss": 1.2617, + "step": 4590 + }, + { + "epoch": 0.36977491961414793, + "grad_norm": 1.3336700201034546, + "learning_rate": 8.184826472962066e-05, + "loss": 1.2814, + "step": 4600 + }, + { + "epoch": 0.37057877813504825, + "grad_norm": 1.560197353363037, + "learning_rate": 8.180790960451978e-05, + "loss": 1.2565, + "step": 4610 + }, + { + "epoch": 0.37138263665594856, + "grad_norm": 1.5231118202209473, + "learning_rate": 8.17675544794189e-05, + "loss": 1.3068, + "step": 4620 + }, + { + "epoch": 0.3721864951768489, + "grad_norm": 1.2317482233047485, + "learning_rate": 8.1727199354318e-05, + "loss": 1.2264, + "step": 4630 + }, + { + "epoch": 0.3729903536977492, + "grad_norm": 1.2745100259780884, + "learning_rate": 8.168684422921712e-05, + "loss": 1.4129, + "step": 4640 + }, + { + "epoch": 0.3737942122186495, + "grad_norm": 1.971482753753662, + "learning_rate": 8.164648910411623e-05, + "loss": 1.1744, + "step": 4650 + }, + { + "epoch": 0.3745980707395498, + "grad_norm": 1.0844069719314575, + "learning_rate": 8.160613397901535e-05, + "loss": 1.2031, + "step": 4660 + }, + { + "epoch": 
0.3754019292604502, + "grad_norm": 1.330795168876648, + "learning_rate": 8.156577885391445e-05, + "loss": 1.0976, + "step": 4670 + }, + { + "epoch": 0.3762057877813505, + "grad_norm": 1.6659519672393799, + "learning_rate": 8.152542372881357e-05, + "loss": 1.2434, + "step": 4680 + }, + { + "epoch": 0.3770096463022508, + "grad_norm": 1.3720048666000366, + "learning_rate": 8.148506860371267e-05, + "loss": 1.1822, + "step": 4690 + }, + { + "epoch": 0.3778135048231511, + "grad_norm": 1.275566577911377, + "learning_rate": 8.144471347861179e-05, + "loss": 1.295, + "step": 4700 + }, + { + "epoch": 0.37861736334405144, + "grad_norm": 1.3874690532684326, + "learning_rate": 8.14043583535109e-05, + "loss": 1.1628, + "step": 4710 + }, + { + "epoch": 0.37942122186495175, + "grad_norm": 1.2212285995483398, + "learning_rate": 8.136400322841001e-05, + "loss": 1.2601, + "step": 4720 + }, + { + "epoch": 0.38022508038585207, + "grad_norm": 3.2551138401031494, + "learning_rate": 8.132364810330912e-05, + "loss": 1.2103, + "step": 4730 + }, + { + "epoch": 0.38102893890675243, + "grad_norm": 1.1511696577072144, + "learning_rate": 8.128329297820824e-05, + "loss": 1.1613, + "step": 4740 + }, + { + "epoch": 0.38183279742765275, + "grad_norm": 1.658243179321289, + "learning_rate": 8.124293785310735e-05, + "loss": 1.314, + "step": 4750 + }, + { + "epoch": 0.38263665594855306, + "grad_norm": 1.3986806869506836, + "learning_rate": 8.120258272800646e-05, + "loss": 1.1491, + "step": 4760 + }, + { + "epoch": 0.3834405144694534, + "grad_norm": 1.2337511777877808, + "learning_rate": 8.116222760290558e-05, + "loss": 1.2881, + "step": 4770 + }, + { + "epoch": 0.3842443729903537, + "grad_norm": 1.68783438205719, + "learning_rate": 8.112187247780468e-05, + "loss": 1.2599, + "step": 4780 + }, + { + "epoch": 0.385048231511254, + "grad_norm": 1.1215641498565674, + "learning_rate": 8.10815173527038e-05, + "loss": 1.2125, + "step": 4790 + }, + { + "epoch": 0.3858520900321543, + "grad_norm": 1.1371439695358276, + "learning_rate": 8.10411622276029e-05, + "loss": 1.2937, + "step": 4800 + }, + { + "epoch": 0.3866559485530547, + "grad_norm": 1.5197312831878662, + "learning_rate": 8.100080710250202e-05, + "loss": 1.1736, + "step": 4810 + }, + { + "epoch": 0.387459807073955, + "grad_norm": 1.3680319786071777, + "learning_rate": 8.096045197740113e-05, + "loss": 1.2799, + "step": 4820 + }, + { + "epoch": 0.3882636655948553, + "grad_norm": 1.2305307388305664, + "learning_rate": 8.092009685230025e-05, + "loss": 1.1474, + "step": 4830 + }, + { + "epoch": 0.3890675241157556, + "grad_norm": 1.3372670412063599, + "learning_rate": 8.087974172719935e-05, + "loss": 1.2025, + "step": 4840 + }, + { + "epoch": 0.38987138263665594, + "grad_norm": 1.3374559879302979, + "learning_rate": 8.083938660209847e-05, + "loss": 1.2011, + "step": 4850 + }, + { + "epoch": 0.39067524115755625, + "grad_norm": 1.3609942197799683, + "learning_rate": 8.079903147699757e-05, + "loss": 1.3043, + "step": 4860 + }, + { + "epoch": 0.39147909967845657, + "grad_norm": 1.3824046850204468, + "learning_rate": 8.075867635189669e-05, + "loss": 1.2488, + "step": 4870 + }, + { + "epoch": 0.39228295819935693, + "grad_norm": 1.2782840728759766, + "learning_rate": 8.071832122679581e-05, + "loss": 1.1891, + "step": 4880 + }, + { + "epoch": 0.39308681672025725, + "grad_norm": 1.4103243350982666, + "learning_rate": 8.067796610169493e-05, + "loss": 1.3044, + "step": 4890 + }, + { + "epoch": 0.39389067524115756, + "grad_norm": 1.2876780033111572, + "learning_rate": 8.063761097659403e-05, + 
"loss": 1.1633, + "step": 4900 + }, + { + "epoch": 0.3946945337620579, + "grad_norm": 1.3857921361923218, + "learning_rate": 8.059725585149315e-05, + "loss": 1.2069, + "step": 4910 + }, + { + "epoch": 0.3954983922829582, + "grad_norm": 1.3935562372207642, + "learning_rate": 8.055690072639225e-05, + "loss": 1.1609, + "step": 4920 + }, + { + "epoch": 0.3963022508038585, + "grad_norm": 2.10925030708313, + "learning_rate": 8.051654560129137e-05, + "loss": 1.3154, + "step": 4930 + }, + { + "epoch": 0.3971061093247588, + "grad_norm": 1.2593389749526978, + "learning_rate": 8.047619047619048e-05, + "loss": 1.1579, + "step": 4940 + }, + { + "epoch": 0.3979099678456592, + "grad_norm": 1.4237825870513916, + "learning_rate": 8.04358353510896e-05, + "loss": 1.2519, + "step": 4950 + }, + { + "epoch": 0.3987138263665595, + "grad_norm": 1.0136818885803223, + "learning_rate": 8.03954802259887e-05, + "loss": 1.236, + "step": 4960 + }, + { + "epoch": 0.3995176848874598, + "grad_norm": 1.361685872077942, + "learning_rate": 8.035512510088782e-05, + "loss": 1.091, + "step": 4970 + }, + { + "epoch": 0.4003215434083601, + "grad_norm": 1.5500177145004272, + "learning_rate": 8.031476997578694e-05, + "loss": 1.2047, + "step": 4980 + }, + { + "epoch": 0.40112540192926044, + "grad_norm": 1.5440484285354614, + "learning_rate": 8.027441485068604e-05, + "loss": 1.2541, + "step": 4990 + }, + { + "epoch": 0.40192926045016075, + "grad_norm": 1.7334917783737183, + "learning_rate": 8.023405972558516e-05, + "loss": 1.126, + "step": 5000 + }, + { + "epoch": 0.40273311897106107, + "grad_norm": 1.1905109882354736, + "learning_rate": 8.019370460048426e-05, + "loss": 1.2036, + "step": 5010 + }, + { + "epoch": 0.40353697749196143, + "grad_norm": 1.008507251739502, + "learning_rate": 8.015334947538338e-05, + "loss": 1.3405, + "step": 5020 + }, + { + "epoch": 0.40434083601286175, + "grad_norm": 1.1107839345932007, + "learning_rate": 8.011299435028249e-05, + "loss": 1.2577, + "step": 5030 + }, + { + "epoch": 0.40514469453376206, + "grad_norm": 1.179787039756775, + "learning_rate": 8.00726392251816e-05, + "loss": 1.1532, + "step": 5040 + }, + { + "epoch": 0.4059485530546624, + "grad_norm": 1.4130208492279053, + "learning_rate": 8.003228410008071e-05, + "loss": 1.2049, + "step": 5050 + }, + { + "epoch": 0.4067524115755627, + "grad_norm": 1.2334232330322266, + "learning_rate": 7.999192897497983e-05, + "loss": 1.1509, + "step": 5060 + }, + { + "epoch": 0.407556270096463, + "grad_norm": 1.5695637464523315, + "learning_rate": 7.995157384987893e-05, + "loss": 1.2461, + "step": 5070 + }, + { + "epoch": 0.40836012861736337, + "grad_norm": 1.1055469512939453, + "learning_rate": 7.991121872477805e-05, + "loss": 1.0805, + "step": 5080 + }, + { + "epoch": 0.4091639871382637, + "grad_norm": 1.6603456735610962, + "learning_rate": 7.987086359967715e-05, + "loss": 1.0994, + "step": 5090 + }, + { + "epoch": 0.409967845659164, + "grad_norm": 1.1670210361480713, + "learning_rate": 7.983050847457627e-05, + "loss": 1.2261, + "step": 5100 + }, + { + "epoch": 0.4107717041800643, + "grad_norm": 1.5910462141036987, + "learning_rate": 7.979015334947538e-05, + "loss": 1.1899, + "step": 5110 + }, + { + "epoch": 0.4115755627009646, + "grad_norm": 2.9151222705841064, + "learning_rate": 7.97497982243745e-05, + "loss": 1.1837, + "step": 5120 + }, + { + "epoch": 0.41237942122186494, + "grad_norm": 1.3645329475402832, + "learning_rate": 7.97094430992736e-05, + "loss": 1.2522, + "step": 5130 + }, + { + "epoch": 0.41318327974276525, + "grad_norm": 1.454911708831787, + 
"learning_rate": 7.966908797417273e-05, + "loss": 1.1749, + "step": 5140 + }, + { + "epoch": 0.4139871382636656, + "grad_norm": 2.2036333084106445, + "learning_rate": 7.962873284907184e-05, + "loss": 1.2823, + "step": 5150 + }, + { + "epoch": 0.41479099678456594, + "grad_norm": 1.2479572296142578, + "learning_rate": 7.958837772397095e-05, + "loss": 1.2181, + "step": 5160 + }, + { + "epoch": 0.41559485530546625, + "grad_norm": 1.1498271226882935, + "learning_rate": 7.954802259887006e-05, + "loss": 1.2351, + "step": 5170 + }, + { + "epoch": 0.41639871382636656, + "grad_norm": 1.4739044904708862, + "learning_rate": 7.950766747376918e-05, + "loss": 1.2791, + "step": 5180 + }, + { + "epoch": 0.4172025723472669, + "grad_norm": 1.3020864725112915, + "learning_rate": 7.946731234866828e-05, + "loss": 1.1077, + "step": 5190 + }, + { + "epoch": 0.4180064308681672, + "grad_norm": 1.4907817840576172, + "learning_rate": 7.94269572235674e-05, + "loss": 1.3331, + "step": 5200 + }, + { + "epoch": 0.4188102893890675, + "grad_norm": 1.3892371654510498, + "learning_rate": 7.938660209846652e-05, + "loss": 1.2152, + "step": 5210 + }, + { + "epoch": 0.41961414790996787, + "grad_norm": 0.9748413562774658, + "learning_rate": 7.934624697336562e-05, + "loss": 1.2838, + "step": 5220 + }, + { + "epoch": 0.4204180064308682, + "grad_norm": 1.5303418636322021, + "learning_rate": 7.930589184826474e-05, + "loss": 1.2055, + "step": 5230 + }, + { + "epoch": 0.4212218649517685, + "grad_norm": 1.7131788730621338, + "learning_rate": 7.926553672316385e-05, + "loss": 1.2611, + "step": 5240 + }, + { + "epoch": 0.4220257234726688, + "grad_norm": 1.1032283306121826, + "learning_rate": 7.922518159806296e-05, + "loss": 1.2017, + "step": 5250 + }, + { + "epoch": 0.4228295819935691, + "grad_norm": 1.971808910369873, + "learning_rate": 7.918482647296207e-05, + "loss": 1.2185, + "step": 5260 + }, + { + "epoch": 0.42363344051446944, + "grad_norm": 1.2828810214996338, + "learning_rate": 7.914447134786119e-05, + "loss": 1.1686, + "step": 5270 + }, + { + "epoch": 0.42443729903536975, + "grad_norm": 1.083824872970581, + "learning_rate": 7.910411622276029e-05, + "loss": 1.2189, + "step": 5280 + }, + { + "epoch": 0.4252411575562701, + "grad_norm": 1.4020578861236572, + "learning_rate": 7.906376109765941e-05, + "loss": 1.1495, + "step": 5290 + }, + { + "epoch": 0.42604501607717044, + "grad_norm": 1.1178410053253174, + "learning_rate": 7.902340597255851e-05, + "loss": 1.2211, + "step": 5300 + }, + { + "epoch": 0.42684887459807075, + "grad_norm": 1.4398339986801147, + "learning_rate": 7.898305084745763e-05, + "loss": 1.2003, + "step": 5310 + }, + { + "epoch": 0.42765273311897106, + "grad_norm": 1.1164844036102295, + "learning_rate": 7.894269572235674e-05, + "loss": 1.1514, + "step": 5320 + }, + { + "epoch": 0.4284565916398714, + "grad_norm": 1.5604671239852905, + "learning_rate": 7.890234059725585e-05, + "loss": 1.3449, + "step": 5330 + }, + { + "epoch": 0.4292604501607717, + "grad_norm": 1.345582127571106, + "learning_rate": 7.886198547215496e-05, + "loss": 1.144, + "step": 5340 + }, + { + "epoch": 0.430064308681672, + "grad_norm": 1.3933559656143188, + "learning_rate": 7.882163034705408e-05, + "loss": 1.1806, + "step": 5350 + }, + { + "epoch": 0.43086816720257237, + "grad_norm": 1.3390429019927979, + "learning_rate": 7.878127522195318e-05, + "loss": 1.218, + "step": 5360 + }, + { + "epoch": 0.4316720257234727, + "grad_norm": 2.075464963912964, + "learning_rate": 7.87409200968523e-05, + "loss": 1.0676, + "step": 5370 + }, + { + "epoch": 
0.432475884244373, + "grad_norm": 1.098965048789978, + "learning_rate": 7.870056497175142e-05, + "loss": 1.1328, + "step": 5380 + }, + { + "epoch": 0.4332797427652733, + "grad_norm": 2.4418158531188965, + "learning_rate": 7.866020984665052e-05, + "loss": 1.169, + "step": 5390 + }, + { + "epoch": 0.4340836012861736, + "grad_norm": 1.4988317489624023, + "learning_rate": 7.861985472154964e-05, + "loss": 1.1963, + "step": 5400 + }, + { + "epoch": 0.43488745980707394, + "grad_norm": 1.1517032384872437, + "learning_rate": 7.857949959644876e-05, + "loss": 1.2728, + "step": 5410 + }, + { + "epoch": 0.43569131832797425, + "grad_norm": 1.3836979866027832, + "learning_rate": 7.853914447134786e-05, + "loss": 1.2674, + "step": 5420 + }, + { + "epoch": 0.4364951768488746, + "grad_norm": 1.0794776678085327, + "learning_rate": 7.849878934624698e-05, + "loss": 1.1862, + "step": 5430 + }, + { + "epoch": 0.43729903536977494, + "grad_norm": 1.3821418285369873, + "learning_rate": 7.845843422114609e-05, + "loss": 1.2164, + "step": 5440 + }, + { + "epoch": 0.43810289389067525, + "grad_norm": 1.051756739616394, + "learning_rate": 7.84180790960452e-05, + "loss": 1.2235, + "step": 5450 + }, + { + "epoch": 0.43890675241157556, + "grad_norm": 1.1937540769577026, + "learning_rate": 7.837772397094432e-05, + "loss": 1.2094, + "step": 5460 + }, + { + "epoch": 0.4397106109324759, + "grad_norm": 1.652544617652893, + "learning_rate": 7.833736884584343e-05, + "loss": 1.3222, + "step": 5470 + }, + { + "epoch": 0.4405144694533762, + "grad_norm": 1.5041916370391846, + "learning_rate": 7.829701372074255e-05, + "loss": 1.1524, + "step": 5480 + }, + { + "epoch": 0.4413183279742765, + "grad_norm": 2.3479011058807373, + "learning_rate": 7.825665859564165e-05, + "loss": 1.2593, + "step": 5490 + }, + { + "epoch": 0.44212218649517687, + "grad_norm": 1.3248958587646484, + "learning_rate": 7.821630347054077e-05, + "loss": 1.1777, + "step": 5500 + }, + { + "epoch": 0.4429260450160772, + "grad_norm": 1.0060685873031616, + "learning_rate": 7.817594834543987e-05, + "loss": 1.2084, + "step": 5510 + }, + { + "epoch": 0.4437299035369775, + "grad_norm": 1.3419932126998901, + "learning_rate": 7.813559322033899e-05, + "loss": 1.1381, + "step": 5520 + }, + { + "epoch": 0.4445337620578778, + "grad_norm": 1.8088581562042236, + "learning_rate": 7.80952380952381e-05, + "loss": 1.1101, + "step": 5530 + }, + { + "epoch": 0.4453376205787781, + "grad_norm": 1.0692769289016724, + "learning_rate": 7.805488297013721e-05, + "loss": 1.2082, + "step": 5540 + }, + { + "epoch": 0.44614147909967844, + "grad_norm": 1.071608066558838, + "learning_rate": 7.801452784503632e-05, + "loss": 1.165, + "step": 5550 + }, + { + "epoch": 0.44694533762057875, + "grad_norm": 1.3941121101379395, + "learning_rate": 7.797417271993544e-05, + "loss": 1.2301, + "step": 5560 + }, + { + "epoch": 0.4477491961414791, + "grad_norm": 1.166388750076294, + "learning_rate": 7.793381759483454e-05, + "loss": 1.1386, + "step": 5570 + }, + { + "epoch": 0.44855305466237944, + "grad_norm": 1.871343731880188, + "learning_rate": 7.789346246973366e-05, + "loss": 1.2296, + "step": 5580 + }, + { + "epoch": 0.44935691318327975, + "grad_norm": 1.4270875453948975, + "learning_rate": 7.785310734463276e-05, + "loss": 1.2229, + "step": 5590 + }, + { + "epoch": 0.45016077170418006, + "grad_norm": 1.0972518920898438, + "learning_rate": 7.781275221953188e-05, + "loss": 1.1807, + "step": 5600 + }, + { + "epoch": 0.4509646302250804, + "grad_norm": 2.3128669261932373, + "learning_rate": 7.7772397094431e-05, + "loss": 
1.3243, + "step": 5610 + }, + { + "epoch": 0.4517684887459807, + "grad_norm": 1.4300646781921387, + "learning_rate": 7.77320419693301e-05, + "loss": 1.2877, + "step": 5620 + }, + { + "epoch": 0.452572347266881, + "grad_norm": 1.5658491849899292, + "learning_rate": 7.769168684422922e-05, + "loss": 1.206, + "step": 5630 + }, + { + "epoch": 0.4533762057877814, + "grad_norm": 1.6786673069000244, + "learning_rate": 7.765133171912833e-05, + "loss": 1.1521, + "step": 5640 + }, + { + "epoch": 0.4541800643086817, + "grad_norm": 1.7415214776992798, + "learning_rate": 7.761097659402745e-05, + "loss": 1.0989, + "step": 5650 + }, + { + "epoch": 0.454983922829582, + "grad_norm": 1.5428780317306519, + "learning_rate": 7.757062146892655e-05, + "loss": 1.2544, + "step": 5660 + }, + { + "epoch": 0.4557877813504823, + "grad_norm": 1.2579231262207031, + "learning_rate": 7.753026634382567e-05, + "loss": 1.2282, + "step": 5670 + }, + { + "epoch": 0.4565916398713826, + "grad_norm": 1.019351601600647, + "learning_rate": 7.748991121872479e-05, + "loss": 1.1731, + "step": 5680 + }, + { + "epoch": 0.45739549839228294, + "grad_norm": 1.2908672094345093, + "learning_rate": 7.74495560936239e-05, + "loss": 1.0881, + "step": 5690 + }, + { + "epoch": 0.45819935691318325, + "grad_norm": 1.3027087450027466, + "learning_rate": 7.740920096852301e-05, + "loss": 1.2435, + "step": 5700 + }, + { + "epoch": 0.4590032154340836, + "grad_norm": 1.603829264640808, + "learning_rate": 7.736884584342213e-05, + "loss": 1.2332, + "step": 5710 + }, + { + "epoch": 0.45980707395498394, + "grad_norm": 1.3460705280303955, + "learning_rate": 7.732849071832123e-05, + "loss": 1.2056, + "step": 5720 + }, + { + "epoch": 0.46061093247588425, + "grad_norm": 1.7570997476577759, + "learning_rate": 7.728813559322035e-05, + "loss": 1.2024, + "step": 5730 + }, + { + "epoch": 0.46141479099678456, + "grad_norm": 1.3634543418884277, + "learning_rate": 7.724778046811945e-05, + "loss": 1.1674, + "step": 5740 + }, + { + "epoch": 0.4622186495176849, + "grad_norm": 1.3680329322814941, + "learning_rate": 7.720742534301857e-05, + "loss": 1.2917, + "step": 5750 + }, + { + "epoch": 0.4630225080385852, + "grad_norm": 1.585895299911499, + "learning_rate": 7.716707021791768e-05, + "loss": 1.202, + "step": 5760 + }, + { + "epoch": 0.4638263665594855, + "grad_norm": 1.8435193300247192, + "learning_rate": 7.71267150928168e-05, + "loss": 1.2317, + "step": 5770 + }, + { + "epoch": 0.4646302250803859, + "grad_norm": 0.9803140759468079, + "learning_rate": 7.70863599677159e-05, + "loss": 1.1616, + "step": 5780 + }, + { + "epoch": 0.4654340836012862, + "grad_norm": 1.3502463102340698, + "learning_rate": 7.704600484261502e-05, + "loss": 1.2768, + "step": 5790 + }, + { + "epoch": 0.4662379421221865, + "grad_norm": 1.094741940498352, + "learning_rate": 7.700564971751412e-05, + "loss": 1.1815, + "step": 5800 + }, + { + "epoch": 0.4670418006430868, + "grad_norm": 1.1807562112808228, + "learning_rate": 7.696529459241324e-05, + "loss": 1.173, + "step": 5810 + }, + { + "epoch": 0.4678456591639871, + "grad_norm": 1.4296817779541016, + "learning_rate": 7.692493946731235e-05, + "loss": 1.19, + "step": 5820 + }, + { + "epoch": 0.46864951768488744, + "grad_norm": 1.4732331037521362, + "learning_rate": 7.688458434221146e-05, + "loss": 1.1519, + "step": 5830 + }, + { + "epoch": 0.4694533762057878, + "grad_norm": 1.2672715187072754, + "learning_rate": 7.684422921711057e-05, + "loss": 1.1821, + "step": 5840 + }, + { + "epoch": 0.4702572347266881, + "grad_norm": 1.078221321105957, + 
"learning_rate": 7.680387409200969e-05, + "loss": 1.1839, + "step": 5850 + }, + { + "epoch": 0.47106109324758844, + "grad_norm": 1.2685880661010742, + "learning_rate": 7.67635189669088e-05, + "loss": 1.2756, + "step": 5860 + }, + { + "epoch": 0.47186495176848875, + "grad_norm": 1.14828360080719, + "learning_rate": 7.672316384180791e-05, + "loss": 1.122, + "step": 5870 + }, + { + "epoch": 0.47266881028938906, + "grad_norm": 1.5411911010742188, + "learning_rate": 7.668280871670703e-05, + "loss": 1.3135, + "step": 5880 + }, + { + "epoch": 0.4734726688102894, + "grad_norm": 1.9456324577331543, + "learning_rate": 7.664245359160613e-05, + "loss": 1.3535, + "step": 5890 + }, + { + "epoch": 0.4742765273311897, + "grad_norm": 1.542708396911621, + "learning_rate": 7.660209846650525e-05, + "loss": 1.3669, + "step": 5900 + }, + { + "epoch": 0.47508038585209006, + "grad_norm": 1.6236876249313354, + "learning_rate": 7.656174334140435e-05, + "loss": 1.128, + "step": 5910 + }, + { + "epoch": 0.4758842443729904, + "grad_norm": 1.0894272327423096, + "learning_rate": 7.652138821630347e-05, + "loss": 1.2741, + "step": 5920 + }, + { + "epoch": 0.4766881028938907, + "grad_norm": 1.1134109497070312, + "learning_rate": 7.648103309120259e-05, + "loss": 1.2204, + "step": 5930 + }, + { + "epoch": 0.477491961414791, + "grad_norm": 1.1840413808822632, + "learning_rate": 7.644067796610171e-05, + "loss": 1.12, + "step": 5940 + }, + { + "epoch": 0.4782958199356913, + "grad_norm": 2.185009241104126, + "learning_rate": 7.640032284100081e-05, + "loss": 1.1607, + "step": 5950 + }, + { + "epoch": 0.4790996784565916, + "grad_norm": 1.7973313331604004, + "learning_rate": 7.635996771589993e-05, + "loss": 1.1847, + "step": 5960 + }, + { + "epoch": 0.47990353697749194, + "grad_norm": 1.4497991800308228, + "learning_rate": 7.631961259079904e-05, + "loss": 1.1776, + "step": 5970 + }, + { + "epoch": 0.4807073954983923, + "grad_norm": 1.4984551668167114, + "learning_rate": 7.627925746569815e-05, + "loss": 1.2923, + "step": 5980 + }, + { + "epoch": 0.4815112540192926, + "grad_norm": 1.1014914512634277, + "learning_rate": 7.623890234059726e-05, + "loss": 1.0993, + "step": 5990 + }, + { + "epoch": 0.48231511254019294, + "grad_norm": 1.5178377628326416, + "learning_rate": 7.619854721549638e-05, + "loss": 1.161, + "step": 6000 + }, + { + "epoch": 0.48231511254019294, + "eval_yahma/alpaca-cleaned_loss": 1.2380210161209106, + "eval_yahma/alpaca-cleaned_runtime": 115.6688, + "eval_yahma/alpaca-cleaned_samples_per_second": 17.291, + "eval_yahma/alpaca-cleaned_steps_per_second": 2.161, + "step": 6000 + }, + { + "epoch": 0.48311897106109325, + "grad_norm": 1.408656120300293, + "learning_rate": 7.615819209039548e-05, + "loss": 1.2989, + "step": 6010 + }, + { + "epoch": 0.48392282958199356, + "grad_norm": 1.2989753484725952, + "learning_rate": 7.61178369652946e-05, + "loss": 1.2565, + "step": 6020 + }, + { + "epoch": 0.4847266881028939, + "grad_norm": 1.5819889307022095, + "learning_rate": 7.60774818401937e-05, + "loss": 1.1385, + "step": 6030 + }, + { + "epoch": 0.4855305466237942, + "grad_norm": 1.1295589208602905, + "learning_rate": 7.603712671509282e-05, + "loss": 1.203, + "step": 6040 + }, + { + "epoch": 0.48633440514469456, + "grad_norm": 1.2585210800170898, + "learning_rate": 7.599677158999193e-05, + "loss": 1.3078, + "step": 6050 + }, + { + "epoch": 0.4871382636655949, + "grad_norm": 1.024665355682373, + "learning_rate": 7.595641646489105e-05, + "loss": 1.2181, + "step": 6060 + }, + { + "epoch": 0.4879421221864952, + "grad_norm": 
1.289249062538147, + "learning_rate": 7.591606133979015e-05, + "loss": 1.3394, + "step": 6070 + }, + { + "epoch": 0.4887459807073955, + "grad_norm": 3.0227270126342773, + "learning_rate": 7.587570621468927e-05, + "loss": 1.1185, + "step": 6080 + }, + { + "epoch": 0.4895498392282958, + "grad_norm": 1.325408935546875, + "learning_rate": 7.583535108958839e-05, + "loss": 1.2544, + "step": 6090 + }, + { + "epoch": 0.4903536977491961, + "grad_norm": 1.8913363218307495, + "learning_rate": 7.579499596448749e-05, + "loss": 1.2673, + "step": 6100 + }, + { + "epoch": 0.49115755627009644, + "grad_norm": 1.3949637413024902, + "learning_rate": 7.575464083938661e-05, + "loss": 1.1629, + "step": 6110 + }, + { + "epoch": 0.4919614147909968, + "grad_norm": 1.7149893045425415, + "learning_rate": 7.571428571428571e-05, + "loss": 1.3161, + "step": 6120 + }, + { + "epoch": 0.4927652733118971, + "grad_norm": 1.4487000703811646, + "learning_rate": 7.567393058918483e-05, + "loss": 1.1248, + "step": 6130 + }, + { + "epoch": 0.49356913183279744, + "grad_norm": 1.4671707153320312, + "learning_rate": 7.563357546408394e-05, + "loss": 1.1904, + "step": 6140 + }, + { + "epoch": 0.49437299035369775, + "grad_norm": 1.4180289506912231, + "learning_rate": 7.559322033898305e-05, + "loss": 1.1742, + "step": 6150 + }, + { + "epoch": 0.49517684887459806, + "grad_norm": 1.335613489151001, + "learning_rate": 7.555286521388216e-05, + "loss": 1.19, + "step": 6160 + }, + { + "epoch": 0.4959807073954984, + "grad_norm": 1.493781566619873, + "learning_rate": 7.551251008878128e-05, + "loss": 1.249, + "step": 6170 + }, + { + "epoch": 0.4967845659163987, + "grad_norm": 1.7311235666275024, + "learning_rate": 7.547215496368038e-05, + "loss": 1.2581, + "step": 6180 + }, + { + "epoch": 0.49758842443729906, + "grad_norm": 1.3849860429763794, + "learning_rate": 7.54317998385795e-05, + "loss": 1.2069, + "step": 6190 + }, + { + "epoch": 0.4983922829581994, + "grad_norm": 1.0231637954711914, + "learning_rate": 7.539144471347862e-05, + "loss": 1.1676, + "step": 6200 + }, + { + "epoch": 0.4991961414790997, + "grad_norm": 1.1450506448745728, + "learning_rate": 7.535108958837774e-05, + "loss": 1.2508, + "step": 6210 + }, + { + "epoch": 0.5, + "grad_norm": 1.203847885131836, + "learning_rate": 7.531073446327684e-05, + "loss": 1.3101, + "step": 6220 + }, + { + "epoch": 0.5008038585209004, + "grad_norm": 1.2034550905227661, + "learning_rate": 7.527037933817596e-05, + "loss": 1.2576, + "step": 6230 + }, + { + "epoch": 0.5016077170418006, + "grad_norm": 1.3764240741729736, + "learning_rate": 7.523002421307506e-05, + "loss": 1.1927, + "step": 6240 + }, + { + "epoch": 0.502411575562701, + "grad_norm": 1.907971978187561, + "learning_rate": 7.518966908797418e-05, + "loss": 1.2045, + "step": 6250 + }, + { + "epoch": 0.5032154340836013, + "grad_norm": 1.417354702949524, + "learning_rate": 7.514931396287329e-05, + "loss": 1.2857, + "step": 6260 + }, + { + "epoch": 0.5040192926045016, + "grad_norm": 1.4104013442993164, + "learning_rate": 7.51089588377724e-05, + "loss": 1.2178, + "step": 6270 + }, + { + "epoch": 0.5048231511254019, + "grad_norm": 1.3243697881698608, + "learning_rate": 7.506860371267151e-05, + "loss": 1.2072, + "step": 6280 + }, + { + "epoch": 0.5056270096463023, + "grad_norm": 1.7258855104446411, + "learning_rate": 7.502824858757063e-05, + "loss": 1.2003, + "step": 6290 + }, + { + "epoch": 0.5064308681672026, + "grad_norm": 1.7383719682693481, + "learning_rate": 7.498789346246973e-05, + "loss": 1.21, + "step": 6300 + }, + { + "epoch": 
0.5072347266881029, + "grad_norm": 1.1666938066482544, + "learning_rate": 7.494753833736885e-05, + "loss": 1.2231, + "step": 6310 + }, + { + "epoch": 0.5080385852090032, + "grad_norm": 1.3047698736190796, + "learning_rate": 7.491121872477805e-05, + "loss": 1.025, + "step": 6320 + }, + { + "epoch": 0.5088424437299035, + "grad_norm": 2.041975259780884, + "learning_rate": 7.487086359967717e-05, + "loss": 1.1173, + "step": 6330 + }, + { + "epoch": 0.5096463022508039, + "grad_norm": 1.8618550300598145, + "learning_rate": 7.483050847457627e-05, + "loss": 1.1592, + "step": 6340 + }, + { + "epoch": 0.5104501607717041, + "grad_norm": 1.1699954271316528, + "learning_rate": 7.479015334947539e-05, + "loss": 1.0528, + "step": 6350 + }, + { + "epoch": 0.5112540192926045, + "grad_norm": 1.319175362586975, + "learning_rate": 7.47497982243745e-05, + "loss": 1.199, + "step": 6360 + }, + { + "epoch": 0.5120578778135049, + "grad_norm": 1.1470156908035278, + "learning_rate": 7.470944309927361e-05, + "loss": 1.0411, + "step": 6370 + }, + { + "epoch": 0.5128617363344051, + "grad_norm": 1.7862601280212402, + "learning_rate": 7.466908797417272e-05, + "loss": 1.2587, + "step": 6380 + }, + { + "epoch": 0.5136655948553055, + "grad_norm": 1.924744963645935, + "learning_rate": 7.462873284907184e-05, + "loss": 1.2351, + "step": 6390 + }, + { + "epoch": 0.5144694533762058, + "grad_norm": 1.4155441522598267, + "learning_rate": 7.458837772397094e-05, + "loss": 1.1327, + "step": 6400 + }, + { + "epoch": 0.5152733118971061, + "grad_norm": 1.2903022766113281, + "learning_rate": 7.455205811138016e-05, + "loss": 1.0905, + "step": 6410 + }, + { + "epoch": 0.5160771704180064, + "grad_norm": 1.745773434638977, + "learning_rate": 7.451170298627926e-05, + "loss": 1.1903, + "step": 6420 + }, + { + "epoch": 0.5168810289389068, + "grad_norm": 1.2349950075149536, + "learning_rate": 7.447134786117838e-05, + "loss": 1.1813, + "step": 6430 + }, + { + "epoch": 0.5176848874598071, + "grad_norm": 1.8500481843948364, + "learning_rate": 7.443099273607748e-05, + "loss": 1.1865, + "step": 6440 + }, + { + "epoch": 0.5184887459807074, + "grad_norm": 1.355523705482483, + "learning_rate": 7.43906376109766e-05, + "loss": 1.1611, + "step": 6450 + }, + { + "epoch": 0.5192926045016077, + "grad_norm": 1.3113207817077637, + "learning_rate": 7.43502824858757e-05, + "loss": 1.1685, + "step": 6460 + }, + { + "epoch": 0.520096463022508, + "grad_norm": 1.3658199310302734, + "learning_rate": 7.430992736077482e-05, + "loss": 1.2853, + "step": 6470 + }, + { + "epoch": 0.5209003215434084, + "grad_norm": 1.1712766885757446, + "learning_rate": 7.426957223567393e-05, + "loss": 1.2977, + "step": 6480 + }, + { + "epoch": 0.5217041800643086, + "grad_norm": 1.9549909830093384, + "learning_rate": 7.422921711057305e-05, + "loss": 1.2796, + "step": 6490 + }, + { + "epoch": 0.522508038585209, + "grad_norm": 2.5356125831604004, + "learning_rate": 7.418886198547215e-05, + "loss": 1.143, + "step": 6500 + }, + { + "epoch": 0.5233118971061094, + "grad_norm": 1.2105944156646729, + "learning_rate": 7.414850686037127e-05, + "loss": 1.3304, + "step": 6510 + }, + { + "epoch": 0.5241157556270096, + "grad_norm": 0.9766327738761902, + "learning_rate": 7.410815173527039e-05, + "loss": 1.2569, + "step": 6520 + }, + { + "epoch": 0.52491961414791, + "grad_norm": 1.3324644565582275, + "learning_rate": 7.406779661016949e-05, + "loss": 1.1962, + "step": 6530 + }, + { + "epoch": 0.5257234726688103, + "grad_norm": 1.4825365543365479, + "learning_rate": 7.402744148506861e-05, + "loss": 1.182, + 
"step": 6540 + }, + { + "epoch": 0.5265273311897106, + "grad_norm": 1.0984454154968262, + "learning_rate": 7.398708635996772e-05, + "loss": 1.325, + "step": 6550 + }, + { + "epoch": 0.5273311897106109, + "grad_norm": 1.4700428247451782, + "learning_rate": 7.394673123486683e-05, + "loss": 1.1464, + "step": 6560 + }, + { + "epoch": 0.5281350482315113, + "grad_norm": 1.6935198307037354, + "learning_rate": 7.390637610976594e-05, + "loss": 1.2638, + "step": 6570 + }, + { + "epoch": 0.5289389067524116, + "grad_norm": 1.5501216650009155, + "learning_rate": 7.386602098466506e-05, + "loss": 1.1705, + "step": 6580 + }, + { + "epoch": 0.5297427652733119, + "grad_norm": 1.5348395109176636, + "learning_rate": 7.382566585956416e-05, + "loss": 1.249, + "step": 6590 + }, + { + "epoch": 0.5305466237942122, + "grad_norm": 1.548414945602417, + "learning_rate": 7.378531073446328e-05, + "loss": 1.2001, + "step": 6600 + }, + { + "epoch": 0.5313504823151125, + "grad_norm": 1.1692911386489868, + "learning_rate": 7.374495560936238e-05, + "loss": 1.0845, + "step": 6610 + }, + { + "epoch": 0.5321543408360129, + "grad_norm": 2.0657691955566406, + "learning_rate": 7.370460048426152e-05, + "loss": 1.4335, + "step": 6620 + }, + { + "epoch": 0.5329581993569131, + "grad_norm": 1.2757883071899414, + "learning_rate": 7.366424535916062e-05, + "loss": 1.2333, + "step": 6630 + }, + { + "epoch": 0.5337620578778135, + "grad_norm": 1.6110565662384033, + "learning_rate": 7.362389023405974e-05, + "loss": 1.1881, + "step": 6640 + }, + { + "epoch": 0.5345659163987139, + "grad_norm": 1.3758124113082886, + "learning_rate": 7.358353510895884e-05, + "loss": 1.2932, + "step": 6650 + }, + { + "epoch": 0.5353697749196141, + "grad_norm": 1.5866690874099731, + "learning_rate": 7.354317998385796e-05, + "loss": 1.2757, + "step": 6660 + }, + { + "epoch": 0.5361736334405145, + "grad_norm": 1.281878113746643, + "learning_rate": 7.350282485875707e-05, + "loss": 1.2987, + "step": 6670 + }, + { + "epoch": 0.5369774919614148, + "grad_norm": 1.4316363334655762, + "learning_rate": 7.346246973365618e-05, + "loss": 1.2868, + "step": 6680 + }, + { + "epoch": 0.5377813504823151, + "grad_norm": 1.2454899549484253, + "learning_rate": 7.342211460855529e-05, + "loss": 1.2473, + "step": 6690 + }, + { + "epoch": 0.5385852090032154, + "grad_norm": 1.4715195894241333, + "learning_rate": 7.33817594834544e-05, + "loss": 1.2471, + "step": 6700 + }, + { + "epoch": 0.5393890675241158, + "grad_norm": 1.0020066499710083, + "learning_rate": 7.334140435835351e-05, + "loss": 1.2116, + "step": 6710 + }, + { + "epoch": 0.5401929260450161, + "grad_norm": 1.8996168375015259, + "learning_rate": 7.330104923325263e-05, + "loss": 1.1744, + "step": 6720 + }, + { + "epoch": 0.5409967845659164, + "grad_norm": 1.3301576375961304, + "learning_rate": 7.326069410815173e-05, + "loss": 1.2056, + "step": 6730 + }, + { + "epoch": 0.5418006430868167, + "grad_norm": 1.354276180267334, + "learning_rate": 7.322033898305085e-05, + "loss": 1.2008, + "step": 6740 + }, + { + "epoch": 0.542604501607717, + "grad_norm": 1.4722867012023926, + "learning_rate": 7.317998385794997e-05, + "loss": 1.2248, + "step": 6750 + }, + { + "epoch": 0.5434083601286174, + "grad_norm": 1.3259929418563843, + "learning_rate": 7.313962873284907e-05, + "loss": 1.2193, + "step": 6760 + }, + { + "epoch": 0.5442122186495176, + "grad_norm": 1.7194771766662598, + "learning_rate": 7.309927360774819e-05, + "loss": 1.2812, + "step": 6770 + }, + { + "epoch": 0.545016077170418, + "grad_norm": 1.4885749816894531, + "learning_rate": 
7.30589184826473e-05, + "loss": 1.0297, + "step": 6780 + }, + { + "epoch": 0.5458199356913184, + "grad_norm": 1.3488175868988037, + "learning_rate": 7.301856335754641e-05, + "loss": 1.2759, + "step": 6790 + }, + { + "epoch": 0.5466237942122186, + "grad_norm": 1.410971760749817, + "learning_rate": 7.297820823244552e-05, + "loss": 1.1545, + "step": 6800 + }, + { + "epoch": 0.547427652733119, + "grad_norm": 1.3579992055892944, + "learning_rate": 7.293785310734464e-05, + "loss": 1.219, + "step": 6810 + }, + { + "epoch": 0.5482315112540193, + "grad_norm": 1.5444881916046143, + "learning_rate": 7.289749798224374e-05, + "loss": 1.1847, + "step": 6820 + }, + { + "epoch": 0.5490353697749196, + "grad_norm": 1.577883005142212, + "learning_rate": 7.285714285714286e-05, + "loss": 1.103, + "step": 6830 + }, + { + "epoch": 0.5498392282958199, + "grad_norm": 1.9665368795394897, + "learning_rate": 7.281678773204197e-05, + "loss": 1.1444, + "step": 6840 + }, + { + "epoch": 0.5506430868167203, + "grad_norm": 1.849618673324585, + "learning_rate": 7.277643260694108e-05, + "loss": 1.3576, + "step": 6850 + }, + { + "epoch": 0.5514469453376206, + "grad_norm": 1.6548022031784058, + "learning_rate": 7.273607748184019e-05, + "loss": 1.2216, + "step": 6860 + }, + { + "epoch": 0.5522508038585209, + "grad_norm": 1.2869874238967896, + "learning_rate": 7.26957223567393e-05, + "loss": 1.2912, + "step": 6870 + }, + { + "epoch": 0.5530546623794212, + "grad_norm": 2.3842945098876953, + "learning_rate": 7.265536723163842e-05, + "loss": 1.2517, + "step": 6880 + }, + { + "epoch": 0.5538585209003215, + "grad_norm": 1.285078525543213, + "learning_rate": 7.261501210653754e-05, + "loss": 1.27, + "step": 6890 + }, + { + "epoch": 0.5546623794212219, + "grad_norm": 1.1571310758590698, + "learning_rate": 7.257465698143665e-05, + "loss": 1.2052, + "step": 6900 + }, + { + "epoch": 0.5554662379421221, + "grad_norm": 3.9599387645721436, + "learning_rate": 7.253430185633576e-05, + "loss": 1.1625, + "step": 6910 + }, + { + "epoch": 0.5562700964630225, + "grad_norm": 1.3315590620040894, + "learning_rate": 7.249394673123487e-05, + "loss": 1.129, + "step": 6920 + }, + { + "epoch": 0.5570739549839229, + "grad_norm": 1.445112705230713, + "learning_rate": 7.245359160613399e-05, + "loss": 1.2613, + "step": 6930 + }, + { + "epoch": 0.5578778135048231, + "grad_norm": 1.5418325662612915, + "learning_rate": 7.241323648103309e-05, + "loss": 1.17, + "step": 6940 + }, + { + "epoch": 0.5586816720257235, + "grad_norm": 1.247528314590454, + "learning_rate": 7.237288135593221e-05, + "loss": 1.2158, + "step": 6950 + }, + { + "epoch": 0.5594855305466238, + "grad_norm": 2.0513546466827393, + "learning_rate": 7.233252623083131e-05, + "loss": 1.1665, + "step": 6960 + }, + { + "epoch": 0.5602893890675241, + "grad_norm": 1.661195158958435, + "learning_rate": 7.229217110573043e-05, + "loss": 1.2148, + "step": 6970 + }, + { + "epoch": 0.5610932475884244, + "grad_norm": 1.1958410739898682, + "learning_rate": 7.225181598062955e-05, + "loss": 1.175, + "step": 6980 + }, + { + "epoch": 0.5618971061093248, + "grad_norm": 1.3750405311584473, + "learning_rate": 7.221146085552866e-05, + "loss": 1.1609, + "step": 6990 + }, + { + "epoch": 0.5627009646302251, + "grad_norm": 1.5093348026275635, + "learning_rate": 7.217110573042777e-05, + "loss": 1.1352, + "step": 7000 + }, + { + "epoch": 0.5635048231511254, + "grad_norm": 2.5393998622894287, + "learning_rate": 7.213075060532688e-05, + "loss": 1.1646, + "step": 7010 + }, + { + "epoch": 0.5643086816720257, + "grad_norm": 
2.845985174179077, + "learning_rate": 7.2090395480226e-05, + "loss": 1.1089, + "step": 7020 + }, + { + "epoch": 0.565112540192926, + "grad_norm": 1.2273420095443726, + "learning_rate": 7.20500403551251e-05, + "loss": 1.3287, + "step": 7030 + }, + { + "epoch": 0.5659163987138264, + "grad_norm": 1.1808136701583862, + "learning_rate": 7.200968523002422e-05, + "loss": 1.2131, + "step": 7040 + }, + { + "epoch": 0.5667202572347267, + "grad_norm": 1.4661297798156738, + "learning_rate": 7.196933010492332e-05, + "loss": 1.2178, + "step": 7050 + }, + { + "epoch": 0.567524115755627, + "grad_norm": 1.872639775276184, + "learning_rate": 7.192897497982244e-05, + "loss": 1.1569, + "step": 7060 + }, + { + "epoch": 0.5683279742765274, + "grad_norm": 1.1631591320037842, + "learning_rate": 7.188861985472155e-05, + "loss": 1.2776, + "step": 7070 + }, + { + "epoch": 0.5691318327974276, + "grad_norm": 1.7431219816207886, + "learning_rate": 7.184826472962066e-05, + "loss": 1.116, + "step": 7080 + }, + { + "epoch": 0.569935691318328, + "grad_norm": 1.3899308443069458, + "learning_rate": 7.180790960451977e-05, + "loss": 1.2137, + "step": 7090 + }, + { + "epoch": 0.5707395498392283, + "grad_norm": 1.6811970472335815, + "learning_rate": 7.176755447941889e-05, + "loss": 1.2248, + "step": 7100 + }, + { + "epoch": 0.5715434083601286, + "grad_norm": 1.0294607877731323, + "learning_rate": 7.172719935431799e-05, + "loss": 1.2298, + "step": 7110 + }, + { + "epoch": 0.572347266881029, + "grad_norm": 1.3149453401565552, + "learning_rate": 7.168684422921711e-05, + "loss": 1.162, + "step": 7120 + }, + { + "epoch": 0.5731511254019293, + "grad_norm": 1.3619112968444824, + "learning_rate": 7.164648910411621e-05, + "loss": 1.1935, + "step": 7130 + }, + { + "epoch": 0.5739549839228296, + "grad_norm": 1.2184911966323853, + "learning_rate": 7.160613397901533e-05, + "loss": 1.0804, + "step": 7140 + }, + { + "epoch": 0.5747588424437299, + "grad_norm": 1.170629858970642, + "learning_rate": 7.156577885391445e-05, + "loss": 1.1793, + "step": 7150 + }, + { + "epoch": 0.5755627009646302, + "grad_norm": 1.2444965839385986, + "learning_rate": 7.152542372881357e-05, + "loss": 1.2071, + "step": 7160 + }, + { + "epoch": 0.5763665594855305, + "grad_norm": 1.3081021308898926, + "learning_rate": 7.148506860371267e-05, + "loss": 1.1656, + "step": 7170 + }, + { + "epoch": 0.5771704180064309, + "grad_norm": 2.752758264541626, + "learning_rate": 7.144471347861179e-05, + "loss": 1.2164, + "step": 7180 + }, + { + "epoch": 0.5779742765273312, + "grad_norm": 1.5479896068572998, + "learning_rate": 7.14043583535109e-05, + "loss": 1.1613, + "step": 7190 + }, + { + "epoch": 0.5787781350482315, + "grad_norm": 1.40146803855896, + "learning_rate": 7.136400322841001e-05, + "loss": 1.1026, + "step": 7200 + }, + { + "epoch": 0.5795819935691319, + "grad_norm": 1.0006448030471802, + "learning_rate": 7.132364810330913e-05, + "loss": 1.1749, + "step": 7210 + }, + { + "epoch": 0.5803858520900321, + "grad_norm": 1.4498153924942017, + "learning_rate": 7.128329297820824e-05, + "loss": 1.065, + "step": 7220 + }, + { + "epoch": 0.5811897106109325, + "grad_norm": 1.3882859945297241, + "learning_rate": 7.124293785310736e-05, + "loss": 1.248, + "step": 7230 + }, + { + "epoch": 0.5819935691318328, + "grad_norm": 0.9907482266426086, + "learning_rate": 7.120258272800646e-05, + "loss": 1.1794, + "step": 7240 + }, + { + "epoch": 0.5827974276527331, + "grad_norm": 2.1010050773620605, + "learning_rate": 7.116222760290558e-05, + "loss": 1.1451, + "step": 7250 + }, + { + "epoch": 
0.5836012861736335, + "grad_norm": 1.1258735656738281, + "learning_rate": 7.112187247780468e-05, + "loss": 1.2021, + "step": 7260 + }, + { + "epoch": 0.5844051446945338, + "grad_norm": 1.8240938186645508, + "learning_rate": 7.10815173527038e-05, + "loss": 1.2344, + "step": 7270 + }, + { + "epoch": 0.5852090032154341, + "grad_norm": 2.2731316089630127, + "learning_rate": 7.10411622276029e-05, + "loss": 1.1984, + "step": 7280 + }, + { + "epoch": 0.5860128617363344, + "grad_norm": 1.6902538537979126, + "learning_rate": 7.100080710250202e-05, + "loss": 1.2406, + "step": 7290 + }, + { + "epoch": 0.5868167202572347, + "grad_norm": 1.1986221075057983, + "learning_rate": 7.096045197740113e-05, + "loss": 1.2479, + "step": 7300 + }, + { + "epoch": 0.587620578778135, + "grad_norm": 1.3120813369750977, + "learning_rate": 7.092009685230025e-05, + "loss": 1.2603, + "step": 7310 + }, + { + "epoch": 0.5884244372990354, + "grad_norm": 1.1902378797531128, + "learning_rate": 7.087974172719935e-05, + "loss": 1.1116, + "step": 7320 + }, + { + "epoch": 0.5892282958199357, + "grad_norm": 1.3847696781158447, + "learning_rate": 7.083938660209847e-05, + "loss": 1.2127, + "step": 7330 + }, + { + "epoch": 0.590032154340836, + "grad_norm": 2.2988333702087402, + "learning_rate": 7.079903147699757e-05, + "loss": 1.1229, + "step": 7340 + }, + { + "epoch": 0.5908360128617364, + "grad_norm": 1.5824936628341675, + "learning_rate": 7.075867635189669e-05, + "loss": 1.333, + "step": 7350 + }, + { + "epoch": 0.5916398713826366, + "grad_norm": 1.1853426694869995, + "learning_rate": 7.07183212267958e-05, + "loss": 1.1416, + "step": 7360 + }, + { + "epoch": 0.592443729903537, + "grad_norm": 1.1010386943817139, + "learning_rate": 7.067796610169491e-05, + "loss": 1.3182, + "step": 7370 + }, + { + "epoch": 0.5932475884244373, + "grad_norm": 1.2553712129592896, + "learning_rate": 7.063761097659403e-05, + "loss": 1.2814, + "step": 7380 + }, + { + "epoch": 0.5940514469453376, + "grad_norm": 1.5219231843948364, + "learning_rate": 7.059725585149314e-05, + "loss": 1.2996, + "step": 7390 + }, + { + "epoch": 0.594855305466238, + "grad_norm": 1.8472049236297607, + "learning_rate": 7.055690072639226e-05, + "loss": 1.1994, + "step": 7400 + }, + { + "epoch": 0.5956591639871383, + "grad_norm": 1.3701444864273071, + "learning_rate": 7.051654560129137e-05, + "loss": 1.1645, + "step": 7410 + }, + { + "epoch": 0.5964630225080386, + "grad_norm": 1.3976223468780518, + "learning_rate": 7.047619047619048e-05, + "loss": 1.1904, + "step": 7420 + }, + { + "epoch": 0.5972668810289389, + "grad_norm": 2.294698476791382, + "learning_rate": 7.04358353510896e-05, + "loss": 1.2541, + "step": 7430 + }, + { + "epoch": 0.5980707395498392, + "grad_norm": 1.3611372709274292, + "learning_rate": 7.039548022598871e-05, + "loss": 1.1069, + "step": 7440 + }, + { + "epoch": 0.5988745980707395, + "grad_norm": 1.5398073196411133, + "learning_rate": 7.035512510088782e-05, + "loss": 1.1596, + "step": 7450 + }, + { + "epoch": 0.5996784565916399, + "grad_norm": 1.7317296266555786, + "learning_rate": 7.031476997578694e-05, + "loss": 1.241, + "step": 7460 + }, + { + "epoch": 0.6004823151125402, + "grad_norm": 1.5638251304626465, + "learning_rate": 7.027441485068604e-05, + "loss": 1.2383, + "step": 7470 + }, + { + "epoch": 0.6012861736334405, + "grad_norm": 1.892224669456482, + "learning_rate": 7.023405972558516e-05, + "loss": 1.025, + "step": 7480 + }, + { + "epoch": 0.6020900321543409, + "grad_norm": 1.6151889562606812, + "learning_rate": 7.019370460048426e-05, + "loss": 1.1827, + 
"step": 7490 + }, + { + "epoch": 0.6028938906752411, + "grad_norm": 1.3697775602340698, + "learning_rate": 7.015334947538338e-05, + "loss": 1.2432, + "step": 7500 + }, + { + "epoch": 0.6036977491961415, + "grad_norm": 1.0278207063674927, + "learning_rate": 7.011299435028249e-05, + "loss": 1.3371, + "step": 7510 + }, + { + "epoch": 0.6045016077170418, + "grad_norm": 1.348399043083191, + "learning_rate": 7.00726392251816e-05, + "loss": 1.0785, + "step": 7520 + }, + { + "epoch": 0.6053054662379421, + "grad_norm": 1.1832696199417114, + "learning_rate": 7.003228410008071e-05, + "loss": 1.1552, + "step": 7530 + }, + { + "epoch": 0.6061093247588425, + "grad_norm": 1.6915167570114136, + "learning_rate": 6.999192897497983e-05, + "loss": 1.2163, + "step": 7540 + }, + { + "epoch": 0.6069131832797428, + "grad_norm": 1.2538834810256958, + "learning_rate": 6.995157384987893e-05, + "loss": 1.2105, + "step": 7550 + }, + { + "epoch": 0.6077170418006431, + "grad_norm": 1.0548170804977417, + "learning_rate": 6.991121872477805e-05, + "loss": 1.2295, + "step": 7560 + }, + { + "epoch": 0.6085209003215434, + "grad_norm": 1.6791824102401733, + "learning_rate": 6.987086359967716e-05, + "loss": 1.1932, + "step": 7570 + }, + { + "epoch": 0.6093247588424437, + "grad_norm": 1.7227991819381714, + "learning_rate": 6.983050847457627e-05, + "loss": 1.2275, + "step": 7580 + }, + { + "epoch": 0.610128617363344, + "grad_norm": 1.4850959777832031, + "learning_rate": 6.979015334947538e-05, + "loss": 1.1649, + "step": 7590 + }, + { + "epoch": 0.6109324758842444, + "grad_norm": 1.2839738130569458, + "learning_rate": 6.97497982243745e-05, + "loss": 1.2562, + "step": 7600 + }, + { + "epoch": 0.6117363344051447, + "grad_norm": 1.845155119895935, + "learning_rate": 6.970944309927361e-05, + "loss": 1.297, + "step": 7610 + }, + { + "epoch": 0.612540192926045, + "grad_norm": 1.21802818775177, + "learning_rate": 6.966908797417272e-05, + "loss": 1.2947, + "step": 7620 + }, + { + "epoch": 0.6133440514469454, + "grad_norm": 1.228559136390686, + "learning_rate": 6.962873284907184e-05, + "loss": 1.3151, + "step": 7630 + }, + { + "epoch": 0.6141479099678456, + "grad_norm": 1.5754231214523315, + "learning_rate": 6.958837772397094e-05, + "loss": 1.1846, + "step": 7640 + }, + { + "epoch": 0.614951768488746, + "grad_norm": 1.7374244928359985, + "learning_rate": 6.954802259887006e-05, + "loss": 1.2737, + "step": 7650 + }, + { + "epoch": 0.6157556270096463, + "grad_norm": 2.792844533920288, + "learning_rate": 6.950766747376916e-05, + "loss": 1.1331, + "step": 7660 + }, + { + "epoch": 0.6165594855305466, + "grad_norm": 1.3875277042388916, + "learning_rate": 6.94673123486683e-05, + "loss": 1.2218, + "step": 7670 + }, + { + "epoch": 0.617363344051447, + "grad_norm": 1.1979440450668335, + "learning_rate": 6.94269572235674e-05, + "loss": 1.1648, + "step": 7680 + }, + { + "epoch": 0.6181672025723473, + "grad_norm": 1.5823277235031128, + "learning_rate": 6.938660209846652e-05, + "loss": 1.249, + "step": 7690 + }, + { + "epoch": 0.6189710610932476, + "grad_norm": 1.26264226436615, + "learning_rate": 6.934624697336562e-05, + "loss": 1.2724, + "step": 7700 + }, + { + "epoch": 0.6197749196141479, + "grad_norm": 1.2197229862213135, + "learning_rate": 6.930589184826474e-05, + "loss": 1.2324, + "step": 7710 + }, + { + "epoch": 0.6205787781350482, + "grad_norm": 1.3479337692260742, + "learning_rate": 6.926553672316385e-05, + "loss": 1.1723, + "step": 7720 + }, + { + "epoch": 0.6213826366559485, + "grad_norm": 4.150685787200928, + "learning_rate": 
6.922518159806296e-05, + "loss": 1.1546, + "step": 7730 + }, + { + "epoch": 0.6221864951768489, + "grad_norm": 1.5650343894958496, + "learning_rate": 6.918482647296207e-05, + "loss": 1.1733, + "step": 7740 + }, + { + "epoch": 0.6229903536977492, + "grad_norm": 1.296133279800415, + "learning_rate": 6.914447134786119e-05, + "loss": 1.0567, + "step": 7750 + }, + { + "epoch": 0.6237942122186495, + "grad_norm": 1.382730484008789, + "learning_rate": 6.910411622276029e-05, + "loss": 1.3209, + "step": 7760 + }, + { + "epoch": 0.6245980707395499, + "grad_norm": 1.6270509958267212, + "learning_rate": 6.906376109765941e-05, + "loss": 1.1816, + "step": 7770 + }, + { + "epoch": 0.6254019292604501, + "grad_norm": 1.0578793287277222, + "learning_rate": 6.902340597255851e-05, + "loss": 1.15, + "step": 7780 + }, + { + "epoch": 0.6262057877813505, + "grad_norm": 1.2071346044540405, + "learning_rate": 6.898305084745763e-05, + "loss": 1.2732, + "step": 7790 + }, + { + "epoch": 0.6270096463022508, + "grad_norm": 1.1046452522277832, + "learning_rate": 6.894269572235674e-05, + "loss": 1.0907, + "step": 7800 + }, + { + "epoch": 0.6278135048231511, + "grad_norm": 2.797956943511963, + "learning_rate": 6.890234059725586e-05, + "loss": 1.2019, + "step": 7810 + }, + { + "epoch": 0.6286173633440515, + "grad_norm": 1.2115646600723267, + "learning_rate": 6.886198547215496e-05, + "loss": 1.2811, + "step": 7820 + }, + { + "epoch": 0.6294212218649518, + "grad_norm": 1.8098793029785156, + "learning_rate": 6.882163034705408e-05, + "loss": 1.125, + "step": 7830 + }, + { + "epoch": 0.6302250803858521, + "grad_norm": 1.6568626165390015, + "learning_rate": 6.87812752219532e-05, + "loss": 1.1808, + "step": 7840 + }, + { + "epoch": 0.6310289389067524, + "grad_norm": 1.643615961074829, + "learning_rate": 6.87409200968523e-05, + "loss": 1.209, + "step": 7850 + }, + { + "epoch": 0.6318327974276527, + "grad_norm": 1.4601590633392334, + "learning_rate": 6.870056497175142e-05, + "loss": 1.1949, + "step": 7860 + }, + { + "epoch": 0.632636655948553, + "grad_norm": 1.7648741006851196, + "learning_rate": 6.866020984665052e-05, + "loss": 1.0492, + "step": 7870 + }, + { + "epoch": 0.6334405144694534, + "grad_norm": 1.511143684387207, + "learning_rate": 6.861985472154964e-05, + "loss": 1.3411, + "step": 7880 + }, + { + "epoch": 0.6342443729903537, + "grad_norm": 1.235887050628662, + "learning_rate": 6.857949959644875e-05, + "loss": 1.1432, + "step": 7890 + }, + { + "epoch": 0.635048231511254, + "grad_norm": 1.6829694509506226, + "learning_rate": 6.853914447134786e-05, + "loss": 1.1803, + "step": 7900 + }, + { + "epoch": 0.6358520900321544, + "grad_norm": 1.0991063117980957, + "learning_rate": 6.849878934624697e-05, + "loss": 1.2734, + "step": 7910 + }, + { + "epoch": 0.6366559485530546, + "grad_norm": 2.292754888534546, + "learning_rate": 6.845843422114609e-05, + "loss": 1.152, + "step": 7920 + }, + { + "epoch": 0.637459807073955, + "grad_norm": 1.5538569688796997, + "learning_rate": 6.841807909604519e-05, + "loss": 1.1571, + "step": 7930 + }, + { + "epoch": 0.6382636655948553, + "grad_norm": 1.1748164892196655, + "learning_rate": 6.837772397094432e-05, + "loss": 1.1991, + "step": 7940 + }, + { + "epoch": 0.6390675241157556, + "grad_norm": 1.2783567905426025, + "learning_rate": 6.833736884584343e-05, + "loss": 1.2955, + "step": 7950 + }, + { + "epoch": 0.639871382636656, + "grad_norm": 1.7576483488082886, + "learning_rate": 6.829701372074255e-05, + "loss": 1.2463, + "step": 7960 + }, + { + "epoch": 0.6406752411575563, + "grad_norm": 
1.5240103006362915, + "learning_rate": 6.825665859564165e-05, + "loss": 1.1883, + "step": 7970 + }, + { + "epoch": 0.6414790996784566, + "grad_norm": 1.1293576955795288, + "learning_rate": 6.821630347054077e-05, + "loss": 1.1206, + "step": 7980 + }, + { + "epoch": 0.6422829581993569, + "grad_norm": 1.4663335084915161, + "learning_rate": 6.817594834543987e-05, + "loss": 1.2008, + "step": 7990 + }, + { + "epoch": 0.6430868167202572, + "grad_norm": 2.1061480045318604, + "learning_rate": 6.813559322033899e-05, + "loss": 1.2704, + "step": 8000 + }, + { + "epoch": 0.6430868167202572, + "eval_yahma/alpaca-cleaned_loss": 1.225741982460022, + "eval_yahma/alpaca-cleaned_runtime": 115.6052, + "eval_yahma/alpaca-cleaned_samples_per_second": 17.3, + "eval_yahma/alpaca-cleaned_steps_per_second": 2.163, + "step": 8000 + }, + { + "epoch": 0.6438906752411575, + "grad_norm": 1.7242372035980225, + "learning_rate": 6.80952380952381e-05, + "loss": 1.1511, + "step": 8010 + }, + { + "epoch": 0.6446945337620579, + "grad_norm": 2.951997995376587, + "learning_rate": 6.805488297013721e-05, + "loss": 1.2597, + "step": 8020 + }, + { + "epoch": 0.6454983922829582, + "grad_norm": 1.1992913484573364, + "learning_rate": 6.801452784503632e-05, + "loss": 1.116, + "step": 8030 + }, + { + "epoch": 0.6463022508038585, + "grad_norm": 1.4141520261764526, + "learning_rate": 6.797417271993544e-05, + "loss": 1.2101, + "step": 8040 + }, + { + "epoch": 0.6471061093247589, + "grad_norm": 1.491201400756836, + "learning_rate": 6.793381759483454e-05, + "loss": 1.2755, + "step": 8050 + }, + { + "epoch": 0.6479099678456591, + "grad_norm": 1.392905592918396, + "learning_rate": 6.789346246973366e-05, + "loss": 1.13, + "step": 8060 + }, + { + "epoch": 0.6487138263665595, + "grad_norm": 2.571122169494629, + "learning_rate": 6.785310734463278e-05, + "loss": 1.2448, + "step": 8070 + }, + { + "epoch": 0.6495176848874598, + "grad_norm": 1.7001698017120361, + "learning_rate": 6.781275221953188e-05, + "loss": 1.1545, + "step": 8080 + }, + { + "epoch": 0.6503215434083601, + "grad_norm": 1.1046559810638428, + "learning_rate": 6.7772397094431e-05, + "loss": 1.221, + "step": 8090 + }, + { + "epoch": 0.6511254019292605, + "grad_norm": 1.3756605386734009, + "learning_rate": 6.77320419693301e-05, + "loss": 1.2753, + "step": 8100 + }, + { + "epoch": 0.6519292604501608, + "grad_norm": 1.8165565729141235, + "learning_rate": 6.769168684422922e-05, + "loss": 1.2406, + "step": 8110 + }, + { + "epoch": 0.6527331189710611, + "grad_norm": 1.0720582008361816, + "learning_rate": 6.765133171912833e-05, + "loss": 1.2176, + "step": 8120 + }, + { + "epoch": 0.6535369774919614, + "grad_norm": 2.1698720455169678, + "learning_rate": 6.761097659402745e-05, + "loss": 1.225, + "step": 8130 + }, + { + "epoch": 0.6543408360128617, + "grad_norm": 1.36322820186615, + "learning_rate": 6.757062146892655e-05, + "loss": 1.2478, + "step": 8140 + }, + { + "epoch": 0.655144694533762, + "grad_norm": 2.453728199005127, + "learning_rate": 6.753026634382567e-05, + "loss": 1.1111, + "step": 8150 + }, + { + "epoch": 0.6559485530546624, + "grad_norm": 1.62954843044281, + "learning_rate": 6.748991121872477e-05, + "loss": 1.1836, + "step": 8160 + }, + { + "epoch": 0.6567524115755627, + "grad_norm": 1.3338814973831177, + "learning_rate": 6.744955609362389e-05, + "loss": 1.108, + "step": 8170 + }, + { + "epoch": 0.657556270096463, + "grad_norm": 2.812258243560791, + "learning_rate": 6.7409200968523e-05, + "loss": 1.2557, + "step": 8180 + }, + { + "epoch": 0.6583601286173634, + "grad_norm": 
1.158793330192566, + "learning_rate": 6.736884584342211e-05, + "loss": 1.1807, + "step": 8190 + }, + { + "epoch": 0.6591639871382636, + "grad_norm": 4.249461650848389, + "learning_rate": 6.732849071832123e-05, + "loss": 1.2792, + "step": 8200 + }, + { + "epoch": 0.659967845659164, + "grad_norm": 1.1763710975646973, + "learning_rate": 6.728813559322035e-05, + "loss": 1.2391, + "step": 8210 + }, + { + "epoch": 0.6607717041800643, + "grad_norm": 0.897494912147522, + "learning_rate": 6.724778046811946e-05, + "loss": 1.2074, + "step": 8220 + }, + { + "epoch": 0.6615755627009646, + "grad_norm": 1.7237837314605713, + "learning_rate": 6.720742534301857e-05, + "loss": 1.3164, + "step": 8230 + }, + { + "epoch": 0.662379421221865, + "grad_norm": 1.3664112091064453, + "learning_rate": 6.716707021791768e-05, + "loss": 1.2813, + "step": 8240 + }, + { + "epoch": 0.6631832797427653, + "grad_norm": 1.3216561079025269, + "learning_rate": 6.71267150928168e-05, + "loss": 1.1742, + "step": 8250 + }, + { + "epoch": 0.6639871382636656, + "grad_norm": 1.8324904441833496, + "learning_rate": 6.70863599677159e-05, + "loss": 1.2871, + "step": 8260 + }, + { + "epoch": 0.6647909967845659, + "grad_norm": 1.4093453884124756, + "learning_rate": 6.704600484261502e-05, + "loss": 1.1952, + "step": 8270 + }, + { + "epoch": 0.6655948553054662, + "grad_norm": 1.3671215772628784, + "learning_rate": 6.700564971751412e-05, + "loss": 1.1578, + "step": 8280 + }, + { + "epoch": 0.6663987138263665, + "grad_norm": 1.5380281209945679, + "learning_rate": 6.696529459241324e-05, + "loss": 1.1117, + "step": 8290 + }, + { + "epoch": 0.6672025723472669, + "grad_norm": 1.2650505304336548, + "learning_rate": 6.692493946731236e-05, + "loss": 1.2053, + "step": 8300 + }, + { + "epoch": 0.6680064308681672, + "grad_norm": 1.4757028818130493, + "learning_rate": 6.688458434221146e-05, + "loss": 1.2185, + "step": 8310 + }, + { + "epoch": 0.6688102893890675, + "grad_norm": 1.5317448377609253, + "learning_rate": 6.684422921711058e-05, + "loss": 1.1689, + "step": 8320 + }, + { + "epoch": 0.6696141479099679, + "grad_norm": 1.8137046098709106, + "learning_rate": 6.680387409200969e-05, + "loss": 1.3851, + "step": 8330 + }, + { + "epoch": 0.6704180064308681, + "grad_norm": 1.6186026334762573, + "learning_rate": 6.67635189669088e-05, + "loss": 1.1795, + "step": 8340 + }, + { + "epoch": 0.6712218649517685, + "grad_norm": 3.300424814224243, + "learning_rate": 6.672316384180791e-05, + "loss": 1.1575, + "step": 8350 + }, + { + "epoch": 0.6720257234726688, + "grad_norm": 1.2810907363891602, + "learning_rate": 6.668280871670703e-05, + "loss": 1.1909, + "step": 8360 + }, + { + "epoch": 0.6728295819935691, + "grad_norm": 1.7278164625167847, + "learning_rate": 6.664245359160613e-05, + "loss": 1.14, + "step": 8370 + }, + { + "epoch": 0.6736334405144695, + "grad_norm": 1.4007691144943237, + "learning_rate": 6.660209846650525e-05, + "loss": 1.1765, + "step": 8380 + }, + { + "epoch": 0.6744372990353698, + "grad_norm": 1.2572942972183228, + "learning_rate": 6.656174334140436e-05, + "loss": 1.2409, + "step": 8390 + }, + { + "epoch": 0.6752411575562701, + "grad_norm": 1.5756325721740723, + "learning_rate": 6.652138821630347e-05, + "loss": 1.2796, + "step": 8400 + }, + { + "epoch": 0.6760450160771704, + "grad_norm": 1.283668041229248, + "learning_rate": 6.648103309120258e-05, + "loss": 1.0367, + "step": 8410 + }, + { + "epoch": 0.6768488745980707, + "grad_norm": 1.803141474723816, + "learning_rate": 6.64406779661017e-05, + "loss": 1.2924, + "step": 8420 + }, + { + "epoch": 
0.677652733118971, + "grad_norm": 1.47214937210083, + "learning_rate": 6.64003228410008e-05, + "loss": 1.1776, + "step": 8430 + }, + { + "epoch": 0.6784565916398714, + "grad_norm": 1.3774545192718506, + "learning_rate": 6.635996771589992e-05, + "loss": 1.2473, + "step": 8440 + }, + { + "epoch": 0.6792604501607717, + "grad_norm": 1.7005946636199951, + "learning_rate": 6.631961259079902e-05, + "loss": 1.1774, + "step": 8450 + }, + { + "epoch": 0.680064308681672, + "grad_norm": 5.010222911834717, + "learning_rate": 6.627925746569814e-05, + "loss": 1.1893, + "step": 8460 + }, + { + "epoch": 0.6808681672025724, + "grad_norm": 1.1495386362075806, + "learning_rate": 6.623890234059726e-05, + "loss": 1.1586, + "step": 8470 + }, + { + "epoch": 0.6816720257234726, + "grad_norm": 1.4174809455871582, + "learning_rate": 6.619854721549638e-05, + "loss": 1.1392, + "step": 8480 + }, + { + "epoch": 0.682475884244373, + "grad_norm": 1.0881890058517456, + "learning_rate": 6.615819209039548e-05, + "loss": 1.1908, + "step": 8490 + }, + { + "epoch": 0.6832797427652733, + "grad_norm": 1.5683577060699463, + "learning_rate": 6.61178369652946e-05, + "loss": 1.1267, + "step": 8500 + }, + { + "epoch": 0.6840836012861736, + "grad_norm": 1.0879807472229004, + "learning_rate": 6.60774818401937e-05, + "loss": 1.1413, + "step": 8510 + }, + { + "epoch": 0.684887459807074, + "grad_norm": 1.0915292501449585, + "learning_rate": 6.603712671509282e-05, + "loss": 1.2611, + "step": 8520 + }, + { + "epoch": 0.6856913183279743, + "grad_norm": 1.571441888809204, + "learning_rate": 6.599677158999193e-05, + "loss": 1.1953, + "step": 8530 + }, + { + "epoch": 0.6864951768488746, + "grad_norm": 1.2628365755081177, + "learning_rate": 6.595641646489105e-05, + "loss": 1.2556, + "step": 8540 + }, + { + "epoch": 0.6872990353697749, + "grad_norm": 1.3509798049926758, + "learning_rate": 6.591606133979016e-05, + "loss": 1.1023, + "step": 8550 + }, + { + "epoch": 0.6881028938906752, + "grad_norm": 1.2220946550369263, + "learning_rate": 6.587570621468927e-05, + "loss": 1.216, + "step": 8560 + }, + { + "epoch": 0.6889067524115756, + "grad_norm": 1.320622444152832, + "learning_rate": 6.583535108958839e-05, + "loss": 1.2987, + "step": 8570 + }, + { + "epoch": 0.6897106109324759, + "grad_norm": 1.3505820035934448, + "learning_rate": 6.579499596448749e-05, + "loss": 1.3089, + "step": 8580 + }, + { + "epoch": 0.6905144694533762, + "grad_norm": 1.2899729013442993, + "learning_rate": 6.575464083938661e-05, + "loss": 1.1303, + "step": 8590 + }, + { + "epoch": 0.6913183279742765, + "grad_norm": 1.7675771713256836, + "learning_rate": 6.571428571428571e-05, + "loss": 1.2207, + "step": 8600 + }, + { + "epoch": 0.6921221864951769, + "grad_norm": 1.1679964065551758, + "learning_rate": 6.567393058918483e-05, + "loss": 1.1867, + "step": 8610 + }, + { + "epoch": 0.6929260450160771, + "grad_norm": 1.245856523513794, + "learning_rate": 6.563357546408394e-05, + "loss": 1.1644, + "step": 8620 + }, + { + "epoch": 0.6937299035369775, + "grad_norm": 1.3648661375045776, + "learning_rate": 6.559322033898306e-05, + "loss": 1.2478, + "step": 8630 + }, + { + "epoch": 0.6945337620578779, + "grad_norm": 1.5502279996871948, + "learning_rate": 6.555286521388216e-05, + "loss": 1.0907, + "step": 8640 + }, + { + "epoch": 0.6953376205787781, + "grad_norm": 1.7564395666122437, + "learning_rate": 6.551251008878128e-05, + "loss": 1.1526, + "step": 8650 + }, + { + "epoch": 0.6961414790996785, + "grad_norm": 1.3868829011917114, + "learning_rate": 6.547215496368038e-05, + "loss": 1.2426, + 
"step": 8660 + }, + { + "epoch": 0.6969453376205788, + "grad_norm": 1.6506773233413696, + "learning_rate": 6.54317998385795e-05, + "loss": 1.0556, + "step": 8670 + }, + { + "epoch": 0.6977491961414791, + "grad_norm": 1.3206250667572021, + "learning_rate": 6.53914447134786e-05, + "loss": 1.3248, + "step": 8680 + }, + { + "epoch": 0.6985530546623794, + "grad_norm": 1.1856706142425537, + "learning_rate": 6.535108958837772e-05, + "loss": 1.3029, + "step": 8690 + }, + { + "epoch": 0.6993569131832797, + "grad_norm": 1.089762806892395, + "learning_rate": 6.531073446327683e-05, + "loss": 1.1649, + "step": 8700 + }, + { + "epoch": 0.7001607717041801, + "grad_norm": 1.6999385356903076, + "learning_rate": 6.527037933817595e-05, + "loss": 1.221, + "step": 8710 + }, + { + "epoch": 0.7009646302250804, + "grad_norm": 1.2625404596328735, + "learning_rate": 6.523002421307506e-05, + "loss": 1.0953, + "step": 8720 + }, + { + "epoch": 0.7017684887459807, + "grad_norm": 1.6326946020126343, + "learning_rate": 6.518966908797418e-05, + "loss": 1.3168, + "step": 8730 + }, + { + "epoch": 0.702572347266881, + "grad_norm": 1.0178214311599731, + "learning_rate": 6.514931396287329e-05, + "loss": 1.1271, + "step": 8740 + }, + { + "epoch": 0.7033762057877814, + "grad_norm": 1.3457207679748535, + "learning_rate": 6.51089588377724e-05, + "loss": 1.2128, + "step": 8750 + }, + { + "epoch": 0.7041800643086816, + "grad_norm": 1.4977234601974487, + "learning_rate": 6.506860371267151e-05, + "loss": 1.241, + "step": 8760 + }, + { + "epoch": 0.704983922829582, + "grad_norm": 1.3846150636672974, + "learning_rate": 6.502824858757063e-05, + "loss": 1.1745, + "step": 8770 + }, + { + "epoch": 0.7057877813504824, + "grad_norm": 1.4363330602645874, + "learning_rate": 6.498789346246975e-05, + "loss": 1.1242, + "step": 8780 + }, + { + "epoch": 0.7065916398713826, + "grad_norm": 1.6958163976669312, + "learning_rate": 6.494753833736885e-05, + "loss": 1.2168, + "step": 8790 + }, + { + "epoch": 0.707395498392283, + "grad_norm": 0.9998372793197632, + "learning_rate": 6.490718321226797e-05, + "loss": 1.0476, + "step": 8800 + }, + { + "epoch": 0.7081993569131833, + "grad_norm": 1.3063734769821167, + "learning_rate": 6.486682808716707e-05, + "loss": 1.231, + "step": 8810 + }, + { + "epoch": 0.7090032154340836, + "grad_norm": 1.0710753202438354, + "learning_rate": 6.482647296206619e-05, + "loss": 1.1191, + "step": 8820 + }, + { + "epoch": 0.7098070739549839, + "grad_norm": 1.6944465637207031, + "learning_rate": 6.47861178369653e-05, + "loss": 1.1878, + "step": 8830 + }, + { + "epoch": 0.7106109324758842, + "grad_norm": 1.3219144344329834, + "learning_rate": 6.474576271186441e-05, + "loss": 1.2053, + "step": 8840 + }, + { + "epoch": 0.7114147909967846, + "grad_norm": 1.5892850160598755, + "learning_rate": 6.470540758676352e-05, + "loss": 1.0587, + "step": 8850 + }, + { + "epoch": 0.7122186495176849, + "grad_norm": 3.019225597381592, + "learning_rate": 6.466505246166264e-05, + "loss": 1.1815, + "step": 8860 + }, + { + "epoch": 0.7130225080385852, + "grad_norm": 1.493353009223938, + "learning_rate": 6.462469733656174e-05, + "loss": 1.1207, + "step": 8870 + }, + { + "epoch": 0.7138263665594855, + "grad_norm": 2.1397647857666016, + "learning_rate": 6.458837772397096e-05, + "loss": 1.1683, + "step": 8880 + }, + { + "epoch": 0.7146302250803859, + "grad_norm": 1.1362327337265015, + "learning_rate": 6.454802259887006e-05, + "loss": 1.0961, + "step": 8890 + }, + { + "epoch": 0.7154340836012861, + "grad_norm": 1.0656194686889648, + "learning_rate": 
6.450766747376918e-05, + "loss": 1.1447, + "step": 8900 + }, + { + "epoch": 0.7162379421221865, + "grad_norm": 1.3920767307281494, + "learning_rate": 6.446731234866828e-05, + "loss": 1.2971, + "step": 8910 + }, + { + "epoch": 0.7170418006430869, + "grad_norm": 1.0478270053863525, + "learning_rate": 6.44269572235674e-05, + "loss": 1.23, + "step": 8920 + }, + { + "epoch": 0.7178456591639871, + "grad_norm": 1.6214714050292969, + "learning_rate": 6.43866020984665e-05, + "loss": 1.2869, + "step": 8930 + }, + { + "epoch": 0.7186495176848875, + "grad_norm": 1.8323612213134766, + "learning_rate": 6.434624697336562e-05, + "loss": 1.1301, + "step": 8940 + }, + { + "epoch": 0.7194533762057878, + "grad_norm": 1.6926652193069458, + "learning_rate": 6.430589184826473e-05, + "loss": 1.3043, + "step": 8950 + }, + { + "epoch": 0.7202572347266881, + "grad_norm": 1.4071663618087769, + "learning_rate": 6.426553672316385e-05, + "loss": 1.0648, + "step": 8960 + }, + { + "epoch": 0.7210610932475884, + "grad_norm": 1.264920949935913, + "learning_rate": 6.422518159806295e-05, + "loss": 1.1961, + "step": 8970 + }, + { + "epoch": 0.7218649517684887, + "grad_norm": 1.331862211227417, + "learning_rate": 6.418482647296207e-05, + "loss": 1.3281, + "step": 8980 + }, + { + "epoch": 0.7226688102893891, + "grad_norm": 1.2815852165222168, + "learning_rate": 6.414447134786117e-05, + "loss": 1.2381, + "step": 8990 + }, + { + "epoch": 0.7234726688102894, + "grad_norm": 1.4118419885635376, + "learning_rate": 6.410411622276029e-05, + "loss": 1.1161, + "step": 9000 + }, + { + "epoch": 0.7242765273311897, + "grad_norm": 1.3302923440933228, + "learning_rate": 6.40637610976594e-05, + "loss": 1.1986, + "step": 9010 + }, + { + "epoch": 0.72508038585209, + "grad_norm": 1.8220189809799194, + "learning_rate": 6.402340597255852e-05, + "loss": 1.1209, + "step": 9020 + }, + { + "epoch": 0.7258842443729904, + "grad_norm": 1.5864709615707397, + "learning_rate": 6.398305084745762e-05, + "loss": 1.2848, + "step": 9030 + }, + { + "epoch": 0.7266881028938906, + "grad_norm": 2.4576737880706787, + "learning_rate": 6.394269572235674e-05, + "loss": 1.2107, + "step": 9040 + }, + { + "epoch": 0.727491961414791, + "grad_norm": 1.4367574453353882, + "learning_rate": 6.390234059725586e-05, + "loss": 1.3456, + "step": 9050 + }, + { + "epoch": 0.7282958199356914, + "grad_norm": 1.2669857740402222, + "learning_rate": 6.386198547215496e-05, + "loss": 1.099, + "step": 9060 + }, + { + "epoch": 0.7290996784565916, + "grad_norm": 1.5575499534606934, + "learning_rate": 6.382163034705408e-05, + "loss": 1.1937, + "step": 9070 + }, + { + "epoch": 0.729903536977492, + "grad_norm": 1.5874534845352173, + "learning_rate": 6.37812752219532e-05, + "loss": 1.2806, + "step": 9080 + }, + { + "epoch": 0.7307073954983923, + "grad_norm": 1.2446188926696777, + "learning_rate": 6.37409200968523e-05, + "loss": 1.1843, + "step": 9090 + }, + { + "epoch": 0.7315112540192926, + "grad_norm": 1.694628119468689, + "learning_rate": 6.370056497175142e-05, + "loss": 1.2325, + "step": 9100 + }, + { + "epoch": 0.7323151125401929, + "grad_norm": 1.0469274520874023, + "learning_rate": 6.366020984665054e-05, + "loss": 1.2086, + "step": 9110 + }, + { + "epoch": 0.7331189710610932, + "grad_norm": 1.4143184423446655, + "learning_rate": 6.361985472154964e-05, + "loss": 1.1298, + "step": 9120 + }, + { + "epoch": 0.7339228295819936, + "grad_norm": 1.917021632194519, + "learning_rate": 6.357949959644876e-05, + "loss": 1.1863, + "step": 9130 + }, + { + "epoch": 0.7347266881028939, + "grad_norm": 
1.2662550210952759, + "learning_rate": 6.353914447134787e-05, + "loss": 1.0804, + "step": 9140 + }, + { + "epoch": 0.7355305466237942, + "grad_norm": 1.4026659727096558, + "learning_rate": 6.349878934624698e-05, + "loss": 1.1445, + "step": 9150 + }, + { + "epoch": 0.7363344051446945, + "grad_norm": 1.2977662086486816, + "learning_rate": 6.345843422114609e-05, + "loss": 1.1224, + "step": 9160 + }, + { + "epoch": 0.7371382636655949, + "grad_norm": 1.3581264019012451, + "learning_rate": 6.34180790960452e-05, + "loss": 1.2324, + "step": 9170 + }, + { + "epoch": 0.7379421221864951, + "grad_norm": 1.3826358318328857, + "learning_rate": 6.337772397094431e-05, + "loss": 1.2467, + "step": 9180 + }, + { + "epoch": 0.7387459807073955, + "grad_norm": 1.1410869359970093, + "learning_rate": 6.333736884584343e-05, + "loss": 1.1995, + "step": 9190 + }, + { + "epoch": 0.7395498392282959, + "grad_norm": 1.2657575607299805, + "learning_rate": 6.329701372074253e-05, + "loss": 1.1736, + "step": 9200 + }, + { + "epoch": 0.7403536977491961, + "grad_norm": 1.88206148147583, + "learning_rate": 6.325665859564165e-05, + "loss": 1.1617, + "step": 9210 + }, + { + "epoch": 0.7411575562700965, + "grad_norm": 2.55361270904541, + "learning_rate": 6.321630347054076e-05, + "loss": 1.1128, + "step": 9220 + }, + { + "epoch": 0.7419614147909968, + "grad_norm": 1.296675205230713, + "learning_rate": 6.317594834543987e-05, + "loss": 1.1873, + "step": 9230 + }, + { + "epoch": 0.7427652733118971, + "grad_norm": 1.9324688911437988, + "learning_rate": 6.313559322033898e-05, + "loss": 1.2678, + "step": 9240 + }, + { + "epoch": 0.7435691318327974, + "grad_norm": 1.5639374256134033, + "learning_rate": 6.30952380952381e-05, + "loss": 1.2444, + "step": 9250 + }, + { + "epoch": 0.7443729903536977, + "grad_norm": 1.1058989763259888, + "learning_rate": 6.30548829701372e-05, + "loss": 1.1626, + "step": 9260 + }, + { + "epoch": 0.7451768488745981, + "grad_norm": 1.621466040611267, + "learning_rate": 6.301452784503632e-05, + "loss": 1.1706, + "step": 9270 + }, + { + "epoch": 0.7459807073954984, + "grad_norm": 1.8088675737380981, + "learning_rate": 6.297417271993544e-05, + "loss": 1.111, + "step": 9280 + }, + { + "epoch": 0.7467845659163987, + "grad_norm": 1.7105693817138672, + "learning_rate": 6.293381759483454e-05, + "loss": 1.1828, + "step": 9290 + }, + { + "epoch": 0.747588424437299, + "grad_norm": 1.2621886730194092, + "learning_rate": 6.289346246973366e-05, + "loss": 1.3506, + "step": 9300 + }, + { + "epoch": 0.7483922829581994, + "grad_norm": 1.4790949821472168, + "learning_rate": 6.285310734463276e-05, + "loss": 1.2797, + "step": 9310 + }, + { + "epoch": 0.7491961414790996, + "grad_norm": 1.2256627082824707, + "learning_rate": 6.281275221953188e-05, + "loss": 1.2512, + "step": 9320 + }, + { + "epoch": 0.75, + "grad_norm": 0.9516915678977966, + "learning_rate": 6.277239709443099e-05, + "loss": 1.2157, + "step": 9330 + }, + { + "epoch": 0.7508038585209004, + "grad_norm": 1.2063665390014648, + "learning_rate": 6.273204196933012e-05, + "loss": 1.1853, + "step": 9340 + }, + { + "epoch": 0.7516077170418006, + "grad_norm": 1.9841398000717163, + "learning_rate": 6.269168684422922e-05, + "loss": 1.1145, + "step": 9350 + }, + { + "epoch": 0.752411575562701, + "grad_norm": 1.2014284133911133, + "learning_rate": 6.265133171912834e-05, + "loss": 1.1489, + "step": 9360 + }, + { + "epoch": 0.7532154340836013, + "grad_norm": 1.0883020162582397, + "learning_rate": 6.261097659402745e-05, + "loss": 1.3133, + "step": 9370 + }, + { + "epoch": 
0.7540192926045016, + "grad_norm": 1.5855486392974854, + "learning_rate": 6.257062146892656e-05, + "loss": 1.2134, + "step": 9380 + }, + { + "epoch": 0.7548231511254019, + "grad_norm": 1.7968937158584595, + "learning_rate": 6.253026634382567e-05, + "loss": 1.1743, + "step": 9390 + }, + { + "epoch": 0.7556270096463023, + "grad_norm": 1.5748929977416992, + "learning_rate": 6.248991121872479e-05, + "loss": 1.1228, + "step": 9400 + }, + { + "epoch": 0.7564308681672026, + "grad_norm": 1.267173409461975, + "learning_rate": 6.244955609362389e-05, + "loss": 1.0907, + "step": 9410 + }, + { + "epoch": 0.7572347266881029, + "grad_norm": 1.3452446460723877, + "learning_rate": 6.240920096852301e-05, + "loss": 1.2708, + "step": 9420 + }, + { + "epoch": 0.7580385852090032, + "grad_norm": 1.4234520196914673, + "learning_rate": 6.236884584342211e-05, + "loss": 1.2494, + "step": 9430 + }, + { + "epoch": 0.7588424437299035, + "grad_norm": 1.4506953954696655, + "learning_rate": 6.232849071832123e-05, + "loss": 1.1533, + "step": 9440 + }, + { + "epoch": 0.7596463022508039, + "grad_norm": 2.0665547847747803, + "learning_rate": 6.228813559322034e-05, + "loss": 1.2174, + "step": 9450 + }, + { + "epoch": 0.7604501607717041, + "grad_norm": 1.514133095741272, + "learning_rate": 6.224778046811946e-05, + "loss": 1.171, + "step": 9460 + }, + { + "epoch": 0.7612540192926045, + "grad_norm": 1.0817458629608154, + "learning_rate": 6.220742534301856e-05, + "loss": 1.2038, + "step": 9470 + }, + { + "epoch": 0.7620578778135049, + "grad_norm": 1.4435175657272339, + "learning_rate": 6.216707021791768e-05, + "loss": 1.1971, + "step": 9480 + }, + { + "epoch": 0.7628617363344051, + "grad_norm": 1.4563673734664917, + "learning_rate": 6.212671509281678e-05, + "loss": 1.1997, + "step": 9490 + }, + { + "epoch": 0.7636655948553055, + "grad_norm": 1.271362543106079, + "learning_rate": 6.20863599677159e-05, + "loss": 1.3304, + "step": 9500 + }, + { + "epoch": 0.7644694533762058, + "grad_norm": 1.5026984214782715, + "learning_rate": 6.204600484261502e-05, + "loss": 1.2449, + "step": 9510 + }, + { + "epoch": 0.7652733118971061, + "grad_norm": 1.5852859020233154, + "learning_rate": 6.200564971751412e-05, + "loss": 1.191, + "step": 9520 + }, + { + "epoch": 0.7660771704180064, + "grad_norm": 1.6117398738861084, + "learning_rate": 6.196529459241324e-05, + "loss": 1.1819, + "step": 9530 + }, + { + "epoch": 0.7668810289389068, + "grad_norm": 1.41196608543396, + "learning_rate": 6.192493946731235e-05, + "loss": 1.3235, + "step": 9540 + }, + { + "epoch": 0.7676848874598071, + "grad_norm": 1.1791915893554688, + "learning_rate": 6.188458434221146e-05, + "loss": 1.1063, + "step": 9550 + }, + { + "epoch": 0.7684887459807074, + "grad_norm": 1.18216073513031, + "learning_rate": 6.184422921711057e-05, + "loss": 1.1579, + "step": 9560 + }, + { + "epoch": 0.7692926045016077, + "grad_norm": 1.2205034494400024, + "learning_rate": 6.180387409200969e-05, + "loss": 1.2236, + "step": 9570 + }, + { + "epoch": 0.770096463022508, + "grad_norm": 1.3474924564361572, + "learning_rate": 6.176351896690879e-05, + "loss": 1.131, + "step": 9580 + }, + { + "epoch": 0.7709003215434084, + "grad_norm": 1.1282883882522583, + "learning_rate": 6.172316384180791e-05, + "loss": 1.1688, + "step": 9590 + }, + { + "epoch": 0.7717041800643086, + "grad_norm": 1.1234114170074463, + "learning_rate": 6.168280871670703e-05, + "loss": 1.2443, + "step": 9600 + }, + { + "epoch": 0.772508038585209, + "grad_norm": 2.7613494396209717, + "learning_rate": 6.164245359160615e-05, + "loss": 1.1255, + 
"step": 9610 + }, + { + "epoch": 0.7733118971061094, + "grad_norm": 1.71947181224823, + "learning_rate": 6.160209846650525e-05, + "loss": 1.2513, + "step": 9620 + }, + { + "epoch": 0.7741157556270096, + "grad_norm": 1.1563067436218262, + "learning_rate": 6.156174334140437e-05, + "loss": 1.005, + "step": 9630 + }, + { + "epoch": 0.77491961414791, + "grad_norm": 1.963090419769287, + "learning_rate": 6.152138821630347e-05, + "loss": 1.2146, + "step": 9640 + }, + { + "epoch": 0.7757234726688103, + "grad_norm": 1.3083224296569824, + "learning_rate": 6.148103309120259e-05, + "loss": 1.1852, + "step": 9650 + }, + { + "epoch": 0.7765273311897106, + "grad_norm": 1.2836787700653076, + "learning_rate": 6.14406779661017e-05, + "loss": 1.1658, + "step": 9660 + }, + { + "epoch": 0.7773311897106109, + "grad_norm": 1.1122138500213623, + "learning_rate": 6.140032284100081e-05, + "loss": 1.1442, + "step": 9670 + }, + { + "epoch": 0.7781350482315113, + "grad_norm": 2.9130606651306152, + "learning_rate": 6.135996771589992e-05, + "loss": 1.3943, + "step": 9680 + }, + { + "epoch": 0.7789389067524116, + "grad_norm": 1.2485105991363525, + "learning_rate": 6.131961259079904e-05, + "loss": 1.2203, + "step": 9690 + }, + { + "epoch": 0.7797427652733119, + "grad_norm": 1.7842824459075928, + "learning_rate": 6.127925746569814e-05, + "loss": 1.2153, + "step": 9700 + }, + { + "epoch": 0.7805466237942122, + "grad_norm": 1.3240492343902588, + "learning_rate": 6.123890234059726e-05, + "loss": 1.2034, + "step": 9710 + }, + { + "epoch": 0.7813504823151125, + "grad_norm": 1.3932291269302368, + "learning_rate": 6.119854721549636e-05, + "loss": 1.1873, + "step": 9720 + }, + { + "epoch": 0.7821543408360129, + "grad_norm": 1.2369239330291748, + "learning_rate": 6.115819209039548e-05, + "loss": 1.1213, + "step": 9730 + }, + { + "epoch": 0.7829581993569131, + "grad_norm": 1.24186110496521, + "learning_rate": 6.11178369652946e-05, + "loss": 1.1573, + "step": 9740 + }, + { + "epoch": 0.7837620578778135, + "grad_norm": 1.5317646265029907, + "learning_rate": 6.10774818401937e-05, + "loss": 1.1962, + "step": 9750 + }, + { + "epoch": 0.7845659163987139, + "grad_norm": 1.554369568824768, + "learning_rate": 6.103712671509282e-05, + "loss": 1.304, + "step": 9760 + }, + { + "epoch": 0.7853697749196141, + "grad_norm": 2.1366379261016846, + "learning_rate": 6.099677158999193e-05, + "loss": 1.1438, + "step": 9770 + }, + { + "epoch": 0.7861736334405145, + "grad_norm": 1.6524451971054077, + "learning_rate": 6.095641646489104e-05, + "loss": 1.12, + "step": 9780 + }, + { + "epoch": 0.7869774919614148, + "grad_norm": 1.5162711143493652, + "learning_rate": 6.091606133979015e-05, + "loss": 1.1718, + "step": 9790 + }, + { + "epoch": 0.7877813504823151, + "grad_norm": 1.8277530670166016, + "learning_rate": 6.087570621468926e-05, + "loss": 1.1945, + "step": 9800 + }, + { + "epoch": 0.7885852090032154, + "grad_norm": 1.7623543739318848, + "learning_rate": 6.0835351089588374e-05, + "loss": 1.1627, + "step": 9810 + }, + { + "epoch": 0.7893890675241158, + "grad_norm": 1.6323825120925903, + "learning_rate": 6.079499596448749e-05, + "loss": 1.3111, + "step": 9820 + }, + { + "epoch": 0.7901929260450161, + "grad_norm": 1.194577932357788, + "learning_rate": 6.0754640839386603e-05, + "loss": 1.2049, + "step": 9830 + }, + { + "epoch": 0.7909967845659164, + "grad_norm": 1.5735859870910645, + "learning_rate": 6.0714285714285715e-05, + "loss": 1.3183, + "step": 9840 + }, + { + "epoch": 0.7918006430868167, + "grad_norm": 1.4071894884109497, + "learning_rate": 
6.0673930589184826e-05, + "loss": 1.2357, + "step": 9850 + }, + { + "epoch": 0.792604501607717, + "grad_norm": 1.5908355712890625, + "learning_rate": 6.0633575464083944e-05, + "loss": 1.1461, + "step": 9860 + }, + { + "epoch": 0.7934083601286174, + "grad_norm": 1.2151633501052856, + "learning_rate": 6.0593220338983056e-05, + "loss": 1.1617, + "step": 9870 + }, + { + "epoch": 0.7942122186495176, + "grad_norm": 1.1097683906555176, + "learning_rate": 6.0552865213882174e-05, + "loss": 1.2553, + "step": 9880 + }, + { + "epoch": 0.795016077170418, + "grad_norm": 1.1444097757339478, + "learning_rate": 6.0512510088781285e-05, + "loss": 1.2079, + "step": 9890 + }, + { + "epoch": 0.7958199356913184, + "grad_norm": 2.0320544242858887, + "learning_rate": 6.0472154963680397e-05, + "loss": 1.2089, + "step": 9900 + }, + { + "epoch": 0.7966237942122186, + "grad_norm": 1.5209218263626099, + "learning_rate": 6.043179983857951e-05, + "loss": 1.1291, + "step": 9910 + }, + { + "epoch": 0.797427652733119, + "grad_norm": 1.0873534679412842, + "learning_rate": 6.039144471347862e-05, + "loss": 1.221, + "step": 9920 + }, + { + "epoch": 0.7982315112540193, + "grad_norm": 1.0633383989334106, + "learning_rate": 6.035108958837773e-05, + "loss": 1.1958, + "step": 9930 + }, + { + "epoch": 0.7990353697749196, + "grad_norm": 2.4503116607666016, + "learning_rate": 6.031073446327684e-05, + "loss": 1.1433, + "step": 9940 + }, + { + "epoch": 0.7998392282958199, + "grad_norm": 1.9616880416870117, + "learning_rate": 6.027037933817595e-05, + "loss": 1.2961, + "step": 9950 + }, + { + "epoch": 0.8006430868167203, + "grad_norm": 1.9644263982772827, + "learning_rate": 6.0230024213075065e-05, + "loss": 1.0541, + "step": 9960 + }, + { + "epoch": 0.8014469453376206, + "grad_norm": 1.420846700668335, + "learning_rate": 6.0189669087974176e-05, + "loss": 1.3086, + "step": 9970 + }, + { + "epoch": 0.8022508038585209, + "grad_norm": 2.0573222637176514, + "learning_rate": 6.014931396287329e-05, + "loss": 1.0556, + "step": 9980 + }, + { + "epoch": 0.8030546623794212, + "grad_norm": 1.1859737634658813, + "learning_rate": 6.01089588377724e-05, + "loss": 1.1973, + "step": 9990 + }, + { + "epoch": 0.8038585209003215, + "grad_norm": 1.1852400302886963, + "learning_rate": 6.006860371267151e-05, + "loss": 1.1116, + "step": 10000 + }, + { + "epoch": 0.8038585209003215, + "eval_yahma/alpaca-cleaned_loss": 1.2155007123947144, + "eval_yahma/alpaca-cleaned_runtime": 115.7956, + "eval_yahma/alpaca-cleaned_samples_per_second": 17.272, + "eval_yahma/alpaca-cleaned_steps_per_second": 2.159, + "step": 10000 + }, + { + "epoch": 0.8046623794212219, + "grad_norm": 1.667161464691162, + "learning_rate": 6.002824858757062e-05, + "loss": 1.2352, + "step": 10010 + }, + { + "epoch": 0.8054662379421221, + "grad_norm": 2.6935198307037354, + "learning_rate": 5.998789346246973e-05, + "loss": 1.1627, + "step": 10020 + }, + { + "epoch": 0.8062700964630225, + "grad_norm": 1.9311403036117554, + "learning_rate": 5.9947538337368844e-05, + "loss": 1.0085, + "step": 10030 + }, + { + "epoch": 0.8070739549839229, + "grad_norm": 2.0662097930908203, + "learning_rate": 5.9907183212267956e-05, + "loss": 1.1416, + "step": 10040 + }, + { + "epoch": 0.8078778135048231, + "grad_norm": 2.6683616638183594, + "learning_rate": 5.9866828087167074e-05, + "loss": 1.0442, + "step": 10050 + }, + { + "epoch": 0.8086816720257235, + "grad_norm": 1.4304804801940918, + "learning_rate": 5.9826472962066185e-05, + "loss": 1.3182, + "step": 10060 + }, + { + "epoch": 0.8094855305466238, + "grad_norm": 
2.3583433628082275, + "learning_rate": 5.9786117836965296e-05, + "loss": 1.2935, + "step": 10070 + }, + { + "epoch": 0.8102893890675241, + "grad_norm": 1.488024115562439, + "learning_rate": 5.974576271186441e-05, + "loss": 1.081, + "step": 10080 + }, + { + "epoch": 0.8110932475884244, + "grad_norm": 1.401058316230774, + "learning_rate": 5.970540758676352e-05, + "loss": 1.1965, + "step": 10090 + }, + { + "epoch": 0.8118971061093248, + "grad_norm": 1.3213167190551758, + "learning_rate": 5.966505246166263e-05, + "loss": 1.1513, + "step": 10100 + }, + { + "epoch": 0.8127009646302251, + "grad_norm": 1.2974404096603394, + "learning_rate": 5.962469733656174e-05, + "loss": 1.2244, + "step": 10110 + }, + { + "epoch": 0.8135048231511254, + "grad_norm": 1.1411107778549194, + "learning_rate": 5.958434221146085e-05, + "loss": 1.1874, + "step": 10120 + }, + { + "epoch": 0.8143086816720257, + "grad_norm": 1.4510279893875122, + "learning_rate": 5.954398708635998e-05, + "loss": 1.1776, + "step": 10130 + }, + { + "epoch": 0.815112540192926, + "grad_norm": 1.4337852001190186, + "learning_rate": 5.950363196125909e-05, + "loss": 1.3446, + "step": 10140 + }, + { + "epoch": 0.8159163987138264, + "grad_norm": 1.5213372707366943, + "learning_rate": 5.94632768361582e-05, + "loss": 1.1798, + "step": 10150 + }, + { + "epoch": 0.8167202572347267, + "grad_norm": 1.4217019081115723, + "learning_rate": 5.942292171105731e-05, + "loss": 1.0763, + "step": 10160 + }, + { + "epoch": 0.817524115755627, + "grad_norm": 1.2413939237594604, + "learning_rate": 5.9382566585956424e-05, + "loss": 1.0959, + "step": 10170 + }, + { + "epoch": 0.8183279742765274, + "grad_norm": 1.3456542491912842, + "learning_rate": 5.9342211460855535e-05, + "loss": 1.1784, + "step": 10180 + }, + { + "epoch": 0.8191318327974276, + "grad_norm": 1.8825595378875732, + "learning_rate": 5.9301856335754646e-05, + "loss": 1.1552, + "step": 10190 + }, + { + "epoch": 0.819935691318328, + "grad_norm": 1.545015811920166, + "learning_rate": 5.926150121065376e-05, + "loss": 1.1653, + "step": 10200 + }, + { + "epoch": 0.8207395498392283, + "grad_norm": 2.0575296878814697, + "learning_rate": 5.922114608555287e-05, + "loss": 1.0649, + "step": 10210 + }, + { + "epoch": 0.8215434083601286, + "grad_norm": 1.4069037437438965, + "learning_rate": 5.918079096045198e-05, + "loss": 1.159, + "step": 10220 + }, + { + "epoch": 0.822347266881029, + "grad_norm": 1.1657689809799194, + "learning_rate": 5.914043583535109e-05, + "loss": 1.1816, + "step": 10230 + }, + { + "epoch": 0.8231511254019293, + "grad_norm": 1.254135251045227, + "learning_rate": 5.91000807102502e-05, + "loss": 1.2129, + "step": 10240 + }, + { + "epoch": 0.8239549839228296, + "grad_norm": 2.080061912536621, + "learning_rate": 5.9059725585149315e-05, + "loss": 1.2498, + "step": 10250 + }, + { + "epoch": 0.8247588424437299, + "grad_norm": 1.7076865434646606, + "learning_rate": 5.9019370460048426e-05, + "loss": 1.0514, + "step": 10260 + }, + { + "epoch": 0.8255627009646302, + "grad_norm": 1.2622432708740234, + "learning_rate": 5.897901533494754e-05, + "loss": 1.2691, + "step": 10270 + }, + { + "epoch": 0.8263665594855305, + "grad_norm": 1.3416787385940552, + "learning_rate": 5.8938660209846655e-05, + "loss": 1.1225, + "step": 10280 + }, + { + "epoch": 0.8271704180064309, + "grad_norm": 1.5244756937026978, + "learning_rate": 5.889830508474577e-05, + "loss": 1.2898, + "step": 10290 + }, + { + "epoch": 0.8279742765273312, + "grad_norm": 1.0199939012527466, + "learning_rate": 5.885794995964488e-05, + "loss": 1.1597, + 
"step": 10300 + }, + { + "epoch": 0.8287781350482315, + "grad_norm": 1.2063865661621094, + "learning_rate": 5.881759483454399e-05, + "loss": 1.3285, + "step": 10310 + }, + { + "epoch": 0.8295819935691319, + "grad_norm": 1.3245232105255127, + "learning_rate": 5.87772397094431e-05, + "loss": 1.0705, + "step": 10320 + }, + { + "epoch": 0.8303858520900321, + "grad_norm": 1.2130075693130493, + "learning_rate": 5.873688458434221e-05, + "loss": 1.2279, + "step": 10330 + }, + { + "epoch": 0.8311897106109325, + "grad_norm": 1.4783278703689575, + "learning_rate": 5.8696529459241324e-05, + "loss": 1.1647, + "step": 10340 + }, + { + "epoch": 0.8319935691318328, + "grad_norm": 1.352412462234497, + "learning_rate": 5.8656174334140435e-05, + "loss": 1.194, + "step": 10350 + }, + { + "epoch": 0.8327974276527331, + "grad_norm": 1.2860634326934814, + "learning_rate": 5.8615819209039546e-05, + "loss": 1.2023, + "step": 10360 + }, + { + "epoch": 0.8336012861736335, + "grad_norm": 1.5555285215377808, + "learning_rate": 5.857546408393866e-05, + "loss": 1.1868, + "step": 10370 + }, + { + "epoch": 0.8344051446945338, + "grad_norm": 1.3393827676773071, + "learning_rate": 5.853510895883777e-05, + "loss": 1.2442, + "step": 10380 + }, + { + "epoch": 0.8352090032154341, + "grad_norm": 1.7181309461593628, + "learning_rate": 5.8494753833736894e-05, + "loss": 1.0575, + "step": 10390 + }, + { + "epoch": 0.8360128617363344, + "grad_norm": 1.2780259847640991, + "learning_rate": 5.8454398708636005e-05, + "loss": 1.1243, + "step": 10400 + }, + { + "epoch": 0.8368167202572347, + "grad_norm": 2.2146565914154053, + "learning_rate": 5.841404358353512e-05, + "loss": 1.1439, + "step": 10410 + }, + { + "epoch": 0.837620578778135, + "grad_norm": 2.1241023540496826, + "learning_rate": 5.837368845843423e-05, + "loss": 1.2429, + "step": 10420 + }, + { + "epoch": 0.8384244372990354, + "grad_norm": 1.5142546892166138, + "learning_rate": 5.833333333333334e-05, + "loss": 1.1969, + "step": 10430 + }, + { + "epoch": 0.8392282958199357, + "grad_norm": 1.250694751739502, + "learning_rate": 5.829297820823245e-05, + "loss": 1.2337, + "step": 10440 + }, + { + "epoch": 0.840032154340836, + "grad_norm": 1.0889441967010498, + "learning_rate": 5.825262308313156e-05, + "loss": 1.1911, + "step": 10450 + }, + { + "epoch": 0.8408360128617364, + "grad_norm": 1.120822787284851, + "learning_rate": 5.8212267958030674e-05, + "loss": 1.211, + "step": 10460 + }, + { + "epoch": 0.8416398713826366, + "grad_norm": 1.4169942140579224, + "learning_rate": 5.8171912832929785e-05, + "loss": 1.1747, + "step": 10470 + }, + { + "epoch": 0.842443729903537, + "grad_norm": 1.7394747734069824, + "learning_rate": 5.8131557707828896e-05, + "loss": 1.1991, + "step": 10480 + }, + { + "epoch": 0.8432475884244373, + "grad_norm": 1.3221678733825684, + "learning_rate": 5.809120258272801e-05, + "loss": 1.1979, + "step": 10490 + }, + { + "epoch": 0.8440514469453376, + "grad_norm": 1.1339091062545776, + "learning_rate": 5.805084745762712e-05, + "loss": 1.2497, + "step": 10500 + }, + { + "epoch": 0.844855305466238, + "grad_norm": 1.2805355787277222, + "learning_rate": 5.801049233252624e-05, + "loss": 1.2663, + "step": 10510 + }, + { + "epoch": 0.8456591639871383, + "grad_norm": 2.003892660140991, + "learning_rate": 5.797013720742535e-05, + "loss": 1.2009, + "step": 10520 + }, + { + "epoch": 0.8464630225080386, + "grad_norm": 1.854628562927246, + "learning_rate": 5.792978208232446e-05, + "loss": 1.0923, + "step": 10530 + }, + { + "epoch": 0.8472668810289389, + "grad_norm": 
1.1635708808898926, + "learning_rate": 5.788942695722357e-05, + "loss": 1.0859, + "step": 10540 + }, + { + "epoch": 0.8480707395498392, + "grad_norm": 1.0551413297653198, + "learning_rate": 5.784907183212268e-05, + "loss": 1.2706, + "step": 10550 + }, + { + "epoch": 0.8488745980707395, + "grad_norm": 1.675723671913147, + "learning_rate": 5.7808716707021794e-05, + "loss": 1.3267, + "step": 10560 + }, + { + "epoch": 0.8496784565916399, + "grad_norm": 1.2302464246749878, + "learning_rate": 5.7768361581920905e-05, + "loss": 1.1912, + "step": 10570 + }, + { + "epoch": 0.8504823151125402, + "grad_norm": 1.1186902523040771, + "learning_rate": 5.772800645682002e-05, + "loss": 1.1371, + "step": 10580 + }, + { + "epoch": 0.8512861736334405, + "grad_norm": 1.3289992809295654, + "learning_rate": 5.768765133171913e-05, + "loss": 1.2077, + "step": 10590 + }, + { + "epoch": 0.8520900321543409, + "grad_norm": 1.3730969429016113, + "learning_rate": 5.764729620661824e-05, + "loss": 1.1428, + "step": 10600 + }, + { + "epoch": 0.8528938906752411, + "grad_norm": 1.6732438802719116, + "learning_rate": 5.760694108151735e-05, + "loss": 1.0955, + "step": 10610 + }, + { + "epoch": 0.8536977491961415, + "grad_norm": 1.1242672204971313, + "learning_rate": 5.756658595641646e-05, + "loss": 1.2269, + "step": 10620 + }, + { + "epoch": 0.8545016077170418, + "grad_norm": 1.3563451766967773, + "learning_rate": 5.7526230831315574e-05, + "loss": 1.051, + "step": 10630 + }, + { + "epoch": 0.8553054662379421, + "grad_norm": 1.6396784782409668, + "learning_rate": 5.7485875706214685e-05, + "loss": 1.2013, + "step": 10640 + }, + { + "epoch": 0.8561093247588425, + "grad_norm": 1.704081416130066, + "learning_rate": 5.744552058111381e-05, + "loss": 1.113, + "step": 10650 + }, + { + "epoch": 0.8569131832797428, + "grad_norm": 1.072421908378601, + "learning_rate": 5.740516545601292e-05, + "loss": 1.2712, + "step": 10660 + }, + { + "epoch": 0.8577170418006431, + "grad_norm": 1.3971763849258423, + "learning_rate": 5.736481033091203e-05, + "loss": 1.0984, + "step": 10670 + }, + { + "epoch": 0.8585209003215434, + "grad_norm": 1.2550153732299805, + "learning_rate": 5.7324455205811144e-05, + "loss": 1.1208, + "step": 10680 + }, + { + "epoch": 0.8593247588424437, + "grad_norm": 1.531835913658142, + "learning_rate": 5.7284100080710255e-05, + "loss": 1.3186, + "step": 10690 + }, + { + "epoch": 0.860128617363344, + "grad_norm": 1.407632827758789, + "learning_rate": 5.724374495560937e-05, + "loss": 1.0284, + "step": 10700 + }, + { + "epoch": 0.8609324758842444, + "grad_norm": 1.1127479076385498, + "learning_rate": 5.720338983050848e-05, + "loss": 1.2213, + "step": 10710 + }, + { + "epoch": 0.8617363344051447, + "grad_norm": 1.3501721620559692, + "learning_rate": 5.716303470540759e-05, + "loss": 1.1947, + "step": 10720 + }, + { + "epoch": 0.862540192926045, + "grad_norm": 1.3318605422973633, + "learning_rate": 5.71226795803067e-05, + "loss": 1.1896, + "step": 10730 + }, + { + "epoch": 0.8633440514469454, + "grad_norm": 1.1702275276184082, + "learning_rate": 5.708232445520582e-05, + "loss": 1.1036, + "step": 10740 + }, + { + "epoch": 0.8641479099678456, + "grad_norm": 1.3750840425491333, + "learning_rate": 5.704196933010493e-05, + "loss": 1.1948, + "step": 10750 + }, + { + "epoch": 0.864951768488746, + "grad_norm": 1.5514822006225586, + "learning_rate": 5.700161420500404e-05, + "loss": 1.0426, + "step": 10760 + }, + { + "epoch": 0.8657556270096463, + "grad_norm": 1.1812459230422974, + "learning_rate": 5.696125907990315e-05, + "loss": 1.0519, + 
"step": 10770 + }, + { + "epoch": 0.8665594855305466, + "grad_norm": 1.336189866065979, + "learning_rate": 5.6920903954802264e-05, + "loss": 1.3293, + "step": 10780 + }, + { + "epoch": 0.867363344051447, + "grad_norm": 1.533086895942688, + "learning_rate": 5.6880548829701376e-05, + "loss": 1.0463, + "step": 10790 + }, + { + "epoch": 0.8681672025723473, + "grad_norm": 1.7019423246383667, + "learning_rate": 5.684019370460049e-05, + "loss": 1.1906, + "step": 10800 + }, + { + "epoch": 0.8689710610932476, + "grad_norm": 1.6200381517410278, + "learning_rate": 5.67998385794996e-05, + "loss": 1.1637, + "step": 10810 + }, + { + "epoch": 0.8697749196141479, + "grad_norm": 1.5051641464233398, + "learning_rate": 5.675948345439871e-05, + "loss": 1.2058, + "step": 10820 + }, + { + "epoch": 0.8705787781350482, + "grad_norm": 1.2912993431091309, + "learning_rate": 5.671912832929782e-05, + "loss": 1.2305, + "step": 10830 + }, + { + "epoch": 0.8713826366559485, + "grad_norm": 1.2812238931655884, + "learning_rate": 5.667877320419693e-05, + "loss": 1.1353, + "step": 10840 + }, + { + "epoch": 0.8721864951768489, + "grad_norm": 2.6196494102478027, + "learning_rate": 5.6638418079096044e-05, + "loss": 1.2835, + "step": 10850 + }, + { + "epoch": 0.8729903536977492, + "grad_norm": 1.3909611701965332, + "learning_rate": 5.6598062953995155e-05, + "loss": 1.1568, + "step": 10860 + }, + { + "epoch": 0.8737942122186495, + "grad_norm": 1.1649755239486694, + "learning_rate": 5.655770782889427e-05, + "loss": 1.1305, + "step": 10870 + }, + { + "epoch": 0.8745980707395499, + "grad_norm": 1.537785291671753, + "learning_rate": 5.651735270379338e-05, + "loss": 1.2737, + "step": 10880 + }, + { + "epoch": 0.8754019292604501, + "grad_norm": 1.1643513441085815, + "learning_rate": 5.647699757869249e-05, + "loss": 1.2144, + "step": 10890 + }, + { + "epoch": 0.8762057877813505, + "grad_norm": 1.3522480726242065, + "learning_rate": 5.64366424535916e-05, + "loss": 1.2893, + "step": 10900 + }, + { + "epoch": 0.8770096463022508, + "grad_norm": 1.7712152004241943, + "learning_rate": 5.639628732849072e-05, + "loss": 1.1344, + "step": 10910 + }, + { + "epoch": 0.8778135048231511, + "grad_norm": 1.3849141597747803, + "learning_rate": 5.635593220338984e-05, + "loss": 1.2632, + "step": 10920 + }, + { + "epoch": 0.8786173633440515, + "grad_norm": 1.3868497610092163, + "learning_rate": 5.631557707828895e-05, + "loss": 1.2128, + "step": 10930 + }, + { + "epoch": 0.8794212218649518, + "grad_norm": 1.1606500148773193, + "learning_rate": 5.627522195318806e-05, + "loss": 1.1074, + "step": 10940 + }, + { + "epoch": 0.8802250803858521, + "grad_norm": 1.3538761138916016, + "learning_rate": 5.623486682808717e-05, + "loss": 1.209, + "step": 10950 + }, + { + "epoch": 0.8810289389067524, + "grad_norm": 1.223466157913208, + "learning_rate": 5.619451170298628e-05, + "loss": 1.1811, + "step": 10960 + }, + { + "epoch": 0.8818327974276527, + "grad_norm": 1.899377465248108, + "learning_rate": 5.61541565778854e-05, + "loss": 1.0492, + "step": 10970 + }, + { + "epoch": 0.882636655948553, + "grad_norm": 2.095160722732544, + "learning_rate": 5.611380145278451e-05, + "loss": 1.2333, + "step": 10980 + }, + { + "epoch": 0.8834405144694534, + "grad_norm": 1.1777327060699463, + "learning_rate": 5.607344632768362e-05, + "loss": 1.192, + "step": 10990 + }, + { + "epoch": 0.8842443729903537, + "grad_norm": 1.3834155797958374, + "learning_rate": 5.6033091202582735e-05, + "loss": 1.1716, + "step": 11000 + }, + { + "epoch": 0.885048231511254, + "grad_norm": 3.961866855621338, + 
"learning_rate": 5.5992736077481846e-05, + "loss": 1.2473, + "step": 11010 + }, + { + "epoch": 0.8858520900321544, + "grad_norm": 1.7471206188201904, + "learning_rate": 5.595238095238096e-05, + "loss": 1.2372, + "step": 11020 + }, + { + "epoch": 0.8866559485530546, + "grad_norm": 1.2151943445205688, + "learning_rate": 5.591202582728007e-05, + "loss": 1.2137, + "step": 11030 + }, + { + "epoch": 0.887459807073955, + "grad_norm": 2.229262351989746, + "learning_rate": 5.587167070217918e-05, + "loss": 1.338, + "step": 11040 + }, + { + "epoch": 0.8882636655948553, + "grad_norm": 1.3745701313018799, + "learning_rate": 5.583131557707829e-05, + "loss": 1.1346, + "step": 11050 + }, + { + "epoch": 0.8890675241157556, + "grad_norm": 1.6774941682815552, + "learning_rate": 5.57909604519774e-05, + "loss": 1.2882, + "step": 11060 + }, + { + "epoch": 0.889871382636656, + "grad_norm": 1.7017117738723755, + "learning_rate": 5.5750605326876514e-05, + "loss": 1.1427, + "step": 11070 + }, + { + "epoch": 0.8906752411575563, + "grad_norm": 1.3809226751327515, + "learning_rate": 5.5710250201775626e-05, + "loss": 1.2229, + "step": 11080 + }, + { + "epoch": 0.8914790996784566, + "grad_norm": 1.2096409797668457, + "learning_rate": 5.566989507667474e-05, + "loss": 1.1795, + "step": 11090 + }, + { + "epoch": 0.8922829581993569, + "grad_norm": 1.653446078300476, + "learning_rate": 5.562953995157385e-05, + "loss": 1.2252, + "step": 11100 + }, + { + "epoch": 0.8930868167202572, + "grad_norm": 1.1945871114730835, + "learning_rate": 5.558918482647296e-05, + "loss": 1.1876, + "step": 11110 + }, + { + "epoch": 0.8938906752411575, + "grad_norm": 1.0288665294647217, + "learning_rate": 5.554882970137207e-05, + "loss": 1.1355, + "step": 11120 + }, + { + "epoch": 0.8946945337620579, + "grad_norm": 1.1207334995269775, + "learning_rate": 5.550847457627118e-05, + "loss": 1.1891, + "step": 11130 + }, + { + "epoch": 0.8954983922829582, + "grad_norm": 1.56515371799469, + "learning_rate": 5.54681194511703e-05, + "loss": 1.233, + "step": 11140 + }, + { + "epoch": 0.8963022508038585, + "grad_norm": 2.0786585807800293, + "learning_rate": 5.542776432606941e-05, + "loss": 1.1558, + "step": 11150 + }, + { + "epoch": 0.8971061093247589, + "grad_norm": 1.1563533544540405, + "learning_rate": 5.538740920096852e-05, + "loss": 1.1564, + "step": 11160 + }, + { + "epoch": 0.8979099678456591, + "grad_norm": 1.2741695642471313, + "learning_rate": 5.5347054075867635e-05, + "loss": 1.0598, + "step": 11170 + }, + { + "epoch": 0.8987138263665595, + "grad_norm": 1.7394829988479614, + "learning_rate": 5.530669895076675e-05, + "loss": 1.0792, + "step": 11180 + }, + { + "epoch": 0.8995176848874598, + "grad_norm": 2.2357265949249268, + "learning_rate": 5.5266343825665864e-05, + "loss": 1.1272, + "step": 11190 + }, + { + "epoch": 0.9003215434083601, + "grad_norm": 1.740126371383667, + "learning_rate": 5.5225988700564976e-05, + "loss": 1.2022, + "step": 11200 + }, + { + "epoch": 0.9011254019292605, + "grad_norm": 1.7292535305023193, + "learning_rate": 5.5185633575464094e-05, + "loss": 1.1981, + "step": 11210 + }, + { + "epoch": 0.9019292604501608, + "grad_norm": 1.5324060916900635, + "learning_rate": 5.5145278450363205e-05, + "loss": 1.1449, + "step": 11220 + }, + { + "epoch": 0.9027331189710611, + "grad_norm": 1.1555876731872559, + "learning_rate": 5.5104923325262316e-05, + "loss": 1.1939, + "step": 11230 + }, + { + "epoch": 0.9035369774919614, + "grad_norm": 1.3041651248931885, + "learning_rate": 5.506456820016143e-05, + "loss": 1.2318, + "step": 11240 + }, + { 
+ "epoch": 0.9043408360128617, + "grad_norm": 1.3641607761383057, + "learning_rate": 5.502421307506054e-05, + "loss": 1.1277, + "step": 11250 + }, + { + "epoch": 0.905144694533762, + "grad_norm": 1.4824761152267456, + "learning_rate": 5.498385794995965e-05, + "loss": 1.2096, + "step": 11260 + }, + { + "epoch": 0.9059485530546624, + "grad_norm": 1.7768710851669312, + "learning_rate": 5.494350282485876e-05, + "loss": 1.1968, + "step": 11270 + }, + { + "epoch": 0.9067524115755627, + "grad_norm": 1.0921682119369507, + "learning_rate": 5.490314769975787e-05, + "loss": 1.1655, + "step": 11280 + }, + { + "epoch": 0.907556270096463, + "grad_norm": 1.1076873540878296, + "learning_rate": 5.4862792574656985e-05, + "loss": 1.1469, + "step": 11290 + }, + { + "epoch": 0.9083601286173634, + "grad_norm": 2.4272866249084473, + "learning_rate": 5.4822437449556096e-05, + "loss": 1.2036, + "step": 11300 + }, + { + "epoch": 0.9091639871382636, + "grad_norm": 1.5198862552642822, + "learning_rate": 5.478208232445521e-05, + "loss": 1.1712, + "step": 11310 + }, + { + "epoch": 0.909967845659164, + "grad_norm": 1.6450421810150146, + "learning_rate": 5.474172719935432e-05, + "loss": 1.1633, + "step": 11320 + }, + { + "epoch": 0.9107717041800643, + "grad_norm": 1.6671468019485474, + "learning_rate": 5.470137207425343e-05, + "loss": 1.0944, + "step": 11330 + }, + { + "epoch": 0.9115755627009646, + "grad_norm": 1.7807215452194214, + "learning_rate": 5.466101694915254e-05, + "loss": 1.096, + "step": 11340 + }, + { + "epoch": 0.912379421221865, + "grad_norm": 1.8122543096542358, + "learning_rate": 5.462066182405165e-05, + "loss": 1.0954, + "step": 11350 + }, + { + "epoch": 0.9131832797427653, + "grad_norm": 1.3115532398223877, + "learning_rate": 5.4580306698950764e-05, + "loss": 1.0989, + "step": 11360 + }, + { + "epoch": 0.9139871382636656, + "grad_norm": 2.4599406719207764, + "learning_rate": 5.4539951573849876e-05, + "loss": 1.166, + "step": 11370 + }, + { + "epoch": 0.9147909967845659, + "grad_norm": 1.021290898323059, + "learning_rate": 5.4499596448748994e-05, + "loss": 1.1451, + "step": 11380 + }, + { + "epoch": 0.9155948553054662, + "grad_norm": 1.0792917013168335, + "learning_rate": 5.4459241323648105e-05, + "loss": 1.2435, + "step": 11390 + }, + { + "epoch": 0.9163987138263665, + "grad_norm": 1.6936174631118774, + "learning_rate": 5.4418886198547216e-05, + "loss": 1.2064, + "step": 11400 + }, + { + "epoch": 0.9172025723472669, + "grad_norm": 1.6657130718231201, + "learning_rate": 5.437853107344633e-05, + "loss": 1.079, + "step": 11410 + }, + { + "epoch": 0.9180064308681672, + "grad_norm": 1.8588035106658936, + "learning_rate": 5.433817594834544e-05, + "loss": 1.1328, + "step": 11420 + }, + { + "epoch": 0.9188102893890675, + "grad_norm": 1.3137052059173584, + "learning_rate": 5.429782082324455e-05, + "loss": 1.1395, + "step": 11430 + }, + { + "epoch": 0.9196141479099679, + "grad_norm": 1.2235493659973145, + "learning_rate": 5.425746569814366e-05, + "loss": 1.1534, + "step": 11440 + }, + { + "epoch": 0.9204180064308681, + "grad_norm": 1.411837100982666, + "learning_rate": 5.421711057304279e-05, + "loss": 1.3355, + "step": 11450 + }, + { + "epoch": 0.9212218649517685, + "grad_norm": 1.2228326797485352, + "learning_rate": 5.41767554479419e-05, + "loss": 1.2599, + "step": 11460 + }, + { + "epoch": 0.9220257234726688, + "grad_norm": 2.3366942405700684, + "learning_rate": 5.413640032284101e-05, + "loss": 1.2152, + "step": 11470 + }, + { + "epoch": 0.9228295819935691, + "grad_norm": 1.593866229057312, + "learning_rate": 
5.409604519774012e-05, + "loss": 1.1455, + "step": 11480 + }, + { + "epoch": 0.9236334405144695, + "grad_norm": 1.1690460443496704, + "learning_rate": 5.405569007263923e-05, + "loss": 1.1246, + "step": 11490 + }, + { + "epoch": 0.9244372990353698, + "grad_norm": 1.9292722940444946, + "learning_rate": 5.4015334947538344e-05, + "loss": 1.1446, + "step": 11500 + }, + { + "epoch": 0.9252411575562701, + "grad_norm": 1.288496732711792, + "learning_rate": 5.3974979822437455e-05, + "loss": 1.1215, + "step": 11510 + }, + { + "epoch": 0.9260450160771704, + "grad_norm": 1.6341341733932495, + "learning_rate": 5.3934624697336566e-05, + "loss": 1.0706, + "step": 11520 + }, + { + "epoch": 0.9268488745980707, + "grad_norm": 1.1956830024719238, + "learning_rate": 5.389426957223568e-05, + "loss": 1.1117, + "step": 11530 + }, + { + "epoch": 0.927652733118971, + "grad_norm": 2.764254093170166, + "learning_rate": 5.385391444713479e-05, + "loss": 1.2636, + "step": 11540 + }, + { + "epoch": 0.9284565916398714, + "grad_norm": 1.0441055297851562, + "learning_rate": 5.38135593220339e-05, + "loss": 1.1597, + "step": 11550 + }, + { + "epoch": 0.9292604501607717, + "grad_norm": 1.1911731958389282, + "learning_rate": 5.377320419693301e-05, + "loss": 0.9624, + "step": 11560 + }, + { + "epoch": 0.930064308681672, + "grad_norm": 1.9981105327606201, + "learning_rate": 5.373284907183212e-05, + "loss": 1.0886, + "step": 11570 + }, + { + "epoch": 0.9308681672025724, + "grad_norm": 1.4466192722320557, + "learning_rate": 5.3692493946731235e-05, + "loss": 1.1698, + "step": 11580 + }, + { + "epoch": 0.9316720257234726, + "grad_norm": 1.43555748462677, + "learning_rate": 5.3652138821630346e-05, + "loss": 1.0717, + "step": 11590 + }, + { + "epoch": 0.932475884244373, + "grad_norm": 1.0620185136795044, + "learning_rate": 5.361178369652946e-05, + "loss": 1.2611, + "step": 11600 + }, + { + "epoch": 0.9332797427652733, + "grad_norm": 1.1827366352081299, + "learning_rate": 5.3571428571428575e-05, + "loss": 1.2629, + "step": 11610 + }, + { + "epoch": 0.9340836012861736, + "grad_norm": 1.6303887367248535, + "learning_rate": 5.353107344632769e-05, + "loss": 1.1004, + "step": 11620 + }, + { + "epoch": 0.934887459807074, + "grad_norm": 1.6994215250015259, + "learning_rate": 5.34907183212268e-05, + "loss": 1.1702, + "step": 11630 + }, + { + "epoch": 0.9356913183279743, + "grad_norm": 2.8394522666931152, + "learning_rate": 5.345036319612591e-05, + "loss": 1.2469, + "step": 11640 + }, + { + "epoch": 0.9364951768488746, + "grad_norm": 1.0047556161880493, + "learning_rate": 5.341000807102502e-05, + "loss": 1.1197, + "step": 11650 + }, + { + "epoch": 0.9372990353697749, + "grad_norm": 1.9585012197494507, + "learning_rate": 5.336965294592413e-05, + "loss": 1.1801, + "step": 11660 + }, + { + "epoch": 0.9381028938906752, + "grad_norm": 1.2510764598846436, + "learning_rate": 5.3329297820823244e-05, + "loss": 1.0855, + "step": 11670 + }, + { + "epoch": 0.9389067524115756, + "grad_norm": 1.580871343612671, + "learning_rate": 5.3288942695722355e-05, + "loss": 1.1472, + "step": 11680 + }, + { + "epoch": 0.9397106109324759, + "grad_norm": 1.1376216411590576, + "learning_rate": 5.3248587570621466e-05, + "loss": 1.1334, + "step": 11690 + }, + { + "epoch": 0.9405144694533762, + "grad_norm": 1.5755469799041748, + "learning_rate": 5.320823244552058e-05, + "loss": 1.1797, + "step": 11700 + }, + { + "epoch": 0.9413183279742765, + "grad_norm": 1.6171435117721558, + "learning_rate": 5.31678773204197e-05, + "loss": 1.2272, + "step": 11710 + }, + { + "epoch": 
0.9421221864951769, + "grad_norm": 1.393620252609253, + "learning_rate": 5.3127522195318814e-05, + "loss": 1.3367, + "step": 11720 + }, + { + "epoch": 0.9429260450160771, + "grad_norm": 1.172006607055664, + "learning_rate": 5.3087167070217925e-05, + "loss": 1.0395, + "step": 11730 + }, + { + "epoch": 0.9437299035369775, + "grad_norm": 1.4762243032455444, + "learning_rate": 5.304681194511704e-05, + "loss": 1.1633, + "step": 11740 + }, + { + "epoch": 0.9445337620578779, + "grad_norm": 1.6818078756332397, + "learning_rate": 5.300645682001615e-05, + "loss": 1.1805, + "step": 11750 + }, + { + "epoch": 0.9453376205787781, + "grad_norm": 1.3330456018447876, + "learning_rate": 5.296610169491526e-05, + "loss": 1.2, + "step": 11760 + }, + { + "epoch": 0.9461414790996785, + "grad_norm": 1.6012849807739258, + "learning_rate": 5.292574656981437e-05, + "loss": 1.082, + "step": 11770 + }, + { + "epoch": 0.9469453376205788, + "grad_norm": 1.347171425819397, + "learning_rate": 5.288539144471348e-05, + "loss": 1.3004, + "step": 11780 + }, + { + "epoch": 0.9477491961414791, + "grad_norm": 2.2929813861846924, + "learning_rate": 5.2845036319612594e-05, + "loss": 1.1575, + "step": 11790 + }, + { + "epoch": 0.9485530546623794, + "grad_norm": 1.209058165550232, + "learning_rate": 5.2804681194511705e-05, + "loss": 1.1463, + "step": 11800 + }, + { + "epoch": 0.9493569131832797, + "grad_norm": 1.7847411632537842, + "learning_rate": 5.2764326069410816e-05, + "loss": 1.1799, + "step": 11810 + }, + { + "epoch": 0.9501607717041801, + "grad_norm": 1.4044344425201416, + "learning_rate": 5.272397094430993e-05, + "loss": 1.1854, + "step": 11820 + }, + { + "epoch": 0.9509646302250804, + "grad_norm": 1.4244343042373657, + "learning_rate": 5.268361581920904e-05, + "loss": 1.2356, + "step": 11830 + }, + { + "epoch": 0.9517684887459807, + "grad_norm": 1.699362874031067, + "learning_rate": 5.264326069410816e-05, + "loss": 1.1849, + "step": 11840 + }, + { + "epoch": 0.952572347266881, + "grad_norm": 3.909731864929199, + "learning_rate": 5.260290556900727e-05, + "loss": 1.0881, + "step": 11850 + }, + { + "epoch": 0.9533762057877814, + "grad_norm": 1.665794849395752, + "learning_rate": 5.256255044390638e-05, + "loss": 1.1655, + "step": 11860 + }, + { + "epoch": 0.9541800643086816, + "grad_norm": 2.0441787242889404, + "learning_rate": 5.252219531880549e-05, + "loss": 1.2538, + "step": 11870 + }, + { + "epoch": 0.954983922829582, + "grad_norm": 1.2380964756011963, + "learning_rate": 5.24818401937046e-05, + "loss": 1.1699, + "step": 11880 + }, + { + "epoch": 0.9557877813504824, + "grad_norm": 1.538638949394226, + "learning_rate": 5.2441485068603714e-05, + "loss": 1.3367, + "step": 11890 + }, + { + "epoch": 0.9565916398713826, + "grad_norm": 1.4636310338974, + "learning_rate": 5.2401129943502825e-05, + "loss": 1.2077, + "step": 11900 + }, + { + "epoch": 0.957395498392283, + "grad_norm": 1.2368042469024658, + "learning_rate": 5.236077481840194e-05, + "loss": 1.2506, + "step": 11910 + }, + { + "epoch": 0.9581993569131833, + "grad_norm": 1.5975849628448486, + "learning_rate": 5.232041969330105e-05, + "loss": 1.2262, + "step": 11920 + }, + { + "epoch": 0.9590032154340836, + "grad_norm": 1.2507879734039307, + "learning_rate": 5.228006456820016e-05, + "loss": 1.1564, + "step": 11930 + }, + { + "epoch": 0.9598070739549839, + "grad_norm": 1.380014419555664, + "learning_rate": 5.223970944309927e-05, + "loss": 1.1145, + "step": 11940 + }, + { + "epoch": 0.9606109324758842, + "grad_norm": 1.2493743896484375, + "learning_rate": 
5.219935431799838e-05, + "loss": 1.1854, + "step": 11950 + }, + { + "epoch": 0.9614147909967846, + "grad_norm": 1.350716233253479, + "learning_rate": 5.2158999192897494e-05, + "loss": 1.1534, + "step": 11960 + }, + { + "epoch": 0.9622186495176849, + "grad_norm": NaN, + "learning_rate": 5.21226795803067e-05, + "loss": 1.2298, + "step": 11970 + }, + { + "epoch": 0.9630225080385852, + "grad_norm": 1.5916483402252197, + "learning_rate": 5.208232445520581e-05, + "loss": 1.233, + "step": 11980 + }, + { + "epoch": 0.9638263665594855, + "grad_norm": 1.3891382217407227, + "learning_rate": 5.2041969330104924e-05, + "loss": 1.33, + "step": 11990 + }, + { + "epoch": 0.9646302250803859, + "grad_norm": 1.6548006534576416, + "learning_rate": 5.2001614205004035e-05, + "loss": 1.2027, + "step": 12000 + }, + { + "epoch": 0.9646302250803859, + "eval_yahma/alpaca-cleaned_loss": 1.2050625085830688, + "eval_yahma/alpaca-cleaned_runtime": 115.646, + "eval_yahma/alpaca-cleaned_samples_per_second": 17.294, + "eval_yahma/alpaca-cleaned_steps_per_second": 2.162, + "step": 12000 + }, + { + "epoch": 0.9654340836012861, + "grad_norm": 1.2655686140060425, + "learning_rate": 5.196125907990315e-05, + "loss": 1.1269, + "step": 12010 + }, + { + "epoch": 0.9662379421221865, + "grad_norm": 1.2995678186416626, + "learning_rate": 5.192090395480226e-05, + "loss": 1.1227, + "step": 12020 + }, + { + "epoch": 0.9670418006430869, + "grad_norm": 1.0736733675003052, + "learning_rate": 5.188054882970137e-05, + "loss": 1.2294, + "step": 12030 + }, + { + "epoch": 0.9678456591639871, + "grad_norm": 1.19216787815094, + "learning_rate": 5.184019370460048e-05, + "loss": 1.2031, + "step": 12040 + }, + { + "epoch": 0.9686495176848875, + "grad_norm": 1.541259765625, + "learning_rate": 5.1799838579499606e-05, + "loss": 1.1046, + "step": 12050 + }, + { + "epoch": 0.9694533762057878, + "grad_norm": 1.6200529336929321, + "learning_rate": 5.175948345439872e-05, + "loss": 1.1826, + "step": 12060 + }, + { + "epoch": 0.9702572347266881, + "grad_norm": 1.4181767702102661, + "learning_rate": 5.171912832929783e-05, + "loss": 1.1438, + "step": 12070 + }, + { + "epoch": 0.9710610932475884, + "grad_norm": 1.6790999174118042, + "learning_rate": 5.167877320419694e-05, + "loss": 1.1761, + "step": 12080 + }, + { + "epoch": 0.9718649517684887, + "grad_norm": 1.2507883310317993, + "learning_rate": 5.163841807909605e-05, + "loss": 1.2146, + "step": 12090 + }, + { + "epoch": 0.9726688102893891, + "grad_norm": 1.203534483909607, + "learning_rate": 5.159806295399516e-05, + "loss": 1.3176, + "step": 12100 + }, + { + "epoch": 0.9734726688102894, + "grad_norm": 1.6226143836975098, + "learning_rate": 5.1557707828894274e-05, + "loss": 1.1501, + "step": 12110 + }, + { + "epoch": 0.9742765273311897, + "grad_norm": 1.6095842123031616, + "learning_rate": 5.1517352703793385e-05, + "loss": 1.0896, + "step": 12120 + }, + { + "epoch": 0.97508038585209, + "grad_norm": 1.9545879364013672, + "learning_rate": 5.14769975786925e-05, + "loss": 1.2088, + "step": 12130 + }, + { + "epoch": 0.9758842443729904, + "grad_norm": 1.478491187095642, + "learning_rate": 5.143664245359161e-05, + "loss": 1.1762, + "step": 12140 + }, + { + "epoch": 0.9766881028938906, + "grad_norm": 1.239725947380066, + "learning_rate": 5.139628732849072e-05, + "loss": 1.2697, + "step": 12150 + }, + { + "epoch": 0.977491961414791, + "grad_norm": 1.663704514503479, + "learning_rate": 5.135593220338983e-05, + "loss": 1.2736, + "step": 12160 + }, + { + "epoch": 0.9782958199356914, + "grad_norm": 1.4106169939041138, + 
"learning_rate": 5.131557707828895e-05, + "loss": 1.0439, + "step": 12170 + }, + { + "epoch": 0.9790996784565916, + "grad_norm": 1.6039048433303833, + "learning_rate": 5.127522195318806e-05, + "loss": 1.2114, + "step": 12180 + }, + { + "epoch": 0.979903536977492, + "grad_norm": 1.2211772203445435, + "learning_rate": 5.123486682808717e-05, + "loss": 1.246, + "step": 12190 + }, + { + "epoch": 0.9807073954983923, + "grad_norm": 1.741607904434204, + "learning_rate": 5.119451170298628e-05, + "loss": 1.1022, + "step": 12200 + }, + { + "epoch": 0.9815112540192926, + "grad_norm": 2.1637446880340576, + "learning_rate": 5.1154156577885394e-05, + "loss": 1.1241, + "step": 12210 + }, + { + "epoch": 0.9823151125401929, + "grad_norm": 1.2326878309249878, + "learning_rate": 5.1113801452784506e-05, + "loss": 1.1947, + "step": 12220 + }, + { + "epoch": 0.9831189710610932, + "grad_norm": 1.2424410581588745, + "learning_rate": 5.107344632768362e-05, + "loss": 1.1497, + "step": 12230 + }, + { + "epoch": 0.9839228295819936, + "grad_norm": 1.4629040956497192, + "learning_rate": 5.103309120258273e-05, + "loss": 1.1187, + "step": 12240 + }, + { + "epoch": 0.9847266881028939, + "grad_norm": 1.7737675905227661, + "learning_rate": 5.099273607748184e-05, + "loss": 1.1671, + "step": 12250 + }, + { + "epoch": 0.9855305466237942, + "grad_norm": 2.268542766571045, + "learning_rate": 5.095238095238095e-05, + "loss": 1.1304, + "step": 12260 + }, + { + "epoch": 0.9863344051446945, + "grad_norm": 1.690609097480774, + "learning_rate": 5.091202582728006e-05, + "loss": 1.1014, + "step": 12270 + }, + { + "epoch": 0.9871382636655949, + "grad_norm": 1.7486388683319092, + "learning_rate": 5.0871670702179174e-05, + "loss": 1.1526, + "step": 12280 + }, + { + "epoch": 0.9879421221864951, + "grad_norm": 1.747187614440918, + "learning_rate": 5.0831315577078285e-05, + "loss": 0.9875, + "step": 12290 + }, + { + "epoch": 0.9887459807073955, + "grad_norm": 2.9285736083984375, + "learning_rate": 5.0790960451977397e-05, + "loss": 1.1122, + "step": 12300 + }, + { + "epoch": 0.9895498392282959, + "grad_norm": 1.5583657026290894, + "learning_rate": 5.075060532687651e-05, + "loss": 1.1447, + "step": 12310 + }, + { + "epoch": 0.9903536977491961, + "grad_norm": 1.8540047407150269, + "learning_rate": 5.071025020177563e-05, + "loss": 1.1729, + "step": 12320 + }, + { + "epoch": 0.9911575562700965, + "grad_norm": 1.5402899980545044, + "learning_rate": 5.0669895076674744e-05, + "loss": 1.1523, + "step": 12330 + }, + { + "epoch": 0.9919614147909968, + "grad_norm": 1.3348851203918457, + "learning_rate": 5.0629539951573856e-05, + "loss": 1.2239, + "step": 12340 + }, + { + "epoch": 0.9927652733118971, + "grad_norm": 1.8120102882385254, + "learning_rate": 5.058918482647297e-05, + "loss": 1.2171, + "step": 12350 + }, + { + "epoch": 0.9935691318327974, + "grad_norm": 1.2614649534225464, + "learning_rate": 5.054882970137208e-05, + "loss": 1.22, + "step": 12360 + }, + { + "epoch": 0.9943729903536977, + "grad_norm": 1.311959981918335, + "learning_rate": 5.050847457627119e-05, + "loss": 1.0745, + "step": 12370 + }, + { + "epoch": 0.9951768488745981, + "grad_norm": 1.1874438524246216, + "learning_rate": 5.04681194511703e-05, + "loss": 1.2944, + "step": 12380 + }, + { + "epoch": 0.9959807073954984, + "grad_norm": 1.3048447370529175, + "learning_rate": 5.042776432606941e-05, + "loss": 1.1719, + "step": 12390 + }, + { + "epoch": 0.9967845659163987, + "grad_norm": 1.0826609134674072, + "learning_rate": 5.038740920096853e-05, + "loss": 1.1412, + "step": 12400 + }, + { + 
"epoch": 0.997588424437299, + "grad_norm": 1.49656343460083, + "learning_rate": 5.034705407586764e-05, + "loss": 1.1898, + "step": 12410 + }, + { + "epoch": 0.9983922829581994, + "grad_norm": 1.256178617477417, + "learning_rate": 5.030669895076675e-05, + "loss": 1.1431, + "step": 12420 + }, + { + "epoch": 0.9991961414790996, + "grad_norm": 1.5236985683441162, + "learning_rate": 5.0266343825665865e-05, + "loss": 1.1061, + "step": 12430 + }, + { + "epoch": 1.0, + "grad_norm": 1.45755934715271, + "learning_rate": 5.0225988700564976e-05, + "loss": 1.1797, + "step": 12440 + }, + { + "epoch": 1.0008038585209003, + "grad_norm": 1.336484670639038, + "learning_rate": 5.018563357546409e-05, + "loss": 0.9872, + "step": 12450 + }, + { + "epoch": 1.0016077170418007, + "grad_norm": 1.2816264629364014, + "learning_rate": 5.01452784503632e-05, + "loss": 1.0556, + "step": 12460 + }, + { + "epoch": 1.002411575562701, + "grad_norm": 2.1287145614624023, + "learning_rate": 5.010492332526231e-05, + "loss": 1.1472, + "step": 12470 + }, + { + "epoch": 1.0032154340836013, + "grad_norm": 2.0706117153167725, + "learning_rate": 5.006456820016142e-05, + "loss": 1.002, + "step": 12480 + }, + { + "epoch": 1.0040192926045015, + "grad_norm": 1.6112709045410156, + "learning_rate": 5.002421307506053e-05, + "loss": 0.9798, + "step": 12490 + }, + { + "epoch": 1.004823151125402, + "grad_norm": 2.151376247406006, + "learning_rate": 4.9983857949959644e-05, + "loss": 0.9878, + "step": 12500 + }, + { + "epoch": 1.0056270096463023, + "grad_norm": 2.537773609161377, + "learning_rate": 4.994350282485876e-05, + "loss": 1.1223, + "step": 12510 + }, + { + "epoch": 1.0064308681672025, + "grad_norm": 1.542268991470337, + "learning_rate": 4.9903147699757874e-05, + "loss": 1.1358, + "step": 12520 + }, + { + "epoch": 1.007234726688103, + "grad_norm": 2.475477695465088, + "learning_rate": 4.9862792574656985e-05, + "loss": 1.0899, + "step": 12530 + }, + { + "epoch": 1.0080385852090032, + "grad_norm": 1.7700741291046143, + "learning_rate": 4.9822437449556096e-05, + "loss": 1.1545, + "step": 12540 + }, + { + "epoch": 1.0088424437299035, + "grad_norm": 1.5795843601226807, + "learning_rate": 4.978208232445521e-05, + "loss": 1.1137, + "step": 12550 + }, + { + "epoch": 1.0096463022508038, + "grad_norm": 1.2464009523391724, + "learning_rate": 4.974172719935432e-05, + "loss": 1.0397, + "step": 12560 + }, + { + "epoch": 1.0104501607717042, + "grad_norm": 1.3018213510513306, + "learning_rate": 4.970137207425343e-05, + "loss": 1.0142, + "step": 12570 + }, + { + "epoch": 1.0112540192926045, + "grad_norm": 1.30471932888031, + "learning_rate": 4.966101694915254e-05, + "loss": 0.9567, + "step": 12580 + }, + { + "epoch": 1.0120578778135048, + "grad_norm": 1.6906229257583618, + "learning_rate": 4.962066182405165e-05, + "loss": 0.8977, + "step": 12590 + }, + { + "epoch": 1.0128617363344052, + "grad_norm": 1.269845724105835, + "learning_rate": 4.958030669895077e-05, + "loss": 1.0785, + "step": 12600 + }, + { + "epoch": 1.0136655948553055, + "grad_norm": 1.7904671430587769, + "learning_rate": 4.953995157384988e-05, + "loss": 0.9146, + "step": 12610 + }, + { + "epoch": 1.0144694533762058, + "grad_norm": 2.0014989376068115, + "learning_rate": 4.9499596448748994e-05, + "loss": 1.0232, + "step": 12620 + }, + { + "epoch": 1.015273311897106, + "grad_norm": 1.4139068126678467, + "learning_rate": 4.9459241323648106e-05, + "loss": 1.0038, + "step": 12630 + }, + { + "epoch": 1.0160771704180065, + "grad_norm": 1.4505324363708496, + "learning_rate": 4.941888619854722e-05, + 
"loss": 0.9644, + "step": 12640 + }, + { + "epoch": 1.0168810289389068, + "grad_norm": 1.2977114915847778, + "learning_rate": 4.9378531073446335e-05, + "loss": 1.1053, + "step": 12650 + }, + { + "epoch": 1.017684887459807, + "grad_norm": 1.6965181827545166, + "learning_rate": 4.9338175948345446e-05, + "loss": 1.0649, + "step": 12660 + }, + { + "epoch": 1.0184887459807075, + "grad_norm": 1.7907180786132812, + "learning_rate": 4.929782082324456e-05, + "loss": 0.9434, + "step": 12670 + }, + { + "epoch": 1.0192926045016077, + "grad_norm": 1.6487858295440674, + "learning_rate": 4.925746569814367e-05, + "loss": 1.0578, + "step": 12680 + }, + { + "epoch": 1.020096463022508, + "grad_norm": 2.0607364177703857, + "learning_rate": 4.921711057304278e-05, + "loss": 0.9846, + "step": 12690 + }, + { + "epoch": 1.0209003215434083, + "grad_norm": 1.3130086660385132, + "learning_rate": 4.917675544794189e-05, + "loss": 1.1002, + "step": 12700 + }, + { + "epoch": 1.0217041800643087, + "grad_norm": 1.415685772895813, + "learning_rate": 4.9136400322841e-05, + "loss": 1.0871, + "step": 12710 + }, + { + "epoch": 1.022508038585209, + "grad_norm": 1.438016653060913, + "learning_rate": 4.9096045197740115e-05, + "loss": 1.0543, + "step": 12720 + }, + { + "epoch": 1.0233118971061093, + "grad_norm": 2.078965663909912, + "learning_rate": 4.9055690072639226e-05, + "loss": 0.9583, + "step": 12730 + }, + { + "epoch": 1.0241157556270097, + "grad_norm": 1.4178944826126099, + "learning_rate": 4.901533494753834e-05, + "loss": 0.901, + "step": 12740 + }, + { + "epoch": 1.02491961414791, + "grad_norm": 2.025350570678711, + "learning_rate": 4.897497982243745e-05, + "loss": 1.0372, + "step": 12750 + }, + { + "epoch": 1.0257234726688103, + "grad_norm": 1.9596234560012817, + "learning_rate": 4.893462469733656e-05, + "loss": 1.048, + "step": 12760 + }, + { + "epoch": 1.0265273311897105, + "grad_norm": 1.2961586713790894, + "learning_rate": 4.889426957223567e-05, + "loss": 1.0288, + "step": 12770 + }, + { + "epoch": 1.027331189710611, + "grad_norm": 1.3893296718597412, + "learning_rate": 4.885391444713479e-05, + "loss": 1.0256, + "step": 12780 + }, + { + "epoch": 1.0281350482315113, + "grad_norm": 1.41170072555542, + "learning_rate": 4.88135593220339e-05, + "loss": 0.9853, + "step": 12790 + }, + { + "epoch": 1.0289389067524115, + "grad_norm": 1.3405449390411377, + "learning_rate": 4.877320419693301e-05, + "loss": 0.9838, + "step": 12800 + }, + { + "epoch": 1.029742765273312, + "grad_norm": 1.413405179977417, + "learning_rate": 4.8732849071832124e-05, + "loss": 1.0329, + "step": 12810 + }, + { + "epoch": 1.0305466237942122, + "grad_norm": 1.324311375617981, + "learning_rate": 4.8692493946731235e-05, + "loss": 1.0136, + "step": 12820 + }, + { + "epoch": 1.0313504823151125, + "grad_norm": 1.9724149703979492, + "learning_rate": 4.8652138821630346e-05, + "loss": 0.9798, + "step": 12830 + }, + { + "epoch": 1.0321543408360128, + "grad_norm": 2.3491575717926025, + "learning_rate": 4.8611783696529465e-05, + "loss": 0.9165, + "step": 12840 + }, + { + "epoch": 1.0329581993569132, + "grad_norm": 1.1819404363632202, + "learning_rate": 4.8571428571428576e-05, + "loss": 1.0794, + "step": 12850 + }, + { + "epoch": 1.0337620578778135, + "grad_norm": 1.3069233894348145, + "learning_rate": 4.853107344632769e-05, + "loss": 1.0239, + "step": 12860 + }, + { + "epoch": 1.0345659163987138, + "grad_norm": 1.3474875688552856, + "learning_rate": 4.84907183212268e-05, + "loss": 1.0174, + "step": 12870 + }, + { + "epoch": 1.0353697749196142, + "grad_norm": 
2.287572145462036, + "learning_rate": 4.845036319612591e-05, + "loss": 0.8799, + "step": 12880 + }, + { + "epoch": 1.0361736334405145, + "grad_norm": 1.2526694536209106, + "learning_rate": 4.841000807102502e-05, + "loss": 1.0667, + "step": 12890 + }, + { + "epoch": 1.0369774919614148, + "grad_norm": 1.3657410144805908, + "learning_rate": 4.836965294592413e-05, + "loss": 1.1423, + "step": 12900 + }, + { + "epoch": 1.037781350482315, + "grad_norm": 1.8211760520935059, + "learning_rate": 4.832929782082325e-05, + "loss": 1.0129, + "step": 12910 + }, + { + "epoch": 1.0385852090032155, + "grad_norm": 1.362104058265686, + "learning_rate": 4.828894269572236e-05, + "loss": 0.9487, + "step": 12920 + }, + { + "epoch": 1.0393890675241158, + "grad_norm": 1.7094135284423828, + "learning_rate": 4.8248587570621474e-05, + "loss": 1.0893, + "step": 12930 + }, + { + "epoch": 1.040192926045016, + "grad_norm": 1.6447094678878784, + "learning_rate": 4.8208232445520585e-05, + "loss": 0.9119, + "step": 12940 + }, + { + "epoch": 1.0409967845659165, + "grad_norm": 1.317784070968628, + "learning_rate": 4.8167877320419696e-05, + "loss": 1.0388, + "step": 12950 + }, + { + "epoch": 1.0418006430868167, + "grad_norm": 3.135857105255127, + "learning_rate": 4.812752219531881e-05, + "loss": 0.9671, + "step": 12960 + }, + { + "epoch": 1.042604501607717, + "grad_norm": 1.596756935119629, + "learning_rate": 4.808716707021792e-05, + "loss": 0.956, + "step": 12970 + }, + { + "epoch": 1.0434083601286173, + "grad_norm": 1.973957896232605, + "learning_rate": 4.804681194511703e-05, + "loss": 1.1145, + "step": 12980 + }, + { + "epoch": 1.0442122186495177, + "grad_norm": 1.675784707069397, + "learning_rate": 4.800645682001614e-05, + "loss": 0.9583, + "step": 12990 + }, + { + "epoch": 1.045016077170418, + "grad_norm": 2.1188571453094482, + "learning_rate": 4.796610169491525e-05, + "loss": 1.2259, + "step": 13000 + }, + { + "epoch": 1.0458199356913183, + "grad_norm": 2.490846872329712, + "learning_rate": 4.7925746569814365e-05, + "loss": 1.0112, + "step": 13010 + }, + { + "epoch": 1.0466237942122187, + "grad_norm": 2.144216299057007, + "learning_rate": 4.7885391444713476e-05, + "loss": 0.9891, + "step": 13020 + }, + { + "epoch": 1.047427652733119, + "grad_norm": 1.3466781377792358, + "learning_rate": 4.784503631961259e-05, + "loss": 1.0314, + "step": 13030 + }, + { + "epoch": 1.0482315112540193, + "grad_norm": 1.44432532787323, + "learning_rate": 4.7804681194511705e-05, + "loss": 1.028, + "step": 13040 + }, + { + "epoch": 1.0490353697749195, + "grad_norm": 1.6085883378982544, + "learning_rate": 4.776432606941082e-05, + "loss": 1.0979, + "step": 13050 + }, + { + "epoch": 1.04983922829582, + "grad_norm": 2.7709875106811523, + "learning_rate": 4.772397094430993e-05, + "loss": 0.9387, + "step": 13060 + }, + { + "epoch": 1.0506430868167203, + "grad_norm": 1.5809946060180664, + "learning_rate": 4.7683615819209046e-05, + "loss": 1.0414, + "step": 13070 + }, + { + "epoch": 1.0514469453376205, + "grad_norm": 3.7606260776519775, + "learning_rate": 4.764326069410816e-05, + "loss": 1.1012, + "step": 13080 + }, + { + "epoch": 1.052250803858521, + "grad_norm": 1.3225197792053223, + "learning_rate": 4.760290556900727e-05, + "loss": 0.9132, + "step": 13090 + }, + { + "epoch": 1.0530546623794212, + "grad_norm": 1.6168928146362305, + "learning_rate": 4.756255044390638e-05, + "loss": 0.9281, + "step": 13100 + }, + { + "epoch": 1.0538585209003215, + "grad_norm": 1.2707538604736328, + "learning_rate": 4.752219531880549e-05, + "loss": 1.0895, + "step": 13110 
+ }, + { + "epoch": 1.0546623794212218, + "grad_norm": 1.8395226001739502, + "learning_rate": 4.74818401937046e-05, + "loss": 1.1246, + "step": 13120 + }, + { + "epoch": 1.0554662379421222, + "grad_norm": 1.9540048837661743, + "learning_rate": 4.7441485068603714e-05, + "loss": 0.8961, + "step": 13130 + }, + { + "epoch": 1.0562700964630225, + "grad_norm": 3.629779100418091, + "learning_rate": 4.7401129943502826e-05, + "loss": 0.9347, + "step": 13140 + }, + { + "epoch": 1.0570739549839228, + "grad_norm": 1.8094236850738525, + "learning_rate": 4.736077481840194e-05, + "loss": 0.9579, + "step": 13150 + }, + { + "epoch": 1.0578778135048232, + "grad_norm": 1.5016402006149292, + "learning_rate": 4.732041969330105e-05, + "loss": 0.9967, + "step": 13160 + }, + { + "epoch": 1.0586816720257235, + "grad_norm": 3.5953845977783203, + "learning_rate": 4.728006456820017e-05, + "loss": 0.9922, + "step": 13170 + }, + { + "epoch": 1.0594855305466238, + "grad_norm": 1.4096349477767944, + "learning_rate": 4.723970944309928e-05, + "loss": 1.0291, + "step": 13180 + }, + { + "epoch": 1.060289389067524, + "grad_norm": 1.606210470199585, + "learning_rate": 4.719935431799839e-05, + "loss": 1.0506, + "step": 13190 + }, + { + "epoch": 1.0610932475884245, + "grad_norm": 1.6466732025146484, + "learning_rate": 4.71589991928975e-05, + "loss": 1.0332, + "step": 13200 + }, + { + "epoch": 1.0618971061093248, + "grad_norm": 1.3050963878631592, + "learning_rate": 4.711864406779661e-05, + "loss": 1.0517, + "step": 13210 + }, + { + "epoch": 1.062700964630225, + "grad_norm": 1.3689507246017456, + "learning_rate": 4.7078288942695723e-05, + "loss": 1.0825, + "step": 13220 + }, + { + "epoch": 1.0635048231511255, + "grad_norm": 1.8197163343429565, + "learning_rate": 4.7037933817594835e-05, + "loss": 1.0044, + "step": 13230 + }, + { + "epoch": 1.0643086816720257, + "grad_norm": 1.6934906244277954, + "learning_rate": 4.6997578692493946e-05, + "loss": 0.9633, + "step": 13240 + }, + { + "epoch": 1.065112540192926, + "grad_norm": 2.1707937717437744, + "learning_rate": 4.695722356739306e-05, + "loss": 1.1439, + "step": 13250 + }, + { + "epoch": 1.0659163987138263, + "grad_norm": 1.2857168912887573, + "learning_rate": 4.691686844229217e-05, + "loss": 1.079, + "step": 13260 + }, + { + "epoch": 1.0667202572347267, + "grad_norm": 1.3009968996047974, + "learning_rate": 4.687651331719129e-05, + "loss": 0.9304, + "step": 13270 + }, + { + "epoch": 1.067524115755627, + "grad_norm": 2.0484039783477783, + "learning_rate": 4.68361581920904e-05, + "loss": 1.0075, + "step": 13280 + }, + { + "epoch": 1.0683279742765273, + "grad_norm": 2.504826545715332, + "learning_rate": 4.679580306698951e-05, + "loss": 0.9569, + "step": 13290 + }, + { + "epoch": 1.0691318327974277, + "grad_norm": 1.8763548135757446, + "learning_rate": 4.675544794188862e-05, + "loss": 1.0155, + "step": 13300 + }, + { + "epoch": 1.069935691318328, + "grad_norm": 1.8800406455993652, + "learning_rate": 4.671509281678774e-05, + "loss": 0.9565, + "step": 13310 + }, + { + "epoch": 1.0707395498392283, + "grad_norm": 1.3298194408416748, + "learning_rate": 4.667473769168685e-05, + "loss": 0.9434, + "step": 13320 + }, + { + "epoch": 1.0715434083601285, + "grad_norm": 1.45872163772583, + "learning_rate": 4.663438256658596e-05, + "loss": 0.9882, + "step": 13330 + }, + { + "epoch": 1.072347266881029, + "grad_norm": 2.167778730392456, + "learning_rate": 4.6594027441485073e-05, + "loss": 1.0759, + "step": 13340 + }, + { + "epoch": 1.0731511254019293, + "grad_norm": 2.0066630840301514, + 
"learning_rate": 4.6553672316384185e-05, + "loss": 0.9653, + "step": 13350 + }, + { + "epoch": 1.0739549839228295, + "grad_norm": 1.3754826784133911, + "learning_rate": 4.6513317191283296e-05, + "loss": 0.9961, + "step": 13360 + }, + { + "epoch": 1.07475884244373, + "grad_norm": 1.6868056058883667, + "learning_rate": 4.647296206618241e-05, + "loss": 0.9634, + "step": 13370 + }, + { + "epoch": 1.0755627009646302, + "grad_norm": 4.323541641235352, + "learning_rate": 4.643260694108152e-05, + "loss": 0.9814, + "step": 13380 + }, + { + "epoch": 1.0763665594855305, + "grad_norm": 1.3567148447036743, + "learning_rate": 4.639225181598063e-05, + "loss": 1.0566, + "step": 13390 + }, + { + "epoch": 1.077170418006431, + "grad_norm": 1.4276680946350098, + "learning_rate": 4.635189669087974e-05, + "loss": 1.0006, + "step": 13400 + }, + { + "epoch": 1.0779742765273312, + "grad_norm": 1.8578076362609863, + "learning_rate": 4.631154156577885e-05, + "loss": 0.8861, + "step": 13410 + }, + { + "epoch": 1.0787781350482315, + "grad_norm": 1.929571270942688, + "learning_rate": 4.6271186440677964e-05, + "loss": 0.9064, + "step": 13420 + }, + { + "epoch": 1.0795819935691318, + "grad_norm": 1.605181336402893, + "learning_rate": 4.6230831315577076e-05, + "loss": 1.034, + "step": 13430 + }, + { + "epoch": 1.0803858520900322, + "grad_norm": 1.9241151809692383, + "learning_rate": 4.6190476190476194e-05, + "loss": 1.1215, + "step": 13440 + }, + { + "epoch": 1.0811897106109325, + "grad_norm": 3.210650682449341, + "learning_rate": 4.6150121065375305e-05, + "loss": 0.9467, + "step": 13450 + }, + { + "epoch": 1.0819935691318328, + "grad_norm": 1.4049782752990723, + "learning_rate": 4.6109765940274417e-05, + "loss": 1.1126, + "step": 13460 + }, + { + "epoch": 1.082797427652733, + "grad_norm": 1.3904056549072266, + "learning_rate": 4.606941081517353e-05, + "loss": 0.9465, + "step": 13470 + }, + { + "epoch": 1.0836012861736335, + "grad_norm": 1.7483136653900146, + "learning_rate": 4.602905569007264e-05, + "loss": 1.0509, + "step": 13480 + }, + { + "epoch": 1.0844051446945338, + "grad_norm": 1.6487077474594116, + "learning_rate": 4.598870056497175e-05, + "loss": 1.0358, + "step": 13490 + }, + { + "epoch": 1.085209003215434, + "grad_norm": 1.433251976966858, + "learning_rate": 4.594834543987087e-05, + "loss": 0.9653, + "step": 13500 + }, + { + "epoch": 1.0860128617363345, + "grad_norm": 1.551595687866211, + "learning_rate": 4.590799031476998e-05, + "loss": 1.0813, + "step": 13510 + }, + { + "epoch": 1.0868167202572347, + "grad_norm": 1.8778648376464844, + "learning_rate": 4.586763518966909e-05, + "loss": 1.1041, + "step": 13520 + }, + { + "epoch": 1.087620578778135, + "grad_norm": 1.5723669528961182, + "learning_rate": 4.58272800645682e-05, + "loss": 1.1798, + "step": 13530 + }, + { + "epoch": 1.0884244372990355, + "grad_norm": 1.3904507160186768, + "learning_rate": 4.5786924939467314e-05, + "loss": 1.0971, + "step": 13540 + }, + { + "epoch": 1.0892282958199357, + "grad_norm": 2.491220712661743, + "learning_rate": 4.5746569814366426e-05, + "loss": 1.0707, + "step": 13550 + }, + { + "epoch": 1.090032154340836, + "grad_norm": 1.688567042350769, + "learning_rate": 4.570621468926554e-05, + "loss": 0.9696, + "step": 13560 + }, + { + "epoch": 1.0908360128617363, + "grad_norm": 1.3612911701202393, + "learning_rate": 4.5665859564164655e-05, + "loss": 1.0198, + "step": 13570 + }, + { + "epoch": 1.0916398713826367, + "grad_norm": 1.8947391510009766, + "learning_rate": 4.5625504439063767e-05, + "loss": 0.8653, + "step": 13580 + }, + { + 
"epoch": 1.092443729903537, + "grad_norm": 1.4824692010879517, + "learning_rate": 4.558514931396288e-05, + "loss": 1.1218, + "step": 13590 + }, + { + "epoch": 1.0932475884244373, + "grad_norm": 1.7231032848358154, + "learning_rate": 4.554479418886199e-05, + "loss": 1.0036, + "step": 13600 + }, + { + "epoch": 1.0940514469453375, + "grad_norm": 1.3887447118759155, + "learning_rate": 4.55044390637611e-05, + "loss": 1.0402, + "step": 13610 + }, + { + "epoch": 1.094855305466238, + "grad_norm": 1.8225650787353516, + "learning_rate": 4.546408393866021e-05, + "loss": 0.8615, + "step": 13620 + }, + { + "epoch": 1.0956591639871383, + "grad_norm": 2.701054573059082, + "learning_rate": 4.542372881355932e-05, + "loss": 0.9572, + "step": 13630 + }, + { + "epoch": 1.0964630225080385, + "grad_norm": 1.414825201034546, + "learning_rate": 4.5383373688458435e-05, + "loss": 1.0855, + "step": 13640 + }, + { + "epoch": 1.097266881028939, + "grad_norm": 2.057631731033325, + "learning_rate": 4.5343018563357546e-05, + "loss": 1.0737, + "step": 13650 + }, + { + "epoch": 1.0980707395498392, + "grad_norm": 2.5682103633880615, + "learning_rate": 4.530266343825666e-05, + "loss": 0.9428, + "step": 13660 + }, + { + "epoch": 1.0988745980707395, + "grad_norm": 1.64361572265625, + "learning_rate": 4.526230831315577e-05, + "loss": 1.1099, + "step": 13670 + }, + { + "epoch": 1.09967845659164, + "grad_norm": 1.4092881679534912, + "learning_rate": 4.522195318805488e-05, + "loss": 1.0326, + "step": 13680 + }, + { + "epoch": 1.1004823151125402, + "grad_norm": 1.314833641052246, + "learning_rate": 4.518159806295399e-05, + "loss": 0.9389, + "step": 13690 + }, + { + "epoch": 1.1012861736334405, + "grad_norm": 1.7281252145767212, + "learning_rate": 4.514124293785311e-05, + "loss": 1.096, + "step": 13700 + }, + { + "epoch": 1.1020900321543408, + "grad_norm": 1.3276362419128418, + "learning_rate": 4.510088781275222e-05, + "loss": 0.9756, + "step": 13710 + }, + { + "epoch": 1.1028938906752412, + "grad_norm": 1.3409945964813232, + "learning_rate": 4.506053268765133e-05, + "loss": 1.0168, + "step": 13720 + }, + { + "epoch": 1.1036977491961415, + "grad_norm": 1.5758594274520874, + "learning_rate": 4.502017756255045e-05, + "loss": 0.9559, + "step": 13730 + }, + { + "epoch": 1.1045016077170418, + "grad_norm": 1.2208436727523804, + "learning_rate": 4.497982243744956e-05, + "loss": 0.9951, + "step": 13740 + }, + { + "epoch": 1.105305466237942, + "grad_norm": 2.6655726432800293, + "learning_rate": 4.493946731234867e-05, + "loss": 1.0113, + "step": 13750 + }, + { + "epoch": 1.1061093247588425, + "grad_norm": 2.250619888305664, + "learning_rate": 4.4899112187247785e-05, + "loss": 0.8718, + "step": 13760 + }, + { + "epoch": 1.1069131832797428, + "grad_norm": 1.9622561931610107, + "learning_rate": 4.4858757062146896e-05, + "loss": 1.0055, + "step": 13770 + }, + { + "epoch": 1.107717041800643, + "grad_norm": 1.6765295267105103, + "learning_rate": 4.48224374495561e-05, + "loss": 1.0636, + "step": 13780 + }, + { + "epoch": 1.1085209003215435, + "grad_norm": 1.735635757446289, + "learning_rate": 4.478208232445521e-05, + "loss": 0.9623, + "step": 13790 + }, + { + "epoch": 1.1093247588424437, + "grad_norm": 1.2861146926879883, + "learning_rate": 4.474172719935432e-05, + "loss": 1.0217, + "step": 13800 + }, + { + "epoch": 1.110128617363344, + "grad_norm": 1.6345241069793701, + "learning_rate": 4.470137207425343e-05, + "loss": 1.0118, + "step": 13810 + }, + { + "epoch": 1.1109324758842445, + "grad_norm": 1.7163978815078735, + "learning_rate": 
4.466101694915254e-05, + "loss": 1.0795, + "step": 13820 + }, + { + "epoch": 1.1117363344051447, + "grad_norm": 1.987023115158081, + "learning_rate": 4.462066182405166e-05, + "loss": 0.9164, + "step": 13830 + }, + { + "epoch": 1.112540192926045, + "grad_norm": 1.6738214492797852, + "learning_rate": 4.458030669895077e-05, + "loss": 0.9628, + "step": 13840 + }, + { + "epoch": 1.1133440514469453, + "grad_norm": 1.9623785018920898, + "learning_rate": 4.453995157384988e-05, + "loss": 1.0293, + "step": 13850 + }, + { + "epoch": 1.1141479099678457, + "grad_norm": 2.1075973510742188, + "learning_rate": 4.4499596448748995e-05, + "loss": 1.0122, + "step": 13860 + }, + { + "epoch": 1.114951768488746, + "grad_norm": 1.572359323501587, + "learning_rate": 4.4459241323648106e-05, + "loss": 1.1115, + "step": 13870 + }, + { + "epoch": 1.1157556270096463, + "grad_norm": 1.8940703868865967, + "learning_rate": 4.441888619854722e-05, + "loss": 1.0937, + "step": 13880 + }, + { + "epoch": 1.1165594855305465, + "grad_norm": 1.657184362411499, + "learning_rate": 4.437853107344633e-05, + "loss": 0.8793, + "step": 13890 + }, + { + "epoch": 1.117363344051447, + "grad_norm": 1.267751693725586, + "learning_rate": 4.433817594834544e-05, + "loss": 0.9959, + "step": 13900 + }, + { + "epoch": 1.1181672025723473, + "grad_norm": 2.4804649353027344, + "learning_rate": 4.429782082324456e-05, + "loss": 0.9934, + "step": 13910 + }, + { + "epoch": 1.1189710610932475, + "grad_norm": 1.7541329860687256, + "learning_rate": 4.425746569814367e-05, + "loss": 1.0165, + "step": 13920 + }, + { + "epoch": 1.119774919614148, + "grad_norm": 1.3630015850067139, + "learning_rate": 4.421711057304278e-05, + "loss": 1.098, + "step": 13930 + }, + { + "epoch": 1.1205787781350482, + "grad_norm": 1.548208236694336, + "learning_rate": 4.417675544794189e-05, + "loss": 1.1485, + "step": 13940 + }, + { + "epoch": 1.1213826366559485, + "grad_norm": 2.2378597259521484, + "learning_rate": 4.4136400322841004e-05, + "loss": 0.9316, + "step": 13950 + }, + { + "epoch": 1.122186495176849, + "grad_norm": 1.7924768924713135, + "learning_rate": 4.4096045197740115e-05, + "loss": 0.9349, + "step": 13960 + }, + { + "epoch": 1.1229903536977492, + "grad_norm": 1.8515634536743164, + "learning_rate": 4.4055690072639226e-05, + "loss": 1.0372, + "step": 13970 + }, + { + "epoch": 1.1237942122186495, + "grad_norm": 1.59554922580719, + "learning_rate": 4.401533494753834e-05, + "loss": 0.951, + "step": 13980 + }, + { + "epoch": 1.1245980707395498, + "grad_norm": 1.879597783088684, + "learning_rate": 4.397497982243745e-05, + "loss": 1.0449, + "step": 13990 + }, + { + "epoch": 1.1254019292604502, + "grad_norm": 2.153693199157715, + "learning_rate": 4.393462469733656e-05, + "loss": 1.0486, + "step": 14000 + }, + { + "epoch": 1.1254019292604502, + "eval_yahma/alpaca-cleaned_loss": 1.2180111408233643, + "eval_yahma/alpaca-cleaned_runtime": 115.6253, + "eval_yahma/alpaca-cleaned_samples_per_second": 17.297, + "eval_yahma/alpaca-cleaned_steps_per_second": 2.162, + "step": 14000 + }, + { + "epoch": 1.1262057877813505, + "grad_norm": 1.7476816177368164, + "learning_rate": 4.389426957223567e-05, + "loss": 1.0463, + "step": 14010 + }, + { + "epoch": 1.1270096463022508, + "grad_norm": 1.2039158344268799, + "learning_rate": 4.385391444713478e-05, + "loss": 0.8675, + "step": 14020 + }, + { + "epoch": 1.127813504823151, + "grad_norm": 1.786009669303894, + "learning_rate": 4.38135593220339e-05, + "loss": 0.9991, + "step": 14030 + }, + { + "epoch": 1.1286173633440515, + "grad_norm": 
1.2885710000991821, + "learning_rate": 4.377320419693301e-05, + "loss": 1.1352, + "step": 14040 + }, + { + "epoch": 1.1294212218649518, + "grad_norm": 2.1474835872650146, + "learning_rate": 4.3732849071832124e-05, + "loss": 0.9888, + "step": 14050 + }, + { + "epoch": 1.130225080385852, + "grad_norm": 1.4155480861663818, + "learning_rate": 4.369249394673124e-05, + "loss": 1.0412, + "step": 14060 + }, + { + "epoch": 1.1310289389067525, + "grad_norm": 1.7451374530792236, + "learning_rate": 4.3652138821630354e-05, + "loss": 1.0104, + "step": 14070 + }, + { + "epoch": 1.1318327974276527, + "grad_norm": 2.5622613430023193, + "learning_rate": 4.3611783696529465e-05, + "loss": 1.0273, + "step": 14080 + }, + { + "epoch": 1.132636655948553, + "grad_norm": 1.6344316005706787, + "learning_rate": 4.3571428571428576e-05, + "loss": 0.9315, + "step": 14090 + }, + { + "epoch": 1.1334405144694535, + "grad_norm": 1.552697777748108, + "learning_rate": 4.353107344632769e-05, + "loss": 1.033, + "step": 14100 + }, + { + "epoch": 1.1342443729903537, + "grad_norm": 2.1421780586242676, + "learning_rate": 4.34907183212268e-05, + "loss": 1.0476, + "step": 14110 + }, + { + "epoch": 1.135048231511254, + "grad_norm": 1.256295084953308, + "learning_rate": 4.345036319612591e-05, + "loss": 0.973, + "step": 14120 + }, + { + "epoch": 1.1358520900321543, + "grad_norm": 1.2301644086837769, + "learning_rate": 4.341000807102502e-05, + "loss": 1.0045, + "step": 14130 + }, + { + "epoch": 1.1366559485530547, + "grad_norm": 2.543393135070801, + "learning_rate": 4.336965294592413e-05, + "loss": 0.9381, + "step": 14140 + }, + { + "epoch": 1.137459807073955, + "grad_norm": 1.4487563371658325, + "learning_rate": 4.3329297820823245e-05, + "loss": 1.0133, + "step": 14150 + }, + { + "epoch": 1.1382636655948553, + "grad_norm": 3.257087230682373, + "learning_rate": 4.3288942695722356e-05, + "loss": 1.0521, + "step": 14160 + }, + { + "epoch": 1.1390675241157555, + "grad_norm": 3.0251243114471436, + "learning_rate": 4.324858757062147e-05, + "loss": 0.8705, + "step": 14170 + }, + { + "epoch": 1.139871382636656, + "grad_norm": 1.651095986366272, + "learning_rate": 4.3208232445520585e-05, + "loss": 1.0427, + "step": 14180 + }, + { + "epoch": 1.1406752411575563, + "grad_norm": 1.6251319646835327, + "learning_rate": 4.31678773204197e-05, + "loss": 1.0401, + "step": 14190 + }, + { + "epoch": 1.1414790996784565, + "grad_norm": 2.8393075466156006, + "learning_rate": 4.312752219531881e-05, + "loss": 0.929, + "step": 14200 + }, + { + "epoch": 1.142282958199357, + "grad_norm": 1.2079108953475952, + "learning_rate": 4.308716707021792e-05, + "loss": 1.0199, + "step": 14210 + }, + { + "epoch": 1.1430868167202572, + "grad_norm": 1.8249192237854004, + "learning_rate": 4.304681194511703e-05, + "loss": 1.111, + "step": 14220 + }, + { + "epoch": 1.1438906752411575, + "grad_norm": 1.9499272108078003, + "learning_rate": 4.300645682001614e-05, + "loss": 0.9521, + "step": 14230 + }, + { + "epoch": 1.144694533762058, + "grad_norm": 1.579451084136963, + "learning_rate": 4.2966101694915254e-05, + "loss": 0.9798, + "step": 14240 + }, + { + "epoch": 1.1454983922829582, + "grad_norm": 1.9789243936538696, + "learning_rate": 4.2925746569814365e-05, + "loss": 1.0356, + "step": 14250 + }, + { + "epoch": 1.1463022508038585, + "grad_norm": 1.6762830018997192, + "learning_rate": 4.288539144471348e-05, + "loss": 0.9138, + "step": 14260 + }, + { + "epoch": 1.1471061093247588, + "grad_norm": 1.5332313776016235, + "learning_rate": 4.2845036319612594e-05, + "loss": 0.9959, + "step": 
14270 + }, + { + "epoch": 1.1479099678456592, + "grad_norm": 2.1301565170288086, + "learning_rate": 4.2804681194511706e-05, + "loss": 0.9985, + "step": 14280 + }, + { + "epoch": 1.1487138263665595, + "grad_norm": 1.6805988550186157, + "learning_rate": 4.276432606941082e-05, + "loss": 0.9996, + "step": 14290 + }, + { + "epoch": 1.1495176848874598, + "grad_norm": 1.5411735773086548, + "learning_rate": 4.272397094430993e-05, + "loss": 0.9634, + "step": 14300 + }, + { + "epoch": 1.15032154340836, + "grad_norm": 1.270493984222412, + "learning_rate": 4.268361581920905e-05, + "loss": 1.0496, + "step": 14310 + }, + { + "epoch": 1.1511254019292605, + "grad_norm": 4.723905086517334, + "learning_rate": 4.264326069410816e-05, + "loss": 1.0178, + "step": 14320 + }, + { + "epoch": 1.1519292604501608, + "grad_norm": 1.8460774421691895, + "learning_rate": 4.260290556900727e-05, + "loss": 0.8411, + "step": 14330 + }, + { + "epoch": 1.152733118971061, + "grad_norm": 1.428191900253296, + "learning_rate": 4.256255044390638e-05, + "loss": 0.9971, + "step": 14340 + }, + { + "epoch": 1.1535369774919615, + "grad_norm": 2.1410958766937256, + "learning_rate": 4.252219531880549e-05, + "loss": 1.0238, + "step": 14350 + }, + { + "epoch": 1.1543408360128617, + "grad_norm": 2.1243741512298584, + "learning_rate": 4.2481840193704604e-05, + "loss": 1.0217, + "step": 14360 + }, + { + "epoch": 1.155144694533762, + "grad_norm": 1.4340726137161255, + "learning_rate": 4.2441485068603715e-05, + "loss": 1.0448, + "step": 14370 + }, + { + "epoch": 1.1559485530546625, + "grad_norm": 1.64406418800354, + "learning_rate": 4.2401129943502826e-05, + "loss": 1.095, + "step": 14380 + }, + { + "epoch": 1.1567524115755627, + "grad_norm": 1.3270835876464844, + "learning_rate": 4.236077481840194e-05, + "loss": 1.0975, + "step": 14390 + }, + { + "epoch": 1.157556270096463, + "grad_norm": 1.5200716257095337, + "learning_rate": 4.232041969330105e-05, + "loss": 1.1141, + "step": 14400 + }, + { + "epoch": 1.1583601286173633, + "grad_norm": 1.362056016921997, + "learning_rate": 4.228006456820016e-05, + "loss": 1.0216, + "step": 14410 + }, + { + "epoch": 1.1591639871382637, + "grad_norm": 1.8959547281265259, + "learning_rate": 4.223970944309927e-05, + "loss": 0.9394, + "step": 14420 + }, + { + "epoch": 1.159967845659164, + "grad_norm": 1.3526105880737305, + "learning_rate": 4.219935431799838e-05, + "loss": 1.0009, + "step": 14430 + }, + { + "epoch": 1.1607717041800643, + "grad_norm": 1.8486071825027466, + "learning_rate": 4.21589991928975e-05, + "loss": 1.051, + "step": 14440 + }, + { + "epoch": 1.1615755627009645, + "grad_norm": 1.899803638458252, + "learning_rate": 4.211864406779661e-05, + "loss": 0.9083, + "step": 14450 + }, + { + "epoch": 1.162379421221865, + "grad_norm": 1.7194709777832031, + "learning_rate": 4.2078288942695724e-05, + "loss": 1.0118, + "step": 14460 + }, + { + "epoch": 1.1631832797427653, + "grad_norm": 1.7135143280029297, + "learning_rate": 4.2037933817594835e-05, + "loss": 1.0856, + "step": 14470 + }, + { + "epoch": 1.1639871382636655, + "grad_norm": 1.6633415222167969, + "learning_rate": 4.199757869249395e-05, + "loss": 0.9976, + "step": 14480 + }, + { + "epoch": 1.164790996784566, + "grad_norm": 1.6620136499404907, + "learning_rate": 4.195722356739306e-05, + "loss": 1.1468, + "step": 14490 + }, + { + "epoch": 1.1655948553054662, + "grad_norm": 1.4163603782653809, + "learning_rate": 4.1916868442292176e-05, + "loss": 1.0357, + "step": 14500 + }, + { + "epoch": 1.1663987138263665, + "grad_norm": 1.6289397478103638, + 
"learning_rate": 4.187651331719129e-05, + "loss": 1.0538, + "step": 14510 + }, + { + "epoch": 1.167202572347267, + "grad_norm": 1.8465838432312012, + "learning_rate": 4.18361581920904e-05, + "loss": 0.9653, + "step": 14520 + }, + { + "epoch": 1.1680064308681672, + "grad_norm": 1.3394323587417603, + "learning_rate": 4.179580306698951e-05, + "loss": 1.0897, + "step": 14530 + }, + { + "epoch": 1.1688102893890675, + "grad_norm": 1.341867208480835, + "learning_rate": 4.175544794188862e-05, + "loss": 1.028, + "step": 14540 + }, + { + "epoch": 1.1696141479099678, + "grad_norm": 1.6926863193511963, + "learning_rate": 4.171509281678773e-05, + "loss": 1.0484, + "step": 14550 + }, + { + "epoch": 1.1704180064308682, + "grad_norm": 1.8348326683044434, + "learning_rate": 4.1674737691686844e-05, + "loss": 1.0871, + "step": 14560 + }, + { + "epoch": 1.1712218649517685, + "grad_norm": 1.9606865644454956, + "learning_rate": 4.163438256658596e-05, + "loss": 1.0358, + "step": 14570 + }, + { + "epoch": 1.1720257234726688, + "grad_norm": 1.9318088293075562, + "learning_rate": 4.1594027441485074e-05, + "loss": 0.9973, + "step": 14580 + }, + { + "epoch": 1.172829581993569, + "grad_norm": 1.6759283542633057, + "learning_rate": 4.1553672316384185e-05, + "loss": 1.0548, + "step": 14590 + }, + { + "epoch": 1.1736334405144695, + "grad_norm": 2.018908977508545, + "learning_rate": 4.1513317191283297e-05, + "loss": 1.0622, + "step": 14600 + }, + { + "epoch": 1.1744372990353698, + "grad_norm": 2.886817455291748, + "learning_rate": 4.147296206618241e-05, + "loss": 0.9834, + "step": 14610 + }, + { + "epoch": 1.17524115755627, + "grad_norm": 2.012728452682495, + "learning_rate": 4.143260694108152e-05, + "loss": 0.9802, + "step": 14620 + }, + { + "epoch": 1.1760450160771705, + "grad_norm": 2.133406400680542, + "learning_rate": 4.139225181598063e-05, + "loss": 1.074, + "step": 14630 + }, + { + "epoch": 1.1768488745980707, + "grad_norm": 2.4336249828338623, + "learning_rate": 4.135189669087974e-05, + "loss": 0.9832, + "step": 14640 + }, + { + "epoch": 1.177652733118971, + "grad_norm": 1.6935405731201172, + "learning_rate": 4.1311541565778853e-05, + "loss": 1.012, + "step": 14650 + }, + { + "epoch": 1.1784565916398715, + "grad_norm": 1.684714674949646, + "learning_rate": 4.1271186440677965e-05, + "loss": 1.0092, + "step": 14660 + }, + { + "epoch": 1.1792604501607717, + "grad_norm": 1.4697527885437012, + "learning_rate": 4.1230831315577076e-05, + "loss": 1.0159, + "step": 14670 + }, + { + "epoch": 1.180064308681672, + "grad_norm": 1.7050909996032715, + "learning_rate": 4.119047619047619e-05, + "loss": 1.0019, + "step": 14680 + }, + { + "epoch": 1.1808681672025723, + "grad_norm": 1.9618253707885742, + "learning_rate": 4.11501210653753e-05, + "loss": 0.9164, + "step": 14690 + }, + { + "epoch": 1.1816720257234727, + "grad_norm": 1.96260666847229, + "learning_rate": 4.110976594027442e-05, + "loss": 0.9642, + "step": 14700 + }, + { + "epoch": 1.182475884244373, + "grad_norm": 1.8400181531906128, + "learning_rate": 4.106941081517353e-05, + "loss": 0.9593, + "step": 14710 + }, + { + "epoch": 1.1832797427652733, + "grad_norm": 3.5856144428253174, + "learning_rate": 4.102905569007264e-05, + "loss": 1.3208, + "step": 14720 + }, + { + "epoch": 1.1840836012861735, + "grad_norm": 1.7220356464385986, + "learning_rate": 4.098870056497176e-05, + "loss": 1.0172, + "step": 14730 + }, + { + "epoch": 1.184887459807074, + "grad_norm": 1.6482996940612793, + "learning_rate": 4.094834543987087e-05, + "loss": 0.9748, + "step": 14740 + }, + { + "epoch": 
1.1856913183279743, + "grad_norm": 1.687031865119934, + "learning_rate": 4.090799031476998e-05, + "loss": 1.0431, + "step": 14750 + }, + { + "epoch": 1.1864951768488745, + "grad_norm": 1.7135734558105469, + "learning_rate": 4.086763518966909e-05, + "loss": 1.0557, + "step": 14760 + }, + { + "epoch": 1.187299035369775, + "grad_norm": 1.9167877435684204, + "learning_rate": 4.08272800645682e-05, + "loss": 1.0152, + "step": 14770 + }, + { + "epoch": 1.1881028938906752, + "grad_norm": 1.734106421470642, + "learning_rate": 4.0786924939467315e-05, + "loss": 1.0033, + "step": 14780 + }, + { + "epoch": 1.1889067524115755, + "grad_norm": 1.9168775081634521, + "learning_rate": 4.0746569814366426e-05, + "loss": 0.9823, + "step": 14790 + }, + { + "epoch": 1.189710610932476, + "grad_norm": 3.9640393257141113, + "learning_rate": 4.070621468926554e-05, + "loss": 1.0677, + "step": 14800 + }, + { + "epoch": 1.1905144694533762, + "grad_norm": 1.4360038042068481, + "learning_rate": 4.066585956416465e-05, + "loss": 1.0105, + "step": 14810 + }, + { + "epoch": 1.1913183279742765, + "grad_norm": 4.0904541015625, + "learning_rate": 4.062550443906376e-05, + "loss": 0.9963, + "step": 14820 + }, + { + "epoch": 1.1921221864951768, + "grad_norm": 1.6084517240524292, + "learning_rate": 4.058514931396287e-05, + "loss": 0.9998, + "step": 14830 + }, + { + "epoch": 1.1929260450160772, + "grad_norm": 1.730948805809021, + "learning_rate": 4.054479418886199e-05, + "loss": 0.9964, + "step": 14840 + }, + { + "epoch": 1.1937299035369775, + "grad_norm": 1.4679205417633057, + "learning_rate": 4.05044390637611e-05, + "loss": 1.1229, + "step": 14850 + }, + { + "epoch": 1.1945337620578778, + "grad_norm": 2.152657985687256, + "learning_rate": 4.046408393866021e-05, + "loss": 1.0017, + "step": 14860 + }, + { + "epoch": 1.195337620578778, + "grad_norm": 1.3565618991851807, + "learning_rate": 4.0423728813559324e-05, + "loss": 1.0653, + "step": 14870 + }, + { + "epoch": 1.1961414790996785, + "grad_norm": 1.5673973560333252, + "learning_rate": 4.0383373688458435e-05, + "loss": 0.8449, + "step": 14880 + }, + { + "epoch": 1.1969453376205788, + "grad_norm": 4.41684627532959, + "learning_rate": 4.0343018563357547e-05, + "loss": 1.0748, + "step": 14890 + }, + { + "epoch": 1.197749196141479, + "grad_norm": 1.369780421257019, + "learning_rate": 4.030266343825666e-05, + "loss": 0.9427, + "step": 14900 + }, + { + "epoch": 1.1985530546623795, + "grad_norm": 1.9163693189620972, + "learning_rate": 4.026230831315577e-05, + "loss": 1.0125, + "step": 14910 + }, + { + "epoch": 1.1993569131832797, + "grad_norm": 2.2586166858673096, + "learning_rate": 4.022195318805488e-05, + "loss": 1.0345, + "step": 14920 + }, + { + "epoch": 1.20016077170418, + "grad_norm": 1.5923124551773071, + "learning_rate": 4.0181598062954e-05, + "loss": 1.0082, + "step": 14930 + }, + { + "epoch": 1.2009646302250805, + "grad_norm": 1.4561861753463745, + "learning_rate": 4.014124293785311e-05, + "loss": 0.939, + "step": 14940 + }, + { + "epoch": 1.2017684887459807, + "grad_norm": 1.5259567499160767, + "learning_rate": 4.010088781275222e-05, + "loss": 0.9689, + "step": 14950 + }, + { + "epoch": 1.202572347266881, + "grad_norm": 1.632391333580017, + "learning_rate": 4.006053268765133e-05, + "loss": 0.9775, + "step": 14960 + }, + { + "epoch": 1.2033762057877813, + "grad_norm": 1.7772167921066284, + "learning_rate": 4.002017756255045e-05, + "loss": 1.1127, + "step": 14970 + }, + { + "epoch": 1.2041800643086817, + "grad_norm": 1.8630040884017944, + "learning_rate": 3.997982243744956e-05, + 
"loss": 0.9544, + "step": 14980 + }, + { + "epoch": 1.204983922829582, + "grad_norm": 3.0156354904174805, + "learning_rate": 3.9939467312348674e-05, + "loss": 1.1024, + "step": 14990 + }, + { + "epoch": 1.2057877813504823, + "grad_norm": 1.5983487367630005, + "learning_rate": 3.9899112187247785e-05, + "loss": 0.9616, + "step": 15000 + }, + { + "epoch": 1.2065916398713825, + "grad_norm": 1.3613848686218262, + "learning_rate": 3.9858757062146896e-05, + "loss": 1.0519, + "step": 15010 + }, + { + "epoch": 1.207395498392283, + "grad_norm": 2.2423689365386963, + "learning_rate": 3.981840193704601e-05, + "loss": 1.053, + "step": 15020 + }, + { + "epoch": 1.2081993569131833, + "grad_norm": 2.0904273986816406, + "learning_rate": 3.977804681194512e-05, + "loss": 0.9744, + "step": 15030 + }, + { + "epoch": 1.2090032154340835, + "grad_norm": 1.6902353763580322, + "learning_rate": 3.973769168684423e-05, + "loss": 1.0055, + "step": 15040 + }, + { + "epoch": 1.209807073954984, + "grad_norm": 1.4305819272994995, + "learning_rate": 3.969733656174334e-05, + "loss": 1.0954, + "step": 15050 + }, + { + "epoch": 1.2106109324758842, + "grad_norm": 1.6137382984161377, + "learning_rate": 3.965698143664245e-05, + "loss": 0.8971, + "step": 15060 + }, + { + "epoch": 1.2114147909967845, + "grad_norm": 1.9220008850097656, + "learning_rate": 3.9616626311541565e-05, + "loss": 0.9589, + "step": 15070 + }, + { + "epoch": 1.212218649517685, + "grad_norm": 2.8765358924865723, + "learning_rate": 3.9576271186440676e-05, + "loss": 1.061, + "step": 15080 + }, + { + "epoch": 1.2130225080385852, + "grad_norm": 2.1828126907348633, + "learning_rate": 3.953591606133979e-05, + "loss": 1.001, + "step": 15090 + }, + { + "epoch": 1.2138263665594855, + "grad_norm": 1.8121625185012817, + "learning_rate": 3.9495560936238905e-05, + "loss": 1.0547, + "step": 15100 + }, + { + "epoch": 1.2146302250803858, + "grad_norm": 1.592613697052002, + "learning_rate": 3.945520581113802e-05, + "loss": 0.9974, + "step": 15110 + }, + { + "epoch": 1.2154340836012862, + "grad_norm": 2.5363457202911377, + "learning_rate": 3.941485068603713e-05, + "loss": 1.1056, + "step": 15120 + }, + { + "epoch": 1.2162379421221865, + "grad_norm": 1.8469159603118896, + "learning_rate": 3.937449556093624e-05, + "loss": 0.9196, + "step": 15130 + }, + { + "epoch": 1.2170418006430868, + "grad_norm": 1.3776907920837402, + "learning_rate": 3.933414043583535e-05, + "loss": 1.0031, + "step": 15140 + }, + { + "epoch": 1.217845659163987, + "grad_norm": 2.2064387798309326, + "learning_rate": 3.929378531073446e-05, + "loss": 0.9778, + "step": 15150 + }, + { + "epoch": 1.2186495176848875, + "grad_norm": 1.5302129983901978, + "learning_rate": 3.925343018563358e-05, + "loss": 1.1049, + "step": 15160 + }, + { + "epoch": 1.2194533762057878, + "grad_norm": 1.896903395652771, + "learning_rate": 3.921307506053269e-05, + "loss": 0.9388, + "step": 15170 + }, + { + "epoch": 1.220257234726688, + "grad_norm": 1.9947621822357178, + "learning_rate": 3.91727199354318e-05, + "loss": 0.9813, + "step": 15180 + }, + { + "epoch": 1.2210610932475885, + "grad_norm": 2.0882503986358643, + "learning_rate": 3.9132364810330915e-05, + "loss": 1.0022, + "step": 15190 + }, + { + "epoch": 1.2218649517684887, + "grad_norm": 1.3755741119384766, + "learning_rate": 3.9092009685230026e-05, + "loss": 0.9404, + "step": 15200 + }, + { + "epoch": 1.222668810289389, + "grad_norm": 1.8343044519424438, + "learning_rate": 3.905165456012914e-05, + "loss": 1.1459, + "step": 15210 + }, + { + "epoch": 1.2234726688102895, + "grad_norm": 
2.009847402572632, + "learning_rate": 3.901129943502825e-05, + "loss": 0.9887, + "step": 15220 + }, + { + "epoch": 1.2242765273311897, + "grad_norm": 1.5839662551879883, + "learning_rate": 3.897094430992737e-05, + "loss": 0.9295, + "step": 15230 + }, + { + "epoch": 1.22508038585209, + "grad_norm": 1.9991822242736816, + "learning_rate": 3.893058918482648e-05, + "loss": 0.9761, + "step": 15240 + }, + { + "epoch": 1.2258842443729903, + "grad_norm": 1.4759560823440552, + "learning_rate": 3.889023405972559e-05, + "loss": 1.024, + "step": 15250 + }, + { + "epoch": 1.2266881028938907, + "grad_norm": 2.5121824741363525, + "learning_rate": 3.88498789346247e-05, + "loss": 0.9983, + "step": 15260 + }, + { + "epoch": 1.227491961414791, + "grad_norm": 1.6680318117141724, + "learning_rate": 3.880952380952381e-05, + "loss": 0.9924, + "step": 15270 + }, + { + "epoch": 1.2282958199356913, + "grad_norm": 1.9648467302322388, + "learning_rate": 3.8769168684422924e-05, + "loss": 1.0334, + "step": 15280 + }, + { + "epoch": 1.2290996784565915, + "grad_norm": 1.6655272245407104, + "learning_rate": 3.8728813559322035e-05, + "loss": 0.8878, + "step": 15290 + }, + { + "epoch": 1.229903536977492, + "grad_norm": 1.5394530296325684, + "learning_rate": 3.8688458434221146e-05, + "loss": 1.0047, + "step": 15300 + }, + { + "epoch": 1.2307073954983923, + "grad_norm": 1.3396995067596436, + "learning_rate": 3.864810330912026e-05, + "loss": 0.9884, + "step": 15310 + }, + { + "epoch": 1.2315112540192925, + "grad_norm": 2.521289110183716, + "learning_rate": 3.860774818401937e-05, + "loss": 0.9421, + "step": 15320 + }, + { + "epoch": 1.232315112540193, + "grad_norm": 1.703194260597229, + "learning_rate": 3.856739305891848e-05, + "loss": 1.0233, + "step": 15330 + }, + { + "epoch": 1.2331189710610932, + "grad_norm": 1.4893176555633545, + "learning_rate": 3.852703793381759e-05, + "loss": 0.9465, + "step": 15340 + }, + { + "epoch": 1.2339228295819935, + "grad_norm": 1.442811369895935, + "learning_rate": 3.84866828087167e-05, + "loss": 0.9332, + "step": 15350 + }, + { + "epoch": 1.234726688102894, + "grad_norm": 1.6394675970077515, + "learning_rate": 3.844632768361582e-05, + "loss": 0.9862, + "step": 15360 + }, + { + "epoch": 1.2355305466237942, + "grad_norm": 1.969940423965454, + "learning_rate": 3.840597255851493e-05, + "loss": 1.0272, + "step": 15370 + }, + { + "epoch": 1.2363344051446945, + "grad_norm": 2.1524131298065186, + "learning_rate": 3.8365617433414044e-05, + "loss": 1.0043, + "step": 15380 + }, + { + "epoch": 1.2371382636655948, + "grad_norm": 1.8605786561965942, + "learning_rate": 3.832526230831316e-05, + "loss": 1.0146, + "step": 15390 + }, + { + "epoch": 1.2379421221864952, + "grad_norm": 1.7119444608688354, + "learning_rate": 3.8284907183212274e-05, + "loss": 1.0073, + "step": 15400 + }, + { + "epoch": 1.2387459807073955, + "grad_norm": 1.8046406507492065, + "learning_rate": 3.8244552058111385e-05, + "loss": 1.0533, + "step": 15410 + }, + { + "epoch": 1.2395498392282958, + "grad_norm": 1.8285448551177979, + "learning_rate": 3.8204196933010496e-05, + "loss": 1.0184, + "step": 15420 + }, + { + "epoch": 1.2403536977491962, + "grad_norm": 1.8175604343414307, + "learning_rate": 3.816384180790961e-05, + "loss": 0.9599, + "step": 15430 + }, + { + "epoch": 1.2411575562700965, + "grad_norm": 2.057853937149048, + "learning_rate": 3.812348668280872e-05, + "loss": 1.013, + "step": 15440 + }, + { + "epoch": 1.2419614147909968, + "grad_norm": 1.6940542459487915, + "learning_rate": 3.808313155770783e-05, + "loss": 0.9813, + "step": 
15450 + }, + { + "epoch": 1.242765273311897, + "grad_norm": 1.6630384922027588, + "learning_rate": 3.804277643260694e-05, + "loss": 1.0528, + "step": 15460 + }, + { + "epoch": 1.2435691318327975, + "grad_norm": 1.4436936378479004, + "learning_rate": 3.800242130750605e-05, + "loss": 0.9409, + "step": 15470 + }, + { + "epoch": 1.2443729903536977, + "grad_norm": 1.7026877403259277, + "learning_rate": 3.7962066182405164e-05, + "loss": 0.9785, + "step": 15480 + }, + { + "epoch": 1.245176848874598, + "grad_norm": 1.5710142850875854, + "learning_rate": 3.792171105730428e-05, + "loss": 0.925, + "step": 15490 + }, + { + "epoch": 1.2459807073954985, + "grad_norm": 1.6203075647354126, + "learning_rate": 3.7881355932203394e-05, + "loss": 1.0111, + "step": 15500 + }, + { + "epoch": 1.2467845659163987, + "grad_norm": 3.0252764225006104, + "learning_rate": 3.7841000807102505e-05, + "loss": 0.9773, + "step": 15510 + }, + { + "epoch": 1.247588424437299, + "grad_norm": 2.1563704013824463, + "learning_rate": 3.780064568200162e-05, + "loss": 0.9742, + "step": 15520 + }, + { + "epoch": 1.2483922829581993, + "grad_norm": 2.3449900150299072, + "learning_rate": 3.776029055690073e-05, + "loss": 1.0818, + "step": 15530 + }, + { + "epoch": 1.2491961414790997, + "grad_norm": 2.3925440311431885, + "learning_rate": 3.771993543179984e-05, + "loss": 1.0998, + "step": 15540 + }, + { + "epoch": 1.25, + "grad_norm": 1.5898447036743164, + "learning_rate": 3.767958030669895e-05, + "loss": 0.936, + "step": 15550 + }, + { + "epoch": 1.2508038585209003, + "grad_norm": 2.1979892253875732, + "learning_rate": 3.763922518159806e-05, + "loss": 1.1329, + "step": 15560 + }, + { + "epoch": 1.2516077170418005, + "grad_norm": 1.892656683921814, + "learning_rate": 3.7598870056497174e-05, + "loss": 1.0623, + "step": 15570 + }, + { + "epoch": 1.252411575562701, + "grad_norm": 1.9178494215011597, + "learning_rate": 3.7558514931396285e-05, + "loss": 1.038, + "step": 15580 + }, + { + "epoch": 1.2532154340836013, + "grad_norm": 1.6698342561721802, + "learning_rate": 3.75181598062954e-05, + "loss": 0.9981, + "step": 15590 + }, + { + "epoch": 1.2540192926045015, + "grad_norm": 1.7022637128829956, + "learning_rate": 3.7477804681194514e-05, + "loss": 0.9657, + "step": 15600 + }, + { + "epoch": 1.254823151125402, + "grad_norm": 1.5800572633743286, + "learning_rate": 3.7437449556093626e-05, + "loss": 1.0576, + "step": 15610 + }, + { + "epoch": 1.2556270096463023, + "grad_norm": 2.19415545463562, + "learning_rate": 3.739709443099274e-05, + "loss": 1.0083, + "step": 15620 + }, + { + "epoch": 1.2564308681672025, + "grad_norm": 1.5555347204208374, + "learning_rate": 3.7356739305891855e-05, + "loss": 1.0514, + "step": 15630 + }, + { + "epoch": 1.257234726688103, + "grad_norm": 1.9707562923431396, + "learning_rate": 3.731638418079097e-05, + "loss": 0.9158, + "step": 15640 + }, + { + "epoch": 1.2580385852090032, + "grad_norm": 1.4888017177581787, + "learning_rate": 3.727602905569008e-05, + "loss": 0.9496, + "step": 15650 + }, + { + "epoch": 1.2588424437299035, + "grad_norm": 1.655102014541626, + "learning_rate": 3.723567393058919e-05, + "loss": 1.0025, + "step": 15660 + }, + { + "epoch": 1.259646302250804, + "grad_norm": 1.51974356174469, + "learning_rate": 3.71953188054883e-05, + "loss": 1.0251, + "step": 15670 + }, + { + "epoch": 1.2604501607717042, + "grad_norm": 1.6518058776855469, + "learning_rate": 3.715496368038741e-05, + "loss": 0.9934, + "step": 15680 + }, + { + "epoch": 1.2612540192926045, + "grad_norm": 1.6622509956359863, + "learning_rate": 
3.7114608555286523e-05, + "loss": 1.0582, + "step": 15690 + }, + { + "epoch": 1.2620578778135048, + "grad_norm": 2.096949338912964, + "learning_rate": 3.7074253430185635e-05, + "loss": 1.0454, + "step": 15700 + }, + { + "epoch": 1.262861736334405, + "grad_norm": 1.668216586112976, + "learning_rate": 3.7033898305084746e-05, + "loss": 1.0954, + "step": 15710 + }, + { + "epoch": 1.2636655948553055, + "grad_norm": 1.8938219547271729, + "learning_rate": 3.699354317998386e-05, + "loss": 0.9693, + "step": 15720 + }, + { + "epoch": 1.2644694533762058, + "grad_norm": 1.4523894786834717, + "learning_rate": 3.695318805488297e-05, + "loss": 1.068, + "step": 15730 + }, + { + "epoch": 1.265273311897106, + "grad_norm": 1.4719635248184204, + "learning_rate": 3.691283292978208e-05, + "loss": 1.051, + "step": 15740 + }, + { + "epoch": 1.2660771704180065, + "grad_norm": 2.059270143508911, + "learning_rate": 3.687247780468119e-05, + "loss": 1.0024, + "step": 15750 + }, + { + "epoch": 1.2668810289389068, + "grad_norm": 1.6028730869293213, + "learning_rate": 3.683212267958031e-05, + "loss": 0.948, + "step": 15760 + }, + { + "epoch": 1.267684887459807, + "grad_norm": 2.917422294616699, + "learning_rate": 3.679176755447942e-05, + "loss": 0.9057, + "step": 15770 + }, + { + "epoch": 1.2684887459807075, + "grad_norm": 1.3262091875076294, + "learning_rate": 3.675141242937853e-05, + "loss": 0.9915, + "step": 15780 + }, + { + "epoch": 1.2692926045016077, + "grad_norm": 2.217057466506958, + "learning_rate": 3.6711057304277644e-05, + "loss": 0.958, + "step": 15790 + }, + { + "epoch": 1.270096463022508, + "grad_norm": 1.6233608722686768, + "learning_rate": 3.6670702179176755e-05, + "loss": 0.8224, + "step": 15800 + }, + { + "epoch": 1.2709003215434085, + "grad_norm": 1.6154686212539673, + "learning_rate": 3.663034705407587e-05, + "loss": 1.0314, + "step": 15810 + }, + { + "epoch": 1.2717041800643087, + "grad_norm": 2.7980661392211914, + "learning_rate": 3.6589991928974985e-05, + "loss": 1.0876, + "step": 15820 + }, + { + "epoch": 1.272508038585209, + "grad_norm": 1.5344147682189941, + "learning_rate": 3.6549636803874096e-05, + "loss": 0.9062, + "step": 15830 + }, + { + "epoch": 1.2733118971061093, + "grad_norm": 3.238077163696289, + "learning_rate": 3.650928167877321e-05, + "loss": 1.0408, + "step": 15840 + }, + { + "epoch": 1.2741157556270095, + "grad_norm": 2.143214702606201, + "learning_rate": 3.646892655367232e-05, + "loss": 1.0145, + "step": 15850 + }, + { + "epoch": 1.27491961414791, + "grad_norm": 2.9116787910461426, + "learning_rate": 3.642857142857143e-05, + "loss": 0.9369, + "step": 15860 + }, + { + "epoch": 1.2757234726688103, + "grad_norm": 1.6376276016235352, + "learning_rate": 3.638821630347054e-05, + "loss": 1.0382, + "step": 15870 + }, + { + "epoch": 1.2765273311897105, + "grad_norm": 1.4802640676498413, + "learning_rate": 3.634786117836965e-05, + "loss": 1.0154, + "step": 15880 + }, + { + "epoch": 1.277331189710611, + "grad_norm": 1.6010328531265259, + "learning_rate": 3.630750605326877e-05, + "loss": 0.9832, + "step": 15890 + }, + { + "epoch": 1.2781350482315113, + "grad_norm": 2.688264846801758, + "learning_rate": 3.626715092816788e-05, + "loss": 0.9747, + "step": 15900 + }, + { + "epoch": 1.2789389067524115, + "grad_norm": 1.729894995689392, + "learning_rate": 3.6226795803066994e-05, + "loss": 0.9999, + "step": 15910 + }, + { + "epoch": 1.279742765273312, + "grad_norm": 1.8605012893676758, + "learning_rate": 3.6186440677966105e-05, + "loss": 0.9808, + "step": 15920 + }, + { + "epoch": 
1.2805466237942122, + "grad_norm": 1.628440499305725, + "learning_rate": 3.6146085552865217e-05, + "loss": 1.0222, + "step": 15930 + }, + { + "epoch": 1.2813504823151125, + "grad_norm": 1.3880386352539062, + "learning_rate": 3.610573042776433e-05, + "loss": 0.9112, + "step": 15940 + }, + { + "epoch": 1.282154340836013, + "grad_norm": 1.6189711093902588, + "learning_rate": 3.606537530266344e-05, + "loss": 1.1685, + "step": 15950 + }, + { + "epoch": 1.2829581993569132, + "grad_norm": 1.9674052000045776, + "learning_rate": 3.602502017756255e-05, + "loss": 0.9313, + "step": 15960 + }, + { + "epoch": 1.2837620578778135, + "grad_norm": 1.8231611251831055, + "learning_rate": 3.598466505246166e-05, + "loss": 0.8848, + "step": 15970 + }, + { + "epoch": 1.2845659163987138, + "grad_norm": 1.5274815559387207, + "learning_rate": 3.594430992736077e-05, + "loss": 0.9129, + "step": 15980 + }, + { + "epoch": 1.285369774919614, + "grad_norm": 2.585059881210327, + "learning_rate": 3.5903954802259885e-05, + "loss": 1.0566, + "step": 15990 + }, + { + "epoch": 1.2861736334405145, + "grad_norm": 2.084228992462158, + "learning_rate": 3.5863599677158996e-05, + "loss": 1.0839, + "step": 16000 + }, + { + "epoch": 1.2861736334405145, + "eval_yahma/alpaca-cleaned_loss": 1.2161176204681396, + "eval_yahma/alpaca-cleaned_runtime": 115.6721, + "eval_yahma/alpaca-cleaned_samples_per_second": 17.29, + "eval_yahma/alpaca-cleaned_steps_per_second": 2.161, + "step": 16000 + }, + { + "epoch": 1.2869774919614148, + "grad_norm": 1.84796142578125, + "learning_rate": 3.582324455205811e-05, + "loss": 0.9685, + "step": 16010 + }, + { + "epoch": 1.287781350482315, + "grad_norm": 1.575564980506897, + "learning_rate": 3.5782889426957226e-05, + "loss": 1.0034, + "step": 16020 + }, + { + "epoch": 1.2885852090032155, + "grad_norm": 1.797165870666504, + "learning_rate": 3.574253430185634e-05, + "loss": 1.0284, + "step": 16030 + }, + { + "epoch": 1.2893890675241158, + "grad_norm": 1.263707160949707, + "learning_rate": 3.570217917675545e-05, + "loss": 1.0448, + "step": 16040 + }, + { + "epoch": 1.290192926045016, + "grad_norm": 1.7160576581954956, + "learning_rate": 3.5661824051654566e-05, + "loss": 0.9389, + "step": 16050 + }, + { + "epoch": 1.2909967845659165, + "grad_norm": 2.0201547145843506, + "learning_rate": 3.562146892655368e-05, + "loss": 0.9839, + "step": 16060 + }, + { + "epoch": 1.2918006430868167, + "grad_norm": 1.3020087480545044, + "learning_rate": 3.558111380145279e-05, + "loss": 1.0517, + "step": 16070 + }, + { + "epoch": 1.292604501607717, + "grad_norm": 1.6637574434280396, + "learning_rate": 3.55407586763519e-05, + "loss": 0.9596, + "step": 16080 + }, + { + "epoch": 1.2934083601286175, + "grad_norm": 2.665010690689087, + "learning_rate": 3.550040355125101e-05, + "loss": 1.0649, + "step": 16090 + }, + { + "epoch": 1.2942122186495177, + "grad_norm": 1.6599992513656616, + "learning_rate": 3.546004842615012e-05, + "loss": 0.9902, + "step": 16100 + }, + { + "epoch": 1.295016077170418, + "grad_norm": 1.390799641609192, + "learning_rate": 3.5419693301049235e-05, + "loss": 1.0083, + "step": 16110 + }, + { + "epoch": 1.2958199356913183, + "grad_norm": 4.305990695953369, + "learning_rate": 3.5379338175948346e-05, + "loss": 0.9338, + "step": 16120 + }, + { + "epoch": 1.2966237942122185, + "grad_norm": 2.046499252319336, + "learning_rate": 3.533898305084746e-05, + "loss": 1.0253, + "step": 16130 + }, + { + "epoch": 1.297427652733119, + "grad_norm": 2.5173068046569824, + "learning_rate": 3.529862792574657e-05, + "loss": 1.0404, + 
"step": 16140 + }, + { + "epoch": 1.2982315112540193, + "grad_norm": 1.7286467552185059, + "learning_rate": 3.525827280064569e-05, + "loss": 1.0585, + "step": 16150 + }, + { + "epoch": 1.2990353697749195, + "grad_norm": 2.4116015434265137, + "learning_rate": 3.52179176755448e-05, + "loss": 0.9991, + "step": 16160 + }, + { + "epoch": 1.29983922829582, + "grad_norm": 2.464102029800415, + "learning_rate": 3.517756255044391e-05, + "loss": 0.9679, + "step": 16170 + }, + { + "epoch": 1.3006430868167203, + "grad_norm": 1.9731477499008179, + "learning_rate": 3.513720742534302e-05, + "loss": 0.9587, + "step": 16180 + }, + { + "epoch": 1.3014469453376205, + "grad_norm": 1.2134771347045898, + "learning_rate": 3.509685230024213e-05, + "loss": 1.0607, + "step": 16190 + }, + { + "epoch": 1.302250803858521, + "grad_norm": 3.209575891494751, + "learning_rate": 3.5056497175141244e-05, + "loss": 0.964, + "step": 16200 + }, + { + "epoch": 1.3030546623794212, + "grad_norm": 1.5849529504776, + "learning_rate": 3.5016142050040355e-05, + "loss": 0.8894, + "step": 16210 + }, + { + "epoch": 1.3038585209003215, + "grad_norm": 2.5455405712127686, + "learning_rate": 3.4975786924939466e-05, + "loss": 1.0194, + "step": 16220 + }, + { + "epoch": 1.304662379421222, + "grad_norm": 1.4014263153076172, + "learning_rate": 3.493543179983858e-05, + "loss": 0.9338, + "step": 16230 + }, + { + "epoch": 1.3054662379421222, + "grad_norm": 1.5951437950134277, + "learning_rate": 3.489507667473769e-05, + "loss": 0.8486, + "step": 16240 + }, + { + "epoch": 1.3062700964630225, + "grad_norm": 1.797086238861084, + "learning_rate": 3.485472154963681e-05, + "loss": 0.9986, + "step": 16250 + }, + { + "epoch": 1.3070739549839228, + "grad_norm": 2.1635706424713135, + "learning_rate": 3.481436642453592e-05, + "loss": 0.9768, + "step": 16260 + }, + { + "epoch": 1.307877813504823, + "grad_norm": 2.373884916305542, + "learning_rate": 3.477401129943503e-05, + "loss": 0.9809, + "step": 16270 + }, + { + "epoch": 1.3086816720257235, + "grad_norm": 2.3494770526885986, + "learning_rate": 3.473365617433415e-05, + "loss": 1.2571, + "step": 16280 + }, + { + "epoch": 1.3094855305466238, + "grad_norm": 1.4083755016326904, + "learning_rate": 3.469330104923326e-05, + "loss": 1.1216, + "step": 16290 + }, + { + "epoch": 1.310289389067524, + "grad_norm": 1.7376712560653687, + "learning_rate": 3.465294592413237e-05, + "loss": 0.9893, + "step": 16300 + }, + { + "epoch": 1.3110932475884245, + "grad_norm": 1.5163984298706055, + "learning_rate": 3.461259079903148e-05, + "loss": 1.0158, + "step": 16310 + }, + { + "epoch": 1.3118971061093248, + "grad_norm": 1.9004069566726685, + "learning_rate": 3.4572235673930594e-05, + "loss": 1.0503, + "step": 16320 + }, + { + "epoch": 1.312700964630225, + "grad_norm": 1.295363426208496, + "learning_rate": 3.4531880548829705e-05, + "loss": 1.1076, + "step": 16330 + }, + { + "epoch": 1.3135048231511255, + "grad_norm": 1.682941198348999, + "learning_rate": 3.4491525423728816e-05, + "loss": 0.8961, + "step": 16340 + }, + { + "epoch": 1.3143086816720257, + "grad_norm": 1.8301907777786255, + "learning_rate": 3.445117029862793e-05, + "loss": 1.0844, + "step": 16350 + }, + { + "epoch": 1.315112540192926, + "grad_norm": 2.9198620319366455, + "learning_rate": 3.441081517352704e-05, + "loss": 0.9932, + "step": 16360 + }, + { + "epoch": 1.3159163987138265, + "grad_norm": 2.1517717838287354, + "learning_rate": 3.437046004842615e-05, + "loss": 0.9756, + "step": 16370 + }, + { + "epoch": 1.3167202572347267, + "grad_norm": 1.5518659353256226, + 
"learning_rate": 3.433010492332526e-05, + "loss": 0.8814, + "step": 16380 + }, + { + "epoch": 1.317524115755627, + "grad_norm": 2.230884552001953, + "learning_rate": 3.428974979822437e-05, + "loss": 0.9049, + "step": 16390 + }, + { + "epoch": 1.3183279742765273, + "grad_norm": 1.4246128797531128, + "learning_rate": 3.4249394673123485e-05, + "loss": 1.0478, + "step": 16400 + }, + { + "epoch": 1.3191318327974275, + "grad_norm": 1.648350477218628, + "learning_rate": 3.4209039548022596e-05, + "loss": 1.023, + "step": 16410 + }, + { + "epoch": 1.319935691318328, + "grad_norm": 1.7566518783569336, + "learning_rate": 3.4168684422921714e-05, + "loss": 1.1123, + "step": 16420 + }, + { + "epoch": 1.3207395498392283, + "grad_norm": 1.8392665386199951, + "learning_rate": 3.4128329297820825e-05, + "loss": 0.9804, + "step": 16430 + }, + { + "epoch": 1.3215434083601285, + "grad_norm": 1.5365403890609741, + "learning_rate": 3.408797417271994e-05, + "loss": 1.0917, + "step": 16440 + }, + { + "epoch": 1.322347266881029, + "grad_norm": 3.783843755722046, + "learning_rate": 3.404761904761905e-05, + "loss": 0.9347, + "step": 16450 + }, + { + "epoch": 1.3231511254019293, + "grad_norm": 2.019134044647217, + "learning_rate": 3.400726392251816e-05, + "loss": 0.9739, + "step": 16460 + }, + { + "epoch": 1.3239549839228295, + "grad_norm": 2.0950658321380615, + "learning_rate": 3.396690879741727e-05, + "loss": 1.0797, + "step": 16470 + }, + { + "epoch": 1.32475884244373, + "grad_norm": 1.5401766300201416, + "learning_rate": 3.392655367231639e-05, + "loss": 1.1162, + "step": 16480 + }, + { + "epoch": 1.3255627009646302, + "grad_norm": 1.9114347696304321, + "learning_rate": 3.38861985472155e-05, + "loss": 1.1233, + "step": 16490 + }, + { + "epoch": 1.3263665594855305, + "grad_norm": 1.4473228454589844, + "learning_rate": 3.384584342211461e-05, + "loss": 0.9608, + "step": 16500 + }, + { + "epoch": 1.327170418006431, + "grad_norm": 1.3395804166793823, + "learning_rate": 3.380548829701372e-05, + "loss": 1.0014, + "step": 16510 + }, + { + "epoch": 1.3279742765273312, + "grad_norm": 1.3364843130111694, + "learning_rate": 3.3765133171912835e-05, + "loss": 0.976, + "step": 16520 + }, + { + "epoch": 1.3287781350482315, + "grad_norm": 1.3809840679168701, + "learning_rate": 3.3724778046811946e-05, + "loss": 1.0469, + "step": 16530 + }, + { + "epoch": 1.3295819935691318, + "grad_norm": 1.4792019128799438, + "learning_rate": 3.368442292171106e-05, + "loss": 1.0364, + "step": 16540 + }, + { + "epoch": 1.330385852090032, + "grad_norm": 2.2897255420684814, + "learning_rate": 3.3644067796610175e-05, + "loss": 0.9775, + "step": 16550 + }, + { + "epoch": 1.3311897106109325, + "grad_norm": 2.1644399166107178, + "learning_rate": 3.360371267150929e-05, + "loss": 1.152, + "step": 16560 + }, + { + "epoch": 1.3319935691318328, + "grad_norm": 1.5172768831253052, + "learning_rate": 3.35633575464084e-05, + "loss": 1.056, + "step": 16570 + }, + { + "epoch": 1.332797427652733, + "grad_norm": 2.014936685562134, + "learning_rate": 3.352300242130751e-05, + "loss": 1.0585, + "step": 16580 + }, + { + "epoch": 1.3336012861736335, + "grad_norm": 3.0378189086914062, + "learning_rate": 3.348264729620662e-05, + "loss": 1.045, + "step": 16590 + }, + { + "epoch": 1.3344051446945338, + "grad_norm": 1.5051476955413818, + "learning_rate": 3.344229217110573e-05, + "loss": 1.0602, + "step": 16600 + }, + { + "epoch": 1.335209003215434, + "grad_norm": 1.7232706546783447, + "learning_rate": 3.3401937046004844e-05, + "loss": 1.0584, + "step": 16610 + }, + { + "epoch": 
1.3360128617363345, + "grad_norm": 1.534652829170227, + "learning_rate": 3.3361581920903955e-05, + "loss": 1.0345, + "step": 16620 + }, + { + "epoch": 1.3368167202572347, + "grad_norm": 2.152040719985962, + "learning_rate": 3.3321226795803066e-05, + "loss": 1.013, + "step": 16630 + }, + { + "epoch": 1.337620578778135, + "grad_norm": 1.4761089086532593, + "learning_rate": 3.328087167070218e-05, + "loss": 1.0028, + "step": 16640 + }, + { + "epoch": 1.3384244372990355, + "grad_norm": 1.6546519994735718, + "learning_rate": 3.324051654560129e-05, + "loss": 1.0939, + "step": 16650 + }, + { + "epoch": 1.3392282958199357, + "grad_norm": 1.352799892425537, + "learning_rate": 3.32001614205004e-05, + "loss": 1.0636, + "step": 16660 + }, + { + "epoch": 1.340032154340836, + "grad_norm": 1.700903058052063, + "learning_rate": 3.315980629539951e-05, + "loss": 0.9948, + "step": 16670 + }, + { + "epoch": 1.3408360128617363, + "grad_norm": 3.408297061920166, + "learning_rate": 3.311945117029863e-05, + "loss": 0.9873, + "step": 16680 + }, + { + "epoch": 1.3416398713826365, + "grad_norm": 1.5373080968856812, + "learning_rate": 3.307909604519774e-05, + "loss": 1.0707, + "step": 16690 + }, + { + "epoch": 1.342443729903537, + "grad_norm": 1.3401768207550049, + "learning_rate": 3.303874092009685e-05, + "loss": 1.0821, + "step": 16700 + }, + { + "epoch": 1.3432475884244373, + "grad_norm": 2.121828556060791, + "learning_rate": 3.2998385794995964e-05, + "loss": 1.0746, + "step": 16710 + }, + { + "epoch": 1.3440514469453375, + "grad_norm": 1.4472429752349854, + "learning_rate": 3.295803066989508e-05, + "loss": 1.0205, + "step": 16720 + }, + { + "epoch": 1.344855305466238, + "grad_norm": 1.9297133684158325, + "learning_rate": 3.2917675544794194e-05, + "loss": 1.0387, + "step": 16730 + }, + { + "epoch": 1.3456591639871383, + "grad_norm": 1.7948999404907227, + "learning_rate": 3.2877320419693305e-05, + "loss": 0.9557, + "step": 16740 + }, + { + "epoch": 1.3464630225080385, + "grad_norm": 2.6864187717437744, + "learning_rate": 3.2836965294592416e-05, + "loss": 0.9276, + "step": 16750 + }, + { + "epoch": 1.347266881028939, + "grad_norm": 1.9042679071426392, + "learning_rate": 3.279661016949153e-05, + "loss": 0.9675, + "step": 16760 + }, + { + "epoch": 1.3480707395498392, + "grad_norm": 1.5159459114074707, + "learning_rate": 3.275625504439064e-05, + "loss": 0.9691, + "step": 16770 + }, + { + "epoch": 1.3488745980707395, + "grad_norm": 2.456352949142456, + "learning_rate": 3.271589991928975e-05, + "loss": 1.0205, + "step": 16780 + }, + { + "epoch": 1.34967845659164, + "grad_norm": 1.351737380027771, + "learning_rate": 3.267554479418886e-05, + "loss": 0.943, + "step": 16790 + }, + { + "epoch": 1.3504823151125402, + "grad_norm": 1.9734419584274292, + "learning_rate": 3.263518966908797e-05, + "loss": 1.0482, + "step": 16800 + }, + { + "epoch": 1.3512861736334405, + "grad_norm": 1.700422763824463, + "learning_rate": 3.259483454398709e-05, + "loss": 1.042, + "step": 16810 + }, + { + "epoch": 1.3520900321543408, + "grad_norm": 1.9596405029296875, + "learning_rate": 3.25544794188862e-05, + "loss": 1.0004, + "step": 16820 + }, + { + "epoch": 1.3528938906752412, + "grad_norm": 1.4773586988449097, + "learning_rate": 3.2514124293785314e-05, + "loss": 0.9757, + "step": 16830 + }, + { + "epoch": 1.3536977491961415, + "grad_norm": 1.8567955493927002, + "learning_rate": 3.2473769168684425e-05, + "loss": 1.0112, + "step": 16840 + }, + { + "epoch": 1.3545016077170418, + "grad_norm": 1.7968416213989258, + "learning_rate": 
3.243341404358354e-05, + "loss": 0.896, + "step": 16850 + }, + { + "epoch": 1.355305466237942, + "grad_norm": 1.8025298118591309, + "learning_rate": 3.239305891848265e-05, + "loss": 1.0415, + "step": 16860 + }, + { + "epoch": 1.3561093247588425, + "grad_norm": 1.930948257446289, + "learning_rate": 3.235270379338176e-05, + "loss": 0.9868, + "step": 16870 + }, + { + "epoch": 1.3569131832797428, + "grad_norm": 1.6675337553024292, + "learning_rate": 3.231234866828087e-05, + "loss": 0.94, + "step": 16880 + }, + { + "epoch": 1.357717041800643, + "grad_norm": 1.5774610042572021, + "learning_rate": 3.227199354317998e-05, + "loss": 0.9087, + "step": 16890 + }, + { + "epoch": 1.3585209003215435, + "grad_norm": 2.006761074066162, + "learning_rate": 3.2231638418079093e-05, + "loss": 1.0439, + "step": 16900 + }, + { + "epoch": 1.3593247588424437, + "grad_norm": 2.1158947944641113, + "learning_rate": 3.2191283292978205e-05, + "loss": 1.004, + "step": 16910 + }, + { + "epoch": 1.360128617363344, + "grad_norm": 1.8996459245681763, + "learning_rate": 3.215092816787732e-05, + "loss": 0.9966, + "step": 16920 + }, + { + "epoch": 1.3609324758842445, + "grad_norm": 1.500229835510254, + "learning_rate": 3.2110573042776434e-05, + "loss": 0.8914, + "step": 16930 + }, + { + "epoch": 1.3617363344051447, + "grad_norm": 3.478053092956543, + "learning_rate": 3.2070217917675546e-05, + "loss": 1.037, + "step": 16940 + }, + { + "epoch": 1.362540192926045, + "grad_norm": 1.6184499263763428, + "learning_rate": 3.2029862792574664e-05, + "loss": 0.9153, + "step": 16950 + }, + { + "epoch": 1.3633440514469453, + "grad_norm": 1.5846716165542603, + "learning_rate": 3.1989507667473775e-05, + "loss": 1.0075, + "step": 16960 + }, + { + "epoch": 1.3641479099678457, + "grad_norm": 2.4219467639923096, + "learning_rate": 3.1949152542372887e-05, + "loss": 1.0313, + "step": 16970 + }, + { + "epoch": 1.364951768488746, + "grad_norm": 1.4937620162963867, + "learning_rate": 3.1908797417272e-05, + "loss": 1.0529, + "step": 16980 + }, + { + "epoch": 1.3657556270096463, + "grad_norm": 1.4323031902313232, + "learning_rate": 3.186844229217111e-05, + "loss": 0.9315, + "step": 16990 + }, + { + "epoch": 1.3665594855305465, + "grad_norm": 1.852509617805481, + "learning_rate": 3.182808716707022e-05, + "loss": 1.0359, + "step": 17000 + }, + { + "epoch": 1.367363344051447, + "grad_norm": 1.4422820806503296, + "learning_rate": 3.178773204196933e-05, + "loss": 1.0239, + "step": 17010 + }, + { + "epoch": 1.3681672025723473, + "grad_norm": 5.013046741485596, + "learning_rate": 3.1747376916868443e-05, + "loss": 0.8221, + "step": 17020 + }, + { + "epoch": 1.3689710610932475, + "grad_norm": 1.6348212957382202, + "learning_rate": 3.1707021791767555e-05, + "loss": 0.9839, + "step": 17030 + }, + { + "epoch": 1.369774919614148, + "grad_norm": 2.1785271167755127, + "learning_rate": 3.1666666666666666e-05, + "loss": 1.135, + "step": 17040 + }, + { + "epoch": 1.3705787781350482, + "grad_norm": 2.1272568702697754, + "learning_rate": 3.162631154156578e-05, + "loss": 1.1353, + "step": 17050 + }, + { + "epoch": 1.3713826366559485, + "grad_norm": 3.070270299911499, + "learning_rate": 3.158595641646489e-05, + "loss": 1.0135, + "step": 17060 + }, + { + "epoch": 1.372186495176849, + "grad_norm": 1.5802148580551147, + "learning_rate": 3.1545601291364e-05, + "loss": 1.0158, + "step": 17070 + }, + { + "epoch": 1.3729903536977492, + "grad_norm": 1.7264318466186523, + "learning_rate": 3.150524616626312e-05, + "loss": 0.9694, + "step": 17080 + }, + { + "epoch": 1.3737942122186495, 
+ "grad_norm": 2.184271812438965, + "learning_rate": 3.146489104116223e-05, + "loss": 1.0871, + "step": 17090 + }, + { + "epoch": 1.3745980707395498, + "grad_norm": 1.7589523792266846, + "learning_rate": 3.142453591606134e-05, + "loss": 1.0919, + "step": 17100 + }, + { + "epoch": 1.3754019292604502, + "grad_norm": 1.6874173879623413, + "learning_rate": 3.138418079096045e-05, + "loss": 1.1151, + "step": 17110 + }, + { + "epoch": 1.3762057877813505, + "grad_norm": 1.6541131734848022, + "learning_rate": 3.1343825665859564e-05, + "loss": 1.118, + "step": 17120 + }, + { + "epoch": 1.3770096463022508, + "grad_norm": 1.4665961265563965, + "learning_rate": 3.1303470540758675e-05, + "loss": 1.0108, + "step": 17130 + }, + { + "epoch": 1.377813504823151, + "grad_norm": 2.157575845718384, + "learning_rate": 3.1263115415657787e-05, + "loss": 1.0388, + "step": 17140 + }, + { + "epoch": 1.3786173633440515, + "grad_norm": 1.7408186197280884, + "learning_rate": 3.1222760290556905e-05, + "loss": 0.9634, + "step": 17150 + }, + { + "epoch": 1.3794212218649518, + "grad_norm": 1.8646273612976074, + "learning_rate": 3.1182405165456016e-05, + "loss": 1.1186, + "step": 17160 + }, + { + "epoch": 1.380225080385852, + "grad_norm": 1.4674321413040161, + "learning_rate": 3.114205004035513e-05, + "loss": 0.9969, + "step": 17170 + }, + { + "epoch": 1.3810289389067525, + "grad_norm": 1.7009894847869873, + "learning_rate": 3.110169491525424e-05, + "loss": 1.0342, + "step": 17180 + }, + { + "epoch": 1.3818327974276527, + "grad_norm": 1.473688006401062, + "learning_rate": 3.106133979015335e-05, + "loss": 0.9908, + "step": 17190 + }, + { + "epoch": 1.382636655948553, + "grad_norm": 1.7846876382827759, + "learning_rate": 3.102098466505246e-05, + "loss": 0.9985, + "step": 17200 + }, + { + "epoch": 1.3834405144694535, + "grad_norm": 1.3931621313095093, + "learning_rate": 3.098062953995158e-05, + "loss": 1.059, + "step": 17210 + }, + { + "epoch": 1.3842443729903537, + "grad_norm": 1.5702584981918335, + "learning_rate": 3.094027441485069e-05, + "loss": 1.0033, + "step": 17220 + }, + { + "epoch": 1.385048231511254, + "grad_norm": 1.5941625833511353, + "learning_rate": 3.08999192897498e-05, + "loss": 1.056, + "step": 17230 + }, + { + "epoch": 1.3858520900321543, + "grad_norm": 1.9544049501419067, + "learning_rate": 3.0859564164648914e-05, + "loss": 1.0103, + "step": 17240 + }, + { + "epoch": 1.3866559485530547, + "grad_norm": 1.4904630184173584, + "learning_rate": 3.0819209039548025e-05, + "loss": 1.074, + "step": 17250 + }, + { + "epoch": 1.387459807073955, + "grad_norm": 2.1309163570404053, + "learning_rate": 3.0778853914447137e-05, + "loss": 1.0918, + "step": 17260 + }, + { + "epoch": 1.3882636655948553, + "grad_norm": 1.3276787996292114, + "learning_rate": 3.073849878934625e-05, + "loss": 1.0247, + "step": 17270 + }, + { + "epoch": 1.3890675241157555, + "grad_norm": 2.309291124343872, + "learning_rate": 3.069814366424536e-05, + "loss": 0.9255, + "step": 17280 + }, + { + "epoch": 1.389871382636656, + "grad_norm": 1.6173782348632812, + "learning_rate": 3.065778853914447e-05, + "loss": 1.0273, + "step": 17290 + }, + { + "epoch": 1.3906752411575563, + "grad_norm": 2.0882816314697266, + "learning_rate": 3.061743341404358e-05, + "loss": 1.0799, + "step": 17300 + }, + { + "epoch": 1.3914790996784565, + "grad_norm": 1.4501557350158691, + "learning_rate": 3.057707828894269e-05, + "loss": 0.9923, + "step": 17310 + }, + { + "epoch": 1.392282958199357, + "grad_norm": 1.792034387588501, + "learning_rate": 3.0536723163841805e-05, + "loss": 
1.075, + "step": 17320 + }, + { + "epoch": 1.3930868167202572, + "grad_norm": 1.9524577856063843, + "learning_rate": 3.049636803874092e-05, + "loss": 1.1379, + "step": 17330 + }, + { + "epoch": 1.3938906752411575, + "grad_norm": 1.2398579120635986, + "learning_rate": 3.0456012913640038e-05, + "loss": 0.9899, + "step": 17340 + }, + { + "epoch": 1.394694533762058, + "grad_norm": 1.7992384433746338, + "learning_rate": 3.041565778853915e-05, + "loss": 1.0277, + "step": 17350 + }, + { + "epoch": 1.3954983922829582, + "grad_norm": 2.0077879428863525, + "learning_rate": 3.037530266343826e-05, + "loss": 1.0122, + "step": 17360 + }, + { + "epoch": 1.3963022508038585, + "grad_norm": 1.895276665687561, + "learning_rate": 3.033494753833737e-05, + "loss": 1.0742, + "step": 17370 + }, + { + "epoch": 1.3971061093247588, + "grad_norm": 1.7623761892318726, + "learning_rate": 3.0294592413236483e-05, + "loss": 1.0696, + "step": 17380 + }, + { + "epoch": 1.3979099678456592, + "grad_norm": 2.2184484004974365, + "learning_rate": 3.0254237288135594e-05, + "loss": 0.9861, + "step": 17390 + }, + { + "epoch": 1.3987138263665595, + "grad_norm": 1.6133798360824585, + "learning_rate": 3.0213882163034706e-05, + "loss": 1.043, + "step": 17400 + }, + { + "epoch": 1.3995176848874598, + "grad_norm": 1.8008503913879395, + "learning_rate": 3.0173527037933817e-05, + "loss": 1.035, + "step": 17410 + }, + { + "epoch": 1.40032154340836, + "grad_norm": 2.023869037628174, + "learning_rate": 3.0133171912832932e-05, + "loss": 1.0238, + "step": 17420 + }, + { + "epoch": 1.4011254019292605, + "grad_norm": 1.7300667762756348, + "learning_rate": 3.0092816787732043e-05, + "loss": 1.0263, + "step": 17430 + }, + { + "epoch": 1.4019292604501608, + "grad_norm": 1.5736720561981201, + "learning_rate": 3.0052461662631155e-05, + "loss": 1.0354, + "step": 17440 + }, + { + "epoch": 1.402733118971061, + "grad_norm": 1.8412175178527832, + "learning_rate": 3.0012106537530266e-05, + "loss": 1.0392, + "step": 17450 + }, + { + "epoch": 1.4035369774919615, + "grad_norm": 1.3952903747558594, + "learning_rate": 2.9971751412429377e-05, + "loss": 0.9645, + "step": 17460 + }, + { + "epoch": 1.4043408360128617, + "grad_norm": 1.6282439231872559, + "learning_rate": 2.9931396287328496e-05, + "loss": 1.1068, + "step": 17470 + }, + { + "epoch": 1.405144694533762, + "grad_norm": 2.133103132247925, + "learning_rate": 2.9891041162227607e-05, + "loss": 0.9765, + "step": 17480 + }, + { + "epoch": 1.4059485530546625, + "grad_norm": 2.174391508102417, + "learning_rate": 2.9850686037126718e-05, + "loss": 1.093, + "step": 17490 + }, + { + "epoch": 1.4067524115755627, + "grad_norm": 2.0915133953094482, + "learning_rate": 2.981033091202583e-05, + "loss": 0.983, + "step": 17500 + }, + { + "epoch": 1.407556270096463, + "grad_norm": 1.40506112575531, + "learning_rate": 2.976997578692494e-05, + "loss": 0.9038, + "step": 17510 + }, + { + "epoch": 1.4083601286173635, + "grad_norm": 1.5525906085968018, + "learning_rate": 2.9729620661824052e-05, + "loss": 1.0389, + "step": 17520 + }, + { + "epoch": 1.4091639871382637, + "grad_norm": 1.559783697128296, + "learning_rate": 2.9689265536723164e-05, + "loss": 0.996, + "step": 17530 + }, + { + "epoch": 1.409967845659164, + "grad_norm": 1.8317124843597412, + "learning_rate": 2.964891041162228e-05, + "loss": 1.1198, + "step": 17540 + }, + { + "epoch": 1.4107717041800643, + "grad_norm": 1.9380903244018555, + "learning_rate": 2.960855528652139e-05, + "loss": 1.0118, + "step": 17550 + }, + { + "epoch": 1.4115755627009645, + "grad_norm": 
2.3650496006011963, + "learning_rate": 2.95682001614205e-05, + "loss": 0.9, + "step": 17560 + }, + { + "epoch": 1.412379421221865, + "grad_norm": 1.747936487197876, + "learning_rate": 2.9527845036319613e-05, + "loss": 1.1245, + "step": 17570 + }, + { + "epoch": 1.4131832797427653, + "grad_norm": 3.0566165447235107, + "learning_rate": 2.9487489911218724e-05, + "loss": 1.0397, + "step": 17580 + }, + { + "epoch": 1.4139871382636655, + "grad_norm": 1.790842890739441, + "learning_rate": 2.9447134786117835e-05, + "loss": 0.893, + "step": 17590 + }, + { + "epoch": 1.414790996784566, + "grad_norm": 2.7602193355560303, + "learning_rate": 2.9406779661016953e-05, + "loss": 1.0208, + "step": 17600 + }, + { + "epoch": 1.4155948553054662, + "grad_norm": 2.4181101322174072, + "learning_rate": 2.9366424535916065e-05, + "loss": 1.0231, + "step": 17610 + }, + { + "epoch": 1.4163987138263665, + "grad_norm": 1.5748388767242432, + "learning_rate": 2.9326069410815176e-05, + "loss": 0.9656, + "step": 17620 + }, + { + "epoch": 1.417202572347267, + "grad_norm": 1.6972712278366089, + "learning_rate": 2.9285714285714288e-05, + "loss": 0.8361, + "step": 17630 + }, + { + "epoch": 1.4180064308681672, + "grad_norm": 1.8888636827468872, + "learning_rate": 2.92453591606134e-05, + "loss": 0.9267, + "step": 17640 + }, + { + "epoch": 1.4188102893890675, + "grad_norm": 1.4726215600967407, + "learning_rate": 2.9205004035512514e-05, + "loss": 0.9852, + "step": 17650 + }, + { + "epoch": 1.419614147909968, + "grad_norm": 1.650896430015564, + "learning_rate": 2.9164648910411625e-05, + "loss": 0.9401, + "step": 17660 + }, + { + "epoch": 1.4204180064308682, + "grad_norm": 1.576968789100647, + "learning_rate": 2.9124293785310736e-05, + "loss": 1.0903, + "step": 17670 + }, + { + "epoch": 1.4212218649517685, + "grad_norm": 2.186856985092163, + "learning_rate": 2.9083938660209848e-05, + "loss": 0.9751, + "step": 17680 + }, + { + "epoch": 1.4220257234726688, + "grad_norm": 1.621895432472229, + "learning_rate": 2.904358353510896e-05, + "loss": 0.9995, + "step": 17690 + }, + { + "epoch": 1.422829581993569, + "grad_norm": 1.6178061962127686, + "learning_rate": 2.900322841000807e-05, + "loss": 0.9885, + "step": 17700 + }, + { + "epoch": 1.4236334405144695, + "grad_norm": 3.5141470432281494, + "learning_rate": 2.8962873284907182e-05, + "loss": 0.8756, + "step": 17710 + }, + { + "epoch": 1.4244372990353698, + "grad_norm": 1.3309582471847534, + "learning_rate": 2.8922518159806293e-05, + "loss": 0.9736, + "step": 17720 + }, + { + "epoch": 1.42524115755627, + "grad_norm": 1.4174798727035522, + "learning_rate": 2.888216303470541e-05, + "loss": 1.1252, + "step": 17730 + }, + { + "epoch": 1.4260450160771705, + "grad_norm": 1.849108338356018, + "learning_rate": 2.8841807909604523e-05, + "loss": 0.9253, + "step": 17740 + }, + { + "epoch": 1.4268488745980707, + "grad_norm": 1.832971215248108, + "learning_rate": 2.8801452784503634e-05, + "loss": 0.9816, + "step": 17750 + }, + { + "epoch": 1.427652733118971, + "grad_norm": 1.9136488437652588, + "learning_rate": 2.8761097659402745e-05, + "loss": 1.0795, + "step": 17760 + }, + { + "epoch": 1.4284565916398715, + "grad_norm": 1.530927300453186, + "learning_rate": 2.872074253430186e-05, + "loss": 1.0021, + "step": 17770 + }, + { + "epoch": 1.4292604501607717, + "grad_norm": 2.640871047973633, + "learning_rate": 2.868038740920097e-05, + "loss": 0.9635, + "step": 17780 + }, + { + "epoch": 1.430064308681672, + "grad_norm": 1.6071547269821167, + "learning_rate": 2.8640032284100083e-05, + "loss": 1.0267, + "step": 
17790 + }, + { + "epoch": 1.4308681672025725, + "grad_norm": 1.395981788635254, + "learning_rate": 2.8599677158999194e-05, + "loss": 0.939, + "step": 17800 + }, + { + "epoch": 1.4316720257234727, + "grad_norm": 1.8101909160614014, + "learning_rate": 2.8559322033898306e-05, + "loss": 1.0454, + "step": 17810 + }, + { + "epoch": 1.432475884244373, + "grad_norm": 1.7596745491027832, + "learning_rate": 2.8518966908797417e-05, + "loss": 1.0245, + "step": 17820 + }, + { + "epoch": 1.4332797427652733, + "grad_norm": 1.522783875465393, + "learning_rate": 2.847861178369653e-05, + "loss": 1.007, + "step": 17830 + }, + { + "epoch": 1.4340836012861735, + "grad_norm": 1.8518741130828857, + "learning_rate": 2.843825665859564e-05, + "loss": 1.0237, + "step": 17840 + }, + { + "epoch": 1.434887459807074, + "grad_norm": 1.7670881748199463, + "learning_rate": 2.8397901533494754e-05, + "loss": 0.9962, + "step": 17850 + }, + { + "epoch": 1.4356913183279743, + "grad_norm": 1.9033243656158447, + "learning_rate": 2.8357546408393866e-05, + "loss": 0.9878, + "step": 17860 + }, + { + "epoch": 1.4364951768488745, + "grad_norm": 2.2080554962158203, + "learning_rate": 2.831719128329298e-05, + "loss": 1.0062, + "step": 17870 + }, + { + "epoch": 1.437299035369775, + "grad_norm": 1.6603493690490723, + "learning_rate": 2.8276836158192095e-05, + "loss": 0.9851, + "step": 17880 + }, + { + "epoch": 1.4381028938906752, + "grad_norm": 1.9345533847808838, + "learning_rate": 2.8236481033091207e-05, + "loss": 1.0904, + "step": 17890 + }, + { + "epoch": 1.4389067524115755, + "grad_norm": 1.8384795188903809, + "learning_rate": 2.8196125907990318e-05, + "loss": 0.9186, + "step": 17900 + }, + { + "epoch": 1.439710610932476, + "grad_norm": 1.4033855199813843, + "learning_rate": 2.815577078288943e-05, + "loss": 1.002, + "step": 17910 + }, + { + "epoch": 1.4405144694533762, + "grad_norm": 1.9541116952896118, + "learning_rate": 2.811541565778854e-05, + "loss": 1.0194, + "step": 17920 + }, + { + "epoch": 1.4413183279742765, + "grad_norm": 1.4612561464309692, + "learning_rate": 2.8075060532687652e-05, + "loss": 1.0518, + "step": 17930 + }, + { + "epoch": 1.442122186495177, + "grad_norm": 1.920513391494751, + "learning_rate": 2.8034705407586764e-05, + "loss": 1.0144, + "step": 17940 + }, + { + "epoch": 1.4429260450160772, + "grad_norm": 1.4925817251205444, + "learning_rate": 2.7994350282485875e-05, + "loss": 1.0138, + "step": 17950 + }, + { + "epoch": 1.4437299035369775, + "grad_norm": 1.4273995161056519, + "learning_rate": 2.7953995157384986e-05, + "loss": 0.9795, + "step": 17960 + }, + { + "epoch": 1.4445337620578778, + "grad_norm": 1.4286296367645264, + "learning_rate": 2.79136400322841e-05, + "loss": 1.0259, + "step": 17970 + }, + { + "epoch": 1.445337620578778, + "grad_norm": 1.9147309064865112, + "learning_rate": 2.7873284907183212e-05, + "loss": 1.1585, + "step": 17980 + }, + { + "epoch": 1.4461414790996785, + "grad_norm": 2.371657133102417, + "learning_rate": 2.7832929782082324e-05, + "loss": 1.0628, + "step": 17990 + }, + { + "epoch": 1.4469453376205788, + "grad_norm": 2.483755588531494, + "learning_rate": 2.7792574656981442e-05, + "loss": 1.0018, + "step": 18000 + }, + { + "epoch": 1.4469453376205788, + "eval_yahma/alpaca-cleaned_loss": 1.2110522985458374, + "eval_yahma/alpaca-cleaned_runtime": 115.7566, + "eval_yahma/alpaca-cleaned_samples_per_second": 17.278, + "eval_yahma/alpaca-cleaned_steps_per_second": 2.16, + "step": 18000 + }, + { + "epoch": 1.447749196141479, + "grad_norm": 1.9039057493209839, + "learning_rate": 
2.7752219531880553e-05, + "loss": 1.0801, + "step": 18010 + }, + { + "epoch": 1.4485530546623795, + "grad_norm": 3.932539224624634, + "learning_rate": 2.7711864406779665e-05, + "loss": 0.9159, + "step": 18020 + }, + { + "epoch": 1.4493569131832797, + "grad_norm": 1.7187069654464722, + "learning_rate": 2.7675544794188862e-05, + "loss": 1.1043, + "step": 18030 + }, + { + "epoch": 1.45016077170418, + "grad_norm": 2.409668445587158, + "learning_rate": 2.7635189669087974e-05, + "loss": 0.9993, + "step": 18040 + }, + { + "epoch": 1.4509646302250805, + "grad_norm": 1.4845926761627197, + "learning_rate": 2.7594834543987085e-05, + "loss": 0.9123, + "step": 18050 + }, + { + "epoch": 1.4517684887459807, + "grad_norm": 1.4308907985687256, + "learning_rate": 2.7554479418886196e-05, + "loss": 1.0153, + "step": 18060 + }, + { + "epoch": 1.452572347266881, + "grad_norm": 1.526442050933838, + "learning_rate": 2.751412429378531e-05, + "loss": 0.8933, + "step": 18070 + }, + { + "epoch": 1.4533762057877815, + "grad_norm": 1.9068630933761597, + "learning_rate": 2.7473769168684426e-05, + "loss": 0.9328, + "step": 18080 + }, + { + "epoch": 1.4541800643086817, + "grad_norm": 1.8453564643859863, + "learning_rate": 2.7433414043583537e-05, + "loss": 0.9831, + "step": 18090 + }, + { + "epoch": 1.454983922829582, + "grad_norm": 1.743841290473938, + "learning_rate": 2.7393058918482652e-05, + "loss": 1.0306, + "step": 18100 + }, + { + "epoch": 1.4557877813504823, + "grad_norm": 1.9150433540344238, + "learning_rate": 2.7352703793381763e-05, + "loss": 1.1571, + "step": 18110 + }, + { + "epoch": 1.4565916398713825, + "grad_norm": 2.1634671688079834, + "learning_rate": 2.7312348668280875e-05, + "loss": 0.9588, + "step": 18120 + }, + { + "epoch": 1.457395498392283, + "grad_norm": 1.8215436935424805, + "learning_rate": 2.7271993543179986e-05, + "loss": 1.1203, + "step": 18130 + }, + { + "epoch": 1.4581993569131833, + "grad_norm": 1.6399837732315063, + "learning_rate": 2.7231638418079097e-05, + "loss": 1.0164, + "step": 18140 + }, + { + "epoch": 1.4590032154340835, + "grad_norm": 2.877127170562744, + "learning_rate": 2.719128329297821e-05, + "loss": 1.0546, + "step": 18150 + }, + { + "epoch": 1.459807073954984, + "grad_norm": 2.673527717590332, + "learning_rate": 2.715092816787732e-05, + "loss": 0.9971, + "step": 18160 + }, + { + "epoch": 1.4606109324758842, + "grad_norm": 1.6876904964447021, + "learning_rate": 2.711057304277643e-05, + "loss": 0.991, + "step": 18170 + }, + { + "epoch": 1.4614147909967845, + "grad_norm": 2.3266868591308594, + "learning_rate": 2.7070217917675543e-05, + "loss": 0.9961, + "step": 18180 + }, + { + "epoch": 1.462218649517685, + "grad_norm": 4.623996734619141, + "learning_rate": 2.7029862792574658e-05, + "loss": 0.9838, + "step": 18190 + }, + { + "epoch": 1.4630225080385852, + "grad_norm": 1.4012879133224487, + "learning_rate": 2.698950766747377e-05, + "loss": 1.0015, + "step": 18200 + }, + { + "epoch": 1.4638263665594855, + "grad_norm": 1.4820497035980225, + "learning_rate": 2.6949152542372884e-05, + "loss": 1.0434, + "step": 18210 + }, + { + "epoch": 1.464630225080386, + "grad_norm": 1.5504990816116333, + "learning_rate": 2.6908797417272e-05, + "loss": 0.9886, + "step": 18220 + }, + { + "epoch": 1.4654340836012862, + "grad_norm": 1.4550745487213135, + "learning_rate": 2.686844229217111e-05, + "loss": 1.0432, + "step": 18230 + }, + { + "epoch": 1.4662379421221865, + "grad_norm": 1.790533423423767, + "learning_rate": 2.682808716707022e-05, + "loss": 0.9935, + "step": 18240 + }, + { + "epoch": 
1.4670418006430868, + "grad_norm": 2.5110254287719727, + "learning_rate": 2.6787732041969332e-05, + "loss": 0.9956, + "step": 18250 + }, + { + "epoch": 1.467845659163987, + "grad_norm": 1.7681093215942383, + "learning_rate": 2.6747376916868444e-05, + "loss": 1.0096, + "step": 18260 + }, + { + "epoch": 1.4686495176848875, + "grad_norm": 2.1158554553985596, + "learning_rate": 2.6707021791767555e-05, + "loss": 1.0045, + "step": 18270 + }, + { + "epoch": 1.4694533762057878, + "grad_norm": 1.3785464763641357, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.9776, + "step": 18280 + }, + { + "epoch": 1.470257234726688, + "grad_norm": 3.132347583770752, + "learning_rate": 2.6626311541565778e-05, + "loss": 0.9183, + "step": 18290 + }, + { + "epoch": 1.4710610932475885, + "grad_norm": 1.5687378644943237, + "learning_rate": 2.6585956416464893e-05, + "loss": 1.0263, + "step": 18300 + }, + { + "epoch": 1.4718649517684887, + "grad_norm": 1.2770296335220337, + "learning_rate": 2.6545601291364004e-05, + "loss": 0.8838, + "step": 18310 + }, + { + "epoch": 1.472668810289389, + "grad_norm": 1.7857643365859985, + "learning_rate": 2.6505246166263115e-05, + "loss": 0.9491, + "step": 18320 + }, + { + "epoch": 1.4734726688102895, + "grad_norm": 2.11606502532959, + "learning_rate": 2.6464891041162227e-05, + "loss": 1.1035, + "step": 18330 + }, + { + "epoch": 1.4742765273311897, + "grad_norm": 1.88895583152771, + "learning_rate": 2.6424535916061345e-05, + "loss": 1.0237, + "step": 18340 + }, + { + "epoch": 1.47508038585209, + "grad_norm": 2.3139593601226807, + "learning_rate": 2.6384180790960456e-05, + "loss": 1.0029, + "step": 18350 + }, + { + "epoch": 1.4758842443729905, + "grad_norm": 1.4084957838058472, + "learning_rate": 2.6343825665859568e-05, + "loss": 0.9766, + "step": 18360 + }, + { + "epoch": 1.4766881028938907, + "grad_norm": 1.5821465253829956, + "learning_rate": 2.630347054075868e-05, + "loss": 1.0581, + "step": 18370 + }, + { + "epoch": 1.477491961414791, + "grad_norm": 1.7947590351104736, + "learning_rate": 2.626311541565779e-05, + "loss": 0.8901, + "step": 18380 + }, + { + "epoch": 1.4782958199356913, + "grad_norm": 2.6382060050964355, + "learning_rate": 2.6222760290556902e-05, + "loss": 1.0321, + "step": 18390 + }, + { + "epoch": 1.4790996784565915, + "grad_norm": 1.636795997619629, + "learning_rate": 2.6182405165456013e-05, + "loss": 1.073, + "step": 18400 + }, + { + "epoch": 1.479903536977492, + "grad_norm": 1.451741337776184, + "learning_rate": 2.6142050040355124e-05, + "loss": 0.9504, + "step": 18410 + }, + { + "epoch": 1.4807073954983923, + "grad_norm": 1.6076511144638062, + "learning_rate": 2.610169491525424e-05, + "loss": 1.0157, + "step": 18420 + }, + { + "epoch": 1.4815112540192925, + "grad_norm": 1.6737004518508911, + "learning_rate": 2.606133979015335e-05, + "loss": 0.9674, + "step": 18430 + }, + { + "epoch": 1.482315112540193, + "grad_norm": 1.4484913349151611, + "learning_rate": 2.6020984665052462e-05, + "loss": 0.9444, + "step": 18440 + }, + { + "epoch": 1.4831189710610932, + "grad_norm": 1.8022609949111938, + "learning_rate": 2.5980629539951573e-05, + "loss": 1.045, + "step": 18450 + }, + { + "epoch": 1.4839228295819935, + "grad_norm": 1.4874672889709473, + "learning_rate": 2.5940274414850685e-05, + "loss": 0.9895, + "step": 18460 + }, + { + "epoch": 1.484726688102894, + "grad_norm": 1.8093199729919434, + "learning_rate": 2.5899919289749803e-05, + "loss": 1.0687, + "step": 18470 + }, + { + "epoch": 1.4855305466237942, + "grad_norm": 2.6661243438720703, + "learning_rate": 
2.5859564164648914e-05, + "loss": 1.1001, + "step": 18480 + }, + { + "epoch": 1.4863344051446945, + "grad_norm": 2.4542813301086426, + "learning_rate": 2.5819209039548026e-05, + "loss": 1.0176, + "step": 18490 + }, + { + "epoch": 1.487138263665595, + "grad_norm": 1.6964333057403564, + "learning_rate": 2.5778853914447137e-05, + "loss": 0.8609, + "step": 18500 + }, + { + "epoch": 1.4879421221864952, + "grad_norm": 1.7633438110351562, + "learning_rate": 2.573849878934625e-05, + "loss": 1.0015, + "step": 18510 + }, + { + "epoch": 1.4887459807073955, + "grad_norm": 1.8267382383346558, + "learning_rate": 2.569814366424536e-05, + "loss": 1.1222, + "step": 18520 + }, + { + "epoch": 1.4895498392282958, + "grad_norm": 2.0830318927764893, + "learning_rate": 2.5657788539144474e-05, + "loss": 1.1318, + "step": 18530 + }, + { + "epoch": 1.490353697749196, + "grad_norm": 1.4734503030776978, + "learning_rate": 2.5617433414043586e-05, + "loss": 0.9939, + "step": 18540 + }, + { + "epoch": 1.4911575562700965, + "grad_norm": 2.606203079223633, + "learning_rate": 2.5577078288942697e-05, + "loss": 1.0286, + "step": 18550 + }, + { + "epoch": 1.4919614147909968, + "grad_norm": 3.1389873027801514, + "learning_rate": 2.553672316384181e-05, + "loss": 1.0314, + "step": 18560 + }, + { + "epoch": 1.492765273311897, + "grad_norm": 1.8032082319259644, + "learning_rate": 2.549636803874092e-05, + "loss": 1.0433, + "step": 18570 + }, + { + "epoch": 1.4935691318327975, + "grad_norm": 2.33467173576355, + "learning_rate": 2.545601291364003e-05, + "loss": 1.0446, + "step": 18580 + }, + { + "epoch": 1.4943729903536977, + "grad_norm": 2.114933490753174, + "learning_rate": 2.5415657788539143e-05, + "loss": 1.0179, + "step": 18590 + }, + { + "epoch": 1.495176848874598, + "grad_norm": 1.1957430839538574, + "learning_rate": 2.5375302663438254e-05, + "loss": 1.0259, + "step": 18600 + }, + { + "epoch": 1.4959807073954985, + "grad_norm": 1.8650671243667603, + "learning_rate": 2.5334947538337372e-05, + "loss": 0.934, + "step": 18610 + }, + { + "epoch": 1.4967845659163987, + "grad_norm": 1.485527753829956, + "learning_rate": 2.5294592413236483e-05, + "loss": 1.0833, + "step": 18620 + }, + { + "epoch": 1.497588424437299, + "grad_norm": 1.4764578342437744, + "learning_rate": 2.5254237288135595e-05, + "loss": 0.9306, + "step": 18630 + }, + { + "epoch": 1.4983922829581995, + "grad_norm": 2.014840602874756, + "learning_rate": 2.5213882163034706e-05, + "loss": 0.9984, + "step": 18640 + }, + { + "epoch": 1.4991961414790997, + "grad_norm": 1.6000443696975708, + "learning_rate": 2.517352703793382e-05, + "loss": 1.0134, + "step": 18650 + }, + { + "epoch": 1.5, + "grad_norm": 1.7050846815109253, + "learning_rate": 2.5133171912832932e-05, + "loss": 0.9482, + "step": 18660 + }, + { + "epoch": 1.5008038585209005, + "grad_norm": 1.9964686632156372, + "learning_rate": 2.5092816787732044e-05, + "loss": 1.0569, + "step": 18670 + }, + { + "epoch": 1.5016077170418005, + "grad_norm": 1.524829387664795, + "learning_rate": 2.5052461662631155e-05, + "loss": 1.126, + "step": 18680 + }, + { + "epoch": 1.502411575562701, + "grad_norm": 1.630293607711792, + "learning_rate": 2.5012106537530266e-05, + "loss": 1.0917, + "step": 18690 + }, + { + "epoch": 1.5032154340836013, + "grad_norm": 1.819430947303772, + "learning_rate": 2.497175141242938e-05, + "loss": 0.9468, + "step": 18700 + }, + { + "epoch": 1.5040192926045015, + "grad_norm": 1.743780493736267, + "learning_rate": 2.4931396287328493e-05, + "loss": 0.9956, + "step": 18710 + }, + { + "epoch": 1.504823151125402, + 
"grad_norm": 1.9177618026733398, + "learning_rate": 2.4891041162227604e-05, + "loss": 0.9998, + "step": 18720 + }, + { + "epoch": 1.5056270096463023, + "grad_norm": 1.604160189628601, + "learning_rate": 2.4850686037126715e-05, + "loss": 0.9501, + "step": 18730 + }, + { + "epoch": 1.5064308681672025, + "grad_norm": 2.4709489345550537, + "learning_rate": 2.4810330912025827e-05, + "loss": 1.013, + "step": 18740 + }, + { + "epoch": 1.507234726688103, + "grad_norm": 2.3379924297332764, + "learning_rate": 2.476997578692494e-05, + "loss": 1.0388, + "step": 18750 + }, + { + "epoch": 1.5080385852090032, + "grad_norm": 1.7188684940338135, + "learning_rate": 2.4729620661824053e-05, + "loss": 1.03, + "step": 18760 + }, + { + "epoch": 1.5088424437299035, + "grad_norm": 3.812397003173828, + "learning_rate": 2.4689265536723168e-05, + "loss": 1.0547, + "step": 18770 + }, + { + "epoch": 1.509646302250804, + "grad_norm": 1.712337851524353, + "learning_rate": 2.464891041162228e-05, + "loss": 1.0071, + "step": 18780 + }, + { + "epoch": 1.510450160771704, + "grad_norm": 2.7004611492156982, + "learning_rate": 2.460855528652139e-05, + "loss": 1.1183, + "step": 18790 + }, + { + "epoch": 1.5112540192926045, + "grad_norm": 1.4686659574508667, + "learning_rate": 2.45682001614205e-05, + "loss": 0.9288, + "step": 18800 + }, + { + "epoch": 1.512057877813505, + "grad_norm": 1.755505919456482, + "learning_rate": 2.4527845036319613e-05, + "loss": 1.0728, + "step": 18810 + }, + { + "epoch": 1.512861736334405, + "grad_norm": 4.083408832550049, + "learning_rate": 2.4487489911218724e-05, + "loss": 0.9492, + "step": 18820 + }, + { + "epoch": 1.5136655948553055, + "grad_norm": 1.7185946702957153, + "learning_rate": 2.4447134786117836e-05, + "loss": 0.8437, + "step": 18830 + }, + { + "epoch": 1.5144694533762058, + "grad_norm": 1.4967041015625, + "learning_rate": 2.440677966101695e-05, + "loss": 0.9956, + "step": 18840 + }, + { + "epoch": 1.515273311897106, + "grad_norm": 1.3148831129074097, + "learning_rate": 2.4366424535916062e-05, + "loss": 0.9735, + "step": 18850 + }, + { + "epoch": 1.5160771704180065, + "grad_norm": 1.886210322380066, + "learning_rate": 2.4326069410815173e-05, + "loss": 1.0393, + "step": 18860 + }, + { + "epoch": 1.5168810289389068, + "grad_norm": 1.3368439674377441, + "learning_rate": 2.4285714285714288e-05, + "loss": 0.9677, + "step": 18870 + }, + { + "epoch": 1.517684887459807, + "grad_norm": 1.8139729499816895, + "learning_rate": 2.42453591606134e-05, + "loss": 1.0196, + "step": 18880 + }, + { + "epoch": 1.5184887459807075, + "grad_norm": 2.2600748538970947, + "learning_rate": 2.420500403551251e-05, + "loss": 1.0883, + "step": 18890 + }, + { + "epoch": 1.5192926045016077, + "grad_norm": 2.071951389312744, + "learning_rate": 2.4164648910411625e-05, + "loss": 0.9809, + "step": 18900 + }, + { + "epoch": 1.520096463022508, + "grad_norm": 3.309037685394287, + "learning_rate": 2.4124293785310737e-05, + "loss": 0.9566, + "step": 18910 + }, + { + "epoch": 1.5209003215434085, + "grad_norm": 1.5997929573059082, + "learning_rate": 2.4083938660209848e-05, + "loss": 1.0077, + "step": 18920 + }, + { + "epoch": 1.5217041800643085, + "grad_norm": 2.123086929321289, + "learning_rate": 2.404358353510896e-05, + "loss": 1.2028, + "step": 18930 + }, + { + "epoch": 1.522508038585209, + "grad_norm": 2.219896078109741, + "learning_rate": 2.400322841000807e-05, + "loss": 1.0008, + "step": 18940 + }, + { + "epoch": 1.5233118971061095, + "grad_norm": 1.7317404747009277, + "learning_rate": 2.3962873284907182e-05, + "loss": 1.1455, 
+ "step": 18950 + }, + { + "epoch": 1.5241157556270095, + "grad_norm": 1.5645885467529297, + "learning_rate": 2.3922518159806294e-05, + "loss": 0.9871, + "step": 18960 + }, + { + "epoch": 1.52491961414791, + "grad_norm": 1.4457658529281616, + "learning_rate": 2.388216303470541e-05, + "loss": 1.0322, + "step": 18970 + }, + { + "epoch": 1.5257234726688103, + "grad_norm": 1.5804071426391602, + "learning_rate": 2.3841807909604523e-05, + "loss": 1.0548, + "step": 18980 + }, + { + "epoch": 1.5265273311897105, + "grad_norm": 1.545935869216919, + "learning_rate": 2.3801452784503634e-05, + "loss": 1.0185, + "step": 18990 + }, + { + "epoch": 1.527331189710611, + "grad_norm": 2.3571431636810303, + "learning_rate": 2.3761097659402746e-05, + "loss": 1.0888, + "step": 19000 + }, + { + "epoch": 1.5281350482315113, + "grad_norm": 1.867585301399231, + "learning_rate": 2.3720742534301857e-05, + "loss": 0.9695, + "step": 19010 + }, + { + "epoch": 1.5289389067524115, + "grad_norm": 2.631490468978882, + "learning_rate": 2.368038740920097e-05, + "loss": 0.9966, + "step": 19020 + }, + { + "epoch": 1.529742765273312, + "grad_norm": 1.501379370689392, + "learning_rate": 2.3640032284100083e-05, + "loss": 0.9514, + "step": 19030 + }, + { + "epoch": 1.5305466237942122, + "grad_norm": 1.7019522190093994, + "learning_rate": 2.3599677158999195e-05, + "loss": 1.0341, + "step": 19040 + }, + { + "epoch": 1.5313504823151125, + "grad_norm": 2.4788031578063965, + "learning_rate": 2.3559322033898306e-05, + "loss": 0.9932, + "step": 19050 + }, + { + "epoch": 1.532154340836013, + "grad_norm": 1.5615988969802856, + "learning_rate": 2.3518966908797417e-05, + "loss": 1.016, + "step": 19060 + }, + { + "epoch": 1.532958199356913, + "grad_norm": 1.50218665599823, + "learning_rate": 2.347861178369653e-05, + "loss": 1.0888, + "step": 19070 + }, + { + "epoch": 1.5337620578778135, + "grad_norm": 1.6570556163787842, + "learning_rate": 2.3438256658595644e-05, + "loss": 0.8625, + "step": 19080 + }, + { + "epoch": 1.534565916398714, + "grad_norm": 2.3664464950561523, + "learning_rate": 2.3397901533494755e-05, + "loss": 1.0421, + "step": 19090 + }, + { + "epoch": 1.535369774919614, + "grad_norm": 2.178680181503296, + "learning_rate": 2.335754640839387e-05, + "loss": 1.017, + "step": 19100 + }, + { + "epoch": 1.5361736334405145, + "grad_norm": 1.6579536199569702, + "learning_rate": 2.331719128329298e-05, + "loss": 1.046, + "step": 19110 + }, + { + "epoch": 1.5369774919614148, + "grad_norm": 3.513983726501465, + "learning_rate": 2.3276836158192092e-05, + "loss": 1.0616, + "step": 19120 + }, + { + "epoch": 1.537781350482315, + "grad_norm": 2.0785059928894043, + "learning_rate": 2.3236481033091204e-05, + "loss": 0.9541, + "step": 19130 + }, + { + "epoch": 1.5385852090032155, + "grad_norm": 1.9155325889587402, + "learning_rate": 2.3196125907990315e-05, + "loss": 1.0438, + "step": 19140 + }, + { + "epoch": 1.5393890675241158, + "grad_norm": 2.0071442127227783, + "learning_rate": 2.3155770782889426e-05, + "loss": 0.9538, + "step": 19150 + }, + { + "epoch": 1.540192926045016, + "grad_norm": 2.359255313873291, + "learning_rate": 2.3115415657788538e-05, + "loss": 0.8942, + "step": 19160 + }, + { + "epoch": 1.5409967845659165, + "grad_norm": 1.5436384677886963, + "learning_rate": 2.3075060532687653e-05, + "loss": 0.9818, + "step": 19170 + }, + { + "epoch": 1.5418006430868167, + "grad_norm": 1.5936756134033203, + "learning_rate": 2.3034705407586764e-05, + "loss": 1.0091, + "step": 19180 + }, + { + "epoch": 1.542604501607717, + "grad_norm": 
2.787342071533203, + "learning_rate": 2.2994350282485875e-05, + "loss": 1.0598, + "step": 19190 + }, + { + "epoch": 1.5434083601286175, + "grad_norm": 2.2612195014953613, + "learning_rate": 2.295399515738499e-05, + "loss": 1.1404, + "step": 19200 + }, + { + "epoch": 1.5442122186495175, + "grad_norm": 3.06294584274292, + "learning_rate": 2.29136400322841e-05, + "loss": 1.1383, + "step": 19210 + }, + { + "epoch": 1.545016077170418, + "grad_norm": 2.707146644592285, + "learning_rate": 2.2873284907183213e-05, + "loss": 0.9306, + "step": 19220 + }, + { + "epoch": 1.5458199356913185, + "grad_norm": 1.7285178899765015, + "learning_rate": 2.2832929782082328e-05, + "loss": 1.0923, + "step": 19230 + }, + { + "epoch": 1.5466237942122185, + "grad_norm": 1.8518304824829102, + "learning_rate": 2.279257465698144e-05, + "loss": 1.0411, + "step": 19240 + }, + { + "epoch": 1.547427652733119, + "grad_norm": 2.2175393104553223, + "learning_rate": 2.275221953188055e-05, + "loss": 1.0426, + "step": 19250 + }, + { + "epoch": 1.5482315112540193, + "grad_norm": 1.9369691610336304, + "learning_rate": 2.271186440677966e-05, + "loss": 0.952, + "step": 19260 + }, + { + "epoch": 1.5490353697749195, + "grad_norm": 2.876096725463867, + "learning_rate": 2.2671509281678773e-05, + "loss": 1.0326, + "step": 19270 + }, + { + "epoch": 1.54983922829582, + "grad_norm": 1.7443819046020508, + "learning_rate": 2.2631154156577884e-05, + "loss": 1.1068, + "step": 19280 + }, + { + "epoch": 1.5506430868167203, + "grad_norm": 1.8379837274551392, + "learning_rate": 2.2590799031476996e-05, + "loss": 1.0209, + "step": 19290 + }, + { + "epoch": 1.5514469453376205, + "grad_norm": 1.8857988119125366, + "learning_rate": 2.255044390637611e-05, + "loss": 1.1399, + "step": 19300 + }, + { + "epoch": 1.552250803858521, + "grad_norm": 1.553567886352539, + "learning_rate": 2.2510088781275225e-05, + "loss": 0.9507, + "step": 19310 + }, + { + "epoch": 1.5530546623794212, + "grad_norm": 1.7635307312011719, + "learning_rate": 2.2469733656174337e-05, + "loss": 0.9853, + "step": 19320 + }, + { + "epoch": 1.5538585209003215, + "grad_norm": 2.045858860015869, + "learning_rate": 2.2429378531073448e-05, + "loss": 0.9357, + "step": 19330 + }, + { + "epoch": 1.554662379421222, + "grad_norm": 1.603248953819275, + "learning_rate": 2.238902340597256e-05, + "loss": 0.9685, + "step": 19340 + }, + { + "epoch": 1.555466237942122, + "grad_norm": 2.1176679134368896, + "learning_rate": 2.234866828087167e-05, + "loss": 1.1459, + "step": 19350 + }, + { + "epoch": 1.5562700964630225, + "grad_norm": 1.4296454191207886, + "learning_rate": 2.2308313155770785e-05, + "loss": 1.0233, + "step": 19360 + }, + { + "epoch": 1.557073954983923, + "grad_norm": 1.4487553834915161, + "learning_rate": 2.2267958030669897e-05, + "loss": 1.0689, + "step": 19370 + }, + { + "epoch": 1.557877813504823, + "grad_norm": 1.4253681898117065, + "learning_rate": 2.2227602905569008e-05, + "loss": 1.0238, + "step": 19380 + }, + { + "epoch": 1.5586816720257235, + "grad_norm": 2.9197847843170166, + "learning_rate": 2.218724778046812e-05, + "loss": 0.9624, + "step": 19390 + }, + { + "epoch": 1.5594855305466238, + "grad_norm": 1.4034744501113892, + "learning_rate": 2.214689265536723e-05, + "loss": 1.1532, + "step": 19400 + }, + { + "epoch": 1.560289389067524, + "grad_norm": 1.631948709487915, + "learning_rate": 2.2106537530266346e-05, + "loss": 1.0096, + "step": 19410 + }, + { + "epoch": 1.5610932475884245, + "grad_norm": 1.686239242553711, + "learning_rate": 2.2066182405165457e-05, + "loss": 0.9291, + "step": 
19420 + }, + { + "epoch": 1.5618971061093248, + "grad_norm": 1.7068378925323486, + "learning_rate": 2.2025827280064572e-05, + "loss": 1.0057, + "step": 19430 + }, + { + "epoch": 1.562700964630225, + "grad_norm": 2.1420505046844482, + "learning_rate": 2.1985472154963683e-05, + "loss": 1.0527, + "step": 19440 + }, + { + "epoch": 1.5635048231511255, + "grad_norm": 1.7549521923065186, + "learning_rate": 2.1945117029862795e-05, + "loss": 1.0262, + "step": 19450 + }, + { + "epoch": 1.5643086816720257, + "grad_norm": 2.7982192039489746, + "learning_rate": 2.1904761904761906e-05, + "loss": 0.9992, + "step": 19460 + }, + { + "epoch": 1.565112540192926, + "grad_norm": 1.7734447717666626, + "learning_rate": 2.1864406779661017e-05, + "loss": 0.9371, + "step": 19470 + }, + { + "epoch": 1.5659163987138265, + "grad_norm": 2.7094318866729736, + "learning_rate": 2.182405165456013e-05, + "loss": 1.0623, + "step": 19480 + }, + { + "epoch": 1.5667202572347267, + "grad_norm": 2.0985891819000244, + "learning_rate": 2.178369652945924e-05, + "loss": 1.1057, + "step": 19490 + }, + { + "epoch": 1.567524115755627, + "grad_norm": 2.8843741416931152, + "learning_rate": 2.1743341404358355e-05, + "loss": 1.089, + "step": 19500 + }, + { + "epoch": 1.5683279742765275, + "grad_norm": 2.498589038848877, + "learning_rate": 2.1702986279257466e-05, + "loss": 0.9217, + "step": 19510 + }, + { + "epoch": 1.5691318327974275, + "grad_norm": 3.109459400177002, + "learning_rate": 2.1662631154156577e-05, + "loss": 0.9486, + "step": 19520 + }, + { + "epoch": 1.569935691318328, + "grad_norm": 1.6474716663360596, + "learning_rate": 2.1622276029055692e-05, + "loss": 0.9679, + "step": 19530 + }, + { + "epoch": 1.5707395498392283, + "grad_norm": 2.079991102218628, + "learning_rate": 2.1581920903954804e-05, + "loss": 0.9412, + "step": 19540 + }, + { + "epoch": 1.5715434083601285, + "grad_norm": 2.6996138095855713, + "learning_rate": 2.1541565778853915e-05, + "loss": 0.881, + "step": 19550 + }, + { + "epoch": 1.572347266881029, + "grad_norm": 1.8078988790512085, + "learning_rate": 2.150121065375303e-05, + "loss": 1.062, + "step": 19560 + }, + { + "epoch": 1.5731511254019293, + "grad_norm": 1.8167033195495605, + "learning_rate": 2.146085552865214e-05, + "loss": 1.0287, + "step": 19570 + }, + { + "epoch": 1.5739549839228295, + "grad_norm": 1.9593679904937744, + "learning_rate": 2.1420500403551252e-05, + "loss": 1.0442, + "step": 19580 + }, + { + "epoch": 1.57475884244373, + "grad_norm": 2.62164568901062, + "learning_rate": 2.1380145278450364e-05, + "loss": 1.0162, + "step": 19590 + }, + { + "epoch": 1.5755627009646302, + "grad_norm": 1.6859263181686401, + "learning_rate": 2.1339790153349475e-05, + "loss": 0.8896, + "step": 19600 + }, + { + "epoch": 1.5763665594855305, + "grad_norm": 2.3182373046875, + "learning_rate": 2.1299435028248587e-05, + "loss": 0.9433, + "step": 19610 + }, + { + "epoch": 1.577170418006431, + "grad_norm": 2.2859318256378174, + "learning_rate": 2.1259079903147698e-05, + "loss": 1.0641, + "step": 19620 + }, + { + "epoch": 1.5779742765273312, + "grad_norm": 4.590075492858887, + "learning_rate": 2.1218724778046813e-05, + "loss": 1.0006, + "step": 19630 + }, + { + "epoch": 1.5787781350482315, + "grad_norm": 1.6839805841445923, + "learning_rate": 2.1178369652945927e-05, + "loss": 1.1331, + "step": 19640 + }, + { + "epoch": 1.579581993569132, + "grad_norm": 1.8310493230819702, + "learning_rate": 2.113801452784504e-05, + "loss": 1.1686, + "step": 19650 + }, + { + "epoch": 1.580385852090032, + "grad_norm": 1.4298887252807617, + 
"learning_rate": 2.109765940274415e-05, + "loss": 0.9636, + "step": 19660 + }, + { + "epoch": 1.5811897106109325, + "grad_norm": 2.003631353378296, + "learning_rate": 2.105730427764326e-05, + "loss": 0.8991, + "step": 19670 + }, + { + "epoch": 1.5819935691318328, + "grad_norm": 1.770169973373413, + "learning_rate": 2.1016949152542373e-05, + "loss": 1.0196, + "step": 19680 + }, + { + "epoch": 1.582797427652733, + "grad_norm": 3.1248345375061035, + "learning_rate": 2.0976594027441488e-05, + "loss": 1.057, + "step": 19690 + }, + { + "epoch": 1.5836012861736335, + "grad_norm": 1.6270357370376587, + "learning_rate": 2.09362389023406e-05, + "loss": 1.0216, + "step": 19700 + }, + { + "epoch": 1.5844051446945338, + "grad_norm": 2.0626096725463867, + "learning_rate": 2.089588377723971e-05, + "loss": 1.0524, + "step": 19710 + }, + { + "epoch": 1.585209003215434, + "grad_norm": 2.2830660343170166, + "learning_rate": 2.0855528652138822e-05, + "loss": 1.0987, + "step": 19720 + }, + { + "epoch": 1.5860128617363345, + "grad_norm": 1.951399803161621, + "learning_rate": 2.0815173527037933e-05, + "loss": 0.907, + "step": 19730 + }, + { + "epoch": 1.5868167202572347, + "grad_norm": 3.4775824546813965, + "learning_rate": 2.0774818401937048e-05, + "loss": 1.0355, + "step": 19740 + }, + { + "epoch": 1.587620578778135, + "grad_norm": 1.6632407903671265, + "learning_rate": 2.073446327683616e-05, + "loss": 1.0947, + "step": 19750 + }, + { + "epoch": 1.5884244372990355, + "grad_norm": 1.8771129846572876, + "learning_rate": 2.0694108151735274e-05, + "loss": 0.993, + "step": 19760 + }, + { + "epoch": 1.5892282958199357, + "grad_norm": 1.9345898628234863, + "learning_rate": 2.0653753026634385e-05, + "loss": 0.9974, + "step": 19770 + }, + { + "epoch": 1.590032154340836, + "grad_norm": 1.9626299142837524, + "learning_rate": 2.0613397901533497e-05, + "loss": 1.053, + "step": 19780 + }, + { + "epoch": 1.5908360128617365, + "grad_norm": 1.9407083988189697, + "learning_rate": 2.0573042776432608e-05, + "loss": 0.9735, + "step": 19790 + }, + { + "epoch": 1.5916398713826365, + "grad_norm": 1.5738252401351929, + "learning_rate": 2.053268765133172e-05, + "loss": 0.9544, + "step": 19800 + }, + { + "epoch": 1.592443729903537, + "grad_norm": 1.4336590766906738, + "learning_rate": 2.049233252623083e-05, + "loss": 0.9506, + "step": 19810 + }, + { + "epoch": 1.5932475884244373, + "grad_norm": 2.0407192707061768, + "learning_rate": 2.0451977401129946e-05, + "loss": 1.0241, + "step": 19820 + }, + { + "epoch": 1.5940514469453375, + "grad_norm": 1.9290578365325928, + "learning_rate": 2.0411622276029057e-05, + "loss": 0.9519, + "step": 19830 + }, + { + "epoch": 1.594855305466238, + "grad_norm": 1.5437543392181396, + "learning_rate": 2.0371267150928168e-05, + "loss": 0.9266, + "step": 19840 + }, + { + "epoch": 1.5956591639871383, + "grad_norm": 1.6253694295883179, + "learning_rate": 2.033091202582728e-05, + "loss": 1.0576, + "step": 19850 + }, + { + "epoch": 1.5964630225080385, + "grad_norm": 2.7759857177734375, + "learning_rate": 2.0290556900726394e-05, + "loss": 1.0464, + "step": 19860 + }, + { + "epoch": 1.597266881028939, + "grad_norm": 1.6483947038650513, + "learning_rate": 2.0250201775625506e-05, + "loss": 0.9771, + "step": 19870 + }, + { + "epoch": 1.5980707395498392, + "grad_norm": 1.52219820022583, + "learning_rate": 2.0209846650524617e-05, + "loss": 0.9558, + "step": 19880 + }, + { + "epoch": 1.5988745980707395, + "grad_norm": 1.429937481880188, + "learning_rate": 2.0169491525423732e-05, + "loss": 0.95, + "step": 19890 + }, + { + 
"epoch": 1.59967845659164, + "grad_norm": 1.8432539701461792, + "learning_rate": 2.0129136400322843e-05, + "loss": 1.0402, + "step": 19900 + }, + { + "epoch": 1.6004823151125402, + "grad_norm": 1.6091192960739136, + "learning_rate": 2.0088781275221955e-05, + "loss": 1.0194, + "step": 19910 + }, + { + "epoch": 1.6012861736334405, + "grad_norm": 2.1835174560546875, + "learning_rate": 2.0048426150121066e-05, + "loss": 0.9036, + "step": 19920 + }, + { + "epoch": 1.602090032154341, + "grad_norm": 1.876979947090149, + "learning_rate": 2.0008071025020177e-05, + "loss": 1.0052, + "step": 19930 + }, + { + "epoch": 1.602893890675241, + "grad_norm": 1.3231323957443237, + "learning_rate": 1.996771589991929e-05, + "loss": 0.9938, + "step": 19940 + }, + { + "epoch": 1.6036977491961415, + "grad_norm": 2.0215790271759033, + "learning_rate": 1.99273607748184e-05, + "loss": 1.0777, + "step": 19950 + }, + { + "epoch": 1.6045016077170418, + "grad_norm": 1.4858729839324951, + "learning_rate": 1.9887005649717515e-05, + "loss": 0.9382, + "step": 19960 + }, + { + "epoch": 1.605305466237942, + "grad_norm": 1.8996670246124268, + "learning_rate": 1.984665052461663e-05, + "loss": 1.0613, + "step": 19970 + }, + { + "epoch": 1.6061093247588425, + "grad_norm": 1.6118154525756836, + "learning_rate": 1.980629539951574e-05, + "loss": 1.0106, + "step": 19980 + }, + { + "epoch": 1.6069131832797428, + "grad_norm": 2.23201847076416, + "learning_rate": 1.9765940274414852e-05, + "loss": 0.982, + "step": 19990 + }, + { + "epoch": 1.607717041800643, + "grad_norm": 2.0391416549682617, + "learning_rate": 1.9725585149313964e-05, + "loss": 1.0126, + "step": 20000 + }, + { + "epoch": 1.607717041800643, + "eval_yahma/alpaca-cleaned_loss": 1.2087749242782593, + "eval_yahma/alpaca-cleaned_runtime": 115.7363, + "eval_yahma/alpaca-cleaned_samples_per_second": 17.281, + "eval_yahma/alpaca-cleaned_steps_per_second": 2.16, + "step": 20000 + }, + { + "epoch": 1.6085209003215435, + "grad_norm": 1.4210705757141113, + "learning_rate": 1.9685230024213075e-05, + "loss": 0.9249, + "step": 20010 + }, + { + "epoch": 1.6093247588424437, + "grad_norm": 1.8334485292434692, + "learning_rate": 1.964487489911219e-05, + "loss": 1.1195, + "step": 20020 + }, + { + "epoch": 1.610128617363344, + "grad_norm": 1.5518391132354736, + "learning_rate": 1.96045197740113e-05, + "loss": 0.9409, + "step": 20030 + }, + { + "epoch": 1.6109324758842445, + "grad_norm": 1.3544491529464722, + "learning_rate": 1.9564164648910413e-05, + "loss": 0.9035, + "step": 20040 + }, + { + "epoch": 1.6117363344051447, + "grad_norm": 1.4562795162200928, + "learning_rate": 1.9523809523809524e-05, + "loss": 1.0332, + "step": 20050 + }, + { + "epoch": 1.612540192926045, + "grad_norm": 1.9066096544265747, + "learning_rate": 1.9483454398708635e-05, + "loss": 0.9654, + "step": 20060 + }, + { + "epoch": 1.6133440514469455, + "grad_norm": 2.1256048679351807, + "learning_rate": 1.944309927360775e-05, + "loss": 1.1411, + "step": 20070 + }, + { + "epoch": 1.6141479099678455, + "grad_norm": 2.309495449066162, + "learning_rate": 1.940274414850686e-05, + "loss": 0.9443, + "step": 20080 + }, + { + "epoch": 1.614951768488746, + "grad_norm": 1.5459156036376953, + "learning_rate": 1.9362389023405976e-05, + "loss": 1.0025, + "step": 20090 + }, + { + "epoch": 1.6157556270096463, + "grad_norm": 1.8760969638824463, + "learning_rate": 1.9322033898305087e-05, + "loss": 1.1442, + "step": 20100 + }, + { + "epoch": 1.6165594855305465, + "grad_norm": 2.290050506591797, + "learning_rate": 1.92816787732042e-05, + "loss": 
1.0269, + "step": 20110 + }, + { + "epoch": 1.617363344051447, + "grad_norm": 1.5273579359054565, + "learning_rate": 1.924132364810331e-05, + "loss": 0.9352, + "step": 20120 + }, + { + "epoch": 1.6181672025723473, + "grad_norm": 1.4741283655166626, + "learning_rate": 1.920096852300242e-05, + "loss": 0.962, + "step": 20130 + }, + { + "epoch": 1.6189710610932475, + "grad_norm": 2.3171274662017822, + "learning_rate": 1.9160613397901533e-05, + "loss": 1.0589, + "step": 20140 + }, + { + "epoch": 1.619774919614148, + "grad_norm": 2.144007444381714, + "learning_rate": 1.9120258272800648e-05, + "loss": 1.0362, + "step": 20150 + }, + { + "epoch": 1.6205787781350482, + "grad_norm": 2.317404270172119, + "learning_rate": 1.907990314769976e-05, + "loss": 0.972, + "step": 20160 + }, + { + "epoch": 1.6213826366559485, + "grad_norm": 1.6954485177993774, + "learning_rate": 1.903954802259887e-05, + "loss": 1.0767, + "step": 20170 + }, + { + "epoch": 1.622186495176849, + "grad_norm": 1.7256394624710083, + "learning_rate": 1.8999192897497982e-05, + "loss": 1.0305, + "step": 20180 + }, + { + "epoch": 1.6229903536977492, + "grad_norm": 1.4580116271972656, + "learning_rate": 1.8958837772397097e-05, + "loss": 0.9812, + "step": 20190 + }, + { + "epoch": 1.6237942122186495, + "grad_norm": 2.0422894954681396, + "learning_rate": 1.8918482647296208e-05, + "loss": 0.9744, + "step": 20200 + }, + { + "epoch": 1.62459807073955, + "grad_norm": 4.765537738800049, + "learning_rate": 1.887812752219532e-05, + "loss": 1.0139, + "step": 20210 + }, + { + "epoch": 1.62540192926045, + "grad_norm": 2.389866828918457, + "learning_rate": 1.8837772397094434e-05, + "loss": 1.1453, + "step": 20220 + }, + { + "epoch": 1.6262057877813505, + "grad_norm": 1.4161128997802734, + "learning_rate": 1.8797417271993545e-05, + "loss": 0.9437, + "step": 20230 + }, + { + "epoch": 1.6270096463022508, + "grad_norm": 1.9139914512634277, + "learning_rate": 1.8757062146892657e-05, + "loss": 1.0214, + "step": 20240 + }, + { + "epoch": 1.627813504823151, + "grad_norm": 1.738409161567688, + "learning_rate": 1.8716707021791768e-05, + "loss": 1.026, + "step": 20250 + }, + { + "epoch": 1.6286173633440515, + "grad_norm": 2.470611333847046, + "learning_rate": 1.867635189669088e-05, + "loss": 1.0338, + "step": 20260 + }, + { + "epoch": 1.6294212218649518, + "grad_norm": 1.3114768266677856, + "learning_rate": 1.863599677158999e-05, + "loss": 1.0561, + "step": 20270 + }, + { + "epoch": 1.630225080385852, + "grad_norm": 1.7882541418075562, + "learning_rate": 1.8595641646489102e-05, + "loss": 1.0617, + "step": 20280 + }, + { + "epoch": 1.6310289389067525, + "grad_norm": 2.458045244216919, + "learning_rate": 1.8555286521388217e-05, + "loss": 0.9396, + "step": 20290 + }, + { + "epoch": 1.6318327974276527, + "grad_norm": 2.7625696659088135, + "learning_rate": 1.851493139628733e-05, + "loss": 0.9944, + "step": 20300 + }, + { + "epoch": 1.632636655948553, + "grad_norm": 1.760848879814148, + "learning_rate": 1.8474576271186443e-05, + "loss": 0.9271, + "step": 20310 + }, + { + "epoch": 1.6334405144694535, + "grad_norm": 1.9309755563735962, + "learning_rate": 1.8434221146085554e-05, + "loss": 1.0556, + "step": 20320 + }, + { + "epoch": 1.6342443729903537, + "grad_norm": 2.2995901107788086, + "learning_rate": 1.8393866020984666e-05, + "loss": 1.0896, + "step": 20330 + }, + { + "epoch": 1.635048231511254, + "grad_norm": 2.4556405544281006, + "learning_rate": 1.8353510895883777e-05, + "loss": 0.929, + "step": 20340 + }, + { + "epoch": 1.6358520900321545, + "grad_norm": 
2.014082431793213, + "learning_rate": 1.8313155770782892e-05, + "loss": 0.9454, + "step": 20350 + }, + { + "epoch": 1.6366559485530545, + "grad_norm": 1.7907990217208862, + "learning_rate": 1.8272800645682003e-05, + "loss": 0.956, + "step": 20360 + }, + { + "epoch": 1.637459807073955, + "grad_norm": 1.6757053136825562, + "learning_rate": 1.8232445520581115e-05, + "loss": 1.0474, + "step": 20370 + }, + { + "epoch": 1.6382636655948553, + "grad_norm": 1.374757170677185, + "learning_rate": 1.8192090395480226e-05, + "loss": 0.8232, + "step": 20380 + }, + { + "epoch": 1.6390675241157555, + "grad_norm": 1.8884340524673462, + "learning_rate": 1.8151735270379337e-05, + "loss": 0.9609, + "step": 20390 + }, + { + "epoch": 1.639871382636656, + "grad_norm": 1.5904606580734253, + "learning_rate": 1.811138014527845e-05, + "loss": 0.9653, + "step": 20400 + }, + { + "epoch": 1.6406752411575563, + "grad_norm": 1.5124174356460571, + "learning_rate": 1.8071025020177564e-05, + "loss": 1.0028, + "step": 20410 + }, + { + "epoch": 1.6414790996784565, + "grad_norm": 2.074727773666382, + "learning_rate": 1.8030669895076678e-05, + "loss": 0.9784, + "step": 20420 + }, + { + "epoch": 1.642282958199357, + "grad_norm": 3.159682035446167, + "learning_rate": 1.799031476997579e-05, + "loss": 1.0564, + "step": 20430 + }, + { + "epoch": 1.6430868167202572, + "grad_norm": 2.477046012878418, + "learning_rate": 1.79499596448749e-05, + "loss": 0.9725, + "step": 20440 + }, + { + "epoch": 1.6438906752411575, + "grad_norm": 1.718367576599121, + "learning_rate": 1.7909604519774012e-05, + "loss": 0.9811, + "step": 20450 + }, + { + "epoch": 1.644694533762058, + "grad_norm": 2.1779701709747314, + "learning_rate": 1.7869249394673124e-05, + "loss": 0.9761, + "step": 20460 + }, + { + "epoch": 1.6454983922829582, + "grad_norm": 2.076404094696045, + "learning_rate": 1.7828894269572235e-05, + "loss": 0.9694, + "step": 20470 + }, + { + "epoch": 1.6463022508038585, + "grad_norm": 1.9192440509796143, + "learning_rate": 1.778853914447135e-05, + "loss": 1.0546, + "step": 20480 + }, + { + "epoch": 1.647106109324759, + "grad_norm": 3.688655138015747, + "learning_rate": 1.774818401937046e-05, + "loss": 1.0202, + "step": 20490 + }, + { + "epoch": 1.647909967845659, + "grad_norm": 1.932462215423584, + "learning_rate": 1.7707828894269573e-05, + "loss": 0.9977, + "step": 20500 + }, + { + "epoch": 1.6487138263665595, + "grad_norm": 2.313615322113037, + "learning_rate": 1.7667473769168684e-05, + "loss": 1.0029, + "step": 20510 + }, + { + "epoch": 1.6495176848874598, + "grad_norm": 1.4296419620513916, + "learning_rate": 1.76271186440678e-05, + "loss": 1.0456, + "step": 20520 + }, + { + "epoch": 1.65032154340836, + "grad_norm": 2.292299747467041, + "learning_rate": 1.758676351896691e-05, + "loss": 1.0806, + "step": 20530 + }, + { + "epoch": 1.6511254019292605, + "grad_norm": 2.147498369216919, + "learning_rate": 1.754640839386602e-05, + "loss": 1.0389, + "step": 20540 + }, + { + "epoch": 1.6519292604501608, + "grad_norm": 1.8983690738677979, + "learning_rate": 1.7506053268765136e-05, + "loss": 0.9936, + "step": 20550 + }, + { + "epoch": 1.652733118971061, + "grad_norm": 1.4978177547454834, + "learning_rate": 1.7465698143664248e-05, + "loss": 1.0733, + "step": 20560 + }, + { + "epoch": 1.6535369774919615, + "grad_norm": 3.058656692504883, + "learning_rate": 1.742534301856336e-05, + "loss": 0.8716, + "step": 20570 + }, + { + "epoch": 1.6543408360128617, + "grad_norm": 1.651357889175415, + "learning_rate": 1.738498789346247e-05, + "loss": 1.0162, + "step": 
20580 + }, + { + "epoch": 1.655144694533762, + "grad_norm": 1.9416139125823975, + "learning_rate": 1.734463276836158e-05, + "loss": 0.999, + "step": 20590 + }, + { + "epoch": 1.6559485530546625, + "grad_norm": 2.939375400543213, + "learning_rate": 1.7304277643260693e-05, + "loss": 1.0727, + "step": 20600 + }, + { + "epoch": 1.6567524115755627, + "grad_norm": 2.1564691066741943, + "learning_rate": 1.7263922518159804e-05, + "loss": 1.0074, + "step": 20610 + }, + { + "epoch": 1.657556270096463, + "grad_norm": 2.003150701522827, + "learning_rate": 1.722356739305892e-05, + "loss": 1.1418, + "step": 20620 + }, + { + "epoch": 1.6583601286173635, + "grad_norm": 1.420788049697876, + "learning_rate": 1.718321226795803e-05, + "loss": 0.8737, + "step": 20630 + }, + { + "epoch": 1.6591639871382635, + "grad_norm": 1.5822371244430542, + "learning_rate": 1.7142857142857145e-05, + "loss": 0.9014, + "step": 20640 + }, + { + "epoch": 1.659967845659164, + "grad_norm": 1.6281102895736694, + "learning_rate": 1.7102502017756257e-05, + "loss": 0.8962, + "step": 20650 + }, + { + "epoch": 1.6607717041800643, + "grad_norm": 2.509737491607666, + "learning_rate": 1.7062146892655368e-05, + "loss": 0.9401, + "step": 20660 + }, + { + "epoch": 1.6615755627009645, + "grad_norm": 1.9507536888122559, + "learning_rate": 1.702179176755448e-05, + "loss": 0.9771, + "step": 20670 + }, + { + "epoch": 1.662379421221865, + "grad_norm": 2.431785821914673, + "learning_rate": 1.6981436642453594e-05, + "loss": 1.0511, + "step": 20680 + }, + { + "epoch": 1.6631832797427653, + "grad_norm": 2.554717540740967, + "learning_rate": 1.6941081517352705e-05, + "loss": 0.9411, + "step": 20690 + }, + { + "epoch": 1.6639871382636655, + "grad_norm": 2.0644285678863525, + "learning_rate": 1.6900726392251817e-05, + "loss": 0.9656, + "step": 20700 + }, + { + "epoch": 1.664790996784566, + "grad_norm": 1.919416069984436, + "learning_rate": 1.6860371267150928e-05, + "loss": 0.8719, + "step": 20710 + }, + { + "epoch": 1.6655948553054662, + "grad_norm": 1.4564859867095947, + "learning_rate": 1.682001614205004e-05, + "loss": 0.8866, + "step": 20720 + }, + { + "epoch": 1.6663987138263665, + "grad_norm": 2.2104880809783936, + "learning_rate": 1.677966101694915e-05, + "loss": 1.0019, + "step": 20730 + }, + { + "epoch": 1.667202572347267, + "grad_norm": 1.7495410442352295, + "learning_rate": 1.6739305891848266e-05, + "loss": 1.1196, + "step": 20740 + }, + { + "epoch": 1.6680064308681672, + "grad_norm": 1.9178359508514404, + "learning_rate": 1.669895076674738e-05, + "loss": 1.0092, + "step": 20750 + }, + { + "epoch": 1.6688102893890675, + "grad_norm": 2.948913812637329, + "learning_rate": 1.6658595641646492e-05, + "loss": 0.9096, + "step": 20760 + }, + { + "epoch": 1.669614147909968, + "grad_norm": 1.9237521886825562, + "learning_rate": 1.6618240516545603e-05, + "loss": 1.0704, + "step": 20770 + }, + { + "epoch": 1.670418006430868, + "grad_norm": 2.0535099506378174, + "learning_rate": 1.6577885391444715e-05, + "loss": 0.9703, + "step": 20780 + }, + { + "epoch": 1.6712218649517685, + "grad_norm": 2.0139174461364746, + "learning_rate": 1.6537530266343826e-05, + "loss": 0.9179, + "step": 20790 + }, + { + "epoch": 1.6720257234726688, + "grad_norm": 1.4897915124893188, + "learning_rate": 1.6497175141242937e-05, + "loss": 0.9509, + "step": 20800 + }, + { + "epoch": 1.672829581993569, + "grad_norm": 2.042825937271118, + "learning_rate": 1.6456820016142052e-05, + "loss": 0.8686, + "step": 20810 + }, + { + "epoch": 1.6736334405144695, + "grad_norm": 2.0794460773468018, + 
"learning_rate": 1.6416464891041163e-05, + "loss": 1.0376, + "step": 20820 + }, + { + "epoch": 1.6744372990353698, + "grad_norm": 1.6484997272491455, + "learning_rate": 1.6380145278450364e-05, + "loss": 0.7965, + "step": 20830 + }, + { + "epoch": 1.67524115755627, + "grad_norm": 1.6736949682235718, + "learning_rate": 1.6339790153349476e-05, + "loss": 0.9812, + "step": 20840 + }, + { + "epoch": 1.6760450160771705, + "grad_norm": 1.5247830152511597, + "learning_rate": 1.6299435028248587e-05, + "loss": 0.9694, + "step": 20850 + }, + { + "epoch": 1.6768488745980707, + "grad_norm": 2.0274441242218018, + "learning_rate": 1.6259079903147702e-05, + "loss": 1.0599, + "step": 20860 + }, + { + "epoch": 1.677652733118971, + "grad_norm": 1.847269058227539, + "learning_rate": 1.6218724778046813e-05, + "loss": 0.9876, + "step": 20870 + }, + { + "epoch": 1.6784565916398715, + "grad_norm": 2.002469301223755, + "learning_rate": 1.6178369652945924e-05, + "loss": 1.0208, + "step": 20880 + }, + { + "epoch": 1.6792604501607717, + "grad_norm": 1.3857256174087524, + "learning_rate": 1.613801452784504e-05, + "loss": 1.0983, + "step": 20890 + }, + { + "epoch": 1.680064308681672, + "grad_norm": 1.6506829261779785, + "learning_rate": 1.609765940274415e-05, + "loss": 0.9916, + "step": 20900 + }, + { + "epoch": 1.6808681672025725, + "grad_norm": 1.6178910732269287, + "learning_rate": 1.6057304277643262e-05, + "loss": 0.8901, + "step": 20910 + }, + { + "epoch": 1.6816720257234725, + "grad_norm": 2.434352397918701, + "learning_rate": 1.6016949152542373e-05, + "loss": 0.9237, + "step": 20920 + }, + { + "epoch": 1.682475884244373, + "grad_norm": 1.6109217405319214, + "learning_rate": 1.5976594027441485e-05, + "loss": 1.1368, + "step": 20930 + }, + { + "epoch": 1.6832797427652733, + "grad_norm": 2.161876678466797, + "learning_rate": 1.5936238902340596e-05, + "loss": 1.0398, + "step": 20940 + }, + { + "epoch": 1.6840836012861735, + "grad_norm": 1.8207350969314575, + "learning_rate": 1.5895883777239707e-05, + "loss": 0.95, + "step": 20950 + }, + { + "epoch": 1.684887459807074, + "grad_norm": 1.5055922269821167, + "learning_rate": 1.5855528652138822e-05, + "loss": 1.0249, + "step": 20960 + }, + { + "epoch": 1.6856913183279743, + "grad_norm": 1.4368202686309814, + "learning_rate": 1.5815173527037937e-05, + "loss": 1.0758, + "step": 20970 + }, + { + "epoch": 1.6864951768488745, + "grad_norm": 3.190652847290039, + "learning_rate": 1.5774818401937048e-05, + "loss": 1.1185, + "step": 20980 + }, + { + "epoch": 1.687299035369775, + "grad_norm": 3.332230567932129, + "learning_rate": 1.573446327683616e-05, + "loss": 1.0188, + "step": 20990 + }, + { + "epoch": 1.6881028938906752, + "grad_norm": 2.1496293544769287, + "learning_rate": 1.569410815173527e-05, + "loss": 1.0209, + "step": 21000 + }, + { + "epoch": 1.6889067524115755, + "grad_norm": 1.6579251289367676, + "learning_rate": 1.5653753026634382e-05, + "loss": 0.9776, + "step": 21010 + }, + { + "epoch": 1.689710610932476, + "grad_norm": 2.849315881729126, + "learning_rate": 1.5613397901533494e-05, + "loss": 0.9393, + "step": 21020 + }, + { + "epoch": 1.6905144694533762, + "grad_norm": 1.442819595336914, + "learning_rate": 1.557304277643261e-05, + "loss": 1.192, + "step": 21030 + }, + { + "epoch": 1.6913183279742765, + "grad_norm": 1.701747179031372, + "learning_rate": 1.553268765133172e-05, + "loss": 1.0754, + "step": 21040 + }, + { + "epoch": 1.692122186495177, + "grad_norm": 1.8358919620513916, + "learning_rate": 1.549233252623083e-05, + "loss": 0.9335, + "step": 21050 + }, + { + 
"epoch": 1.692926045016077, + "grad_norm": 2.1475460529327393, + "learning_rate": 1.5451977401129943e-05, + "loss": 0.897, + "step": 21060 + }, + { + "epoch": 1.6937299035369775, + "grad_norm": 2.243940830230713, + "learning_rate": 1.5411622276029057e-05, + "loss": 1.0031, + "step": 21070 + }, + { + "epoch": 1.694533762057878, + "grad_norm": 2.350111246109009, + "learning_rate": 1.537126715092817e-05, + "loss": 1.1756, + "step": 21080 + }, + { + "epoch": 1.695337620578778, + "grad_norm": 1.7451907396316528, + "learning_rate": 1.5330912025827283e-05, + "loss": 1.0719, + "step": 21090 + }, + { + "epoch": 1.6961414790996785, + "grad_norm": 1.607172966003418, + "learning_rate": 1.5290556900726395e-05, + "loss": 1.0103, + "step": 21100 + }, + { + "epoch": 1.6969453376205788, + "grad_norm": 1.8843369483947754, + "learning_rate": 1.5250201775625506e-05, + "loss": 1.0322, + "step": 21110 + }, + { + "epoch": 1.697749196141479, + "grad_norm": 1.918947458267212, + "learning_rate": 1.5209846650524618e-05, + "loss": 0.9708, + "step": 21120 + }, + { + "epoch": 1.6985530546623795, + "grad_norm": 1.444896936416626, + "learning_rate": 1.5169491525423729e-05, + "loss": 0.9646, + "step": 21130 + }, + { + "epoch": 1.6993569131832797, + "grad_norm": 1.6434109210968018, + "learning_rate": 1.512913640032284e-05, + "loss": 0.9987, + "step": 21140 + }, + { + "epoch": 1.70016077170418, + "grad_norm": 2.072153091430664, + "learning_rate": 1.5088781275221953e-05, + "loss": 1.1089, + "step": 21150 + }, + { + "epoch": 1.7009646302250805, + "grad_norm": 2.1333658695220947, + "learning_rate": 1.5048426150121066e-05, + "loss": 0.9657, + "step": 21160 + }, + { + "epoch": 1.7017684887459807, + "grad_norm": 1.6481126546859741, + "learning_rate": 1.500807102502018e-05, + "loss": 0.9615, + "step": 21170 + }, + { + "epoch": 1.702572347266881, + "grad_norm": 1.8588632345199585, + "learning_rate": 1.496771589991929e-05, + "loss": 1.0058, + "step": 21180 + }, + { + "epoch": 1.7033762057877815, + "grad_norm": 1.585315465927124, + "learning_rate": 1.4927360774818402e-05, + "loss": 0.9077, + "step": 21190 + }, + { + "epoch": 1.7041800643086815, + "grad_norm": 1.5307081937789917, + "learning_rate": 1.4887005649717514e-05, + "loss": 0.996, + "step": 21200 + }, + { + "epoch": 1.704983922829582, + "grad_norm": 1.5457983016967773, + "learning_rate": 1.4846650524616627e-05, + "loss": 0.9997, + "step": 21210 + }, + { + "epoch": 1.7057877813504825, + "grad_norm": 1.6371421813964844, + "learning_rate": 1.480629539951574e-05, + "loss": 1.1061, + "step": 21220 + }, + { + "epoch": 1.7065916398713825, + "grad_norm": 2.6698966026306152, + "learning_rate": 1.4765940274414853e-05, + "loss": 1.0207, + "step": 21230 + }, + { + "epoch": 1.707395498392283, + "grad_norm": 2.1621170043945312, + "learning_rate": 1.4725585149313964e-05, + "loss": 1.042, + "step": 21240 + }, + { + "epoch": 1.7081993569131833, + "grad_norm": 1.244496464729309, + "learning_rate": 1.4685230024213075e-05, + "loss": 0.9997, + "step": 21250 + }, + { + "epoch": 1.7090032154340835, + "grad_norm": 1.8812851905822754, + "learning_rate": 1.4644874899112187e-05, + "loss": 1.0055, + "step": 21260 + }, + { + "epoch": 1.709807073954984, + "grad_norm": 1.6612039804458618, + "learning_rate": 1.46045197740113e-05, + "loss": 1.0835, + "step": 21270 + }, + { + "epoch": 1.7106109324758842, + "grad_norm": 1.7879507541656494, + "learning_rate": 1.4564164648910411e-05, + "loss": 1.0079, + "step": 21280 + }, + { + "epoch": 1.7114147909967845, + "grad_norm": 1.9121391773223877, + "learning_rate": 
1.4523809523809526e-05, + "loss": 0.9546, + "step": 21290 + }, + { + "epoch": 1.712218649517685, + "grad_norm": 1.898646593093872, + "learning_rate": 1.4483454398708637e-05, + "loss": 0.9777, + "step": 21300 + }, + { + "epoch": 1.7130225080385852, + "grad_norm": 2.991395950317383, + "learning_rate": 1.4443099273607749e-05, + "loss": 1.0865, + "step": 21310 + }, + { + "epoch": 1.7138263665594855, + "grad_norm": 1.890356183052063, + "learning_rate": 1.440274414850686e-05, + "loss": 0.9562, + "step": 21320 + }, + { + "epoch": 1.714630225080386, + "grad_norm": 3.2333405017852783, + "learning_rate": 1.4362389023405973e-05, + "loss": 0.9115, + "step": 21330 + }, + { + "epoch": 1.715434083601286, + "grad_norm": 1.6147865056991577, + "learning_rate": 1.4322033898305085e-05, + "loss": 0.9789, + "step": 21340 + }, + { + "epoch": 1.7162379421221865, + "grad_norm": 1.7564023733139038, + "learning_rate": 1.42816787732042e-05, + "loss": 0.8648, + "step": 21350 + }, + { + "epoch": 1.717041800643087, + "grad_norm": 1.7845501899719238, + "learning_rate": 1.424132364810331e-05, + "loss": 1.1064, + "step": 21360 + }, + { + "epoch": 1.717845659163987, + "grad_norm": 1.3877874612808228, + "learning_rate": 1.4200968523002422e-05, + "loss": 1.0212, + "step": 21370 + }, + { + "epoch": 1.7186495176848875, + "grad_norm": 2.5669033527374268, + "learning_rate": 1.4160613397901535e-05, + "loss": 0.98, + "step": 21380 + }, + { + "epoch": 1.7194533762057878, + "grad_norm": 2.023080825805664, + "learning_rate": 1.4120258272800646e-05, + "loss": 1.0809, + "step": 21390 + }, + { + "epoch": 1.720257234726688, + "grad_norm": 3.2553956508636475, + "learning_rate": 1.4079903147699758e-05, + "loss": 0.8719, + "step": 21400 + }, + { + "epoch": 1.7210610932475885, + "grad_norm": 1.4506080150604248, + "learning_rate": 1.403954802259887e-05, + "loss": 0.9757, + "step": 21410 + }, + { + "epoch": 1.7218649517684887, + "grad_norm": 1.6034953594207764, + "learning_rate": 1.3999192897497984e-05, + "loss": 1.0364, + "step": 21420 + }, + { + "epoch": 1.722668810289389, + "grad_norm": 1.687049388885498, + "learning_rate": 1.3958837772397095e-05, + "loss": 1.0933, + "step": 21430 + }, + { + "epoch": 1.7234726688102895, + "grad_norm": 1.4316902160644531, + "learning_rate": 1.3918482647296208e-05, + "loss": 1.0475, + "step": 21440 + }, + { + "epoch": 1.7242765273311897, + "grad_norm": 2.5947353839874268, + "learning_rate": 1.387812752219532e-05, + "loss": 0.9705, + "step": 21450 + }, + { + "epoch": 1.72508038585209, + "grad_norm": 1.9034242630004883, + "learning_rate": 1.3837772397094431e-05, + "loss": 1.0019, + "step": 21460 + }, + { + "epoch": 1.7258842443729905, + "grad_norm": 1.8161375522613525, + "learning_rate": 1.3797417271993542e-05, + "loss": 0.9571, + "step": 21470 + }, + { + "epoch": 1.7266881028938905, + "grad_norm": 3.107001304626465, + "learning_rate": 1.3757062146892655e-05, + "loss": 1.1365, + "step": 21480 + }, + { + "epoch": 1.727491961414791, + "grad_norm": 1.6083005666732788, + "learning_rate": 1.3716707021791769e-05, + "loss": 1.0522, + "step": 21490 + }, + { + "epoch": 1.7282958199356915, + "grad_norm": 2.342423915863037, + "learning_rate": 1.3676351896690882e-05, + "loss": 0.9688, + "step": 21500 + }, + { + "epoch": 1.7290996784565915, + "grad_norm": 1.732395887374878, + "learning_rate": 1.3635996771589993e-05, + "loss": 1.0353, + "step": 21510 + }, + { + "epoch": 1.729903536977492, + "grad_norm": 2.039433240890503, + "learning_rate": 1.3595641646489104e-05, + "loss": 1.0223, + "step": 21520 + }, + { + "epoch": 
1.7307073954983923, + "grad_norm": 1.7352782487869263, + "learning_rate": 1.3555286521388216e-05, + "loss": 0.9912, + "step": 21530 + }, + { + "epoch": 1.7315112540192925, + "grad_norm": 1.8595890998840332, + "learning_rate": 1.3514931396287329e-05, + "loss": 0.9412, + "step": 21540 + }, + { + "epoch": 1.732315112540193, + "grad_norm": 1.7216383218765259, + "learning_rate": 1.3474576271186442e-05, + "loss": 0.9254, + "step": 21550 + }, + { + "epoch": 1.7331189710610932, + "grad_norm": 2.412393808364868, + "learning_rate": 1.3434221146085555e-05, + "loss": 1.0077, + "step": 21560 + }, + { + "epoch": 1.7339228295819935, + "grad_norm": 1.396226167678833, + "learning_rate": 1.3393866020984666e-05, + "loss": 1.0498, + "step": 21570 + }, + { + "epoch": 1.734726688102894, + "grad_norm": 2.1518938541412354, + "learning_rate": 1.3353510895883778e-05, + "loss": 1.0775, + "step": 21580 + }, + { + "epoch": 1.7355305466237942, + "grad_norm": 1.9329767227172852, + "learning_rate": 1.3313155770782889e-05, + "loss": 0.9458, + "step": 21590 + }, + { + "epoch": 1.7363344051446945, + "grad_norm": 2.125119209289551, + "learning_rate": 1.3272800645682002e-05, + "loss": 1.0538, + "step": 21600 + }, + { + "epoch": 1.737138263665595, + "grad_norm": 1.8424369096755981, + "learning_rate": 1.3232445520581113e-05, + "loss": 0.98, + "step": 21610 + }, + { + "epoch": 1.737942122186495, + "grad_norm": 2.402310848236084, + "learning_rate": 1.3192090395480228e-05, + "loss": 1.0943, + "step": 21620 + }, + { + "epoch": 1.7387459807073955, + "grad_norm": 2.104525089263916, + "learning_rate": 1.315173527037934e-05, + "loss": 0.9797, + "step": 21630 + }, + { + "epoch": 1.739549839228296, + "grad_norm": 2.5234835147857666, + "learning_rate": 1.3111380145278451e-05, + "loss": 1.0867, + "step": 21640 + }, + { + "epoch": 1.740353697749196, + "grad_norm": 2.5901427268981934, + "learning_rate": 1.3071025020177562e-05, + "loss": 0.9924, + "step": 21650 + }, + { + "epoch": 1.7411575562700965, + "grad_norm": 1.5725637674331665, + "learning_rate": 1.3030669895076675e-05, + "loss": 1.0287, + "step": 21660 + }, + { + "epoch": 1.7419614147909968, + "grad_norm": 1.4871200323104858, + "learning_rate": 1.2990314769975787e-05, + "loss": 1.004, + "step": 21670 + }, + { + "epoch": 1.742765273311897, + "grad_norm": 2.0282976627349854, + "learning_rate": 1.2949959644874901e-05, + "loss": 0.9733, + "step": 21680 + }, + { + "epoch": 1.7435691318327975, + "grad_norm": 1.8308261632919312, + "learning_rate": 1.2909604519774013e-05, + "loss": 1.0513, + "step": 21690 + }, + { + "epoch": 1.7443729903536977, + "grad_norm": 1.5598034858703613, + "learning_rate": 1.2869249394673124e-05, + "loss": 0.9191, + "step": 21700 + }, + { + "epoch": 1.745176848874598, + "grad_norm": 1.718119502067566, + "learning_rate": 1.2828894269572237e-05, + "loss": 0.9828, + "step": 21710 + }, + { + "epoch": 1.7459807073954985, + "grad_norm": 2.4848999977111816, + "learning_rate": 1.2788539144471349e-05, + "loss": 0.9463, + "step": 21720 + }, + { + "epoch": 1.7467845659163987, + "grad_norm": 2.8907392024993896, + "learning_rate": 1.274818401937046e-05, + "loss": 1.0016, + "step": 21730 + }, + { + "epoch": 1.747588424437299, + "grad_norm": 1.360023856163025, + "learning_rate": 1.2707828894269571e-05, + "loss": 0.891, + "step": 21740 + }, + { + "epoch": 1.7483922829581995, + "grad_norm": 2.0196022987365723, + "learning_rate": 1.2667473769168686e-05, + "loss": 1.0393, + "step": 21750 + }, + { + "epoch": 1.7491961414790995, + "grad_norm": 1.7127809524536133, + "learning_rate": 
1.2627118644067797e-05, + "loss": 1.08, + "step": 21760 + }, + { + "epoch": 1.75, + "grad_norm": 2.263516902923584, + "learning_rate": 1.258676351896691e-05, + "loss": 1.0002, + "step": 21770 + }, + { + "epoch": 1.7508038585209005, + "grad_norm": 1.9506158828735352, + "learning_rate": 1.2546408393866022e-05, + "loss": 1.0205, + "step": 21780 + }, + { + "epoch": 1.7516077170418005, + "grad_norm": 1.926055669784546, + "learning_rate": 1.2506053268765133e-05, + "loss": 0.8754, + "step": 21790 + }, + { + "epoch": 1.752411575562701, + "grad_norm": 1.4109805822372437, + "learning_rate": 1.2465698143664246e-05, + "loss": 0.9293, + "step": 21800 + }, + { + "epoch": 1.7532154340836013, + "grad_norm": 1.531019687652588, + "learning_rate": 1.2425343018563358e-05, + "loss": 1.0311, + "step": 21810 + }, + { + "epoch": 1.7540192926045015, + "grad_norm": 2.2952654361724854, + "learning_rate": 1.238498789346247e-05, + "loss": 0.9757, + "step": 21820 + }, + { + "epoch": 1.754823151125402, + "grad_norm": 2.8620126247406006, + "learning_rate": 1.2344632768361584e-05, + "loss": 0.9115, + "step": 21830 + }, + { + "epoch": 1.7556270096463023, + "grad_norm": 1.715909481048584, + "learning_rate": 1.2304277643260695e-05, + "loss": 1.0013, + "step": 21840 + }, + { + "epoch": 1.7564308681672025, + "grad_norm": 2.3659169673919678, + "learning_rate": 1.2263922518159806e-05, + "loss": 0.9478, + "step": 21850 + }, + { + "epoch": 1.757234726688103, + "grad_norm": 2.386011838912964, + "learning_rate": 1.2223567393058918e-05, + "loss": 1.0195, + "step": 21860 + }, + { + "epoch": 1.7580385852090032, + "grad_norm": 2.0422582626342773, + "learning_rate": 1.2183212267958031e-05, + "loss": 0.9469, + "step": 21870 + }, + { + "epoch": 1.7588424437299035, + "grad_norm": 1.73795747756958, + "learning_rate": 1.2142857142857144e-05, + "loss": 0.9687, + "step": 21880 + }, + { + "epoch": 1.759646302250804, + "grad_norm": 2.1826019287109375, + "learning_rate": 1.2102502017756255e-05, + "loss": 0.9925, + "step": 21890 + }, + { + "epoch": 1.760450160771704, + "grad_norm": 1.6059566736221313, + "learning_rate": 1.2062146892655368e-05, + "loss": 0.9586, + "step": 21900 + }, + { + "epoch": 1.7612540192926045, + "grad_norm": 1.6619853973388672, + "learning_rate": 1.202179176755448e-05, + "loss": 0.9419, + "step": 21910 + }, + { + "epoch": 1.762057877813505, + "grad_norm": 1.4436253309249878, + "learning_rate": 1.1981436642453591e-05, + "loss": 1.037, + "step": 21920 + }, + { + "epoch": 1.762861736334405, + "grad_norm": 1.9889531135559082, + "learning_rate": 1.1941081517352704e-05, + "loss": 1.0703, + "step": 21930 + }, + { + "epoch": 1.7636655948553055, + "grad_norm": 1.9301440715789795, + "learning_rate": 1.1900726392251817e-05, + "loss": 0.964, + "step": 21940 + }, + { + "epoch": 1.7644694533762058, + "grad_norm": 2.3707351684570312, + "learning_rate": 1.1860371267150929e-05, + "loss": 0.968, + "step": 21950 + }, + { + "epoch": 1.765273311897106, + "grad_norm": 1.9376273155212402, + "learning_rate": 1.1820016142050042e-05, + "loss": 0.8785, + "step": 21960 + }, + { + "epoch": 1.7660771704180065, + "grad_norm": 2.0454390048980713, + "learning_rate": 1.1779661016949153e-05, + "loss": 0.8985, + "step": 21970 + }, + { + "epoch": 1.7668810289389068, + "grad_norm": 2.0367326736450195, + "learning_rate": 1.1739305891848264e-05, + "loss": 0.8673, + "step": 21980 + }, + { + "epoch": 1.767684887459807, + "grad_norm": 1.7736015319824219, + "learning_rate": 1.1698950766747377e-05, + "loss": 0.9391, + "step": 21990 + }, + { + "epoch": 
1.7684887459807075, + "grad_norm": 1.967177391052246, + "learning_rate": 1.165859564164649e-05, + "loss": 1.0408, + "step": 22000 + }, + { + "epoch": 1.7684887459807075, + "eval_yahma/alpaca-cleaned_loss": 1.2060405015945435, + "eval_yahma/alpaca-cleaned_runtime": 115.751, + "eval_yahma/alpaca-cleaned_samples_per_second": 17.278, + "eval_yahma/alpaca-cleaned_steps_per_second": 2.16, + "step": 22000 + }, + { + "epoch": 1.7692926045016077, + "grad_norm": 2.349555492401123, + "learning_rate": 1.1618240516545602e-05, + "loss": 1.0432, + "step": 22010 + }, + { + "epoch": 1.770096463022508, + "grad_norm": 2.213883638381958, + "learning_rate": 1.1577885391444713e-05, + "loss": 0.8698, + "step": 22020 + }, + { + "epoch": 1.7709003215434085, + "grad_norm": 1.762036919593811, + "learning_rate": 1.1537530266343826e-05, + "loss": 0.8771, + "step": 22030 + }, + { + "epoch": 1.7717041800643085, + "grad_norm": 0.9419066905975342, + "learning_rate": 1.1497175141242938e-05, + "loss": 0.813, + "step": 22040 + }, + { + "epoch": 1.772508038585209, + "grad_norm": 1.4384018182754517, + "learning_rate": 1.145682001614205e-05, + "loss": 0.9929, + "step": 22050 + }, + { + "epoch": 1.7733118971061095, + "grad_norm": 2.7204196453094482, + "learning_rate": 1.1416464891041164e-05, + "loss": 0.9315, + "step": 22060 + }, + { + "epoch": 1.7741157556270095, + "grad_norm": 1.8378400802612305, + "learning_rate": 1.1376109765940275e-05, + "loss": 1.0011, + "step": 22070 + }, + { + "epoch": 1.77491961414791, + "grad_norm": 1.4868669509887695, + "learning_rate": 1.1335754640839387e-05, + "loss": 1.0055, + "step": 22080 + }, + { + "epoch": 1.7757234726688103, + "grad_norm": 1.6384830474853516, + "learning_rate": 1.1295399515738498e-05, + "loss": 1.095, + "step": 22090 + }, + { + "epoch": 1.7765273311897105, + "grad_norm": 1.8896377086639404, + "learning_rate": 1.1255044390637613e-05, + "loss": 1.0446, + "step": 22100 + }, + { + "epoch": 1.777331189710611, + "grad_norm": 1.6007559299468994, + "learning_rate": 1.1214689265536724e-05, + "loss": 1.0298, + "step": 22110 + }, + { + "epoch": 1.7781350482315113, + "grad_norm": 1.9358576536178589, + "learning_rate": 1.1174334140435835e-05, + "loss": 0.9502, + "step": 22120 + }, + { + "epoch": 1.7789389067524115, + "grad_norm": 1.6456011533737183, + "learning_rate": 1.1133979015334948e-05, + "loss": 0.9911, + "step": 22130 + }, + { + "epoch": 1.779742765273312, + "grad_norm": 2.3452653884887695, + "learning_rate": 1.109362389023406e-05, + "loss": 0.9298, + "step": 22140 + }, + { + "epoch": 1.7805466237942122, + "grad_norm": 1.9005801677703857, + "learning_rate": 1.1053268765133173e-05, + "loss": 0.9438, + "step": 22150 + }, + { + "epoch": 1.7813504823151125, + "grad_norm": 2.6864964962005615, + "learning_rate": 1.1012913640032286e-05, + "loss": 0.9279, + "step": 22160 + }, + { + "epoch": 1.782154340836013, + "grad_norm": 1.7829868793487549, + "learning_rate": 1.0972558514931397e-05, + "loss": 0.9321, + "step": 22170 + }, + { + "epoch": 1.782958199356913, + "grad_norm": 2.1801998615264893, + "learning_rate": 1.0932203389830509e-05, + "loss": 0.9647, + "step": 22180 + }, + { + "epoch": 1.7837620578778135, + "grad_norm": 1.683989405632019, + "learning_rate": 1.089184826472962e-05, + "loss": 0.971, + "step": 22190 + }, + { + "epoch": 1.784565916398714, + "grad_norm": 1.9461780786514282, + "learning_rate": 1.0851493139628733e-05, + "loss": 1.0014, + "step": 22200 + }, + { + "epoch": 1.785369774919614, + "grad_norm": 1.9797353744506836, + "learning_rate": 1.0811138014527846e-05, + "loss": 
1.0008, + "step": 22210 + }, + { + "epoch": 1.7861736334405145, + "grad_norm": 1.445884108543396, + "learning_rate": 1.0770782889426957e-05, + "loss": 0.9971, + "step": 22220 + }, + { + "epoch": 1.7869774919614148, + "grad_norm": 2.3050432205200195, + "learning_rate": 1.073042776432607e-05, + "loss": 0.9472, + "step": 22230 + }, + { + "epoch": 1.787781350482315, + "grad_norm": 1.5249042510986328, + "learning_rate": 1.0690072639225182e-05, + "loss": 1.1952, + "step": 22240 + }, + { + "epoch": 1.7885852090032155, + "grad_norm": 1.5306737422943115, + "learning_rate": 1.0649717514124293e-05, + "loss": 1.1246, + "step": 22250 + }, + { + "epoch": 1.7893890675241158, + "grad_norm": 1.9988269805908203, + "learning_rate": 1.0609362389023406e-05, + "loss": 1.0048, + "step": 22260 + }, + { + "epoch": 1.790192926045016, + "grad_norm": 1.5958483219146729, + "learning_rate": 1.056900726392252e-05, + "loss": 0.9573, + "step": 22270 + }, + { + "epoch": 1.7909967845659165, + "grad_norm": 1.8712174892425537, + "learning_rate": 1.052865213882163e-05, + "loss": 0.8928, + "step": 22280 + }, + { + "epoch": 1.7918006430868167, + "grad_norm": 1.641789197921753, + "learning_rate": 1.0488297013720744e-05, + "loss": 0.9694, + "step": 22290 + }, + { + "epoch": 1.792604501607717, + "grad_norm": 3.96578311920166, + "learning_rate": 1.0447941888619855e-05, + "loss": 1.0946, + "step": 22300 + }, + { + "epoch": 1.7934083601286175, + "grad_norm": 1.7721596956253052, + "learning_rate": 1.0407586763518967e-05, + "loss": 1.0712, + "step": 22310 + }, + { + "epoch": 1.7942122186495175, + "grad_norm": 1.861772894859314, + "learning_rate": 1.036723163841808e-05, + "loss": 0.9103, + "step": 22320 + }, + { + "epoch": 1.795016077170418, + "grad_norm": 1.7235984802246094, + "learning_rate": 1.0326876513317193e-05, + "loss": 1.0103, + "step": 22330 + }, + { + "epoch": 1.7958199356913185, + "grad_norm": 1.3846309185028076, + "learning_rate": 1.0286521388216304e-05, + "loss": 0.9419, + "step": 22340 + }, + { + "epoch": 1.7966237942122185, + "grad_norm": 3.1098134517669678, + "learning_rate": 1.0246166263115415e-05, + "loss": 0.9782, + "step": 22350 + }, + { + "epoch": 1.797427652733119, + "grad_norm": 2.1430604457855225, + "learning_rate": 1.0205811138014528e-05, + "loss": 0.9843, + "step": 22360 + }, + { + "epoch": 1.7982315112540193, + "grad_norm": 1.5648924112319946, + "learning_rate": 1.016545601291364e-05, + "loss": 0.9367, + "step": 22370 + }, + { + "epoch": 1.7990353697749195, + "grad_norm": 1.7868338823318481, + "learning_rate": 1.0125100887812753e-05, + "loss": 0.985, + "step": 22380 + }, + { + "epoch": 1.79983922829582, + "grad_norm": 2.122217893600464, + "learning_rate": 1.0084745762711866e-05, + "loss": 1.0524, + "step": 22390 + }, + { + "epoch": 1.8006430868167203, + "grad_norm": 3.732097625732422, + "learning_rate": 1.0044390637610977e-05, + "loss": 0.9872, + "step": 22400 + }, + { + "epoch": 1.8014469453376205, + "grad_norm": 1.2948299646377563, + "learning_rate": 1.0004035512510089e-05, + "loss": 0.9207, + "step": 22410 + }, + { + "epoch": 1.802250803858521, + "grad_norm": 1.9187204837799072, + "learning_rate": 9.9636803874092e-06, + "loss": 1.1478, + "step": 22420 + }, + { + "epoch": 1.8030546623794212, + "grad_norm": 1.4700504541397095, + "learning_rate": 9.923325262308315e-06, + "loss": 1.0152, + "step": 22430 + }, + { + "epoch": 1.8038585209003215, + "grad_norm": 1.5490188598632812, + "learning_rate": 9.882970137207426e-06, + "loss": 0.9741, + "step": 22440 + }, + { + "epoch": 1.804662379421222, + "grad_norm": 
1.7711931467056274, + "learning_rate": 9.842615012106538e-06, + "loss": 0.9791, + "step": 22450 + }, + { + "epoch": 1.805466237942122, + "grad_norm": 2.2049827575683594, + "learning_rate": 9.80225988700565e-06, + "loss": 0.9413, + "step": 22460 + }, + { + "epoch": 1.8062700964630225, + "grad_norm": 2.086760997772217, + "learning_rate": 9.761904761904762e-06, + "loss": 1.029, + "step": 22470 + }, + { + "epoch": 1.807073954983923, + "grad_norm": 4.496650218963623, + "learning_rate": 9.721549636803875e-06, + "loss": 1.0884, + "step": 22480 + }, + { + "epoch": 1.807877813504823, + "grad_norm": 1.6099193096160889, + "learning_rate": 9.681194511702988e-06, + "loss": 0.9469, + "step": 22490 + }, + { + "epoch": 1.8086816720257235, + "grad_norm": 2.1339876651763916, + "learning_rate": 9.6408393866021e-06, + "loss": 1.0124, + "step": 22500 + }, + { + "epoch": 1.8094855305466238, + "grad_norm": 1.8483517169952393, + "learning_rate": 9.60048426150121e-06, + "loss": 0.9827, + "step": 22510 + }, + { + "epoch": 1.810289389067524, + "grad_norm": 2.988699197769165, + "learning_rate": 9.560129136400324e-06, + "loss": 1.1299, + "step": 22520 + }, + { + "epoch": 1.8110932475884245, + "grad_norm": 1.7932300567626953, + "learning_rate": 9.519774011299435e-06, + "loss": 0.9944, + "step": 22530 + }, + { + "epoch": 1.8118971061093248, + "grad_norm": 2.6900038719177246, + "learning_rate": 9.479418886198548e-06, + "loss": 1.0668, + "step": 22540 + }, + { + "epoch": 1.812700964630225, + "grad_norm": 1.5085099935531616, + "learning_rate": 9.43906376109766e-06, + "loss": 1.0023, + "step": 22550 + }, + { + "epoch": 1.8135048231511255, + "grad_norm": 2.3199098110198975, + "learning_rate": 9.398708635996773e-06, + "loss": 1.1052, + "step": 22560 + }, + { + "epoch": 1.8143086816720257, + "grad_norm": 1.4957773685455322, + "learning_rate": 9.358353510895884e-06, + "loss": 1.0233, + "step": 22570 + }, + { + "epoch": 1.815112540192926, + "grad_norm": 2.718686819076538, + "learning_rate": 9.317998385794995e-06, + "loss": 1.0884, + "step": 22580 + }, + { + "epoch": 1.8159163987138265, + "grad_norm": 2.2220447063446045, + "learning_rate": 9.277643260694108e-06, + "loss": 0.9866, + "step": 22590 + }, + { + "epoch": 1.8167202572347267, + "grad_norm": 1.6045563220977783, + "learning_rate": 9.237288135593222e-06, + "loss": 1.0357, + "step": 22600 + }, + { + "epoch": 1.817524115755627, + "grad_norm": 1.5356566905975342, + "learning_rate": 9.196933010492333e-06, + "loss": 0.9883, + "step": 22610 + }, + { + "epoch": 1.8183279742765275, + "grad_norm": 1.7813926935195923, + "learning_rate": 9.156577885391446e-06, + "loss": 0.9772, + "step": 22620 + }, + { + "epoch": 1.8191318327974275, + "grad_norm": 2.866159439086914, + "learning_rate": 9.116222760290557e-06, + "loss": 0.9915, + "step": 22630 + }, + { + "epoch": 1.819935691318328, + "grad_norm": 2.222846508026123, + "learning_rate": 9.075867635189669e-06, + "loss": 1.0007, + "step": 22640 + }, + { + "epoch": 1.8207395498392283, + "grad_norm": 2.0583643913269043, + "learning_rate": 9.035512510088782e-06, + "loss": 1.0692, + "step": 22650 + }, + { + "epoch": 1.8215434083601285, + "grad_norm": 1.5456589460372925, + "learning_rate": 8.995157384987895e-06, + "loss": 0.9902, + "step": 22660 + }, + { + "epoch": 1.822347266881029, + "grad_norm": 1.7286393642425537, + "learning_rate": 8.954802259887006e-06, + "loss": 1.0302, + "step": 22670 + }, + { + "epoch": 1.8231511254019293, + "grad_norm": 1.6188042163848877, + "learning_rate": 8.914447134786118e-06, + "loss": 1.1087, + "step": 22680 + }, + 
{ + "epoch": 1.8239549839228295, + "grad_norm": 1.6614141464233398, + "learning_rate": 8.87409200968523e-06, + "loss": 1.0115, + "step": 22690 + }, + { + "epoch": 1.82475884244373, + "grad_norm": 1.9828251600265503, + "learning_rate": 8.833736884584342e-06, + "loss": 0.8943, + "step": 22700 + }, + { + "epoch": 1.8255627009646302, + "grad_norm": 1.587049126625061, + "learning_rate": 8.793381759483455e-06, + "loss": 0.9652, + "step": 22710 + }, + { + "epoch": 1.8263665594855305, + "grad_norm": 2.128166675567627, + "learning_rate": 8.753026634382568e-06, + "loss": 1.0803, + "step": 22720 + }, + { + "epoch": 1.827170418006431, + "grad_norm": 2.181746006011963, + "learning_rate": 8.71267150928168e-06, + "loss": 1.0535, + "step": 22730 + }, + { + "epoch": 1.8279742765273312, + "grad_norm": 1.4443950653076172, + "learning_rate": 8.67231638418079e-06, + "loss": 0.9807, + "step": 22740 + }, + { + "epoch": 1.8287781350482315, + "grad_norm": 4.426495552062988, + "learning_rate": 8.631961259079902e-06, + "loss": 0.9908, + "step": 22750 + }, + { + "epoch": 1.829581993569132, + "grad_norm": 1.8822588920593262, + "learning_rate": 8.591606133979015e-06, + "loss": 1.083, + "step": 22760 + }, + { + "epoch": 1.830385852090032, + "grad_norm": 1.6437195539474487, + "learning_rate": 8.551251008878128e-06, + "loss": 1.0005, + "step": 22770 + }, + { + "epoch": 1.8311897106109325, + "grad_norm": 1.8927476406097412, + "learning_rate": 8.51089588377724e-06, + "loss": 1.0809, + "step": 22780 + }, + { + "epoch": 1.8319935691318328, + "grad_norm": 2.30253267288208, + "learning_rate": 8.470540758676353e-06, + "loss": 0.9831, + "step": 22790 + }, + { + "epoch": 1.832797427652733, + "grad_norm": 2.003490447998047, + "learning_rate": 8.430185633575464e-06, + "loss": 1.022, + "step": 22800 + }, + { + "epoch": 1.8336012861736335, + "grad_norm": 3.780564308166504, + "learning_rate": 8.389830508474575e-06, + "loss": 0.9888, + "step": 22810 + }, + { + "epoch": 1.8344051446945338, + "grad_norm": 1.9336515665054321, + "learning_rate": 8.34947538337369e-06, + "loss": 1.0131, + "step": 22820 + }, + { + "epoch": 1.835209003215434, + "grad_norm": 1.8983111381530762, + "learning_rate": 8.309120258272802e-06, + "loss": 1.161, + "step": 22830 + }, + { + "epoch": 1.8360128617363345, + "grad_norm": 2.518303871154785, + "learning_rate": 8.268765133171913e-06, + "loss": 1.1041, + "step": 22840 + }, + { + "epoch": 1.8368167202572347, + "grad_norm": 1.8314872980117798, + "learning_rate": 8.228410008071026e-06, + "loss": 1.0529, + "step": 22850 + }, + { + "epoch": 1.837620578778135, + "grad_norm": 1.640785813331604, + "learning_rate": 8.188054882970137e-06, + "loss": 0.9747, + "step": 22860 + }, + { + "epoch": 1.8384244372990355, + "grad_norm": 1.3812037706375122, + "learning_rate": 8.14769975786925e-06, + "loss": 1.0021, + "step": 22870 + }, + { + "epoch": 1.8392282958199357, + "grad_norm": 1.4298020601272583, + "learning_rate": 8.107344632768362e-06, + "loss": 0.9967, + "step": 22880 + }, + { + "epoch": 1.840032154340836, + "grad_norm": 1.9113152027130127, + "learning_rate": 8.066989507667475e-06, + "loss": 0.8342, + "step": 22890 + }, + { + "epoch": 1.8408360128617365, + "grad_norm": 2.049980878829956, + "learning_rate": 8.026634382566586e-06, + "loss": 1.002, + "step": 22900 + }, + { + "epoch": 1.8416398713826365, + "grad_norm": 1.9057393074035645, + "learning_rate": 7.986279257465698e-06, + "loss": 0.9655, + "step": 22910 + }, + { + "epoch": 1.842443729903537, + "grad_norm": 1.5741013288497925, + "learning_rate": 7.94592413236481e-06, + 
"loss": 1.0502, + "step": 22920 + }, + { + "epoch": 1.8432475884244373, + "grad_norm": 2.031848192214966, + "learning_rate": 7.905569007263924e-06, + "loss": 1.0311, + "step": 22930 + }, + { + "epoch": 1.8440514469453375, + "grad_norm": 3.889958381652832, + "learning_rate": 7.865213882163035e-06, + "loss": 0.9455, + "step": 22940 + }, + { + "epoch": 1.844855305466238, + "grad_norm": 1.9395737648010254, + "learning_rate": 7.824858757062148e-06, + "loss": 0.9401, + "step": 22950 + }, + { + "epoch": 1.8456591639871383, + "grad_norm": 4.533421516418457, + "learning_rate": 7.78450363196126e-06, + "loss": 0.9171, + "step": 22960 + }, + { + "epoch": 1.8464630225080385, + "grad_norm": 1.5359047651290894, + "learning_rate": 7.74414850686037e-06, + "loss": 0.9464, + "step": 22970 + }, + { + "epoch": 1.847266881028939, + "grad_norm": 1.548888921737671, + "learning_rate": 7.703793381759484e-06, + "loss": 1.0143, + "step": 22980 + }, + { + "epoch": 1.8480707395498392, + "grad_norm": 1.649289608001709, + "learning_rate": 7.663438256658597e-06, + "loss": 1.1122, + "step": 22990 + }, + { + "epoch": 1.8488745980707395, + "grad_norm": 2.1079747676849365, + "learning_rate": 7.623083131557708e-06, + "loss": 0.9877, + "step": 23000 + }, + { + "epoch": 1.84967845659164, + "grad_norm": 1.611446738243103, + "learning_rate": 7.58272800645682e-06, + "loss": 0.9571, + "step": 23010 + }, + { + "epoch": 1.8504823151125402, + "grad_norm": 1.5056588649749756, + "learning_rate": 7.542372881355933e-06, + "loss": 0.9905, + "step": 23020 + }, + { + "epoch": 1.8512861736334405, + "grad_norm": 1.9945638179779053, + "learning_rate": 7.502017756255045e-06, + "loss": 1.0252, + "step": 23030 + }, + { + "epoch": 1.852090032154341, + "grad_norm": 1.8758174180984497, + "learning_rate": 7.461662631154156e-06, + "loss": 1.038, + "step": 23040 + }, + { + "epoch": 1.852893890675241, + "grad_norm": 2.0755865573883057, + "learning_rate": 7.425343018563358e-06, + "loss": 1.0526, + "step": 23050 + }, + { + "epoch": 1.8536977491961415, + "grad_norm": 1.5491857528686523, + "learning_rate": 7.3849878934624694e-06, + "loss": 1.0086, + "step": 23060 + }, + { + "epoch": 1.8545016077170418, + "grad_norm": 3.110924482345581, + "learning_rate": 7.3446327683615825e-06, + "loss": 0.9758, + "step": 23070 + }, + { + "epoch": 1.855305466237942, + "grad_norm": 1.3394163846969604, + "learning_rate": 7.304277643260695e-06, + "loss": 1.0135, + "step": 23080 + }, + { + "epoch": 1.8561093247588425, + "grad_norm": 1.9073662757873535, + "learning_rate": 7.263922518159806e-06, + "loss": 1.0456, + "step": 23090 + }, + { + "epoch": 1.8569131832797428, + "grad_norm": 1.4355270862579346, + "learning_rate": 7.223567393058919e-06, + "loss": 1.045, + "step": 23100 + }, + { + "epoch": 1.857717041800643, + "grad_norm": 2.270494222640991, + "learning_rate": 7.183212267958031e-06, + "loss": 0.9851, + "step": 23110 + }, + { + "epoch": 1.8585209003215435, + "grad_norm": 2.288757085800171, + "learning_rate": 7.142857142857143e-06, + "loss": 1.0242, + "step": 23120 + }, + { + "epoch": 1.8593247588424437, + "grad_norm": 1.5285948514938354, + "learning_rate": 7.102502017756256e-06, + "loss": 1.0258, + "step": 23130 + }, + { + "epoch": 1.860128617363344, + "grad_norm": 1.900128722190857, + "learning_rate": 7.062146892655368e-06, + "loss": 0.8494, + "step": 23140 + }, + { + "epoch": 1.8609324758842445, + "grad_norm": 1.519309639930725, + "learning_rate": 7.021791767554479e-06, + "loss": 1.0318, + "step": 23150 + }, + { + "epoch": 1.8617363344051447, + "grad_norm": 
1.7744423151016235, + "learning_rate": 6.9814366424535916e-06, + "loss": 0.9396, + "step": 23160 + }, + { + "epoch": 1.862540192926045, + "grad_norm": 1.5099612474441528, + "learning_rate": 6.941081517352705e-06, + "loss": 0.9436, + "step": 23170 + }, + { + "epoch": 1.8633440514469455, + "grad_norm": 1.8056023120880127, + "learning_rate": 6.900726392251816e-06, + "loss": 0.9437, + "step": 23180 + }, + { + "epoch": 1.8641479099678455, + "grad_norm": 2.491077423095703, + "learning_rate": 6.860371267150928e-06, + "loss": 0.9675, + "step": 23190 + }, + { + "epoch": 1.864951768488746, + "grad_norm": 1.5670075416564941, + "learning_rate": 6.820016142050041e-06, + "loss": 0.9382, + "step": 23200 + }, + { + "epoch": 1.8657556270096463, + "grad_norm": 1.6534690856933594, + "learning_rate": 6.779661016949153e-06, + "loss": 1.0577, + "step": 23210 + }, + { + "epoch": 1.8665594855305465, + "grad_norm": 1.7739144563674927, + "learning_rate": 6.739305891848265e-06, + "loss": 1.0413, + "step": 23220 + }, + { + "epoch": 1.867363344051447, + "grad_norm": 2.2446448802948, + "learning_rate": 6.698950766747378e-06, + "loss": 0.9624, + "step": 23230 + }, + { + "epoch": 1.8681672025723473, + "grad_norm": 1.8075047731399536, + "learning_rate": 6.658595641646489e-06, + "loss": 1.0003, + "step": 23240 + }, + { + "epoch": 1.8689710610932475, + "grad_norm": 1.8375296592712402, + "learning_rate": 6.6182405165456015e-06, + "loss": 0.9927, + "step": 23250 + }, + { + "epoch": 1.869774919614148, + "grad_norm": 1.985586166381836, + "learning_rate": 6.5778853914447145e-06, + "loss": 1.0433, + "step": 23260 + }, + { + "epoch": 1.8705787781350482, + "grad_norm": 1.910912275314331, + "learning_rate": 6.537530266343826e-06, + "loss": 0.9427, + "step": 23270 + }, + { + "epoch": 1.8713826366559485, + "grad_norm": 2.1846415996551514, + "learning_rate": 6.497175141242938e-06, + "loss": 0.9872, + "step": 23280 + }, + { + "epoch": 1.872186495176849, + "grad_norm": 1.6068943738937378, + "learning_rate": 6.4568200161420495e-06, + "loss": 0.9514, + "step": 23290 + }, + { + "epoch": 1.8729903536977492, + "grad_norm": 1.9065032005310059, + "learning_rate": 6.4164648910411625e-06, + "loss": 1.0191, + "step": 23300 + }, + { + "epoch": 1.8737942122186495, + "grad_norm": 1.668900489807129, + "learning_rate": 6.376109765940275e-06, + "loss": 0.964, + "step": 23310 + }, + { + "epoch": 1.87459807073955, + "grad_norm": 1.5289818048477173, + "learning_rate": 6.335754640839386e-06, + "loss": 0.9577, + "step": 23320 + }, + { + "epoch": 1.87540192926045, + "grad_norm": 1.8854244947433472, + "learning_rate": 6.295399515738499e-06, + "loss": 1.1001, + "step": 23330 + }, + { + "epoch": 1.8762057877813505, + "grad_norm": 2.1930129528045654, + "learning_rate": 6.255044390637611e-06, + "loss": 1.036, + "step": 23340 + }, + { + "epoch": 1.8770096463022508, + "grad_norm": 1.6574493646621704, + "learning_rate": 6.214689265536724e-06, + "loss": 1.0617, + "step": 23350 + }, + { + "epoch": 1.877813504823151, + "grad_norm": 1.4845741987228394, + "learning_rate": 6.174334140435836e-06, + "loss": 1.0416, + "step": 23360 + }, + { + "epoch": 1.8786173633440515, + "grad_norm": 1.7336281538009644, + "learning_rate": 6.133979015334948e-06, + "loss": 0.9253, + "step": 23370 + }, + { + "epoch": 1.8794212218649518, + "grad_norm": 1.5522409677505493, + "learning_rate": 6.093623890234059e-06, + "loss": 1.0626, + "step": 23380 + }, + { + "epoch": 1.880225080385852, + "grad_norm": 2.1845033168792725, + "learning_rate": 6.0532687651331724e-06, + "loss": 1.1008, + "step": 23390 
+ }, + { + "epoch": 1.8810289389067525, + "grad_norm": 1.3744220733642578, + "learning_rate": 6.012913640032285e-06, + "loss": 1.0222, + "step": 23400 + }, + { + "epoch": 1.8818327974276527, + "grad_norm": 1.8450615406036377, + "learning_rate": 5.972558514931397e-06, + "loss": 1.1045, + "step": 23410 + }, + { + "epoch": 1.882636655948553, + "grad_norm": 3.2560296058654785, + "learning_rate": 5.932203389830509e-06, + "loss": 1.0165, + "step": 23420 + }, + { + "epoch": 1.8834405144694535, + "grad_norm": 3.120931386947632, + "learning_rate": 5.8918482647296204e-06, + "loss": 0.9879, + "step": 23430 + }, + { + "epoch": 1.8842443729903537, + "grad_norm": 2.820199728012085, + "learning_rate": 5.8514931396287335e-06, + "loss": 0.9497, + "step": 23440 + }, + { + "epoch": 1.885048231511254, + "grad_norm": 2.5963895320892334, + "learning_rate": 5.811138014527846e-06, + "loss": 1.067, + "step": 23450 + }, + { + "epoch": 1.8858520900321545, + "grad_norm": 1.6238845586776733, + "learning_rate": 5.770782889426957e-06, + "loss": 1.0693, + "step": 23460 + }, + { + "epoch": 1.8866559485530545, + "grad_norm": 1.703322172164917, + "learning_rate": 5.73042776432607e-06, + "loss": 0.8865, + "step": 23470 + }, + { + "epoch": 1.887459807073955, + "grad_norm": 2.031500816345215, + "learning_rate": 5.6900726392251815e-06, + "loss": 1.0144, + "step": 23480 + }, + { + "epoch": 1.8882636655948553, + "grad_norm": 1.9934537410736084, + "learning_rate": 5.649717514124294e-06, + "loss": 0.9858, + "step": 23490 + }, + { + "epoch": 1.8890675241157555, + "grad_norm": 1.85928475856781, + "learning_rate": 5.609362389023407e-06, + "loss": 0.9204, + "step": 23500 + }, + { + "epoch": 1.889871382636656, + "grad_norm": 1.7416903972625732, + "learning_rate": 5.569007263922518e-06, + "loss": 0.9034, + "step": 23510 + }, + { + "epoch": 1.8906752411575563, + "grad_norm": 2.1307573318481445, + "learning_rate": 5.52865213882163e-06, + "loss": 0.8743, + "step": 23520 + }, + { + "epoch": 1.8914790996784565, + "grad_norm": 1.882643461227417, + "learning_rate": 5.4882970137207426e-06, + "loss": 1.0894, + "step": 23530 + }, + { + "epoch": 1.892282958199357, + "grad_norm": 1.6504333019256592, + "learning_rate": 5.447941888619855e-06, + "loss": 1.132, + "step": 23540 + }, + { + "epoch": 1.8930868167202572, + "grad_norm": 1.440651297569275, + "learning_rate": 5.407586763518967e-06, + "loss": 1.0758, + "step": 23550 + }, + { + "epoch": 1.8938906752411575, + "grad_norm": 2.3337113857269287, + "learning_rate": 5.367231638418079e-06, + "loss": 0.9673, + "step": 23560 + }, + { + "epoch": 1.894694533762058, + "grad_norm": 2.217491865158081, + "learning_rate": 5.326876513317191e-06, + "loss": 1.0566, + "step": 23570 + }, + { + "epoch": 1.8954983922829582, + "grad_norm": 2.5645487308502197, + "learning_rate": 5.286521388216304e-06, + "loss": 0.999, + "step": 23580 + }, + { + "epoch": 1.8963022508038585, + "grad_norm": 1.5525003671646118, + "learning_rate": 5.246166263115416e-06, + "loss": 1.1088, + "step": 23590 + }, + { + "epoch": 1.897106109324759, + "grad_norm": 2.1636111736297607, + "learning_rate": 5.205811138014528e-06, + "loss": 0.9676, + "step": 23600 + }, + { + "epoch": 1.897909967845659, + "grad_norm": 1.9286320209503174, + "learning_rate": 5.16545601291364e-06, + "loss": 1.0241, + "step": 23610 + }, + { + "epoch": 1.8987138263665595, + "grad_norm": 2.1557018756866455, + "learning_rate": 5.1251008878127525e-06, + "loss": 0.9821, + "step": 23620 + }, + { + "epoch": 1.8995176848874598, + "grad_norm": 1.5869606733322144, + "learning_rate": 
5.084745762711865e-06, + "loss": 0.9518, + "step": 23630 + }, + { + "epoch": 1.90032154340836, + "grad_norm": 2.810943365097046, + "learning_rate": 5.044390637610977e-06, + "loss": 1.0004, + "step": 23640 + }, + { + "epoch": 1.9011254019292605, + "grad_norm": 1.8571282625198364, + "learning_rate": 5.004035512510089e-06, + "loss": 1.0284, + "step": 23650 + }, + { + "epoch": 1.9019292604501608, + "grad_norm": 2.991077423095703, + "learning_rate": 4.963680387409201e-06, + "loss": 0.9655, + "step": 23660 + }, + { + "epoch": 1.902733118971061, + "grad_norm": 2.18481183052063, + "learning_rate": 4.9233252623083135e-06, + "loss": 0.917, + "step": 23670 + }, + { + "epoch": 1.9035369774919615, + "grad_norm": 1.5696027278900146, + "learning_rate": 4.882970137207426e-06, + "loss": 0.9889, + "step": 23680 + }, + { + "epoch": 1.9043408360128617, + "grad_norm": 1.5635316371917725, + "learning_rate": 4.842615012106538e-06, + "loss": 1.0119, + "step": 23690 + }, + { + "epoch": 1.905144694533762, + "grad_norm": 2.4708731174468994, + "learning_rate": 4.80225988700565e-06, + "loss": 0.9785, + "step": 23700 + }, + { + "epoch": 1.9059485530546625, + "grad_norm": 1.5477855205535889, + "learning_rate": 4.7619047619047615e-06, + "loss": 0.9873, + "step": 23710 + }, + { + "epoch": 1.9067524115755627, + "grad_norm": 1.5106024742126465, + "learning_rate": 4.721549636803875e-06, + "loss": 1.0617, + "step": 23720 + }, + { + "epoch": 1.907556270096463, + "grad_norm": 2.2407634258270264, + "learning_rate": 4.681194511702987e-06, + "loss": 1.0268, + "step": 23730 + }, + { + "epoch": 1.9083601286173635, + "grad_norm": 1.9189608097076416, + "learning_rate": 4.640839386602098e-06, + "loss": 1.0089, + "step": 23740 + }, + { + "epoch": 1.9091639871382635, + "grad_norm": 2.416452407836914, + "learning_rate": 4.600484261501211e-06, + "loss": 0.9246, + "step": 23750 + }, + { + "epoch": 1.909967845659164, + "grad_norm": 3.1344220638275146, + "learning_rate": 4.560129136400323e-06, + "loss": 1.1536, + "step": 23760 + }, + { + "epoch": 1.9107717041800643, + "grad_norm": 4.287921905517578, + "learning_rate": 4.519774011299436e-06, + "loss": 1.0714, + "step": 23770 + }, + { + "epoch": 1.9115755627009645, + "grad_norm": 2.1450698375701904, + "learning_rate": 4.479418886198548e-06, + "loss": 1.1106, + "step": 23780 + }, + { + "epoch": 1.912379421221865, + "grad_norm": 1.7077056169509888, + "learning_rate": 4.439063761097659e-06, + "loss": 1.0095, + "step": 23790 + }, + { + "epoch": 1.9131832797427653, + "grad_norm": 1.9565300941467285, + "learning_rate": 4.398708635996772e-06, + "loss": 0.8488, + "step": 23800 + }, + { + "epoch": 1.9139871382636655, + "grad_norm": 2.304868221282959, + "learning_rate": 4.358353510895884e-06, + "loss": 1.1288, + "step": 23810 + }, + { + "epoch": 1.914790996784566, + "grad_norm": 1.3317753076553345, + "learning_rate": 4.317998385794996e-06, + "loss": 0.9373, + "step": 23820 + }, + { + "epoch": 1.9155948553054662, + "grad_norm": 2.6593315601348877, + "learning_rate": 4.277643260694109e-06, + "loss": 1.0363, + "step": 23830 + }, + { + "epoch": 1.9163987138263665, + "grad_norm": 2.0847318172454834, + "learning_rate": 4.23728813559322e-06, + "loss": 1.0535, + "step": 23840 + }, + { + "epoch": 1.917202572347267, + "grad_norm": 1.3046883344650269, + "learning_rate": 4.1969330104923325e-06, + "loss": 0.8945, + "step": 23850 + }, + { + "epoch": 1.9180064308681672, + "grad_norm": 1.510403037071228, + "learning_rate": 4.156577885391445e-06, + "loss": 1.064, + "step": 23860 + }, + { + "epoch": 1.9188102893890675, + 
"grad_norm": 1.7287099361419678, + "learning_rate": 4.116222760290557e-06, + "loss": 1.0929, + "step": 23870 + }, + { + "epoch": 1.919614147909968, + "grad_norm": 2.49249267578125, + "learning_rate": 4.075867635189669e-06, + "loss": 0.9446, + "step": 23880 + }, + { + "epoch": 1.920418006430868, + "grad_norm": 1.937441349029541, + "learning_rate": 4.035512510088781e-06, + "loss": 1.106, + "step": 23890 + }, + { + "epoch": 1.9212218649517685, + "grad_norm": 1.676645278930664, + "learning_rate": 3.9951573849878936e-06, + "loss": 1.0443, + "step": 23900 + }, + { + "epoch": 1.9220257234726688, + "grad_norm": 1.4668079614639282, + "learning_rate": 3.954802259887006e-06, + "loss": 1.0366, + "step": 23910 + }, + { + "epoch": 1.922829581993569, + "grad_norm": 1.5843942165374756, + "learning_rate": 3.914447134786118e-06, + "loss": 0.9147, + "step": 23920 + }, + { + "epoch": 1.9236334405144695, + "grad_norm": 1.6224534511566162, + "learning_rate": 3.87409200968523e-06, + "loss": 0.9329, + "step": 23930 + }, + { + "epoch": 1.9244372990353698, + "grad_norm": 1.3980032205581665, + "learning_rate": 3.833736884584342e-06, + "loss": 0.9348, + "step": 23940 + }, + { + "epoch": 1.92524115755627, + "grad_norm": 1.8087605237960815, + "learning_rate": 3.7933817594834546e-06, + "loss": 1.0116, + "step": 23950 + }, + { + "epoch": 1.9260450160771705, + "grad_norm": 2.1880173683166504, + "learning_rate": 3.7530266343825673e-06, + "loss": 1.0284, + "step": 23960 + }, + { + "epoch": 1.9268488745980707, + "grad_norm": 2.184781789779663, + "learning_rate": 3.712671509281679e-06, + "loss": 0.889, + "step": 23970 + }, + { + "epoch": 1.927652733118971, + "grad_norm": 2.029803991317749, + "learning_rate": 3.6723163841807913e-06, + "loss": 1.0188, + "step": 23980 + }, + { + "epoch": 1.9284565916398715, + "grad_norm": 1.5179870128631592, + "learning_rate": 3.631961259079903e-06, + "loss": 0.8826, + "step": 23990 + }, + { + "epoch": 1.9292604501607717, + "grad_norm": 2.0556914806365967, + "learning_rate": 3.5916061339790157e-06, + "loss": 0.9491, + "step": 24000 + }, + { + "epoch": 1.9292604501607717, + "eval_yahma/alpaca-cleaned_loss": 1.203926920890808, + "eval_yahma/alpaca-cleaned_runtime": 115.7109, + "eval_yahma/alpaca-cleaned_samples_per_second": 17.284, + "eval_yahma/alpaca-cleaned_steps_per_second": 2.161, + "step": 24000 + }, + { + "epoch": 1.930064308681672, + "grad_norm": 1.6474958658218384, + "learning_rate": 3.551251008878128e-06, + "loss": 0.9901, + "step": 24010 + }, + { + "epoch": 1.9308681672025725, + "grad_norm": 1.4724137783050537, + "learning_rate": 3.5108958837772397e-06, + "loss": 1.0064, + "step": 24020 + }, + { + "epoch": 1.9316720257234725, + "grad_norm": 1.539610505104065, + "learning_rate": 3.4705407586763523e-06, + "loss": 0.9541, + "step": 24030 + }, + { + "epoch": 1.932475884244373, + "grad_norm": 1.5626659393310547, + "learning_rate": 3.430185633575464e-06, + "loss": 0.9884, + "step": 24040 + }, + { + "epoch": 1.9332797427652733, + "grad_norm": 2.267127275466919, + "learning_rate": 3.3898305084745763e-06, + "loss": 0.9561, + "step": 24050 + }, + { + "epoch": 1.9340836012861735, + "grad_norm": 1.9397815465927124, + "learning_rate": 3.349475383373689e-06, + "loss": 1.0845, + "step": 24060 + }, + { + "epoch": 1.934887459807074, + "grad_norm": 1.6136093139648438, + "learning_rate": 3.3091202582728007e-06, + "loss": 0.9394, + "step": 24070 + }, + { + "epoch": 1.9356913183279743, + "grad_norm": 1.8694920539855957, + "learning_rate": 3.268765133171913e-06, + "loss": 0.9832, + "step": 24080 + }, + { + 
"epoch": 1.9364951768488745, + "grad_norm": 1.9546782970428467, + "learning_rate": 3.2284100080710247e-06, + "loss": 1.0085, + "step": 24090 + }, + { + "epoch": 1.937299035369775, + "grad_norm": 1.237918496131897, + "learning_rate": 3.1880548829701374e-06, + "loss": 1.0026, + "step": 24100 + }, + { + "epoch": 1.9381028938906752, + "grad_norm": 1.7807506322860718, + "learning_rate": 3.1476997578692496e-06, + "loss": 0.9878, + "step": 24110 + }, + { + "epoch": 1.9389067524115755, + "grad_norm": 1.604943037033081, + "learning_rate": 3.107344632768362e-06, + "loss": 1.0486, + "step": 24120 + }, + { + "epoch": 1.939710610932476, + "grad_norm": 1.4765275716781616, + "learning_rate": 3.066989507667474e-06, + "loss": 0.9899, + "step": 24130 + }, + { + "epoch": 1.9405144694533762, + "grad_norm": 2.7262089252471924, + "learning_rate": 3.0266343825665862e-06, + "loss": 0.9322, + "step": 24140 + }, + { + "epoch": 1.9413183279742765, + "grad_norm": 2.0875954627990723, + "learning_rate": 2.9862792574656984e-06, + "loss": 1.0717, + "step": 24150 + }, + { + "epoch": 1.942122186495177, + "grad_norm": 1.2878005504608154, + "learning_rate": 2.9459241323648102e-06, + "loss": 0.9872, + "step": 24160 + }, + { + "epoch": 1.942926045016077, + "grad_norm": 2.3931329250335693, + "learning_rate": 2.905569007263923e-06, + "loss": 1.009, + "step": 24170 + }, + { + "epoch": 1.9437299035369775, + "grad_norm": 2.563086748123169, + "learning_rate": 2.865213882163035e-06, + "loss": 1.098, + "step": 24180 + }, + { + "epoch": 1.944533762057878, + "grad_norm": 1.4600781202316284, + "learning_rate": 2.824858757062147e-06, + "loss": 1.0331, + "step": 24190 + }, + { + "epoch": 1.945337620578778, + "grad_norm": 2.167815685272217, + "learning_rate": 2.784503631961259e-06, + "loss": 1.0031, + "step": 24200 + }, + { + "epoch": 1.9461414790996785, + "grad_norm": 1.4191280603408813, + "learning_rate": 2.7441485068603713e-06, + "loss": 0.962, + "step": 24210 + }, + { + "epoch": 1.9469453376205788, + "grad_norm": 2.354363441467285, + "learning_rate": 2.7037933817594835e-06, + "loss": 1.054, + "step": 24220 + }, + { + "epoch": 1.947749196141479, + "grad_norm": 1.9436718225479126, + "learning_rate": 2.6634382566585957e-06, + "loss": 1.0494, + "step": 24230 + }, + { + "epoch": 1.9485530546623795, + "grad_norm": 1.4945588111877441, + "learning_rate": 2.623083131557708e-06, + "loss": 0.9375, + "step": 24240 + }, + { + "epoch": 1.9493569131832797, + "grad_norm": 2.525233268737793, + "learning_rate": 2.58272800645682e-06, + "loss": 1.0606, + "step": 24250 + }, + { + "epoch": 1.95016077170418, + "grad_norm": 2.243128776550293, + "learning_rate": 2.5423728813559323e-06, + "loss": 1.0543, + "step": 24260 + }, + { + "epoch": 1.9509646302250805, + "grad_norm": 1.4152499437332153, + "learning_rate": 2.5020177562550446e-06, + "loss": 1.0039, + "step": 24270 + }, + { + "epoch": 1.9517684887459807, + "grad_norm": 1.6305155754089355, + "learning_rate": 2.4616626311541568e-06, + "loss": 0.97, + "step": 24280 + }, + { + "epoch": 1.952572347266881, + "grad_norm": 1.5360208749771118, + "learning_rate": 2.421307506053269e-06, + "loss": 0.9765, + "step": 24290 + }, + { + "epoch": 1.9533762057877815, + "grad_norm": 2.4453868865966797, + "learning_rate": 2.3809523809523808e-06, + "loss": 1.0101, + "step": 24300 + }, + { + "epoch": 1.9541800643086815, + "grad_norm": 1.925690770149231, + "learning_rate": 2.3405972558514934e-06, + "loss": 1.1117, + "step": 24310 + }, + { + "epoch": 1.954983922829582, + "grad_norm": 2.877366542816162, + "learning_rate": 
2.3002421307506056e-06, + "loss": 1.0363, + "step": 24320 + }, + { + "epoch": 1.9557877813504825, + "grad_norm": 1.9311004877090454, + "learning_rate": 2.259887005649718e-06, + "loss": 0.9606, + "step": 24330 + }, + { + "epoch": 1.9565916398713825, + "grad_norm": 2.90724515914917, + "learning_rate": 2.2195318805488296e-06, + "loss": 0.8997, + "step": 24340 + }, + { + "epoch": 1.957395498392283, + "grad_norm": 2.0841379165649414, + "learning_rate": 2.179176755447942e-06, + "loss": 1.0371, + "step": 24350 + }, + { + "epoch": 1.9581993569131833, + "grad_norm": 2.0677711963653564, + "learning_rate": 2.1388216303470545e-06, + "loss": 1.0479, + "step": 24360 + }, + { + "epoch": 1.9590032154340835, + "grad_norm": 1.8002878427505493, + "learning_rate": 2.0984665052461662e-06, + "loss": 0.9957, + "step": 24370 + }, + { + "epoch": 1.959807073954984, + "grad_norm": 1.5182082653045654, + "learning_rate": 2.0581113801452785e-06, + "loss": 0.9324, + "step": 24380 + }, + { + "epoch": 1.9606109324758842, + "grad_norm": 1.88400399684906, + "learning_rate": 2.0177562550443907e-06, + "loss": 1.0915, + "step": 24390 + }, + { + "epoch": 1.9614147909967845, + "grad_norm": 2.4193053245544434, + "learning_rate": 1.977401129943503e-06, + "loss": 1.0839, + "step": 24400 + }, + { + "epoch": 1.962218649517685, + "grad_norm": 1.627624273300171, + "learning_rate": 1.937046004842615e-06, + "loss": 1.0113, + "step": 24410 + }, + { + "epoch": 1.9630225080385852, + "grad_norm": 1.4581716060638428, + "learning_rate": 1.8966908797417273e-06, + "loss": 1.0207, + "step": 24420 + }, + { + "epoch": 1.9638263665594855, + "grad_norm": 2.1971490383148193, + "learning_rate": 1.8563357546408395e-06, + "loss": 0.9363, + "step": 24430 + }, + { + "epoch": 1.964630225080386, + "grad_norm": 1.8994516134262085, + "learning_rate": 1.8159806295399515e-06, + "loss": 0.9914, + "step": 24440 + }, + { + "epoch": 1.965434083601286, + "grad_norm": 1.8573304414749146, + "learning_rate": 1.775625504439064e-06, + "loss": 0.9137, + "step": 24450 + }, + { + "epoch": 1.9662379421221865, + "grad_norm": 1.2724213600158691, + "learning_rate": 1.7352703793381762e-06, + "loss": 0.9722, + "step": 24460 + }, + { + "epoch": 1.967041800643087, + "grad_norm": 1.9029781818389893, + "learning_rate": 1.6949152542372882e-06, + "loss": 0.9432, + "step": 24470 + }, + { + "epoch": 1.967845659163987, + "grad_norm": 1.6016614437103271, + "learning_rate": 1.6545601291364004e-06, + "loss": 1.0239, + "step": 24480 + }, + { + "epoch": 1.9686495176848875, + "grad_norm": 1.7589869499206543, + "learning_rate": 1.6142050040355124e-06, + "loss": 1.0125, + "step": 24490 + }, + { + "epoch": 1.9694533762057878, + "grad_norm": 1.6908783912658691, + "learning_rate": 1.5738498789346248e-06, + "loss": 0.8737, + "step": 24500 + }, + { + "epoch": 1.970257234726688, + "grad_norm": 1.8626710176467896, + "learning_rate": 1.533494753833737e-06, + "loss": 0.9974, + "step": 24510 + }, + { + "epoch": 1.9710610932475885, + "grad_norm": 1.257219672203064, + "learning_rate": 1.4931396287328492e-06, + "loss": 1.039, + "step": 24520 + }, + { + "epoch": 1.9718649517684887, + "grad_norm": 1.5754961967468262, + "learning_rate": 1.4527845036319614e-06, + "loss": 0.967, + "step": 24530 + }, + { + "epoch": 1.972668810289389, + "grad_norm": 2.0581512451171875, + "learning_rate": 1.4124293785310734e-06, + "loss": 1.0605, + "step": 24540 + }, + { + "epoch": 1.9734726688102895, + "grad_norm": 1.8139474391937256, + "learning_rate": 1.3720742534301856e-06, + "loss": 0.9943, + "step": 24550 + }, + { + "epoch": 
1.9742765273311897, + "grad_norm": 1.691847562789917, + "learning_rate": 1.3317191283292979e-06, + "loss": 0.9957, + "step": 24560 + }, + { + "epoch": 1.97508038585209, + "grad_norm": 3.390103578567505, + "learning_rate": 1.29136400322841e-06, + "loss": 0.9498, + "step": 24570 + }, + { + "epoch": 1.9758842443729905, + "grad_norm": 1.823568344116211, + "learning_rate": 1.2510088781275223e-06, + "loss": 1.0667, + "step": 24580 + }, + { + "epoch": 1.9766881028938905, + "grad_norm": 4.890821933746338, + "learning_rate": 1.2106537530266345e-06, + "loss": 0.9983, + "step": 24590 + }, + { + "epoch": 1.977491961414791, + "grad_norm": 2.4225986003875732, + "learning_rate": 1.1702986279257467e-06, + "loss": 1.0648, + "step": 24600 + }, + { + "epoch": 1.9782958199356915, + "grad_norm": 1.9177806377410889, + "learning_rate": 1.129943502824859e-06, + "loss": 1.1094, + "step": 24610 + }, + { + "epoch": 1.9790996784565915, + "grad_norm": 2.4320592880249023, + "learning_rate": 1.089588377723971e-06, + "loss": 1.06, + "step": 24620 + }, + { + "epoch": 1.979903536977492, + "grad_norm": 1.7749550342559814, + "learning_rate": 1.0492332526230831e-06, + "loss": 1.0337, + "step": 24630 + }, + { + "epoch": 1.9807073954983923, + "grad_norm": 1.7348616123199463, + "learning_rate": 1.0088781275221953e-06, + "loss": 0.8639, + "step": 24640 + }, + { + "epoch": 1.9815112540192925, + "grad_norm": 2.343806028366089, + "learning_rate": 9.685230024213075e-07, + "loss": 0.9784, + "step": 24650 + }, + { + "epoch": 1.982315112540193, + "grad_norm": 1.7652950286865234, + "learning_rate": 9.281678773204198e-07, + "loss": 0.977, + "step": 24660 + }, + { + "epoch": 1.9831189710610932, + "grad_norm": 1.9320908784866333, + "learning_rate": 8.87812752219532e-07, + "loss": 0.9649, + "step": 24670 + }, + { + "epoch": 1.9839228295819935, + "grad_norm": 2.151548147201538, + "learning_rate": 8.474576271186441e-07, + "loss": 0.9741, + "step": 24680 + }, + { + "epoch": 1.984726688102894, + "grad_norm": 1.9024114608764648, + "learning_rate": 8.071025020177562e-07, + "loss": 1.008, + "step": 24690 + }, + { + "epoch": 1.9855305466237942, + "grad_norm": 1.5782983303070068, + "learning_rate": 7.667473769168685e-07, + "loss": 0.9779, + "step": 24700 + }, + { + "epoch": 1.9863344051446945, + "grad_norm": 3.566256046295166, + "learning_rate": 7.263922518159807e-07, + "loss": 0.9926, + "step": 24710 + }, + { + "epoch": 1.987138263665595, + "grad_norm": 1.6482263803482056, + "learning_rate": 6.860371267150928e-07, + "loss": 0.9734, + "step": 24720 + }, + { + "epoch": 1.987942122186495, + "grad_norm": 2.0280840396881104, + "learning_rate": 6.45682001614205e-07, + "loss": 1.0214, + "step": 24730 + }, + { + "epoch": 1.9887459807073955, + "grad_norm": 1.5748929977416992, + "learning_rate": 6.053268765133172e-07, + "loss": 1.018, + "step": 24740 + }, + { + "epoch": 1.989549839228296, + "grad_norm": 1.5059864521026611, + "learning_rate": 5.649717514124295e-07, + "loss": 1.0083, + "step": 24750 + }, + { + "epoch": 1.990353697749196, + "grad_norm": 1.619263768196106, + "learning_rate": 5.246166263115416e-07, + "loss": 0.9868, + "step": 24760 + }, + { + "epoch": 1.9911575562700965, + "grad_norm": 1.563617467880249, + "learning_rate": 4.842615012106538e-07, + "loss": 0.952, + "step": 24770 + }, + { + "epoch": 1.9919614147909968, + "grad_norm": 1.829676866531372, + "learning_rate": 4.43906376109766e-07, + "loss": 1.0386, + "step": 24780 + }, + { + "epoch": 1.992765273311897, + "grad_norm": 3.145707130432129, + "learning_rate": 4.035512510088781e-07, + "loss": 
0.9683, + "step": 24790 + }, + { + "epoch": 1.9935691318327975, + "grad_norm": 1.8739086389541626, + "learning_rate": 3.6319612590799036e-07, + "loss": 0.9351, + "step": 24800 + }, + { + "epoch": 1.9943729903536977, + "grad_norm": 1.6382272243499756, + "learning_rate": 3.228410008071025e-07, + "loss": 0.9973, + "step": 24810 + }, + { + "epoch": 1.995176848874598, + "grad_norm": 1.8064838647842407, + "learning_rate": 2.8248587570621473e-07, + "loss": 1.0362, + "step": 24820 + }, + { + "epoch": 1.9959807073954985, + "grad_norm": 1.9633867740631104, + "learning_rate": 2.421307506053269e-07, + "loss": 0.9698, + "step": 24830 + }, + { + "epoch": 1.9967845659163987, + "grad_norm": 1.9776709079742432, + "learning_rate": 2.0177562550443905e-07, + "loss": 1.0884, + "step": 24840 + }, + { + "epoch": 1.997588424437299, + "grad_norm": 2.630523681640625, + "learning_rate": 1.6142050040355126e-07, + "loss": 1.152, + "step": 24850 + }, + { + "epoch": 1.9983922829581995, + "grad_norm": 1.5119197368621826, + "learning_rate": 1.2106537530266344e-07, + "loss": 1.0154, + "step": 24860 + }, + { + "epoch": 1.9991961414790995, + "grad_norm": 2.289841651916504, + "learning_rate": 8.071025020177563e-08, + "loss": 1.0503, + "step": 24870 + }, + { + "epoch": 2.0, + "grad_norm": 1.790726661682129, + "learning_rate": 4.0355125100887814e-08, + "loss": 0.9707, + "step": 24880 + } + ], + "logging_steps": 10, + "max_steps": 24880, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.698693203135037e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}
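
Note on reading this file: the `log_history` array above follows the schema that `transformers.Trainer` writes to `trainer_state.json`, i.e. per-logging-step records with `loss`, `learning_rate`, `grad_norm`, and `step`, interleaved with evaluation records (keyed here as `eval_yahma/alpaca-cleaned_*`) every `eval_steps` = 2000 steps, plus top-level fields such as `best_metric` and `best_model_checkpoint`. The sketch below is a minimal, illustrative way to load the file and pull out the training-loss curve and the eval points; the `STATE_PATH` value and the `split_history` helper are assumptions for illustration only, not anything produced by the training run itself.

    """Minimal sketch: inspect a trainer_state.json like the one in this patch.

    Assumption: STATE_PATH is a hypothetical local path; point it at the real
    checkpoint directory before running.
    """
    import json
    from pathlib import Path

    # Hypothetical path to the checkpoint's state file (adjust as needed).
    STATE_PATH = Path("LLama-3-6.6B-R-Pruned-checkpoint-24880/trainer_state.json")


    def load_state(path: Path) -> dict:
        """Read the JSON state written by transformers.Trainer."""
        with path.open() as f:
            return json.load(f)


    def split_history(state: dict):
        """Separate training records from eval records in `log_history`.

        Training records carry a `loss` key; eval records carry keys that
        start with `eval_` (here `eval_yahma/alpaca-cleaned_loss` etc.).
        """
        train, evals = [], []
        for rec in state["log_history"]:
            if "loss" in rec:
                train.append(rec)
            elif any(k.startswith("eval_") for k in rec):
                evals.append(rec)
        return train, evals


    if __name__ == "__main__":
        state = load_state(STATE_PATH)
        train, evals = split_history(state)

        print("best metric:", state["best_metric"])
        print("best checkpoint:", state["best_model_checkpoint"])
        print("final train loss:", train[-1]["loss"], "at step", train[-1]["step"])

        # Eval loss was logged every `eval_steps` (2000) optimizer steps.
        for rec in evals:
            print("step", rec["step"],
                  "eval loss", rec["eval_yahma/alpaca-cleaned_loss"])

Running this against the state shown here would report the best eval loss of 1.203926920890808 at `finetune-checkpoints/checkpoint-24000` and the final training loss of 0.9707 at step 24880, which is simply a programmatic restatement of the values already recorded above.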