diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6021 @@ +{ + "best_metric": 2.323676109313965, + "best_model_checkpoint": "./output/training_results/C019_random_sample_llama3-8b-base_pretrain_20240504_182259/checkpoint-1000", + "epoch": 4.0, + "eval_steps": 200, + "global_step": 4160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009615384615384616, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 2.5996, + "step": 1 + }, + { + "epoch": 0.004807692307692308, + "grad_norm": 3.093270098005958, + "learning_rate": 2.25e-06, + "loss": 2.5704, + "step": 5 + }, + { + "epoch": 0.009615384615384616, + "grad_norm": 2.3983439225151337, + "learning_rate": 6e-06, + "loss": 2.598, + "step": 10 + }, + { + "epoch": 0.014423076923076924, + "grad_norm": 2.365104415775466, + "learning_rate": 9.75e-06, + "loss": 2.5213, + "step": 15 + }, + { + "epoch": 0.019230769230769232, + "grad_norm": 2.377061508613044, + "learning_rate": 1.3500000000000001e-05, + "loss": 2.5413, + "step": 20 + }, + { + "epoch": 0.02403846153846154, + "grad_norm": 2.7238687593360633, + "learning_rate": 1.488126415936146e-05, + "loss": 2.4619, + "step": 25 + }, + { + "epoch": 0.028846153846153848, + "grad_norm": 2.1821698028288496, + "learning_rate": 1.468527480858081e-05, + "loss": 2.4796, + "step": 30 + }, + { + "epoch": 0.03365384615384615, + "grad_norm": 2.209060379147765, + "learning_rate": 1.4491642768162611e-05, + "loss": 2.4632, + "step": 35 + }, + { + "epoch": 0.038461538461538464, + "grad_norm": 2.1033623949557465, + "learning_rate": 1.4376584414398205e-05, + "loss": 2.4363, + "step": 40 + }, + { + "epoch": 0.04326923076923077, + "grad_norm": 2.232481096526571, + "learning_rate": 1.4186671032101571e-05, + "loss": 2.4888, + "step": 45 + }, + { + "epoch": 0.04807692307692308, + "grad_norm": 2.1509113321913413, + "learning_rate": 1.3999049045545275e-05, + "loss": 2.4947, + "step": 50 + }, + { + "epoch": 0.052884615384615384, + "grad_norm": 2.35512436324606, + "learning_rate": 1.3813693542528815e-05, + "loss": 2.4788, + "step": 55 + }, + { + "epoch": 0.057692307692307696, + "grad_norm": 2.0401062809167683, + "learning_rate": 1.3630579851896082e-05, + "loss": 2.4441, + "step": 60 + }, + { + "epoch": 0.0625, + "grad_norm": 2.0096811058967425, + "learning_rate": 1.3449683541492259e-05, + "loss": 2.4552, + "step": 65 + }, + { + "epoch": 0.0673076923076923, + "grad_norm": 2.258689794653528, + "learning_rate": 1.3270980416135356e-05, + "loss": 2.48, + "step": 70 + }, + { + "epoch": 0.07211538461538461, + "grad_norm": 2.020330092733293, + "learning_rate": 1.3094446515602676e-05, + "loss": 2.4756, + "step": 75 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 2.062564685463297, + "learning_rate": 1.2920058112631874e-05, + "loss": 2.4676, + "step": 80 + }, + { + "epoch": 0.08173076923076923, + "grad_norm": 2.0801794381372196, + "learning_rate": 1.2747791710936666e-05, + "loss": 2.5349, + "step": 85 + }, + { + "epoch": 0.08653846153846154, + "grad_norm": 3.522036550275993, + "learning_rate": 1.2577624043237019e-05, + "loss": 2.4357, + "step": 90 + }, + { + "epoch": 0.09134615384615384, + "grad_norm": 2.096385210617988, + "learning_rate": 1.240953206930375e-05, + "loss": 2.4441, + "step": 95 + }, + { + "epoch": 0.09615384615384616, + "grad_norm": 2.0071639436136737, + "learning_rate": 1.2243492974017472e-05, + "loss": 2.4663, + "step": 100 + }, + { + "epoch": 0.10096153846153846, + "grad_norm": 2.1419668864903794, + "learning_rate": 1.2079484165441774e-05, + "loss": 2.5266, + "step": 105 + }, + { + "epoch": 0.10576923076923077, + "grad_norm": 1.853996222690424, + "learning_rate": 1.1917483272910544e-05, + "loss": 2.4803, + "step": 110 + }, + { + "epoch": 0.11057692307692307, + "grad_norm": 1.8741352536661482, + "learning_rate": 1.1757468145129383e-05, + "loss": 2.4532, + "step": 115 + }, + { + "epoch": 0.11538461538461539, + "grad_norm": 2.5986583647330344, + "learning_rate": 1.1599416848290976e-05, + "loss": 2.4519, + "step": 120 + }, + { + "epoch": 0.1201923076923077, + "grad_norm": 1.960401134525488, + "learning_rate": 1.1443307664204364e-05, + "loss": 2.4225, + "step": 125 + }, + { + "epoch": 0.125, + "grad_norm": 2.000854689144336, + "learning_rate": 1.1289119088438038e-05, + "loss": 2.4376, + "step": 130 + }, + { + "epoch": 0.12980769230769232, + "grad_norm": 2.0163596039348373, + "learning_rate": 1.1136829828476745e-05, + "loss": 2.4494, + "step": 135 + }, + { + "epoch": 0.1346153846153846, + "grad_norm": 2.000675810989018, + "learning_rate": 1.0986418801891934e-05, + "loss": 2.462, + "step": 140 + }, + { + "epoch": 0.13942307692307693, + "grad_norm": 2.0014951060919746, + "learning_rate": 1.0837865134525763e-05, + "loss": 2.4331, + "step": 145 + }, + { + "epoch": 0.14423076923076922, + "grad_norm": 1.9032594688995426, + "learning_rate": 1.069114815868857e-05, + "loss": 2.443, + "step": 150 + }, + { + "epoch": 0.14903846153846154, + "grad_norm": 2.344078595183246, + "learning_rate": 1.0546247411369744e-05, + "loss": 2.3993, + "step": 155 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 2.261655660998884, + "learning_rate": 1.0403142632461892e-05, + "loss": 2.427, + "step": 160 + }, + { + "epoch": 0.15865384615384615, + "grad_norm": 1.9697690775283647, + "learning_rate": 1.0261813762998242e-05, + "loss": 2.3969, + "step": 165 + }, + { + "epoch": 0.16346153846153846, + "grad_norm": 1.9785704107813238, + "learning_rate": 1.0122240943403124e-05, + "loss": 2.4541, + "step": 170 + }, + { + "epoch": 0.16826923076923078, + "grad_norm": 1.8261246917010026, + "learning_rate": 9.984404511755643e-06, + "loss": 2.4736, + "step": 175 + }, + { + "epoch": 0.17307692307692307, + "grad_norm": 1.99665744273795, + "learning_rate": 9.848285002066194e-06, + "loss": 2.353, + "step": 180 + }, + { + "epoch": 0.1778846153846154, + "grad_norm": 1.8159030807907148, + "learning_rate": 9.71386314256594e-06, + "loss": 2.4447, + "step": 185 + }, + { + "epoch": 0.18269230769230768, + "grad_norm": 1.9924841032422067, + "learning_rate": 9.581119854009096e-06, + "loss": 2.3577, + "step": 190 + }, + { + "epoch": 0.1875, + "grad_norm": 1.8364970229914088, + "learning_rate": 9.45003624798795e-06, + "loss": 2.4096, + "step": 195 + }, + { + "epoch": 0.19230769230769232, + "grad_norm": 1.9566999587123155, + "learning_rate": 9.320593625260526e-06, + "loss": 2.3809, + "step": 200 + }, + { + "epoch": 0.19230769230769232, + "eval_loss": 2.4206786155700684, + "eval_runtime": 85.4007, + "eval_samples_per_second": 86.592, + "eval_steps_per_second": 0.679, + "step": 200 + }, + { + "epoch": 0.1971153846153846, + "grad_norm": 1.958978215443068, + "learning_rate": 9.192773474090845e-06, + "loss": 2.3997, + "step": 205 + }, + { + "epoch": 0.20192307692307693, + "grad_norm": 1.999117184727505, + "learning_rate": 9.066557468601675e-06, + "loss": 2.3995, + "step": 210 + }, + { + "epoch": 0.20673076923076922, + "grad_norm": 2.0120971325180634, + "learning_rate": 8.966727451760845e-06, + "loss": 2.3394, + "step": 215 + }, + { + "epoch": 0.21153846153846154, + "grad_norm": 1.8965405647532796, + "learning_rate": 8.843353314292577e-06, + "loss": 2.4373, + "step": 220 + }, + { + "epoch": 0.21634615384615385, + "grad_norm": 1.793020827788288, + "learning_rate": 8.721532984948616e-06, + "loss": 2.4004, + "step": 225 + }, + { + "epoch": 0.22115384615384615, + "grad_norm": 1.8928727830060093, + "learning_rate": 8.601248829310043e-06, + "loss": 2.4425, + "step": 230 + }, + { + "epoch": 0.22596153846153846, + "grad_norm": 1.8359177916301768, + "learning_rate": 8.482483391081384e-06, + "loss": 2.4048, + "step": 235 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 1.771634179795241, + "learning_rate": 8.365219390514311e-06, + "loss": 2.3701, + "step": 240 + }, + { + "epoch": 0.23557692307692307, + "grad_norm": 2.2382487479171966, + "learning_rate": 8.249439722843319e-06, + "loss": 2.3873, + "step": 245 + }, + { + "epoch": 0.2403846153846154, + "grad_norm": 1.825838956406169, + "learning_rate": 8.135127456733292e-06, + "loss": 2.4484, + "step": 250 + }, + { + "epoch": 0.24519230769230768, + "grad_norm": 1.779047182560338, + "learning_rate": 8.022265832738892e-06, + "loss": 2.4533, + "step": 255 + }, + { + "epoch": 0.25, + "grad_norm": 1.8121397814224398, + "learning_rate": 7.9108382617757e-06, + "loss": 2.4032, + "step": 260 + }, + { + "epoch": 0.2548076923076923, + "grad_norm": 1.7304835073136142, + "learning_rate": 7.800828323603008e-06, + "loss": 2.3965, + "step": 265 + }, + { + "epoch": 0.25961538461538464, + "grad_norm": 1.9948337899573474, + "learning_rate": 7.692219765318242e-06, + "loss": 2.4174, + "step": 270 + }, + { + "epoch": 0.2644230769230769, + "grad_norm": 2.498650132767716, + "learning_rate": 7.584996499862861e-06, + "loss": 2.39, + "step": 275 + }, + { + "epoch": 0.2692307692307692, + "grad_norm": 1.9036689673638798, + "learning_rate": 7.479142604539756e-06, + "loss": 2.3903, + "step": 280 + }, + { + "epoch": 0.27403846153846156, + "grad_norm": 1.9727971553625547, + "learning_rate": 7.374642319541976e-06, + "loss": 2.352, + "step": 285 + }, + { + "epoch": 0.27884615384615385, + "grad_norm": 1.7682776753325222, + "learning_rate": 7.271480046492797e-06, + "loss": 2.3595, + "step": 290 + }, + { + "epoch": 0.28365384615384615, + "grad_norm": 2.466547945028361, + "learning_rate": 7.1696403469970005e-06, + "loss": 2.4387, + "step": 295 + }, + { + "epoch": 0.28846153846153844, + "grad_norm": 1.7588363798238758, + "learning_rate": 7.0691079412032825e-06, + "loss": 2.4327, + "step": 300 + }, + { + "epoch": 0.2932692307692308, + "grad_norm": 1.8462300982749367, + "learning_rate": 6.969867706377832e-06, + "loss": 2.4041, + "step": 305 + }, + { + "epoch": 0.2980769230769231, + "grad_norm": 2.0032200252529098, + "learning_rate": 6.87190467548884e-06, + "loss": 2.4022, + "step": 310 + }, + { + "epoch": 0.30288461538461536, + "grad_norm": 2.0051781024154383, + "learning_rate": 6.775204035801989e-06, + "loss": 2.3978, + "step": 315 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 1.7525097649477925, + "learning_rate": 6.679751127486818e-06, + "loss": 2.3874, + "step": 320 + }, + { + "epoch": 0.3125, + "grad_norm": 1.8163864310732767, + "learning_rate": 6.585531442233879e-06, + "loss": 2.3982, + "step": 325 + }, + { + "epoch": 0.3173076923076923, + "grad_norm": 1.8911617099161901, + "learning_rate": 6.492530621882634e-06, + "loss": 2.3816, + "step": 330 + }, + { + "epoch": 0.32211538461538464, + "grad_norm": 1.8956241442821822, + "learning_rate": 6.400734457060024e-06, + "loss": 2.3557, + "step": 335 + }, + { + "epoch": 0.3269230769230769, + "grad_norm": 1.8585394840952694, + "learning_rate": 6.310128885829607e-06, + "loss": 2.4309, + "step": 340 + }, + { + "epoch": 0.3317307692307692, + "grad_norm": 1.8977154535780991, + "learning_rate": 6.220699992351257e-06, + "loss": 2.4039, + "step": 345 + }, + { + "epoch": 0.33653846153846156, + "grad_norm": 1.803139553519876, + "learning_rate": 6.132434005551287e-06, + "loss": 2.4042, + "step": 350 + }, + { + "epoch": 0.34134615384615385, + "grad_norm": 1.757715074609487, + "learning_rate": 6.045317297802985e-06, + "loss": 2.3759, + "step": 355 + }, + { + "epoch": 0.34615384615384615, + "grad_norm": 1.8026638689606764, + "learning_rate": 5.95933638361746e-06, + "loss": 2.4149, + "step": 360 + }, + { + "epoch": 0.35096153846153844, + "grad_norm": 1.7463547692619898, + "learning_rate": 5.874477918344749e-06, + "loss": 2.3951, + "step": 365 + }, + { + "epoch": 0.3557692307692308, + "grad_norm": 1.869103918883084, + "learning_rate": 5.7907286968851065e-06, + "loss": 2.3785, + "step": 370 + }, + { + "epoch": 0.3605769230769231, + "grad_norm": 1.8694975836317, + "learning_rate": 5.708075652410414e-06, + "loss": 2.4295, + "step": 375 + }, + { + "epoch": 0.36538461538461536, + "grad_norm": 1.9186264569383331, + "learning_rate": 5.626505855095647e-06, + "loss": 2.4053, + "step": 380 + }, + { + "epoch": 0.3701923076923077, + "grad_norm": 1.8627599571104616, + "learning_rate": 5.546006510860341e-06, + "loss": 2.3935, + "step": 385 + }, + { + "epoch": 0.375, + "grad_norm": 1.7601694633490985, + "learning_rate": 5.466564960119934e-06, + "loss": 2.3533, + "step": 390 + }, + { + "epoch": 0.3798076923076923, + "grad_norm": 1.6940078427675656, + "learning_rate": 5.388168676547046e-06, + "loss": 2.3602, + "step": 395 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 2.3248960946347155, + "learning_rate": 5.31080526584248e-06, + "loss": 2.3057, + "step": 400 + }, + { + "epoch": 0.38461538461538464, + "eval_loss": 2.3750226497650146, + "eval_runtime": 85.4352, + "eval_samples_per_second": 86.557, + "eval_steps_per_second": 0.679, + "step": 400 + }, + { + "epoch": 0.3894230769230769, + "grad_norm": 1.7637614396329135, + "learning_rate": 5.234462464515984e-06, + "loss": 2.3852, + "step": 405 + }, + { + "epoch": 0.3942307692307692, + "grad_norm": 1.8306112577514888, + "learning_rate": 5.159128138676664e-06, + "loss": 2.3683, + "step": 410 + }, + { + "epoch": 0.39903846153846156, + "grad_norm": 1.88396403239199, + "learning_rate": 5.0847902828330104e-06, + "loss": 2.3303, + "step": 415 + }, + { + "epoch": 0.40384615384615385, + "grad_norm": 1.9387815046466974, + "learning_rate": 5.011437018702448e-06, + "loss": 2.3596, + "step": 420 + }, + { + "epoch": 0.40865384615384615, + "grad_norm": 1.797535293599832, + "learning_rate": 4.939056594030363e-06, + "loss": 2.3807, + "step": 425 + }, + { + "epoch": 0.41346153846153844, + "grad_norm": 1.7674969210476854, + "learning_rate": 4.867637381418548e-06, + "loss": 2.4203, + "step": 430 + }, + { + "epoch": 0.4182692307692308, + "grad_norm": 1.7330827184520308, + "learning_rate": 4.797167877162977e-06, + "loss": 2.4145, + "step": 435 + }, + { + "epoch": 0.4230769230769231, + "grad_norm": 1.7505951142772842, + "learning_rate": 4.72763670010088e-06, + "loss": 2.3664, + "step": 440 + }, + { + "epoch": 0.42788461538461536, + "grad_norm": 1.7277179266718043, + "learning_rate": 4.6590325904670434e-06, + "loss": 2.3618, + "step": 445 + }, + { + "epoch": 0.4326923076923077, + "grad_norm": 1.824045183697345, + "learning_rate": 4.5913444087592555e-06, + "loss": 2.3677, + "step": 450 + }, + { + "epoch": 0.4375, + "grad_norm": 2.541872533331478, + "learning_rate": 4.524561134612869e-06, + "loss": 2.3953, + "step": 455 + }, + { + "epoch": 0.4423076923076923, + "grad_norm": 1.8053852132874109, + "learning_rate": 4.4586718656843925e-06, + "loss": 2.4119, + "step": 460 + }, + { + "epoch": 0.44711538461538464, + "grad_norm": 1.6878117932040484, + "learning_rate": 4.39366581654407e-06, + "loss": 2.3864, + "step": 465 + }, + { + "epoch": 0.4519230769230769, + "grad_norm": 1.8260105801902033, + "learning_rate": 4.329532317577373e-06, + "loss": 2.387, + "step": 470 + }, + { + "epoch": 0.4567307692307692, + "grad_norm": 1.8118051823045696, + "learning_rate": 4.26626081389535e-06, + "loss": 2.4271, + "step": 475 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 2.3122157740257157, + "learning_rate": 4.2038408642537815e-06, + "loss": 2.3746, + "step": 480 + }, + { + "epoch": 0.46634615384615385, + "grad_norm": 2.0895941468983126, + "learning_rate": 4.142262139981073e-06, + "loss": 2.3491, + "step": 485 + }, + { + "epoch": 0.47115384615384615, + "grad_norm": 1.8059979746514452, + "learning_rate": 4.0815144239148194e-06, + "loss": 2.3499, + "step": 490 + }, + { + "epoch": 0.47596153846153844, + "grad_norm": 1.886181072515567, + "learning_rate": 4.0215876093470125e-06, + "loss": 2.3631, + "step": 495 + }, + { + "epoch": 0.4807692307692308, + "grad_norm": 1.8494449235344264, + "learning_rate": 3.962471698977794e-06, + "loss": 2.3689, + "step": 500 + }, + { + "epoch": 0.4855769230769231, + "grad_norm": 1.7530451717430282, + "learning_rate": 3.904156803877704e-06, + "loss": 2.3126, + "step": 505 + }, + { + "epoch": 0.49038461538461536, + "grad_norm": 1.7478042759208887, + "learning_rate": 3.846633142458427e-06, + "loss": 2.3706, + "step": 510 + }, + { + "epoch": 0.4951923076923077, + "grad_norm": 1.7582686186315075, + "learning_rate": 3.7898910394518715e-06, + "loss": 2.3913, + "step": 515 + }, + { + "epoch": 0.5, + "grad_norm": 1.719027129765464, + "learning_rate": 3.7339209248976165e-06, + "loss": 2.3352, + "step": 520 + }, + { + "epoch": 0.5048076923076923, + "grad_norm": 1.7460100588180303, + "learning_rate": 3.678713333138621e-06, + "loss": 2.3206, + "step": 525 + }, + { + "epoch": 0.5096153846153846, + "grad_norm": 1.82603479631214, + "learning_rate": 3.6242589018251656e-06, + "loss": 2.328, + "step": 530 + }, + { + "epoch": 0.5144230769230769, + "grad_norm": 2.909265992463998, + "learning_rate": 3.570548370926946e-06, + "loss": 2.3763, + "step": 535 + }, + { + "epoch": 0.5192307692307693, + "grad_norm": 1.8988240634311662, + "learning_rate": 3.5175725817532863e-06, + "loss": 2.3422, + "step": 540 + }, + { + "epoch": 0.5240384615384616, + "grad_norm": 1.8816807225199998, + "learning_rate": 3.4653224759813952e-06, + "loss": 2.31, + "step": 545 + }, + { + "epoch": 0.5288461538461539, + "grad_norm": 1.7734887040078462, + "learning_rate": 3.413789094692631e-06, + "loss": 2.3708, + "step": 550 + }, + { + "epoch": 0.5336538461538461, + "grad_norm": 14.829267205139884, + "learning_rate": 3.362963577416697e-06, + "loss": 2.353, + "step": 555 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 1.767298642358234, + "learning_rate": 3.312837161183736e-06, + "loss": 2.3772, + "step": 560 + }, + { + "epoch": 0.5432692307692307, + "grad_norm": 2.0381765168658714, + "learning_rate": 3.2634011795842525e-06, + "loss": 2.3277, + "step": 565 + }, + { + "epoch": 0.5480769230769231, + "grad_norm": 1.687367468245635, + "learning_rate": 3.2146470618368156e-06, + "loss": 2.3702, + "step": 570 + }, + { + "epoch": 0.5528846153846154, + "grad_norm": 1.7200567763349082, + "learning_rate": 3.1665663318634906e-06, + "loss": 2.2972, + "step": 575 + }, + { + "epoch": 0.5576923076923077, + "grad_norm": 1.7213863859635832, + "learning_rate": 3.119150607372941e-06, + "loss": 2.3279, + "step": 580 + }, + { + "epoch": 0.5625, + "grad_norm": 1.7895318194941465, + "learning_rate": 3.0723915989511547e-06, + "loss": 2.3264, + "step": 585 + }, + { + "epoch": 0.5673076923076923, + "grad_norm": 1.6926941348086333, + "learning_rate": 3.035451716037107e-06, + "loss": 2.4078, + "step": 590 + }, + { + "epoch": 0.5721153846153846, + "grad_norm": 1.835513287932842, + "learning_rate": 2.9898542002308595e-06, + "loss": 2.3339, + "step": 595 + }, + { + "epoch": 0.5769230769230769, + "grad_norm": 1.7870911584404572, + "learning_rate": 2.944890676594853e-06, + "loss": 2.35, + "step": 600 + }, + { + "epoch": 0.5769230769230769, + "eval_loss": 2.3476545810699463, + "eval_runtime": 85.4325, + "eval_samples_per_second": 86.56, + "eval_steps_per_second": 0.679, + "step": 600 + }, + { + "epoch": 0.5817307692307693, + "grad_norm": 1.7960612955748432, + "learning_rate": 2.900553200489045e-06, + "loss": 2.379, + "step": 605 + }, + { + "epoch": 0.5865384615384616, + "grad_norm": 2.662329393803985, + "learning_rate": 2.8568339158905825e-06, + "loss": 2.3121, + "step": 610 + }, + { + "epoch": 0.5913461538461539, + "grad_norm": 1.751319402693243, + "learning_rate": 2.8137250545276917e-06, + "loss": 2.3453, + "step": 615 + }, + { + "epoch": 0.5961538461538461, + "grad_norm": 2.2858590472007325, + "learning_rate": 2.77121893502082e-06, + "loss": 2.3469, + "step": 620 + }, + { + "epoch": 0.6009615384615384, + "grad_norm": 1.8051336435298304, + "learning_rate": 2.729307962031005e-06, + "loss": 2.3764, + "step": 625 + }, + { + "epoch": 0.6057692307692307, + "grad_norm": 1.7204864022940245, + "learning_rate": 2.6879846254154052e-06, + "loss": 2.3047, + "step": 630 + }, + { + "epoch": 0.6105769230769231, + "grad_norm": 1.6529012434786867, + "learning_rate": 2.647241499389928e-06, + "loss": 2.3594, + "step": 635 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 1.732240061787434, + "learning_rate": 2.607071241698958e-06, + "loss": 2.3265, + "step": 640 + }, + { + "epoch": 0.6201923076923077, + "grad_norm": 1.7491108722836675, + "learning_rate": 2.567466592792067e-06, + "loss": 2.3546, + "step": 645 + }, + { + "epoch": 0.625, + "grad_norm": 1.8515026129037757, + "learning_rate": 2.5284203750077018e-06, + "loss": 2.3665, + "step": 650 + }, + { + "epoch": 0.6298076923076923, + "grad_norm": 1.9236177470936695, + "learning_rate": 2.4899254917637856e-06, + "loss": 2.3532, + "step": 655 + }, + { + "epoch": 0.6346153846153846, + "grad_norm": 1.7377562070977945, + "learning_rate": 2.4519749267551924e-06, + "loss": 2.3056, + "step": 660 + }, + { + "epoch": 0.6394230769230769, + "grad_norm": 1.8604329624496534, + "learning_rate": 2.414561743158029e-06, + "loss": 2.4127, + "step": 665 + }, + { + "epoch": 0.6442307692307693, + "grad_norm": 1.7518401108851098, + "learning_rate": 2.3776790828406987e-06, + "loss": 2.3923, + "step": 670 + }, + { + "epoch": 0.6490384615384616, + "grad_norm": 1.931606951701668, + "learning_rate": 2.341320165581676e-06, + "loss": 2.3243, + "step": 675 + }, + { + "epoch": 0.6538461538461539, + "grad_norm": 1.812856790111344, + "learning_rate": 2.3054782882939655e-06, + "loss": 2.3149, + "step": 680 + }, + { + "epoch": 0.6586538461538461, + "grad_norm": 1.7938076588828502, + "learning_rate": 2.2701468242561784e-06, + "loss": 2.3098, + "step": 685 + }, + { + "epoch": 0.6634615384615384, + "grad_norm": 1.6875935166811342, + "learning_rate": 2.2353192223501965e-06, + "loss": 2.3627, + "step": 690 + }, + { + "epoch": 0.6682692307692307, + "grad_norm": 1.7370129856938976, + "learning_rate": 2.2009890063053612e-06, + "loss": 2.3905, + "step": 695 + }, + { + "epoch": 0.6730769230769231, + "grad_norm": 1.786880089249507, + "learning_rate": 2.167149773949154e-06, + "loss": 2.3904, + "step": 700 + }, + { + "epoch": 0.6778846153846154, + "grad_norm": 1.766140826477351, + "learning_rate": 2.133795196464315e-06, + "loss": 2.3069, + "step": 705 + }, + { + "epoch": 0.6826923076923077, + "grad_norm": 1.73381149404956, + "learning_rate": 2.100919017652352e-06, + "loss": 2.3367, + "step": 710 + }, + { + "epoch": 0.6875, + "grad_norm": 1.6802393388684402, + "learning_rate": 2.0685150532033913e-06, + "loss": 2.3349, + "step": 715 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 1.719597560705125, + "learning_rate": 2.036577189972352e-06, + "loss": 2.347, + "step": 720 + }, + { + "epoch": 0.6971153846153846, + "grad_norm": 1.7179306585516882, + "learning_rate": 2.005099385261351e-06, + "loss": 2.2808, + "step": 725 + }, + { + "epoch": 0.7019230769230769, + "grad_norm": 1.693677430438375, + "learning_rate": 1.9740756661083308e-06, + "loss": 2.3601, + "step": 730 + }, + { + "epoch": 0.7067307692307693, + "grad_norm": 1.7284703551106673, + "learning_rate": 1.9435001285818512e-06, + "loss": 2.3698, + "step": 735 + }, + { + "epoch": 0.7115384615384616, + "grad_norm": 1.7201691395467102, + "learning_rate": 1.913366937082008e-06, + "loss": 2.3383, + "step": 740 + }, + { + "epoch": 0.7163461538461539, + "grad_norm": 1.8376437399845924, + "learning_rate": 1.883670323647419e-06, + "loss": 2.3575, + "step": 745 + }, + { + "epoch": 0.7211538461538461, + "grad_norm": 1.7519138621360655, + "learning_rate": 1.8544045872682494e-06, + "loss": 2.4116, + "step": 750 + }, + { + "epoch": 0.7259615384615384, + "grad_norm": 1.6767007868001402, + "learning_rate": 1.8255640932052287e-06, + "loss": 2.3197, + "step": 755 + }, + { + "epoch": 0.7307692307692307, + "grad_norm": 1.8411908944181066, + "learning_rate": 1.7971432723146058e-06, + "loss": 2.3908, + "step": 760 + }, + { + "epoch": 0.7355769230769231, + "grad_norm": 1.7508438925830225, + "learning_rate": 1.769136620379013e-06, + "loss": 2.3188, + "step": 765 + }, + { + "epoch": 0.7403846153846154, + "grad_norm": 1.7436172155395409, + "learning_rate": 1.7415386974441854e-06, + "loss": 2.321, + "step": 770 + }, + { + "epoch": 0.7451923076923077, + "grad_norm": 1.8045595856913115, + "learning_rate": 1.7143441271614997e-06, + "loss": 2.3454, + "step": 775 + }, + { + "epoch": 0.75, + "grad_norm": 1.763756591492577, + "learning_rate": 1.687547596136285e-06, + "loss": 2.3234, + "step": 780 + }, + { + "epoch": 0.7548076923076923, + "grad_norm": 1.7186205772688097, + "learning_rate": 1.661143853281865e-06, + "loss": 2.2885, + "step": 785 + }, + { + "epoch": 0.7596153846153846, + "grad_norm": 1.7694258113773655, + "learning_rate": 1.6351277091792915e-06, + "loss": 2.3391, + "step": 790 + }, + { + "epoch": 0.7644230769230769, + "grad_norm": 1.725458209313717, + "learning_rate": 1.6094940354427228e-06, + "loss": 2.3098, + "step": 795 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 34.858863328576724, + "learning_rate": 1.5842377640904125e-06, + "loss": 2.3291, + "step": 800 + }, + { + "epoch": 0.7692307692307693, + "eval_loss": 2.3324432373046875, + "eval_runtime": 85.489, + "eval_samples_per_second": 86.502, + "eval_steps_per_second": 0.678, + "step": 800 + }, + { + "epoch": 0.7740384615384616, + "grad_norm": 1.7300557356264337, + "learning_rate": 1.5593538869212577e-06, + "loss": 2.3633, + "step": 805 + }, + { + "epoch": 0.7788461538461539, + "grad_norm": 1.6677853311569053, + "learning_rate": 1.5348374548968758e-06, + "loss": 2.31, + "step": 810 + }, + { + "epoch": 0.7836538461538461, + "grad_norm": 1.6959216377511328, + "learning_rate": 1.5106835775291604e-06, + "loss": 2.3239, + "step": 815 + }, + { + "epoch": 0.7884615384615384, + "grad_norm": 1.703559225147181, + "learning_rate": 1.4868874222732831e-06, + "loss": 2.324, + "step": 820 + }, + { + "epoch": 0.7932692307692307, + "grad_norm": 1.7178542423600203, + "learning_rate": 1.4634442139260933e-06, + "loss": 2.342, + "step": 825 + }, + { + "epoch": 0.7980769230769231, + "grad_norm": 1.6873420836748758, + "learning_rate": 1.440349234029883e-06, + "loss": 2.3434, + "step": 830 + }, + { + "epoch": 0.8028846153846154, + "grad_norm": 1.742480497378871, + "learning_rate": 1.417597820281471e-06, + "loss": 2.3966, + "step": 835 + }, + { + "epoch": 0.8076923076923077, + "grad_norm": 1.6566648049272492, + "learning_rate": 1.3951853659465747e-06, + "loss": 2.3217, + "step": 840 + }, + { + "epoch": 0.8125, + "grad_norm": 1.78249147233943, + "learning_rate": 1.3731073192794095e-06, + "loss": 2.3719, + "step": 845 + }, + { + "epoch": 0.8173076923076923, + "grad_norm": 1.8035253977271137, + "learning_rate": 1.3513591829475174e-06, + "loss": 2.317, + "step": 850 + }, + { + "epoch": 0.8221153846153846, + "grad_norm": 2.035309467875598, + "learning_rate": 1.3299365134617373e-06, + "loss": 2.313, + "step": 855 + }, + { + "epoch": 0.8269230769230769, + "grad_norm": 1.7174745299655327, + "learning_rate": 1.3088349206113118e-06, + "loss": 2.3239, + "step": 860 + }, + { + "epoch": 0.8317307692307693, + "grad_norm": 1.7333933814361635, + "learning_rate": 1.2880500669040793e-06, + "loss": 2.3025, + "step": 865 + }, + { + "epoch": 0.8365384615384616, + "grad_norm": 1.7754019490280168, + "learning_rate": 1.2675776670117165e-06, + "loss": 2.2899, + "step": 870 + }, + { + "epoch": 0.8413461538461539, + "grad_norm": 1.773766560162585, + "learning_rate": 1.2474134872199916e-06, + "loss": 2.3348, + "step": 875 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 1.6780258578572016, + "learning_rate": 1.2275533448839897e-06, + "loss": 2.3305, + "step": 880 + }, + { + "epoch": 0.8509615384615384, + "grad_norm": 1.733329835045473, + "learning_rate": 1.2079931078882769e-06, + "loss": 2.3059, + "step": 885 + }, + { + "epoch": 0.8557692307692307, + "grad_norm": 1.688022550790151, + "learning_rate": 1.1887286941119609e-06, + "loss": 2.2872, + "step": 890 + }, + { + "epoch": 0.8605769230769231, + "grad_norm": 1.7172166393971702, + "learning_rate": 1.1697560708986142e-06, + "loss": 2.3042, + "step": 895 + }, + { + "epoch": 0.8653846153846154, + "grad_norm": 1.6641411293848463, + "learning_rate": 1.1510712545310206e-06, + "loss": 2.2959, + "step": 900 + }, + { + "epoch": 0.8701923076923077, + "grad_norm": 1.7296381589810081, + "learning_rate": 1.1326703097107125e-06, + "loss": 2.339, + "step": 905 + }, + { + "epoch": 0.875, + "grad_norm": 1.6487202037599287, + "learning_rate": 1.1145493490422558e-06, + "loss": 2.309, + "step": 910 + }, + { + "epoch": 0.8798076923076923, + "grad_norm": 2.181232627254535, + "learning_rate": 1.096704532522256e-06, + "loss": 2.2499, + "step": 915 + }, + { + "epoch": 0.8846153846153846, + "grad_norm": 1.7663666904603283, + "learning_rate": 1.0791320670330332e-06, + "loss": 2.4002, + "step": 920 + }, + { + "epoch": 0.8894230769230769, + "grad_norm": 2.063321871244198, + "learning_rate": 1.061828205840956e-06, + "loss": 2.3313, + "step": 925 + }, + { + "epoch": 0.8942307692307693, + "grad_norm": 1.8140222643627664, + "learning_rate": 1.0447892480993706e-06, + "loss": 2.3454, + "step": 930 + }, + { + "epoch": 0.8990384615384616, + "grad_norm": 1.7048216508873255, + "learning_rate": 1.0280115383561078e-06, + "loss": 2.3296, + "step": 935 + }, + { + "epoch": 0.9038461538461539, + "grad_norm": 1.7706072815766516, + "learning_rate": 1.0114914660655272e-06, + "loss": 2.3379, + "step": 940 + }, + { + "epoch": 0.9086538461538461, + "grad_norm": 1.8968636807180728, + "learning_rate": 9.95225465105065e-07, + "loss": 2.3336, + "step": 945 + }, + { + "epoch": 0.9134615384615384, + "grad_norm": 1.8148188080264716, + "learning_rate": 9.792100132962467e-07, + "loss": 2.3244, + "step": 950 + }, + { + "epoch": 0.9182692307692307, + "grad_norm": 1.700784769345341, + "learning_rate": 9.634416319301388e-07, + "loss": 2.2875, + "step": 955 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 1.678153310810481, + "learning_rate": 9.479168852971943e-07, + "loss": 2.3299, + "step": 960 + }, + { + "epoch": 0.9278846153846154, + "grad_norm": 1.702217146168844, + "learning_rate": 9.326323802214668e-07, + "loss": 2.3312, + "step": 965 + }, + { + "epoch": 0.9326923076923077, + "grad_norm": 1.7687681371145616, + "learning_rate": 9.175847655991562e-07, + "loss": 2.3722, + "step": 970 + }, + { + "epoch": 0.9375, + "grad_norm": 1.7230729231020288, + "learning_rate": 9.027707319414495e-07, + "loss": 2.3735, + "step": 975 + }, + { + "epoch": 0.9423076923076923, + "grad_norm": 1.7291556590880472, + "learning_rate": 8.881870109216298e-07, + "loss": 2.3127, + "step": 980 + }, + { + "epoch": 0.9471153846153846, + "grad_norm": 1.7116649138045492, + "learning_rate": 8.73830374926414e-07, + "loss": 2.3561, + "step": 985 + }, + { + "epoch": 0.9519230769230769, + "grad_norm": 1.6739783387575036, + "learning_rate": 8.596976366114889e-07, + "loss": 2.351, + "step": 990 + }, + { + "epoch": 0.9567307692307693, + "grad_norm": 1.9461130756235225, + "learning_rate": 8.457856484612148e-07, + "loss": 2.3294, + "step": 995 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 1.8094460359927895, + "learning_rate": 8.320913023524591e-07, + "loss": 2.2998, + "step": 1000 + }, + { + "epoch": 0.9615384615384616, + "eval_loss": 2.323676109313965, + "eval_runtime": 85.3479, + "eval_samples_per_second": 86.645, + "eval_steps_per_second": 0.68, + "step": 1000 + }, + { + "epoch": 0.9663461538461539, + "grad_norm": 1.7191097995210292, + "learning_rate": 8.186115291225334e-07, + "loss": 2.3048, + "step": 1005 + }, + { + "epoch": 0.9711538461538461, + "grad_norm": 1.7123549721593059, + "learning_rate": 8.05343298141196e-07, + "loss": 2.2933, + "step": 1010 + }, + { + "epoch": 0.9759615384615384, + "grad_norm": 1.615875433552917, + "learning_rate": 7.922836168866939e-07, + "loss": 2.3564, + "step": 1015 + }, + { + "epoch": 0.9807692307692307, + "grad_norm": 1.928169845331568, + "learning_rate": 7.794295305258064e-07, + "loss": 2.304, + "step": 1020 + }, + { + "epoch": 0.9855769230769231, + "grad_norm": 1.6770198711392135, + "learning_rate": 7.667781214978637e-07, + "loss": 2.3152, + "step": 1025 + }, + { + "epoch": 0.9903846153846154, + "grad_norm": 1.942852074361696, + "learning_rate": 7.543265091027068e-07, + "loss": 2.2961, + "step": 1030 + }, + { + "epoch": 0.9951923076923077, + "grad_norm": 1.7644307035655395, + "learning_rate": 7.420718490925571e-07, + "loss": 2.3559, + "step": 1035 + }, + { + "epoch": 1.0, + "grad_norm": 1.6849031142151147, + "learning_rate": 7.300113332677667e-07, + "loss": 2.2943, + "step": 1040 + }, + { + "epoch": 1.0048076923076923, + "grad_norm": 2.0233629664399646, + "learning_rate": 7.181421890764176e-07, + "loss": 2.1536, + "step": 1045 + }, + { + "epoch": 1.0096153846153846, + "grad_norm": 1.6857528037531342, + "learning_rate": 7.064616792177334e-07, + "loss": 2.1437, + "step": 1050 + }, + { + "epoch": 1.0144230769230769, + "grad_norm": 1.856293792049413, + "learning_rate": 6.949671012492914e-07, + "loss": 2.0699, + "step": 1055 + }, + { + "epoch": 1.0192307692307692, + "grad_norm": 1.8179118022888037, + "learning_rate": 6.836557871979786e-07, + "loss": 2.0974, + "step": 1060 + }, + { + "epoch": 1.0240384615384615, + "grad_norm": 1.8749106071870572, + "learning_rate": 6.725251031746841e-07, + "loss": 2.1025, + "step": 1065 + }, + { + "epoch": 1.0288461538461537, + "grad_norm": 2.4469738972729442, + "learning_rate": 6.61572448992684e-07, + "loss": 2.0592, + "step": 1070 + }, + { + "epoch": 1.0336538461538463, + "grad_norm": 1.9600481862823989, + "learning_rate": 6.507952577896988e-07, + "loss": 2.1909, + "step": 1075 + }, + { + "epoch": 1.0384615384615385, + "grad_norm": 1.7683431826042773, + "learning_rate": 6.401909956535864e-07, + "loss": 2.0983, + "step": 1080 + }, + { + "epoch": 1.0432692307692308, + "grad_norm": 1.8700170966385194, + "learning_rate": 6.297571612516455e-07, + "loss": 2.1326, + "step": 1085 + }, + { + "epoch": 1.0480769230769231, + "grad_norm": 1.7984837423328528, + "learning_rate": 6.194912854635e-07, + "loss": 2.1085, + "step": 1090 + }, + { + "epoch": 1.0528846153846154, + "grad_norm": 1.8234811332020633, + "learning_rate": 6.093909310175343e-07, + "loss": 2.1227, + "step": 1095 + }, + { + "epoch": 1.0576923076923077, + "grad_norm": 1.8669294021521274, + "learning_rate": 5.994536921308514e-07, + "loss": 2.0538, + "step": 1100 + }, + { + "epoch": 1.0625, + "grad_norm": 1.834973873963248, + "learning_rate": 5.896771941527257e-07, + "loss": 2.163, + "step": 1105 + }, + { + "epoch": 1.0673076923076923, + "grad_norm": 1.7568094748940102, + "learning_rate": 5.800590932115227e-07, + "loss": 2.1596, + "step": 1110 + }, + { + "epoch": 1.0721153846153846, + "grad_norm": 1.9456491484317202, + "learning_rate": 5.705970758650521e-07, + "loss": 2.092, + "step": 1115 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 1.8020042844163735, + "learning_rate": 5.612888587543394e-07, + "loss": 2.1022, + "step": 1120 + }, + { + "epoch": 1.0817307692307692, + "grad_norm": 1.853382300488598, + "learning_rate": 5.521321882607727e-07, + "loss": 2.0697, + "step": 1125 + }, + { + "epoch": 1.0865384615384615, + "grad_norm": 1.8362880314320598, + "learning_rate": 5.431248401666053e-07, + "loss": 2.1201, + "step": 1130 + }, + { + "epoch": 1.0913461538461537, + "grad_norm": 1.8442235682625632, + "learning_rate": 5.342646193187874e-07, + "loss": 2.0395, + "step": 1135 + }, + { + "epoch": 1.0961538461538463, + "grad_norm": 1.8214033367021532, + "learning_rate": 5.255493592960974e-07, + "loss": 2.113, + "step": 1140 + }, + { + "epoch": 1.1009615384615385, + "grad_norm": 1.7988424197058015, + "learning_rate": 5.169769220795454e-07, + "loss": 2.131, + "step": 1145 + }, + { + "epoch": 1.1057692307692308, + "grad_norm": 1.8109458308469661, + "learning_rate": 5.085451977260232e-07, + "loss": 2.1636, + "step": 1150 + }, + { + "epoch": 1.1105769230769231, + "grad_norm": 1.8188669425027102, + "learning_rate": 5.00252104045174e-07, + "loss": 2.1307, + "step": 1155 + }, + { + "epoch": 1.1153846153846154, + "grad_norm": 1.7643977620250952, + "learning_rate": 4.920955862794543e-07, + "loss": 2.1029, + "step": 1160 + }, + { + "epoch": 1.1201923076923077, + "grad_norm": 1.871509851180396, + "learning_rate": 4.84073616787364e-07, + "loss": 2.106, + "step": 1165 + }, + { + "epoch": 1.125, + "grad_norm": 1.827457682413712, + "learning_rate": 4.7618419472981506e-07, + "loss": 2.1616, + "step": 1170 + }, + { + "epoch": 1.1298076923076923, + "grad_norm": 1.7536769808765222, + "learning_rate": 4.684253457596156e-07, + "loss": 2.1077, + "step": 1175 + }, + { + "epoch": 1.1346153846153846, + "grad_norm": 1.9063367359144818, + "learning_rate": 4.6079512171404304e-07, + "loss": 2.1849, + "step": 1180 + }, + { + "epoch": 1.1394230769230769, + "grad_norm": 2.145803926574076, + "learning_rate": 4.5329160031047875e-07, + "loss": 2.1577, + "step": 1185 + }, + { + "epoch": 1.1442307692307692, + "grad_norm": 1.8443487836196741, + "learning_rate": 4.4591288484508226e-07, + "loss": 2.064, + "step": 1190 + }, + { + "epoch": 1.1490384615384615, + "grad_norm": 1.815754689621411, + "learning_rate": 4.3865710389447586e-07, + "loss": 2.1008, + "step": 1195 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 1.8139614221776288, + "learning_rate": 4.315224110204174e-07, + "loss": 2.1248, + "step": 1200 + }, + { + "epoch": 1.1538461538461537, + "eval_loss": 2.336085319519043, + "eval_runtime": 85.3746, + "eval_samples_per_second": 86.618, + "eval_steps_per_second": 0.679, + "step": 1200 + }, + { + "epoch": 1.1586538461538463, + "grad_norm": 1.7983716043793538, + "learning_rate": 4.245069844774349e-07, + "loss": 2.0729, + "step": 1205 + }, + { + "epoch": 1.1634615384615385, + "grad_norm": 1.8990292619468592, + "learning_rate": 4.17609026923398e-07, + "loss": 2.1249, + "step": 1210 + }, + { + "epoch": 1.1682692307692308, + "grad_norm": 1.762763830487173, + "learning_rate": 4.1082676513300323e-07, + "loss": 2.154, + "step": 1215 + }, + { + "epoch": 1.1730769230769231, + "grad_norm": 1.759984272000879, + "learning_rate": 4.0415844971414616e-07, + "loss": 2.1299, + "step": 1220 + }, + { + "epoch": 1.1778846153846154, + "grad_norm": 1.7856327184643472, + "learning_rate": 3.976023548271586e-07, + "loss": 2.1663, + "step": 1225 + }, + { + "epoch": 1.1826923076923077, + "grad_norm": 1.8453273970913073, + "learning_rate": 3.9115677790688485e-07, + "loss": 2.1115, + "step": 1230 + }, + { + "epoch": 1.1875, + "grad_norm": 1.7711541036032603, + "learning_rate": 3.8482003938757386e-07, + "loss": 2.1207, + "step": 1235 + }, + { + "epoch": 1.1923076923076923, + "grad_norm": 1.7750356264689093, + "learning_rate": 3.78590482430564e-07, + "loss": 2.0857, + "step": 1240 + }, + { + "epoch": 1.1971153846153846, + "grad_norm": 1.7976368503882154, + "learning_rate": 3.724664726547351e-07, + "loss": 2.1386, + "step": 1245 + }, + { + "epoch": 1.2019230769230769, + "grad_norm": 1.829414461965732, + "learning_rate": 3.6644639786970623e-07, + "loss": 2.174, + "step": 1250 + }, + { + "epoch": 1.2067307692307692, + "grad_norm": 1.825361485465677, + "learning_rate": 3.6052866781175476e-07, + "loss": 2.1057, + "step": 1255 + }, + { + "epoch": 1.2115384615384615, + "grad_norm": 1.8292622951367188, + "learning_rate": 3.547117138824332e-07, + "loss": 2.08, + "step": 1260 + }, + { + "epoch": 1.2163461538461537, + "grad_norm": 1.8307121677285738, + "learning_rate": 3.48993988889863e-07, + "loss": 2.1154, + "step": 1265 + }, + { + "epoch": 1.2211538461538463, + "grad_norm": 1.862688434301242, + "learning_rate": 3.433739667926769e-07, + "loss": 2.0719, + "step": 1270 + }, + { + "epoch": 1.2259615384615385, + "grad_norm": 1.8172648051882496, + "learning_rate": 3.378501424465974e-07, + "loss": 2.08, + "step": 1275 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 1.831590098407615, + "learning_rate": 3.3242103135361645e-07, + "loss": 2.1313, + "step": 1280 + }, + { + "epoch": 1.2355769230769231, + "grad_norm": 1.8337034054812522, + "learning_rate": 3.2708516941376294e-07, + "loss": 2.1436, + "step": 1285 + }, + { + "epoch": 1.2403846153846154, + "grad_norm": 1.8090147347855563, + "learning_rate": 3.218411126794323e-07, + "loss": 2.1503, + "step": 1290 + }, + { + "epoch": 1.2451923076923077, + "grad_norm": 1.8544882033122045, + "learning_rate": 3.166874371122564e-07, + "loss": 2.1303, + "step": 1295 + }, + { + "epoch": 1.25, + "grad_norm": 1.781492016300762, + "learning_rate": 3.116227383424919e-07, + "loss": 2.0967, + "step": 1300 + }, + { + "epoch": 1.2548076923076923, + "grad_norm": 1.8889890359608847, + "learning_rate": 3.066456314309059e-07, + "loss": 2.0931, + "step": 1305 + }, + { + "epoch": 1.2596153846153846, + "grad_norm": 1.8232794987114287, + "learning_rate": 3.017547506331364e-07, + "loss": 2.1251, + "step": 1310 + }, + { + "epoch": 1.2644230769230769, + "grad_norm": 1.8856640991380471, + "learning_rate": 2.969487491665068e-07, + "loss": 2.1139, + "step": 1315 + }, + { + "epoch": 1.2692307692307692, + "grad_norm": 1.7930598313625747, + "learning_rate": 2.9222629897927087e-07, + "loss": 2.1204, + "step": 1320 + }, + { + "epoch": 1.2740384615384617, + "grad_norm": 1.8132589043201648, + "learning_rate": 2.8758609052227305e-07, + "loss": 2.034, + "step": 1325 + }, + { + "epoch": 1.2788461538461537, + "grad_norm": 1.8767260044973102, + "learning_rate": 2.830268325229947e-07, + "loss": 2.1215, + "step": 1330 + }, + { + "epoch": 1.2836538461538463, + "grad_norm": 1.8491028909697207, + "learning_rate": 2.785472517619713e-07, + "loss": 2.1328, + "step": 1335 + }, + { + "epoch": 1.2884615384615383, + "grad_norm": 1.9076802028303976, + "learning_rate": 2.74146092851559e-07, + "loss": 2.084, + "step": 1340 + }, + { + "epoch": 1.2932692307692308, + "grad_norm": 1.849289922308255, + "learning_rate": 2.698221180170271e-07, + "loss": 2.1259, + "step": 1345 + }, + { + "epoch": 1.2980769230769231, + "grad_norm": 1.7905203171901232, + "learning_rate": 2.6557410687996006e-07, + "loss": 2.1151, + "step": 1350 + }, + { + "epoch": 1.3028846153846154, + "grad_norm": 1.8830908621706892, + "learning_rate": 2.6140085624394526e-07, + "loss": 2.1457, + "step": 1355 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 1.8596784397686372, + "learning_rate": 2.573011798825286e-07, + "loss": 2.073, + "step": 1360 + }, + { + "epoch": 1.3125, + "grad_norm": 1.8448017924414952, + "learning_rate": 2.5327390832941644e-07, + "loss": 2.1286, + "step": 1365 + }, + { + "epoch": 1.3173076923076923, + "grad_norm": 2.0018781537530996, + "learning_rate": 2.4931788867090523e-07, + "loss": 2.09, + "step": 1370 + }, + { + "epoch": 1.3221153846153846, + "grad_norm": 1.8762757684058704, + "learning_rate": 2.4543198434051835e-07, + "loss": 2.075, + "step": 1375 + }, + { + "epoch": 1.3269230769230769, + "grad_norm": 1.952448677696025, + "learning_rate": 2.4161507491583033e-07, + "loss": 2.1256, + "step": 1380 + }, + { + "epoch": 1.3317307692307692, + "grad_norm": 1.8165760972158784, + "learning_rate": 2.3786605591746012e-07, + "loss": 2.0566, + "step": 1385 + }, + { + "epoch": 1.3365384615384617, + "grad_norm": 5.253827520965963, + "learning_rate": 2.341838386102127e-07, + "loss": 2.2116, + "step": 1390 + }, + { + "epoch": 1.3413461538461537, + "grad_norm": 1.8446995708115508, + "learning_rate": 2.3056734980635093e-07, + "loss": 2.1001, + "step": 1395 + }, + { + "epoch": 1.3461538461538463, + "grad_norm": 1.9617802338733952, + "learning_rate": 2.2701553167097801e-07, + "loss": 2.1239, + "step": 1400 + }, + { + "epoch": 1.3461538461538463, + "eval_loss": 2.334371566772461, + "eval_runtime": 85.4548, + "eval_samples_per_second": 86.537, + "eval_steps_per_second": 0.679, + "step": 1400 + }, + { + "epoch": 1.3509615384615383, + "grad_norm": 1.8285827211419716, + "learning_rate": 2.2352734152951196e-07, + "loss": 2.1184, + "step": 1405 + }, + { + "epoch": 1.3557692307692308, + "grad_norm": 2.0394120658337305, + "learning_rate": 2.2010175167723296e-07, + "loss": 2.0568, + "step": 1410 + }, + { + "epoch": 1.3605769230769231, + "grad_norm": 1.7875137882919705, + "learning_rate": 2.167377491908854e-07, + "loss": 2.0625, + "step": 1415 + }, + { + "epoch": 1.3653846153846154, + "grad_norm": 1.7866761410178333, + "learning_rate": 2.134343357423158e-07, + "loss": 2.0555, + "step": 1420 + }, + { + "epoch": 1.3701923076923077, + "grad_norm": 1.932563852514787, + "learning_rate": 2.101905274141283e-07, + "loss": 2.1069, + "step": 1425 + }, + { + "epoch": 1.375, + "grad_norm": 1.9475188936955665, + "learning_rate": 2.0700535451733951e-07, + "loss": 2.1086, + "step": 1430 + }, + { + "epoch": 1.3798076923076923, + "grad_norm": 1.8526120458954936, + "learning_rate": 2.0387786141101492e-07, + "loss": 2.1378, + "step": 1435 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 1.8562018803586509, + "learning_rate": 2.0080710632386802e-07, + "loss": 2.1353, + "step": 1440 + }, + { + "epoch": 1.3894230769230769, + "grad_norm": 1.8313311377456998, + "learning_rate": 1.9779216117780527e-07, + "loss": 2.1171, + "step": 1445 + }, + { + "epoch": 1.3942307692307692, + "grad_norm": 1.8142973032453498, + "learning_rate": 1.9483211141339894e-07, + "loss": 2.0766, + "step": 1450 + }, + { + "epoch": 1.3990384615384617, + "grad_norm": 1.8237674767411933, + "learning_rate": 1.9192605581726967e-07, + "loss": 2.1593, + "step": 1455 + }, + { + "epoch": 1.4038461538461537, + "grad_norm": 1.772508678674097, + "learning_rate": 1.8907310635136197e-07, + "loss": 2.1314, + "step": 1460 + }, + { + "epoch": 1.4086538461538463, + "grad_norm": 1.8899727080269664, + "learning_rate": 1.8627238798409526e-07, + "loss": 2.0845, + "step": 1465 + }, + { + "epoch": 1.4134615384615383, + "grad_norm": 1.90653257600126, + "learning_rate": 1.8352303852337284e-07, + "loss": 2.1508, + "step": 1470 + }, + { + "epoch": 1.4182692307692308, + "grad_norm": 1.8534900824085168, + "learning_rate": 1.8082420845143144e-07, + "loss": 2.0896, + "step": 1475 + }, + { + "epoch": 1.4230769230769231, + "grad_norm": 1.8066064812360683, + "learning_rate": 1.7817506076151663e-07, + "loss": 2.1493, + "step": 1480 + }, + { + "epoch": 1.4278846153846154, + "grad_norm": 1.8590166269045232, + "learning_rate": 1.7557477079636372e-07, + "loss": 2.0614, + "step": 1485 + }, + { + "epoch": 1.4326923076923077, + "grad_norm": 1.8782140024216563, + "learning_rate": 1.7302252608847008e-07, + "loss": 2.0691, + "step": 1490 + }, + { + "epoch": 1.4375, + "grad_norm": 1.8729309652922037, + "learning_rate": 1.7051752620214163e-07, + "loss": 2.0573, + "step": 1495 + }, + { + "epoch": 1.4423076923076923, + "grad_norm": 1.8894921416533526, + "learning_rate": 1.6805898257729673e-07, + "loss": 2.0936, + "step": 1500 + }, + { + "epoch": 1.4471153846153846, + "grad_norm": 1.9015071278716307, + "learning_rate": 1.6564611837501148e-07, + "loss": 2.0837, + "step": 1505 + }, + { + "epoch": 1.4519230769230769, + "grad_norm": 1.8197453987244108, + "learning_rate": 1.6327816832478985e-07, + "loss": 2.1064, + "step": 1510 + }, + { + "epoch": 1.4567307692307692, + "grad_norm": 1.8526075910672721, + "learning_rate": 1.6095437857354324e-07, + "loss": 2.0926, + "step": 1515 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 1.8572065984966375, + "learning_rate": 1.586740065362626e-07, + "loss": 2.0582, + "step": 1520 + }, + { + "epoch": 1.4663461538461537, + "grad_norm": 1.8156159477376175, + "learning_rate": 1.5643632074836825e-07, + "loss": 2.1037, + "step": 1525 + }, + { + "epoch": 1.4711538461538463, + "grad_norm": 1.8649198187665965, + "learning_rate": 1.5424060071972007e-07, + "loss": 2.125, + "step": 1530 + }, + { + "epoch": 1.4759615384615383, + "grad_norm": 1.8545497800311697, + "learning_rate": 1.5208613679027549e-07, + "loss": 2.0884, + "step": 1535 + }, + { + "epoch": 1.4807692307692308, + "grad_norm": 1.8606969338206512, + "learning_rate": 1.4997222998737582e-07, + "loss": 2.1157, + "step": 1540 + }, + { + "epoch": 1.4855769230769231, + "grad_norm": 1.8859903197241183, + "learning_rate": 1.478981918846486e-07, + "loss": 2.1273, + "step": 1545 + }, + { + "epoch": 1.4903846153846154, + "grad_norm": 1.8869329872162925, + "learning_rate": 1.4586334446250955e-07, + "loss": 2.1386, + "step": 1550 + }, + { + "epoch": 1.4951923076923077, + "grad_norm": 1.860329950662595, + "learning_rate": 1.43867019970249e-07, + "loss": 2.157, + "step": 1555 + }, + { + "epoch": 1.5, + "grad_norm": 1.8134076526838725, + "learning_rate": 1.419085607896877e-07, + "loss": 2.1129, + "step": 1560 + }, + { + "epoch": 1.5048076923076923, + "grad_norm": 1.8259889434431678, + "learning_rate": 1.3998731930038773e-07, + "loss": 2.1292, + "step": 1565 + }, + { + "epoch": 1.5096153846153846, + "grad_norm": 1.8908539458019609, + "learning_rate": 1.381026577464028e-07, + "loss": 2.1286, + "step": 1570 + }, + { + "epoch": 1.5144230769230769, + "grad_norm": 1.7930674977942935, + "learning_rate": 1.3625394810455382e-07, + "loss": 2.1092, + "step": 1575 + }, + { + "epoch": 1.5192307692307692, + "grad_norm": 1.8496202978075098, + "learning_rate": 1.3444057195421526e-07, + "loss": 2.1075, + "step": 1580 + }, + { + "epoch": 1.5240384615384617, + "grad_norm": 1.8344118160186549, + "learning_rate": 1.326619203485973e-07, + "loss": 2.1007, + "step": 1585 + }, + { + "epoch": 1.5288461538461537, + "grad_norm": 1.8585688089026406, + "learning_rate": 1.3091739368750989e-07, + "loss": 2.1521, + "step": 1590 + }, + { + "epoch": 1.5336538461538463, + "grad_norm": 2.0502623341105517, + "learning_rate": 1.292064015915944e-07, + "loss": 2.0904, + "step": 1595 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 1.8474141432895723, + "learning_rate": 1.2752836277800852e-07, + "loss": 2.1521, + "step": 1600 + }, + { + "epoch": 1.5384615384615383, + "eval_loss": 2.333831548690796, + "eval_runtime": 85.4542, + "eval_samples_per_second": 86.538, + "eval_steps_per_second": 0.679, + "step": 1600 + }, + { + "epoch": 1.5432692307692308, + "grad_norm": 1.908368834971653, + "learning_rate": 1.2588270493755057e-07, + "loss": 2.0545, + "step": 1605 + }, + { + "epoch": 1.5480769230769231, + "grad_norm": 1.8891697271029433, + "learning_rate": 1.242688646132092e-07, + "loss": 2.1085, + "step": 1610 + }, + { + "epoch": 1.5528846153846154, + "grad_norm": 1.8238620642049488, + "learning_rate": 1.22686287080125e-07, + "loss": 2.1416, + "step": 1615 + }, + { + "epoch": 1.5576923076923077, + "grad_norm": 1.845379742670226, + "learning_rate": 1.2113442622694955e-07, + "loss": 2.0587, + "step": 1620 + }, + { + "epoch": 1.5625, + "grad_norm": 1.760419766434776, + "learning_rate": 1.1961274443858932e-07, + "loss": 2.0988, + "step": 1625 + }, + { + "epoch": 1.5673076923076923, + "grad_norm": 1.9500128322951924, + "learning_rate": 1.1812071248031999e-07, + "loss": 2.1024, + "step": 1630 + }, + { + "epoch": 1.5721153846153846, + "grad_norm": 1.8158972995099203, + "learning_rate": 1.1665780938325871e-07, + "loss": 2.1387, + "step": 1635 + }, + { + "epoch": 1.5769230769230769, + "grad_norm": 1.86611749153697, + "learning_rate": 1.152235223311802e-07, + "loss": 2.1525, + "step": 1640 + }, + { + "epoch": 1.5817307692307692, + "grad_norm": 1.8447983570027537, + "learning_rate": 1.1381734654866389e-07, + "loss": 2.0554, + "step": 1645 + }, + { + "epoch": 1.5865384615384617, + "grad_norm": 1.828362228823549, + "learning_rate": 1.1243878519055928e-07, + "loss": 2.1187, + "step": 1650 + }, + { + "epoch": 1.5913461538461537, + "grad_norm": 1.947875660376608, + "learning_rate": 1.1108734923275605e-07, + "loss": 2.0531, + "step": 1655 + }, + { + "epoch": 1.5961538461538463, + "grad_norm": 1.818226522118368, + "learning_rate": 1.0976255736424637e-07, + "loss": 2.1036, + "step": 1660 + }, + { + "epoch": 1.6009615384615383, + "grad_norm": 1.9755891501080045, + "learning_rate": 1.0846393588046656e-07, + "loss": 2.1296, + "step": 1665 + }, + { + "epoch": 1.6057692307692308, + "grad_norm": 1.8165676756032596, + "learning_rate": 1.0719101857790552e-07, + "loss": 2.0842, + "step": 1670 + }, + { + "epoch": 1.6105769230769231, + "grad_norm": 1.8480994780626476, + "learning_rate": 1.0594334664996721e-07, + "loss": 2.0833, + "step": 1675 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 1.7568276519420272, + "learning_rate": 1.0472046858407492e-07, + "loss": 2.1152, + "step": 1680 + }, + { + "epoch": 1.6201923076923077, + "grad_norm": 1.8155268250435754, + "learning_rate": 1.0352194006000441e-07, + "loss": 2.1277, + "step": 1685 + }, + { + "epoch": 1.625, + "grad_norm": 1.8688450613110825, + "learning_rate": 1.0234732384943512e-07, + "loss": 2.055, + "step": 1690 + }, + { + "epoch": 1.6298076923076923, + "grad_norm": 1.834466807811679, + "learning_rate": 1.0119618971670507e-07, + "loss": 2.1648, + "step": 1695 + }, + { + "epoch": 1.6346153846153846, + "grad_norm": 1.9150332485014145, + "learning_rate": 1.0006811432075942e-07, + "loss": 2.0587, + "step": 1700 + }, + { + "epoch": 1.6394230769230769, + "grad_norm": 1.866607921147843, + "learning_rate": 9.896268111827943e-08, + "loss": 2.076, + "step": 1705 + }, + { + "epoch": 1.6442307692307692, + "grad_norm": 1.8656204113992287, + "learning_rate": 9.787948026798065e-08, + "loss": 2.1168, + "step": 1710 + }, + { + "epoch": 1.6490384615384617, + "grad_norm": 1.849474324070502, + "learning_rate": 9.68181085360681e-08, + "loss": 2.1075, + "step": 1715 + }, + { + "epoch": 1.6538461538461537, + "grad_norm": 1.8108526684678354, + "learning_rate": 9.57781692028372e-08, + "loss": 2.1368, + "step": 1720 + }, + { + "epoch": 1.6586538461538463, + "grad_norm": 1.8133873110154997, + "learning_rate": 9.475927197040834e-08, + "loss": 2.088, + "step": 1725 + }, + { + "epoch": 1.6634615384615383, + "grad_norm": 1.8155032971792053, + "learning_rate": 9.376103287158425e-08, + "loss": 2.1397, + "step": 1730 + }, + { + "epoch": 1.6682692307692308, + "grad_norm": 1.8962575557301127, + "learning_rate": 9.278307417981768e-08, + "loss": 2.116, + "step": 1735 + }, + { + "epoch": 1.6730769230769231, + "grad_norm": 1.8976326651339515, + "learning_rate": 9.182502432027988e-08, + "loss": 2.0869, + "step": 1740 + }, + { + "epoch": 1.6778846153846154, + "grad_norm": 1.805419356077963, + "learning_rate": 9.107267296696801e-08, + "loss": 2.0926, + "step": 1745 + }, + { + "epoch": 1.6826923076923077, + "grad_norm": 1.8237173931210868, + "learning_rate": 9.014954193734225e-08, + "loss": 2.07, + "step": 1750 + }, + { + "epoch": 1.6875, + "grad_norm": 1.874303236724565, + "learning_rate": 8.924531131396056e-08, + "loss": 2.0852, + "step": 1755 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 1.8446431514031785, + "learning_rate": 8.835963210651791e-08, + "loss": 2.0639, + "step": 1760 + }, + { + "epoch": 1.6971153846153846, + "grad_norm": 1.8962482308020339, + "learning_rate": 8.749216106451011e-08, + "loss": 2.1162, + "step": 1765 + }, + { + "epoch": 1.7019230769230769, + "grad_norm": 1.8192264354608538, + "learning_rate": 8.664256059446181e-08, + "loss": 2.1065, + "step": 1770 + }, + { + "epoch": 1.7067307692307692, + "grad_norm": 2.366332770975045, + "learning_rate": 8.581049867817956e-08, + "loss": 2.0625, + "step": 1775 + }, + { + "epoch": 1.7115384615384617, + "grad_norm": 1.8446173561965722, + "learning_rate": 8.499564879201958e-08, + "loss": 2.0537, + "step": 1780 + }, + { + "epoch": 1.7163461538461537, + "grad_norm": 1.8507785394900198, + "learning_rate": 8.419768982715971e-08, + "loss": 2.1093, + "step": 1785 + }, + { + "epoch": 1.7211538461538463, + "grad_norm": 1.9304487119438947, + "learning_rate": 8.341630601086485e-08, + "loss": 2.118, + "step": 1790 + }, + { + "epoch": 1.7259615384615383, + "grad_norm": 1.8294859378005517, + "learning_rate": 8.265118682873593e-08, + "loss": 2.1369, + "step": 1795 + }, + { + "epoch": 1.7307692307692308, + "grad_norm": 1.8613822811922678, + "learning_rate": 8.190202694793183e-08, + "loss": 2.1359, + "step": 1800 + }, + { + "epoch": 1.7307692307692308, + "eval_loss": 2.333617687225342, + "eval_runtime": 85.3403, + "eval_samples_per_second": 86.653, + "eval_steps_per_second": 0.68, + "step": 1800 + }, + { + "epoch": 1.7355769230769231, + "grad_norm": 1.8159457192343773, + "learning_rate": 8.116852614135445e-08, + "loss": 2.1222, + "step": 1805 + }, + { + "epoch": 1.7403846153846154, + "grad_norm": 1.857716576691175, + "learning_rate": 8.045038921278602e-08, + "loss": 2.1139, + "step": 1810 + }, + { + "epoch": 1.7451923076923077, + "grad_norm": 1.8694725467916173, + "learning_rate": 7.974732592297013e-08, + "loss": 2.094, + "step": 1815 + }, + { + "epoch": 1.75, + "grad_norm": 1.8560579082110327, + "learning_rate": 7.905905091662493e-08, + "loss": 2.1622, + "step": 1820 + }, + { + "epoch": 1.7548076923076923, + "grad_norm": 1.875970072144303, + "learning_rate": 7.838528365037967e-08, + "loss": 2.1179, + "step": 1825 + }, + { + "epoch": 1.7596153846153846, + "grad_norm": 1.9019026590876629, + "learning_rate": 7.77257483216247e-08, + "loss": 2.1137, + "step": 1830 + }, + { + "epoch": 1.7644230769230769, + "grad_norm": 1.8292496367699893, + "learning_rate": 7.708017379826487e-08, + "loss": 2.0573, + "step": 1835 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 1.8672483366732924, + "learning_rate": 7.644829354936725e-08, + "loss": 2.1275, + "step": 1840 + }, + { + "epoch": 1.7740384615384617, + "grad_norm": 1.734535999037372, + "learning_rate": 7.582984557669328e-08, + "loss": 2.0798, + "step": 1845 + }, + { + "epoch": 1.7788461538461537, + "grad_norm": 1.8512196694843002, + "learning_rate": 7.52245723471061e-08, + "loss": 2.1569, + "step": 1850 + }, + { + "epoch": 1.7836538461538463, + "grad_norm": 1.7836085149148238, + "learning_rate": 7.463222072584383e-08, + "loss": 2.1196, + "step": 1855 + }, + { + "epoch": 1.7884615384615383, + "grad_norm": 1.8793796811188046, + "learning_rate": 7.405254191064901e-08, + "loss": 2.0593, + "step": 1860 + }, + { + "epoch": 1.7932692307692308, + "grad_norm": 1.8737352256216766, + "learning_rate": 7.348529136674602e-08, + "loss": 2.0905, + "step": 1865 + }, + { + "epoch": 1.7980769230769231, + "grad_norm": 1.832908496175927, + "learning_rate": 7.293022876265624e-08, + "loss": 2.1636, + "step": 1870 + }, + { + "epoch": 1.8028846153846154, + "grad_norm": 1.914652585529052, + "learning_rate": 7.23871179068426e-08, + "loss": 2.1163, + "step": 1875 + }, + { + "epoch": 1.8076923076923077, + "grad_norm": 1.8575655442671353, + "learning_rate": 7.185572668517463e-08, + "loss": 2.0961, + "step": 1880 + }, + { + "epoch": 1.8125, + "grad_norm": 1.872595689449834, + "learning_rate": 7.133582699920455e-08, + "loss": 2.1504, + "step": 1885 + }, + { + "epoch": 1.8173076923076923, + "grad_norm": 1.8150069813971093, + "learning_rate": 7.082719470524635e-08, + "loss": 2.1249, + "step": 1890 + }, + { + "epoch": 1.8221153846153846, + "grad_norm": 1.892110067355825, + "learning_rate": 7.032960955424859e-08, + "loss": 2.0501, + "step": 1895 + }, + { + "epoch": 1.8269230769230769, + "grad_norm": 2.017115554517963, + "learning_rate": 6.98428551324525e-08, + "loss": 2.0568, + "step": 1900 + }, + { + "epoch": 1.8317307692307692, + "grad_norm": 1.8844252622464137, + "learning_rate": 6.936671880282684e-08, + "loss": 2.1413, + "step": 1905 + }, + { + "epoch": 1.8365384615384617, + "grad_norm": 1.8438419406531692, + "learning_rate": 6.890099164727089e-08, + "loss": 2.1635, + "step": 1910 + }, + { + "epoch": 1.8413461538461537, + "grad_norm": 1.8996214354229564, + "learning_rate": 6.844546840957736e-08, + "loss": 2.1141, + "step": 1915 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 1.7579428295336565, + "learning_rate": 6.799994743914665e-08, + "loss": 2.0918, + "step": 1920 + }, + { + "epoch": 1.8509615384615383, + "grad_norm": 1.7922772912832896, + "learning_rate": 6.756423063544432e-08, + "loss": 2.078, + "step": 1925 + }, + { + "epoch": 1.8557692307692308, + "grad_norm": 1.8562342019145215, + "learning_rate": 6.713812339319366e-08, + "loss": 2.1416, + "step": 1930 + }, + { + "epoch": 1.8605769230769231, + "grad_norm": 1.9439324971687737, + "learning_rate": 6.672143454829497e-08, + "loss": 2.1372, + "step": 1935 + }, + { + "epoch": 1.8653846153846154, + "grad_norm": 1.8774979949999377, + "learning_rate": 6.631397632446378e-08, + "loss": 2.1379, + "step": 1940 + }, + { + "epoch": 1.8701923076923077, + "grad_norm": 1.842493871682372, + "learning_rate": 6.591556428057989e-08, + "loss": 2.101, + "step": 1945 + }, + { + "epoch": 1.875, + "grad_norm": 1.7980810141414054, + "learning_rate": 6.552601725873927e-08, + "loss": 2.1336, + "step": 1950 + }, + { + "epoch": 1.8798076923076923, + "grad_norm": 1.909273446139313, + "learning_rate": 6.514515733300119e-08, + "loss": 2.1389, + "step": 1955 + }, + { + "epoch": 1.8846153846153846, + "grad_norm": 1.9398969365111554, + "learning_rate": 6.484660656765394e-08, + "loss": 2.1039, + "step": 1960 + }, + { + "epoch": 1.8894230769230769, + "grad_norm": 1.85453008710647, + "learning_rate": 6.448094516468652e-08, + "loss": 2.0795, + "step": 1965 + }, + { + "epoch": 1.8942307692307692, + "grad_norm": 1.7956663379402615, + "learning_rate": 6.412348943141603e-08, + "loss": 2.1183, + "step": 1970 + }, + { + "epoch": 1.8990384615384617, + "grad_norm": 2.078977441304735, + "learning_rate": 6.377407326795944e-08, + "loss": 2.0763, + "step": 1975 + }, + { + "epoch": 1.9038461538461537, + "grad_norm": 1.757810065596903, + "learning_rate": 6.343253356981554e-08, + "loss": 2.13, + "step": 1980 + }, + { + "epoch": 1.9086538461538463, + "grad_norm": 1.8683875085590016, + "learning_rate": 6.309871018049243e-08, + "loss": 2.0809, + "step": 1985 + }, + { + "epoch": 1.9134615384615383, + "grad_norm": 1.7848369332013463, + "learning_rate": 6.277244584477894e-08, + "loss": 2.1428, + "step": 1990 + }, + { + "epoch": 1.9182692307692308, + "grad_norm": 1.802325866848323, + "learning_rate": 6.245358616265204e-08, + "loss": 2.0786, + "step": 1995 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 1.807966959879067, + "learning_rate": 6.214197954381353e-08, + "loss": 2.0531, + "step": 2000 + }, + { + "epoch": 1.9230769230769231, + "eval_loss": 2.333247184753418, + "eval_runtime": 85.394, + "eval_samples_per_second": 86.599, + "eval_steps_per_second": 0.679, + "step": 2000 + }, + { + "epoch": 1.9278846153846154, + "grad_norm": 1.779659361884406, + "learning_rate": 6.183747716284858e-08, + "loss": 2.1421, + "step": 2005 + }, + { + "epoch": 1.9326923076923077, + "grad_norm": 1.9140174756953598, + "learning_rate": 6.153993291499917e-08, + "loss": 2.1539, + "step": 2010 + }, + { + "epoch": 1.9375, + "grad_norm": 1.8616242261169418, + "learning_rate": 6.124920337254512e-08, + "loss": 2.1089, + "step": 2015 + }, + { + "epoch": 1.9423076923076923, + "grad_norm": 1.88338038531167, + "learning_rate": 6.096514774178612e-08, + "loss": 2.0954, + "step": 2020 + }, + { + "epoch": 1.9471153846153846, + "grad_norm": 1.9384073065008345, + "learning_rate": 6.068762782061749e-08, + "loss": 2.1067, + "step": 2025 + }, + { + "epoch": 1.9519230769230769, + "grad_norm": 1.7842608425146953, + "learning_rate": 6.04165079566931e-08, + "loss": 2.0734, + "step": 2030 + }, + { + "epoch": 1.9567307692307692, + "grad_norm": 1.8980213968050201, + "learning_rate": 6.015165500616844e-08, + "loss": 2.1398, + "step": 2035 + }, + { + "epoch": 1.9615384615384617, + "grad_norm": 1.8854870321306716, + "learning_rate": 5.989293829301721e-08, + "loss": 2.0905, + "step": 2040 + }, + { + "epoch": 1.9663461538461537, + "grad_norm": 1.8366214101050582, + "learning_rate": 5.964022956891487e-08, + "loss": 2.1192, + "step": 2045 + }, + { + "epoch": 1.9711538461538463, + "grad_norm": 1.9702601160939885, + "learning_rate": 5.9393402973682475e-08, + "loss": 2.0562, + "step": 2050 + }, + { + "epoch": 1.9759615384615383, + "grad_norm": 1.7854608377655588, + "learning_rate": 5.915233499628401e-08, + "loss": 2.0958, + "step": 2055 + }, + { + "epoch": 1.9807692307692308, + "grad_norm": 1.8080366636915477, + "learning_rate": 5.8916904436371357e-08, + "loss": 2.118, + "step": 2060 + }, + { + "epoch": 1.9855769230769231, + "grad_norm": 1.7747943415915806, + "learning_rate": 5.868699236636974e-08, + "loss": 2.0928, + "step": 2065 + }, + { + "epoch": 1.9903846153846154, + "grad_norm": 2.0207986490578067, + "learning_rate": 5.846248209409795e-08, + "loss": 2.1142, + "step": 2070 + }, + { + "epoch": 1.9951923076923077, + "grad_norm": 1.7957289252600956, + "learning_rate": 5.824325912591659e-08, + "loss": 2.144, + "step": 2075 + }, + { + "epoch": 2.0, + "grad_norm": 1.8248097411911974, + "learning_rate": 5.802921113039837e-08, + "loss": 2.1047, + "step": 2080 + }, + { + "epoch": 2.0048076923076925, + "grad_norm": 1.7961928041751198, + "learning_rate": 5.782022790251414e-08, + "loss": 2.1187, + "step": 2085 + }, + { + "epoch": 2.0096153846153846, + "grad_norm": 1.8336585044351084, + "learning_rate": 5.761620132832865e-08, + "loss": 2.0685, + "step": 2090 + }, + { + "epoch": 2.014423076923077, + "grad_norm": 1.8219809800904603, + "learning_rate": 5.741702535019987e-08, + "loss": 2.0564, + "step": 2095 + }, + { + "epoch": 2.019230769230769, + "grad_norm": 1.819040393659182, + "learning_rate": 5.722259593247595e-08, + "loss": 2.1339, + "step": 2100 + }, + { + "epoch": 2.0240384615384617, + "grad_norm": 1.8732187096486306, + "learning_rate": 5.703281102768385e-08, + "loss": 2.0996, + "step": 2105 + }, + { + "epoch": 2.0288461538461537, + "grad_norm": 1.8473280371987284, + "learning_rate": 5.684757054320374e-08, + "loss": 2.1093, + "step": 2110 + }, + { + "epoch": 2.0336538461538463, + "grad_norm": 1.8326317747277034, + "learning_rate": 5.6666776308423326e-08, + "loss": 2.1007, + "step": 2115 + }, + { + "epoch": 2.0384615384615383, + "grad_norm": 1.7796391236885234, + "learning_rate": 5.649033204236644e-08, + "loss": 2.0974, + "step": 2120 + }, + { + "epoch": 2.043269230769231, + "grad_norm": 1.8279643679656394, + "learning_rate": 5.631814332179001e-08, + "loss": 2.1061, + "step": 2125 + }, + { + "epoch": 2.048076923076923, + "grad_norm": 1.915680312366823, + "learning_rate": 5.615011754974382e-08, + "loss": 2.095, + "step": 2130 + }, + { + "epoch": 2.0528846153846154, + "grad_norm": 1.8545098240752675, + "learning_rate": 5.5986163924587514e-08, + "loss": 2.0248, + "step": 2135 + }, + { + "epoch": 2.0576923076923075, + "grad_norm": 2.5876380487293065, + "learning_rate": 5.5826193409459206e-08, + "loss": 2.0417, + "step": 2140 + }, + { + "epoch": 2.0625, + "grad_norm": 1.8049671277672117, + "learning_rate": 5.567011870219021e-08, + "loss": 2.0592, + "step": 2145 + }, + { + "epoch": 2.0673076923076925, + "grad_norm": 1.875703854921943, + "learning_rate": 5.551785420566048e-08, + "loss": 2.0804, + "step": 2150 + }, + { + "epoch": 2.0721153846153846, + "grad_norm": 1.8546691508228774, + "learning_rate": 5.536931599858935e-08, + "loss": 2.0805, + "step": 2155 + }, + { + "epoch": 2.076923076923077, + "grad_norm": 1.773767471396823, + "learning_rate": 5.522442180675621e-08, + "loss": 2.056, + "step": 2160 + }, + { + "epoch": 2.081730769230769, + "grad_norm": 1.861161247873578, + "learning_rate": 5.508309097464585e-08, + "loss": 2.0671, + "step": 2165 + }, + { + "epoch": 2.0865384615384617, + "grad_norm": 1.7742050719059044, + "learning_rate": 5.494524443751328e-08, + "loss": 2.0738, + "step": 2170 + }, + { + "epoch": 2.0913461538461537, + "grad_norm": 1.8318030243960468, + "learning_rate": 5.481080469386275e-08, + "loss": 2.0907, + "step": 2175 + }, + { + "epoch": 2.0961538461538463, + "grad_norm": 1.778257367233478, + "learning_rate": 5.467969577833591e-08, + "loss": 2.0639, + "step": 2180 + }, + { + "epoch": 2.1009615384615383, + "grad_norm": 1.867111620525417, + "learning_rate": 5.455184323500402e-08, + "loss": 2.105, + "step": 2185 + }, + { + "epoch": 2.105769230769231, + "grad_norm": 1.8898912766644747, + "learning_rate": 5.442717409105915e-08, + "loss": 2.0611, + "step": 2190 + }, + { + "epoch": 2.110576923076923, + "grad_norm": 1.9217461466226302, + "learning_rate": 5.430561683089944e-08, + "loss": 2.0806, + "step": 2195 + }, + { + "epoch": 2.1153846153846154, + "grad_norm": 1.861293839223179, + "learning_rate": 5.418710137060338e-08, + "loss": 2.0783, + "step": 2200 + }, + { + "epoch": 2.1153846153846154, + "eval_loss": 2.3356776237487793, + "eval_runtime": 85.3872, + "eval_samples_per_second": 86.605, + "eval_steps_per_second": 0.679, + "step": 2200 + }, + { + "epoch": 2.1201923076923075, + "grad_norm": 1.8572146395283573, + "learning_rate": 5.4071559032788445e-08, + "loss": 2.026, + "step": 2205 + }, + { + "epoch": 2.125, + "grad_norm": 1.8919061510828592, + "learning_rate": 5.395892252184894e-08, + "loss": 2.0538, + "step": 2210 + }, + { + "epoch": 2.1298076923076925, + "grad_norm": 1.9423965048231926, + "learning_rate": 5.384912589956864e-08, + "loss": 2.1354, + "step": 2215 + }, + { + "epoch": 2.1346153846153846, + "grad_norm": 1.86358642820622, + "learning_rate": 5.37421045611031e-08, + "loss": 2.0615, + "step": 2220 + }, + { + "epoch": 2.139423076923077, + "grad_norm": 1.9498064656844925, + "learning_rate": 5.363779521132732e-08, + "loss": 2.1152, + "step": 2225 + }, + { + "epoch": 2.144230769230769, + "grad_norm": 1.838720387490978, + "learning_rate": 5.353613584154386e-08, + "loss": 2.0802, + "step": 2230 + }, + { + "epoch": 2.1490384615384617, + "grad_norm": 1.8736999627632185, + "learning_rate": 5.3437065706546936e-08, + "loss": 2.0794, + "step": 2235 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 1.8185612650303689, + "learning_rate": 5.334052530203788e-08, + "loss": 2.0371, + "step": 2240 + }, + { + "epoch": 2.1586538461538463, + "grad_norm": 1.9598826857016363, + "learning_rate": 5.3246456342387584e-08, + "loss": 2.142, + "step": 2245 + }, + { + "epoch": 2.1634615384615383, + "grad_norm": 1.8852398707927738, + "learning_rate": 5.315480173874134e-08, + "loss": 2.0632, + "step": 2250 + }, + { + "epoch": 2.168269230769231, + "grad_norm": 1.8471328295295872, + "learning_rate": 5.306550557746175e-08, + "loss": 2.1116, + "step": 2255 + }, + { + "epoch": 2.173076923076923, + "grad_norm": 1.8068482718199097, + "learning_rate": 5.297851309890534e-08, + "loss": 2.0509, + "step": 2260 + }, + { + "epoch": 2.1778846153846154, + "grad_norm": 1.9264454870094807, + "learning_rate": 5.2893770676528514e-08, + "loss": 2.1262, + "step": 2265 + }, + { + "epoch": 2.1826923076923075, + "grad_norm": 1.8408137576329833, + "learning_rate": 5.281122579631865e-08, + "loss": 2.0472, + "step": 2270 + }, + { + "epoch": 2.1875, + "grad_norm": 1.821289584580464, + "learning_rate": 5.273082703654604e-08, + "loss": 2.1308, + "step": 2275 + }, + { + "epoch": 2.1923076923076925, + "grad_norm": 1.856905589818333, + "learning_rate": 5.265252404783256e-08, + "loss": 2.1068, + "step": 2280 + }, + { + "epoch": 2.1971153846153846, + "grad_norm": 1.8604589823269795, + "learning_rate": 5.257626753353287e-08, + "loss": 2.0947, + "step": 2285 + }, + { + "epoch": 2.201923076923077, + "grad_norm": 1.8525412113722146, + "learning_rate": 5.250200923042405e-08, + "loss": 2.104, + "step": 2290 + }, + { + "epoch": 2.206730769230769, + "grad_norm": 1.851550872426419, + "learning_rate": 5.242970188969973e-08, + "loss": 2.1139, + "step": 2295 + }, + { + "epoch": 2.2115384615384617, + "grad_norm": 1.8371736291077507, + "learning_rate": 5.2359299258264526e-08, + "loss": 2.1049, + "step": 2300 + }, + { + "epoch": 2.2163461538461537, + "grad_norm": 1.8854850811887058, + "learning_rate": 5.229075606032495e-08, + "loss": 2.0936, + "step": 2305 + }, + { + "epoch": 2.2211538461538463, + "grad_norm": 1.8111275047358883, + "learning_rate": 5.222402797927284e-08, + "loss": 2.0958, + "step": 2310 + }, + { + "epoch": 2.2259615384615383, + "grad_norm": 1.9091134111717707, + "learning_rate": 5.2159071639857394e-08, + "loss": 2.0999, + "step": 2315 + }, + { + "epoch": 2.230769230769231, + "grad_norm": 1.8879383298945882, + "learning_rate": 5.209584459064199e-08, + "loss": 2.1623, + "step": 2320 + }, + { + "epoch": 2.235576923076923, + "grad_norm": 37.03097635246021, + "learning_rate": 5.2034305286741963e-08, + "loss": 2.135, + "step": 2325 + }, + { + "epoch": 2.2403846153846154, + "grad_norm": 1.870738678414933, + "learning_rate": 5.197441307283966e-08, + "loss": 2.118, + "step": 2330 + }, + { + "epoch": 2.2451923076923075, + "grad_norm": 1.8528184603825324, + "learning_rate": 5.191612816647293e-08, + "loss": 2.1268, + "step": 2335 + }, + { + "epoch": 2.25, + "grad_norm": 1.9400695194615212, + "learning_rate": 5.185941164159351e-08, + "loss": 2.076, + "step": 2340 + }, + { + "epoch": 2.2548076923076925, + "grad_norm": 1.9062576912141294, + "learning_rate": 5.180422541239147e-08, + "loss": 2.1306, + "step": 2345 + }, + { + "epoch": 2.2596153846153846, + "grad_norm": 1.9730673873781654, + "learning_rate": 5.175053221738239e-08, + "loss": 2.104, + "step": 2350 + }, + { + "epoch": 2.264423076923077, + "grad_norm": 1.8371019460322038, + "learning_rate": 5.169829560375344e-08, + "loss": 2.0874, + "step": 2355 + }, + { + "epoch": 2.269230769230769, + "grad_norm": 1.874231056452069, + "learning_rate": 5.164747991196499e-08, + "loss": 2.0847, + "step": 2360 + }, + { + "epoch": 2.2740384615384617, + "grad_norm": 1.8794376823061034, + "learning_rate": 5.159805026060424e-08, + "loss": 2.0682, + "step": 2365 + }, + { + "epoch": 2.2788461538461537, + "grad_norm": 1.8255930007868693, + "learning_rate": 5.15499725314874e-08, + "loss": 2.0599, + "step": 2370 + }, + { + "epoch": 2.2836538461538463, + "grad_norm": 2.0171761498440333, + "learning_rate": 5.150321335500705e-08, + "loss": 2.0613, + "step": 2375 + }, + { + "epoch": 2.2884615384615383, + "grad_norm": 1.888512163517087, + "learning_rate": 5.145774009572124e-08, + "loss": 2.0746, + "step": 2380 + }, + { + "epoch": 2.293269230769231, + "grad_norm": 1.963864155096598, + "learning_rate": 5.141352083818108e-08, + "loss": 2.0992, + "step": 2385 + }, + { + "epoch": 2.298076923076923, + "grad_norm": 1.887413641506116, + "learning_rate": 5.1370524372993444e-08, + "loss": 2.0665, + "step": 2390 + }, + { + "epoch": 2.3028846153846154, + "grad_norm": 1.8425396594889334, + "learning_rate": 5.132872018311563e-08, + "loss": 2.0938, + "step": 2395 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 1.8343062688513765, + "learning_rate": 5.128807843037861e-08, + "loss": 2.0952, + "step": 2400 + }, + { + "epoch": 2.3076923076923075, + "eval_loss": 2.3359732627868652, + "eval_runtime": 85.421, + "eval_samples_per_second": 86.571, + "eval_steps_per_second": 0.679, + "step": 2400 + }, + { + "epoch": 2.3125, + "grad_norm": 1.8257992505700218, + "learning_rate": 5.1248569942235814e-08, + "loss": 2.0523, + "step": 2405 + }, + { + "epoch": 2.3173076923076925, + "grad_norm": 1.8895070139431327, + "learning_rate": 5.1210166198734225e-08, + "loss": 2.0834, + "step": 2410 + }, + { + "epoch": 2.3221153846153846, + "grad_norm": 1.9125461978695824, + "learning_rate": 5.117283931970468e-08, + "loss": 2.1017, + "step": 2415 + }, + { + "epoch": 2.326923076923077, + "grad_norm": 1.9275823669446988, + "learning_rate": 5.113656205216831e-08, + "loss": 2.1226, + "step": 2420 + }, + { + "epoch": 2.331730769230769, + "grad_norm": 1.889535416833256, + "learning_rate": 5.1101307757956035e-08, + "loss": 2.0764, + "step": 2425 + }, + { + "epoch": 2.3365384615384617, + "grad_norm": 1.8514556811164167, + "learning_rate": 5.106705040153818e-08, + "loss": 1.9975, + "step": 2430 + }, + { + "epoch": 2.3413461538461537, + "grad_norm": 1.958278628755969, + "learning_rate": 5.103376453806111e-08, + "loss": 2.1202, + "step": 2435 + }, + { + "epoch": 2.3461538461538463, + "grad_norm": 1.910793379676731, + "learning_rate": 5.100142530158806e-08, + "loss": 2.1254, + "step": 2440 + }, + { + "epoch": 2.3509615384615383, + "grad_norm": 2.2904582126799875, + "learning_rate": 5.0970008393541184e-08, + "loss": 2.0487, + "step": 2445 + }, + { + "epoch": 2.355769230769231, + "grad_norm": 1.928870195572868, + "learning_rate": 5.093949007134195e-08, + "loss": 2.0428, + "step": 2450 + }, + { + "epoch": 2.360576923076923, + "grad_norm": 1.9109302889112307, + "learning_rate": 5.090984713724707e-08, + "loss": 2.1073, + "step": 2455 + }, + { + "epoch": 2.3653846153846154, + "grad_norm": 1.8446780789197135, + "learning_rate": 5.0881056927377075e-08, + "loss": 2.1346, + "step": 2460 + }, + { + "epoch": 2.3701923076923075, + "grad_norm": 1.9119026418605038, + "learning_rate": 5.0853097300934865e-08, + "loss": 2.0757, + "step": 2465 + }, + { + "epoch": 2.375, + "grad_norm": 1.952480119894523, + "learning_rate": 5.082594662961142e-08, + "loss": 2.0955, + "step": 2470 + }, + { + "epoch": 2.3798076923076925, + "grad_norm": 1.9160233774476225, + "learning_rate": 5.0799583787175916e-08, + "loss": 2.094, + "step": 2475 + }, + { + "epoch": 2.3846153846153846, + "grad_norm": 1.8139526421054863, + "learning_rate": 5.07739881392477e-08, + "loss": 2.0905, + "step": 2480 + }, + { + "epoch": 2.389423076923077, + "grad_norm": 1.8207559563475217, + "learning_rate": 5.074913953324727e-08, + "loss": 2.0863, + "step": 2485 + }, + { + "epoch": 2.394230769230769, + "grad_norm": 1.8507805248963738, + "learning_rate": 5.0725018288523865e-08, + "loss": 2.0771, + "step": 2490 + }, + { + "epoch": 2.3990384615384617, + "grad_norm": 1.8116379225558112, + "learning_rate": 5.0701605186656875e-08, + "loss": 2.063, + "step": 2495 + }, + { + "epoch": 2.4038461538461537, + "grad_norm": 1.8790784349307603, + "learning_rate": 5.067888146192865e-08, + "loss": 2.0535, + "step": 2500 + }, + { + "epoch": 2.4086538461538463, + "grad_norm": 1.8572351494806207, + "learning_rate": 5.06568287919661e-08, + "loss": 2.0588, + "step": 2505 + }, + { + "epoch": 2.4134615384615383, + "grad_norm": 1.7890661820190739, + "learning_rate": 5.063542928854859e-08, + "loss": 2.0719, + "step": 2510 + }, + { + "epoch": 2.418269230769231, + "grad_norm": 1.780938750209951, + "learning_rate": 5.061466548857974e-08, + "loss": 2.1399, + "step": 2515 + }, + { + "epoch": 2.423076923076923, + "grad_norm": 1.864652061283046, + "learning_rate": 5.059452034522056e-08, + "loss": 2.0946, + "step": 2520 + }, + { + "epoch": 2.4278846153846154, + "grad_norm": 1.8661367735575938, + "learning_rate": 5.057497721918164e-08, + "loss": 2.0811, + "step": 2525 + }, + { + "epoch": 2.4326923076923075, + "grad_norm": 1.7957946183317377, + "learning_rate": 5.055601987017185e-08, + "loss": 2.0997, + "step": 2530 + }, + { + "epoch": 2.4375, + "grad_norm": 1.8001974731925174, + "learning_rate": 5.053763244850147e-08, + "loss": 2.1219, + "step": 2535 + }, + { + "epoch": 2.4423076923076925, + "grad_norm": 1.8983691367559397, + "learning_rate": 5.0519799486837034e-08, + "loss": 2.1097, + "step": 2540 + }, + { + "epoch": 2.4471153846153846, + "grad_norm": 1.905238107904784, + "learning_rate": 5.050250589210597e-08, + "loss": 2.0688, + "step": 2545 + }, + { + "epoch": 2.451923076923077, + "grad_norm": 1.825345955550652, + "learning_rate": 5.048573693754852e-08, + "loss": 2.0937, + "step": 2550 + }, + { + "epoch": 2.456730769230769, + "grad_norm": 1.855436622240645, + "learning_rate": 5.0469478254914804e-08, + "loss": 2.1167, + "step": 2555 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 1.8976603753246268, + "learning_rate": 5.04537158268048e-08, + "loss": 2.0693, + "step": 2560 + }, + { + "epoch": 2.4663461538461537, + "grad_norm": 1.9048063196287657, + "learning_rate": 5.043843597914902e-08, + "loss": 2.0695, + "step": 2565 + }, + { + "epoch": 2.4711538461538463, + "grad_norm": 1.8780277621645116, + "learning_rate": 5.042362537382771e-08, + "loss": 2.0692, + "step": 2570 + }, + { + "epoch": 2.4759615384615383, + "grad_norm": 1.7927549821388442, + "learning_rate": 5.040927100142658e-08, + "loss": 2.0756, + "step": 2575 + }, + { + "epoch": 2.480769230769231, + "grad_norm": 1.9065399802572085, + "learning_rate": 5.03953601741267e-08, + "loss": 2.0273, + "step": 2580 + }, + { + "epoch": 2.485576923076923, + "grad_norm": 1.8711481004226065, + "learning_rate": 5.0381880518726784e-08, + "loss": 2.1434, + "step": 2585 + }, + { + "epoch": 2.4903846153846154, + "grad_norm": 1.8706391357800631, + "learning_rate": 5.03688199697955e-08, + "loss": 2.1032, + "step": 2590 + }, + { + "epoch": 2.4951923076923075, + "grad_norm": 1.9079920113146567, + "learning_rate": 5.0356166762952054e-08, + "loss": 2.0575, + "step": 2595 + }, + { + "epoch": 2.5, + "grad_norm": 1.8325624675703904, + "learning_rate": 5.0343909428272807e-08, + "loss": 2.1009, + "step": 2600 + }, + { + "epoch": 2.5, + "eval_loss": 2.3360962867736816, + "eval_runtime": 85.4584, + "eval_samples_per_second": 86.533, + "eval_steps_per_second": 0.679, + "step": 2600 + }, + { + "epoch": 2.5048076923076925, + "grad_norm": 1.9117983598651567, + "learning_rate": 5.033203678382215e-08, + "loss": 2.1034, + "step": 2605 + }, + { + "epoch": 2.5096153846153846, + "grad_norm": 1.8482924541401045, + "learning_rate": 5.032053792930553e-08, + "loss": 2.0938, + "step": 2610 + }, + { + "epoch": 2.5144230769230766, + "grad_norm": 1.8309284870035238, + "learning_rate": 5.030940223984276e-08, + "loss": 2.0545, + "step": 2615 + }, + { + "epoch": 2.519230769230769, + "grad_norm": 1.887238798925063, + "learning_rate": 5.0298619359859705e-08, + "loss": 2.0947, + "step": 2620 + }, + { + "epoch": 2.5240384615384617, + "grad_norm": 1.8229917506754332, + "learning_rate": 5.0288179197096475e-08, + "loss": 2.1367, + "step": 2625 + }, + { + "epoch": 2.5288461538461537, + "grad_norm": 1.8745480293774028, + "learning_rate": 5.027807191673022e-08, + "loss": 2.1263, + "step": 2630 + }, + { + "epoch": 2.5336538461538463, + "grad_norm": 1.8565511172706295, + "learning_rate": 5.026828793561077e-08, + "loss": 2.069, + "step": 2635 + }, + { + "epoch": 2.5384615384615383, + "grad_norm": 1.8435366151404853, + "learning_rate": 5.0258817916607186e-08, + "loss": 2.0715, + "step": 2640 + }, + { + "epoch": 2.543269230769231, + "grad_norm": 1.82801282007265, + "learning_rate": 5.024965276306364e-08, + "loss": 2.1124, + "step": 2645 + }, + { + "epoch": 2.5480769230769234, + "grad_norm": 1.871706442781542, + "learning_rate": 5.02407836133626e-08, + "loss": 2.0849, + "step": 2650 + }, + { + "epoch": 2.5528846153846154, + "grad_norm": 1.8633902158148148, + "learning_rate": 5.02322018355938e-08, + "loss": 2.0835, + "step": 2655 + }, + { + "epoch": 2.5576923076923075, + "grad_norm": 1.8664407309122704, + "learning_rate": 5.022389902232716e-08, + "loss": 2.058, + "step": 2660 + }, + { + "epoch": 2.5625, + "grad_norm": 1.8241814220396138, + "learning_rate": 5.0215866985488015e-08, + "loss": 2.1001, + "step": 2665 + }, + { + "epoch": 2.5673076923076925, + "grad_norm": 1.8728742912893366, + "learning_rate": 5.020809775133292e-08, + "loss": 2.0782, + "step": 2670 + }, + { + "epoch": 2.5721153846153846, + "grad_norm": 1.836951128615928, + "learning_rate": 5.020058355552443e-08, + "loss": 2.032, + "step": 2675 + }, + { + "epoch": 2.5769230769230766, + "grad_norm": 1.8159474479645261, + "learning_rate": 5.019331683830326e-08, + "loss": 2.0842, + "step": 2680 + }, + { + "epoch": 2.581730769230769, + "grad_norm": 1.8210257982061508, + "learning_rate": 5.018629023975606e-08, + "loss": 2.1517, + "step": 2685 + }, + { + "epoch": 2.5865384615384617, + "grad_norm": 1.8501212045264834, + "learning_rate": 5.0179496595177436e-08, + "loss": 2.0773, + "step": 2690 + }, + { + "epoch": 2.5913461538461537, + "grad_norm": 1.882222780292571, + "learning_rate": 5.017292893052448e-08, + "loss": 2.0555, + "step": 2695 + }, + { + "epoch": 2.5961538461538463, + "grad_norm": 1.843070652377049, + "learning_rate": 5.0166580457962346e-08, + "loss": 2.0461, + "step": 2700 + }, + { + "epoch": 2.6009615384615383, + "grad_norm": 1.847536413092705, + "learning_rate": 5.0160444571499293e-08, + "loss": 2.1485, + "step": 2705 + }, + { + "epoch": 2.605769230769231, + "grad_norm": 1.8266553603942388, + "learning_rate": 5.0154514842709816e-08, + "loss": 2.0737, + "step": 2710 + }, + { + "epoch": 2.6105769230769234, + "grad_norm": 1.9237223597123432, + "learning_rate": 5.014878501654416e-08, + "loss": 2.0757, + "step": 2715 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 1.8948119829446708, + "learning_rate": 5.0143249007222985e-08, + "loss": 2.1339, + "step": 2720 + }, + { + "epoch": 2.6201923076923075, + "grad_norm": 1.8301707716670057, + "learning_rate": 5.013790089421563e-08, + "loss": 2.0548, + "step": 2725 + }, + { + "epoch": 2.625, + "grad_norm": 1.8663429882080074, + "learning_rate": 5.0132734918300504e-08, + "loss": 2.1375, + "step": 2730 + }, + { + "epoch": 2.6298076923076925, + "grad_norm": 1.942647379328917, + "learning_rate": 5.012774547770629e-08, + "loss": 2.1396, + "step": 2735 + }, + { + "epoch": 2.6346153846153846, + "grad_norm": 1.8441092861484971, + "learning_rate": 5.012292712433258e-08, + "loss": 2.0696, + "step": 2740 + }, + { + "epoch": 2.6394230769230766, + "grad_norm": 1.9320657665881027, + "learning_rate": 5.011827456004847e-08, + "loss": 2.1119, + "step": 2745 + }, + { + "epoch": 2.644230769230769, + "grad_norm": 1.8427805768866328, + "learning_rate": 5.0113782633067863e-08, + "loss": 2.084, + "step": 2750 + }, + { + "epoch": 2.6490384615384617, + "grad_norm": 1.8440694033677212, + "learning_rate": 5.0109446334400176e-08, + "loss": 2.0882, + "step": 2755 + }, + { + "epoch": 2.6538461538461537, + "grad_norm": 1.893152979504229, + "learning_rate": 5.010526079437498e-08, + "loss": 2.1043, + "step": 2760 + }, + { + "epoch": 2.6586538461538463, + "grad_norm": 1.9949218255548784, + "learning_rate": 5.010122127923951e-08, + "loss": 2.1103, + "step": 2765 + }, + { + "epoch": 2.6634615384615383, + "grad_norm": 1.8456542683339325, + "learning_rate": 5.0097323187827586e-08, + "loss": 2.0738, + "step": 2770 + }, + { + "epoch": 2.668269230769231, + "grad_norm": 1.8984568625826008, + "learning_rate": 5.009356204829874e-08, + "loss": 2.0612, + "step": 2775 + }, + { + "epoch": 2.6730769230769234, + "grad_norm": 1.8703440919228778, + "learning_rate": 5.008993351494639e-08, + "loss": 2.1919, + "step": 2780 + }, + { + "epoch": 2.6778846153846154, + "grad_norm": 1.9243113440055457, + "learning_rate": 5.008643336507372e-08, + "loss": 2.0829, + "step": 2785 + }, + { + "epoch": 2.6826923076923075, + "grad_norm": 1.834031155910534, + "learning_rate": 5.0083057495936144e-08, + "loss": 2.0647, + "step": 2790 + }, + { + "epoch": 2.6875, + "grad_norm": 2.0300087855547897, + "learning_rate": 5.0079801921749176e-08, + "loss": 2.0993, + "step": 2795 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 1.8096967426995145, + "learning_rate": 5.007666277076042e-08, + "loss": 2.125, + "step": 2800 + }, + { + "epoch": 2.6923076923076925, + "eval_loss": 2.3360354900360107, + "eval_runtime": 85.4625, + "eval_samples_per_second": 86.529, + "eval_steps_per_second": 0.679, + "step": 2800 + }, + { + "epoch": 2.6971153846153846, + "grad_norm": 1.863239316605401, + "learning_rate": 5.0073636282384696e-08, + "loss": 2.1135, + "step": 2805 + }, + { + "epoch": 2.7019230769230766, + "grad_norm": 1.9593347265344716, + "learning_rate": 5.007071880440107e-08, + "loss": 2.087, + "step": 2810 + }, + { + "epoch": 2.706730769230769, + "grad_norm": 1.8698219251596924, + "learning_rate": 5.006790679021062e-08, + "loss": 2.1106, + "step": 2815 + }, + { + "epoch": 2.7115384615384617, + "grad_norm": 1.9096265265503567, + "learning_rate": 5.006519679615399e-08, + "loss": 2.1065, + "step": 2820 + }, + { + "epoch": 2.7163461538461537, + "grad_norm": 1.8385721642634492, + "learning_rate": 5.0062585478887454e-08, + "loss": 2.1307, + "step": 2825 + }, + { + "epoch": 2.7211538461538463, + "grad_norm": 2.045452351348729, + "learning_rate": 5.006006959281663e-08, + "loss": 2.0573, + "step": 2830 + }, + { + "epoch": 2.7259615384615383, + "grad_norm": 1.8727571024658705, + "learning_rate": 5.005764598758657e-08, + "loss": 2.1193, + "step": 2835 + }, + { + "epoch": 2.730769230769231, + "grad_norm": 1.9077767348853074, + "learning_rate": 5.005531160562734e-08, + "loss": 2.1097, + "step": 2840 + }, + { + "epoch": 2.7355769230769234, + "grad_norm": 1.8266187984214344, + "learning_rate": 5.005306347975403e-08, + "loss": 2.0879, + "step": 2845 + }, + { + "epoch": 2.7403846153846154, + "grad_norm": 1.9460294408394188, + "learning_rate": 5.0050898730820176e-08, + "loss": 2.0667, + "step": 2850 + }, + { + "epoch": 2.7451923076923075, + "grad_norm": 1.8751685321455078, + "learning_rate": 5.0048814565423524e-08, + "loss": 2.1122, + "step": 2855 + }, + { + "epoch": 2.75, + "grad_norm": 1.8138239598798986, + "learning_rate": 5.004680827366333e-08, + "loss": 2.0571, + "step": 2860 + }, + { + "epoch": 2.7548076923076925, + "grad_norm": 1.9103749761871995, + "learning_rate": 5.0044877226948085e-08, + "loss": 2.0773, + "step": 2865 + }, + { + "epoch": 2.7596153846153846, + "grad_norm": 1.8517186742525418, + "learning_rate": 5.004301887585273e-08, + "loss": 2.0633, + "step": 2870 + }, + { + "epoch": 2.7644230769230766, + "grad_norm": 1.8277041575262993, + "learning_rate": 5.0041230748024515e-08, + "loss": 2.0995, + "step": 2875 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 1.8783284685972508, + "learning_rate": 5.0039510446136475e-08, + "loss": 2.0799, + "step": 2880 + }, + { + "epoch": 2.7740384615384617, + "grad_norm": 1.8214139607696012, + "learning_rate": 5.00378556458877e-08, + "loss": 2.1185, + "step": 2885 + }, + { + "epoch": 2.7788461538461537, + "grad_norm": 1.754546607125489, + "learning_rate": 5.0036264094049414e-08, + "loss": 2.1165, + "step": 2890 + }, + { + "epoch": 2.7836538461538463, + "grad_norm": 1.8605888233369712, + "learning_rate": 5.0034733606556126e-08, + "loss": 2.0909, + "step": 2895 + }, + { + "epoch": 2.7884615384615383, + "grad_norm": 1.903011452864366, + "learning_rate": 5.003326206664078e-08, + "loss": 2.0946, + "step": 2900 + }, + { + "epoch": 2.793269230769231, + "grad_norm": 1.7737987493209635, + "learning_rate": 5.003184742301327e-08, + "loss": 2.108, + "step": 2905 + }, + { + "epoch": 2.7980769230769234, + "grad_norm": 1.8885111840024975, + "learning_rate": 5.0030487688081324e-08, + "loss": 2.0753, + "step": 2910 + }, + { + "epoch": 2.8028846153846154, + "grad_norm": 1.8832929741438638, + "learning_rate": 5.002918093621301e-08, + "loss": 2.0825, + "step": 2915 + }, + { + "epoch": 2.8076923076923075, + "grad_norm": 1.8972739478097906, + "learning_rate": 5.0027925302039994e-08, + "loss": 2.1004, + "step": 2920 + }, + { + "epoch": 2.8125, + "grad_norm": 1.8077990099256764, + "learning_rate": 5.002671897880082e-08, + "loss": 2.0858, + "step": 2925 + }, + { + "epoch": 2.8173076923076925, + "grad_norm": 1.8611265826571517, + "learning_rate": 5.002556021672335e-08, + "loss": 2.0735, + "step": 2930 + }, + { + "epoch": 2.8221153846153846, + "grad_norm": 1.9313284111744764, + "learning_rate": 5.002444732144568e-08, + "loss": 2.1131, + "step": 2935 + }, + { + "epoch": 2.8269230769230766, + "grad_norm": 1.8676490764521987, + "learning_rate": 5.00233786524746e-08, + "loss": 2.1365, + "step": 2940 + }, + { + "epoch": 2.831730769230769, + "grad_norm": 1.8494289564318631, + "learning_rate": 5.002235262168107e-08, + "loss": 2.1757, + "step": 2945 + }, + { + "epoch": 2.8365384615384617, + "grad_norm": 1.85497440355638, + "learning_rate": 5.0021367691831825e-08, + "loss": 2.1242, + "step": 2950 + }, + { + "epoch": 2.8413461538461537, + "grad_norm": 1.8486274892842425, + "learning_rate": 5.002042237515639e-08, + "loss": 2.1245, + "step": 2955 + }, + { + "epoch": 2.8461538461538463, + "grad_norm": 1.895043426117041, + "learning_rate": 5.001951523194882e-08, + "loss": 2.0803, + "step": 2960 + }, + { + "epoch": 2.8509615384615383, + "grad_norm": 1.874846017392855, + "learning_rate": 5.001864486920352e-08, + "loss": 2.1229, + "step": 2965 + }, + { + "epoch": 2.855769230769231, + "grad_norm": 1.8257810113586723, + "learning_rate": 5.001780993928431e-08, + "loss": 2.0623, + "step": 2970 + }, + { + "epoch": 2.8605769230769234, + "grad_norm": 2.0410507440850743, + "learning_rate": 5.0017009138626176e-08, + "loss": 2.1375, + "step": 2975 + }, + { + "epoch": 2.8653846153846154, + "grad_norm": 1.8536732613204967, + "learning_rate": 5.001624120646899e-08, + "loss": 2.1198, + "step": 2980 + }, + { + "epoch": 2.8701923076923075, + "grad_norm": 1.8420057076108896, + "learning_rate": 5.0015504923622523e-08, + "loss": 2.0588, + "step": 2985 + }, + { + "epoch": 2.875, + "grad_norm": 2.06664054369849, + "learning_rate": 5.0014799111262185e-08, + "loss": 2.065, + "step": 2990 + }, + { + "epoch": 2.8798076923076925, + "grad_norm": 1.8942959478783434, + "learning_rate": 5.001412262975472e-08, + "loss": 2.0928, + "step": 2995 + }, + { + "epoch": 2.8846153846153846, + "grad_norm": 1.9095141517679362, + "learning_rate": 5.0013474377513345e-08, + "loss": 2.1206, + "step": 3000 + }, + { + "epoch": 2.8846153846153846, + "eval_loss": 2.335968494415283, + "eval_runtime": 85.3698, + "eval_samples_per_second": 86.623, + "eval_steps_per_second": 0.679, + "step": 3000 + }, + { + "epoch": 2.8894230769230766, + "grad_norm": 1.8262058020984504, + "learning_rate": 5.001285328988167e-08, + "loss": 2.095, + "step": 3005 + }, + { + "epoch": 2.894230769230769, + "grad_norm": 1.8525491687163678, + "learning_rate": 5.0012258338045814e-08, + "loss": 2.0854, + "step": 3010 + }, + { + "epoch": 2.8990384615384617, + "grad_norm": 1.876102814594601, + "learning_rate": 5.001168852797407e-08, + "loss": 2.0836, + "step": 3015 + }, + { + "epoch": 2.9038461538461537, + "grad_norm": 1.8864256560953125, + "learning_rate": 5.0011142899383596e-08, + "loss": 2.1177, + "step": 3020 + }, + { + "epoch": 2.9086538461538463, + "grad_norm": 1.8543259178498985, + "learning_rate": 5.001062052473354e-08, + "loss": 2.0708, + "step": 3025 + }, + { + "epoch": 2.9134615384615383, + "grad_norm": 1.8468081058935386, + "learning_rate": 5.0010120508243996e-08, + "loss": 2.0649, + "step": 3030 + }, + { + "epoch": 2.918269230769231, + "grad_norm": 1.870394880857915, + "learning_rate": 5.000964198494029e-08, + "loss": 2.0948, + "step": 3035 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 1.8291813927626337, + "learning_rate": 5.000918411972201e-08, + "loss": 2.0571, + "step": 3040 + }, + { + "epoch": 2.9278846153846154, + "grad_norm": 1.8345615836931617, + "learning_rate": 5.000874610645626e-08, + "loss": 2.0843, + "step": 3045 + }, + { + "epoch": 2.9326923076923075, + "grad_norm": 1.784288247829563, + "learning_rate": 5.000832716709459e-08, + "loss": 2.088, + "step": 3050 + }, + { + "epoch": 2.9375, + "grad_norm": 1.8828904166386582, + "learning_rate": 5.000792655081313e-08, + "loss": 2.1294, + "step": 3055 + }, + { + "epoch": 2.9423076923076925, + "grad_norm": 1.876834782651868, + "learning_rate": 5.00075435331754e-08, + "loss": 2.0835, + "step": 3060 + }, + { + "epoch": 2.9471153846153846, + "grad_norm": 1.7891832679275306, + "learning_rate": 5.000717741531722e-08, + "loss": 2.0758, + "step": 3065 + }, + { + "epoch": 2.9519230769230766, + "grad_norm": 1.9834817400632345, + "learning_rate": 5.000682752315336e-08, + "loss": 2.1172, + "step": 3070 + }, + { + "epoch": 2.956730769230769, + "grad_norm": 2.01686543949811, + "learning_rate": 5.000649320660537e-08, + "loss": 2.129, + "step": 3075 + }, + { + "epoch": 2.9615384615384617, + "grad_norm": 1.882159640395084, + "learning_rate": 5.0006173838850096e-08, + "loss": 2.0194, + "step": 3080 + }, + { + "epoch": 2.9663461538461537, + "grad_norm": 1.8632173120315059, + "learning_rate": 5.0005868815588486e-08, + "loss": 2.0399, + "step": 3085 + }, + { + "epoch": 2.9711538461538463, + "grad_norm": 1.899662124124679, + "learning_rate": 5.000557755433416e-08, + "loss": 2.0669, + "step": 3090 + }, + { + "epoch": 2.9759615384615383, + "grad_norm": 1.9288229898878364, + "learning_rate": 5.0005299493721366e-08, + "loss": 2.0695, + "step": 3095 + }, + { + "epoch": 2.980769230769231, + "grad_norm": 1.9430306138069855, + "learning_rate": 5.000503409283182e-08, + "loss": 2.0771, + "step": 3100 + }, + { + "epoch": 2.9855769230769234, + "grad_norm": 1.8642254344339084, + "learning_rate": 5.0004780830540004e-08, + "loss": 2.067, + "step": 3105 + }, + { + "epoch": 2.9903846153846154, + "grad_norm": 1.843625830841223, + "learning_rate": 5.0004539204876536e-08, + "loss": 2.0557, + "step": 3110 + }, + { + "epoch": 2.9951923076923075, + "grad_norm": 1.905040671688552, + "learning_rate": 5.000430873240919e-08, + "loss": 2.1085, + "step": 3115 + }, + { + "epoch": 3.0, + "grad_norm": 1.9724597892841456, + "learning_rate": 5.000408894764108e-08, + "loss": 2.1109, + "step": 3120 + }, + { + "epoch": 3.0048076923076925, + "grad_norm": 1.930998832905121, + "learning_rate": 5.0003879402425764e-08, + "loss": 2.1045, + "step": 3125 + }, + { + "epoch": 3.0096153846153846, + "grad_norm": 1.906832567119333, + "learning_rate": 5.0003679665398665e-08, + "loss": 2.0992, + "step": 3130 + }, + { + "epoch": 3.014423076923077, + "grad_norm": 1.880028734755099, + "learning_rate": 5.000348932142462e-08, + "loss": 2.0536, + "step": 3135 + }, + { + "epoch": 3.019230769230769, + "grad_norm": 1.8234161328010858, + "learning_rate": 5.000330797106105e-08, + "loss": 2.0425, + "step": 3140 + }, + { + "epoch": 3.0240384615384617, + "grad_norm": 1.9060969026597896, + "learning_rate": 5.000313523003646e-08, + "loss": 2.0724, + "step": 3145 + }, + { + "epoch": 3.0288461538461537, + "grad_norm": 1.9314817600599008, + "learning_rate": 5.000297072874381e-08, + "loss": 2.0856, + "step": 3150 + }, + { + "epoch": 3.0336538461538463, + "grad_norm": 2.205865819233671, + "learning_rate": 5.0002814111748496e-08, + "loss": 2.0542, + "step": 3155 + }, + { + "epoch": 3.0384615384615383, + "grad_norm": 1.9034298586292828, + "learning_rate": 5.000266503731057e-08, + "loss": 2.1181, + "step": 3160 + }, + { + "epoch": 3.043269230769231, + "grad_norm": 1.9630469467362441, + "learning_rate": 5.0002523176920756e-08, + "loss": 2.0769, + "step": 3165 + }, + { + "epoch": 3.048076923076923, + "grad_norm": 1.8387471826204973, + "learning_rate": 5.0002388214850104e-08, + "loss": 2.0357, + "step": 3170 + }, + { + "epoch": 3.0528846153846154, + "grad_norm": 1.8580705264609298, + "learning_rate": 5.000225984771277e-08, + "loss": 2.1436, + "step": 3175 + }, + { + "epoch": 3.0576923076923075, + "grad_norm": 1.8937514188796711, + "learning_rate": 5.0002137784041715e-08, + "loss": 2.0621, + "step": 3180 + }, + { + "epoch": 3.0625, + "grad_norm": 1.8887722007611465, + "learning_rate": 5.0002021743876964e-08, + "loss": 2.1001, + "step": 3185 + }, + { + "epoch": 3.0673076923076925, + "grad_norm": 2.058985773940214, + "learning_rate": 5.0001911458366104e-08, + "loss": 2.0544, + "step": 3190 + }, + { + "epoch": 3.0721153846153846, + "grad_norm": 1.8613730424507313, + "learning_rate": 5.000180666937676e-08, + "loss": 2.0672, + "step": 3195 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 1.883209445623825, + "learning_rate": 5.0001707129120686e-08, + "loss": 2.0593, + "step": 3200 + }, + { + "epoch": 3.076923076923077, + "eval_loss": 2.336284875869751, + "eval_runtime": 85.4905, + "eval_samples_per_second": 86.501, + "eval_steps_per_second": 0.678, + "step": 3200 + }, + { + "epoch": 3.081730769230769, + "grad_norm": 1.800038407134164, + "learning_rate": 5.000161259978923e-08, + "loss": 2.1135, + "step": 3205 + }, + { + "epoch": 3.0865384615384617, + "grad_norm": 1.9214263061349197, + "learning_rate": 5.0001522853199856e-08, + "loss": 2.0604, + "step": 3210 + }, + { + "epoch": 3.0913461538461537, + "grad_norm": 1.7946344678576902, + "learning_rate": 5.000143767045347e-08, + "loss": 2.0379, + "step": 3215 + }, + { + "epoch": 3.0961538461538463, + "grad_norm": 1.9345308159393109, + "learning_rate": 5.000135684160221e-08, + "loss": 2.1086, + "step": 3220 + }, + { + "epoch": 3.1009615384615383, + "grad_norm": 1.9155941341236926, + "learning_rate": 5.000128016532757e-08, + "loss": 2.1086, + "step": 3225 + }, + { + "epoch": 3.105769230769231, + "grad_norm": 1.8746401629643195, + "learning_rate": 5.000120744862838e-08, + "loss": 2.085, + "step": 3230 + }, + { + "epoch": 3.110576923076923, + "grad_norm": 1.9247774915660303, + "learning_rate": 5.00011385065186e-08, + "loss": 2.1239, + "step": 3235 + }, + { + "epoch": 3.1153846153846154, + "grad_norm": 1.8464578404726741, + "learning_rate": 5.0001073161734515e-08, + "loss": 2.1166, + "step": 3240 + }, + { + "epoch": 3.1201923076923075, + "grad_norm": 1.891327266772356, + "learning_rate": 5.000101124445121e-08, + "loss": 2.0818, + "step": 3245 + }, + { + "epoch": 3.125, + "grad_norm": 1.859457845102101, + "learning_rate": 5.0000952592007933e-08, + "loss": 2.043, + "step": 3250 + }, + { + "epoch": 3.1298076923076925, + "grad_norm": 1.8626819779803672, + "learning_rate": 5.0000897048642266e-08, + "loss": 2.1099, + "step": 3255 + }, + { + "epoch": 3.1346153846153846, + "grad_norm": 1.848088739569789, + "learning_rate": 5.000084446523276e-08, + "loss": 2.0433, + "step": 3260 + }, + { + "epoch": 3.139423076923077, + "grad_norm": 1.8088561980329354, + "learning_rate": 5.0000794699049865e-08, + "loss": 2.0828, + "step": 3265 + }, + { + "epoch": 3.144230769230769, + "grad_norm": 1.8338377212136632, + "learning_rate": 5.000074761351487e-08, + "loss": 2.0958, + "step": 3270 + }, + { + "epoch": 3.1490384615384617, + "grad_norm": 1.9050955056716428, + "learning_rate": 5.000070307796674e-08, + "loss": 2.1296, + "step": 3275 + }, + { + "epoch": 3.1538461538461537, + "grad_norm": 1.9053203587270828, + "learning_rate": 5.0000660967436526e-08, + "loss": 2.127, + "step": 3280 + }, + { + "epoch": 3.1586538461538463, + "grad_norm": 1.878537794460004, + "learning_rate": 5.000062116242918e-08, + "loss": 2.1055, + "step": 3285 + }, + { + "epoch": 3.1634615384615383, + "grad_norm": 1.8810850477235284, + "learning_rate": 5.000058354871263e-08, + "loss": 2.087, + "step": 3290 + }, + { + "epoch": 3.168269230769231, + "grad_norm": 1.8129515946311003, + "learning_rate": 5.000054801711379e-08, + "loss": 2.0779, + "step": 3295 + }, + { + "epoch": 3.173076923076923, + "grad_norm": 2.0073035626574915, + "learning_rate": 5.0000514463321446e-08, + "loss": 2.1102, + "step": 3300 + }, + { + "epoch": 3.1778846153846154, + "grad_norm": 1.904610541350343, + "learning_rate": 5.000048278769574e-08, + "loss": 2.0952, + "step": 3305 + }, + { + "epoch": 3.1826923076923075, + "grad_norm": 1.808902174339809, + "learning_rate": 5.000045289508406e-08, + "loss": 2.0609, + "step": 3310 + }, + { + "epoch": 3.1875, + "grad_norm": 1.8554788011848724, + "learning_rate": 5.000042469464323e-08, + "loss": 2.0534, + "step": 3315 + }, + { + "epoch": 3.1923076923076925, + "grad_norm": 1.9599174090809928, + "learning_rate": 5.000039809966777e-08, + "loss": 2.0668, + "step": 3320 + }, + { + "epoch": 3.1971153846153846, + "grad_norm": 1.8859333707205377, + "learning_rate": 5.000037302742402e-08, + "loss": 2.073, + "step": 3325 + }, + { + "epoch": 3.201923076923077, + "grad_norm": 1.8053367407893148, + "learning_rate": 5.000034939899001e-08, + "loss": 2.058, + "step": 3330 + }, + { + "epoch": 3.206730769230769, + "grad_norm": 1.9093669207818855, + "learning_rate": 5.000032713910095e-08, + "loss": 2.0711, + "step": 3335 + }, + { + "epoch": 3.2115384615384617, + "grad_norm": 1.8573175727984386, + "learning_rate": 5.0000306175999996e-08, + "loss": 2.1104, + "step": 3340 + }, + { + "epoch": 3.2163461538461537, + "grad_norm": 1.818915273922553, + "learning_rate": 5.000028644129445e-08, + "loss": 2.0857, + "step": 3345 + }, + { + "epoch": 3.2211538461538463, + "grad_norm": 1.8159720078784984, + "learning_rate": 5.000026786981683e-08, + "loss": 2.0886, + "step": 3350 + }, + { + "epoch": 3.2259615384615383, + "grad_norm": 1.8959271365869055, + "learning_rate": 5.000025380834318e-08, + "loss": 2.1141, + "step": 3355 + }, + { + "epoch": 3.230769230769231, + "grad_norm": 1.8963113166938355, + "learning_rate": 5.000023717623903e-08, + "loss": 2.1259, + "step": 3360 + }, + { + "epoch": 3.235576923076923, + "grad_norm": 1.9029307905210568, + "learning_rate": 5.0000221540931055e-08, + "loss": 2.0854, + "step": 3365 + }, + { + "epoch": 3.2403846153846154, + "grad_norm": 1.838526466646601, + "learning_rate": 5.0000206848327065e-08, + "loss": 2.0741, + "step": 3370 + }, + { + "epoch": 3.2451923076923075, + "grad_norm": 1.8859567421929686, + "learning_rate": 5.000019304696002e-08, + "loss": 2.0582, + "step": 3375 + }, + { + "epoch": 3.25, + "grad_norm": 1.9217466457908856, + "learning_rate": 5.000018008787587e-08, + "loss": 2.0699, + "step": 3380 + }, + { + "epoch": 3.2548076923076925, + "grad_norm": 1.9074470673862487, + "learning_rate": 5.0000167924525525e-08, + "loss": 2.032, + "step": 3385 + }, + { + "epoch": 3.2596153846153846, + "grad_norm": 1.8425868401366883, + "learning_rate": 5.000015651266079e-08, + "loss": 2.1211, + "step": 3390 + }, + { + "epoch": 3.264423076923077, + "grad_norm": 1.8269121873511085, + "learning_rate": 5.00001458102343e-08, + "loss": 2.1272, + "step": 3395 + }, + { + "epoch": 3.269230769230769, + "grad_norm": 1.9274516851712518, + "learning_rate": 5.000013577730309e-08, + "loss": 2.0927, + "step": 3400 + }, + { + "epoch": 3.269230769230769, + "eval_loss": 2.3365249633789062, + "eval_runtime": 85.4018, + "eval_samples_per_second": 86.591, + "eval_steps_per_second": 0.679, + "step": 3400 + }, + { + "epoch": 3.2740384615384617, + "grad_norm": 1.889849662397209, + "learning_rate": 5.000012637593584e-08, + "loss": 2.0617, + "step": 3405 + }, + { + "epoch": 3.2788461538461537, + "grad_norm": 1.9502873503727838, + "learning_rate": 5.000011757012371e-08, + "loss": 2.1223, + "step": 3410 + }, + { + "epoch": 3.2836538461538463, + "grad_norm": 1.9403389617445832, + "learning_rate": 5.0000109325694494e-08, + "loss": 2.0963, + "step": 3415 + }, + { + "epoch": 3.2884615384615383, + "grad_norm": 1.9220338068487544, + "learning_rate": 5.0000101610230143e-08, + "loss": 2.0916, + "step": 3420 + }, + { + "epoch": 3.293269230769231, + "grad_norm": 1.9375048503232193, + "learning_rate": 5.000009439298745e-08, + "loss": 2.0717, + "step": 3425 + }, + { + "epoch": 3.298076923076923, + "grad_norm": 1.8438418543194979, + "learning_rate": 5.000008895827592e-08, + "loss": 2.1255, + "step": 3430 + }, + { + "epoch": 3.3028846153846154, + "grad_norm": 1.8629567514533452, + "learning_rate": 5.00000825654154e-08, + "loss": 2.0806, + "step": 3435 + }, + { + "epoch": 3.3076923076923075, + "grad_norm": 1.9106656016326038, + "learning_rate": 5.000007659296849e-08, + "loss": 2.1158, + "step": 3440 + }, + { + "epoch": 3.3125, + "grad_norm": 1.9013483711226824, + "learning_rate": 5.000007101588647e-08, + "loss": 2.1251, + "step": 3445 + }, + { + "epoch": 3.3173076923076925, + "grad_norm": 1.918508888857165, + "learning_rate": 5.0000065810456154e-08, + "loss": 2.0693, + "step": 3450 + }, + { + "epoch": 3.3221153846153846, + "grad_norm": 1.8062766125316954, + "learning_rate": 5.0000060954237113e-08, + "loss": 2.1227, + "step": 3455 + }, + { + "epoch": 3.326923076923077, + "grad_norm": 1.863020981136348, + "learning_rate": 5.000005642600152e-08, + "loss": 2.1291, + "step": 3460 + }, + { + "epoch": 3.331730769230769, + "grad_norm": 1.814260156227495, + "learning_rate": 5.000005220567642e-08, + "loss": 2.0376, + "step": 3465 + }, + { + "epoch": 3.3365384615384617, + "grad_norm": 1.860164501188251, + "learning_rate": 5.000004827428838e-08, + "loss": 2.0692, + "step": 3470 + }, + { + "epoch": 3.3413461538461537, + "grad_norm": 1.8559616510930068, + "learning_rate": 5.000004461391041e-08, + "loss": 2.1154, + "step": 3475 + }, + { + "epoch": 3.3461538461538463, + "grad_norm": 1.8531248832701233, + "learning_rate": 5.000004120761112e-08, + "loss": 2.1368, + "step": 3480 + }, + { + "epoch": 3.3509615384615383, + "grad_norm": 2.0855871097245697, + "learning_rate": 5.000003803940601e-08, + "loss": 2.0614, + "step": 3485 + }, + { + "epoch": 3.355769230769231, + "grad_norm": 1.849398364726841, + "learning_rate": 5.000003509421077e-08, + "loss": 2.0439, + "step": 3490 + }, + { + "epoch": 3.360576923076923, + "grad_norm": 1.8843707405312315, + "learning_rate": 5.000003235779665e-08, + "loss": 2.1177, + "step": 3495 + }, + { + "epoch": 3.3653846153846154, + "grad_norm": 1.8674622419471962, + "learning_rate": 5.0000029816747665e-08, + "loss": 2.0846, + "step": 3500 + }, + { + "epoch": 3.3701923076923075, + "grad_norm": 1.861783824284357, + "learning_rate": 5.000002745841968e-08, + "loss": 2.0955, + "step": 3505 + }, + { + "epoch": 3.375, + "grad_norm": 1.9278334626136537, + "learning_rate": 5.000002527090128e-08, + "loss": 2.059, + "step": 3510 + }, + { + "epoch": 3.3798076923076925, + "grad_norm": 1.8337005789104908, + "learning_rate": 5.0000023242976346e-08, + "loss": 2.0665, + "step": 3515 + }, + { + "epoch": 3.3846153846153846, + "grad_norm": 1.9024075084324792, + "learning_rate": 5.000002136408825e-08, + "loss": 2.1361, + "step": 3520 + }, + { + "epoch": 3.389423076923077, + "grad_norm": 1.8782715480203358, + "learning_rate": 5.0000019624305734e-08, + "loss": 2.1163, + "step": 3525 + }, + { + "epoch": 3.394230769230769, + "grad_norm": 1.86058034338409, + "learning_rate": 5.000001801429018e-08, + "loss": 2.1186, + "step": 3530 + }, + { + "epoch": 3.3990384615384617, + "grad_norm": 1.8881759634428155, + "learning_rate": 5.000001652526446e-08, + "loss": 2.0883, + "step": 3535 + }, + { + "epoch": 3.4038461538461537, + "grad_norm": 1.785713447960782, + "learning_rate": 5.000001514898321e-08, + "loss": 2.0527, + "step": 3540 + }, + { + "epoch": 3.4086538461538463, + "grad_norm": 1.9555165881816705, + "learning_rate": 5.0000013877704346e-08, + "loss": 2.1163, + "step": 3545 + }, + { + "epoch": 3.4134615384615383, + "grad_norm": 1.9223532202133446, + "learning_rate": 5.000001270416205e-08, + "loss": 2.0901, + "step": 3550 + }, + { + "epoch": 3.418269230769231, + "grad_norm": 1.9193635011123766, + "learning_rate": 5.000001162154087e-08, + "loss": 2.0746, + "step": 3555 + }, + { + "epoch": 3.423076923076923, + "grad_norm": 1.8733962144827436, + "learning_rate": 5.000001062345115e-08, + "loss": 2.0671, + "step": 3560 + }, + { + "epoch": 3.4278846153846154, + "grad_norm": 1.85873983452056, + "learning_rate": 5.0000009703905566e-08, + "loss": 2.1137, + "step": 3565 + }, + { + "epoch": 3.4326923076923075, + "grad_norm": 1.8503554423844921, + "learning_rate": 5.000000885729673e-08, + "loss": 2.0894, + "step": 3570 + }, + { + "epoch": 3.4375, + "grad_norm": 1.8222014591366218, + "learning_rate": 5.0000008078376005e-08, + "loss": 2.0432, + "step": 3575 + }, + { + "epoch": 3.4423076923076925, + "grad_norm": 1.7957714401504574, + "learning_rate": 5.0000007362233173e-08, + "loss": 2.1261, + "step": 3580 + }, + { + "epoch": 3.4471153846153846, + "grad_norm": 1.931908483475819, + "learning_rate": 5.000000670427727e-08, + "loss": 2.0361, + "step": 3585 + }, + { + "epoch": 3.451923076923077, + "grad_norm": 1.9002646238486756, + "learning_rate": 5.00000061002182e-08, + "loss": 2.0524, + "step": 3590 + }, + { + "epoch": 3.456730769230769, + "grad_norm": 1.8204343994860845, + "learning_rate": 5.0000005546049374e-08, + "loss": 2.0467, + "step": 3595 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 1.9057120685414555, + "learning_rate": 5.00000050380312e-08, + "loss": 2.093, + "step": 3600 + }, + { + "epoch": 3.4615384615384617, + "eval_loss": 2.3367574214935303, + "eval_runtime": 85.4244, + "eval_samples_per_second": 86.568, + "eval_steps_per_second": 0.679, + "step": 3600 + }, + { + "epoch": 3.4663461538461537, + "grad_norm": 1.9365323482683579, + "learning_rate": 5.000000457267532e-08, + "loss": 2.0553, + "step": 3605 + }, + { + "epoch": 3.4711538461538463, + "grad_norm": 1.8079565138425362, + "learning_rate": 5.0000004146729796e-08, + "loss": 2.089, + "step": 3610 + }, + { + "epoch": 3.4759615384615383, + "grad_norm": 1.8121185503245834, + "learning_rate": 5.0000003757164884e-08, + "loss": 2.0986, + "step": 3615 + }, + { + "epoch": 3.480769230769231, + "grad_norm": 1.8091507058120948, + "learning_rate": 5.00000034011597e-08, + "loss": 2.0754, + "step": 3620 + }, + { + "epoch": 3.485576923076923, + "grad_norm": 1.8733942037147027, + "learning_rate": 5.000000307608948e-08, + "loss": 2.0668, + "step": 3625 + }, + { + "epoch": 3.4903846153846154, + "grad_norm": 1.8821202627650557, + "learning_rate": 5.000000277951357e-08, + "loss": 1.9986, + "step": 3630 + }, + { + "epoch": 3.4951923076923075, + "grad_norm": 1.842855668232229, + "learning_rate": 5.0000002509163964e-08, + "loss": 2.0966, + "step": 3635 + }, + { + "epoch": 3.5, + "grad_norm": 1.8876473696523732, + "learning_rate": 5.0000002262934616e-08, + "loss": 2.0639, + "step": 3640 + }, + { + "epoch": 3.5048076923076925, + "grad_norm": 1.9962924727314426, + "learning_rate": 5.0000002038871134e-08, + "loss": 2.0818, + "step": 3645 + }, + { + "epoch": 3.5096153846153846, + "grad_norm": 1.9564800425998439, + "learning_rate": 5.0000001835161206e-08, + "loss": 2.1244, + "step": 3650 + }, + { + "epoch": 3.5144230769230766, + "grad_norm": 1.8523701031395317, + "learning_rate": 5.0000001650125436e-08, + "loss": 2.0887, + "step": 3655 + }, + { + "epoch": 3.519230769230769, + "grad_norm": 1.9350705828074954, + "learning_rate": 5.0000001482208764e-08, + "loss": 2.0847, + "step": 3660 + }, + { + "epoch": 3.5240384615384617, + "grad_norm": 1.946869882547775, + "learning_rate": 5.000000132997231e-08, + "loss": 2.0947, + "step": 3665 + }, + { + "epoch": 3.5288461538461537, + "grad_norm": 1.8459205035434865, + "learning_rate": 5.0000001192085726e-08, + "loss": 2.0312, + "step": 3670 + }, + { + "epoch": 3.5336538461538463, + "grad_norm": 1.919571637460775, + "learning_rate": 5.000000106731995e-08, + "loss": 2.0684, + "step": 3675 + }, + { + "epoch": 3.5384615384615383, + "grad_norm": 1.8251904058697088, + "learning_rate": 5.000000095454041e-08, + "loss": 2.0681, + "step": 3680 + }, + { + "epoch": 3.543269230769231, + "grad_norm": 1.8644080480328407, + "learning_rate": 5.000000085270059e-08, + "loss": 2.07, + "step": 3685 + }, + { + "epoch": 3.5480769230769234, + "grad_norm": 1.9449733940426817, + "learning_rate": 5.0000000760835994e-08, + "loss": 2.0474, + "step": 3690 + }, + { + "epoch": 3.5528846153846154, + "grad_norm": 1.8861381009831941, + "learning_rate": 5.000000067805847e-08, + "loss": 2.0788, + "step": 3695 + }, + { + "epoch": 3.5576923076923075, + "grad_norm": 1.9119855215360249, + "learning_rate": 5.000000060355086e-08, + "loss": 2.133, + "step": 3700 + }, + { + "epoch": 3.5625, + "grad_norm": 2.0025144773598713, + "learning_rate": 5.000000053656201e-08, + "loss": 2.0604, + "step": 3705 + }, + { + "epoch": 3.5673076923076925, + "grad_norm": 1.9599184161336376, + "learning_rate": 5.000000047640201e-08, + "loss": 2.0693, + "step": 3710 + }, + { + "epoch": 3.5721153846153846, + "grad_norm": 1.9332484541798294, + "learning_rate": 5.000000042243783e-08, + "loss": 2.1326, + "step": 3715 + }, + { + "epoch": 3.5769230769230766, + "grad_norm": 1.8373427956250443, + "learning_rate": 5.000000037408913e-08, + "loss": 2.0914, + "step": 3720 + }, + { + "epoch": 3.581730769230769, + "grad_norm": 1.8985422762821798, + "learning_rate": 5.000000033082442e-08, + "loss": 2.1263, + "step": 3725 + }, + { + "epoch": 3.5865384615384617, + "grad_norm": 1.8507361941632516, + "learning_rate": 5.000000029215739e-08, + "loss": 2.1016, + "step": 3730 + }, + { + "epoch": 3.5913461538461537, + "grad_norm": 1.918522522188892, + "learning_rate": 5.0000000257643545e-08, + "loss": 2.1104, + "step": 3735 + }, + { + "epoch": 3.5961538461538463, + "grad_norm": 1.9234648718431095, + "learning_rate": 5.0000000226876985e-08, + "loss": 2.0551, + "step": 3740 + }, + { + "epoch": 3.6009615384615383, + "grad_norm": 1.822481727821557, + "learning_rate": 5.000000019948749e-08, + "loss": 2.165, + "step": 3745 + }, + { + "epoch": 3.605769230769231, + "grad_norm": 1.8897986361161199, + "learning_rate": 5.000000017513769e-08, + "loss": 2.1189, + "step": 3750 + }, + { + "epoch": 3.6105769230769234, + "grad_norm": 1.8846334119765857, + "learning_rate": 5.0000000153520544e-08, + "loss": 2.0941, + "step": 3755 + }, + { + "epoch": 3.6153846153846154, + "grad_norm": 1.9439696562766058, + "learning_rate": 5.000000013435687e-08, + "loss": 2.0899, + "step": 3760 + }, + { + "epoch": 3.6201923076923075, + "grad_norm": 2.1285672502730897, + "learning_rate": 5.000000011739313e-08, + "loss": 2.0651, + "step": 3765 + }, + { + "epoch": 3.625, + "grad_norm": 1.9213014147357517, + "learning_rate": 5.000000010239938e-08, + "loss": 2.0956, + "step": 3770 + }, + { + "epoch": 3.6298076923076925, + "grad_norm": 2.0068609857257806, + "learning_rate": 5.0000000089167275e-08, + "loss": 2.1357, + "step": 3775 + }, + { + "epoch": 3.6346153846153846, + "grad_norm": 1.8705225726991637, + "learning_rate": 5.0000000077508284e-08, + "loss": 2.0578, + "step": 3780 + }, + { + "epoch": 3.6394230769230766, + "grad_norm": 1.8943581631321806, + "learning_rate": 5.000000006725204e-08, + "loss": 2.0315, + "step": 3785 + }, + { + "epoch": 3.644230769230769, + "grad_norm": 1.7746155655966087, + "learning_rate": 5.0000000058244776e-08, + "loss": 2.0558, + "step": 3790 + }, + { + "epoch": 3.6490384615384617, + "grad_norm": 1.9075711009896643, + "learning_rate": 5.00000000503479e-08, + "loss": 2.0978, + "step": 3795 + }, + { + "epoch": 3.6538461538461537, + "grad_norm": 1.850526459782874, + "learning_rate": 5.0000000043436655e-08, + "loss": 2.066, + "step": 3800 + }, + { + "epoch": 3.6538461538461537, + "eval_loss": 2.3363423347473145, + "eval_runtime": 85.3021, + "eval_samples_per_second": 86.692, + "eval_steps_per_second": 0.68, + "step": 3800 + }, + { + "epoch": 3.6586538461538463, + "grad_norm": 1.8690566333305048, + "learning_rate": 5.000000003739891e-08, + "loss": 2.0487, + "step": 3805 + }, + { + "epoch": 3.6634615384615383, + "grad_norm": 1.900722274652347, + "learning_rate": 5.000000003213401e-08, + "loss": 2.1207, + "step": 3810 + }, + { + "epoch": 3.668269230769231, + "grad_norm": 1.9465838080070361, + "learning_rate": 5.0000000027551756e-08, + "loss": 2.055, + "step": 3815 + }, + { + "epoch": 3.6730769230769234, + "grad_norm": 1.9044190775719372, + "learning_rate": 5.000000002357143e-08, + "loss": 2.0932, + "step": 3820 + }, + { + "epoch": 3.6778846153846154, + "grad_norm": 1.877437768825067, + "learning_rate": 5.00000000201209e-08, + "loss": 2.0378, + "step": 3825 + }, + { + "epoch": 3.6826923076923075, + "grad_norm": 1.9479165928017026, + "learning_rate": 5.0000000017135845e-08, + "loss": 2.12, + "step": 3830 + }, + { + "epoch": 3.6875, + "grad_norm": 1.8934460533416513, + "learning_rate": 5.000000001455896e-08, + "loss": 2.0638, + "step": 3835 + }, + { + "epoch": 3.6923076923076925, + "grad_norm": 1.8852430662362558, + "learning_rate": 5.00000000123393e-08, + "loss": 2.0684, + "step": 3840 + }, + { + "epoch": 3.6971153846153846, + "grad_norm": 1.860403694759792, + "learning_rate": 5.000000001043168e-08, + "loss": 2.0769, + "step": 3845 + }, + { + "epoch": 3.7019230769230766, + "grad_norm": 1.8537616298510589, + "learning_rate": 5.000000000879604e-08, + "loss": 2.0796, + "step": 3850 + }, + { + "epoch": 3.706730769230769, + "grad_norm": 1.9070836535172773, + "learning_rate": 5.0000000007396964e-08, + "loss": 2.0788, + "step": 3855 + }, + { + "epoch": 3.7115384615384617, + "grad_norm": 1.8144568187717154, + "learning_rate": 5.0000000006203204e-08, + "loss": 2.0824, + "step": 3860 + }, + { + "epoch": 3.7163461538461537, + "grad_norm": 1.891955133693288, + "learning_rate": 5.000000000518723e-08, + "loss": 2.0976, + "step": 3865 + }, + { + "epoch": 3.7211538461538463, + "grad_norm": 1.9703595895690142, + "learning_rate": 5.000000000432485e-08, + "loss": 2.0787, + "step": 3870 + }, + { + "epoch": 3.7259615384615383, + "grad_norm": 1.8460940153632612, + "learning_rate": 5.000000000359484e-08, + "loss": 2.1149, + "step": 3875 + }, + { + "epoch": 3.730769230769231, + "grad_norm": 1.9416809896930844, + "learning_rate": 5.000000000297862e-08, + "loss": 2.103, + "step": 3880 + }, + { + "epoch": 3.7355769230769234, + "grad_norm": 1.8235135326813838, + "learning_rate": 5.0000000002459973e-08, + "loss": 2.0464, + "step": 3885 + }, + { + "epoch": 3.7403846153846154, + "grad_norm": 1.8544605215958418, + "learning_rate": 5.000000000202477e-08, + "loss": 2.1148, + "step": 3890 + }, + { + "epoch": 3.7451923076923075, + "grad_norm": 1.9297008145685273, + "learning_rate": 5.000000000166072e-08, + "loss": 2.0917, + "step": 3895 + }, + { + "epoch": 3.75, + "grad_norm": 1.841810840824877, + "learning_rate": 5.000000000135718e-08, + "loss": 2.0486, + "step": 3900 + }, + { + "epoch": 3.7548076923076925, + "grad_norm": 1.8206643156132905, + "learning_rate": 5.0000000001104946e-08, + "loss": 2.0672, + "step": 3905 + }, + { + "epoch": 3.7596153846153846, + "grad_norm": 1.8759920863049961, + "learning_rate": 5.000000000089607e-08, + "loss": 2.0244, + "step": 3910 + }, + { + "epoch": 3.7644230769230766, + "grad_norm": 1.9048495951309699, + "learning_rate": 5.0000000000723734e-08, + "loss": 2.0743, + "step": 3915 + }, + { + "epoch": 3.769230769230769, + "grad_norm": 1.8193159595260147, + "learning_rate": 5.000000000058207e-08, + "loss": 2.0722, + "step": 3920 + }, + { + "epoch": 3.7740384615384617, + "grad_norm": 1.8691020909344, + "learning_rate": 5.0000000000466084e-08, + "loss": 2.1207, + "step": 3925 + }, + { + "epoch": 3.7788461538461537, + "grad_norm": 1.8608578096368507, + "learning_rate": 5.00000000003715e-08, + "loss": 2.1023, + "step": 3930 + }, + { + "epoch": 3.7836538461538463, + "grad_norm": 1.861692606774206, + "learning_rate": 5.00000000002947e-08, + "loss": 2.1159, + "step": 3935 + }, + { + "epoch": 3.7884615384615383, + "grad_norm": 1.9009512697877335, + "learning_rate": 5.0000000000232614e-08, + "loss": 2.0928, + "step": 3940 + }, + { + "epoch": 3.793269230769231, + "grad_norm": 1.8247326337722605, + "learning_rate": 5.000000000018266e-08, + "loss": 2.0607, + "step": 3945 + }, + { + "epoch": 3.7980769230769234, + "grad_norm": 1.838081967907657, + "learning_rate": 5.000000000014265e-08, + "loss": 2.1089, + "step": 3950 + }, + { + "epoch": 3.8028846153846154, + "grad_norm": 1.929918706709054, + "learning_rate": 5.000000000011078e-08, + "loss": 2.0905, + "step": 3955 + }, + { + "epoch": 3.8076923076923075, + "grad_norm": 1.8508307524707792, + "learning_rate": 5.0000000000085515e-08, + "loss": 2.1306, + "step": 3960 + }, + { + "epoch": 3.8125, + "grad_norm": 1.8695517798307058, + "learning_rate": 5.00000000000656e-08, + "loss": 2.0873, + "step": 3965 + }, + { + "epoch": 3.8173076923076925, + "grad_norm": 1.9513218569006434, + "learning_rate": 5.000000000005e-08, + "loss": 2.1049, + "step": 3970 + }, + { + "epoch": 3.8221153846153846, + "grad_norm": 1.8982042501595857, + "learning_rate": 5.000000000003784e-08, + "loss": 2.1205, + "step": 3975 + }, + { + "epoch": 3.8269230769230766, + "grad_norm": 1.8184591699240908, + "learning_rate": 5.000000000002844e-08, + "loss": 2.0395, + "step": 3980 + }, + { + "epoch": 3.831730769230769, + "grad_norm": 1.8444114349744394, + "learning_rate": 5.0000000000021207e-08, + "loss": 2.0824, + "step": 3985 + }, + { + "epoch": 3.8365384615384617, + "grad_norm": 1.8531735260873148, + "learning_rate": 5.000000000001569e-08, + "loss": 2.0544, + "step": 3990 + }, + { + "epoch": 3.8413461538461537, + "grad_norm": 1.8352559334251506, + "learning_rate": 5.0000000000011505e-08, + "loss": 2.0938, + "step": 3995 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 1.8424349150299684, + "learning_rate": 5.000000000000836e-08, + "loss": 2.1086, + "step": 4000 + }, + { + "epoch": 3.8461538461538463, + "eval_loss": 2.3361942768096924, + "eval_runtime": 85.4169, + "eval_samples_per_second": 86.575, + "eval_steps_per_second": 0.679, + "step": 4000 + }, + { + "epoch": 3.8509615384615383, + "grad_norm": 1.90467764249709, + "learning_rate": 5.000000000000602e-08, + "loss": 2.0919, + "step": 4005 + }, + { + "epoch": 3.855769230769231, + "grad_norm": 1.9147996032600165, + "learning_rate": 5.000000000000429e-08, + "loss": 2.0992, + "step": 4010 + }, + { + "epoch": 3.8605769230769234, + "grad_norm": 1.899917149171274, + "learning_rate": 5.000000000000303e-08, + "loss": 2.0772, + "step": 4015 + }, + { + "epoch": 3.8653846153846154, + "grad_norm": 1.8983270516331723, + "learning_rate": 5.000000000000211e-08, + "loss": 2.088, + "step": 4020 + }, + { + "epoch": 3.8701923076923075, + "grad_norm": 1.9175004513272587, + "learning_rate": 5.0000000000001454e-08, + "loss": 2.0511, + "step": 4025 + }, + { + "epoch": 3.875, + "grad_norm": 1.8660541755671598, + "learning_rate": 5.0000000000000984e-08, + "loss": 2.1061, + "step": 4030 + }, + { + "epoch": 3.8798076923076925, + "grad_norm": 1.8945222773765362, + "learning_rate": 5.000000000000066e-08, + "loss": 2.0912, + "step": 4035 + }, + { + "epoch": 3.8846153846153846, + "grad_norm": 1.9243273581552536, + "learning_rate": 5.0000000000000434e-08, + "loss": 2.126, + "step": 4040 + }, + { + "epoch": 3.8894230769230766, + "grad_norm": 1.8550808979879474, + "learning_rate": 5.000000000000028e-08, + "loss": 2.1042, + "step": 4045 + }, + { + "epoch": 3.894230769230769, + "grad_norm": 1.97506748062818, + "learning_rate": 5.0000000000000176e-08, + "loss": 2.1115, + "step": 4050 + }, + { + "epoch": 3.8990384615384617, + "grad_norm": 1.9079814987909542, + "learning_rate": 5.000000000000011e-08, + "loss": 2.049, + "step": 4055 + }, + { + "epoch": 3.9038461538461537, + "grad_norm": 1.9271203991857457, + "learning_rate": 5.000000000000007e-08, + "loss": 2.134, + "step": 4060 + }, + { + "epoch": 3.9086538461538463, + "grad_norm": 1.9736638939991642, + "learning_rate": 5.000000000000004e-08, + "loss": 2.1579, + "step": 4065 + }, + { + "epoch": 3.9134615384615383, + "grad_norm": 1.8949062426649275, + "learning_rate": 5.0000000000000024e-08, + "loss": 2.1017, + "step": 4070 + }, + { + "epoch": 3.918269230769231, + "grad_norm": 1.8881914290487865, + "learning_rate": 5.000000000000001e-08, + "loss": 2.0493, + "step": 4075 + }, + { + "epoch": 3.9230769230769234, + "grad_norm": 1.9185864408059423, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.0971, + "step": 4080 + }, + { + "epoch": 3.9278846153846154, + "grad_norm": 1.910935901032547, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.0574, + "step": 4085 + }, + { + "epoch": 3.9326923076923075, + "grad_norm": 1.8477236208599264, + "learning_rate": 5e-08, + "loss": 2.0316, + "step": 4090 + }, + { + "epoch": 3.9375, + "grad_norm": 1.8681233408771172, + "learning_rate": 5e-08, + "loss": 2.0406, + "step": 4095 + }, + { + "epoch": 3.9423076923076925, + "grad_norm": 1.976625704514766, + "learning_rate": 5e-08, + "loss": 2.1185, + "step": 4100 + }, + { + "epoch": 3.9471153846153846, + "grad_norm": 1.8722374970584073, + "learning_rate": 5e-08, + "loss": 2.0834, + "step": 4105 + }, + { + "epoch": 3.9519230769230766, + "grad_norm": 2.0555523827232234, + "learning_rate": 5e-08, + "loss": 2.0699, + "step": 4110 + }, + { + "epoch": 3.956730769230769, + "grad_norm": 1.8728593232700466, + "learning_rate": 5e-08, + "loss": 2.0932, + "step": 4115 + }, + { + "epoch": 3.9615384615384617, + "grad_norm": 1.8543407125566582, + "learning_rate": 5e-08, + "loss": 2.1006, + "step": 4120 + }, + { + "epoch": 3.9663461538461537, + "grad_norm": 1.8246615617187374, + "learning_rate": 5e-08, + "loss": 2.0577, + "step": 4125 + }, + { + "epoch": 3.9711538461538463, + "grad_norm": 1.9485201624855024, + "learning_rate": 5e-08, + "loss": 2.1165, + "step": 4130 + }, + { + "epoch": 3.9759615384615383, + "grad_norm": 1.988247558955116, + "learning_rate": 5e-08, + "loss": 2.0729, + "step": 4135 + }, + { + "epoch": 3.980769230769231, + "grad_norm": 1.9867643817669718, + "learning_rate": 5e-08, + "loss": 2.0647, + "step": 4140 + }, + { + "epoch": 3.9855769230769234, + "grad_norm": 1.9105220330651407, + "learning_rate": 5e-08, + "loss": 2.0665, + "step": 4145 + }, + { + "epoch": 3.9903846153846154, + "grad_norm": 1.8202876344304606, + "learning_rate": 5e-08, + "loss": 2.1232, + "step": 4150 + }, + { + "epoch": 3.9951923076923075, + "grad_norm": 1.9398674577857897, + "learning_rate": 5e-08, + "loss": 2.0924, + "step": 4155 + }, + { + "epoch": 4.0, + "grad_norm": 1.9383477945644347, + "learning_rate": 5e-08, + "loss": 2.1167, + "step": 4160 + }, + { + "epoch": 4.0, + "step": 4160, + "total_flos": 434462785536000.0, + "train_loss": 2.16538261238199, + "train_runtime": 15200.3368, + "train_samples_per_second": 17.512, + "train_steps_per_second": 0.274 + } + ], + "logging_steps": 5, + "max_steps": 4160, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 200, + "total_flos": 434462785536000.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}