{ "best_metric": 53.041802, "best_model_checkpoint": "/home/sushant/D1/vulnerability_ati/training/vuln-expert/v1-20250314-214509/checkpoint-1892", "epoch": 2.0, "eval_steps": 500, "global_step": 1892, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010570824524312897, "grad_norm": 0.310546875, "learning_rate": 1.7605633802816901e-07, "loss": 0.5840004682540894, "memory(GiB)": 54.88, "step": 1, "token_acc": 0.877961901811987, "train_speed(iter/s)": 0.051729 }, { "epoch": 0.005285412262156448, "grad_norm": 0.30078125, "learning_rate": 8.802816901408452e-07, "loss": 0.5785399675369263, "memory(GiB)": 77.69, "step": 5, "token_acc": 0.8787542627059451, "train_speed(iter/s)": 0.074597 }, { "epoch": 0.010570824524312896, "grad_norm": 0.337890625, "learning_rate": 1.7605633802816904e-06, "loss": 0.627132797241211, "memory(GiB)": 77.69, "step": 10, "token_acc": 0.8759471161280347, "train_speed(iter/s)": 0.077468 }, { "epoch": 0.015856236786469344, "grad_norm": 0.578125, "learning_rate": 2.640845070422535e-06, "loss": 0.5077077388763428, "memory(GiB)": 77.69, "step": 15, "token_acc": 0.9069921756200061, "train_speed(iter/s)": 0.074032 }, { "epoch": 0.021141649048625793, "grad_norm": 0.2265625, "learning_rate": 3.521126760563381e-06, "loss": 0.5215200424194336, "memory(GiB)": 77.69, "step": 20, "token_acc": 0.8947583588845087, "train_speed(iter/s)": 0.07611 }, { "epoch": 0.026427061310782242, "grad_norm": 0.359375, "learning_rate": 4.401408450704226e-06, "loss": 0.5607146263122559, "memory(GiB)": 77.69, "step": 25, "token_acc": 0.8887670030715226, "train_speed(iter/s)": 0.07665 }, { "epoch": 0.03171247357293869, "grad_norm": 0.375, "learning_rate": 5.28169014084507e-06, "loss": 0.4995281219482422, "memory(GiB)": 77.69, "step": 30, "token_acc": 0.8995339353335275, "train_speed(iter/s)": 0.076919 }, { "epoch": 0.03699788583509514, "grad_norm": 0.36328125, "learning_rate": 6.161971830985916e-06, "loss": 0.46155481338500975, "memory(GiB)": 77.69, "step": 35, "token_acc": 0.9020970266040689, "train_speed(iter/s)": 0.076859 }, { "epoch": 0.042283298097251586, "grad_norm": 0.40234375, "learning_rate": 7.042253521126762e-06, "loss": 0.525246000289917, "memory(GiB)": 77.69, "step": 40, "token_acc": 0.8897597042513863, "train_speed(iter/s)": 0.077563 }, { "epoch": 0.04756871035940803, "grad_norm": 0.283203125, "learning_rate": 7.922535211267606e-06, "loss": 0.484679651260376, "memory(GiB)": 77.69, "step": 45, "token_acc": 0.8976993865030675, "train_speed(iter/s)": 0.077196 }, { "epoch": 0.052854122621564484, "grad_norm": 0.3515625, "learning_rate": 8.802816901408451e-06, "loss": 0.5049729824066163, "memory(GiB)": 77.69, "step": 50, "token_acc": 0.8943454686289698, "train_speed(iter/s)": 0.077792 }, { "epoch": 0.05813953488372093, "grad_norm": 0.345703125, "learning_rate": 9.683098591549296e-06, "loss": 0.5481216907501221, "memory(GiB)": 77.69, "step": 55, "token_acc": 0.878755253012885, "train_speed(iter/s)": 0.078937 }, { "epoch": 0.06342494714587738, "grad_norm": 0.298828125, "learning_rate": 1.056338028169014e-05, "loss": 0.4035048961639404, "memory(GiB)": 77.69, "step": 60, "token_acc": 0.9058074298711145, "train_speed(iter/s)": 0.07891 }, { "epoch": 0.06871035940803383, "grad_norm": 0.2451171875, "learning_rate": 1.1443661971830987e-05, "loss": 0.39989733695983887, "memory(GiB)": 77.69, "step": 65, "token_acc": 0.9038362843212727, "train_speed(iter/s)": 0.07845 }, { "epoch": 0.07399577167019028, "grad_norm": 0.23828125, "learning_rate": 1.2323943661971832e-05, "loss": 0.3942647695541382, "memory(GiB)": 77.69, "step": 70, "token_acc": 0.9019590004852014, "train_speed(iter/s)": 0.078589 }, { "epoch": 0.07928118393234672, "grad_norm": 0.162109375, "learning_rate": 1.3204225352112675e-05, "loss": 0.442060661315918, "memory(GiB)": 77.69, "step": 75, "token_acc": 0.8944171246745733, "train_speed(iter/s)": 0.079231 }, { "epoch": 0.08456659619450317, "grad_norm": 0.16796875, "learning_rate": 1.4084507042253523e-05, "loss": 0.33968005180358884, "memory(GiB)": 77.69, "step": 80, "token_acc": 0.9157256378084484, "train_speed(iter/s)": 0.07962 }, { "epoch": 0.08985200845665962, "grad_norm": 0.1640625, "learning_rate": 1.4964788732394366e-05, "loss": 0.3238480567932129, "memory(GiB)": 77.69, "step": 85, "token_acc": 0.9219054694029157, "train_speed(iter/s)": 0.079847 }, { "epoch": 0.09513742071881606, "grad_norm": 0.181640625, "learning_rate": 1.5845070422535213e-05, "loss": 0.3175073146820068, "memory(GiB)": 77.69, "step": 90, "token_acc": 0.9217296774972823, "train_speed(iter/s)": 0.080018 }, { "epoch": 0.10042283298097252, "grad_norm": 0.1337890625, "learning_rate": 1.6725352112676056e-05, "loss": 0.33591723442077637, "memory(GiB)": 77.69, "step": 95, "token_acc": 0.9197468988848515, "train_speed(iter/s)": 0.079684 }, { "epoch": 0.10570824524312897, "grad_norm": 0.13671875, "learning_rate": 1.7605633802816902e-05, "loss": 0.28483469486236573, "memory(GiB)": 77.69, "step": 100, "token_acc": 0.9299266925398879, "train_speed(iter/s)": 0.079562 }, { "epoch": 0.1109936575052854, "grad_norm": 0.1455078125, "learning_rate": 1.848591549295775e-05, "loss": 0.312535834312439, "memory(GiB)": 77.69, "step": 105, "token_acc": 0.9187720930232558, "train_speed(iter/s)": 0.079606 }, { "epoch": 0.11627906976744186, "grad_norm": 0.138671875, "learning_rate": 1.9366197183098592e-05, "loss": 0.23693528175354003, "memory(GiB)": 77.69, "step": 110, "token_acc": 0.9388760139049827, "train_speed(iter/s)": 0.07887 }, { "epoch": 0.12156448202959831, "grad_norm": 0.150390625, "learning_rate": 2.024647887323944e-05, "loss": 0.2640044689178467, "memory(GiB)": 77.69, "step": 115, "token_acc": 0.9317763533667577, "train_speed(iter/s)": 0.078972 }, { "epoch": 0.12684989429175475, "grad_norm": 0.2373046875, "learning_rate": 2.112676056338028e-05, "loss": 0.26922695636749266, "memory(GiB)": 77.69, "step": 120, "token_acc": 0.9294535501319691, "train_speed(iter/s)": 0.079051 }, { "epoch": 0.1321353065539112, "grad_norm": 0.2119140625, "learning_rate": 2.2007042253521128e-05, "loss": 0.27900142669677735, "memory(GiB)": 77.69, "step": 125, "token_acc": 0.9264384559359068, "train_speed(iter/s)": 0.079411 }, { "epoch": 0.13742071881606766, "grad_norm": 0.369140625, "learning_rate": 2.2887323943661974e-05, "loss": 0.28076226711273194, "memory(GiB)": 77.69, "step": 130, "token_acc": 0.9281237498333111, "train_speed(iter/s)": 0.079879 }, { "epoch": 0.1427061310782241, "grad_norm": 0.1357421875, "learning_rate": 2.3767605633802817e-05, "loss": 0.2778019428253174, "memory(GiB)": 77.69, "step": 135, "token_acc": 0.9258921540183197, "train_speed(iter/s)": 0.080329 }, { "epoch": 0.14799154334038056, "grad_norm": 0.19140625, "learning_rate": 2.4647887323943664e-05, "loss": 0.2151641845703125, "memory(GiB)": 77.69, "step": 140, "token_acc": 0.9393794320935939, "train_speed(iter/s)": 0.080428 }, { "epoch": 0.15327695560253699, "grad_norm": 0.140625, "learning_rate": 2.5528169014084507e-05, "loss": 0.22943789958953859, "memory(GiB)": 77.69, "step": 145, "token_acc": 0.9390464150261875, "train_speed(iter/s)": 0.080533 }, { "epoch": 0.15856236786469344, "grad_norm": 0.1494140625, "learning_rate": 2.640845070422535e-05, "loss": 0.1837033748626709, "memory(GiB)": 77.69, "step": 150, "token_acc": 0.9488915074949987, "train_speed(iter/s)": 0.080154 }, { "epoch": 0.1638477801268499, "grad_norm": 0.189453125, "learning_rate": 2.72887323943662e-05, "loss": 0.23342387676239013, "memory(GiB)": 77.69, "step": 155, "token_acc": 0.9370308331782369, "train_speed(iter/s)": 0.080321 }, { "epoch": 0.16913319238900634, "grad_norm": 0.21484375, "learning_rate": 2.8169014084507046e-05, "loss": 0.18380935192108155, "memory(GiB)": 77.69, "step": 160, "token_acc": 0.9478817830087459, "train_speed(iter/s)": 0.080245 }, { "epoch": 0.1744186046511628, "grad_norm": 0.2021484375, "learning_rate": 2.904929577464789e-05, "loss": 0.23660047054290773, "memory(GiB)": 77.69, "step": 165, "token_acc": 0.9379279452230115, "train_speed(iter/s)": 0.080375 }, { "epoch": 0.17970401691331925, "grad_norm": 0.2109375, "learning_rate": 2.9929577464788733e-05, "loss": 0.21739184856414795, "memory(GiB)": 77.69, "step": 170, "token_acc": 0.9434620016479686, "train_speed(iter/s)": 0.08052 }, { "epoch": 0.1849894291754757, "grad_norm": 0.26953125, "learning_rate": 3.0809859154929576e-05, "loss": 0.1963450074195862, "memory(GiB)": 77.69, "step": 175, "token_acc": 0.9442798059074479, "train_speed(iter/s)": 0.080367 }, { "epoch": 0.19027484143763213, "grad_norm": 0.37890625, "learning_rate": 3.1690140845070426e-05, "loss": 0.17974252700805665, "memory(GiB)": 77.69, "step": 180, "token_acc": 0.9481139164412383, "train_speed(iter/s)": 0.080581 }, { "epoch": 0.19556025369978858, "grad_norm": 0.263671875, "learning_rate": 3.257042253521127e-05, "loss": 0.19265687465667725, "memory(GiB)": 77.69, "step": 185, "token_acc": 0.9474324577275847, "train_speed(iter/s)": 0.080612 }, { "epoch": 0.20084566596194503, "grad_norm": 0.3671875, "learning_rate": 3.345070422535211e-05, "loss": 0.21900510787963867, "memory(GiB)": 77.69, "step": 190, "token_acc": 0.9434389140271493, "train_speed(iter/s)": 0.080986 }, { "epoch": 0.20613107822410148, "grad_norm": 0.357421875, "learning_rate": 3.4330985915492955e-05, "loss": 0.1479341506958008, "memory(GiB)": 77.69, "step": 195, "token_acc": 0.9580980312626751, "train_speed(iter/s)": 0.080992 }, { "epoch": 0.21141649048625794, "grad_norm": 0.396484375, "learning_rate": 3.5211267605633805e-05, "loss": 0.2439335346221924, "memory(GiB)": 77.69, "step": 200, "token_acc": 0.9371973693719737, "train_speed(iter/s)": 0.081141 }, { "epoch": 0.2167019027484144, "grad_norm": 0.208984375, "learning_rate": 3.6091549295774655e-05, "loss": 0.14025869369506835, "memory(GiB)": 77.69, "step": 205, "token_acc": 0.9638251940019837, "train_speed(iter/s)": 0.080958 }, { "epoch": 0.2219873150105708, "grad_norm": 0.244140625, "learning_rate": 3.69718309859155e-05, "loss": 0.14009780883789064, "memory(GiB)": 77.69, "step": 210, "token_acc": 0.9628481089626657, "train_speed(iter/s)": 0.080941 }, { "epoch": 0.22727272727272727, "grad_norm": 0.259765625, "learning_rate": 3.785211267605634e-05, "loss": 0.1344698429107666, "memory(GiB)": 77.69, "step": 215, "token_acc": 0.9627247579529737, "train_speed(iter/s)": 0.081075 }, { "epoch": 0.23255813953488372, "grad_norm": 0.271484375, "learning_rate": 3.8732394366197184e-05, "loss": 0.14501973390579223, "memory(GiB)": 77.69, "step": 220, "token_acc": 0.9614604462474645, "train_speed(iter/s)": 0.081093 }, { "epoch": 0.23784355179704017, "grad_norm": 0.2275390625, "learning_rate": 3.9612676056338034e-05, "loss": 0.14495565891265869, "memory(GiB)": 77.69, "step": 225, "token_acc": 0.9630995166687322, "train_speed(iter/s)": 0.08089 }, { "epoch": 0.24312896405919662, "grad_norm": 0.318359375, "learning_rate": 4.049295774647888e-05, "loss": 0.12553561925888063, "memory(GiB)": 77.69, "step": 230, "token_acc": 0.9673265219242231, "train_speed(iter/s)": 0.080941 }, { "epoch": 0.24841437632135308, "grad_norm": 0.423828125, "learning_rate": 4.137323943661972e-05, "loss": 0.15523269176483154, "memory(GiB)": 77.69, "step": 235, "token_acc": 0.9600874352331606, "train_speed(iter/s)": 0.081247 }, { "epoch": 0.2536997885835095, "grad_norm": 0.314453125, "learning_rate": 4.225352112676056e-05, "loss": 0.19534292221069335, "memory(GiB)": 77.69, "step": 240, "token_acc": 0.9515238624113939, "train_speed(iter/s)": 0.081251 }, { "epoch": 0.25898520084566595, "grad_norm": 0.16796875, "learning_rate": 4.3133802816901406e-05, "loss": 0.13154494762420654, "memory(GiB)": 77.69, "step": 245, "token_acc": 0.9655755646204358, "train_speed(iter/s)": 0.080771 }, { "epoch": 0.2642706131078224, "grad_norm": 0.22265625, "learning_rate": 4.4014084507042256e-05, "loss": 0.15482015609741212, "memory(GiB)": 77.69, "step": 250, "token_acc": 0.9589573684032591, "train_speed(iter/s)": 0.0809 }, { "epoch": 0.26955602536997886, "grad_norm": 0.2333984375, "learning_rate": 4.48943661971831e-05, "loss": 0.14184291362762452, "memory(GiB)": 77.69, "step": 255, "token_acc": 0.9634158207115753, "train_speed(iter/s)": 0.080979 }, { "epoch": 0.2748414376321353, "grad_norm": 0.2431640625, "learning_rate": 4.577464788732395e-05, "loss": 0.13292162418365477, "memory(GiB)": 77.69, "step": 260, "token_acc": 0.9646657224824782, "train_speed(iter/s)": 0.081171 }, { "epoch": 0.28012684989429176, "grad_norm": 0.255859375, "learning_rate": 4.665492957746479e-05, "loss": 0.17609896659851074, "memory(GiB)": 77.69, "step": 265, "token_acc": 0.9529624249087125, "train_speed(iter/s)": 0.08138 }, { "epoch": 0.2854122621564482, "grad_norm": 0.2333984375, "learning_rate": 4.7535211267605635e-05, "loss": 0.12164682149887085, "memory(GiB)": 77.69, "step": 270, "token_acc": 0.9661621584178991, "train_speed(iter/s)": 0.081378 }, { "epoch": 0.29069767441860467, "grad_norm": 0.1787109375, "learning_rate": 4.8415492957746485e-05, "loss": 0.11981290578842163, "memory(GiB)": 77.69, "step": 275, "token_acc": 0.968763868501007, "train_speed(iter/s)": 0.081356 }, { "epoch": 0.2959830866807611, "grad_norm": 0.2470703125, "learning_rate": 4.929577464788733e-05, "loss": 0.1150665283203125, "memory(GiB)": 77.69, "step": 280, "token_acc": 0.9691602348439871, "train_speed(iter/s)": 0.081389 }, { "epoch": 0.3012684989429176, "grad_norm": 0.197265625, "learning_rate": 4.999998108667359e-05, "loss": 0.14339234828948974, "memory(GiB)": 77.69, "step": 285, "token_acc": 0.9605376212102614, "train_speed(iter/s)": 0.081431 }, { "epoch": 0.30655391120507397, "grad_norm": 0.2109375, "learning_rate": 4.9999319123254007e-05, "loss": 0.1458795428276062, "memory(GiB)": 77.69, "step": 290, "token_acc": 0.9617036165272432, "train_speed(iter/s)": 0.081426 }, { "epoch": 0.3118393234672304, "grad_norm": 0.25390625, "learning_rate": 4.99977115221308e-05, "loss": 0.11519958972930908, "memory(GiB)": 77.69, "step": 295, "token_acc": 0.9685056793037321, "train_speed(iter/s)": 0.081576 }, { "epoch": 0.3171247357293869, "grad_norm": 0.16015625, "learning_rate": 4.999515834411396e-05, "loss": 0.12930259704589844, "memory(GiB)": 77.69, "step": 300, "token_acc": 0.9662626598040843, "train_speed(iter/s)": 0.081628 }, { "epoch": 0.3224101479915433, "grad_norm": 0.1474609375, "learning_rate": 4.999165968578138e-05, "loss": 0.13337899446487428, "memory(GiB)": 77.69, "step": 305, "token_acc": 0.9646794726074108, "train_speed(iter/s)": 0.081502 }, { "epoch": 0.3276955602536998, "grad_norm": 0.1865234375, "learning_rate": 4.9987215679475176e-05, "loss": 0.1647234082221985, "memory(GiB)": 77.69, "step": 310, "token_acc": 0.9580583899314385, "train_speed(iter/s)": 0.08153 }, { "epoch": 0.33298097251585623, "grad_norm": 0.1611328125, "learning_rate": 4.9981826493296736e-05, "loss": 0.1301745891571045, "memory(GiB)": 77.69, "step": 315, "token_acc": 0.9630228873832878, "train_speed(iter/s)": 0.081514 }, { "epoch": 0.3382663847780127, "grad_norm": 0.185546875, "learning_rate": 4.997549233110031e-05, "loss": 0.17971887588500976, "memory(GiB)": 77.69, "step": 320, "token_acc": 0.9543125135882226, "train_speed(iter/s)": 0.08159 }, { "epoch": 0.34355179704016914, "grad_norm": 0.16015625, "learning_rate": 4.9968213432485336e-05, "loss": 0.14173873662948608, "memory(GiB)": 77.69, "step": 325, "token_acc": 0.9625554467423239, "train_speed(iter/s)": 0.081421 }, { "epoch": 0.3488372093023256, "grad_norm": 0.130859375, "learning_rate": 4.9959990072787346e-05, "loss": 0.11465731859207154, "memory(GiB)": 77.69, "step": 330, "token_acc": 0.9700368530684185, "train_speed(iter/s)": 0.081302 }, { "epoch": 0.35412262156448204, "grad_norm": 0.1455078125, "learning_rate": 4.995082256306757e-05, "loss": 0.12255452871322632, "memory(GiB)": 77.69, "step": 335, "token_acc": 0.9657452605290044, "train_speed(iter/s)": 0.081256 }, { "epoch": 0.3594080338266385, "grad_norm": 0.177734375, "learning_rate": 4.9940711250101166e-05, "loss": 0.11792256832122802, "memory(GiB)": 77.69, "step": 340, "token_acc": 0.96757131163025, "train_speed(iter/s)": 0.081252 }, { "epoch": 0.36469344608879495, "grad_norm": 0.203125, "learning_rate": 4.992965651636411e-05, "loss": 0.14774703979492188, "memory(GiB)": 77.69, "step": 345, "token_acc": 0.962194035809711, "train_speed(iter/s)": 0.081292 }, { "epoch": 0.3699788583509514, "grad_norm": 0.3515625, "learning_rate": 4.9917658780018706e-05, "loss": 0.16488633155822754, "memory(GiB)": 77.69, "step": 350, "token_acc": 0.9567865003668379, "train_speed(iter/s)": 0.081382 }, { "epoch": 0.3752642706131078, "grad_norm": 0.13671875, "learning_rate": 4.990471849489778e-05, "loss": 0.10595872402191162, "memory(GiB)": 77.69, "step": 355, "token_acc": 0.9708169338581165, "train_speed(iter/s)": 0.081385 }, { "epoch": 0.38054968287526425, "grad_norm": 0.169921875, "learning_rate": 4.989083615048754e-05, "loss": 0.11339321136474609, "memory(GiB)": 77.69, "step": 360, "token_acc": 0.9697678771995507, "train_speed(iter/s)": 0.081353 }, { "epoch": 0.3858350951374207, "grad_norm": 0.193359375, "learning_rate": 4.9876012271909e-05, "loss": 0.11111799478530884, "memory(GiB)": 77.69, "step": 365, "token_acc": 0.9713587048175839, "train_speed(iter/s)": 0.081348 }, { "epoch": 0.39112050739957716, "grad_norm": 0.1982421875, "learning_rate": 4.9860247419898196e-05, "loss": 0.11626808643341065, "memory(GiB)": 77.69, "step": 370, "token_acc": 0.9689474349254485, "train_speed(iter/s)": 0.081304 }, { "epoch": 0.3964059196617336, "grad_norm": 0.1572265625, "learning_rate": 4.984354219078488e-05, "loss": 0.10674415826797486, "memory(GiB)": 77.69, "step": 375, "token_acc": 0.9697379836890508, "train_speed(iter/s)": 0.081383 }, { "epoch": 0.40169133192389006, "grad_norm": 0.1591796875, "learning_rate": 4.982589721647006e-05, "loss": 0.15452499389648439, "memory(GiB)": 77.69, "step": 380, "token_acc": 0.9586577003345811, "train_speed(iter/s)": 0.08138 }, { "epoch": 0.4069767441860465, "grad_norm": 0.25, "learning_rate": 4.980731316440202e-05, "loss": 0.1141701102256775, "memory(GiB)": 77.69, "step": 385, "token_acc": 0.9677666677315273, "train_speed(iter/s)": 0.081402 }, { "epoch": 0.41226215644820297, "grad_norm": 0.1962890625, "learning_rate": 4.9787790737551144e-05, "loss": 0.1376587986946106, "memory(GiB)": 77.69, "step": 390, "token_acc": 0.965669492881139, "train_speed(iter/s)": 0.081388 }, { "epoch": 0.4175475687103594, "grad_norm": 0.283203125, "learning_rate": 4.976733067438323e-05, "loss": 0.11453235149383545, "memory(GiB)": 77.69, "step": 395, "token_acc": 0.9680796567842735, "train_speed(iter/s)": 0.081276 }, { "epoch": 0.42283298097251587, "grad_norm": 0.28515625, "learning_rate": 4.9745933748831665e-05, "loss": 0.11970688104629516, "memory(GiB)": 77.69, "step": 400, "token_acc": 0.9685019663925635, "train_speed(iter/s)": 0.081339 }, { "epoch": 0.4281183932346723, "grad_norm": 0.2021484375, "learning_rate": 4.972360077026805e-05, "loss": 0.11000370979309082, "memory(GiB)": 77.69, "step": 405, "token_acc": 0.9692411695115768, "train_speed(iter/s)": 0.081341 }, { "epoch": 0.4334038054968288, "grad_norm": 0.193359375, "learning_rate": 4.9700332583471676e-05, "loss": 0.11478041410446167, "memory(GiB)": 77.69, "step": 410, "token_acc": 0.9686780992115839, "train_speed(iter/s)": 0.08145 }, { "epoch": 0.43868921775898523, "grad_norm": 0.169921875, "learning_rate": 4.96761300685975e-05, "loss": 0.12017009258270264, "memory(GiB)": 77.69, "step": 415, "token_acc": 0.9649507526482066, "train_speed(iter/s)": 0.08139 }, { "epoch": 0.4439746300211416, "grad_norm": 0.2255859375, "learning_rate": 4.965099414114287e-05, "loss": 0.08261418342590332, "memory(GiB)": 77.69, "step": 420, "token_acc": 0.9753066176952844, "train_speed(iter/s)": 0.081358 }, { "epoch": 0.4492600422832981, "grad_norm": 0.16015625, "learning_rate": 4.962492575191292e-05, "loss": 0.12857224941253662, "memory(GiB)": 77.69, "step": 425, "token_acc": 0.965159949924087, "train_speed(iter/s)": 0.0812 }, { "epoch": 0.45454545454545453, "grad_norm": 0.14453125, "learning_rate": 4.959792588698457e-05, "loss": 0.1270362377166748, "memory(GiB)": 77.69, "step": 430, "token_acc": 0.9667915423201473, "train_speed(iter/s)": 0.081239 }, { "epoch": 0.459830866807611, "grad_norm": 0.2177734375, "learning_rate": 4.956999556766924e-05, "loss": 0.16223521232604982, "memory(GiB)": 77.69, "step": 435, "token_acc": 0.9561979387265284, "train_speed(iter/s)": 0.081374 }, { "epoch": 0.46511627906976744, "grad_norm": 0.189453125, "learning_rate": 4.9541135850474234e-05, "loss": 0.115000319480896, "memory(GiB)": 77.69, "step": 440, "token_acc": 0.9689815665380611, "train_speed(iter/s)": 0.081281 }, { "epoch": 0.4704016913319239, "grad_norm": 0.2265625, "learning_rate": 4.9511347827062746e-05, "loss": 0.1402798295021057, "memory(GiB)": 77.69, "step": 445, "token_acc": 0.9626071250144115, "train_speed(iter/s)": 0.081426 }, { "epoch": 0.47568710359408034, "grad_norm": 0.275390625, "learning_rate": 4.94806326242126e-05, "loss": 0.09813187718391418, "memory(GiB)": 77.69, "step": 450, "token_acc": 0.9748599987956885, "train_speed(iter/s)": 0.081314 }, { "epoch": 0.4809725158562368, "grad_norm": 0.341796875, "learning_rate": 4.9448991403773575e-05, "loss": 0.13812012672424318, "memory(GiB)": 77.69, "step": 455, "token_acc": 0.9638653727028701, "train_speed(iter/s)": 0.081409 }, { "epoch": 0.48625792811839325, "grad_norm": 0.1611328125, "learning_rate": 4.941642536262352e-05, "loss": 0.11200442314147949, "memory(GiB)": 77.69, "step": 460, "token_acc": 0.9684644269431429, "train_speed(iter/s)": 0.081468 }, { "epoch": 0.4915433403805497, "grad_norm": 0.2392578125, "learning_rate": 4.9382935732623035e-05, "loss": 0.1466497778892517, "memory(GiB)": 77.69, "step": 465, "token_acc": 0.9616228070175439, "train_speed(iter/s)": 0.081386 }, { "epoch": 0.49682875264270615, "grad_norm": 0.1943359375, "learning_rate": 4.93485237805689e-05, "loss": 0.10534273386001587, "memory(GiB)": 77.69, "step": 470, "token_acc": 0.9706304442145313, "train_speed(iter/s)": 0.081409 }, { "epoch": 0.5021141649048626, "grad_norm": 0.294921875, "learning_rate": 4.931319080814614e-05, "loss": 0.11787580251693726, "memory(GiB)": 77.69, "step": 475, "token_acc": 0.9681176349331776, "train_speed(iter/s)": 0.081443 }, { "epoch": 0.507399577167019, "grad_norm": 0.330078125, "learning_rate": 4.9276938151878774e-05, "loss": 0.09996870756149293, "memory(GiB)": 77.69, "step": 480, "token_acc": 0.974033835664447, "train_speed(iter/s)": 0.081462 }, { "epoch": 0.5126849894291755, "grad_norm": 0.15625, "learning_rate": 4.923976718307932e-05, "loss": 0.09865926504135132, "memory(GiB)": 77.69, "step": 485, "token_acc": 0.9732607232454262, "train_speed(iter/s)": 0.081496 }, { "epoch": 0.5179704016913319, "grad_norm": 0.236328125, "learning_rate": 4.9201679307796834e-05, "loss": 0.09538254737854004, "memory(GiB)": 77.69, "step": 490, "token_acc": 0.9754637108004234, "train_speed(iter/s)": 0.081391 }, { "epoch": 0.5232558139534884, "grad_norm": 0.29296875, "learning_rate": 4.9162675966763807e-05, "loss": 0.11104313135147095, "memory(GiB)": 77.69, "step": 495, "token_acc": 0.969951279957194, "train_speed(iter/s)": 0.081332 }, { "epoch": 0.5285412262156448, "grad_norm": 0.2470703125, "learning_rate": 4.91227586353416e-05, "loss": 0.13056395053863526, "memory(GiB)": 77.69, "step": 500, "token_acc": 0.9666207168176447, "train_speed(iter/s)": 0.081353 }, { "epoch": 0.5338266384778013, "grad_norm": 0.2333984375, "learning_rate": 4.908192882346469e-05, "loss": 0.10588997602462769, "memory(GiB)": 77.69, "step": 505, "token_acc": 0.9706939718816203, "train_speed(iter/s)": 0.081404 }, { "epoch": 0.5391120507399577, "grad_norm": 0.318359375, "learning_rate": 4.904018807558352e-05, "loss": 0.1565835952758789, "memory(GiB)": 77.69, "step": 510, "token_acc": 0.9585944792639018, "train_speed(iter/s)": 0.08149 }, { "epoch": 0.5443974630021141, "grad_norm": 0.27734375, "learning_rate": 4.899753797060609e-05, "loss": 0.12696717977523803, "memory(GiB)": 77.69, "step": 515, "token_acc": 0.9631993991738641, "train_speed(iter/s)": 0.081544 }, { "epoch": 0.5496828752642706, "grad_norm": 0.28125, "learning_rate": 4.895398012183824e-05, "loss": 0.10460776090621948, "memory(GiB)": 77.69, "step": 520, "token_acc": 0.9696146839445915, "train_speed(iter/s)": 0.081598 }, { "epoch": 0.554968287526427, "grad_norm": 0.23828125, "learning_rate": 4.8909516176922584e-05, "loss": 0.11158976554870606, "memory(GiB)": 77.69, "step": 525, "token_acc": 0.9698326881858025, "train_speed(iter/s)": 0.081538 }, { "epoch": 0.5602536997885835, "grad_norm": 0.1875, "learning_rate": 4.886414781777627e-05, "loss": 0.100558340549469, "memory(GiB)": 77.69, "step": 530, "token_acc": 0.9732903010405654, "train_speed(iter/s)": 0.081487 }, { "epoch": 0.5655391120507399, "grad_norm": 0.2177734375, "learning_rate": 4.8817876760527254e-05, "loss": 0.1437574028968811, "memory(GiB)": 77.69, "step": 535, "token_acc": 0.9594058762350494, "train_speed(iter/s)": 0.08156 }, { "epoch": 0.5708245243128964, "grad_norm": 0.244140625, "learning_rate": 4.8770704755449456e-05, "loss": 0.07569544315338135, "memory(GiB)": 77.69, "step": 540, "token_acc": 0.9776643474868327, "train_speed(iter/s)": 0.081519 }, { "epoch": 0.5761099365750528, "grad_norm": 0.1787109375, "learning_rate": 4.872263358689656e-05, "loss": 0.09071049690246583, "memory(GiB)": 77.69, "step": 545, "token_acc": 0.9751284613188695, "train_speed(iter/s)": 0.081485 }, { "epoch": 0.5813953488372093, "grad_norm": 0.203125, "learning_rate": 4.867366507323444e-05, "loss": 0.1144001841545105, "memory(GiB)": 77.69, "step": 550, "token_acc": 0.9688177874186551, "train_speed(iter/s)": 0.081467 }, { "epoch": 0.5866807610993657, "grad_norm": 0.255859375, "learning_rate": 4.862380106677247e-05, "loss": 0.11915791034698486, "memory(GiB)": 77.69, "step": 555, "token_acc": 0.967741935483871, "train_speed(iter/s)": 0.081443 }, { "epoch": 0.5919661733615222, "grad_norm": 0.2109375, "learning_rate": 4.857304345369339e-05, "loss": 0.10802276134490967, "memory(GiB)": 77.69, "step": 560, "token_acc": 0.9713095952288527, "train_speed(iter/s)": 0.081475 }, { "epoch": 0.5972515856236786, "grad_norm": 0.2080078125, "learning_rate": 4.852139415398202e-05, "loss": 0.0878659188747406, "memory(GiB)": 77.69, "step": 565, "token_acc": 0.9754778914848427, "train_speed(iter/s)": 0.08136 }, { "epoch": 0.6025369978858351, "grad_norm": 0.390625, "learning_rate": 4.846885512135256e-05, "loss": 0.1495951533317566, "memory(GiB)": 77.69, "step": 570, "token_acc": 0.9591729270518852, "train_speed(iter/s)": 0.081416 }, { "epoch": 0.6078224101479915, "grad_norm": 0.1884765625, "learning_rate": 4.841542834317474e-05, "loss": 0.0931200623512268, "memory(GiB)": 77.69, "step": 575, "token_acc": 0.9749940864393607, "train_speed(iter/s)": 0.081415 }, { "epoch": 0.6131078224101479, "grad_norm": 0.26171875, "learning_rate": 4.836111584039865e-05, "loss": 0.11454503536224366, "memory(GiB)": 77.69, "step": 580, "token_acc": 0.9679121316697638, "train_speed(iter/s)": 0.081413 }, { "epoch": 0.6183932346723044, "grad_norm": 0.314453125, "learning_rate": 4.830591966747824e-05, "loss": 0.1001734733581543, "memory(GiB)": 77.69, "step": 585, "token_acc": 0.9707518182878924, "train_speed(iter/s)": 0.081481 }, { "epoch": 0.6236786469344608, "grad_norm": 0.333984375, "learning_rate": 4.824984191229368e-05, "loss": 0.12401694059371948, "memory(GiB)": 77.69, "step": 590, "token_acc": 0.9647066981001248, "train_speed(iter/s)": 0.081513 }, { "epoch": 0.6289640591966174, "grad_norm": 0.201171875, "learning_rate": 4.819288469607231e-05, "loss": 0.08301239609718322, "memory(GiB)": 77.69, "step": 595, "token_acc": 0.9759754865208462, "train_speed(iter/s)": 0.081466 }, { "epoch": 0.6342494714587738, "grad_norm": 0.466796875, "learning_rate": 4.8135050173308466e-05, "loss": 0.11875600814819336, "memory(GiB)": 77.69, "step": 600, "token_acc": 0.9698893895944285, "train_speed(iter/s)": 0.081479 }, { "epoch": 0.6395348837209303, "grad_norm": 0.2392578125, "learning_rate": 4.807634053168194e-05, "loss": 0.08307773470878602, "memory(GiB)": 77.69, "step": 605, "token_acc": 0.9767186981069412, "train_speed(iter/s)": 0.081495 }, { "epoch": 0.6448202959830867, "grad_norm": 0.37109375, "learning_rate": 4.801675799197524e-05, "loss": 0.11512783765792847, "memory(GiB)": 77.69, "step": 610, "token_acc": 0.9690937158294205, "train_speed(iter/s)": 0.081487 }, { "epoch": 0.6501057082452432, "grad_norm": 0.263671875, "learning_rate": 4.7956304807989585e-05, "loss": 0.059033775329589845, "memory(GiB)": 77.69, "step": 615, "token_acc": 0.9819070221974033, "train_speed(iter/s)": 0.081445 }, { "epoch": 0.6553911205073996, "grad_norm": 0.326171875, "learning_rate": 4.789498326645967e-05, "loss": 0.11581664085388184, "memory(GiB)": 77.69, "step": 620, "token_acc": 0.9700020977554017, "train_speed(iter/s)": 0.081398 }, { "epoch": 0.6606765327695561, "grad_norm": 0.2392578125, "learning_rate": 4.7832795686967126e-05, "loss": 0.11886576414108277, "memory(GiB)": 77.69, "step": 625, "token_acc": 0.9664377348710639, "train_speed(iter/s)": 0.081376 }, { "epoch": 0.6659619450317125, "grad_norm": 0.326171875, "learning_rate": 4.776974442185282e-05, "loss": 0.1024976134300232, "memory(GiB)": 77.69, "step": 630, "token_acc": 0.9723366282262493, "train_speed(iter/s)": 0.081371 }, { "epoch": 0.671247357293869, "grad_norm": 0.23046875, "learning_rate": 4.770583185612785e-05, "loss": 0.11690139770507812, "memory(GiB)": 77.69, "step": 635, "token_acc": 0.96782347315207, "train_speed(iter/s)": 0.081398 }, { "epoch": 0.6765327695560254, "grad_norm": 0.2392578125, "learning_rate": 4.7641060407383326e-05, "loss": 0.09218586087226868, "memory(GiB)": 77.69, "step": 640, "token_acc": 0.9743474491925386, "train_speed(iter/s)": 0.081383 }, { "epoch": 0.6818181818181818, "grad_norm": 0.423828125, "learning_rate": 4.757543252569895e-05, "loss": 0.09415521025657654, "memory(GiB)": 77.69, "step": 645, "token_acc": 0.971652904747862, "train_speed(iter/s)": 0.081397 }, { "epoch": 0.6871035940803383, "grad_norm": 0.287109375, "learning_rate": 4.75089506935503e-05, "loss": 0.07093398571014405, "memory(GiB)": 77.69, "step": 650, "token_acc": 0.979292014720486, "train_speed(iter/s)": 0.081316 }, { "epoch": 0.6923890063424947, "grad_norm": 0.33203125, "learning_rate": 4.744161742571495e-05, "loss": 0.09763147234916687, "memory(GiB)": 77.69, "step": 655, "token_acc": 0.9721920238027122, "train_speed(iter/s)": 0.081289 }, { "epoch": 0.6976744186046512, "grad_norm": 0.26953125, "learning_rate": 4.737343526917734e-05, "loss": 0.09553920626640319, "memory(GiB)": 77.69, "step": 660, "token_acc": 0.9735271467077397, "train_speed(iter/s)": 0.08123 }, { "epoch": 0.7029598308668076, "grad_norm": 0.326171875, "learning_rate": 4.730440680303242e-05, "loss": 0.12122514247894287, "memory(GiB)": 77.69, "step": 665, "token_acc": 0.9685226194242328, "train_speed(iter/s)": 0.081231 }, { "epoch": 0.7082452431289641, "grad_norm": 0.275390625, "learning_rate": 4.723453463838812e-05, "loss": 0.10825440883636475, "memory(GiB)": 77.69, "step": 670, "token_acc": 0.9692710408012292, "train_speed(iter/s)": 0.081174 }, { "epoch": 0.7135306553911205, "grad_norm": 0.32421875, "learning_rate": 4.7163821418266565e-05, "loss": 0.09260475039482116, "memory(GiB)": 77.69, "step": 675, "token_acc": 0.9731341283666086, "train_speed(iter/s)": 0.081208 }, { "epoch": 0.718816067653277, "grad_norm": 0.2578125, "learning_rate": 4.709226981750407e-05, "loss": 0.0953767478466034, "memory(GiB)": 77.69, "step": 680, "token_acc": 0.9742266494644304, "train_speed(iter/s)": 0.081191 }, { "epoch": 0.7241014799154334, "grad_norm": 0.361328125, "learning_rate": 4.7019882542650015e-05, "loss": 0.08152833580970764, "memory(GiB)": 77.69, "step": 685, "token_acc": 0.9772788504714863, "train_speed(iter/s)": 0.081179 }, { "epoch": 0.7293868921775899, "grad_norm": 0.255859375, "learning_rate": 4.694666233186443e-05, "loss": 0.09803698062896729, "memory(GiB)": 77.69, "step": 690, "token_acc": 0.9727941176470588, "train_speed(iter/s)": 0.081213 }, { "epoch": 0.7346723044397463, "grad_norm": 0.38671875, "learning_rate": 4.6872611954814426e-05, "loss": 0.09239259958267212, "memory(GiB)": 77.69, "step": 695, "token_acc": 0.974837453623774, "train_speed(iter/s)": 0.081222 }, { "epoch": 0.7399577167019028, "grad_norm": 0.482421875, "learning_rate": 4.6797734212569424e-05, "loss": 0.0974138081073761, "memory(GiB)": 77.69, "step": 700, "token_acc": 0.9747778733161364, "train_speed(iter/s)": 0.081267 }, { "epoch": 0.7452431289640592, "grad_norm": 0.2890625, "learning_rate": 4.672203193749522e-05, "loss": 0.07721266746520997, "memory(GiB)": 77.69, "step": 705, "token_acc": 0.977774820335374, "train_speed(iter/s)": 0.081282 }, { "epoch": 0.7505285412262156, "grad_norm": 0.322265625, "learning_rate": 4.664550799314682e-05, "loss": 0.09376782178878784, "memory(GiB)": 77.69, "step": 710, "token_acc": 0.9728926578241647, "train_speed(iter/s)": 0.081277 }, { "epoch": 0.7558139534883721, "grad_norm": 0.33984375, "learning_rate": 4.656816527416015e-05, "loss": 0.11421089172363282, "memory(GiB)": 77.69, "step": 715, "token_acc": 0.9664844230209311, "train_speed(iter/s)": 0.081321 }, { "epoch": 0.7610993657505285, "grad_norm": 0.349609375, "learning_rate": 4.649000670614252e-05, "loss": 0.13693536520004274, "memory(GiB)": 77.69, "step": 720, "token_acc": 0.9621859967931587, "train_speed(iter/s)": 0.081358 }, { "epoch": 0.766384778012685, "grad_norm": 0.2353515625, "learning_rate": 4.641103524556201e-05, "loss": 0.06772813200950623, "memory(GiB)": 77.69, "step": 725, "token_acc": 0.9814220839023151, "train_speed(iter/s)": 0.081356 }, { "epoch": 0.7716701902748414, "grad_norm": 0.287109375, "learning_rate": 4.6331253879635604e-05, "loss": 0.08766214251518249, "memory(GiB)": 77.69, "step": 730, "token_acc": 0.9775230049781264, "train_speed(iter/s)": 0.081225 }, { "epoch": 0.7769556025369979, "grad_norm": 0.255859375, "learning_rate": 4.62506656262162e-05, "loss": 0.06932131052017212, "memory(GiB)": 77.69, "step": 735, "token_acc": 0.97956741556227, "train_speed(iter/s)": 0.081175 }, { "epoch": 0.7822410147991543, "grad_norm": 0.275390625, "learning_rate": 4.6169273533678446e-05, "loss": 0.11336870193481445, "memory(GiB)": 77.69, "step": 740, "token_acc": 0.9693814061630154, "train_speed(iter/s)": 0.081198 }, { "epoch": 0.7875264270613108, "grad_norm": 0.494140625, "learning_rate": 4.608708068080348e-05, "loss": 0.0770769476890564, "memory(GiB)": 77.69, "step": 745, "token_acc": 0.978170163874947, "train_speed(iter/s)": 0.081169 }, { "epoch": 0.7928118393234672, "grad_norm": 0.265625, "learning_rate": 4.600409017666236e-05, "loss": 0.056537413597106935, "memory(GiB)": 77.69, "step": 750, "token_acc": 0.9828174186778594, "train_speed(iter/s)": 0.081191 }, { "epoch": 0.7980972515856237, "grad_norm": 0.265625, "learning_rate": 4.592030516049861e-05, "loss": 0.09012222290039062, "memory(GiB)": 77.69, "step": 755, "token_acc": 0.9759961714775074, "train_speed(iter/s)": 0.081098 }, { "epoch": 0.8033826638477801, "grad_norm": 0.27734375, "learning_rate": 4.583572880160935e-05, "loss": 0.08086287975311279, "memory(GiB)": 77.69, "step": 760, "token_acc": 0.9775874059556173, "train_speed(iter/s)": 0.081084 }, { "epoch": 0.8086680761099366, "grad_norm": 0.259765625, "learning_rate": 4.575036429922546e-05, "loss": 0.06636863350868225, "memory(GiB)": 77.69, "step": 765, "token_acc": 0.9798997772828508, "train_speed(iter/s)": 0.081035 }, { "epoch": 0.813953488372093, "grad_norm": 0.412109375, "learning_rate": 4.5664214882390556e-05, "loss": 0.11157206296920777, "memory(GiB)": 77.69, "step": 770, "token_acc": 0.9686773802899878, "train_speed(iter/s)": 0.081041 }, { "epoch": 0.8192389006342494, "grad_norm": 0.2431640625, "learning_rate": 4.5577283809838865e-05, "loss": 0.07278598546981811, "memory(GiB)": 77.69, "step": 775, "token_acc": 0.9779526534859522, "train_speed(iter/s)": 0.081058 }, { "epoch": 0.8245243128964059, "grad_norm": 0.3359375, "learning_rate": 4.5489574369871933e-05, "loss": 0.07560226917266846, "memory(GiB)": 77.69, "step": 780, "token_acc": 0.9765994274682112, "train_speed(iter/s)": 0.081075 }, { "epoch": 0.8298097251585623, "grad_norm": 0.32421875, "learning_rate": 4.5401089880234265e-05, "loss": 0.055092573165893555, "memory(GiB)": 77.69, "step": 785, "token_acc": 0.9828075608025288, "train_speed(iter/s)": 0.081068 }, { "epoch": 0.8350951374207188, "grad_norm": 0.431640625, "learning_rate": 4.531183368798779e-05, "loss": 0.092472904920578, "memory(GiB)": 77.69, "step": 790, "token_acc": 0.9737849400109363, "train_speed(iter/s)": 0.081132 }, { "epoch": 0.8403805496828752, "grad_norm": 0.4453125, "learning_rate": 4.52218091693853e-05, "loss": 0.08904616236686706, "memory(GiB)": 77.69, "step": 795, "token_acc": 0.9736369680851064, "train_speed(iter/s)": 0.081111 }, { "epoch": 0.8456659619450317, "grad_norm": 0.333984375, "learning_rate": 4.5131019729742686e-05, "loss": 0.06626922488212586, "memory(GiB)": 77.69, "step": 800, "token_acc": 0.9800637259200795, "train_speed(iter/s)": 0.081094 }, { "epoch": 0.8509513742071881, "grad_norm": 0.298828125, "learning_rate": 4.503946880331018e-05, "loss": 0.07227307558059692, "memory(GiB)": 77.69, "step": 805, "token_acc": 0.9797429167507907, "train_speed(iter/s)": 0.081099 }, { "epoch": 0.8562367864693446, "grad_norm": 0.33203125, "learning_rate": 4.49471598531424e-05, "loss": 0.08108741641044617, "memory(GiB)": 77.69, "step": 810, "token_acc": 0.977376622509927, "train_speed(iter/s)": 0.081105 }, { "epoch": 0.861522198731501, "grad_norm": 0.427734375, "learning_rate": 4.48540963709674e-05, "loss": 0.06158255934715271, "memory(GiB)": 77.69, "step": 815, "token_acc": 0.9824003699101234, "train_speed(iter/s)": 0.081069 }, { "epoch": 0.8668076109936576, "grad_norm": 0.3828125, "learning_rate": 4.476028187705456e-05, "loss": 0.07041846513748169, "memory(GiB)": 77.69, "step": 820, "token_acc": 0.9795444423235986, "train_speed(iter/s)": 0.081055 }, { "epoch": 0.872093023255814, "grad_norm": 0.55078125, "learning_rate": 4.466571992008143e-05, "loss": 0.04567690789699554, "memory(GiB)": 77.69, "step": 825, "token_acc": 0.9864392728134067, "train_speed(iter/s)": 0.08103 }, { "epoch": 0.8773784355179705, "grad_norm": 0.4375, "learning_rate": 4.457041407699949e-05, "loss": 0.09166967868804932, "memory(GiB)": 77.69, "step": 830, "token_acc": 0.9738362215426436, "train_speed(iter/s)": 0.081058 }, { "epoch": 0.8826638477801269, "grad_norm": 0.486328125, "learning_rate": 4.4474367952898884e-05, "loss": 0.06858194470405579, "memory(GiB)": 77.69, "step": 835, "token_acc": 0.979127198917456, "train_speed(iter/s)": 0.081059 }, { "epoch": 0.8879492600422833, "grad_norm": 0.234375, "learning_rate": 4.4377585180872e-05, "loss": 0.1041705846786499, "memory(GiB)": 77.69, "step": 840, "token_acc": 0.9736306103149778, "train_speed(iter/s)": 0.081079 }, { "epoch": 0.8932346723044398, "grad_norm": 0.3046875, "learning_rate": 4.428006942187608e-05, "loss": 0.05868891477584839, "memory(GiB)": 77.69, "step": 845, "token_acc": 0.9820925427337048, "train_speed(iter/s)": 0.081106 }, { "epoch": 0.8985200845665962, "grad_norm": 0.330078125, "learning_rate": 4.4181824364594724e-05, "loss": 0.0798562228679657, "memory(GiB)": 77.69, "step": 850, "token_acc": 0.9758835263643671, "train_speed(iter/s)": 0.08107 }, { "epoch": 0.9038054968287527, "grad_norm": 0.341796875, "learning_rate": 4.408285372529836e-05, "loss": 0.05559794902801514, "memory(GiB)": 77.69, "step": 855, "token_acc": 0.9831912152628673, "train_speed(iter/s)": 0.081061 }, { "epoch": 0.9090909090909091, "grad_norm": 0.357421875, "learning_rate": 4.398316124770366e-05, "loss": 0.06730075478553772, "memory(GiB)": 77.69, "step": 860, "token_acc": 0.9808792175708145, "train_speed(iter/s)": 0.081083 }, { "epoch": 0.9143763213530656, "grad_norm": 0.490234375, "learning_rate": 4.388275070283196e-05, "loss": 0.048085132241249086, "memory(GiB)": 77.69, "step": 865, "token_acc": 0.9845503671885825, "train_speed(iter/s)": 0.081091 }, { "epoch": 0.919661733615222, "grad_norm": 0.40625, "learning_rate": 4.3781625888866586e-05, "loss": 0.0662156879901886, "memory(GiB)": 77.69, "step": 870, "token_acc": 0.980640958024385, "train_speed(iter/s)": 0.081069 }, { "epoch": 0.9249471458773785, "grad_norm": 0.453125, "learning_rate": 4.367979063100919e-05, "loss": 0.07822101712226867, "memory(GiB)": 77.69, "step": 875, "token_acc": 0.9755400666695528, "train_speed(iter/s)": 0.081135 }, { "epoch": 0.9302325581395349, "grad_norm": 0.33984375, "learning_rate": 4.3577248781335064e-05, "loss": 0.09637467861175537, "memory(GiB)": 77.69, "step": 880, "token_acc": 0.9725105787086364, "train_speed(iter/s)": 0.081126 }, { "epoch": 0.9355179704016914, "grad_norm": 0.5, "learning_rate": 4.3474004218647415e-05, "loss": 0.08438252210617066, "memory(GiB)": 77.69, "step": 885, "token_acc": 0.9742904841402337, "train_speed(iter/s)": 0.081161 }, { "epoch": 0.9408033826638478, "grad_norm": 0.30078125, "learning_rate": 4.3370060848330646e-05, "loss": 0.06415098309516906, "memory(GiB)": 77.69, "step": 890, "token_acc": 0.9815889598245954, "train_speed(iter/s)": 0.08118 }, { "epoch": 0.9460887949260042, "grad_norm": 0.337890625, "learning_rate": 4.326542260220265e-05, "loss": 0.09511008858680725, "memory(GiB)": 77.69, "step": 895, "token_acc": 0.9751622490447018, "train_speed(iter/s)": 0.081133 }, { "epoch": 0.9513742071881607, "grad_norm": 0.3984375, "learning_rate": 4.316009343836605e-05, "loss": 0.08048057556152344, "memory(GiB)": 77.69, "step": 900, "token_acc": 0.9757339513976682, "train_speed(iter/s)": 0.081138 }, { "epoch": 0.9566596194503171, "grad_norm": 0.369140625, "learning_rate": 4.3054077341058516e-05, "loss": 0.10019546747207642, "memory(GiB)": 77.69, "step": 905, "token_acc": 0.9727745142340714, "train_speed(iter/s)": 0.081151 }, { "epoch": 0.9619450317124736, "grad_norm": 0.390625, "learning_rate": 4.2947378320501995e-05, "loss": 0.09644404649734498, "memory(GiB)": 77.69, "step": 910, "token_acc": 0.9723212203976951, "train_speed(iter/s)": 0.081129 }, { "epoch": 0.96723044397463, "grad_norm": 0.36328125, "learning_rate": 4.28400004127511e-05, "loss": 0.06356018185615539, "memory(GiB)": 77.69, "step": 915, "token_acc": 0.9803577571379429, "train_speed(iter/s)": 0.081196 }, { "epoch": 0.9725158562367865, "grad_norm": 0.326171875, "learning_rate": 4.273194767954037e-05, "loss": 0.07740767598152161, "memory(GiB)": 77.69, "step": 920, "token_acc": 0.9758036772294637, "train_speed(iter/s)": 0.081214 }, { "epoch": 0.9778012684989429, "grad_norm": 0.5234375, "learning_rate": 4.2623224208130674e-05, "loss": 0.07831965684890747, "memory(GiB)": 77.69, "step": 925, "token_acc": 0.9774587221823403, "train_speed(iter/s)": 0.081192 }, { "epoch": 0.9830866807610994, "grad_norm": 0.376953125, "learning_rate": 4.251383411115455e-05, "loss": 0.046628785133361814, "memory(GiB)": 77.69, "step": 930, "token_acc": 0.9856247824573616, "train_speed(iter/s)": 0.08118 }, { "epoch": 0.9883720930232558, "grad_norm": 0.42578125, "learning_rate": 4.2403781526460715e-05, "loss": 0.0668062448501587, "memory(GiB)": 77.69, "step": 935, "token_acc": 0.9794814917510052, "train_speed(iter/s)": 0.081225 }, { "epoch": 0.9936575052854123, "grad_norm": 0.302734375, "learning_rate": 4.229307061695747e-05, "loss": 0.056691086292266844, "memory(GiB)": 77.69, "step": 940, "token_acc": 0.982403914171186, "train_speed(iter/s)": 0.081237 }, { "epoch": 0.9989429175475687, "grad_norm": 0.41015625, "learning_rate": 4.21817055704553e-05, "loss": 0.06192230582237244, "memory(GiB)": 77.69, "step": 945, "token_acc": 0.9823213888977053, "train_speed(iter/s)": 0.081246 }, { "epoch": 1.0, "eval_bleu-4": 38.124388, "eval_rouge-1": 58.614495, "eval_rouge-2": 47.757536, "eval_rouge-l": 52.039578, "eval_runtime": 554.2046, "eval_samples_per_second": 0.55, "eval_steps_per_second": 0.55, "step": 946 }, { "epoch": 1.0042283298097252, "grad_norm": 0.2109375, "learning_rate": 4.206969059950838e-05, "loss": 0.07877237796783447, "memory(GiB)": 77.69, "step": 950, "token_acc": 0.9774861742858078, "train_speed(iter/s)": 0.077594 }, { "epoch": 1.0095137420718816, "grad_norm": 0.2734375, "learning_rate": 4.195702994125533e-05, "loss": 0.07364320158958435, "memory(GiB)": 77.69, "step": 955, "token_acc": 0.9786994331584692, "train_speed(iter/s)": 0.077614 }, { "epoch": 1.014799154334038, "grad_norm": 0.2734375, "learning_rate": 4.184372785725885e-05, "loss": 0.05215795636177063, "memory(GiB)": 77.69, "step": 960, "token_acc": 0.9840337109057873, "train_speed(iter/s)": 0.077676 }, { "epoch": 1.0200845665961944, "grad_norm": 0.462890625, "learning_rate": 4.172978863334457e-05, "loss": 0.06663031578063965, "memory(GiB)": 77.69, "step": 965, "token_acc": 0.9795414671860596, "train_speed(iter/s)": 0.07768 }, { "epoch": 1.025369978858351, "grad_norm": 0.396484375, "learning_rate": 4.1615216579438906e-05, "loss": 0.05644761919975281, "memory(GiB)": 77.69, "step": 970, "token_acc": 0.9826387861938423, "train_speed(iter/s)": 0.077717 }, { "epoch": 1.0306553911205074, "grad_norm": 0.232421875, "learning_rate": 4.1500016029406046e-05, "loss": 0.0393169492483139, "memory(GiB)": 77.69, "step": 975, "token_acc": 0.9871817696279153, "train_speed(iter/s)": 0.077637 }, { "epoch": 1.0359408033826638, "grad_norm": 0.419921875, "learning_rate": 4.138419134088403e-05, "loss": 0.08201128244400024, "memory(GiB)": 77.69, "step": 980, "token_acc": 0.9763346992101669, "train_speed(iter/s)": 0.077646 }, { "epoch": 1.0412262156448202, "grad_norm": 0.353515625, "learning_rate": 4.126774689511987e-05, "loss": 0.04808052778244019, "memory(GiB)": 77.69, "step": 985, "token_acc": 0.9849737261616579, "train_speed(iter/s)": 0.077653 }, { "epoch": 1.0465116279069768, "grad_norm": 0.4140625, "learning_rate": 4.115068709680386e-05, "loss": 0.05080167055130005, "memory(GiB)": 77.69, "step": 990, "token_acc": 0.9844046057426031, "train_speed(iter/s)": 0.077626 }, { "epoch": 1.0517970401691332, "grad_norm": 0.4140625, "learning_rate": 4.103301637390296e-05, "loss": 0.08551814556121826, "memory(GiB)": 77.69, "step": 995, "token_acc": 0.9754455445544554, "train_speed(iter/s)": 0.07761 }, { "epoch": 1.0570824524312896, "grad_norm": 0.30078125, "learning_rate": 4.0914739177493264e-05, "loss": 0.049226969480514526, "memory(GiB)": 77.69, "step": 1000, "token_acc": 0.9829714502753687, "train_speed(iter/s)": 0.077602 }, { "epoch": 1.062367864693446, "grad_norm": 0.46875, "learning_rate": 4.0795859981591695e-05, "loss": 0.06593365073204041, "memory(GiB)": 77.69, "step": 1005, "token_acc": 0.9804994128190865, "train_speed(iter/s)": 0.077606 }, { "epoch": 1.0676532769556026, "grad_norm": 0.34765625, "learning_rate": 4.067638328298671e-05, "loss": 0.05800300240516663, "memory(GiB)": 77.69, "step": 1010, "token_acc": 0.9832294319079814, "train_speed(iter/s)": 0.077614 }, { "epoch": 1.072938689217759, "grad_norm": 0.41796875, "learning_rate": 4.055631360106823e-05, "loss": 0.05422772169113159, "memory(GiB)": 77.69, "step": 1015, "token_acc": 0.9833902230891137, "train_speed(iter/s)": 0.077605 }, { "epoch": 1.0782241014799154, "grad_norm": 0.4453125, "learning_rate": 4.0435655477656685e-05, "loss": 0.06057540774345398, "memory(GiB)": 77.69, "step": 1020, "token_acc": 0.980710359129662, "train_speed(iter/s)": 0.077638 }, { "epoch": 1.0835095137420718, "grad_norm": 0.36328125, "learning_rate": 4.0314413476831204e-05, "loss": 0.045259985327720645, "memory(GiB)": 77.69, "step": 1025, "token_acc": 0.9848457771900354, "train_speed(iter/s)": 0.077677 }, { "epoch": 1.0887949260042284, "grad_norm": 0.380859375, "learning_rate": 4.019259218475699e-05, "loss": 0.06250202655792236, "memory(GiB)": 77.69, "step": 1030, "token_acc": 0.9815415697318477, "train_speed(iter/s)": 0.077683 }, { "epoch": 1.0940803382663848, "grad_norm": 0.390625, "learning_rate": 4.0070196209511814e-05, "loss": 0.042073619365692136, "memory(GiB)": 77.69, "step": 1035, "token_acc": 0.9870359979919086, "train_speed(iter/s)": 0.077695 }, { "epoch": 1.0993657505285412, "grad_norm": 0.5625, "learning_rate": 3.9947230180911734e-05, "loss": 0.050903499126434326, "memory(GiB)": 77.69, "step": 1040, "token_acc": 0.9818915284625729, "train_speed(iter/s)": 0.077748 }, { "epoch": 1.1046511627906976, "grad_norm": 0.443359375, "learning_rate": 3.982369875033593e-05, "loss": 0.072785484790802, "memory(GiB)": 77.69, "step": 1045, "token_acc": 0.9779370034052214, "train_speed(iter/s)": 0.077788 }, { "epoch": 1.109936575052854, "grad_norm": 0.671875, "learning_rate": 3.969960659055082e-05, "loss": 0.046435147523880005, "memory(GiB)": 77.69, "step": 1050, "token_acc": 0.9857312351692269, "train_speed(iter/s)": 0.077777 }, { "epoch": 1.1152219873150107, "grad_norm": 0.2421875, "learning_rate": 3.957495839553322e-05, "loss": 0.05626019239425659, "memory(GiB)": 77.69, "step": 1055, "token_acc": 0.9830789038072466, "train_speed(iter/s)": 0.077797 }, { "epoch": 1.120507399577167, "grad_norm": 0.287109375, "learning_rate": 3.944975888029288e-05, "loss": 0.043729889392852786, "memory(GiB)": 77.69, "step": 1060, "token_acc": 0.9860346747256243, "train_speed(iter/s)": 0.077818 }, { "epoch": 1.1257928118393234, "grad_norm": 0.306640625, "learning_rate": 3.932401278069408e-05, "loss": 0.04030815064907074, "memory(GiB)": 77.69, "step": 1065, "token_acc": 0.9871183046888036, "train_speed(iter/s)": 0.077838 }, { "epoch": 1.1310782241014798, "grad_norm": 0.53515625, "learning_rate": 3.919772485327644e-05, "loss": 0.052184182405471805, "memory(GiB)": 77.69, "step": 1070, "token_acc": 0.9839149494321908, "train_speed(iter/s)": 0.077858 }, { "epoch": 1.1363636363636362, "grad_norm": 0.2314453125, "learning_rate": 3.9070899875075114e-05, "loss": 0.044597327709198, "memory(GiB)": 77.69, "step": 1075, "token_acc": 0.9874493088239648, "train_speed(iter/s)": 0.077869 }, { "epoch": 1.1416490486257929, "grad_norm": 0.466796875, "learning_rate": 3.894354264344e-05, "loss": 0.04796979427337646, "memory(GiB)": 77.69, "step": 1080, "token_acc": 0.9855796077916106, "train_speed(iter/s)": 0.077888 }, { "epoch": 1.1469344608879493, "grad_norm": 0.357421875, "learning_rate": 3.881565797585431e-05, "loss": 0.0906874418258667, "memory(GiB)": 77.69, "step": 1085, "token_acc": 0.9743463937758463, "train_speed(iter/s)": 0.077913 }, { "epoch": 1.1522198731501057, "grad_norm": 0.421875, "learning_rate": 3.8687250709752306e-05, "loss": 0.05481817722320557, "memory(GiB)": 77.69, "step": 1090, "token_acc": 0.9835675969308073, "train_speed(iter/s)": 0.07794 }, { "epoch": 1.1575052854122623, "grad_norm": 0.455078125, "learning_rate": 3.85583257023364e-05, "loss": 0.05167987942695618, "memory(GiB)": 77.69, "step": 1095, "token_acc": 0.9852494214092519, "train_speed(iter/s)": 0.077934 }, { "epoch": 1.1627906976744187, "grad_norm": 0.380859375, "learning_rate": 3.8428887830393295e-05, "loss": 0.056168985366821286, "memory(GiB)": 77.69, "step": 1100, "token_acc": 0.9826521814105881, "train_speed(iter/s)": 0.077947 }, { "epoch": 1.168076109936575, "grad_norm": 0.41015625, "learning_rate": 3.829894199010964e-05, "loss": 0.0756365954875946, "memory(GiB)": 77.69, "step": 1105, "token_acc": 0.9783764829641751, "train_speed(iter/s)": 0.077941 }, { "epoch": 1.1733615221987315, "grad_norm": 0.484375, "learning_rate": 3.816849309688673e-05, "loss": 0.0505562424659729, "memory(GiB)": 77.69, "step": 1110, "token_acc": 0.9833694360981063, "train_speed(iter/s)": 0.077949 }, { "epoch": 1.1786469344608879, "grad_norm": 0.37109375, "learning_rate": 3.8037546085154643e-05, "loss": 0.047308170795440675, "memory(GiB)": 77.69, "step": 1115, "token_acc": 0.9846572794899043, "train_speed(iter/s)": 0.07798 }, { "epoch": 1.1839323467230445, "grad_norm": 0.3828125, "learning_rate": 3.7906105908185534e-05, "loss": 0.03376719355583191, "memory(GiB)": 77.69, "step": 1120, "token_acc": 0.9891336560308209, "train_speed(iter/s)": 0.077997 }, { "epoch": 1.1892177589852009, "grad_norm": 0.423828125, "learning_rate": 3.77741775379063e-05, "loss": 0.038771599531173706, "memory(GiB)": 77.69, "step": 1125, "token_acc": 0.9876621427856881, "train_speed(iter/s)": 0.078 }, { "epoch": 1.1945031712473573, "grad_norm": 0.625, "learning_rate": 3.764176596471049e-05, "loss": 0.05943330526351929, "memory(GiB)": 77.69, "step": 1130, "token_acc": 0.9834041423260754, "train_speed(iter/s)": 0.078062 }, { "epoch": 1.1997885835095137, "grad_norm": 0.33984375, "learning_rate": 3.750887619726957e-05, "loss": 0.04846176207065582, "memory(GiB)": 77.69, "step": 1135, "token_acc": 0.9846868207864059, "train_speed(iter/s)": 0.078069 }, { "epoch": 1.20507399577167, "grad_norm": 0.33203125, "learning_rate": 3.737551326234342e-05, "loss": 0.07271291613578797, "memory(GiB)": 77.69, "step": 1140, "token_acc": 0.9787466289970463, "train_speed(iter/s)": 0.078054 }, { "epoch": 1.2103594080338267, "grad_norm": 0.33984375, "learning_rate": 3.724168220459021e-05, "loss": 0.05263459086418152, "memory(GiB)": 77.69, "step": 1145, "token_acc": 0.9843895700224486, "train_speed(iter/s)": 0.078045 }, { "epoch": 1.215644820295983, "grad_norm": 0.62109375, "learning_rate": 3.710738808637558e-05, "loss": 0.049906060099601746, "memory(GiB)": 77.69, "step": 1150, "token_acc": 0.984859559360969, "train_speed(iter/s)": 0.078088 }, { "epoch": 1.2209302325581395, "grad_norm": 0.482421875, "learning_rate": 3.697263598758114e-05, "loss": 0.05486075878143311, "memory(GiB)": 77.69, "step": 1155, "token_acc": 0.9830310122878877, "train_speed(iter/s)": 0.078141 }, { "epoch": 1.226215644820296, "grad_norm": 0.56640625, "learning_rate": 3.683743100541233e-05, "loss": 0.050484615564346316, "memory(GiB)": 77.69, "step": 1160, "token_acc": 0.9847595926899994, "train_speed(iter/s)": 0.07817 }, { "epoch": 1.2315010570824525, "grad_norm": 0.5078125, "learning_rate": 3.670177825420559e-05, "loss": 0.04356261491775513, "memory(GiB)": 77.69, "step": 1165, "token_acc": 0.986015037593985, "train_speed(iter/s)": 0.078189 }, { "epoch": 1.236786469344609, "grad_norm": 0.39453125, "learning_rate": 3.656568286523492e-05, "loss": 0.045412346720695496, "memory(GiB)": 77.69, "step": 1170, "token_acc": 0.9871712071395021, "train_speed(iter/s)": 0.078194 }, { "epoch": 1.2420718816067653, "grad_norm": 0.4609375, "learning_rate": 3.642914998651776e-05, "loss": 0.04921023845672608, "memory(GiB)": 77.69, "step": 1175, "token_acc": 0.9846773460921177, "train_speed(iter/s)": 0.078188 }, { "epoch": 1.2473572938689217, "grad_norm": 0.62109375, "learning_rate": 3.6292184782620294e-05, "loss": 0.046738225221633914, "memory(GiB)": 77.69, "step": 1180, "token_acc": 0.9854931360140201, "train_speed(iter/s)": 0.078201 }, { "epoch": 1.2526427061310783, "grad_norm": 0.67578125, "learning_rate": 3.6154792434462057e-05, "loss": 0.07521597146987916, "memory(GiB)": 77.69, "step": 1185, "token_acc": 0.9785143380983757, "train_speed(iter/s)": 0.078201 }, { "epoch": 1.2579281183932347, "grad_norm": 0.48828125, "learning_rate": 3.6016978139119975e-05, "loss": 0.040938377380371094, "memory(GiB)": 77.69, "step": 1190, "token_acc": 0.9864248132321735, "train_speed(iter/s)": 0.0782 }, { "epoch": 1.263213530655391, "grad_norm": 0.408203125, "learning_rate": 3.587874710963178e-05, "loss": 0.03505093455314636, "memory(GiB)": 77.69, "step": 1195, "token_acc": 0.9881515814211574, "train_speed(iter/s)": 0.078205 }, { "epoch": 1.2684989429175475, "grad_norm": 0.46484375, "learning_rate": 3.5740104574798806e-05, "loss": 0.07729544639587402, "memory(GiB)": 77.69, "step": 1200, "token_acc": 0.9786948833494952, "train_speed(iter/s)": 0.078231 }, { "epoch": 1.273784355179704, "grad_norm": 0.310546875, "learning_rate": 3.560105577898821e-05, "loss": 0.043123260140419006, "memory(GiB)": 77.69, "step": 1205, "token_acc": 0.986168100595632, "train_speed(iter/s)": 0.078258 }, { "epoch": 1.2790697674418605, "grad_norm": 0.5625, "learning_rate": 3.546160598193461e-05, "loss": 0.03295296728610993, "memory(GiB)": 77.69, "step": 1210, "token_acc": 0.990035103612275, "train_speed(iter/s)": 0.078249 }, { "epoch": 1.284355179704017, "grad_norm": 0.482421875, "learning_rate": 3.53217604585411e-05, "loss": 0.05090029239654541, "memory(GiB)": 77.69, "step": 1215, "token_acc": 0.9842664563854155, "train_speed(iter/s)": 0.078271 }, { "epoch": 1.2896405919661733, "grad_norm": 0.515625, "learning_rate": 3.518152449867974e-05, "loss": 0.05112537145614624, "memory(GiB)": 77.69, "step": 1220, "token_acc": 0.9844214700467356, "train_speed(iter/s)": 0.078226 }, { "epoch": 1.29492600422833, "grad_norm": 0.484375, "learning_rate": 3.504090340699142e-05, "loss": 0.03835583031177521, "memory(GiB)": 77.69, "step": 1225, "token_acc": 0.9872772700374778, "train_speed(iter/s)": 0.078233 }, { "epoch": 1.3002114164904863, "grad_norm": 0.3515625, "learning_rate": 3.489990250268528e-05, "loss": 0.03968880772590637, "memory(GiB)": 77.69, "step": 1230, "token_acc": 0.9869090122254679, "train_speed(iter/s)": 0.078265 }, { "epoch": 1.3054968287526427, "grad_norm": 0.8984375, "learning_rate": 3.475852711933744e-05, "loss": 0.05279853343963623, "memory(GiB)": 77.69, "step": 1235, "token_acc": 0.9844943300161999, "train_speed(iter/s)": 0.078289 }, { "epoch": 1.3107822410147991, "grad_norm": 0.55078125, "learning_rate": 3.461678260468923e-05, "loss": 0.03934960067272186, "memory(GiB)": 77.69, "step": 1240, "token_acc": 0.9875799774181407, "train_speed(iter/s)": 0.078276 }, { "epoch": 1.3160676532769555, "grad_norm": 0.53125, "learning_rate": 3.447467432044501e-05, "loss": 0.05894434452056885, "memory(GiB)": 77.69, "step": 1245, "token_acc": 0.981131427000275, "train_speed(iter/s)": 0.078301 }, { "epoch": 1.3213530655391121, "grad_norm": 0.326171875, "learning_rate": 3.433220764206921e-05, "loss": 0.05034686923027039, "memory(GiB)": 77.69, "step": 1250, "token_acc": 0.9836901763224182, "train_speed(iter/s)": 0.078303 }, { "epoch": 1.3266384778012685, "grad_norm": 0.396484375, "learning_rate": 3.418938795858313e-05, "loss": 0.039085912704467776, "memory(GiB)": 77.69, "step": 1255, "token_acc": 0.9870506809555705, "train_speed(iter/s)": 0.07831 }, { "epoch": 1.331923890063425, "grad_norm": 0.365234375, "learning_rate": 3.404622067236098e-05, "loss": 0.039952915906906125, "memory(GiB)": 77.69, "step": 1260, "token_acc": 0.9882705813499805, "train_speed(iter/s)": 0.078271 }, { "epoch": 1.3372093023255813, "grad_norm": 0.421875, "learning_rate": 3.390271119892562e-05, "loss": 0.04313199520111084, "memory(GiB)": 77.69, "step": 1265, "token_acc": 0.9857399350873558, "train_speed(iter/s)": 0.078298 }, { "epoch": 1.3424947145877377, "grad_norm": 0.4375, "learning_rate": 3.375886496674364e-05, "loss": 0.03943063616752625, "memory(GiB)": 77.69, "step": 1270, "token_acc": 0.9873038564961367, "train_speed(iter/s)": 0.078292 }, { "epoch": 1.3477801268498943, "grad_norm": 0.75, "learning_rate": 3.3614687417020076e-05, "loss": 0.051702433824539186, "memory(GiB)": 77.69, "step": 1275, "token_acc": 0.9846246715432143, "train_speed(iter/s)": 0.078314 }, { "epoch": 1.3530655391120507, "grad_norm": 0.2890625, "learning_rate": 3.347018400349252e-05, "loss": 0.038116741180419925, "memory(GiB)": 77.69, "step": 1280, "token_acc": 0.9880043620501636, "train_speed(iter/s)": 0.078274 }, { "epoch": 1.3583509513742071, "grad_norm": 0.37890625, "learning_rate": 3.33253601922249e-05, "loss": 0.04347434639930725, "memory(GiB)": 77.69, "step": 1285, "token_acc": 0.9872196329740751, "train_speed(iter/s)": 0.07827 }, { "epoch": 1.3636363636363638, "grad_norm": 0.439453125, "learning_rate": 3.3180221461400675e-05, "loss": 0.05601614713668823, "memory(GiB)": 77.69, "step": 1290, "token_acc": 0.9820909778326103, "train_speed(iter/s)": 0.078308 }, { "epoch": 1.3689217758985202, "grad_norm": 0.384765625, "learning_rate": 3.3034773301115595e-05, "loss": 0.04715239107608795, "memory(GiB)": 77.69, "step": 1295, "token_acc": 0.9846506438285938, "train_speed(iter/s)": 0.078311 }, { "epoch": 1.3742071881606766, "grad_norm": 0.578125, "learning_rate": 3.288902121317008e-05, "loss": 0.07717665433883666, "memory(GiB)": 77.69, "step": 1300, "token_acc": 0.9763774658339002, "train_speed(iter/s)": 0.078335 }, { "epoch": 1.379492600422833, "grad_norm": 0.234375, "learning_rate": 3.274297071086108e-05, "loss": 0.02975306212902069, "memory(GiB)": 77.69, "step": 1305, "token_acc": 0.9900385328956758, "train_speed(iter/s)": 0.078313 }, { "epoch": 1.3847780126849893, "grad_norm": 0.169921875, "learning_rate": 3.259662731877351e-05, "loss": 0.04149012267589569, "memory(GiB)": 77.69, "step": 1310, "token_acc": 0.9882310634541829, "train_speed(iter/s)": 0.078339 }, { "epoch": 1.390063424947146, "grad_norm": 0.498046875, "learning_rate": 3.244999657257131e-05, "loss": 0.0479136735200882, "memory(GiB)": 77.69, "step": 1315, "token_acc": 0.984, "train_speed(iter/s)": 0.078357 }, { "epoch": 1.3953488372093024, "grad_norm": 0.4296875, "learning_rate": 3.230308401878804e-05, "loss": 0.048540860414505005, "memory(GiB)": 77.69, "step": 1320, "token_acc": 0.9848636061210911, "train_speed(iter/s)": 0.078408 }, { "epoch": 1.4006342494714588, "grad_norm": 0.66796875, "learning_rate": 3.215589521461703e-05, "loss": 0.04019085168838501, "memory(GiB)": 77.69, "step": 1325, "token_acc": 0.9858363145214281, "train_speed(iter/s)": 0.07842 }, { "epoch": 1.4059196617336152, "grad_norm": 0.78125, "learning_rate": 3.2008435727701224e-05, "loss": 0.047250103950500486, "memory(GiB)": 77.69, "step": 1330, "token_acc": 0.985756862745098, "train_speed(iter/s)": 0.078444 }, { "epoch": 1.4112050739957716, "grad_norm": 0.341796875, "learning_rate": 3.186071113592257e-05, "loss": 0.036807119846343994, "memory(GiB)": 77.69, "step": 1335, "token_acc": 0.9878989995235826, "train_speed(iter/s)": 0.078447 }, { "epoch": 1.4164904862579282, "grad_norm": 0.2578125, "learning_rate": 3.171272702719102e-05, "loss": 0.0361470490694046, "memory(GiB)": 77.69, "step": 1340, "token_acc": 0.9886267902274642, "train_speed(iter/s)": 0.078458 }, { "epoch": 1.4217758985200846, "grad_norm": 1.8125, "learning_rate": 3.1564488999233106e-05, "loss": 0.04911347031593323, "memory(GiB)": 77.69, "step": 1345, "token_acc": 0.9850762066045724, "train_speed(iter/s)": 0.078496 }, { "epoch": 1.427061310782241, "grad_norm": 0.451171875, "learning_rate": 3.14160026593803e-05, "loss": 0.04021271765232086, "memory(GiB)": 77.69, "step": 1350, "token_acc": 0.9879680143487034, "train_speed(iter/s)": 0.078536 }, { "epoch": 1.4323467230443976, "grad_norm": 0.380859375, "learning_rate": 3.126727362435682e-05, "loss": 0.06748023033142089, "memory(GiB)": 77.69, "step": 1355, "token_acc": 0.9805634670156262, "train_speed(iter/s)": 0.07857 }, { "epoch": 1.437632135306554, "grad_norm": 0.388671875, "learning_rate": 3.111830752006723e-05, "loss": 0.044831705093383786, "memory(GiB)": 77.69, "step": 1360, "token_acc": 0.9858516840580787, "train_speed(iter/s)": 0.078587 }, { "epoch": 1.4429175475687104, "grad_norm": 0.58203125, "learning_rate": 3.096910998138355e-05, "loss": 0.06620057821273803, "memory(GiB)": 77.69, "step": 1365, "token_acc": 0.980337586294141, "train_speed(iter/s)": 0.078596 }, { "epoch": 1.4482029598308668, "grad_norm": 0.52734375, "learning_rate": 3.0819686651932244e-05, "loss": 0.04734827876091004, "memory(GiB)": 77.69, "step": 1370, "token_acc": 0.9845995893223819, "train_speed(iter/s)": 0.078615 }, { "epoch": 1.4534883720930232, "grad_norm": 0.400390625, "learning_rate": 3.0670043183880594e-05, "loss": 0.046893104910850525, "memory(GiB)": 77.69, "step": 1375, "token_acc": 0.9849442379182156, "train_speed(iter/s)": 0.078642 }, { "epoch": 1.4587737843551798, "grad_norm": 0.392578125, "learning_rate": 3.052018523772298e-05, "loss": 0.04950014352798462, "memory(GiB)": 77.69, "step": 1380, "token_acc": 0.9866039662160072, "train_speed(iter/s)": 0.078657 }, { "epoch": 1.4640591966173362, "grad_norm": 0.240234375, "learning_rate": 3.0370118482066757e-05, "loss": 0.035290053486824034, "memory(GiB)": 77.69, "step": 1385, "token_acc": 0.9890547548780837, "train_speed(iter/s)": 0.078666 }, { "epoch": 1.4693446088794926, "grad_norm": 0.78515625, "learning_rate": 3.0219848593417815e-05, "loss": 0.035723057389259336, "memory(GiB)": 77.69, "step": 1390, "token_acc": 0.9875944304547475, "train_speed(iter/s)": 0.078681 }, { "epoch": 1.474630021141649, "grad_norm": 0.427734375, "learning_rate": 3.0069381255965844e-05, "loss": 0.04783242642879486, "memory(GiB)": 77.69, "step": 1395, "token_acc": 0.9849552767154461, "train_speed(iter/s)": 0.078671 }, { "epoch": 1.4799154334038054, "grad_norm": 0.490234375, "learning_rate": 2.9918722161369374e-05, "loss": 0.04224193394184113, "memory(GiB)": 77.69, "step": 1400, "token_acc": 0.9867886902642262, "train_speed(iter/s)": 0.078677 }, { "epoch": 1.485200845665962, "grad_norm": 1.4609375, "learning_rate": 2.9767877008540424e-05, "loss": 0.051251459121704104, "memory(GiB)": 77.69, "step": 1405, "token_acc": 0.9880581930370489, "train_speed(iter/s)": 0.078652 }, { "epoch": 1.4904862579281184, "grad_norm": 0.27734375, "learning_rate": 2.961685150342894e-05, "loss": 0.03187498450279236, "memory(GiB)": 77.69, "step": 1410, "token_acc": 0.9887939001848429, "train_speed(iter/s)": 0.078694 }, { "epoch": 1.4957716701902748, "grad_norm": 0.251953125, "learning_rate": 2.946565135880699e-05, "loss": 0.041289350390434264, "memory(GiB)": 77.69, "step": 1415, "token_acc": 0.9878709118475201, "train_speed(iter/s)": 0.078685 }, { "epoch": 1.5010570824524314, "grad_norm": 0.240234375, "learning_rate": 2.9314282294052647e-05, "loss": 0.03621939420700073, "memory(GiB)": 77.69, "step": 1420, "token_acc": 0.988533130042564, "train_speed(iter/s)": 0.078704 }, { "epoch": 1.5063424947145876, "grad_norm": 0.474609375, "learning_rate": 2.9162750034933646e-05, "loss": 0.03570509552955627, "memory(GiB)": 77.69, "step": 1425, "token_acc": 0.9873137272754864, "train_speed(iter/s)": 0.078715 }, { "epoch": 1.5116279069767442, "grad_norm": 0.59375, "learning_rate": 2.901106031339078e-05, "loss": 0.03850190341472626, "memory(GiB)": 77.69, "step": 1430, "token_acc": 0.9877597495018503, "train_speed(iter/s)": 0.078721 }, { "epoch": 1.5169133192389006, "grad_norm": 0.490234375, "learning_rate": 2.8859218867321147e-05, "loss": 0.03164239525794983, "memory(GiB)": 77.69, "step": 1435, "token_acc": 0.9887983365194353, "train_speed(iter/s)": 0.078761 }, { "epoch": 1.522198731501057, "grad_norm": 0.671875, "learning_rate": 2.870723144036101e-05, "loss": 0.04476139843463898, "memory(GiB)": 77.69, "step": 1440, "token_acc": 0.9858895922481169, "train_speed(iter/s)": 0.078792 }, { "epoch": 1.5274841437632136, "grad_norm": 0.75390625, "learning_rate": 2.8555103781668613e-05, "loss": 0.04332542419433594, "memory(GiB)": 77.69, "step": 1445, "token_acc": 0.987423758350276, "train_speed(iter/s)": 0.078783 }, { "epoch": 1.53276955602537, "grad_norm": 0.408203125, "learning_rate": 2.8402841645706657e-05, "loss": 0.03143905699253082, "memory(GiB)": 77.69, "step": 1450, "token_acc": 0.9884236638352252, "train_speed(iter/s)": 0.078814 }, { "epoch": 1.5380549682875264, "grad_norm": 0.416015625, "learning_rate": 2.825045079202469e-05, "loss": 0.04023267030715942, "memory(GiB)": 77.69, "step": 1455, "token_acc": 0.9881045440641675, "train_speed(iter/s)": 0.078824 }, { "epoch": 1.543340380549683, "grad_norm": 0.3203125, "learning_rate": 2.8097936985041173e-05, "loss": 0.04661189019680023, "memory(GiB)": 77.69, "step": 1460, "token_acc": 0.9855500191644309, "train_speed(iter/s)": 0.078866 }, { "epoch": 1.5486257928118392, "grad_norm": 0.314453125, "learning_rate": 2.7945305993825477e-05, "loss": 0.03682275414466858, "memory(GiB)": 77.69, "step": 1465, "token_acc": 0.9878913738019169, "train_speed(iter/s)": 0.078856 }, { "epoch": 1.5539112050739958, "grad_norm": 0.5078125, "learning_rate": 2.7792563591879638e-05, "loss": 0.03426058888435364, "memory(GiB)": 77.69, "step": 1470, "token_acc": 0.9892900856793145, "train_speed(iter/s)": 0.078839 }, { "epoch": 1.5591966173361522, "grad_norm": 0.44140625, "learning_rate": 2.763971555692e-05, "loss": 0.04307305216789246, "memory(GiB)": 77.69, "step": 1475, "token_acc": 0.9857514682327816, "train_speed(iter/s)": 0.078844 }, { "epoch": 1.5644820295983086, "grad_norm": 0.2021484375, "learning_rate": 2.7486767670658586e-05, "loss": 0.028118404746055602, "memory(GiB)": 77.69, "step": 1480, "token_acc": 0.9900190272724239, "train_speed(iter/s)": 0.078846 }, { "epoch": 1.5697674418604652, "grad_norm": 0.294921875, "learning_rate": 2.7333725718584515e-05, "loss": 0.03999925553798676, "memory(GiB)": 77.69, "step": 1485, "token_acc": 0.985392506690455, "train_speed(iter/s)": 0.07889 }, { "epoch": 1.5750528541226214, "grad_norm": 0.314453125, "learning_rate": 2.7180595489745046e-05, "loss": 0.02785443961620331, "memory(GiB)": 77.69, "step": 1490, "token_acc": 0.990169064870565, "train_speed(iter/s)": 0.07892 }, { "epoch": 1.580338266384778, "grad_norm": 0.318359375, "learning_rate": 2.7027382776526645e-05, "loss": 0.04204976856708527, "memory(GiB)": 77.69, "step": 1495, "token_acc": 0.9866892861803471, "train_speed(iter/s)": 0.078941 }, { "epoch": 1.5856236786469344, "grad_norm": 0.392578125, "learning_rate": 2.6874093374435893e-05, "loss": 0.027912557125091553, "memory(GiB)": 77.69, "step": 1500, "token_acc": 0.9894640220297721, "train_speed(iter/s)": 0.078965 }, { "epoch": 1.5909090909090908, "grad_norm": 0.40234375, "learning_rate": 2.672073308188024e-05, "loss": 0.030119818449020386, "memory(GiB)": 77.69, "step": 1505, "token_acc": 0.989537299338999, "train_speed(iter/s)": 0.07899 }, { "epoch": 1.5961945031712474, "grad_norm": 1.1796875, "learning_rate": 2.6567307699948662e-05, "loss": 0.03889271914958954, "memory(GiB)": 77.69, "step": 1510, "token_acc": 0.9882164585334614, "train_speed(iter/s)": 0.078957 }, { "epoch": 1.6014799154334038, "grad_norm": 0.35546875, "learning_rate": 2.6413823032192264e-05, "loss": 0.034280434250831604, "memory(GiB)": 77.69, "step": 1515, "token_acc": 0.9885982994073692, "train_speed(iter/s)": 0.078961 }, { "epoch": 1.6067653276955602, "grad_norm": 0.51171875, "learning_rate": 2.626028488440471e-05, "loss": 0.04397706687450409, "memory(GiB)": 77.69, "step": 1520, "token_acc": 0.9859556911947552, "train_speed(iter/s)": 0.078938 }, { "epoch": 1.6120507399577169, "grad_norm": 0.6328125, "learning_rate": 2.6106699064402652e-05, "loss": 0.053912556171417235, "memory(GiB)": 77.69, "step": 1525, "token_acc": 0.9848702082704498, "train_speed(iter/s)": 0.078994 }, { "epoch": 1.617336152219873, "grad_norm": 0.30078125, "learning_rate": 2.595307138180598e-05, "loss": 0.030446991324424744, "memory(GiB)": 77.69, "step": 1530, "token_acc": 0.9904753040456689, "train_speed(iter/s)": 0.078992 }, { "epoch": 1.6226215644820297, "grad_norm": 0.27734375, "learning_rate": 2.579940764781813e-05, "loss": 0.029166772961616516, "memory(GiB)": 77.69, "step": 1535, "token_acc": 0.9905016158493747, "train_speed(iter/s)": 0.078985 }, { "epoch": 1.627906976744186, "grad_norm": 0.6484375, "learning_rate": 2.5645713675006234e-05, "loss": 0.022643569111824035, "memory(GiB)": 77.69, "step": 1540, "token_acc": 0.9921177733476461, "train_speed(iter/s)": 0.078988 }, { "epoch": 1.6331923890063424, "grad_norm": 0.66015625, "learning_rate": 2.5491995277081244e-05, "loss": 0.043573996424674986, "memory(GiB)": 77.69, "step": 1545, "token_acc": 0.987487269023716, "train_speed(iter/s)": 0.078993 }, { "epoch": 1.638477801268499, "grad_norm": 0.765625, "learning_rate": 2.5338258268678034e-05, "loss": 0.03580450713634491, "memory(GiB)": 77.69, "step": 1550, "token_acc": 0.9884658602683588, "train_speed(iter/s)": 0.079026 }, { "epoch": 1.6437632135306552, "grad_norm": 0.322265625, "learning_rate": 2.518450846513545e-05, "loss": 0.031298312544822696, "memory(GiB)": 77.69, "step": 1555, "token_acc": 0.9889735910699701, "train_speed(iter/s)": 0.07904 }, { "epoch": 1.6490486257928119, "grad_norm": 0.37109375, "learning_rate": 2.503075168227634e-05, "loss": 0.029430508613586426, "memory(GiB)": 77.69, "step": 1560, "token_acc": 0.9900430283418086, "train_speed(iter/s)": 0.079061 }, { "epoch": 1.6543340380549683, "grad_norm": 0.322265625, "learning_rate": 2.4876993736187532e-05, "loss": 0.03158299028873444, "memory(GiB)": 77.69, "step": 1565, "token_acc": 0.990119990464334, "train_speed(iter/s)": 0.079049 }, { "epoch": 1.6596194503171247, "grad_norm": 0.51171875, "learning_rate": 2.4723240442999885e-05, "loss": 0.03700158298015595, "memory(GiB)": 77.69, "step": 1570, "token_acc": 0.9885576633544113, "train_speed(iter/s)": 0.079033 }, { "epoch": 1.6649048625792813, "grad_norm": 0.419921875, "learning_rate": 2.456949761866823e-05, "loss": 0.04017901718616486, "memory(GiB)": 77.69, "step": 1575, "token_acc": 0.9868304232021308, "train_speed(iter/s)": 0.079054 }, { "epoch": 1.6701902748414377, "grad_norm": 0.349609375, "learning_rate": 2.4415771078751416e-05, "loss": 0.044619354605674746, "memory(GiB)": 77.69, "step": 1580, "token_acc": 0.9860335195530726, "train_speed(iter/s)": 0.079056 }, { "epoch": 1.675475687103594, "grad_norm": 0.78515625, "learning_rate": 2.4262066638192305e-05, "loss": 0.036984241008758544, "memory(GiB)": 77.69, "step": 1585, "token_acc": 0.9879392805156997, "train_speed(iter/s)": 0.079042 }, { "epoch": 1.6807610993657507, "grad_norm": 0.291015625, "learning_rate": 2.41083901110978e-05, "loss": 0.02616061270236969, "memory(GiB)": 77.69, "step": 1590, "token_acc": 0.9910856157927338, "train_speed(iter/s)": 0.079044 }, { "epoch": 1.6860465116279069, "grad_norm": 0.5234375, "learning_rate": 2.3954747310518953e-05, "loss": 0.04159751832485199, "memory(GiB)": 77.69, "step": 1595, "token_acc": 0.9864792323693217, "train_speed(iter/s)": 0.079063 }, { "epoch": 1.6913319238900635, "grad_norm": 0.1962890625, "learning_rate": 2.3801144048231057e-05, "loss": 0.041912186145782473, "memory(GiB)": 77.69, "step": 1600, "token_acc": 0.9859863456701401, "train_speed(iter/s)": 0.079061 }, { "epoch": 1.6966173361522199, "grad_norm": 0.373046875, "learning_rate": 2.36475861345138e-05, "loss": 0.05294908881187439, "memory(GiB)": 77.69, "step": 1605, "token_acc": 0.9853015541246671, "train_speed(iter/s)": 0.079073 }, { "epoch": 1.7019027484143763, "grad_norm": 0.44921875, "learning_rate": 2.3494079377931503e-05, "loss": 0.031134456396102905, "memory(GiB)": 77.69, "step": 1610, "token_acc": 0.9893742186925509, "train_speed(iter/s)": 0.079098 }, { "epoch": 1.707188160676533, "grad_norm": 0.28125, "learning_rate": 2.3340629585113356e-05, "loss": 0.030457425117492675, "memory(GiB)": 77.69, "step": 1615, "token_acc": 0.9888744146745062, "train_speed(iter/s)": 0.079119 }, { "epoch": 1.712473572938689, "grad_norm": 0.224609375, "learning_rate": 2.318724256053386e-05, "loss": 0.03183789849281311, "memory(GiB)": 77.69, "step": 1620, "token_acc": 0.9896454841051867, "train_speed(iter/s)": 0.079106 }, { "epoch": 1.7177589852008457, "grad_norm": 0.34765625, "learning_rate": 2.3033924106293177e-05, "loss": 0.03787479996681213, "memory(GiB)": 77.69, "step": 1625, "token_acc": 0.9875393035984628, "train_speed(iter/s)": 0.079087 }, { "epoch": 1.723044397463002, "grad_norm": 0.236328125, "learning_rate": 2.288068002189769e-05, "loss": 0.03849000632762909, "memory(GiB)": 77.69, "step": 1630, "token_acc": 0.9864257844519493, "train_speed(iter/s)": 0.0791 }, { "epoch": 1.7283298097251585, "grad_norm": 0.453125, "learning_rate": 2.2727516104040626e-05, "loss": 0.032643947005271914, "memory(GiB)": 77.69, "step": 1635, "token_acc": 0.9891313748666647, "train_speed(iter/s)": 0.079105 }, { "epoch": 1.733615221987315, "grad_norm": 0.7578125, "learning_rate": 2.257443814638282e-05, "loss": 0.03904370963573456, "memory(GiB)": 77.69, "step": 1640, "token_acc": 0.9890084726356767, "train_speed(iter/s)": 0.079093 }, { "epoch": 1.7389006342494715, "grad_norm": 0.921875, "learning_rate": 2.2421451939333505e-05, "loss": 0.03423792719841003, "memory(GiB)": 77.69, "step": 1645, "token_acc": 0.988858198069582, "train_speed(iter/s)": 0.079109 }, { "epoch": 1.744186046511628, "grad_norm": 0.44921875, "learning_rate": 2.2268563269831284e-05, "loss": 0.03314145505428314, "memory(GiB)": 77.69, "step": 1650, "token_acc": 0.9886611760199195, "train_speed(iter/s)": 0.079134 }, { "epoch": 1.7494714587737845, "grad_norm": 0.3515625, "learning_rate": 2.2115777921125304e-05, "loss": 0.029022321105003357, "memory(GiB)": 77.69, "step": 1655, "token_acc": 0.9894945683991414, "train_speed(iter/s)": 0.079141 }, { "epoch": 1.7547568710359407, "grad_norm": 0.32421875, "learning_rate": 2.196310167255642e-05, "loss": 0.03672107458114624, "memory(GiB)": 77.69, "step": 1660, "token_acc": 0.9885863035642771, "train_speed(iter/s)": 0.079136 }, { "epoch": 1.7600422832980973, "grad_norm": 0.177734375, "learning_rate": 2.1810540299338587e-05, "loss": 0.02837436497211456, "memory(GiB)": 77.69, "step": 1665, "token_acc": 0.9896625222024866, "train_speed(iter/s)": 0.079135 }, { "epoch": 1.7653276955602537, "grad_norm": 0.259765625, "learning_rate": 2.1658099572340436e-05, "loss": 0.025397276878356932, "memory(GiB)": 77.69, "step": 1670, "token_acc": 0.9907165548888105, "train_speed(iter/s)": 0.079134 }, { "epoch": 1.77061310782241, "grad_norm": 0.51171875, "learning_rate": 2.1505785257867e-05, "loss": 0.025075176358222963, "memory(GiB)": 77.69, "step": 1675, "token_acc": 0.9914622090410713, "train_speed(iter/s)": 0.079137 }, { "epoch": 1.7758985200845667, "grad_norm": 1.2734375, "learning_rate": 2.1353603117441518e-05, "loss": 0.03774846792221069, "memory(GiB)": 77.69, "step": 1680, "token_acc": 0.9877381882408885, "train_speed(iter/s)": 0.079152 }, { "epoch": 1.781183932346723, "grad_norm": 0.421875, "learning_rate": 2.1201558907587565e-05, "loss": 0.03866144418716431, "memory(GiB)": 77.69, "step": 1685, "token_acc": 0.9871838238057654, "train_speed(iter/s)": 0.079179 }, { "epoch": 1.7864693446088795, "grad_norm": 0.396484375, "learning_rate": 2.104965837961127e-05, "loss": 0.03020806908607483, "memory(GiB)": 77.69, "step": 1690, "token_acc": 0.9896781122281555, "train_speed(iter/s)": 0.079175 }, { "epoch": 1.791754756871036, "grad_norm": 0.1787109375, "learning_rate": 2.0897907279383793e-05, "loss": 0.04235799014568329, "memory(GiB)": 77.69, "step": 1695, "token_acc": 0.986909090909091, "train_speed(iter/s)": 0.07919 }, { "epoch": 1.7970401691331923, "grad_norm": 0.443359375, "learning_rate": 2.074631134712394e-05, "loss": 0.03160930275917053, "memory(GiB)": 77.69, "step": 1700, "token_acc": 0.9885105160693198, "train_speed(iter/s)": 0.079219 }, { "epoch": 1.802325581395349, "grad_norm": 0.271484375, "learning_rate": 2.0594876317181058e-05, "loss": 0.026660746335983275, "memory(GiB)": 77.69, "step": 1705, "token_acc": 0.9907431722689075, "train_speed(iter/s)": 0.079247 }, { "epoch": 1.8076109936575053, "grad_norm": 0.326171875, "learning_rate": 2.044360791781811e-05, "loss": 0.02017672061920166, "memory(GiB)": 77.69, "step": 1710, "token_acc": 0.9927319367250962, "train_speed(iter/s)": 0.07925 }, { "epoch": 1.8128964059196617, "grad_norm": 0.5859375, "learning_rate": 2.0292511870995014e-05, "loss": 0.025962281227111816, "memory(GiB)": 77.69, "step": 1715, "token_acc": 0.990054484130416, "train_speed(iter/s)": 0.079291 }, { "epoch": 1.8181818181818183, "grad_norm": 0.35546875, "learning_rate": 2.014159389215218e-05, "loss": 0.03555901646614075, "memory(GiB)": 77.69, "step": 1720, "token_acc": 0.9892529087568892, "train_speed(iter/s)": 0.079295 }, { "epoch": 1.8234672304439745, "grad_norm": 0.59375, "learning_rate": 1.999085968999432e-05, "loss": 0.030091509222984314, "memory(GiB)": 77.69, "step": 1725, "token_acc": 0.989273646030319, "train_speed(iter/s)": 0.079284 }, { "epoch": 1.8287526427061311, "grad_norm": 0.16796875, "learning_rate": 1.9840314966274507e-05, "loss": 0.02063000351190567, "memory(GiB)": 77.69, "step": 1730, "token_acc": 0.9928059756321558, "train_speed(iter/s)": 0.07928 }, { "epoch": 1.8340380549682875, "grad_norm": 0.5390625, "learning_rate": 1.9689965415578515e-05, "loss": 0.02800295948982239, "memory(GiB)": 77.69, "step": 1735, "token_acc": 0.9897434184150742, "train_speed(iter/s)": 0.079283 }, { "epoch": 1.839323467230444, "grad_norm": 0.453125, "learning_rate": 1.953981672510939e-05, "loss": 0.02261122465133667, "memory(GiB)": 77.69, "step": 1740, "token_acc": 0.9915695428744165, "train_speed(iter/s)": 0.079278 }, { "epoch": 1.8446088794926006, "grad_norm": 0.2333984375, "learning_rate": 1.9389874574472325e-05, "loss": 0.028632891178131104, "memory(GiB)": 77.69, "step": 1745, "token_acc": 0.9898502600087714, "train_speed(iter/s)": 0.079289 }, { "epoch": 1.8498942917547567, "grad_norm": 0.51953125, "learning_rate": 1.9240144635459837e-05, "loss": 0.0253725528717041, "memory(GiB)": 77.69, "step": 1750, "token_acc": 0.9911390710748227, "train_speed(iter/s)": 0.079287 }, { "epoch": 1.8551797040169133, "grad_norm": 1.359375, "learning_rate": 1.9090632571837205e-05, "loss": 0.0353877454996109, "memory(GiB)": 77.69, "step": 1755, "token_acc": 0.9888795771015988, "train_speed(iter/s)": 0.079271 }, { "epoch": 1.8604651162790697, "grad_norm": 0.73046875, "learning_rate": 1.8941344039128238e-05, "loss": 0.032440242171287534, "memory(GiB)": 77.69, "step": 1760, "token_acc": 0.9908135636926251, "train_speed(iter/s)": 0.079275 }, { "epoch": 1.8657505285412261, "grad_norm": 0.169921875, "learning_rate": 1.879228468440134e-05, "loss": 0.03260525465011597, "memory(GiB)": 77.69, "step": 1765, "token_acc": 0.9896746817538897, "train_speed(iter/s)": 0.079286 }, { "epoch": 1.8710359408033828, "grad_norm": 0.267578125, "learning_rate": 1.864346014605592e-05, "loss": 0.024031616747379303, "memory(GiB)": 77.69, "step": 1770, "token_acc": 0.9917319941205291, "train_speed(iter/s)": 0.079294 }, { "epoch": 1.8763213530655392, "grad_norm": 0.3359375, "learning_rate": 1.8494876053609073e-05, "loss": 0.03516561090946198, "memory(GiB)": 77.69, "step": 1775, "token_acc": 0.9877237305625733, "train_speed(iter/s)": 0.079292 }, { "epoch": 1.8816067653276956, "grad_norm": 0.52734375, "learning_rate": 1.8346538027482684e-05, "loss": 0.03454855680465698, "memory(GiB)": 77.69, "step": 1780, "token_acc": 0.9887051360885466, "train_speed(iter/s)": 0.079318 }, { "epoch": 1.8868921775898522, "grad_norm": 0.671875, "learning_rate": 1.819845167879077e-05, "loss": 0.03431518375873566, "memory(GiB)": 77.69, "step": 1785, "token_acc": 0.9889875553009325, "train_speed(iter/s)": 0.079342 }, { "epoch": 1.8921775898520083, "grad_norm": 0.7265625, "learning_rate": 1.8050622609127294e-05, "loss": 0.03371002376079559, "memory(GiB)": 77.69, "step": 1790, "token_acc": 0.9893836836480228, "train_speed(iter/s)": 0.079362 }, { "epoch": 1.897463002114165, "grad_norm": 0.515625, "learning_rate": 1.790305641035423e-05, "loss": 0.04240670502185821, "memory(GiB)": 77.69, "step": 1795, "token_acc": 0.9878384293128244, "train_speed(iter/s)": 0.079376 }, { "epoch": 1.9027484143763214, "grad_norm": 0.51171875, "learning_rate": 1.7755758664390047e-05, "loss": 0.03319351375102997, "memory(GiB)": 77.69, "step": 1800, "token_acc": 0.9893189019831239, "train_speed(iter/s)": 0.079404 }, { "epoch": 1.9080338266384778, "grad_norm": 0.33203125, "learning_rate": 1.7608734942998574e-05, "loss": 0.022714193165302276, "memory(GiB)": 77.69, "step": 1805, "token_acc": 0.9928090531442487, "train_speed(iter/s)": 0.079386 }, { "epoch": 1.9133192389006344, "grad_norm": 0.58203125, "learning_rate": 1.7461990807578274e-05, "loss": 0.04771170318126679, "memory(GiB)": 77.69, "step": 1810, "token_acc": 0.9862021602047825, "train_speed(iter/s)": 0.079398 }, { "epoch": 1.9186046511627906, "grad_norm": 0.625, "learning_rate": 1.7315531808951795e-05, "loss": 0.03116103708744049, "memory(GiB)": 77.69, "step": 1815, "token_acc": 0.9899183294941215, "train_speed(iter/s)": 0.079407 }, { "epoch": 1.9238900634249472, "grad_norm": 0.314453125, "learning_rate": 1.7169363487156055e-05, "loss": 0.02592660188674927, "memory(GiB)": 77.69, "step": 1820, "token_acc": 0.9910156844830211, "train_speed(iter/s)": 0.079412 }, { "epoch": 1.9291754756871036, "grad_norm": 0.54296875, "learning_rate": 1.7023491371232714e-05, "loss": 0.021815077960491182, "memory(GiB)": 77.69, "step": 1825, "token_acc": 0.9926727509778357, "train_speed(iter/s)": 0.079394 }, { "epoch": 1.93446088794926, "grad_norm": 0.302734375, "learning_rate": 1.687792097901894e-05, "loss": 0.03171166181564331, "memory(GiB)": 77.69, "step": 1830, "token_acc": 0.9899566940016585, "train_speed(iter/s)": 0.079392 }, { "epoch": 1.9397463002114166, "grad_norm": 0.341796875, "learning_rate": 1.673265781693876e-05, "loss": 0.0259665310382843, "memory(GiB)": 77.69, "step": 1835, "token_acc": 0.9912671089092283, "train_speed(iter/s)": 0.079386 }, { "epoch": 1.945031712473573, "grad_norm": 0.2578125, "learning_rate": 1.6587707379794743e-05, "loss": 0.022189879417419435, "memory(GiB)": 77.69, "step": 1840, "token_acc": 0.9925913908717283, "train_speed(iter/s)": 0.079339 }, { "epoch": 1.9503171247357294, "grad_norm": 0.263671875, "learning_rate": 1.6443075150560167e-05, "loss": 0.02136421054601669, "memory(GiB)": 77.69, "step": 1845, "token_acc": 0.992376303260411, "train_speed(iter/s)": 0.079338 }, { "epoch": 1.955602536997886, "grad_norm": 0.33984375, "learning_rate": 1.6298766600171596e-05, "loss": 0.03282491862773895, "memory(GiB)": 77.69, "step": 1850, "token_acc": 0.9881762683320469, "train_speed(iter/s)": 0.079357 }, { "epoch": 1.9608879492600422, "grad_norm": 0.49609375, "learning_rate": 1.6154787187321937e-05, "loss": 0.03140755593776703, "memory(GiB)": 77.69, "step": 1855, "token_acc": 0.9887269378552602, "train_speed(iter/s)": 0.079345 }, { "epoch": 1.9661733615221988, "grad_norm": 0.2421875, "learning_rate": 1.601114235825398e-05, "loss": 0.03199794888496399, "memory(GiB)": 77.69, "step": 1860, "token_acc": 0.9900421059101946, "train_speed(iter/s)": 0.079351 }, { "epoch": 1.9714587737843552, "grad_norm": 0.85546875, "learning_rate": 1.586783754655436e-05, "loss": 0.032571139931678775, "memory(GiB)": 77.69, "step": 1865, "token_acc": 0.9894899007189318, "train_speed(iter/s)": 0.079372 }, { "epoch": 1.9767441860465116, "grad_norm": 0.50390625, "learning_rate": 1.5724878172948037e-05, "loss": 0.03356512188911438, "memory(GiB)": 77.69, "step": 1870, "token_acc": 0.9875881243173469, "train_speed(iter/s)": 0.079365 }, { "epoch": 1.9820295983086682, "grad_norm": 0.462890625, "learning_rate": 1.5582269645093235e-05, "loss": 0.02602737247943878, "memory(GiB)": 77.69, "step": 1875, "token_acc": 0.9904466111023746, "train_speed(iter/s)": 0.079373 }, { "epoch": 1.9873150105708244, "grad_norm": 0.2001953125, "learning_rate": 1.54400173573769e-05, "loss": 0.03088199198246002, "memory(GiB)": 77.69, "step": 1880, "token_acc": 0.9896220461398219, "train_speed(iter/s)": 0.07936 }, { "epoch": 1.992600422832981, "grad_norm": 0.25, "learning_rate": 1.5298126690710658e-05, "loss": 0.028480422496795655, "memory(GiB)": 77.69, "step": 1885, "token_acc": 0.9909081622229032, "train_speed(iter/s)": 0.079364 }, { "epoch": 1.9978858350951374, "grad_norm": 0.421875, "learning_rate": 1.5156603012327241e-05, "loss": 0.02729175388813019, "memory(GiB)": 77.69, "step": 1890, "token_acc": 0.9903470792817092, "train_speed(iter/s)": 0.079389 }, { "epoch": 2.0, "eval_bleu-4": 38.589481, "eval_rouge-1": 59.241469, "eval_rouge-2": 49.814934, "eval_rouge-l": 53.041802, "eval_runtime": 535.4823, "eval_samples_per_second": 0.57, "eval_steps_per_second": 0.57, "step": 1892 } ], "logging_steps": 5, "max_steps": 2838, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.3221314067093914e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }