diff --git "a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/output.log" "b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/output.log" new file mode 100644--- /dev/null +++ "b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/output.log" @@ -0,0 +1,2801 @@ +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +| distributed init (rank 6): env://, gpu 6 +| distributed init (rank 3): env://, gpu 3 +| distributed init (rank 1): env://, gpu 1 +| distributed init (rank 7): env://, gpu 7 +| distributed init (rank 5): env://, gpu 5 +| distributed init (rank 2): env://, gpu 2 +| distributed init (rank 0): env://, gpu 0 +| distributed init (rank 4): env://, gpu 4 +[00:02:04.046928] > initializing model parallel with size 1 +[00:02:04.047015] > initializing ddp with size 8 +[00:02:04.047022] > initializing pipeline with size 1 +[00:02:04.226045] job dir: /data/liuyijiang/mmlab/LLaMA2-Accessory/accessory +[00:02:04.226122] Namespace(batch_size=16, +accum_iter=1, +llama_type='llama_qformerv2_peft', +llama_config=['../checkpoints/llama2/Llama-2-13b/params.json', +'configs/model/finetune/sg/llamaPeft_normBiasLora.json'], +no_visual=False, +tokenizer_path='../checkpoints/llama2/Llama-2-13b/tokenizer.model', +pretrained_path='../checkpoints/mm/lamaQformerv2_13b/finetuned/', +pretrained_type='consolidated', +weight_decay=0.02, +lr=5e-05, +min_lr=5e-06, +epochs=3, +warmup_epochs=0.2, +clip_grad=2, +max_words=512, +dialog=False, +data_config='configs/data/finetune/mm/alpaca_llava.yaml', +output_dir='output/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B', +log_dir='./output_dir', +save_interval=1, +only_save_trainable=True, +device='cuda', +seed=0, +resume='', +num_workers=16, +pin_mem=True, +world_size=8, +local_rank=-1, +dist_on_itp=False, +dist_url='env://', +model_parallel_size=1, +data_parallel='sdp', +precision='bf16', +checkpointing=True, +quant=True, +rank=0, +gpu=0, +distributed=True, +dist_backend='nccl') +[00:02:04.226978] Start initialization. +[00:02:04.227022] ## Processing on RANK 0. +[00:02:04.237574] Model Args: + ModelArgs(dim=5120, n_layers=40, n_heads=40, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=16, bias_tuning=True) +[00:03:36.399161] build llama model with qformerv2 +[00:03:36.779030] (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /Salesforce/blip2-opt-2.7b/resolve/main/config.json (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1007)')))"), '(Request ID: f000589d-f862-41f8-832e-73fc0c96ee6a)') + Loading checkpoint shards: 0%| | 0/2 [00:00 +[00:36:06.281575] Start training for 3 epochs +[00:36:06.296296] log_dir: ./output_dir +[00:36:22.985451] Epoch: [0] [0/3229] lr: 0.000000 grad_norm: 2.3647 (2.3647) closs: 1.5947 (1.5947) time: 16.6883 data: 8.5180 max mem: 36209 +[00:37:03.163928] Epoch: [0] [10/3229] lr: 0.000001 grad_norm: 2.2614 (2.1844) closs: 1.3985 (1.3614) time: 5.1696 data: 0.7746 max mem: 54683 +[00:37:43.938889] Epoch: [0] [20/3229] lr: 0.000002 grad_norm: 2.2614 (2.2117) closs: 1.4480 (1.4415) time: 4.0476 data: 0.0002 max mem: 54683 +[00:38:24.104529] Epoch: [0] [30/3229] lr: 0.000002 grad_norm: 2.2231 (2.2052) closs: 1.4753 (1.4253) time: 4.0470 data: 0.0002 max mem: 54683 +[00:39:05.082555] Epoch: [0] [40/3229] lr: 0.000003 grad_norm: 2.1752 (2.1914) closs: 1.4110 (1.4059) time: 4.0571 data: 0.0002 max mem: 54683 +[00:39:46.229231] Epoch: [0] [50/3229] lr: 0.000004 grad_norm: 2.0509 (2.2594) closs: 1.4161 (1.4234) time: 4.1061 data: 0.0002 max mem: 54683 +[00:40:26.733155] Epoch: [0] [60/3229] lr: 0.000005 grad_norm: 1.9159 (2.1852) closs: 1.4598 (1.4133) time: 4.0824 data: 0.0003 max mem: 54683 +[00:41:07.256178] Epoch: [0] [70/3229] lr: 0.000005 grad_norm: 1.7466 (2.1041) closs: 1.4069 (1.4059) time: 4.0513 data: 0.0003 max mem: 54683 +[00:41:48.461794] Epoch: [0] [80/3229] lr: 0.000006 grad_norm: 1.4995 (2.0192) closs: 1.3245 (1.3975) time: 4.0864 data: 0.0002 max mem: 54683 +[00:42:28.626897] Epoch: [0] [90/3229] lr: 0.000007 grad_norm: 1.2119 (1.9324) closs: 1.2737 (1.3830) time: 4.0684 data: 0.0002 max mem: 54683 +[00:43:09.428679] Epoch: [0] [100/3229] lr: 0.000008 grad_norm: 1.1636 (1.8590) closs: 1.2619 (1.3700) time: 4.0482 data: 0.0003 max mem: 54683 +[00:43:49.914083] Epoch: [0] [110/3229] lr: 0.000009 grad_norm: 1.0306 (1.7801) closs: 1.2915 (1.3614) time: 4.0643 data: 0.0003 max mem: 54683 +[00:44:31.063164] Epoch: [0] [120/3229] lr: 0.000009 grad_norm: 0.9462 (1.7095) closs: 1.3081 (1.3546) time: 4.0816 data: 0.0002 max mem: 54683 +[00:45:11.261841] Epoch: [0] [130/3229] lr: 0.000010 grad_norm: 0.9337 (1.6472) closs: 1.2631 (1.3397) time: 4.0673 data: 0.0002 max mem: 54683 +[00:45:52.112790] Epoch: [0] [140/3229] lr: 0.000011 grad_norm: 0.8573 (1.5931) closs: 1.2118 (1.3288) time: 4.0524 data: 0.0002 max mem: 54683 +[00:46:32.944599] Epoch: [0] [150/3229] lr: 0.000012 grad_norm: 0.8517 (1.5447) closs: 1.1676 (1.3182) time: 4.0841 data: 0.0003 max mem: 54683 +[00:47:14.287016] Epoch: [0] [160/3229] lr: 0.000012 grad_norm: 0.8240 (1.4999) closs: 1.1756 (1.3118) time: 4.1086 data: 0.0003 max mem: 54683 +[00:47:55.388906] Epoch: [0] [170/3229] lr: 0.000013 grad_norm: 0.8136 (1.4618) closs: 1.1849 (1.3054) time: 4.1221 data: 0.0003 max mem: 54683 +[00:48:36.512000] Epoch: [0] [180/3229] lr: 0.000014 grad_norm: 0.8954 (1.4305) closs: 1.2078 (1.3010) time: 4.1112 data: 0.0002 max mem: 54683 +[00:49:17.622250] Epoch: [0] [190/3229] lr: 0.000015 grad_norm: 0.8401 (1.3994) closs: 1.2296 (1.2966) time: 4.1116 data: 0.0002 max mem: 54683 +[00:49:58.917542] Epoch: [0] [200/3229] lr: 0.000015 grad_norm: 0.8401 (1.3746) closs: 1.2198 (1.2906) time: 4.1202 data: 0.0002 max mem: 54683 +[00:50:39.721214] Epoch: [0] [210/3229] lr: 0.000016 grad_norm: 0.8446 (1.3499) closs: 1.1936 (1.2863) time: 4.1049 data: 0.0002 max mem: 54684 +[00:51:20.209941] Epoch: [0] [220/3229] lr: 0.000017 grad_norm: 0.8360 (1.3275) closs: 1.2039 (1.2828) time: 4.0645 data: 0.0002 max mem: 54684 +[00:52:01.340571] Epoch: [0] [230/3229] lr: 0.000018 grad_norm: 0.8406 (1.3077) closs: 1.2159 (1.2796) time: 4.0809 data: 0.0002 max mem: 54684 +[00:52:42.293014] Epoch: [0] [240/3229] lr: 0.000019 grad_norm: 0.8406 (1.2881) closs: 1.2119 (1.2756) time: 4.1041 data: 0.0002 max mem: 54684 +[00:53:22.448171] Epoch: [0] [250/3229] lr: 0.000019 grad_norm: 0.8104 (1.2702) closs: 1.1467 (1.2704) time: 4.0553 data: 0.0002 max mem: 54684 +[00:54:02.261862] Epoch: [0] [260/3229] lr: 0.000020 grad_norm: 0.8084 (1.2538) closs: 1.1398 (1.2651) time: 3.9984 data: 0.0002 max mem: 54684 +[00:54:42.733423] Epoch: [0] [270/3229] lr: 0.000021 grad_norm: 0.8337 (1.2407) closs: 1.1322 (1.2590) time: 4.0142 data: 0.0002 max mem: 54684 +[00:55:23.786823] Epoch: [0] [280/3229] lr: 0.000022 grad_norm: 0.8337 (1.2265) closs: 1.1322 (1.2549) time: 4.0762 data: 0.0002 max mem: 54684 +[00:56:04.261686] Epoch: [0] [290/3229] lr: 0.000022 grad_norm: 0.8146 (1.2134) closs: 1.1835 (1.2526) time: 4.0763 data: 0.0002 max mem: 54684 +[00:56:45.065966] Epoch: [0] [300/3229] lr: 0.000023 grad_norm: 0.8291 (1.2034) closs: 1.2046 (1.2503) time: 4.0639 data: 0.0002 max mem: 54684 +[00:57:25.203092] Epoch: [0] [310/3229] lr: 0.000024 grad_norm: 0.8414 (1.1916) closs: 1.1724 (1.2467) time: 4.0470 data: 0.0002 max mem: 54684 +[00:58:05.237988] Epoch: [0] [320/3229] lr: 0.000025 grad_norm: 0.8399 (1.1806) closs: 1.1496 (1.2420) time: 4.0085 data: 0.0002 max mem: 54684 +[00:58:46.046197] Epoch: [0] [330/3229] lr: 0.000026 grad_norm: 0.8551 (1.1711) closs: 1.1496 (1.2392) time: 4.0421 data: 0.0002 max mem: 54684 +[00:59:27.204738] Epoch: [0] [340/3229] lr: 0.000026 grad_norm: 0.8768 (1.1625) closs: 1.1481 (1.2362) time: 4.0983 data: 0.0002 max mem: 54684 +[01:00:08.357924] Epoch: [0] [350/3229] lr: 0.000027 grad_norm: 0.8572 (1.1541) closs: 1.1341 (1.2344) time: 4.1155 data: 0.0002 max mem: 54684 +[01:00:49.082581] Epoch: [0] [360/3229] lr: 0.000028 grad_norm: 0.8620 (1.1473) closs: 1.1321 (1.2311) time: 4.0938 data: 0.0002 max mem: 54684 +[01:01:29.908089] Epoch: [0] [370/3229] lr: 0.000029 grad_norm: 0.9078 (1.1413) closs: 1.1322 (1.2292) time: 4.0774 data: 0.0002 max mem: 54684 +[01:02:10.408877] Epoch: [0] [380/3229] lr: 0.000029 grad_norm: 0.9115 (1.1385) closs: 1.1444 (1.2265) time: 4.0662 data: 0.0002 max mem: 54684 +[01:02:50.910726] Epoch: [0] [390/3229] lr: 0.000030 grad_norm: 0.8868 (1.1318) closs: 1.1205 (1.2241) time: 4.0501 data: 0.0002 max mem: 54684 +[01:03:32.368355] Epoch: [0] [400/3229] lr: 0.000031 grad_norm: 0.8768 (1.1270) closs: 1.1205 (1.2212) time: 4.0979 data: 0.0002 max mem: 54684 +[01:04:12.522689] Epoch: [0] [410/3229] lr: 0.000032 grad_norm: 0.8617 (1.1200) closs: 1.1138 (1.2181) time: 4.0805 data: 0.0002 max mem: 54684 +[01:04:53.670656] Epoch: [0] [420/3229] lr: 0.000033 grad_norm: 0.8747 (1.1175) closs: 1.0839 (1.2160) time: 4.0650 data: 0.0002 max mem: 54684 +[01:05:34.517336] Epoch: [0] [430/3229] lr: 0.000033 grad_norm: 0.9140 (1.1135) closs: 1.1140 (1.2136) time: 4.0997 data: 0.0002 max mem: 54684 +[01:06:15.357490] Epoch: [0] [440/3229] lr: 0.000034 grad_norm: 0.9000 (1.1083) closs: 1.1255 (1.2112) time: 4.0843 data: 0.0002 max mem: 54684 +[01:06:56.510385] Epoch: [0] [450/3229] lr: 0.000035 grad_norm: 0.9130 (1.1047) closs: 1.1443 (1.2102) time: 4.0996 data: 0.0002 max mem: 54684 +[01:07:37.338507] Epoch: [0] [460/3229] lr: 0.000036 grad_norm: 0.8889 (1.0996) closs: 1.1569 (1.2089) time: 4.0990 data: 0.0002 max mem: 54684 +[01:08:17.507539] Epoch: [0] [470/3229] lr: 0.000036 grad_norm: 0.8632 (1.0962) closs: 1.1353 (1.2070) time: 4.0498 data: 0.0002 max mem: 54684 +[01:08:58.904572] Epoch: [0] [480/3229] lr: 0.000037 grad_norm: 0.9061 (1.0920) closs: 1.0907 (1.2045) time: 4.0782 data: 0.0002 max mem: 54684 +[01:09:39.746010] Epoch: [0] [490/3229] lr: 0.000038 grad_norm: 0.9314 (1.0890) closs: 1.0907 (1.2028) time: 4.1118 data: 0.0002 max mem: 54684 +[01:10:20.231136] Epoch: [0] [500/3229] lr: 0.000039 grad_norm: 0.9235 (1.0854) closs: 1.1249 (1.2011) time: 4.0663 data: 0.0002 max mem: 54684 +[01:11:01.054523] Epoch: [0] [510/3229] lr: 0.000039 grad_norm: 0.9309 (1.0834) closs: 1.1249 (1.1995) time: 4.0654 data: 0.0002 max mem: 54684 +[01:11:42.308142] Epoch: [0] [520/3229] lr: 0.000040 grad_norm: 0.9541 (1.0806) closs: 1.1226 (1.1972) time: 4.1038 data: 0.0002 max mem: 54684 +[01:12:23.110317] Epoch: [0] [530/3229] lr: 0.000041 grad_norm: 0.9516 (1.0777) closs: 1.1269 (1.1965) time: 4.1027 data: 0.0002 max mem: 54684 +[01:13:03.936397] Epoch: [0] [540/3229] lr: 0.000042 grad_norm: 0.9204 (1.0814) closs: 1.1651 (1.1959) time: 4.0813 data: 0.0002 max mem: 54684 +[01:13:44.208566] Epoch: [0] [550/3229] lr: 0.000043 grad_norm: 0.9204 (1.0785) closs: 1.1192 (1.1931) time: 4.0548 data: 0.0002 max mem: 54684 +[01:14:25.497954] Epoch: [0] [560/3229] lr: 0.000043 grad_norm: 0.9295 (1.0758) closs: 1.0839 (1.1919) time: 4.0780 data: 0.0002 max mem: 54684 +[01:15:05.977736] Epoch: [0] [570/3229] lr: 0.000044 grad_norm: 0.9723 (1.0751) closs: 1.1325 (1.1915) time: 4.0884 data: 0.0002 max mem: 54684 +[01:15:47.138810] Epoch: [0] [580/3229] lr: 0.000045 grad_norm: 0.9485 (1.0728) closs: 1.1764 (1.1910) time: 4.0820 data: 0.0002 max mem: 54684 +[01:16:28.229266] Epoch: [0] [590/3229] lr: 0.000046 grad_norm: 0.9286 (1.0704) closs: 1.1431 (1.1902) time: 4.1125 data: 0.0002 max mem: 54684 +[01:17:08.494066] Epoch: [0] [600/3229] lr: 0.000046 grad_norm: 0.9124 (1.0680) closs: 1.1127 (1.1888) time: 4.0677 data: 0.0002 max mem: 54684 +[01:17:49.333146] Epoch: [0] [610/3229] lr: 0.000047 grad_norm: 0.8772 (1.0649) closs: 1.1387 (1.1877) time: 4.0551 data: 0.0002 max mem: 54684 +[01:18:30.513096] Epoch: [0] [620/3229] lr: 0.000048 grad_norm: 0.8946 (1.0631) closs: 1.1490 (1.1873) time: 4.1009 data: 0.0002 max mem: 54684 +[01:19:12.132382] Epoch: [0] [630/3229] lr: 0.000049 grad_norm: 0.8910 (1.0597) closs: 1.1475 (1.1862) time: 4.1399 data: 0.0002 max mem: 54684 +[01:19:52.876937] Epoch: [0] [640/3229] lr: 0.000050 grad_norm: 0.8884 (1.0579) closs: 1.1173 (1.1843) time: 4.1181 data: 0.0002 max mem: 54684 +[01:20:33.485645] Epoch: [0] [650/3229] lr: 0.000050 grad_norm: 0.8884 (1.0558) closs: 1.0704 (1.1821) time: 4.0676 data: 0.0002 max mem: 54684 +[01:21:14.337520] Epoch: [0] [660/3229] lr: 0.000050 grad_norm: 0.8974 (1.0531) closs: 1.0758 (1.1812) time: 4.0730 data: 0.0002 max mem: 54684 +[01:21:54.945867] Epoch: [0] [670/3229] lr: 0.000050 grad_norm: 0.9121 (1.0517) closs: 1.1193 (1.1797) time: 4.0729 data: 0.0002 max mem: 54684 +[01:22:36.533925] Epoch: [0] [680/3229] lr: 0.000050 grad_norm: 0.9163 (1.0500) closs: 1.1473 (1.1795) time: 4.1097 data: 0.0002 max mem: 54684 +[01:23:17.315950] Epoch: [0] [690/3229] lr: 0.000050 grad_norm: 0.9097 (1.0480) closs: 1.1498 (1.1785) time: 4.1184 data: 0.0002 max mem: 54684 +[01:23:57.791032] Epoch: [0] [700/3229] lr: 0.000050 grad_norm: 0.8856 (1.0454) closs: 1.1322 (1.1770) time: 4.0628 data: 0.0003 max mem: 54684 +[01:24:38.841536] Epoch: [0] [710/3229] lr: 0.000050 grad_norm: 0.8786 (1.0446) closs: 1.1357 (1.1766) time: 4.0762 data: 0.0003 max mem: 54684 +[01:25:19.414277] Epoch: [0] [720/3229] lr: 0.000050 grad_norm: 0.9131 (1.0427) closs: 1.1209 (1.1750) time: 4.0811 data: 0.0002 max mem: 54684 +[01:26:00.560290] Epoch: [0] [730/3229] lr: 0.000050 grad_norm: 0.9133 (1.0408) closs: 1.0723 (1.1737) time: 4.0859 data: 0.0002 max mem: 54684 +[01:26:41.063904] Epoch: [0] [740/3229] lr: 0.000050 grad_norm: 0.8809 (1.0382) closs: 1.0816 (1.1727) time: 4.0824 data: 0.0002 max mem: 54684 +[01:27:22.243420] Epoch: [0] [750/3229] lr: 0.000050 grad_norm: 0.8721 (1.0366) closs: 1.1144 (1.1718) time: 4.0841 data: 0.0002 max mem: 54684 +[01:28:03.061085] Epoch: [0] [760/3229] lr: 0.000050 grad_norm: 0.8563 (1.0340) closs: 1.1227 (1.1713) time: 4.0998 data: 0.0002 max mem: 54684 +[01:28:43.881750] Epoch: [0] [770/3229] lr: 0.000050 grad_norm: 0.8563 (1.0319) closs: 1.1247 (1.1701) time: 4.0818 data: 0.0002 max mem: 54684 +[01:29:24.382823] Epoch: [0] [780/3229] lr: 0.000050 grad_norm: 0.8666 (1.0297) closs: 1.0862 (1.1689) time: 4.0660 data: 0.0003 max mem: 54684 +[01:30:05.263440] Epoch: [0] [790/3229] lr: 0.000050 grad_norm: 0.8803 (1.0278) closs: 1.0773 (1.1675) time: 4.0690 data: 0.0002 max mem: 54684 +[01:30:46.132360] Epoch: [0] [800/3229] lr: 0.000050 grad_norm: 0.8803 (1.0263) closs: 1.0871 (1.1669) time: 4.0874 data: 0.0002 max mem: 54684 +[01:31:26.624450] Epoch: [0] [810/3229] lr: 0.000050 grad_norm: 0.8782 (1.0245) closs: 1.0958 (1.1659) time: 4.0680 data: 0.0002 max mem: 54684 +[01:32:07.126461] Epoch: [0] [820/3229] lr: 0.000050 grad_norm: 0.8762 (1.0225) closs: 1.0958 (1.1649) time: 4.0496 data: 0.0002 max mem: 54684 +[01:32:48.359187] Epoch: [0] [830/3229] lr: 0.000050 grad_norm: 0.8256 (1.0204) closs: 1.0959 (1.1644) time: 4.0867 data: 0.0002 max mem: 54684 +[01:33:29.469866] Epoch: [0] [840/3229] lr: 0.000050 grad_norm: 0.8423 (1.0185) closs: 1.1185 (1.1638) time: 4.1171 data: 0.0002 max mem: 54684 +[01:34:09.970107] Epoch: [0] [850/3229] lr: 0.000050 grad_norm: 0.8891 (1.0170) closs: 1.1185 (1.1627) time: 4.0805 data: 0.0002 max mem: 54684 +[01:34:50.161303] Epoch: [0] [860/3229] lr: 0.000050 grad_norm: 0.8891 (1.0154) closs: 1.0997 (1.1617) time: 4.0345 data: 0.0002 max mem: 54684 +[01:35:31.765849] Epoch: [0] [870/3229] lr: 0.000050 grad_norm: 0.8328 (1.0135) closs: 1.0972 (1.1610) time: 4.0897 data: 0.0002 max mem: 54684 +[01:36:12.904367] Epoch: [0] [880/3229] lr: 0.000050 grad_norm: 0.8507 (1.0124) closs: 1.0972 (1.1607) time: 4.1371 data: 0.0002 max mem: 54684 +[01:36:54.052581] Epoch: [0] [890/3229] lr: 0.000050 grad_norm: 0.8788 (1.0109) closs: 1.1339 (1.1602) time: 4.1143 data: 0.0002 max mem: 54684 +[01:37:34.868212] Epoch: [0] [900/3229] lr: 0.000050 grad_norm: 0.8585 (1.0094) closs: 1.1287 (1.1597) time: 4.0981 data: 0.0002 max mem: 54684 +[01:38:16.113165] Epoch: [0] [910/3229] lr: 0.000050 grad_norm: 0.8376 (1.0077) closs: 1.1211 (1.1589) time: 4.1030 data: 0.0002 max mem: 54684 +[01:38:56.583668] Epoch: [0] [920/3229] lr: 0.000050 grad_norm: 0.8361 (1.0058) closs: 1.0987 (1.1582) time: 4.0857 data: 0.0002 max mem: 54684 +[01:39:37.055401] Epoch: [0] [930/3229] lr: 0.000050 grad_norm: 0.8472 (1.0044) closs: 1.1219 (1.1577) time: 4.0470 data: 0.0002 max mem: 54684 +[01:40:18.207956] Epoch: [0] [940/3229] lr: 0.000050 grad_norm: 0.8701 (1.0031) closs: 1.1219 (1.1571) time: 4.0811 data: 0.0002 max mem: 54684 +[01:40:59.474520] Epoch: [0] [950/3229] lr: 0.000050 grad_norm: 0.8784 (1.0016) closs: 1.1027 (1.1563) time: 4.1209 data: 0.0002 max mem: 54684 +[01:41:40.642522] Epoch: [0] [960/3229] lr: 0.000050 grad_norm: 0.8791 (1.0010) closs: 1.0988 (1.1557) time: 4.1217 data: 0.0002 max mem: 54684 +[01:42:21.476520] Epoch: [0] [970/3229] lr: 0.000050 grad_norm: 0.8580 (0.9997) closs: 1.1107 (1.1550) time: 4.1000 data: 0.0002 max mem: 54684 +[01:43:01.957616] Epoch: [0] [980/3229] lr: 0.000050 grad_norm: 0.8710 (0.9985) closs: 1.1156 (1.1545) time: 4.0657 data: 0.0002 max mem: 54684 +[01:43:43.560118] Epoch: [0] [990/3229] lr: 0.000050 grad_norm: 0.8756 (0.9976) closs: 1.1236 (1.1544) time: 4.1041 data: 0.0002 max mem: 54684 +[01:44:24.011660] Epoch: [0] [1000/3229] lr: 0.000050 grad_norm: 0.8893 (0.9967) closs: 1.1164 (1.1533) time: 4.1026 data: 0.0002 max mem: 54684 +[01:45:04.795094] Epoch: [0] [1010/3229] lr: 0.000050 grad_norm: 0.8719 (0.9951) closs: 1.0624 (1.1522) time: 4.0617 data: 0.0002 max mem: 54684 +[01:45:45.580136] Epoch: [0] [1020/3229] lr: 0.000050 grad_norm: 0.8502 (0.9936) closs: 1.0902 (1.1518) time: 4.0784 data: 0.0002 max mem: 54684 +[01:46:27.178733] Epoch: [0] [1030/3229] lr: 0.000050 grad_norm: 0.8502 (0.9924) closs: 1.1499 (1.1517) time: 4.1191 data: 0.0002 max mem: 54684 +[01:47:07.978058] Epoch: [0] [1040/3229] lr: 0.000050 grad_norm: 0.8731 (0.9909) closs: 1.0968 (1.1510) time: 4.1198 data: 0.0002 max mem: 54684 +[01:47:49.138856] Epoch: [0] [1050/3229] lr: 0.000050 grad_norm: 0.8731 (0.9900) closs: 1.1148 (1.1508) time: 4.0979 data: 0.0002 max mem: 54684 +[01:48:30.427876] Epoch: [0] [1060/3229] lr: 0.000050 grad_norm: 0.8918 (0.9889) closs: 1.1191 (1.1504) time: 4.1224 data: 0.0002 max mem: 54684 +[01:49:10.546105] Epoch: [0] [1070/3229] lr: 0.000050 grad_norm: 0.8288 (0.9871) closs: 1.0949 (1.1494) time: 4.0703 data: 0.0002 max mem: 54684 +[01:49:51.703975] Epoch: [0] [1080/3229] lr: 0.000050 grad_norm: 0.8567 (0.9861) closs: 1.1071 (1.1494) time: 4.0637 data: 0.0002 max mem: 54684 +[01:50:32.529173] Epoch: [0] [1090/3229] lr: 0.000050 grad_norm: 0.8706 (0.9851) closs: 1.1351 (1.1491) time: 4.0991 data: 0.0002 max mem: 54684 +[01:51:12.582431] Epoch: [0] [1100/3229] lr: 0.000050 grad_norm: 0.8475 (0.9835) closs: 1.0957 (1.1482) time: 4.0438 data: 0.0002 max mem: 54684 +[01:51:53.016781] Epoch: [0] [1110/3229] lr: 0.000050 grad_norm: 0.8386 (0.9821) closs: 1.0904 (1.1474) time: 4.0243 data: 0.0002 max mem: 54684 +[01:52:33.815856] Epoch: [0] [1120/3229] lr: 0.000050 grad_norm: 0.8762 (0.9811) closs: 1.1067 (1.1470) time: 4.0616 data: 0.0002 max mem: 54684 +[01:53:14.619418] Epoch: [0] [1130/3229] lr: 0.000050 grad_norm: 0.8767 (0.9802) closs: 1.0902 (1.1461) time: 4.0801 data: 0.0002 max mem: 54684 +[01:53:55.215391] Epoch: [0] [1140/3229] lr: 0.000050 grad_norm: 0.8585 (0.9789) closs: 1.0626 (1.1455) time: 4.0699 data: 0.0002 max mem: 54684 +[01:54:36.057646] Epoch: [0] [1150/3229] lr: 0.000050 grad_norm: 0.8274 (0.9774) closs: 1.0626 (1.1444) time: 4.0718 data: 0.0002 max mem: 54684 +[01:55:16.560302] Epoch: [0] [1160/3229] lr: 0.000050 grad_norm: 0.8237 (0.9766) closs: 1.0592 (1.1436) time: 4.0672 data: 0.0002 max mem: 54684 +[01:55:57.057967] Epoch: [0] [1170/3229] lr: 0.000050 grad_norm: 0.8557 (0.9756) closs: 1.0592 (1.1425) time: 4.0499 data: 0.0002 max mem: 54684 +[01:56:37.362333] Epoch: [0] [1180/3229] lr: 0.000050 grad_norm: 0.8549 (0.9744) closs: 1.0497 (1.1417) time: 4.0400 data: 0.0002 max mem: 54684 +[01:57:18.113945] Epoch: [0] [1190/3229] lr: 0.000050 grad_norm: 0.8588 (0.9733) closs: 1.0712 (1.1410) time: 4.0527 data: 0.0002 max mem: 54684 +[01:57:58.946014] Epoch: [0] [1200/3229] lr: 0.000050 grad_norm: 0.8545 (0.9721) closs: 1.1016 (1.1406) time: 4.0791 data: 0.0002 max mem: 54684 +[01:58:39.470964] Epoch: [0] [1210/3229] lr: 0.000050 grad_norm: 0.8155 (0.9705) closs: 1.0832 (1.1398) time: 4.0678 data: 0.0002 max mem: 54684 +[01:59:20.231476] Epoch: [0] [1220/3229] lr: 0.000050 grad_norm: 0.8030 (0.9693) closs: 1.0567 (1.1392) time: 4.0642 data: 0.0002 max mem: 54684 +[02:00:01.565697] Epoch: [0] [1230/3229] lr: 0.000050 grad_norm: 0.8324 (0.9682) closs: 1.1310 (1.1394) time: 4.1047 data: 0.0002 max mem: 54684 +[02:00:42.411163] Epoch: [0] [1240/3229] lr: 0.000050 grad_norm: 0.8455 (0.9676) closs: 1.1277 (1.1390) time: 4.1089 data: 0.0002 max mem: 54684 +[02:01:22.913104] Epoch: [0] [1250/3229] lr: 0.000050 grad_norm: 0.8509 (0.9664) closs: 1.0848 (1.1386) time: 4.0673 data: 0.0002 max mem: 54684 +[02:02:03.681895] Epoch: [0] [1260/3229] lr: 0.000049 grad_norm: 0.8150 (0.9655) closs: 1.0657 (1.1378) time: 4.0635 data: 0.0002 max mem: 54684 +[02:02:44.382490] Epoch: [0] [1270/3229] lr: 0.000049 grad_norm: 0.8150 (0.9645) closs: 1.0536 (1.1371) time: 4.0734 data: 0.0002 max mem: 54684 +[02:03:25.548675] Epoch: [0] [1280/3229] lr: 0.000049 grad_norm: 0.8142 (0.9634) closs: 1.0764 (1.1366) time: 4.0933 data: 0.0002 max mem: 54684 +[02:04:05.764704] Epoch: [0] [1290/3229] lr: 0.000049 grad_norm: 0.8104 (0.9624) closs: 1.0764 (1.1362) time: 4.0690 data: 0.0002 max mem: 54684 +[02:04:46.308118] Epoch: [0] [1300/3229] lr: 0.000049 grad_norm: 0.8170 (0.9614) closs: 1.0782 (1.1357) time: 4.0379 data: 0.0002 max mem: 54684 +[02:05:27.268012] Epoch: [0] [1310/3229] lr: 0.000049 grad_norm: 0.8101 (0.9602) closs: 1.0872 (1.1352) time: 4.0751 data: 0.0002 max mem: 54684 +[02:06:07.751784] Epoch: [0] [1320/3229] lr: 0.000049 grad_norm: 0.8024 (0.9591) closs: 1.0757 (1.1345) time: 4.0721 data: 0.0002 max mem: 54684 +[02:06:48.603185] Epoch: [0] [1330/3229] lr: 0.000049 grad_norm: 0.8059 (0.9579) closs: 1.0757 (1.1343) time: 4.0667 data: 0.0002 max mem: 54684 +[02:07:29.989007] Epoch: [0] [1340/3229] lr: 0.000049 grad_norm: 0.8320 (0.9575) closs: 1.1105 (1.1342) time: 4.1118 data: 0.0002 max mem: 54684 +[02:08:10.751898] Epoch: [0] [1350/3229] lr: 0.000049 grad_norm: 0.8769 (0.9566) closs: 1.0763 (1.1338) time: 4.1074 data: 0.0002 max mem: 54684 +[02:08:51.277537] Epoch: [0] [1360/3229] lr: 0.000049 grad_norm: 0.8228 (0.9556) closs: 1.0611 (1.1332) time: 4.0644 data: 0.0002 max mem: 54684 +[02:09:32.452759] Epoch: [0] [1370/3229] lr: 0.000049 grad_norm: 0.8208 (0.9549) closs: 1.0753 (1.1329) time: 4.0850 data: 0.0002 max mem: 54684 +[02:10:13.889635] Epoch: [0] [1380/3229] lr: 0.000049 grad_norm: 0.8204 (0.9543) closs: 1.0725 (1.1325) time: 4.1305 data: 0.0002 max mem: 54684 +[02:10:54.494775] Epoch: [0] [1390/3229] lr: 0.000049 grad_norm: 0.8187 (0.9533) closs: 1.0725 (1.1321) time: 4.1020 data: 0.0002 max mem: 54684 +[02:11:35.317556] Epoch: [0] [1400/3229] lr: 0.000049 grad_norm: 0.8187 (0.9522) closs: 1.0902 (1.1317) time: 4.0713 data: 0.0002 max mem: 54684 +[02:12:16.125134] Epoch: [0] [1410/3229] lr: 0.000049 grad_norm: 0.7985 (0.9512) closs: 1.0906 (1.1314) time: 4.0814 data: 0.0002 max mem: 54684 +[02:12:57.277533] Epoch: [0] [1420/3229] lr: 0.000049 grad_norm: 0.8158 (0.9501) closs: 1.1104 (1.1314) time: 4.0979 data: 0.0002 max mem: 54684 +[02:13:37.904443] Epoch: [0] [1430/3229] lr: 0.000049 grad_norm: 0.8008 (0.9488) closs: 1.0989 (1.1309) time: 4.0889 data: 0.0002 max mem: 54684 +[02:14:18.743491] Epoch: [0] [1440/3229] lr: 0.000049 grad_norm: 0.8008 (0.9479) closs: 1.0963 (1.1307) time: 4.0732 data: 0.0002 max mem: 54684 +[02:14:59.571598] Epoch: [0] [1450/3229] lr: 0.000049 grad_norm: 0.8153 (0.9467) closs: 1.1322 (1.1308) time: 4.0833 data: 0.0002 max mem: 54684 +[02:15:40.772886] Epoch: [0] [1460/3229] lr: 0.000049 grad_norm: 0.8315 (0.9465) closs: 1.1439 (1.1306) time: 4.1014 data: 0.0002 max mem: 54684 +[02:16:21.736547] Epoch: [0] [1470/3229] lr: 0.000049 grad_norm: 0.8342 (0.9457) closs: 1.0850 (1.1301) time: 4.1082 data: 0.0002 max mem: 54684 +[02:17:02.910967] Epoch: [0] [1480/3229] lr: 0.000049 grad_norm: 0.7991 (0.9446) closs: 1.0827 (1.1298) time: 4.1068 data: 0.0002 max mem: 54684 +[02:17:44.083251] Epoch: [0] [1490/3229] lr: 0.000049 grad_norm: 0.7991 (0.9442) closs: 1.1022 (1.1297) time: 4.1173 data: 0.0002 max mem: 54684 +[02:18:26.042677] Epoch: [0] [1500/3229] lr: 0.000049 grad_norm: 0.8720 (0.9438) closs: 1.1096 (1.1297) time: 4.1565 data: 0.0002 max mem: 54684 +[02:19:06.874119] Epoch: [0] [1510/3229] lr: 0.000049 grad_norm: 0.8466 (0.9430) closs: 1.0920 (1.1293) time: 4.1395 data: 0.0002 max mem: 54684 +[02:19:48.004002] Epoch: [0] [1520/3229] lr: 0.000049 grad_norm: 0.8431 (0.9425) closs: 1.0898 (1.1291) time: 4.0980 data: 0.0002 max mem: 54684 +[02:20:29.271406] Epoch: [0] [1530/3229] lr: 0.000049 grad_norm: 0.8229 (0.9419) closs: 1.0907 (1.1289) time: 4.1198 data: 0.0002 max mem: 54684 +[02:21:10.763947] Epoch: [0] [1540/3229] lr: 0.000049 grad_norm: 0.8301 (0.9413) closs: 1.1024 (1.1285) time: 4.1379 data: 0.0002 max mem: 54684 +[02:21:50.930108] Epoch: [0] [1550/3229] lr: 0.000049 grad_norm: 0.8301 (0.9403) closs: 1.0550 (1.1277) time: 4.0829 data: 0.0002 max mem: 54684 +[02:22:31.739856] Epoch: [0] [1560/3229] lr: 0.000049 grad_norm: 0.7819 (0.9392) closs: 1.0473 (1.1272) time: 4.0487 data: 0.0002 max mem: 54684 +[02:23:12.906228] Epoch: [0] [1570/3229] lr: 0.000049 grad_norm: 0.7600 (0.9384) closs: 1.0405 (1.1268) time: 4.0987 data: 0.0002 max mem: 54684 +[02:23:54.080302] Epoch: [0] [1580/3229] lr: 0.000049 grad_norm: 0.8070 (0.9376) closs: 1.0689 (1.1264) time: 4.1169 data: 0.0002 max mem: 54684 +[02:24:34.900214] Epoch: [0] [1590/3229] lr: 0.000049 grad_norm: 0.8137 (0.9367) closs: 1.0689 (1.1259) time: 4.0996 data: 0.0002 max mem: 54684 +[02:25:14.745725] Epoch: [0] [1600/3229] lr: 0.000049 grad_norm: 0.7806 (0.9354) closs: 1.0465 (1.1251) time: 4.0332 data: 0.0002 max mem: 54684 +[02:25:55.704023] Epoch: [0] [1610/3229] lr: 0.000049 grad_norm: 0.8074 (0.9348) closs: 1.0935 (1.1250) time: 4.0401 data: 0.0002 max mem: 54684 +[02:26:37.197949] Epoch: [0] [1620/3229] lr: 0.000049 grad_norm: 0.8162 (0.9342) closs: 1.1148 (1.1250) time: 4.1225 data: 0.0002 max mem: 54684 +[02:27:18.326238] Epoch: [0] [1630/3229] lr: 0.000049 grad_norm: 0.8162 (0.9335) closs: 1.0970 (1.1247) time: 4.1310 data: 0.0002 max mem: 54684 +[02:27:59.451089] Epoch: [0] [1640/3229] lr: 0.000049 grad_norm: 0.8227 (0.9328) closs: 1.1007 (1.1248) time: 4.1126 data: 0.0002 max mem: 54684 +[02:28:40.359589] Epoch: [0] [1650/3229] lr: 0.000049 grad_norm: 0.8227 (0.9321) closs: 1.0987 (1.1244) time: 4.1016 data: 0.0002 max mem: 54684 +[02:29:21.499384] Epoch: [0] [1660/3229] lr: 0.000049 grad_norm: 0.7906 (0.9314) closs: 1.0815 (1.1243) time: 4.1023 data: 0.0002 max mem: 54684 +[02:30:01.961500] Epoch: [0] [1670/3229] lr: 0.000049 grad_norm: 0.7906 (0.9306) closs: 1.1051 (1.1242) time: 4.0800 data: 0.0002 max mem: 54684 +[02:30:43.093943] Epoch: [0] [1680/3229] lr: 0.000049 grad_norm: 0.7620 (0.9296) closs: 1.0947 (1.1241) time: 4.0797 data: 0.0002 max mem: 54684 +[02:31:23.378880] Epoch: [0] [1690/3229] lr: 0.000049 grad_norm: 0.7389 (0.9285) closs: 1.0770 (1.1237) time: 4.0708 data: 0.0002 max mem: 54684 +[02:32:04.494682] Epoch: [0] [1700/3229] lr: 0.000049 grad_norm: 0.7654 (0.9278) closs: 1.0655 (1.1234) time: 4.0700 data: 0.0002 max mem: 54684 +[02:32:44.682897] Epoch: [0] [1710/3229] lr: 0.000048 grad_norm: 0.7788 (0.9267) closs: 1.0655 (1.1229) time: 4.0651 data: 0.0002 max mem: 54684 +[02:33:25.176480] Epoch: [0] [1720/3229] lr: 0.000048 grad_norm: 0.7793 (0.9260) closs: 1.0609 (1.1226) time: 4.0340 data: 0.0002 max mem: 54684 +[02:34:06.615615] Epoch: [0] [1730/3229] lr: 0.000048 grad_norm: 0.7983 (0.9254) closs: 1.0603 (1.1224) time: 4.0966 data: 0.0002 max mem: 54684 +[02:34:47.629024] Epoch: [0] [1740/3229] lr: 0.000048 grad_norm: 0.7874 (0.9245) closs: 1.0761 (1.1222) time: 4.1226 data: 0.0002 max mem: 54684 +[02:35:28.439728] Epoch: [0] [1750/3229] lr: 0.000048 grad_norm: 0.8113 (0.9241) closs: 1.0861 (1.1220) time: 4.0911 data: 0.0002 max mem: 54684 +[02:36:09.569880] Epoch: [0] [1760/3229] lr: 0.000048 grad_norm: 0.8314 (0.9236) closs: 1.1081 (1.1218) time: 4.0970 data: 0.0002 max mem: 54684 +[02:36:50.672861] Epoch: [0] [1770/3229] lr: 0.000048 grad_norm: 0.8306 (0.9229) closs: 1.0888 (1.1214) time: 4.1116 data: 0.0002 max mem: 54684 +[02:37:31.667284] Epoch: [0] [1780/3229] lr: 0.000048 grad_norm: 0.8092 (0.9221) closs: 1.0832 (1.1211) time: 4.1048 data: 0.0002 max mem: 54684 +[02:38:12.473138] Epoch: [0] [1790/3229] lr: 0.000048 grad_norm: 0.7678 (0.9212) closs: 1.0395 (1.1207) time: 4.0899 data: 0.0002 max mem: 54684 +[02:38:52.942058] Epoch: [0] [1800/3229] lr: 0.000048 grad_norm: 0.7929 (0.9204) closs: 1.0356 (1.1204) time: 4.0637 data: 0.0002 max mem: 54684 +[02:39:33.666135] Epoch: [0] [1810/3229] lr: 0.000048 grad_norm: 0.7905 (0.9195) closs: 1.0807 (1.1202) time: 4.0596 data: 0.0002 max mem: 54684 +[02:40:14.381280] Epoch: [0] [1820/3229] lr: 0.000048 grad_norm: 0.7724 (0.9187) closs: 1.1055 (1.1200) time: 4.0719 data: 0.0002 max mem: 54684 +[02:40:55.197221] Epoch: [0] [1830/3229] lr: 0.000048 grad_norm: 0.7724 (0.9180) closs: 1.0897 (1.1199) time: 4.0765 data: 0.0002 max mem: 54684 +[02:41:35.043279] Epoch: [0] [1840/3229] lr: 0.000048 grad_norm: 0.7413 (0.9168) closs: 1.0573 (1.1195) time: 4.0330 data: 0.0002 max mem: 54684 +[02:42:16.132889] Epoch: [0] [1850/3229] lr: 0.000048 grad_norm: 0.7712 (0.9161) closs: 1.0468 (1.1191) time: 4.0467 data: 0.0002 max mem: 54684 +[02:42:56.518513] Epoch: [0] [1860/3229] lr: 0.000048 grad_norm: 0.7990 (0.9154) closs: 1.0714 (1.1188) time: 4.0737 data: 0.0002 max mem: 54684 +[02:43:37.655055] Epoch: [0] [1870/3229] lr: 0.000048 grad_norm: 0.8236 (0.9150) closs: 1.1020 (1.1187) time: 4.0760 data: 0.0002 max mem: 54684 +[02:44:18.501800] Epoch: [0] [1880/3229] lr: 0.000048 grad_norm: 0.8055 (0.9143) closs: 1.0773 (1.1183) time: 4.0991 data: 0.0002 max mem: 54684 +[02:44:59.327626] Epoch: [0] [1890/3229] lr: 0.000048 grad_norm: 0.7469 (0.9135) closs: 1.0612 (1.1180) time: 4.0836 data: 0.0002 max mem: 54684 +[02:45:40.957982] Epoch: [0] [1900/3229] lr: 0.000048 grad_norm: 0.8118 (0.9132) closs: 1.0947 (1.1179) time: 4.1227 data: 0.0002 max mem: 54684 +[02:46:21.456520] Epoch: [0] [1910/3229] lr: 0.000048 grad_norm: 0.8009 (0.9124) closs: 1.0516 (1.1173) time: 4.1064 data: 0.0002 max mem: 54684 +[02:47:01.609291] Epoch: [0] [1920/3229] lr: 0.000048 grad_norm: 0.8009 (0.9121) closs: 1.0265 (1.1168) time: 4.0325 data: 0.0002 max mem: 54684 +[02:47:43.618438] Epoch: [0] [1930/3229] lr: 0.000048 grad_norm: 0.8074 (0.9115) closs: 1.0395 (1.1167) time: 4.1080 data: 0.0002 max mem: 54684 +[02:48:24.870145] Epoch: [0] [1940/3229] lr: 0.000048 grad_norm: 0.7995 (0.9110) closs: 1.1143 (1.1167) time: 4.1630 data: 0.0002 max mem: 54684 +[02:49:06.019794] Epoch: [0] [1950/3229] lr: 0.000048 grad_norm: 0.8393 (0.9109) closs: 1.1143 (1.1166) time: 4.1200 data: 0.0001 max mem: 54684 +[02:49:46.491553] Epoch: [0] [1960/3229] lr: 0.000048 grad_norm: 0.8373 (0.9102) closs: 1.0866 (1.1165) time: 4.0810 data: 0.0002 max mem: 54684 +[02:50:27.704222] Epoch: [0] [1970/3229] lr: 0.000048 grad_norm: 0.8284 (0.9099) closs: 1.0866 (1.1163) time: 4.0841 data: 0.0002 max mem: 54684 +[02:51:08.271766] Epoch: [0] [1980/3229] lr: 0.000048 grad_norm: 0.7955 (0.9092) closs: 1.0799 (1.1161) time: 4.0890 data: 0.0002 max mem: 54684 +[02:51:49.381940] Epoch: [0] [1990/3229] lr: 0.000048 grad_norm: 0.7955 (0.9090) closs: 1.0799 (1.1159) time: 4.0838 data: 0.0002 max mem: 54684 +[02:52:29.511376] Epoch: [0] [2000/3229] lr: 0.000048 grad_norm: 0.8154 (0.9083) closs: 1.0683 (1.1154) time: 4.0619 data: 0.0002 max mem: 54684 +[02:53:10.897602] Epoch: [0] [2010/3229] lr: 0.000048 grad_norm: 0.8103 (0.9078) closs: 1.0683 (1.1152) time: 4.0757 data: 0.0002 max mem: 54684 +[02:53:51.562695] Epoch: [0] [2020/3229] lr: 0.000047 grad_norm: 0.8103 (0.9072) closs: 1.0827 (1.1151) time: 4.1025 data: 0.0002 max mem: 54684 +[02:54:31.690739] Epoch: [0] [2030/3229] lr: 0.000047 grad_norm: 0.7805 (0.9065) closs: 1.0595 (1.1145) time: 4.0396 data: 0.0002 max mem: 54684 +[02:55:12.459763] Epoch: [0] [2040/3229] lr: 0.000047 grad_norm: 0.7728 (0.9058) closs: 1.0613 (1.1143) time: 4.0448 data: 0.0002 max mem: 54684 +[02:55:54.276314] Epoch: [0] [2050/3229] lr: 0.000047 grad_norm: 0.7637 (0.9051) closs: 1.1000 (1.1144) time: 4.1292 data: 0.0002 max mem: 54684 +[02:56:35.163977] Epoch: [0] [2060/3229] lr: 0.000047 grad_norm: 0.7644 (0.9045) closs: 1.1134 (1.1143) time: 4.1351 data: 0.0002 max mem: 54684 +[02:57:15.269544] Epoch: [0] [2070/3229] lr: 0.000047 grad_norm: 0.7892 (0.9039) closs: 1.0588 (1.1138) time: 4.0496 data: 0.0002 max mem: 54684 +[02:57:56.040287] Epoch: [0] [2080/3229] lr: 0.000047 grad_norm: 0.8042 (0.9033) closs: 1.0447 (1.1136) time: 4.0438 data: 0.0002 max mem: 54684 +[02:58:36.828734] Epoch: [0] [2090/3229] lr: 0.000047 grad_norm: 0.7867 (0.9027) closs: 1.0914 (1.1133) time: 4.0779 data: 0.0002 max mem: 54684 +[02:59:17.056084] Epoch: [0] [2100/3229] lr: 0.000047 grad_norm: 0.7766 (0.9021) closs: 1.0796 (1.1131) time: 4.0507 data: 0.0002 max mem: 54684 +[02:59:57.850948] Epoch: [0] [2110/3229] lr: 0.000047 grad_norm: 0.7773 (0.9016) closs: 1.0748 (1.1129) time: 4.0510 data: 0.0002 max mem: 54684 +[03:00:38.445171] Epoch: [0] [2120/3229] lr: 0.000047 grad_norm: 0.8051 (0.9012) closs: 1.0689 (1.1126) time: 4.0694 data: 0.0002 max mem: 54684 +[03:01:19.387745] Epoch: [0] [2130/3229] lr: 0.000047 grad_norm: 0.8057 (0.9006) closs: 1.0761 (1.1126) time: 4.0768 data: 0.0002 max mem: 54684 +[03:01:59.964804] Epoch: [0] [2140/3229] lr: 0.000047 grad_norm: 0.8057 (0.9000) closs: 1.1012 (1.1124) time: 4.0759 data: 0.0002 max mem: 54684 +[03:02:40.770931] Epoch: [0] [2150/3229] lr: 0.000047 grad_norm: 0.7907 (0.8995) closs: 1.0643 (1.1121) time: 4.0691 data: 0.0002 max mem: 54684 +[03:03:21.686342] Epoch: [0] [2160/3229] lr: 0.000047 grad_norm: 0.7718 (0.8990) closs: 1.0844 (1.1122) time: 4.0860 data: 0.0002 max mem: 54684 +[03:04:02.991052] Epoch: [0] [2170/3229] lr: 0.000047 grad_norm: 0.7893 (0.8986) closs: 1.1509 (1.1124) time: 4.1109 data: 0.0002 max mem: 54684 +[03:04:44.216654] Epoch: [0] [2180/3229] lr: 0.000047 grad_norm: 0.8023 (0.8981) closs: 1.0995 (1.1123) time: 4.1264 data: 0.0002 max mem: 54684 +[03:05:24.677881] Epoch: [0] [2190/3229] lr: 0.000047 grad_norm: 0.8062 (0.8976) closs: 1.0876 (1.1122) time: 4.0843 data: 0.0002 max mem: 54684 +[03:06:05.666908] Epoch: [0] [2200/3229] lr: 0.000047 grad_norm: 0.8079 (0.8971) closs: 1.0597 (1.1120) time: 4.0724 data: 0.0002 max mem: 54684 +[03:06:46.537094] Epoch: [0] [2210/3229] lr: 0.000047 grad_norm: 0.8028 (0.8967) closs: 1.0597 (1.1120) time: 4.0929 data: 0.0002 max mem: 54684 +[03:07:27.457503] Epoch: [0] [2220/3229] lr: 0.000047 grad_norm: 0.8028 (0.8962) closs: 1.0845 (1.1117) time: 4.0895 data: 0.0002 max mem: 54684 +[03:08:08.278999] Epoch: [0] [2230/3229] lr: 0.000047 grad_norm: 0.7934 (0.8957) closs: 1.0507 (1.1114) time: 4.0870 data: 0.0002 max mem: 54684 +[03:08:48.891856] Epoch: [0] [2240/3229] lr: 0.000047 grad_norm: 0.7621 (0.8951) closs: 1.0717 (1.1111) time: 4.0717 data: 0.0002 max mem: 54684 +[03:09:29.597430] Epoch: [0] [2250/3229] lr: 0.000047 grad_norm: 0.7756 (0.8946) closs: 1.0717 (1.1110) time: 4.0659 data: 0.0002 max mem: 54684 +[03:10:10.515536] Epoch: [0] [2260/3229] lr: 0.000047 grad_norm: 0.7727 (0.8940) closs: 1.0694 (1.1108) time: 4.0811 data: 0.0002 max mem: 54684 +[03:10:51.637059] Epoch: [0] [2270/3229] lr: 0.000047 grad_norm: 0.7727 (0.8936) closs: 1.0694 (1.1106) time: 4.1019 data: 0.0002 max mem: 54684 +[03:11:32.595121] Epoch: [0] [2280/3229] lr: 0.000046 grad_norm: 0.7681 (0.8929) closs: 1.0451 (1.1103) time: 4.1039 data: 0.0002 max mem: 54684 +[03:12:13.624396] Epoch: [0] [2290/3229] lr: 0.000046 grad_norm: 0.7482 (0.8924) closs: 1.0431 (1.1103) time: 4.0993 data: 0.0002 max mem: 54684 +[03:12:54.747566] Epoch: [0] [2300/3229] lr: 0.000046 grad_norm: 0.7703 (0.8920) closs: 1.1258 (1.1103) time: 4.1076 data: 0.0002 max mem: 54684 +[03:13:34.899825] Epoch: [0] [2310/3229] lr: 0.000046 grad_norm: 0.7731 (0.8914) closs: 1.0817 (1.1101) time: 4.0637 data: 0.0002 max mem: 54684 +[03:14:15.886367] Epoch: [0] [2320/3229] lr: 0.000046 grad_norm: 0.8009 (0.8911) closs: 1.0688 (1.1099) time: 4.0569 data: 0.0002 max mem: 54684 +[03:14:56.926589] Epoch: [0] [2330/3229] lr: 0.000046 grad_norm: 0.7560 (0.8905) closs: 1.0458 (1.1094) time: 4.1013 data: 0.0002 max mem: 54684 +[03:15:37.678425] Epoch: [0] [2340/3229] lr: 0.000046 grad_norm: 0.7591 (0.8902) closs: 1.0475 (1.1094) time: 4.0895 data: 0.0002 max mem: 54684 +[03:16:18.128561] Epoch: [0] [2350/3229] lr: 0.000046 grad_norm: 0.8028 (0.8896) closs: 1.0808 (1.1091) time: 4.0600 data: 0.0002 max mem: 54684 +[03:16:59.115639] Epoch: [0] [2360/3229] lr: 0.000046 grad_norm: 0.7890 (0.8892) closs: 1.0613 (1.1090) time: 4.0718 data: 0.0002 max mem: 54684 +[03:17:39.826484] Epoch: [0] [2370/3229] lr: 0.000046 grad_norm: 0.7886 (0.8888) closs: 1.0808 (1.1091) time: 4.0848 data: 0.0002 max mem: 54684 +[03:18:20.921523] Epoch: [0] [2380/3229] lr: 0.000046 grad_norm: 0.7769 (0.8883) closs: 1.1109 (1.1091) time: 4.0902 data: 0.0002 max mem: 54684 +[03:19:01.354222] Epoch: [0] [2390/3229] lr: 0.000046 grad_norm: 0.7578 (0.8877) closs: 1.0823 (1.1089) time: 4.0763 data: 0.0002 max mem: 54684 +[03:19:42.341834] Epoch: [0] [2400/3229] lr: 0.000046 grad_norm: 0.7938 (0.8873) closs: 1.0722 (1.1086) time: 4.0710 data: 0.0002 max mem: 54684 +[03:20:23.981102] Epoch: [0] [2410/3229] lr: 0.000046 grad_norm: 0.8303 (0.8871) closs: 1.0663 (1.1085) time: 4.1313 data: 0.0002 max mem: 54684 +[03:21:04.723017] Epoch: [0] [2420/3229] lr: 0.000046 grad_norm: 0.8142 (0.8867) closs: 1.0663 (1.1083) time: 4.1190 data: 0.0002 max mem: 54684 +[03:21:45.161994] Epoch: [0] [2430/3229] lr: 0.000046 grad_norm: 0.7785 (0.8863) closs: 1.0653 (1.1080) time: 4.0590 data: 0.0002 max mem: 54684 +[03:22:26.409460] Epoch: [0] [2440/3229] lr: 0.000046 grad_norm: 0.7811 (0.8859) closs: 1.0728 (1.1080) time: 4.0843 data: 0.0002 max mem: 54684 +[03:23:07.737120] Epoch: [0] [2450/3229] lr: 0.000046 grad_norm: 0.7773 (0.8855) closs: 1.1207 (1.1079) time: 4.1287 data: 0.0002 max mem: 54684 +[03:23:48.818206] Epoch: [0] [2460/3229] lr: 0.000046 grad_norm: 0.7819 (0.8853) closs: 1.0838 (1.1078) time: 4.1204 data: 0.0002 max mem: 54684 +[03:24:29.929209] Epoch: [0] [2470/3229] lr: 0.000046 grad_norm: 0.8072 (0.8851) closs: 1.0518 (1.1075) time: 4.1095 data: 0.0002 max mem: 54684 +[03:25:10.886008] Epoch: [0] [2480/3229] lr: 0.000046 grad_norm: 0.7804 (0.8846) closs: 1.0573 (1.1073) time: 4.1033 data: 0.0002 max mem: 54684 +[03:25:52.557958] Epoch: [0] [2490/3229] lr: 0.000046 grad_norm: 0.7737 (0.8845) closs: 1.0763 (1.1072) time: 4.1314 data: 0.0002 max mem: 54684 +[03:26:33.671349] Epoch: [0] [2500/3229] lr: 0.000045 grad_norm: 0.7789 (0.8840) closs: 1.1176 (1.1073) time: 4.1392 data: 0.0002 max mem: 54684 +[03:27:14.135334] Epoch: [0] [2510/3229] lr: 0.000045 grad_norm: 0.7701 (0.8835) closs: 1.0906 (1.1070) time: 4.0788 data: 0.0002 max mem: 54684 +[03:27:55.432584] Epoch: [0] [2520/3229] lr: 0.000045 grad_norm: 0.7708 (0.8832) closs: 1.0723 (1.1069) time: 4.0880 data: 0.0002 max mem: 54684 +[03:28:36.466155] Epoch: [0] [2530/3229] lr: 0.000045 grad_norm: 0.7946 (0.8827) closs: 1.0810 (1.1069) time: 4.1165 data: 0.0002 max mem: 54684 +[03:29:17.239455] Epoch: [0] [2540/3229] lr: 0.000045 grad_norm: 0.7615 (0.8823) closs: 1.0746 (1.1067) time: 4.0903 data: 0.0002 max mem: 54684 +[03:29:58.019807] Epoch: [0] [2550/3229] lr: 0.000045 grad_norm: 0.7483 (0.8819) closs: 1.0516 (1.1065) time: 4.0776 data: 0.0002 max mem: 54684 +[03:30:39.329898] Epoch: [0] [2560/3229] lr: 0.000045 grad_norm: 0.7708 (0.8816) closs: 1.0516 (1.1063) time: 4.1045 data: 0.0002 max mem: 54684 +[03:31:20.317679] Epoch: [0] [2570/3229] lr: 0.000045 grad_norm: 0.7788 (0.8812) closs: 1.0652 (1.1062) time: 4.1148 data: 0.0002 max mem: 54684 +[03:32:01.406782] Epoch: [0] [2580/3229] lr: 0.000045 grad_norm: 0.7574 (0.8808) closs: 1.0688 (1.1062) time: 4.1038 data: 0.0002 max mem: 54684 +[03:32:41.538400] Epoch: [0] [2590/3229] lr: 0.000045 grad_norm: 0.7515 (0.8802) closs: 1.0843 (1.1059) time: 4.0610 data: 0.0002 max mem: 54684 +[03:33:22.786415] Epoch: [0] [2600/3229] lr: 0.000045 grad_norm: 0.8014 (0.8799) closs: 1.0804 (1.1058) time: 4.0689 data: 0.0002 max mem: 54684 +[03:34:03.508058] Epoch: [0] [2610/3229] lr: 0.000045 grad_norm: 0.8014 (0.8794) closs: 1.0795 (1.1055) time: 4.0984 data: 0.0002 max mem: 54684 +[03:34:43.958992] Epoch: [0] [2620/3229] lr: 0.000045 grad_norm: 0.7646 (0.8789) closs: 1.0806 (1.1054) time: 4.0586 data: 0.0002 max mem: 54684 +[03:35:25.187896] Epoch: [0] [2630/3229] lr: 0.000045 grad_norm: 0.7696 (0.8785) closs: 1.1043 (1.1055) time: 4.0839 data: 0.0002 max mem: 54684 +[03:36:06.087296] Epoch: [0] [2640/3229] lr: 0.000045 grad_norm: 0.8024 (0.8782) closs: 1.1064 (1.1054) time: 4.1063 data: 0.0002 max mem: 54684 +[03:36:46.644106] Epoch: [0] [2650/3229] lr: 0.000045 grad_norm: 0.7676 (0.8777) closs: 1.0427 (1.1051) time: 4.0727 data: 0.0002 max mem: 54684 +[03:37:27.758262] Epoch: [0] [2660/3229] lr: 0.000045 grad_norm: 0.7810 (0.8776) closs: 1.0405 (1.1049) time: 4.0835 data: 0.0002 max mem: 54684 +[03:38:08.680275] Epoch: [0] [2670/3229] lr: 0.000045 grad_norm: 0.8024 (0.8774) closs: 1.0546 (1.1047) time: 4.1017 data: 0.0002 max mem: 54684 +[03:38:49.498507] Epoch: [0] [2680/3229] lr: 0.000045 grad_norm: 0.7959 (0.8770) closs: 1.0579 (1.1046) time: 4.0869 data: 0.0002 max mem: 54684 +[03:39:30.230437] Epoch: [0] [2690/3229] lr: 0.000045 grad_norm: 0.7603 (0.8766) closs: 1.0688 (1.1044) time: 4.0774 data: 0.0002 max mem: 54684 +[03:40:10.699217] Epoch: [0] [2700/3229] lr: 0.000045 grad_norm: 0.7214 (0.8762) closs: 1.0769 (1.1043) time: 4.0600 data: 0.0002 max mem: 54684 +[03:40:51.627601] Epoch: [0] [2710/3229] lr: 0.000044 grad_norm: 0.7282 (0.8757) closs: 1.0952 (1.1042) time: 4.0698 data: 0.0002 max mem: 54684 +[03:41:32.962290] Epoch: [0] [2720/3229] lr: 0.000044 grad_norm: 0.7610 (0.8755) closs: 1.0952 (1.1042) time: 4.1131 data: 0.0002 max mem: 54684 +[03:42:13.881942] Epoch: [0] [2730/3229] lr: 0.000044 grad_norm: 0.7852 (0.8752) closs: 1.1198 (1.1041) time: 4.1127 data: 0.0002 max mem: 54684 +[03:42:54.354106] Epoch: [0] [2740/3229] lr: 0.000044 grad_norm: 0.7901 (0.8749) closs: 1.0710 (1.1039) time: 4.0695 data: 0.0002 max mem: 54684 +[03:43:35.270721] Epoch: [0] [2750/3229] lr: 0.000044 grad_norm: 0.7921 (0.8746) closs: 1.0635 (1.1037) time: 4.0694 data: 0.0002 max mem: 54684 +[03:44:16.560153] Epoch: [0] [2760/3229] lr: 0.000044 grad_norm: 0.7653 (0.8742) closs: 1.0744 (1.1036) time: 4.1102 data: 0.0002 max mem: 54684 +[03:44:57.121862] Epoch: [0] [2770/3229] lr: 0.000044 grad_norm: 0.7653 (0.8738) closs: 1.0760 (1.1034) time: 4.0925 data: 0.0002 max mem: 54684 +[03:45:37.257683] Epoch: [0] [2780/3229] lr: 0.000044 grad_norm: 0.7473 (0.8732) closs: 1.0355 (1.1030) time: 4.0348 data: 0.0002 max mem: 54684 +[03:46:17.901019] Epoch: [0] [2790/3229] lr: 0.000044 grad_norm: 0.7473 (0.8729) closs: 1.0248 (1.1028) time: 4.0389 data: 0.0002 max mem: 54684 +[03:46:58.454326] Epoch: [0] [2800/3229] lr: 0.000044 grad_norm: 0.7700 (0.8725) closs: 1.0784 (1.1027) time: 4.0598 data: 0.0002 max mem: 54684 +[03:47:39.337936] Epoch: [0] [2810/3229] lr: 0.000044 grad_norm: 0.7546 (0.8720) closs: 1.0590 (1.1026) time: 4.0718 data: 0.0002 max mem: 54684 +[03:48:20.119515] Epoch: [0] [2820/3229] lr: 0.000044 grad_norm: 0.7763 (0.8717) closs: 1.0590 (1.1024) time: 4.0832 data: 0.0002 max mem: 54684 +[03:49:01.136368] Epoch: [0] [2830/3229] lr: 0.000044 grad_norm: 0.7699 (0.8713) closs: 1.0643 (1.1023) time: 4.0899 data: 0.0002 max mem: 54684 +[03:49:41.768883] Epoch: [0] [2840/3229] lr: 0.000044 grad_norm: 0.7428 (0.8708) closs: 1.0609 (1.1022) time: 4.0824 data: 0.0002 max mem: 54684 +[03:50:22.978721] Epoch: [0] [2850/3229] lr: 0.000044 grad_norm: 0.7529 (0.8704) closs: 1.0452 (1.1021) time: 4.0921 data: 0.0002 max mem: 54684 +[03:51:03.103441] Epoch: [0] [2860/3229] lr: 0.000044 grad_norm: 0.7534 (0.8699) closs: 1.0985 (1.1020) time: 4.0667 data: 0.0002 max mem: 54684 +[03:51:44.422934] Epoch: [0] [2870/3229] lr: 0.000044 grad_norm: 0.7563 (0.8695) closs: 1.0943 (1.1019) time: 4.0721 data: 0.0002 max mem: 54684 +[03:52:25.402059] Epoch: [0] [2880/3229] lr: 0.000044 grad_norm: 0.7777 (0.8691) closs: 1.1125 (1.1020) time: 4.1149 data: 0.0002 max mem: 54684 +[03:53:06.008254] Epoch: [0] [2890/3229] lr: 0.000043 grad_norm: 0.7777 (0.8688) closs: 1.1062 (1.1018) time: 4.0792 data: 0.0002 max mem: 54684 +[03:53:47.142690] Epoch: [0] [2900/3229] lr: 0.000043 grad_norm: 0.7781 (0.8685) closs: 1.0867 (1.1017) time: 4.0870 data: 0.0002 max mem: 54684 +[03:54:28.114121] Epoch: [0] [2910/3229] lr: 0.000043 grad_norm: 0.7930 (0.8682) closs: 1.0739 (1.1015) time: 4.1052 data: 0.0002 max mem: 54684 +[03:55:09.721076] Epoch: [0] [2920/3229] lr: 0.000043 grad_norm: 0.7677 (0.8679) closs: 1.0869 (1.1015) time: 4.1289 data: 0.0002 max mem: 54684 +[03:55:50.967003] Epoch: [0] [2930/3229] lr: 0.000043 grad_norm: 0.8109 (0.8678) closs: 1.0987 (1.1014) time: 4.1426 data: 0.0002 max mem: 54684 +[03:56:31.795882] Epoch: [0] [2940/3229] lr: 0.000043 grad_norm: 0.8109 (0.8675) closs: 1.0884 (1.1015) time: 4.1037 data: 0.0002 max mem: 54684 +[03:57:13.073814] Epoch: [0] [2950/3229] lr: 0.000043 grad_norm: 0.7720 (0.8672) closs: 1.0852 (1.1014) time: 4.1053 data: 0.0002 max mem: 54684 +[03:57:54.032832] Epoch: [0] [2960/3229] lr: 0.000043 grad_norm: 0.7005 (0.8666) closs: 1.0610 (1.1012) time: 4.1118 data: 0.0002 max mem: 54684 +[03:58:34.931320] Epoch: [0] [2970/3229] lr: 0.000043 grad_norm: 0.7079 (0.8663) closs: 1.0292 (1.1009) time: 4.0928 data: 0.0002 max mem: 54684 +[03:59:16.050440] Epoch: [0] [2980/3229] lr: 0.000043 grad_norm: 0.7676 (0.8661) closs: 1.0698 (1.1009) time: 4.1008 data: 0.0002 max mem: 54684 +[03:59:57.343357] Epoch: [0] [2990/3229] lr: 0.000043 grad_norm: 0.7941 (0.8659) closs: 1.0382 (1.1006) time: 4.1205 data: 0.0002 max mem: 54684 +[04:00:38.675134] Epoch: [0] [3000/3229] lr: 0.000043 grad_norm: 0.8105 (0.8657) closs: 1.0089 (1.1003) time: 4.1312 data: 0.0002 max mem: 54684 +[04:01:19.570077] Epoch: [0] [3010/3229] lr: 0.000043 grad_norm: 0.7900 (0.8653) closs: 1.0515 (1.1003) time: 4.1113 data: 0.0002 max mem: 54684 +[04:02:00.671184] Epoch: [0] [3020/3229] lr: 0.000043 grad_norm: 0.7517 (0.8650) closs: 1.0734 (1.1003) time: 4.0997 data: 0.0002 max mem: 54684 +[04:02:40.993797] Epoch: [0] [3030/3229] lr: 0.000043 grad_norm: 0.7689 (0.8646) closs: 1.0596 (1.1000) time: 4.0711 data: 0.0002 max mem: 54684 +[04:03:22.299784] Epoch: [0] [3040/3229] lr: 0.000043 grad_norm: 0.7714 (0.8643) closs: 1.0596 (1.0998) time: 4.0814 data: 0.0002 max mem: 54684 +[04:04:03.204281] Epoch: [0] [3050/3229] lr: 0.000043 grad_norm: 0.7578 (0.8639) closs: 1.0455 (1.0997) time: 4.1105 data: 0.0002 max mem: 54684 +[04:04:44.303035] Epoch: [0] [3060/3229] lr: 0.000043 grad_norm: 0.7642 (0.8637) closs: 1.0497 (1.0996) time: 4.1001 data: 0.0002 max mem: 54684 +[04:05:25.256193] Epoch: [0] [3070/3229] lr: 0.000042 grad_norm: 0.7642 (0.8633) closs: 1.0778 (1.0996) time: 4.1025 data: 0.0002 max mem: 54684 +[04:06:05.981901] Epoch: [0] [3080/3229] lr: 0.000042 grad_norm: 0.7319 (0.8628) closs: 1.0596 (1.0994) time: 4.0839 data: 0.0002 max mem: 54684 +[04:06:46.434233] Epoch: [0] [3090/3229] lr: 0.000042 grad_norm: 0.7442 (0.8624) closs: 1.0718 (1.0993) time: 4.0588 data: 0.0002 max mem: 54684 +[04:07:27.009193] Epoch: [0] [3100/3229] lr: 0.000042 grad_norm: 0.7568 (0.8619) closs: 1.1194 (1.0992) time: 4.0513 data: 0.0002 max mem: 54684 +[04:08:07.508250] Epoch: [0] [3110/3229] lr: 0.000042 grad_norm: 0.7233 (0.8615) closs: 1.0551 (1.0991) time: 4.0536 data: 0.0002 max mem: 54684 +[04:08:48.595599] Epoch: [0] [3120/3229] lr: 0.000042 grad_norm: 0.7674 (0.8612) closs: 1.0551 (1.0989) time: 4.0793 data: 0.0002 max mem: 54684 +[04:09:28.728428] Epoch: [0] [3130/3229] lr: 0.000042 grad_norm: 0.7612 (0.8607) closs: 1.0629 (1.0987) time: 4.0609 data: 0.0002 max mem: 54684 +[04:10:08.523858] Epoch: [0] [3140/3229] lr: 0.000042 grad_norm: 0.6953 (0.8602) closs: 1.0381 (1.0985) time: 3.9964 data: 0.0002 max mem: 54684 +[04:10:49.845722] Epoch: [0] [3150/3229] lr: 0.000042 grad_norm: 0.7319 (0.8599) closs: 1.0381 (1.0984) time: 4.0558 data: 0.0002 max mem: 54684 +[04:11:30.298486] Epoch: [0] [3160/3229] lr: 0.000042 grad_norm: 0.7591 (0.8595) closs: 1.0370 (1.0981) time: 4.0887 data: 0.0002 max mem: 54684 +[04:12:11.068402] Epoch: [0] [3170/3229] lr: 0.000042 grad_norm: 0.7661 (0.8592) closs: 1.0615 (1.0980) time: 4.0611 data: 0.0002 max mem: 54684 +[04:12:51.939751] Epoch: [0] [3180/3229] lr: 0.000042 grad_norm: 0.8030 (0.8590) closs: 1.0806 (1.0979) time: 4.0820 data: 0.0002 max mem: 54684 +[04:13:32.816037] Epoch: [0] [3190/3229] lr: 0.000042 grad_norm: 0.7835 (0.8587) closs: 1.0738 (1.0978) time: 4.0873 data: 0.0002 max mem: 54684 +[04:14:14.251177] Epoch: [0] [3200/3229] lr: 0.000042 grad_norm: 0.7516 (0.8583) closs: 1.0760 (1.0977) time: 4.1155 data: 0.0003 max mem: 54684 +[04:14:55.041953] Epoch: [0] [3210/3229] lr: 0.000042 grad_norm: 0.7438 (0.8580) closs: 1.0863 (1.0976) time: 4.1112 data: 0.0003 max mem: 54684 +[04:15:36.390692] Epoch: [0] [3220/3229] lr: 0.000042 grad_norm: 0.7658 (0.8579) closs: 1.0767 (1.0975) time: 4.1069 data: 0.0001 max mem: 54684 +[04:16:09.561812] Epoch: [0] Total time: 3:40:03 +[04:16:09.562785] Averaged stats: lr: 0.000042 grad_norm: 0.7787 (0.8577) closs: 1.0483 (1.0961) +/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[04:16:09.927151] model saved +/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[04:16:11.631099] optimizer saved +[04:16:11.631728] other rank-common saved +[04:16:11.636736] rank-specific saved +[04:16:11.651089] log_dir: ./output_dir +[04:16:23.993611] Epoch: [1] [0/3229] lr: 0.000042 grad_norm: 0.8670 (0.8670) closs: 1.0075 (1.0075) time: 12.3415 data: 7.9810 max mem: 54684 +[04:17:06.440076] Epoch: [1] [10/3229] lr: 0.000041 grad_norm: 0.7389 (0.8121) closs: 1.0826 (1.0946) time: 4.9807 data: 0.7257 max mem: 54684 +[04:17:46.869458] Epoch: [1] [20/3229] lr: 0.000041 grad_norm: 0.7389 (0.7906) closs: 1.0572 (1.0652) time: 4.1437 data: 0.0002 max mem: 54684 +[04:18:28.213204] Epoch: [1] [30/3229] lr: 0.000041 grad_norm: 0.7856 (0.7980) closs: 1.0665 (1.0783) time: 4.0886 data: 0.0002 max mem: 54684 +[04:19:09.261576] Epoch: [1] [40/3229] lr: 0.000041 grad_norm: 0.7665 (0.7884) closs: 1.0941 (1.0814) time: 4.1195 data: 0.0002 max mem: 54684 +[04:19:50.348250] Epoch: [1] [50/3229] lr: 0.000041 grad_norm: 0.7463 (0.7844) closs: 1.0827 (1.0739) time: 4.1067 data: 0.0002 max mem: 54684 +[04:20:30.454474] Epoch: [1] [60/3229] lr: 0.000041 grad_norm: 0.7801 (0.7795) closs: 1.0234 (1.0633) time: 4.0596 data: 0.0002 max mem: 54684 +[04:21:11.357215] Epoch: [1] [70/3229] lr: 0.000041 grad_norm: 0.8025 (0.7812) closs: 1.0314 (1.0602) time: 4.0504 data: 0.0002 max mem: 54684 +[04:21:51.356188] Epoch: [1] [80/3229] lr: 0.000041 grad_norm: 0.7522 (0.7691) closs: 1.0226 (1.0522) time: 4.0450 data: 0.0002 max mem: 54684 +[04:22:33.043953] Epoch: [1] [90/3229] lr: 0.000041 grad_norm: 0.7018 (0.7681) closs: 1.0252 (1.0538) time: 4.0843 data: 0.0002 max mem: 54684 +[04:23:13.831663] Epoch: [1] [100/3229] lr: 0.000041 grad_norm: 0.7546 (0.7674) closs: 1.0535 (1.0529) time: 4.1237 data: 0.0002 max mem: 54684 +[04:23:55.181411] Epoch: [1] [110/3229] lr: 0.000041 grad_norm: 0.7521 (0.7679) closs: 1.0461 (1.0515) time: 4.1068 data: 0.0002 max mem: 54684 +[04:24:35.870155] Epoch: [1] [120/3229] lr: 0.000041 grad_norm: 0.7521 (0.7650) closs: 1.0400 (1.0525) time: 4.1019 data: 0.0002 max mem: 54684 +[04:25:17.325692] Epoch: [1] [130/3229] lr: 0.000041 grad_norm: 0.7635 (0.7660) closs: 1.0768 (1.0555) time: 4.1071 data: 0.0002 max mem: 54684 +[04:25:58.111171] Epoch: [1] [140/3229] lr: 0.000041 grad_norm: 0.7635 (0.7666) closs: 1.0801 (1.0569) time: 4.1120 data: 0.0002 max mem: 54684 +[04:26:38.709088] Epoch: [1] [150/3229] lr: 0.000041 grad_norm: 0.7391 (0.7641) closs: 1.0635 (1.0556) time: 4.0691 data: 0.0002 max mem: 54684 +[04:27:19.437075] Epoch: [1] [160/3229] lr: 0.000041 grad_norm: 0.7284 (0.7647) closs: 1.0512 (1.0563) time: 4.0662 data: 0.0002 max mem: 54684 +[04:28:00.075659] Epoch: [1] [170/3229] lr: 0.000040 grad_norm: 0.7635 (0.7653) closs: 1.0589 (1.0533) time: 4.0683 data: 0.0002 max mem: 54684 +[04:28:40.520775] Epoch: [1] [180/3229] lr: 0.000040 grad_norm: 0.7911 (0.7696) closs: 1.0460 (1.0532) time: 4.0541 data: 0.0002 max mem: 54684 +[04:29:21.220750] Epoch: [1] [190/3229] lr: 0.000040 grad_norm: 0.7859 (0.7694) closs: 1.0297 (1.0513) time: 4.0572 data: 0.0002 max mem: 54684 +[04:30:02.479262] Epoch: [1] [200/3229] lr: 0.000040 grad_norm: 0.7721 (0.7706) closs: 1.0362 (1.0522) time: 4.0979 data: 0.0002 max mem: 54684 +[04:30:43.921995] Epoch: [1] [210/3229] lr: 0.000040 grad_norm: 0.8007 (0.7734) closs: 1.0904 (1.0546) time: 4.1350 data: 0.0002 max mem: 54684 +[04:31:24.392183] Epoch: [1] [220/3229] lr: 0.000040 grad_norm: 0.8126 (0.7731) closs: 1.0870 (1.0548) time: 4.0956 data: 0.0002 max mem: 54684 +[04:32:04.710459] Epoch: [1] [230/3229] lr: 0.000040 grad_norm: 0.7601 (0.7722) closs: 1.0787 (1.0528) time: 4.0394 data: 0.0002 max mem: 54684 +[04:32:45.662972] Epoch: [1] [240/3229] lr: 0.000040 grad_norm: 0.7548 (0.7715) closs: 1.0161 (1.0511) time: 4.0635 data: 0.0002 max mem: 54684 +[04:33:27.043779] Epoch: [1] [250/3229] lr: 0.000040 grad_norm: 0.7622 (0.7718) closs: 1.0228 (1.0510) time: 4.1166 data: 0.0002 max mem: 54684 +[04:34:08.159916] Epoch: [1] [260/3229] lr: 0.000040 grad_norm: 0.7596 (0.7716) closs: 1.0571 (1.0517) time: 4.1248 data: 0.0002 max mem: 54684 +[04:34:49.118869] Epoch: [1] [270/3229] lr: 0.000040 grad_norm: 0.7411 (0.7711) closs: 1.0854 (1.0523) time: 4.1037 data: 0.0002 max mem: 54684 +[04:35:29.743500] Epoch: [1] [280/3229] lr: 0.000040 grad_norm: 0.7411 (0.7708) closs: 1.0320 (1.0520) time: 4.0791 data: 0.0002 max mem: 54684 +[04:36:10.896285] Epoch: [1] [290/3229] lr: 0.000040 grad_norm: 0.7698 (0.7711) closs: 1.0320 (1.0519) time: 4.0888 data: 0.0002 max mem: 54684 +[04:36:50.693561] Epoch: [1] [300/3229] lr: 0.000040 grad_norm: 0.7667 (0.7693) closs: 1.0176 (1.0507) time: 4.0474 data: 0.0002 max mem: 54684 +[04:37:31.371110] Epoch: [1] [310/3229] lr: 0.000040 grad_norm: 0.7233 (0.7689) closs: 1.0032 (1.0496) time: 4.0237 data: 0.0002 max mem: 54684 +[04:38:12.621107] Epoch: [1] [320/3229] lr: 0.000039 grad_norm: 0.7637 (0.7696) closs: 1.0102 (1.0494) time: 4.0963 data: 0.0002 max mem: 54684 +[04:38:53.280696] Epoch: [1] [330/3229] lr: 0.000039 grad_norm: 0.7538 (0.7692) closs: 1.0404 (1.0488) time: 4.0954 data: 0.0002 max mem: 54684 +[04:39:34.384153] Epoch: [1] [340/3229] lr: 0.000039 grad_norm: 0.7223 (0.7678) closs: 1.0712 (1.0507) time: 4.0881 data: 0.0002 max mem: 54684 +[04:40:14.745485] Epoch: [1] [350/3229] lr: 0.000039 grad_norm: 0.7186 (0.7656) closs: 1.0880 (1.0494) time: 4.0732 data: 0.0002 max mem: 54684 +[04:40:55.725764] Epoch: [1] [360/3229] lr: 0.000039 grad_norm: 0.7281 (0.7657) closs: 1.0522 (1.0491) time: 4.0670 data: 0.0002 max mem: 54684 +[04:41:37.083851] Epoch: [1] [370/3229] lr: 0.000039 grad_norm: 0.7441 (0.7654) closs: 1.0619 (1.0497) time: 4.1169 data: 0.0002 max mem: 54684 +[04:42:18.173848] Epoch: [1] [380/3229] lr: 0.000039 grad_norm: 0.7505 (0.7652) closs: 1.0880 (1.0510) time: 4.1223 data: 0.0002 max mem: 54684 +[04:42:58.469739] Epoch: [1] [390/3229] lr: 0.000039 grad_norm: 0.7717 (0.7642) closs: 1.0462 (1.0496) time: 4.0692 data: 0.0002 max mem: 54684 +[04:43:39.241200] Epoch: [1] [400/3229] lr: 0.000039 grad_norm: 0.7749 (0.7649) closs: 0.9971 (1.0484) time: 4.0533 data: 0.0002 max mem: 54684 +[04:44:20.120315] Epoch: [1] [410/3229] lr: 0.000039 grad_norm: 0.8080 (0.7657) closs: 1.0498 (1.0489) time: 4.0825 data: 0.0002 max mem: 54684 +[04:45:00.557566] Epoch: [1] [420/3229] lr: 0.000039 grad_norm: 0.7665 (0.7651) closs: 1.0759 (1.0488) time: 4.0658 data: 0.0002 max mem: 54684 +[04:45:41.828412] Epoch: [1] [430/3229] lr: 0.000039 grad_norm: 0.7265 (0.7650) closs: 1.0759 (1.0498) time: 4.0853 data: 0.0002 max mem: 54684 +[04:46:22.509399] Epoch: [1] [440/3229] lr: 0.000039 grad_norm: 0.7462 (0.7646) closs: 1.0742 (1.0497) time: 4.0975 data: 0.0002 max mem: 54684 +[04:47:03.187153] Epoch: [1] [450/3229] lr: 0.000039 grad_norm: 0.7618 (0.7645) closs: 1.0617 (1.0497) time: 4.0679 data: 0.0002 max mem: 54684 +[04:47:44.104151] Epoch: [1] [460/3229] lr: 0.000039 grad_norm: 0.7541 (0.7639) closs: 1.0582 (1.0501) time: 4.0797 data: 0.0002 max mem: 54684 +[04:48:24.641461] Epoch: [1] [470/3229] lr: 0.000038 grad_norm: 0.7528 (0.7635) closs: 1.0582 (1.0494) time: 4.0727 data: 0.0002 max mem: 54684 +[04:49:05.751178] Epoch: [1] [480/3229] lr: 0.000038 grad_norm: 0.7481 (0.7628) closs: 1.0947 (1.0506) time: 4.0823 data: 0.0002 max mem: 54684 +[04:49:46.537424] Epoch: [1] [490/3229] lr: 0.000038 grad_norm: 0.7552 (0.7629) closs: 1.0837 (1.0503) time: 4.0947 data: 0.0002 max mem: 54684 +[04:50:27.314255] Epoch: [1] [500/3229] lr: 0.000038 grad_norm: 0.7712 (0.7634) closs: 1.0294 (1.0500) time: 4.0781 data: 0.0002 max mem: 54684 +[04:51:08.310823] Epoch: [1] [510/3229] lr: 0.000038 grad_norm: 0.8222 (0.7643) closs: 1.0444 (1.0500) time: 4.0886 data: 0.0002 max mem: 54684 +[04:51:49.708115] Epoch: [1] [520/3229] lr: 0.000038 grad_norm: 0.7931 (0.7644) closs: 1.0786 (1.0511) time: 4.1196 data: 0.0002 max mem: 54684 +[04:52:30.929265] Epoch: [1] [530/3229] lr: 0.000038 grad_norm: 0.7668 (0.7648) closs: 1.0823 (1.0516) time: 4.1309 data: 0.0002 max mem: 54684 +[04:53:11.799214] Epoch: [1] [540/3229] lr: 0.000038 grad_norm: 0.7484 (0.7644) closs: 1.0851 (1.0518) time: 4.1045 data: 0.0002 max mem: 54684 +[04:53:52.617648] Epoch: [1] [550/3229] lr: 0.000038 grad_norm: 0.7576 (0.7642) closs: 1.0762 (1.0517) time: 4.0844 data: 0.0002 max mem: 54684 +[04:54:33.511754] Epoch: [1] [560/3229] lr: 0.000038 grad_norm: 0.7606 (0.7637) closs: 1.0481 (1.0516) time: 4.0856 data: 0.0002 max mem: 54684 +[04:55:14.276726] Epoch: [1] [570/3229] lr: 0.000038 grad_norm: 0.7616 (0.7649) closs: 1.0444 (1.0516) time: 4.0829 data: 0.0002 max mem: 54684 +[04:55:54.841137] Epoch: [1] [580/3229] lr: 0.000038 grad_norm: 0.7616 (0.7646) closs: 1.0423 (1.0511) time: 4.0664 data: 0.0002 max mem: 54684 +[04:56:35.646680] Epoch: [1] [590/3229] lr: 0.000038 grad_norm: 0.7542 (0.7649) closs: 1.0440 (1.0512) time: 4.0684 data: 0.0002 max mem: 54684 +[04:57:16.334209] Epoch: [1] [600/3229] lr: 0.000038 grad_norm: 0.7437 (0.7638) closs: 1.0745 (1.0512) time: 4.0746 data: 0.0002 max mem: 54684 +[04:57:57.448181] Epoch: [1] [610/3229] lr: 0.000038 grad_norm: 0.7439 (0.7642) closs: 1.0596 (1.0512) time: 4.0900 data: 0.0002 max mem: 54684 +[04:58:38.714028] Epoch: [1] [620/3229] lr: 0.000037 grad_norm: 0.7602 (0.7641) closs: 1.0594 (1.0513) time: 4.1189 data: 0.0002 max mem: 54684 +[04:59:19.266620] Epoch: [1] [630/3229] lr: 0.000037 grad_norm: 0.7325 (0.7639) closs: 1.0401 (1.0502) time: 4.0909 data: 0.0002 max mem: 54684 +[05:00:00.368157] Epoch: [1] [640/3229] lr: 0.000037 grad_norm: 0.7513 (0.7638) closs: 1.0248 (1.0503) time: 4.0826 data: 0.0002 max mem: 54684 +[05:00:41.160239] Epoch: [1] [650/3229] lr: 0.000037 grad_norm: 0.7513 (0.7635) closs: 1.0358 (1.0502) time: 4.0946 data: 0.0002 max mem: 54684 +[05:01:22.138831] Epoch: [1] [660/3229] lr: 0.000037 grad_norm: 0.7456 (0.7637) closs: 1.0590 (1.0504) time: 4.0885 data: 0.0002 max mem: 54684 +[05:02:02.946232] Epoch: [1] [670/3229] lr: 0.000037 grad_norm: 0.7457 (0.7634) closs: 1.0762 (1.0507) time: 4.0892 data: 0.0002 max mem: 54684 +[05:02:44.502278] Epoch: [1] [680/3229] lr: 0.000037 grad_norm: 0.7440 (0.7634) closs: 1.0678 (1.0509) time: 4.1181 data: 0.0002 max mem: 54684 +[05:03:24.951117] Epoch: [1] [690/3229] lr: 0.000037 grad_norm: 0.7456 (0.7630) closs: 1.0585 (1.0513) time: 4.1002 data: 0.0002 max mem: 54684 +[05:04:05.578068] Epoch: [1] [700/3229] lr: 0.000037 grad_norm: 0.7712 (0.7629) closs: 1.0648 (1.0509) time: 4.0537 data: 0.0002 max mem: 54684 +[05:04:46.382125] Epoch: [1] [710/3229] lr: 0.000037 grad_norm: 0.7536 (0.7623) closs: 1.0831 (1.0508) time: 4.0715 data: 0.0002 max mem: 54684 +[05:05:27.836191] Epoch: [1] [720/3229] lr: 0.000037 grad_norm: 0.7533 (0.7623) closs: 1.0761 (1.0516) time: 4.1128 data: 0.0002 max mem: 54684 +[05:06:08.635002] Epoch: [1] [730/3229] lr: 0.000037 grad_norm: 0.7805 (0.7629) closs: 1.0763 (1.0519) time: 4.1126 data: 0.0002 max mem: 54684 +[05:06:48.634490] Epoch: [1] [740/3229] lr: 0.000037 grad_norm: 0.7186 (0.7620) closs: 1.0197 (1.0513) time: 4.0398 data: 0.0002 max mem: 54684 +[05:07:29.445391] Epoch: [1] [750/3229] lr: 0.000037 grad_norm: 0.7025 (0.7616) closs: 1.0359 (1.0516) time: 4.0405 data: 0.0002 max mem: 54684 +[05:08:11.035741] Epoch: [1] [760/3229] lr: 0.000036 grad_norm: 0.7647 (0.7623) closs: 1.0791 (1.0519) time: 4.1200 data: 0.0002 max mem: 54684 +[05:08:51.808203] Epoch: [1] [770/3229] lr: 0.000036 grad_norm: 0.8157 (0.7630) closs: 1.0874 (1.0524) time: 4.1181 data: 0.0002 max mem: 54684 +[05:09:32.821381] Epoch: [1] [780/3229] lr: 0.000036 grad_norm: 0.7640 (0.7628) closs: 1.0954 (1.0530) time: 4.0892 data: 0.0002 max mem: 54684 +[05:10:13.634195] Epoch: [1] [790/3229] lr: 0.000036 grad_norm: 0.7289 (0.7624) closs: 1.0748 (1.0527) time: 4.0912 data: 0.0002 max mem: 54684 +[05:10:55.053924] Epoch: [1] [800/3229] lr: 0.000036 grad_norm: 0.7343 (0.7624) closs: 1.0321 (1.0528) time: 4.1116 data: 0.0002 max mem: 54684 +[05:11:35.196793] Epoch: [1] [810/3229] lr: 0.000036 grad_norm: 0.7367 (0.7620) closs: 1.0322 (1.0524) time: 4.0781 data: 0.0002 max mem: 54684 +[05:12:16.522962] Epoch: [1] [820/3229] lr: 0.000036 grad_norm: 0.7401 (0.7621) closs: 1.0246 (1.0522) time: 4.0734 data: 0.0002 max mem: 54684 +[05:12:57.137015] Epoch: [1] [830/3229] lr: 0.000036 grad_norm: 0.7595 (0.7617) closs: 1.0450 (1.0524) time: 4.0969 data: 0.0002 max mem: 54684 +[05:13:38.237630] Epoch: [1] [840/3229] lr: 0.000036 grad_norm: 0.7652 (0.7616) closs: 1.0833 (1.0530) time: 4.0857 data: 0.0002 max mem: 54684 +[05:14:19.021316] Epoch: [1] [850/3229] lr: 0.000036 grad_norm: 0.7652 (0.7614) closs: 1.0680 (1.0532) time: 4.0941 data: 0.0002 max mem: 54684 +[05:14:59.987219] Epoch: [1] [860/3229] lr: 0.000036 grad_norm: 0.7362 (0.7610) closs: 1.0479 (1.0534) time: 4.0874 data: 0.0002 max mem: 54684 +[05:15:40.669534] Epoch: [1] [870/3229] lr: 0.000036 grad_norm: 0.7348 (0.7608) closs: 1.0472 (1.0530) time: 4.0823 data: 0.0002 max mem: 54684 +[05:16:21.345203] Epoch: [1] [880/3229] lr: 0.000036 grad_norm: 0.7769 (0.7607) closs: 1.0219 (1.0529) time: 4.0678 data: 0.0002 max mem: 54684 +[05:17:02.174965] Epoch: [1] [890/3229] lr: 0.000036 grad_norm: 0.7393 (0.7603) closs: 1.0763 (1.0534) time: 4.0752 data: 0.0002 max mem: 54684 +[05:17:43.153839] Epoch: [1] [900/3229] lr: 0.000035 grad_norm: 0.7229 (0.7600) closs: 1.0763 (1.0535) time: 4.0904 data: 0.0002 max mem: 54684 +[05:18:23.532375] Epoch: [1] [910/3229] lr: 0.000035 grad_norm: 0.7346 (0.7597) closs: 1.0722 (1.0536) time: 4.0678 data: 0.0002 max mem: 54684 +[05:19:04.844983] Epoch: [1] [920/3229] lr: 0.000035 grad_norm: 0.7679 (0.7600) closs: 1.0613 (1.0536) time: 4.0845 data: 0.0002 max mem: 54684 +[05:19:45.960496] Epoch: [1] [930/3229] lr: 0.000035 grad_norm: 0.7816 (0.7600) closs: 1.0613 (1.0537) time: 4.1213 data: 0.0002 max mem: 54684 +[05:20:26.646864] Epoch: [1] [940/3229] lr: 0.000035 grad_norm: 0.7816 (0.7601) closs: 1.0160 (1.0532) time: 4.0900 data: 0.0002 max mem: 54684 +[05:21:07.563052] Epoch: [1] [950/3229] lr: 0.000035 grad_norm: 0.7621 (0.7605) closs: 0.9814 (1.0528) time: 4.0801 data: 0.0002 max mem: 54684 +[05:21:48.451416] Epoch: [1] [960/3229] lr: 0.000035 grad_norm: 0.7508 (0.7605) closs: 0.9968 (1.0529) time: 4.0902 data: 0.0002 max mem: 54684 +[05:22:29.552875] Epoch: [1] [970/3229] lr: 0.000035 grad_norm: 0.7519 (0.7605) closs: 1.0570 (1.0530) time: 4.0994 data: 0.0002 max mem: 54684 +[05:23:10.530952] Epoch: [1] [980/3229] lr: 0.000035 grad_norm: 0.7625 (0.7606) closs: 1.0570 (1.0532) time: 4.1039 data: 0.0002 max mem: 54684 +[05:23:51.878010] Epoch: [1] [990/3229] lr: 0.000035 grad_norm: 0.7625 (0.7608) closs: 1.0764 (1.0534) time: 4.1162 data: 0.0002 max mem: 54684 +[05:24:31.849857] Epoch: [1] [1000/3229] lr: 0.000035 grad_norm: 0.7446 (0.7599) closs: 1.0764 (1.0532) time: 4.0659 data: 0.0002 max mem: 54684 +[05:25:12.639207] Epoch: [1] [1010/3229] lr: 0.000035 grad_norm: 0.7020 (0.7597) closs: 1.0242 (1.0529) time: 4.0380 data: 0.0002 max mem: 54684 +[05:25:52.977496] Epoch: [1] [1020/3229] lr: 0.000035 grad_norm: 0.7052 (0.7594) closs: 1.0555 (1.0527) time: 4.0563 data: 0.0002 max mem: 54684 +[05:26:34.003363] Epoch: [1] [1030/3229] lr: 0.000034 grad_norm: 0.7147 (0.7588) closs: 1.0503 (1.0527) time: 4.0681 data: 0.0002 max mem: 54684 +[05:27:14.573525] Epoch: [1] [1040/3229] lr: 0.000034 grad_norm: 0.7147 (0.7586) closs: 1.0450 (1.0524) time: 4.0797 data: 0.0002 max mem: 54684 +[05:27:55.498440] Epoch: [1] [1050/3229] lr: 0.000034 grad_norm: 0.7522 (0.7589) closs: 1.0486 (1.0523) time: 4.0747 data: 0.0002 max mem: 54684 +[05:28:36.363003] Epoch: [1] [1060/3229] lr: 0.000034 grad_norm: 0.7522 (0.7588) closs: 1.0553 (1.0523) time: 4.0894 data: 0.0002 max mem: 54684 +[05:29:17.566434] Epoch: [1] [1070/3229] lr: 0.000034 grad_norm: 0.7544 (0.7588) closs: 1.0536 (1.0523) time: 4.1033 data: 0.0002 max mem: 54684 +[05:29:57.687127] Epoch: [1] [1080/3229] lr: 0.000034 grad_norm: 0.7544 (0.7586) closs: 1.0454 (1.0519) time: 4.0661 data: 0.0002 max mem: 54684 +[05:30:38.261585] Epoch: [1] [1090/3229] lr: 0.000034 grad_norm: 0.7317 (0.7584) closs: 1.0487 (1.0519) time: 4.0347 data: 0.0002 max mem: 54684 +[05:31:19.146600] Epoch: [1] [1100/3229] lr: 0.000034 grad_norm: 0.7461 (0.7585) closs: 1.0237 (1.0518) time: 4.0729 data: 0.0002 max mem: 54684 +[05:32:00.230890] Epoch: [1] [1110/3229] lr: 0.000034 grad_norm: 0.7654 (0.7586) closs: 1.0237 (1.0518) time: 4.0984 data: 0.0002 max mem: 54684 +[05:32:40.323167] Epoch: [1] [1120/3229] lr: 0.000034 grad_norm: 0.7725 (0.7589) closs: 1.0261 (1.0517) time: 4.0588 data: 0.0002 max mem: 54684 +[05:33:21.572530] Epoch: [1] [1130/3229] lr: 0.000034 grad_norm: 0.7690 (0.7592) closs: 1.0308 (1.0517) time: 4.0670 data: 0.0002 max mem: 54684 +[05:34:02.762302] Epoch: [1] [1140/3229] lr: 0.000034 grad_norm: 0.7708 (0.7595) closs: 1.0746 (1.0522) time: 4.1219 data: 0.0002 max mem: 54684 +[05:34:43.703986] Epoch: [1] [1150/3229] lr: 0.000034 grad_norm: 0.7656 (0.7593) closs: 1.0766 (1.0521) time: 4.1065 data: 0.0003 max mem: 54684 +[05:35:24.484939] Epoch: [1] [1160/3229] lr: 0.000034 grad_norm: 0.7151 (0.7591) closs: 1.0380 (1.0517) time: 4.0861 data: 0.0003 max mem: 54684 +[05:36:05.401846] Epoch: [1] [1170/3229] lr: 0.000033 grad_norm: 0.7522 (0.7591) closs: 1.0499 (1.0519) time: 4.0848 data: 0.0002 max mem: 54684 +[05:36:45.957264] Epoch: [1] [1180/3229] lr: 0.000033 grad_norm: 0.7522 (0.7588) closs: 1.0587 (1.0517) time: 4.0735 data: 0.0002 max mem: 54684 +[05:37:27.507925] Epoch: [1] [1190/3229] lr: 0.000033 grad_norm: 0.7522 (0.7589) closs: 1.0445 (1.0518) time: 4.1052 data: 0.0002 max mem: 54684 +[05:38:08.323255] Epoch: [1] [1200/3229] lr: 0.000033 grad_norm: 0.8299 (0.7594) closs: 1.0666 (1.0517) time: 4.1182 data: 0.0002 max mem: 54684 +[05:38:49.657429] Epoch: [1] [1210/3229] lr: 0.000033 grad_norm: 0.8322 (0.7596) closs: 1.0805 (1.0520) time: 4.1074 data: 0.0002 max mem: 54684 +[05:39:30.796365] Epoch: [1] [1220/3229] lr: 0.000033 grad_norm: 0.7634 (0.7597) closs: 1.0942 (1.0524) time: 4.1236 data: 0.0002 max mem: 54684 +[05:40:12.472149] Epoch: [1] [1230/3229] lr: 0.000033 grad_norm: 0.7564 (0.7597) closs: 1.0754 (1.0523) time: 4.1407 data: 0.0002 max mem: 54684 +[05:40:53.596788] Epoch: [1] [1240/3229] lr: 0.000033 grad_norm: 0.7499 (0.7610) closs: 1.0346 (1.0525) time: 4.1400 data: 0.0002 max mem: 54684 +[05:41:34.343746] Epoch: [1] [1250/3229] lr: 0.000033 grad_norm: 0.7309 (0.7605) closs: 1.0346 (1.0524) time: 4.0935 data: 0.0002 max mem: 54684 +[05:42:14.850396] Epoch: [1] [1260/3229] lr: 0.000033 grad_norm: 0.7357 (0.7604) closs: 1.0046 (1.0521) time: 4.0626 data: 0.0002 max mem: 54684 +[05:42:55.902233] Epoch: [1] [1270/3229] lr: 0.000033 grad_norm: 0.7459 (0.7603) closs: 0.9939 (1.0517) time: 4.0779 data: 0.0002 max mem: 54684 +[05:43:36.700285] Epoch: [1] [1280/3229] lr: 0.000033 grad_norm: 0.7424 (0.7602) closs: 1.0316 (1.0518) time: 4.0924 data: 0.0002 max mem: 54684 +[05:44:18.029631] Epoch: [1] [1290/3229] lr: 0.000033 grad_norm: 0.7584 (0.7604) closs: 1.0437 (1.0518) time: 4.1063 data: 0.0002 max mem: 54684 +[05:44:58.460022] Epoch: [1] [1300/3229] lr: 0.000032 grad_norm: 0.7471 (0.7602) closs: 1.0371 (1.0516) time: 4.0879 data: 0.0002 max mem: 54684 +[05:45:39.434680] Epoch: [1] [1310/3229] lr: 0.000032 grad_norm: 0.7390 (0.7601) closs: 1.0832 (1.0518) time: 4.0702 data: 0.0002 max mem: 54684 +[05:46:19.533138] Epoch: [1] [1320/3229] lr: 0.000032 grad_norm: 0.7437 (0.7601) closs: 1.0843 (1.0517) time: 4.0536 data: 0.0002 max mem: 54684 +[05:46:59.861164] Epoch: [1] [1330/3229] lr: 0.000032 grad_norm: 0.7416 (0.7597) closs: 1.0420 (1.0514) time: 4.0213 data: 0.0002 max mem: 54684 +[05:47:39.762335] Epoch: [1] [1340/3229] lr: 0.000032 grad_norm: 0.7236 (0.7592) closs: 1.0353 (1.0512) time: 4.0114 data: 0.0002 max mem: 54684 +[05:48:21.193611] Epoch: [1] [1350/3229] lr: 0.000032 grad_norm: 0.7413 (0.7594) closs: 1.0614 (1.0513) time: 4.0666 data: 0.0002 max mem: 54684 +[05:49:01.979501] Epoch: [1] [1360/3229] lr: 0.000032 grad_norm: 0.7393 (0.7591) closs: 1.0725 (1.0512) time: 4.1108 data: 0.0002 max mem: 54684 +[05:49:42.964934] Epoch: [1] [1370/3229] lr: 0.000032 grad_norm: 0.7161 (0.7588) closs: 1.0531 (1.0512) time: 4.0885 data: 0.0002 max mem: 54684 +[05:50:23.751298] Epoch: [1] [1380/3229] lr: 0.000032 grad_norm: 0.7334 (0.7589) closs: 1.0633 (1.0516) time: 4.0885 data: 0.0002 max mem: 54684 +[05:51:05.208897] Epoch: [1] [1390/3229] lr: 0.000032 grad_norm: 0.7776 (0.7591) closs: 1.0852 (1.0519) time: 4.1121 data: 0.0002 max mem: 54684 +[05:51:46.307285] Epoch: [1] [1400/3229] lr: 0.000032 grad_norm: 0.7727 (0.7591) closs: 1.0727 (1.0521) time: 4.1277 data: 0.0002 max mem: 54684 +[05:52:27.639855] Epoch: [1] [1410/3229] lr: 0.000032 grad_norm: 0.7589 (0.7592) closs: 1.0593 (1.0520) time: 4.1215 data: 0.0002 max mem: 54684 +[05:53:08.510464] Epoch: [1] [1420/3229] lr: 0.000032 grad_norm: 0.7454 (0.7591) closs: 1.0521 (1.0518) time: 4.1101 data: 0.0002 max mem: 54684 +[05:53:49.534800] Epoch: [1] [1430/3229] lr: 0.000031 grad_norm: 0.7239 (0.7589) closs: 1.0510 (1.0515) time: 4.0947 data: 0.0002 max mem: 54684 +[05:54:30.634251] Epoch: [1] [1440/3229] lr: 0.000031 grad_norm: 0.7376 (0.7591) closs: 1.0525 (1.0516) time: 4.1061 data: 0.0002 max mem: 54684 +[05:55:11.319746] Epoch: [1] [1450/3229] lr: 0.000031 grad_norm: 0.7381 (0.7590) closs: 1.0587 (1.0514) time: 4.0892 data: 0.0002 max mem: 54684 +[05:55:52.498426] Epoch: [1] [1460/3229] lr: 0.000031 grad_norm: 0.7446 (0.7591) closs: 1.0462 (1.0513) time: 4.0931 data: 0.0002 max mem: 54684 +[05:56:33.716084] Epoch: [1] [1470/3229] lr: 0.000031 grad_norm: 0.7441 (0.7589) closs: 1.0490 (1.0514) time: 4.1198 data: 0.0002 max mem: 54684 +[05:57:14.183953] Epoch: [1] [1480/3229] lr: 0.000031 grad_norm: 0.7353 (0.7586) closs: 1.0471 (1.0514) time: 4.0842 data: 0.0002 max mem: 54684 +[05:57:55.151729] Epoch: [1] [1490/3229] lr: 0.000031 grad_norm: 0.7442 (0.7585) closs: 1.0415 (1.0514) time: 4.0717 data: 0.0002 max mem: 54684 +[05:58:36.455544] Epoch: [1] [1500/3229] lr: 0.000031 grad_norm: 0.7452 (0.7585) closs: 1.0665 (1.0515) time: 4.1135 data: 0.0002 max mem: 54684 +[05:59:17.089096] Epoch: [1] [1510/3229] lr: 0.000031 grad_norm: 0.7419 (0.7581) closs: 1.0889 (1.0517) time: 4.0968 data: 0.0002 max mem: 54684 +[05:59:57.877708] Epoch: [1] [1520/3229] lr: 0.000031 grad_norm: 0.7463 (0.7583) closs: 1.0725 (1.0517) time: 4.0710 data: 0.0002 max mem: 54684 +[06:00:39.181341] Epoch: [1] [1530/3229] lr: 0.000031 grad_norm: 0.7860 (0.7584) closs: 1.0767 (1.0519) time: 4.1045 data: 0.0002 max mem: 54684 +[06:01:20.218941] Epoch: [1] [1540/3229] lr: 0.000031 grad_norm: 0.7508 (0.7584) closs: 1.0767 (1.0517) time: 4.1170 data: 0.0002 max mem: 54684 +[06:02:01.186493] Epoch: [1] [1550/3229] lr: 0.000031 grad_norm: 0.7369 (0.7583) closs: 1.0714 (1.0517) time: 4.1002 data: 0.0002 max mem: 54684 +[06:02:42.438164] Epoch: [1] [1560/3229] lr: 0.000030 grad_norm: 0.7650 (0.7584) closs: 1.0850 (1.0520) time: 4.1109 data: 0.0002 max mem: 54684 +[06:03:23.689124] Epoch: [1] [1570/3229] lr: 0.000030 grad_norm: 0.7622 (0.7584) closs: 1.0709 (1.0521) time: 4.1251 data: 0.0002 max mem: 54684 +[06:04:04.719315] Epoch: [1] [1580/3229] lr: 0.000030 grad_norm: 0.7378 (0.7583) closs: 1.0553 (1.0520) time: 4.1140 data: 0.0002 max mem: 54684 +[06:04:45.698136] Epoch: [1] [1590/3229] lr: 0.000030 grad_norm: 0.7576 (0.7584) closs: 1.0282 (1.0517) time: 4.1004 data: 0.0002 max mem: 54684 +[06:05:27.010602] Epoch: [1] [1600/3229] lr: 0.000030 grad_norm: 0.7794 (0.7588) closs: 1.0155 (1.0516) time: 4.1145 data: 0.0002 max mem: 54684 +[06:06:08.217639] Epoch: [1] [1610/3229] lr: 0.000030 grad_norm: 0.7703 (0.7590) closs: 1.0618 (1.0518) time: 4.1259 data: 0.0002 max mem: 54684 +[06:06:49.708388] Epoch: [1] [1620/3229] lr: 0.000030 grad_norm: 0.7704 (0.7592) closs: 1.0795 (1.0519) time: 4.1348 data: 0.0002 max mem: 54684 +[06:07:30.802140] Epoch: [1] [1630/3229] lr: 0.000030 grad_norm: 0.7723 (0.7593) closs: 1.0570 (1.0521) time: 4.1292 data: 0.0002 max mem: 54684 +[06:08:12.042914] Epoch: [1] [1640/3229] lr: 0.000030 grad_norm: 0.7690 (0.7593) closs: 1.0839 (1.0523) time: 4.1167 data: 0.0002 max mem: 54684 +[06:08:52.552185] Epoch: [1] [1650/3229] lr: 0.000030 grad_norm: 0.7620 (0.7592) closs: 1.0666 (1.0522) time: 4.0874 data: 0.0002 max mem: 54684 +[06:09:34.109821] Epoch: [1] [1660/3229] lr: 0.000030 grad_norm: 0.7443 (0.7592) closs: 1.0719 (1.0526) time: 4.1033 data: 0.0002 max mem: 54684 +[06:10:15.226675] Epoch: [1] [1670/3229] lr: 0.000030 grad_norm: 0.7640 (0.7592) closs: 1.0737 (1.0524) time: 4.1337 data: 0.0002 max mem: 54684 +[06:10:55.893166] Epoch: [1] [1680/3229] lr: 0.000030 grad_norm: 0.7557 (0.7591) closs: 1.0457 (1.0524) time: 4.0891 data: 0.0002 max mem: 54684 +[06:11:37.000880] Epoch: [1] [1690/3229] lr: 0.000029 grad_norm: 0.7449 (0.7591) closs: 1.0646 (1.0525) time: 4.0886 data: 0.0002 max mem: 54684 +[06:12:18.347873] Epoch: [1] [1700/3229] lr: 0.000029 grad_norm: 0.7646 (0.7592) closs: 1.0581 (1.0523) time: 4.1227 data: 0.0002 max mem: 54684 +[06:12:59.455957] Epoch: [1] [1710/3229] lr: 0.000029 grad_norm: 0.7725 (0.7593) closs: 1.0392 (1.0523) time: 4.1227 data: 0.0002 max mem: 54684 +[06:13:40.686492] Epoch: [1] [1720/3229] lr: 0.000029 grad_norm: 0.7731 (0.7595) closs: 1.0769 (1.0526) time: 4.1169 data: 0.0002 max mem: 54684 +[06:14:21.266645] Epoch: [1] [1730/3229] lr: 0.000029 grad_norm: 0.7730 (0.7594) closs: 1.0708 (1.0525) time: 4.0905 data: 0.0002 max mem: 54684 +[06:15:02.058571] Epoch: [1] [1740/3229] lr: 0.000029 grad_norm: 0.7642 (0.7593) closs: 1.0508 (1.0524) time: 4.0685 data: 0.0002 max mem: 54684 +[06:15:43.158278] Epoch: [1] [1750/3229] lr: 0.000029 grad_norm: 0.7606 (0.7594) closs: 1.0358 (1.0525) time: 4.0945 data: 0.0002 max mem: 54684 +[06:16:24.131577] Epoch: [1] [1760/3229] lr: 0.000029 grad_norm: 0.7606 (0.7593) closs: 1.0631 (1.0525) time: 4.1036 data: 0.0002 max mem: 54684 +[06:17:04.589751] Epoch: [1] [1770/3229] lr: 0.000029 grad_norm: 0.7370 (0.7591) closs: 1.0766 (1.0524) time: 4.0715 data: 0.0002 max mem: 54684 +[06:17:46.162753] Epoch: [1] [1780/3229] lr: 0.000029 grad_norm: 0.7053 (0.7589) closs: 1.0610 (1.0525) time: 4.1015 data: 0.0002 max mem: 54684 +[06:18:27.288021] Epoch: [1] [1790/3229] lr: 0.000029 grad_norm: 0.7304 (0.7589) closs: 1.0519 (1.0525) time: 4.1348 data: 0.0002 max mem: 54684 +[06:19:08.597875] Epoch: [1] [1800/3229] lr: 0.000029 grad_norm: 0.7718 (0.7593) closs: 1.0343 (1.0524) time: 4.1217 data: 0.0002 max mem: 54684 +[06:19:48.096823] Epoch: [1] [1810/3229] lr: 0.000028 grad_norm: 0.7355 (0.7589) closs: 0.9675 (1.0521) time: 4.0404 data: 0.0002 max mem: 54684 +[06:20:29.329365] Epoch: [1] [1820/3229] lr: 0.000028 grad_norm: 0.7154 (0.7589) closs: 1.0411 (1.0522) time: 4.0365 data: 0.0002 max mem: 54684 +[06:21:10.086608] Epoch: [1] [1830/3229] lr: 0.000028 grad_norm: 0.7412 (0.7589) closs: 1.0599 (1.0522) time: 4.0994 data: 0.0002 max mem: 54684 +[06:21:51.381549] Epoch: [1] [1840/3229] lr: 0.000028 grad_norm: 0.7621 (0.7589) closs: 1.0664 (1.0523) time: 4.1025 data: 0.0002 max mem: 54684 +[06:22:32.162442] Epoch: [1] [1850/3229] lr: 0.000028 grad_norm: 0.7662 (0.7590) closs: 1.0554 (1.0522) time: 4.1037 data: 0.0002 max mem: 54684 +[06:23:13.433902] Epoch: [1] [1860/3229] lr: 0.000028 grad_norm: 0.7852 (0.7590) closs: 1.0432 (1.0521) time: 4.1025 data: 0.0002 max mem: 54684 +[06:23:54.209307] Epoch: [1] [1870/3229] lr: 0.000028 grad_norm: 0.7674 (0.7591) closs: 1.0409 (1.0521) time: 4.1023 data: 0.0002 max mem: 54684 +[06:24:35.546365] Epoch: [1] [1880/3229] lr: 0.000028 grad_norm: 0.7487 (0.7591) closs: 1.0409 (1.0522) time: 4.1056 data: 0.0002 max mem: 54684 +[06:25:15.982687] Epoch: [1] [1890/3229] lr: 0.000028 grad_norm: 0.7683 (0.7591) closs: 1.0359 (1.0521) time: 4.0886 data: 0.0002 max mem: 54684 +[06:25:57.440510] Epoch: [1] [1900/3229] lr: 0.000028 grad_norm: 0.7670 (0.7592) closs: 1.0557 (1.0522) time: 4.0946 data: 0.0002 max mem: 54684 +[06:26:38.556623] Epoch: [1] [1910/3229] lr: 0.000028 grad_norm: 0.7661 (0.7592) closs: 1.0564 (1.0521) time: 4.1286 data: 0.0002 max mem: 54684 +[06:27:19.228566] Epoch: [1] [1920/3229] lr: 0.000028 grad_norm: 0.7542 (0.7590) closs: 1.0294 (1.0521) time: 4.0893 data: 0.0002 max mem: 54684 +[06:27:59.439243] Epoch: [1] [1930/3229] lr: 0.000028 grad_norm: 0.7197 (0.7587) closs: 1.0121 (1.0517) time: 4.0441 data: 0.0002 max mem: 54684 +[06:28:40.298280] Epoch: [1] [1940/3229] lr: 0.000027 grad_norm: 0.7343 (0.7586) closs: 1.0121 (1.0518) time: 4.0534 data: 0.0002 max mem: 54684 +[06:29:21.418176] Epoch: [1] [1950/3229] lr: 0.000027 grad_norm: 0.7666 (0.7588) closs: 1.0867 (1.0519) time: 4.0989 data: 0.0002 max mem: 54684 +[06:30:01.756178] Epoch: [1] [1960/3229] lr: 0.000027 grad_norm: 0.7619 (0.7587) closs: 1.0618 (1.0519) time: 4.0728 data: 0.0002 max mem: 54684 +[06:30:43.062087] Epoch: [1] [1970/3229] lr: 0.000027 grad_norm: 0.7588 (0.7588) closs: 1.0484 (1.0518) time: 4.0821 data: 0.0002 max mem: 54684 +[06:31:23.143763] Epoch: [1] [1980/3229] lr: 0.000027 grad_norm: 0.7496 (0.7584) closs: 1.0332 (1.0516) time: 4.0693 data: 0.0002 max mem: 54684 +[06:32:04.271038] Epoch: [1] [1990/3229] lr: 0.000027 grad_norm: 0.7496 (0.7585) closs: 1.0258 (1.0515) time: 4.0604 data: 0.0002 max mem: 54684 +[06:32:45.254334] Epoch: [1] [2000/3229] lr: 0.000027 grad_norm: 0.7825 (0.7587) closs: 1.0567 (1.0515) time: 4.1055 data: 0.0002 max mem: 54684 +[06:33:26.464223] Epoch: [1] [2010/3229] lr: 0.000027 grad_norm: 0.7723 (0.7588) closs: 1.0622 (1.0516) time: 4.1096 data: 0.0002 max mem: 54684 +[06:34:07.977608] Epoch: [1] [2020/3229] lr: 0.000027 grad_norm: 0.7569 (0.7588) closs: 1.0776 (1.0517) time: 4.1361 data: 0.0002 max mem: 54684 +[06:34:48.895957] Epoch: [1] [2030/3229] lr: 0.000027 grad_norm: 0.7560 (0.7586) closs: 1.0790 (1.0517) time: 4.1215 data: 0.0002 max mem: 54684 +[06:35:29.451828] Epoch: [1] [2040/3229] lr: 0.000027 grad_norm: 0.7256 (0.7584) closs: 1.0240 (1.0514) time: 4.0736 data: 0.0002 max mem: 54684 +[06:36:10.091914] Epoch: [1] [2050/3229] lr: 0.000027 grad_norm: 0.7247 (0.7581) closs: 1.0561 (1.0513) time: 4.0597 data: 0.0002 max mem: 54684 +[06:36:50.691448] Epoch: [1] [2060/3229] lr: 0.000027 grad_norm: 0.7255 (0.7579) closs: 1.0204 (1.0512) time: 4.0619 data: 0.0002 max mem: 54684 +[06:37:31.150888] Epoch: [1] [2070/3229] lr: 0.000026 grad_norm: 0.7327 (0.7578) closs: 1.0344 (1.0512) time: 4.0529 data: 0.0002 max mem: 54684 +[06:38:11.548200] Epoch: [1] [2080/3229] lr: 0.000026 grad_norm: 0.7327 (0.7576) closs: 1.0414 (1.0511) time: 4.0428 data: 0.0002 max mem: 54684 +[06:38:51.240169] Epoch: [1] [2090/3229] lr: 0.000026 grad_norm: 0.7045 (0.7574) closs: 0.9855 (1.0507) time: 4.0044 data: 0.0002 max mem: 54684 +[06:39:32.307282] Epoch: [1] [2100/3229] lr: 0.000026 grad_norm: 0.7700 (0.7575) closs: 0.9816 (1.0506) time: 4.0379 data: 0.0002 max mem: 54684 +[06:40:12.900721] Epoch: [1] [2110/3229] lr: 0.000026 grad_norm: 0.7683 (0.7572) closs: 1.0346 (1.0504) time: 4.0830 data: 0.0002 max mem: 54684 +[06:40:54.073838] Epoch: [1] [2120/3229] lr: 0.000026 grad_norm: 0.7175 (0.7572) closs: 1.0233 (1.0503) time: 4.0883 data: 0.0002 max mem: 54684 +[06:41:34.494204] Epoch: [1] [2130/3229] lr: 0.000026 grad_norm: 0.7567 (0.7575) closs: 1.0420 (1.0504) time: 4.0796 data: 0.0002 max mem: 54684 +[06:42:14.786527] Epoch: [1] [2140/3229] lr: 0.000026 grad_norm: 0.7746 (0.7574) closs: 1.0572 (1.0504) time: 4.0356 data: 0.0002 max mem: 54684 +[06:42:56.042219] Epoch: [1] [2150/3229] lr: 0.000026 grad_norm: 0.7436 (0.7574) closs: 1.0572 (1.0506) time: 4.0773 data: 0.0002 max mem: 54684 +[06:43:36.873461] Epoch: [1] [2160/3229] lr: 0.000026 grad_norm: 0.7565 (0.7574) closs: 1.1052 (1.0509) time: 4.1043 data: 0.0002 max mem: 54684 +[06:44:18.036372] Epoch: [1] [2170/3229] lr: 0.000026 grad_norm: 0.7559 (0.7573) closs: 1.0642 (1.0508) time: 4.0996 data: 0.0002 max mem: 54684 +[06:44:58.954633] Epoch: [1] [2180/3229] lr: 0.000026 grad_norm: 0.7512 (0.7574) closs: 1.0519 (1.0507) time: 4.1040 data: 0.0002 max mem: 54684 +[06:45:39.550709] Epoch: [1] [2190/3229] lr: 0.000026 grad_norm: 0.7721 (0.7580) closs: 1.0434 (1.0508) time: 4.0757 data: 0.0002 max mem: 54684 +[06:46:20.109102] Epoch: [1] [2200/3229] lr: 0.000025 grad_norm: 0.7721 (0.7579) closs: 1.0396 (1.0507) time: 4.0577 data: 0.0002 max mem: 54684 +[06:47:01.387992] Epoch: [1] [2210/3229] lr: 0.000025 grad_norm: 0.7774 (0.7582) closs: 1.0393 (1.0508) time: 4.0918 data: 0.0002 max mem: 54684 +[06:47:42.292906] Epoch: [1] [2220/3229] lr: 0.000025 grad_norm: 0.7702 (0.7580) closs: 1.0655 (1.0509) time: 4.1091 data: 0.0002 max mem: 54684 +[06:48:22.615690] Epoch: [1] [2230/3229] lr: 0.000025 grad_norm: 0.7125 (0.7578) closs: 1.0602 (1.0509) time: 4.0613 data: 0.0002 max mem: 54684 +[06:49:03.122150] Epoch: [1] [2240/3229] lr: 0.000025 grad_norm: 0.7564 (0.7580) closs: 1.0392 (1.0507) time: 4.0414 data: 0.0002 max mem: 54684 +[06:49:43.913988] Epoch: [1] [2250/3229] lr: 0.000025 grad_norm: 0.7275 (0.7587) closs: 1.0392 (1.0508) time: 4.0649 data: 0.0002 max mem: 54684 +[06:50:24.836181] Epoch: [1] [2260/3229] lr: 0.000025 grad_norm: 0.7275 (0.7588) closs: 1.0988 (1.0510) time: 4.0856 data: 0.0002 max mem: 54684 +[06:51:05.820250] Epoch: [1] [2270/3229] lr: 0.000025 grad_norm: 0.7384 (0.7587) closs: 1.0513 (1.0508) time: 4.0953 data: 0.0002 max mem: 54684 +[06:51:46.269588] Epoch: [1] [2280/3229] lr: 0.000025 grad_norm: 0.7730 (0.7588) closs: 1.0402 (1.0509) time: 4.0716 data: 0.0002 max mem: 54684 +[06:52:27.410955] Epoch: [1] [2290/3229] lr: 0.000025 grad_norm: 0.7238 (0.7586) closs: 1.0385 (1.0507) time: 4.0795 data: 0.0002 max mem: 54684 +[06:53:08.358905] Epoch: [1] [2300/3229] lr: 0.000025 grad_norm: 0.7204 (0.7586) closs: 1.0344 (1.0506) time: 4.1044 data: 0.0002 max mem: 54684 +[06:53:49.351820] Epoch: [1] [2310/3229] lr: 0.000025 grad_norm: 0.7408 (0.7585) closs: 1.0498 (1.0506) time: 4.0970 data: 0.0002 max mem: 54684 +[06:54:30.150638] Epoch: [1] [2320/3229] lr: 0.000025 grad_norm: 0.7644 (0.7586) closs: 1.0665 (1.0507) time: 4.0895 data: 0.0002 max mem: 54684 +[06:55:11.609155] Epoch: [1] [2330/3229] lr: 0.000024 grad_norm: 0.7644 (0.7586) closs: 1.0783 (1.0508) time: 4.1128 data: 0.0002 max mem: 54684 +[06:55:52.526582] Epoch: [1] [2340/3229] lr: 0.000024 grad_norm: 0.7481 (0.7587) closs: 1.0944 (1.0510) time: 4.1187 data: 0.0002 max mem: 54684 +[06:56:33.528322] Epoch: [1] [2350/3229] lr: 0.000024 grad_norm: 0.7847 (0.7587) closs: 1.1101 (1.0511) time: 4.0959 data: 0.0002 max mem: 54684 +[06:57:13.657474] Epoch: [1] [2360/3229] lr: 0.000024 grad_norm: 0.7256 (0.7585) closs: 1.0694 (1.0511) time: 4.0565 data: 0.0002 max mem: 54684 +[06:57:54.348834] Epoch: [1] [2370/3229] lr: 0.000024 grad_norm: 0.7192 (0.7584) closs: 1.0375 (1.0509) time: 4.0410 data: 0.0002 max mem: 54684 +[06:58:35.300759] Epoch: [1] [2380/3229] lr: 0.000024 grad_norm: 0.7204 (0.7584) closs: 1.0375 (1.0509) time: 4.0821 data: 0.0002 max mem: 54684 +[06:59:15.644906] Epoch: [1] [2390/3229] lr: 0.000024 grad_norm: 0.7417 (0.7583) closs: 1.0504 (1.0508) time: 4.0647 data: 0.0002 max mem: 54684 +[06:59:56.423928] Epoch: [1] [2400/3229] lr: 0.000024 grad_norm: 0.7580 (0.7583) closs: 1.0390 (1.0508) time: 4.0561 data: 0.0002 max mem: 54684 +[07:00:36.732262] Epoch: [1] [2410/3229] lr: 0.000024 grad_norm: 0.7551 (0.7580) closs: 1.0390 (1.0508) time: 4.0543 data: 0.0002 max mem: 54684 +[07:01:17.509223] Epoch: [1] [2420/3229] lr: 0.000024 grad_norm: 0.7224 (0.7579) closs: 1.0253 (1.0507) time: 4.0542 data: 0.0002 max mem: 54684 +[07:01:59.103175] Epoch: [1] [2430/3229] lr: 0.000024 grad_norm: 0.7534 (0.7580) closs: 1.0289 (1.0507) time: 4.1185 data: 0.0002 max mem: 54684 +[07:02:39.633371] Epoch: [1] [2440/3229] lr: 0.000024 grad_norm: 0.7523 (0.7579) closs: 1.0303 (1.0506) time: 4.1061 data: 0.0002 max mem: 54684 +[07:03:21.099615] Epoch: [1] [2450/3229] lr: 0.000024 grad_norm: 0.7399 (0.7580) closs: 1.0666 (1.0508) time: 4.0998 data: 0.0002 max mem: 54684 +[07:04:01.876483] Epoch: [1] [2460/3229] lr: 0.000023 grad_norm: 0.7601 (0.7597) closs: 1.0710 (1.0509) time: 4.1121 data: 0.0002 max mem: 54684 +[07:04:42.177832] Epoch: [1] [2470/3229] lr: 0.000023 grad_norm: 0.7070 (0.7594) closs: 1.0346 (1.0508) time: 4.0538 data: 0.0002 max mem: 54684 +[07:05:22.730871] Epoch: [1] [2480/3229] lr: 0.000023 grad_norm: 0.6798 (0.7592) closs: 1.0346 (1.0508) time: 4.0427 data: 0.0002 max mem: 54684 +[07:06:04.230691] Epoch: [1] [2490/3229] lr: 0.000023 grad_norm: 0.7520 (0.7593) closs: 1.0195 (1.0507) time: 4.1026 data: 0.0002 max mem: 54684 +[07:06:44.671239] Epoch: [1] [2500/3229] lr: 0.000023 grad_norm: 0.7559 (0.7592) closs: 1.0066 (1.0506) time: 4.0970 data: 0.0002 max mem: 54684 +[07:07:25.306096] Epoch: [1] [2510/3229] lr: 0.000023 grad_norm: 0.7394 (0.7590) closs: 1.0125 (1.0504) time: 4.0537 data: 0.0002 max mem: 54684 +[07:08:05.672440] Epoch: [1] [2520/3229] lr: 0.000023 grad_norm: 0.7341 (0.7589) closs: 1.0304 (1.0502) time: 4.0500 data: 0.0002 max mem: 54684 +[07:08:46.342159] Epoch: [1] [2530/3229] lr: 0.000023 grad_norm: 0.7582 (0.7590) closs: 1.0381 (1.0503) time: 4.0517 data: 0.0002 max mem: 54684 +[07:09:27.086959] Epoch: [1] [2540/3229] lr: 0.000023 grad_norm: 0.7684 (0.7590) closs: 1.0815 (1.0503) time: 4.0707 data: 0.0002 max mem: 54684 +[07:10:08.439021] Epoch: [1] [2550/3229] lr: 0.000023 grad_norm: 0.7684 (0.7590) closs: 1.0815 (1.0505) time: 4.1048 data: 0.0002 max mem: 54684 +[07:10:49.035524] Epoch: [1] [2560/3229] lr: 0.000023 grad_norm: 0.7595 (0.7590) closs: 1.0433 (1.0504) time: 4.0974 data: 0.0002 max mem: 54684 +[07:11:30.100693] Epoch: [1] [2570/3229] lr: 0.000023 grad_norm: 0.7435 (0.7588) closs: 1.0562 (1.0506) time: 4.0830 data: 0.0002 max mem: 54684 +[07:12:11.188418] Epoch: [1] [2580/3229] lr: 0.000023 grad_norm: 0.7404 (0.7589) closs: 1.1022 (1.0507) time: 4.1076 data: 0.0002 max mem: 54684 +[07:12:52.475637] Epoch: [1] [2590/3229] lr: 0.000022 grad_norm: 0.7772 (0.7591) closs: 1.0955 (1.0509) time: 4.1187 data: 0.0002 max mem: 54684 +[07:13:33.482943] Epoch: [1] [2600/3229] lr: 0.000022 grad_norm: 0.7876 (0.7590) closs: 1.0745 (1.0508) time: 4.1147 data: 0.0002 max mem: 54684 +[07:14:14.212298] Epoch: [1] [2610/3229] lr: 0.000022 grad_norm: 0.7625 (0.7589) closs: 1.0001 (1.0507) time: 4.0868 data: 0.0002 max mem: 54684 +[07:14:55.108401] Epoch: [1] [2620/3229] lr: 0.000022 grad_norm: 0.7431 (0.7588) closs: 1.0001 (1.0506) time: 4.0812 data: 0.0002 max mem: 54684 +[07:15:36.195488] Epoch: [1] [2630/3229] lr: 0.000022 grad_norm: 0.7435 (0.7590) closs: 1.0573 (1.0508) time: 4.0991 data: 0.0002 max mem: 54684 +[07:16:17.647209] Epoch: [1] [2640/3229] lr: 0.000022 grad_norm: 0.7787 (0.7590) closs: 1.0782 (1.0508) time: 4.1269 data: 0.0002 max mem: 54684 +[07:16:58.892659] Epoch: [1] [2650/3229] lr: 0.000022 grad_norm: 0.7769 (0.7590) closs: 1.0465 (1.0508) time: 4.1348 data: 0.0002 max mem: 54684 +[07:17:39.799115] Epoch: [1] [2660/3229] lr: 0.000022 grad_norm: 0.7668 (0.7590) closs: 1.0389 (1.0509) time: 4.1075 data: 0.0002 max mem: 54684 +[07:18:20.650599] Epoch: [1] [2670/3229] lr: 0.000022 grad_norm: 0.7837 (0.7592) closs: 1.0408 (1.0508) time: 4.0878 data: 0.0002 max mem: 54684 +[07:19:01.954445] Epoch: [1] [2680/3229] lr: 0.000022 grad_norm: 0.7896 (0.7591) closs: 1.0526 (1.0508) time: 4.1077 data: 0.0002 max mem: 54684 +[07:19:42.555933] Epoch: [1] [2690/3229] lr: 0.000022 grad_norm: 0.7348 (0.7591) closs: 1.0526 (1.0508) time: 4.0952 data: 0.0002 max mem: 54684 +[07:20:23.489822] Epoch: [1] [2700/3229] lr: 0.000022 grad_norm: 0.7487 (0.7592) closs: 1.0357 (1.0508) time: 4.0767 data: 0.0002 max mem: 54684 +[07:21:03.709034] Epoch: [1] [2710/3229] lr: 0.000022 grad_norm: 0.7536 (0.7593) closs: 1.0253 (1.0506) time: 4.0576 data: 0.0002 max mem: 54684 +[07:21:44.808335] Epoch: [1] [2720/3229] lr: 0.000021 grad_norm: 0.7750 (0.7593) closs: 1.0362 (1.0505) time: 4.0659 data: 0.0002 max mem: 54684 +[07:22:26.050486] Epoch: [1] [2730/3229] lr: 0.000021 grad_norm: 0.7431 (0.7593) closs: 1.0379 (1.0505) time: 4.1170 data: 0.0002 max mem: 54684 +[07:23:06.929188] Epoch: [1] [2740/3229] lr: 0.000021 grad_norm: 0.7314 (0.7592) closs: 1.0382 (1.0505) time: 4.1060 data: 0.0002 max mem: 54684 +[07:23:47.376913] Epoch: [1] [2750/3229] lr: 0.000021 grad_norm: 0.7727 (0.7592) closs: 1.0259 (1.0503) time: 4.0663 data: 0.0002 max mem: 54684 +[07:24:28.781962] Epoch: [1] [2760/3229] lr: 0.000021 grad_norm: 0.7543 (0.7591) closs: 1.0343 (1.0504) time: 4.0926 data: 0.0002 max mem: 54684 +[07:25:09.722107] Epoch: [1] [2770/3229] lr: 0.000021 grad_norm: 0.7543 (0.7592) closs: 1.0747 (1.0505) time: 4.1172 data: 0.0002 max mem: 54684 +[07:25:50.621826] Epoch: [1] [2780/3229] lr: 0.000021 grad_norm: 0.7577 (0.7591) closs: 1.0669 (1.0505) time: 4.0919 data: 0.0002 max mem: 54684 +[07:26:30.847780] Epoch: [1] [2790/3229] lr: 0.000021 grad_norm: 0.7133 (0.7590) closs: 1.0389 (1.0505) time: 4.0562 data: 0.0002 max mem: 54684 +[07:27:12.004919] Epoch: [1] [2800/3229] lr: 0.000021 grad_norm: 0.7479 (0.7588) closs: 1.0389 (1.0505) time: 4.0691 data: 0.0002 max mem: 54684 +[07:27:52.590244] Epoch: [1] [2810/3229] lr: 0.000021 grad_norm: 0.7538 (0.7590) closs: 1.0261 (1.0503) time: 4.0871 data: 0.0002 max mem: 54684 +[07:28:33.829143] Epoch: [1] [2820/3229] lr: 0.000021 grad_norm: 0.7608 (0.7591) closs: 1.0348 (1.0505) time: 4.0911 data: 0.0002 max mem: 54684 +[07:29:14.344969] Epoch: [1] [2830/3229] lr: 0.000021 grad_norm: 0.7575 (0.7589) closs: 1.0923 (1.0506) time: 4.0877 data: 0.0002 max mem: 54684 +[07:29:54.993752] Epoch: [1] [2840/3229] lr: 0.000021 grad_norm: 0.7351 (0.7589) closs: 1.0629 (1.0506) time: 4.0582 data: 0.0002 max mem: 54684 +[07:30:35.251355] Epoch: [1] [2850/3229] lr: 0.000020 grad_norm: 0.7529 (0.7587) closs: 1.0003 (1.0504) time: 4.0453 data: 0.0002 max mem: 54684 +[07:31:15.852435] Epoch: [1] [2860/3229] lr: 0.000020 grad_norm: 0.7370 (0.7586) closs: 1.0153 (1.0504) time: 4.0429 data: 0.0002 max mem: 54684 +[07:31:56.405734] Epoch: [1] [2870/3229] lr: 0.000020 grad_norm: 0.7265 (0.7583) closs: 1.0422 (1.0503) time: 4.0577 data: 0.0002 max mem: 54684 +[07:32:37.536609] Epoch: [1] [2880/3229] lr: 0.000020 grad_norm: 0.7202 (0.7582) closs: 1.0430 (1.0504) time: 4.0841 data: 0.0002 max mem: 54684 +[07:33:18.137999] Epoch: [1] [2890/3229] lr: 0.000020 grad_norm: 0.7069 (0.7580) closs: 1.0299 (1.0504) time: 4.0866 data: 0.0002 max mem: 54684 +[07:33:59.054025] Epoch: [1] [2900/3229] lr: 0.000020 grad_norm: 0.7126 (0.7579) closs: 0.9991 (1.0503) time: 4.0758 data: 0.0002 max mem: 54684 +[07:34:40.302514] Epoch: [1] [2910/3229] lr: 0.000020 grad_norm: 0.7618 (0.7580) closs: 1.0246 (1.0504) time: 4.1082 data: 0.0002 max mem: 54684 +[07:35:21.613397] Epoch: [1] [2920/3229] lr: 0.000020 grad_norm: 0.7726 (0.7581) closs: 1.0246 (1.0504) time: 4.1279 data: 0.0002 max mem: 54684 +[07:36:01.885012] Epoch: [1] [2930/3229] lr: 0.000020 grad_norm: 0.7795 (0.7580) closs: 1.0148 (1.0502) time: 4.0791 data: 0.0002 max mem: 54684 +[07:36:42.517521] Epoch: [1] [2940/3229] lr: 0.000020 grad_norm: 0.7532 (0.7580) closs: 1.0222 (1.0501) time: 4.0451 data: 0.0002 max mem: 54684 +[07:37:22.983411] Epoch: [1] [2950/3229] lr: 0.000020 grad_norm: 0.7556 (0.7579) closs: 1.0219 (1.0500) time: 4.0549 data: 0.0002 max mem: 54684 +[07:38:03.810435] Epoch: [1] [2960/3229] lr: 0.000020 grad_norm: 0.7329 (0.7577) closs: 1.0213 (1.0499) time: 4.0646 data: 0.0002 max mem: 54684 +[07:38:44.734197] Epoch: [1] [2970/3229] lr: 0.000020 grad_norm: 0.7669 (0.7578) closs: 1.0230 (1.0499) time: 4.0875 data: 0.0002 max mem: 54684 +[07:39:25.067815] Epoch: [1] [2980/3229] lr: 0.000020 grad_norm: 0.7443 (0.7575) closs: 1.0170 (1.0497) time: 4.0628 data: 0.0002 max mem: 54684 +[07:40:06.277140] Epoch: [1] [2990/3229] lr: 0.000019 grad_norm: 0.7237 (0.7575) closs: 1.0334 (1.0498) time: 4.0771 data: 0.0002 max mem: 54684 +[07:40:47.518619] Epoch: [1] [3000/3229] lr: 0.000019 grad_norm: 0.7443 (0.7576) closs: 1.0808 (1.0499) time: 4.1225 data: 0.0002 max mem: 54684 +[07:41:28.756394] Epoch: [1] [3010/3229] lr: 0.000019 grad_norm: 0.7306 (0.7576) closs: 1.0731 (1.0499) time: 4.1239 data: 0.0002 max mem: 54684 +[07:42:09.820428] Epoch: [1] [3020/3229] lr: 0.000019 grad_norm: 0.7376 (0.7576) closs: 1.0600 (1.0498) time: 4.1150 data: 0.0002 max mem: 54684 +[07:42:50.371264] Epoch: [1] [3030/3229] lr: 0.000019 grad_norm: 0.7764 (0.7576) closs: 1.0239 (1.0497) time: 4.0807 data: 0.0002 max mem: 54684 +[07:43:31.735769] Epoch: [1] [3040/3229] lr: 0.000019 grad_norm: 0.8111 (0.7577) closs: 1.0480 (1.0498) time: 4.0957 data: 0.0002 max mem: 54684 +[07:44:12.340935] Epoch: [1] [3050/3229] lr: 0.000019 grad_norm: 0.7614 (0.7576) closs: 1.0513 (1.0498) time: 4.0984 data: 0.0002 max mem: 54684 +[07:44:53.338278] Epoch: [1] [3060/3229] lr: 0.000019 grad_norm: 0.7472 (0.7577) closs: 1.0586 (1.0498) time: 4.0801 data: 0.0002 max mem: 54684 +[07:45:33.983619] Epoch: [1] [3070/3229] lr: 0.000019 grad_norm: 0.7472 (0.7576) closs: 1.0348 (1.0497) time: 4.0821 data: 0.0002 max mem: 54684 +[07:46:14.880787] Epoch: [1] [3080/3229] lr: 0.000019 grad_norm: 0.7548 (0.7576) closs: 1.0463 (1.0498) time: 4.0771 data: 0.0002 max mem: 54684 +[07:46:55.135508] Epoch: [1] [3090/3229] lr: 0.000019 grad_norm: 0.7275 (0.7574) closs: 1.0621 (1.0497) time: 4.0575 data: 0.0002 max mem: 54684 +[07:47:35.489073] Epoch: [1] [3100/3229] lr: 0.000019 grad_norm: 0.7098 (0.7573) closs: 0.9939 (1.0496) time: 4.0303 data: 0.0002 max mem: 54684 +[07:48:16.842485] Epoch: [1] [3110/3229] lr: 0.000019 grad_norm: 0.7606 (0.7574) closs: 1.0400 (1.0497) time: 4.0853 data: 0.0002 max mem: 54684 +[07:48:58.060831] Epoch: [1] [3120/3229] lr: 0.000019 grad_norm: 0.7567 (0.7574) closs: 1.0607 (1.0498) time: 4.1285 data: 0.0002 max mem: 54684 +[07:49:39.119706] Epoch: [1] [3130/3229] lr: 0.000018 grad_norm: 0.7286 (0.7572) closs: 1.0710 (1.0499) time: 4.1138 data: 0.0002 max mem: 54684 +[07:50:19.961668] Epoch: [1] [3140/3229] lr: 0.000018 grad_norm: 0.7239 (0.7572) closs: 1.0326 (1.0498) time: 4.0950 data: 0.0002 max mem: 54684 +[07:51:00.627490] Epoch: [1] [3150/3229] lr: 0.000018 grad_norm: 0.7701 (0.7572) closs: 1.0245 (1.0498) time: 4.0753 data: 0.0002 max mem: 54684 +[07:51:41.096988] Epoch: [1] [3160/3229] lr: 0.000018 grad_norm: 0.7701 (0.7572) closs: 1.0417 (1.0497) time: 4.0567 data: 0.0002 max mem: 54684 +[07:52:21.540824] Epoch: [1] [3170/3229] lr: 0.000018 grad_norm: 0.7199 (0.7571) closs: 1.0092 (1.0496) time: 4.0456 data: 0.0002 max mem: 54684 +[07:53:01.782122] Epoch: [1] [3180/3229] lr: 0.000018 grad_norm: 0.7290 (0.7570) closs: 1.0158 (1.0495) time: 4.0342 data: 0.0002 max mem: 54684 +[07:53:42.520581] Epoch: [1] [3190/3229] lr: 0.000018 grad_norm: 0.7290 (0.7569) closs: 1.0676 (1.0495) time: 4.0489 data: 0.0002 max mem: 54684 +[07:54:22.768996] Epoch: [1] [3200/3229] lr: 0.000018 grad_norm: 0.7260 (0.7568) closs: 1.0775 (1.0495) time: 4.0493 data: 0.0002 max mem: 54684 +[07:55:03.308755] Epoch: [1] [3210/3229] lr: 0.000018 grad_norm: 0.7512 (0.7568) closs: 1.0424 (1.0494) time: 4.0393 data: 0.0002 max mem: 54684 +[07:55:44.129491] Epoch: [1] [3220/3229] lr: 0.000018 grad_norm: 0.7508 (0.7567) closs: 1.0736 (1.0494) time: 4.0680 data: 0.0001 max mem: 54684 +[07:56:16.699747] Epoch: [1] Total time: 3:40:05 +[07:56:16.736123] Averaged stats: lr: 0.000018 grad_norm: 0.7281 (0.7566) closs: 1.0383 (1.0483) +[07:56:17.102213] model saved +[07:56:18.915517] optimizer saved +[07:56:18.916157] other rank-common saved +[07:56:18.922035] rank-specific saved +[07:56:18.936513] log_dir: ./output_dir +[07:56:31.268719] Epoch: [2] [0/3229] lr: 0.000018 grad_norm: 0.8262 (0.8262) closs: 1.0084 (1.0084) time: 12.3313 data: 8.2426 max mem: 54684 +[07:57:12.183294] Epoch: [2] [10/3229] lr: 0.000018 grad_norm: 0.7358 (0.7439) closs: 1.0242 (1.0370) time: 4.8405 data: 0.7495 max mem: 54684 +[07:57:52.839749] Epoch: [2] [20/3229] lr: 0.000018 grad_norm: 0.7358 (0.7614) closs: 1.0264 (1.0467) time: 4.0785 data: 0.0002 max mem: 54684 +[07:58:33.435224] Epoch: [2] [30/3229] lr: 0.000018 grad_norm: 0.7811 (0.7651) closs: 1.0351 (1.0354) time: 4.0625 data: 0.0002 max mem: 54684 +[07:59:15.317025] Epoch: [2] [40/3229] lr: 0.000017 grad_norm: 0.7708 (0.7695) closs: 1.0460 (1.0453) time: 4.1238 data: 0.0002 max mem: 54684 +[07:59:56.906136] Epoch: [2] [50/3229] lr: 0.000017 grad_norm: 0.7742 (0.7745) closs: 1.0633 (1.0491) time: 4.1735 data: 0.0002 max mem: 54684 +[08:00:37.793068] Epoch: [2] [60/3229] lr: 0.000017 grad_norm: 0.7780 (0.7750) closs: 1.0396 (1.0518) time: 4.1237 data: 0.0002 max mem: 54684 +[08:01:17.989880] Epoch: [2] [70/3229] lr: 0.000017 grad_norm: 0.7406 (0.7669) closs: 1.0464 (1.0460) time: 4.0541 data: 0.0002 max mem: 54684 +[08:01:59.026748] Epoch: [2] [80/3229] lr: 0.000017 grad_norm: 0.7321 (0.7684) closs: 1.0689 (1.0462) time: 4.0616 data: 0.0002 max mem: 54684 +[08:02:39.476084] Epoch: [2] [90/3229] lr: 0.000017 grad_norm: 0.7710 (0.7672) closs: 1.0697 (1.0457) time: 4.0742 data: 0.0002 max mem: 54684 +[08:03:20.882001] Epoch: [2] [100/3229] lr: 0.000017 grad_norm: 0.7731 (0.7683) closs: 1.0697 (1.0507) time: 4.0927 data: 0.0002 max mem: 54684 +[08:04:01.732738] Epoch: [2] [110/3229] lr: 0.000017 grad_norm: 0.7636 (0.7693) closs: 1.0633 (1.0496) time: 4.1128 data: 0.0002 max mem: 54684 +[08:04:43.243616] Epoch: [2] [120/3229] lr: 0.000017 grad_norm: 0.7653 (0.7695) closs: 1.0444 (1.0471) time: 4.1180 data: 0.0002 max mem: 54684 +[08:05:24.640326] Epoch: [2] [130/3229] lr: 0.000017 grad_norm: 0.7895 (0.7719) closs: 1.0530 (1.0490) time: 4.1453 data: 0.0002 max mem: 54684 +[08:06:04.919803] Epoch: [2] [140/3229] lr: 0.000017 grad_norm: 0.7767 (0.7695) closs: 1.0625 (1.0491) time: 4.0837 data: 0.0002 max mem: 54684 +[08:06:45.485984] Epoch: [2] [150/3229] lr: 0.000017 grad_norm: 0.7487 (0.7669) closs: 1.0756 (1.0476) time: 4.0422 data: 0.0002 max mem: 54684 +[08:07:26.589317] Epoch: [2] [160/3229] lr: 0.000017 grad_norm: 0.7543 (0.7668) closs: 1.0570 (1.0491) time: 4.0834 data: 0.0002 max mem: 54684 +[08:08:07.809141] Epoch: [2] [170/3229] lr: 0.000017 grad_norm: 0.7705 (0.7656) closs: 1.0286 (1.0475) time: 4.1161 data: 0.0002 max mem: 54684 +[08:08:48.382644] Epoch: [2] [180/3229] lr: 0.000016 grad_norm: 0.7462 (0.7654) closs: 1.0195 (1.0463) time: 4.0896 data: 0.0002 max mem: 54684 +[08:09:29.267697] Epoch: [2] [190/3229] lr: 0.000016 grad_norm: 0.7566 (0.7639) closs: 1.0265 (1.0465) time: 4.0729 data: 0.0002 max mem: 54684 +[08:10:09.377776] Epoch: [2] [200/3229] lr: 0.000016 grad_norm: 0.7441 (0.7628) closs: 1.0265 (1.0435) time: 4.0497 data: 0.0002 max mem: 54684 +[08:10:49.302139] Epoch: [2] [210/3229] lr: 0.000016 grad_norm: 0.6802 (0.7596) closs: 0.9949 (1.0425) time: 4.0017 data: 0.0002 max mem: 54684 +[08:11:30.519323] Epoch: [2] [220/3229] lr: 0.000016 grad_norm: 0.7500 (0.7609) closs: 1.0888 (1.0456) time: 4.0570 data: 0.0002 max mem: 54684 +[08:12:11.704337] Epoch: [2] [230/3229] lr: 0.000016 grad_norm: 0.7773 (0.7609) closs: 1.0791 (1.0452) time: 4.1200 data: 0.0002 max mem: 54684 +[08:12:52.393373] Epoch: [2] [240/3229] lr: 0.000016 grad_norm: 0.7641 (0.7595) closs: 1.0536 (1.0445) time: 4.0936 data: 0.0002 max mem: 54684 +[08:13:33.064691] Epoch: [2] [250/3229] lr: 0.000016 grad_norm: 0.7542 (0.7585) closs: 1.0588 (1.0449) time: 4.0680 data: 0.0002 max mem: 54684 +[08:14:14.045473] Epoch: [2] [260/3229] lr: 0.000016 grad_norm: 0.7569 (0.7583) closs: 1.0562 (1.0460) time: 4.0825 data: 0.0002 max mem: 54684 +[08:14:54.905661] Epoch: [2] [270/3229] lr: 0.000016 grad_norm: 0.7574 (0.7579) closs: 1.0383 (1.0455) time: 4.0920 data: 0.0002 max mem: 54684 +[08:15:35.926717] Epoch: [2] [280/3229] lr: 0.000016 grad_norm: 0.7558 (0.7577) closs: 1.0300 (1.0455) time: 4.0940 data: 0.0002 max mem: 54684 +[08:16:16.814567] Epoch: [2] [290/3229] lr: 0.000016 grad_norm: 0.7416 (0.7564) closs: 1.0680 (1.0467) time: 4.0954 data: 0.0002 max mem: 54684 +[08:16:58.068460] Epoch: [2] [300/3229] lr: 0.000016 grad_norm: 0.7514 (0.7573) closs: 1.0594 (1.0468) time: 4.1070 data: 0.0002 max mem: 54684 +[08:17:38.927633] Epoch: [2] [310/3229] lr: 0.000016 grad_norm: 0.7797 (0.7577) closs: 1.0474 (1.0460) time: 4.1056 data: 0.0002 max mem: 54684 +[08:18:19.652536] Epoch: [2] [320/3229] lr: 0.000016 grad_norm: 0.7461 (0.7563) closs: 1.0455 (1.0458) time: 4.0791 data: 0.0002 max mem: 54684 +[08:19:00.882256] Epoch: [2] [330/3229] lr: 0.000015 grad_norm: 0.7702 (0.7581) closs: 1.0466 (1.0463) time: 4.0977 data: 0.0002 max mem: 54684 +[08:19:40.850985] Epoch: [2] [340/3229] lr: 0.000015 grad_norm: 0.7544 (0.7554) closs: 1.0466 (1.0460) time: 4.0599 data: 0.0002 max mem: 54684 +[08:20:21.704908] Epoch: [2] [350/3229] lr: 0.000015 grad_norm: 0.7372 (0.7563) closs: 1.0080 (1.0453) time: 4.0411 data: 0.0002 max mem: 54684 +[08:21:02.268285] Epoch: [2] [360/3229] lr: 0.000015 grad_norm: 0.8174 (0.7568) closs: 0.9763 (1.0430) time: 4.0708 data: 0.0002 max mem: 54684 +[08:21:42.525189] Epoch: [2] [370/3229] lr: 0.000015 grad_norm: 0.7369 (0.7568) closs: 0.9641 (1.0418) time: 4.0410 data: 0.0002 max mem: 54684 +[08:22:23.507510] Epoch: [2] [380/3229] lr: 0.000015 grad_norm: 0.7944 (0.7578) closs: 1.0548 (1.0421) time: 4.0619 data: 0.0002 max mem: 54684 +[08:23:04.788986] Epoch: [2] [390/3229] lr: 0.000015 grad_norm: 0.7995 (0.7582) closs: 1.0502 (1.0420) time: 4.1131 data: 0.0002 max mem: 54684 +[08:23:45.036107] Epoch: [2] [400/3229] lr: 0.000015 grad_norm: 0.7850 (0.7579) closs: 1.0068 (1.0413) time: 4.0764 data: 0.0002 max mem: 54684 +[08:24:25.660061] Epoch: [2] [410/3229] lr: 0.000015 grad_norm: 0.7850 (0.7576) closs: 1.0540 (1.0410) time: 4.0435 data: 0.0002 max mem: 54684 +[08:25:06.015694] Epoch: [2] [420/3229] lr: 0.000015 grad_norm: 0.7293 (0.7570) closs: 1.0468 (1.0396) time: 4.0489 data: 0.0002 max mem: 54684 +[08:25:46.999427] Epoch: [2] [430/3229] lr: 0.000015 grad_norm: 0.7238 (0.7570) closs: 1.0225 (1.0398) time: 4.0669 data: 0.0002 max mem: 54684 +[08:26:27.949780] Epoch: [2] [440/3229] lr: 0.000015 grad_norm: 0.7509 (0.7575) closs: 1.0472 (1.0399) time: 4.0966 data: 0.0002 max mem: 54684 +[08:27:08.849435] Epoch: [2] [450/3229] lr: 0.000015 grad_norm: 0.7973 (0.7579) closs: 1.0547 (1.0396) time: 4.0924 data: 0.0002 max mem: 54684 +[08:27:49.906838] Epoch: [2] [460/3229] lr: 0.000015 grad_norm: 0.7565 (0.7575) closs: 0.9997 (1.0391) time: 4.0978 data: 0.0002 max mem: 54684 +[08:28:30.907563] Epoch: [2] [470/3229] lr: 0.000015 grad_norm: 0.7846 (0.7587) closs: 1.0305 (1.0388) time: 4.1028 data: 0.0002 max mem: 54684 +[08:29:11.501494] Epoch: [2] [480/3229] lr: 0.000015 grad_norm: 0.7925 (0.7583) closs: 1.0507 (1.0382) time: 4.0797 data: 0.0002 max mem: 54684 +[08:29:52.450393] Epoch: [2] [490/3229] lr: 0.000014 grad_norm: 0.7658 (0.7582) closs: 1.0217 (1.0378) time: 4.0771 data: 0.0002 max mem: 54684 +[08:30:33.131092] Epoch: [2] [500/3229] lr: 0.000014 grad_norm: 0.7658 (0.7582) closs: 1.0195 (1.0377) time: 4.0814 data: 0.0002 max mem: 54684 +[08:31:13.731758] Epoch: [2] [510/3229] lr: 0.000014 grad_norm: 0.7490 (0.7574) closs: 1.0083 (1.0372) time: 4.0640 data: 0.0002 max mem: 54684 +[08:31:54.606631] Epoch: [2] [520/3229] lr: 0.000014 grad_norm: 0.7661 (0.7578) closs: 1.0242 (1.0374) time: 4.0737 data: 0.0002 max mem: 54684 +[08:32:35.652465] Epoch: [2] [530/3229] lr: 0.000014 grad_norm: 0.7661 (0.7574) closs: 1.0650 (1.0373) time: 4.0960 data: 0.0002 max mem: 54684 +[08:33:16.811929] Epoch: [2] [540/3229] lr: 0.000014 grad_norm: 0.7808 (0.7580) closs: 1.0536 (1.0379) time: 4.1102 data: 0.0002 max mem: 54684 +[08:33:58.098430] Epoch: [2] [550/3229] lr: 0.000014 grad_norm: 0.7890 (0.7584) closs: 1.0505 (1.0379) time: 4.1222 data: 0.0002 max mem: 54684 +[08:34:39.012102] Epoch: [2] [560/3229] lr: 0.000014 grad_norm: 0.7536 (0.7579) closs: 1.0541 (1.0380) time: 4.1099 data: 0.0002 max mem: 54684 +[08:35:20.411837] Epoch: [2] [570/3229] lr: 0.000014 grad_norm: 0.7575 (0.7583) closs: 1.0281 (1.0376) time: 4.1156 data: 0.0002 max mem: 54684 +[08:36:01.585232] Epoch: [2] [580/3229] lr: 0.000014 grad_norm: 0.7735 (0.7588) closs: 1.0281 (1.0379) time: 4.1286 data: 0.0002 max mem: 54684 +[08:36:42.866022] Epoch: [2] [590/3229] lr: 0.000014 grad_norm: 0.7978 (0.7597) closs: 1.0419 (1.0385) time: 4.1226 data: 0.0002 max mem: 54684 +[08:37:24.134897] Epoch: [2] [600/3229] lr: 0.000014 grad_norm: 0.7944 (0.7605) closs: 1.0606 (1.0390) time: 4.1274 data: 0.0002 max mem: 54684 +[08:38:05.564805] Epoch: [2] [610/3229] lr: 0.000014 grad_norm: 0.7800 (0.7610) closs: 1.0638 (1.0396) time: 4.1349 data: 0.0002 max mem: 54684 +[08:38:47.052757] Epoch: [2] [620/3229] lr: 0.000014 grad_norm: 0.7830 (0.7618) closs: 1.0701 (1.0400) time: 4.1458 data: 0.0002 max mem: 54684 +[08:39:28.070271] Epoch: [2] [630/3229] lr: 0.000014 grad_norm: 0.7867 (0.7620) closs: 1.0398 (1.0394) time: 4.1252 data: 0.0002 max mem: 54684 +[08:40:09.431657] Epoch: [2] [640/3229] lr: 0.000014 grad_norm: 0.7636 (0.7620) closs: 1.0375 (1.0393) time: 4.1189 data: 0.0002 max mem: 54684 +[08:40:50.029316] Epoch: [2] [650/3229] lr: 0.000013 grad_norm: 0.7721 (0.7620) closs: 1.0376 (1.0392) time: 4.0979 data: 0.0002 max mem: 54684 +[08:41:31.253698] Epoch: [2] [660/3229] lr: 0.000013 grad_norm: 0.8149 (0.7639) closs: 1.0376 (1.0390) time: 4.0910 data: 0.0002 max mem: 54684 +[08:42:11.674957] Epoch: [2] [670/3229] lr: 0.000013 grad_norm: 0.7910 (0.7640) closs: 0.9759 (1.0377) time: 4.0822 data: 0.0002 max mem: 54684 +[08:42:52.587904] Epoch: [2] [680/3229] lr: 0.000013 grad_norm: 0.8076 (0.7644) closs: 1.0142 (1.0376) time: 4.0666 data: 0.0002 max mem: 54684 +[08:43:33.204090] Epoch: [2] [690/3229] lr: 0.000013 grad_norm: 0.7571 (0.7643) closs: 1.0510 (1.0373) time: 4.0764 data: 0.0002 max mem: 54684 +[08:44:13.752622] Epoch: [2] [700/3229] lr: 0.000013 grad_norm: 0.7474 (0.7643) closs: 1.0141 (1.0369) time: 4.0582 data: 0.0002 max mem: 54684 +[08:44:54.180675] Epoch: [2] [710/3229] lr: 0.000013 grad_norm: 0.7605 (0.7641) closs: 1.0175 (1.0365) time: 4.0488 data: 0.0002 max mem: 54684 +[08:45:34.801694] Epoch: [2] [720/3229] lr: 0.000013 grad_norm: 0.7678 (0.7645) closs: 1.0230 (1.0360) time: 4.0524 data: 0.0002 max mem: 54684 +[08:46:15.397922] Epoch: [2] [730/3229] lr: 0.000013 grad_norm: 0.7292 (0.7638) closs: 1.0149 (1.0362) time: 4.0608 data: 0.0002 max mem: 54684 +[08:46:56.241797] Epoch: [2] [740/3229] lr: 0.000013 grad_norm: 0.7162 (0.7634) closs: 1.0468 (1.0362) time: 4.0719 data: 0.0002 max mem: 54684 +[08:47:37.401308] Epoch: [2] [750/3229] lr: 0.000013 grad_norm: 0.7125 (0.7625) closs: 1.0468 (1.0361) time: 4.1001 data: 0.0002 max mem: 54684 +[08:48:17.975003] Epoch: [2] [760/3229] lr: 0.000013 grad_norm: 0.7230 (0.7623) closs: 0.9952 (1.0357) time: 4.0866 data: 0.0002 max mem: 54684 +[08:48:58.599971] Epoch: [2] [770/3229] lr: 0.000013 grad_norm: 0.7391 (0.7618) closs: 0.9999 (1.0354) time: 4.0599 data: 0.0002 max mem: 54684 +[08:49:39.723248] Epoch: [2] [780/3229] lr: 0.000013 grad_norm: 0.7642 (0.7620) closs: 1.0226 (1.0358) time: 4.0873 data: 0.0002 max mem: 54684 +[08:50:20.513740] Epoch: [2] [790/3229] lr: 0.000013 grad_norm: 0.7684 (0.7622) closs: 1.0461 (1.0352) time: 4.0956 data: 0.0002 max mem: 54684 +[08:51:00.788007] Epoch: [2] [800/3229] lr: 0.000013 grad_norm: 0.7667 (0.7621) closs: 1.0416 (1.0349) time: 4.0532 data: 0.0002 max mem: 54684 +[08:51:42.084938] Epoch: [2] [810/3229] lr: 0.000012 grad_norm: 0.7514 (0.7623) closs: 1.0492 (1.0351) time: 4.0785 data: 0.0002 max mem: 54684 +[08:52:22.683852] Epoch: [2] [820/3229] lr: 0.000012 grad_norm: 0.7899 (0.7626) closs: 1.0387 (1.0346) time: 4.0947 data: 0.0002 max mem: 54684 +[08:53:03.136708] Epoch: [2] [830/3229] lr: 0.000012 grad_norm: 0.7319 (0.7623) closs: 1.0123 (1.0341) time: 4.0525 data: 0.0002 max mem: 54684 +[08:53:44.061472] Epoch: [2] [840/3229] lr: 0.000012 grad_norm: 0.7243 (0.7622) closs: 1.0273 (1.0344) time: 4.0688 data: 0.0002 max mem: 54684 +[08:54:24.982996] Epoch: [2] [850/3229] lr: 0.000012 grad_norm: 0.7494 (0.7625) closs: 1.0501 (1.0346) time: 4.0923 data: 0.0002 max mem: 54684 +[08:55:05.938568] Epoch: [2] [860/3229] lr: 0.000012 grad_norm: 0.7728 (0.7625) closs: 1.0500 (1.0344) time: 4.0938 data: 0.0002 max mem: 54684 +[08:55:47.280626] Epoch: [2] [870/3229] lr: 0.000012 grad_norm: 0.7828 (0.7630) closs: 1.0542 (1.0347) time: 4.1148 data: 0.0002 max mem: 54684 +[08:56:27.932374] Epoch: [2] [880/3229] lr: 0.000012 grad_norm: 0.7772 (0.7629) closs: 1.0480 (1.0344) time: 4.0996 data: 0.0002 max mem: 54684 +[08:57:09.157992] Epoch: [2] [890/3229] lr: 0.000012 grad_norm: 0.7710 (0.7629) closs: 1.0352 (1.0346) time: 4.0938 data: 0.0002 max mem: 54684 +[08:57:50.048167] Epoch: [2] [900/3229] lr: 0.000012 grad_norm: 0.7654 (0.7630) closs: 1.0552 (1.0348) time: 4.1057 data: 0.0002 max mem: 54684 +[08:58:31.441896] Epoch: [2] [910/3229] lr: 0.000012 grad_norm: 0.7748 (0.7631) closs: 1.0552 (1.0352) time: 4.1141 data: 0.0002 max mem: 54684 +[08:59:12.358818] Epoch: [2] [920/3229] lr: 0.000012 grad_norm: 0.7834 (0.7633) closs: 1.0587 (1.0353) time: 4.1155 data: 0.0002 max mem: 54684 +[08:59:53.384497] Epoch: [2] [930/3229] lr: 0.000012 grad_norm: 0.7600 (0.7639) closs: 1.0610 (1.0358) time: 4.0971 data: 0.0002 max mem: 54684 +[09:00:34.296386] Epoch: [2] [940/3229] lr: 0.000012 grad_norm: 0.7367 (0.7634) closs: 1.0361 (1.0355) time: 4.0968 data: 0.0002 max mem: 54684 +[09:01:15.300491] Epoch: [2] [950/3229] lr: 0.000012 grad_norm: 0.7532 (0.7637) closs: 1.0273 (1.0357) time: 4.0957 data: 0.0002 max mem: 54684 +[09:01:56.214466] Epoch: [2] [960/3229] lr: 0.000012 grad_norm: 0.8101 (0.7637) closs: 1.0385 (1.0358) time: 4.0958 data: 0.0002 max mem: 54684 +[09:02:37.461375] Epoch: [2] [970/3229] lr: 0.000012 grad_norm: 0.7784 (0.7640) closs: 1.0385 (1.0359) time: 4.1080 data: 0.0002 max mem: 54684 +[09:03:18.133192] Epoch: [2] [980/3229] lr: 0.000012 grad_norm: 0.7784 (0.7641) closs: 1.0226 (1.0356) time: 4.0959 data: 0.0002 max mem: 54684 +[09:03:58.519062] Epoch: [2] [990/3229] lr: 0.000011 grad_norm: 0.7619 (0.7637) closs: 1.0127 (1.0353) time: 4.0528 data: 0.0002 max mem: 54684 +[09:04:39.761660] Epoch: [2] [1000/3229] lr: 0.000011 grad_norm: 0.7440 (0.7638) closs: 1.0497 (1.0358) time: 4.0814 data: 0.0002 max mem: 54684 +[09:05:20.726355] Epoch: [2] [1010/3229] lr: 0.000011 grad_norm: 0.7440 (0.7637) closs: 1.0649 (1.0361) time: 4.1103 data: 0.0002 max mem: 54684 +[09:06:00.952110] Epoch: [2] [1020/3229] lr: 0.000011 grad_norm: 0.7606 (0.7640) closs: 1.0374 (1.0356) time: 4.0595 data: 0.0002 max mem: 54684 +[09:06:41.558460] Epoch: [2] [1030/3229] lr: 0.000011 grad_norm: 0.7384 (0.7636) closs: 1.0224 (1.0354) time: 4.0415 data: 0.0002 max mem: 54684 +[09:07:22.293637] Epoch: [2] [1040/3229] lr: 0.000011 grad_norm: 0.7438 (0.7637) closs: 1.0008 (1.0349) time: 4.0670 data: 0.0002 max mem: 54684 +[09:08:03.484133] Epoch: [2] [1050/3229] lr: 0.000011 grad_norm: 0.7482 (0.7634) closs: 1.0428 (1.0352) time: 4.0962 data: 0.0002 max mem: 54684 +[09:08:44.382701] Epoch: [2] [1060/3229] lr: 0.000011 grad_norm: 0.7485 (0.7636) closs: 1.0560 (1.0352) time: 4.1044 data: 0.0002 max mem: 54684 +[09:09:25.808204] Epoch: [2] [1070/3229] lr: 0.000011 grad_norm: 0.7534 (0.7633) closs: 1.0265 (1.0352) time: 4.1161 data: 0.0002 max mem: 54684 +[09:10:06.913588] Epoch: [2] [1080/3229] lr: 0.000011 grad_norm: 0.7569 (0.7634) closs: 1.0402 (1.0353) time: 4.1265 data: 0.0002 max mem: 54684 +[09:10:47.784050] Epoch: [2] [1090/3229] lr: 0.000011 grad_norm: 0.7794 (0.7635) closs: 1.0821 (1.0357) time: 4.0987 data: 0.0002 max mem: 54684 +[09:11:29.039121] Epoch: [2] [1100/3229] lr: 0.000011 grad_norm: 0.7814 (0.7636) closs: 1.0679 (1.0358) time: 4.1062 data: 0.0002 max mem: 54684 +[09:12:10.373961] Epoch: [2] [1110/3229] lr: 0.000011 grad_norm: 0.7643 (0.7639) closs: 1.0508 (1.0360) time: 4.1294 data: 0.0002 max mem: 54684 +[09:12:51.449473] Epoch: [2] [1120/3229] lr: 0.000011 grad_norm: 0.7562 (0.7638) closs: 1.0466 (1.0358) time: 4.1205 data: 0.0002 max mem: 54684 +[09:13:32.329462] Epoch: [2] [1130/3229] lr: 0.000011 grad_norm: 0.7943 (0.7646) closs: 1.0470 (1.0359) time: 4.0977 data: 0.0002 max mem: 54684 +[09:14:12.321437] Epoch: [2] [1140/3229] lr: 0.000011 grad_norm: 0.8189 (0.7647) closs: 1.0094 (1.0354) time: 4.0435 data: 0.0002 max mem: 54684 +[09:14:53.353944] Epoch: [2] [1150/3229] lr: 0.000011 grad_norm: 0.8029 (0.7651) closs: 0.9823 (1.0353) time: 4.0512 data: 0.0002 max mem: 54684 +[09:15:34.257366] Epoch: [2] [1160/3229] lr: 0.000011 grad_norm: 0.8163 (0.7653) closs: 1.0455 (1.0354) time: 4.0967 data: 0.0002 max mem: 54684 +[09:16:14.798544] Epoch: [2] [1170/3229] lr: 0.000011 grad_norm: 0.8128 (0.7652) closs: 1.0318 (1.0353) time: 4.0722 data: 0.0002 max mem: 54684 +[09:16:55.872961] Epoch: [2] [1180/3229] lr: 0.000010 grad_norm: 0.7566 (0.7651) closs: 1.0318 (1.0353) time: 4.0807 data: 0.0002 max mem: 54684 +[09:17:36.505833] Epoch: [2] [1190/3229] lr: 0.000010 grad_norm: 0.7433 (0.7650) closs: 1.0522 (1.0355) time: 4.0853 data: 0.0002 max mem: 54684 +[09:18:17.759241] Epoch: [2] [1200/3229] lr: 0.000010 grad_norm: 0.7621 (0.7651) closs: 1.0598 (1.0356) time: 4.0942 data: 0.0002 max mem: 54684 +[09:18:59.258398] Epoch: [2] [1210/3229] lr: 0.000010 grad_norm: 0.7694 (0.7652) closs: 1.0676 (1.0360) time: 4.1376 data: 0.0002 max mem: 54684 +[09:19:40.458187] Epoch: [2] [1220/3229] lr: 0.000010 grad_norm: 0.7466 (0.7650) closs: 1.0545 (1.0359) time: 4.1349 data: 0.0002 max mem: 54684 +[09:20:21.687691] Epoch: [2] [1230/3229] lr: 0.000010 grad_norm: 0.7782 (0.7653) closs: 1.0469 (1.0362) time: 4.1214 data: 0.0002 max mem: 54684 +[09:21:02.306360] Epoch: [2] [1240/3229] lr: 0.000010 grad_norm: 0.7785 (0.7651) closs: 1.0469 (1.0361) time: 4.0923 data: 0.0002 max mem: 54684 +[09:21:43.494418] Epoch: [2] [1250/3229] lr: 0.000010 grad_norm: 0.7669 (0.7654) closs: 1.0804 (1.0366) time: 4.0903 data: 0.0002 max mem: 54684 +[09:22:24.223450] Epoch: [2] [1260/3229] lr: 0.000010 grad_norm: 0.7949 (0.7653) closs: 1.0789 (1.0365) time: 4.0958 data: 0.0002 max mem: 54684 +[09:23:05.137178] Epoch: [2] [1270/3229] lr: 0.000010 grad_norm: 0.7839 (0.7655) closs: 1.0398 (1.0366) time: 4.0821 data: 0.0002 max mem: 54684 +[09:23:45.713305] Epoch: [2] [1280/3229] lr: 0.000010 grad_norm: 0.7820 (0.7654) closs: 1.0345 (1.0364) time: 4.0744 data: 0.0002 max mem: 54684 +[09:24:26.883899] Epoch: [2] [1290/3229] lr: 0.000010 grad_norm: 0.7848 (0.7655) closs: 1.0622 (1.0368) time: 4.0873 data: 0.0002 max mem: 54684 +[09:25:07.882033] Epoch: [2] [1300/3229] lr: 0.000010 grad_norm: 0.7626 (0.7653) closs: 1.0601 (1.0370) time: 4.1084 data: 0.0002 max mem: 54684 +[09:25:48.788547] Epoch: [2] [1310/3229] lr: 0.000010 grad_norm: 0.7626 (0.7655) closs: 1.0317 (1.0368) time: 4.0952 data: 0.0002 max mem: 54684 +[09:26:29.719803] Epoch: [2] [1320/3229] lr: 0.000010 grad_norm: 0.7593 (0.7652) closs: 1.0151 (1.0369) time: 4.0918 data: 0.0002 max mem: 54684 +[09:27:10.299624] Epoch: [2] [1330/3229] lr: 0.000010 grad_norm: 0.7602 (0.7652) closs: 1.0215 (1.0365) time: 4.0755 data: 0.0002 max mem: 54684 +[09:27:50.751310] Epoch: [2] [1340/3229] lr: 0.000010 grad_norm: 0.7643 (0.7651) closs: 0.9875 (1.0362) time: 4.0515 data: 0.0002 max mem: 54684 +[09:28:31.398595] Epoch: [2] [1350/3229] lr: 0.000010 grad_norm: 0.7591 (0.7650) closs: 0.9875 (1.0358) time: 4.0549 data: 0.0002 max mem: 54684 +[09:29:12.305782] Epoch: [2] [1360/3229] lr: 0.000010 grad_norm: 0.7741 (0.7651) closs: 1.0085 (1.0361) time: 4.0777 data: 0.0001 max mem: 54684 +[09:29:53.571306] Epoch: [2] [1370/3229] lr: 0.000010 grad_norm: 0.7792 (0.7653) closs: 1.0583 (1.0362) time: 4.1086 data: 0.0002 max mem: 54684 +[09:30:34.513851] Epoch: [2] [1380/3229] lr: 0.000009 grad_norm: 0.8100 (0.7656) closs: 1.0583 (1.0366) time: 4.1103 data: 0.0001 max mem: 54684 +[09:31:15.446793] Epoch: [2] [1390/3229] lr: 0.000009 grad_norm: 0.8007 (0.7656) closs: 1.0600 (1.0367) time: 4.0937 data: 0.0001 max mem: 54684 +[09:31:56.718293] Epoch: [2] [1400/3229] lr: 0.000009 grad_norm: 0.7785 (0.7658) closs: 1.0538 (1.0368) time: 4.1101 data: 0.0001 max mem: 54684 +[09:32:37.066477] Epoch: [2] [1410/3229] lr: 0.000009 grad_norm: 0.7777 (0.7654) closs: 1.0459 (1.0365) time: 4.0809 data: 0.0002 max mem: 54684 +[09:33:18.072485] Epoch: [2] [1420/3229] lr: 0.000009 grad_norm: 0.7777 (0.7656) closs: 1.0269 (1.0366) time: 4.0676 data: 0.0001 max mem: 54684 +[09:33:58.688418] Epoch: [2] [1430/3229] lr: 0.000009 grad_norm: 0.7950 (0.7658) closs: 1.0548 (1.0367) time: 4.0810 data: 0.0002 max mem: 54684 +[09:34:40.028938] Epoch: [2] [1440/3229] lr: 0.000009 grad_norm: 0.7547 (0.7655) closs: 1.0359 (1.0365) time: 4.0977 data: 0.0002 max mem: 54684 +[09:35:20.652994] Epoch: [2] [1450/3229] lr: 0.000009 grad_norm: 0.7292 (0.7654) closs: 1.0334 (1.0363) time: 4.0982 data: 0.0001 max mem: 54684 +[09:36:00.716699] Epoch: [2] [1460/3229] lr: 0.000009 grad_norm: 0.7434 (0.7651) closs: 0.9868 (1.0360) time: 4.0343 data: 0.0002 max mem: 54684 +[09:36:41.682870] Epoch: [2] [1470/3229] lr: 0.000009 grad_norm: 0.7452 (0.7652) closs: 0.9904 (1.0359) time: 4.0514 data: 0.0001 max mem: 54684 +[09:37:22.665178] Epoch: [2] [1480/3229] lr: 0.000009 grad_norm: 0.7568 (0.7653) closs: 1.0227 (1.0358) time: 4.0974 data: 0.0001 max mem: 54684 +[09:38:03.979174] Epoch: [2] [1490/3229] lr: 0.000009 grad_norm: 0.7775 (0.7653) closs: 1.0474 (1.0360) time: 4.1147 data: 0.0002 max mem: 54684 +[09:38:45.030747] Epoch: [2] [1500/3229] lr: 0.000009 grad_norm: 0.7560 (0.7653) closs: 1.0474 (1.0360) time: 4.1182 data: 0.0002 max mem: 54684 +[09:39:25.831863] Epoch: [2] [1510/3229] lr: 0.000009 grad_norm: 0.7644 (0.7652) closs: 1.0295 (1.0358) time: 4.0926 data: 0.0001 max mem: 54684 +[09:40:06.337863] Epoch: [2] [1520/3229] lr: 0.000009 grad_norm: 0.7842 (0.7653) closs: 1.0512 (1.0360) time: 4.0653 data: 0.0001 max mem: 54684 +[09:40:46.718752] Epoch: [2] [1530/3229] lr: 0.000009 grad_norm: 0.7452 (0.7650) closs: 1.0411 (1.0359) time: 4.0443 data: 0.0002 max mem: 54684 +[09:41:27.479284] Epoch: [2] [1540/3229] lr: 0.000009 grad_norm: 0.7451 (0.7647) closs: 1.0411 (1.0359) time: 4.0570 data: 0.0002 max mem: 54684 +[09:42:08.444014] Epoch: [2] [1550/3229] lr: 0.000009 grad_norm: 0.7502 (0.7648) closs: 1.0511 (1.0361) time: 4.0862 data: 0.0002 max mem: 54684 +[09:42:49.492626] Epoch: [2] [1560/3229] lr: 0.000009 grad_norm: 0.7626 (0.7648) closs: 1.0354 (1.0359) time: 4.1006 data: 0.0002 max mem: 54684 +[09:43:30.218498] Epoch: [2] [1570/3229] lr: 0.000009 grad_norm: 0.7822 (0.7649) closs: 1.0210 (1.0362) time: 4.0886 data: 0.0002 max mem: 54684 +[09:44:11.014656] Epoch: [2] [1580/3229] lr: 0.000009 grad_norm: 0.7599 (0.7648) closs: 1.0582 (1.0360) time: 4.0760 data: 0.0001 max mem: 54684 +[09:44:52.430295] Epoch: [2] [1590/3229] lr: 0.000009 grad_norm: 0.7556 (0.7649) closs: 1.0126 (1.0362) time: 4.1105 data: 0.0001 max mem: 54684 +[09:45:32.708025] Epoch: [2] [1600/3229] lr: 0.000009 grad_norm: 0.7689 (0.7650) closs: 1.0061 (1.0360) time: 4.0846 data: 0.0002 max mem: 54684 +[09:46:13.311218] Epoch: [2] [1610/3229] lr: 0.000008 grad_norm: 0.7788 (0.7652) closs: 1.0035 (1.0361) time: 4.0440 data: 0.0002 max mem: 54684 +[09:46:55.090230] Epoch: [2] [1620/3229] lr: 0.000008 grad_norm: 0.7534 (0.7651) closs: 1.0395 (1.0361) time: 4.1190 data: 0.0002 max mem: 54684 +[09:47:36.090209] Epoch: [2] [1630/3229] lr: 0.000008 grad_norm: 0.7519 (0.7651) closs: 1.0562 (1.0361) time: 4.1389 data: 0.0002 max mem: 54684 +[09:48:17.317549] Epoch: [2] [1640/3229] lr: 0.000008 grad_norm: 0.7637 (0.7651) closs: 1.0442 (1.0362) time: 4.1113 data: 0.0002 max mem: 54684 +[09:48:58.147416] Epoch: [2] [1650/3229] lr: 0.000008 grad_norm: 0.7460 (0.7650) closs: 1.0442 (1.0361) time: 4.1028 data: 0.0002 max mem: 54684 +[09:49:38.823915] Epoch: [2] [1660/3229] lr: 0.000008 grad_norm: 0.7707 (0.7653) closs: 0.9922 (1.0360) time: 4.0752 data: 0.0002 max mem: 54684 +[09:50:20.328398] Epoch: [2] [1670/3229] lr: 0.000008 grad_norm: 0.7983 (0.7656) closs: 1.0671 (1.0362) time: 4.1090 data: 0.0002 max mem: 54684 +[09:51:01.223284] Epoch: [2] [1680/3229] lr: 0.000008 grad_norm: 0.8044 (0.7656) closs: 1.0319 (1.0360) time: 4.1199 data: 0.0001 max mem: 54684 +[09:51:41.770661] Epoch: [2] [1690/3229] lr: 0.000008 grad_norm: 0.7965 (0.7656) closs: 1.0218 (1.0362) time: 4.0720 data: 0.0002 max mem: 54684 +[09:52:23.631344] Epoch: [2] [1700/3229] lr: 0.000008 grad_norm: 0.7809 (0.7658) closs: 1.0307 (1.0362) time: 4.1203 data: 0.0002 max mem: 54684 +[09:53:03.971107] Epoch: [2] [1710/3229] lr: 0.000008 grad_norm: 0.7531 (0.7655) closs: 1.0542 (1.0362) time: 4.1099 data: 0.0001 max mem: 54684 +[09:53:44.185439] Epoch: [2] [1720/3229] lr: 0.000008 grad_norm: 0.7503 (0.7655) closs: 1.0655 (1.0362) time: 4.0276 data: 0.0002 max mem: 54684 +[09:54:25.528421] Epoch: [2] [1730/3229] lr: 0.000008 grad_norm: 0.7650 (0.7655) closs: 1.0793 (1.0364) time: 4.0778 data: 0.0002 max mem: 54684 +[09:55:06.203734] Epoch: [2] [1740/3229] lr: 0.000008 grad_norm: 0.7650 (0.7657) closs: 1.0568 (1.0362) time: 4.1008 data: 0.0002 max mem: 54684 +[09:55:46.816613] Epoch: [2] [1750/3229] lr: 0.000008 grad_norm: 0.7542 (0.7655) closs: 1.0362 (1.0364) time: 4.0644 data: 0.0002 max mem: 54684 +[09:56:27.712286] Epoch: [2] [1760/3229] lr: 0.000008 grad_norm: 0.7409 (0.7656) closs: 1.0798 (1.0365) time: 4.0754 data: 0.0002 max mem: 54684 +[09:57:09.011822] Epoch: [2] [1770/3229] lr: 0.000008 grad_norm: 0.7625 (0.7657) closs: 1.0369 (1.0366) time: 4.1097 data: 0.0002 max mem: 54684 +[09:57:50.358048] Epoch: [2] [1780/3229] lr: 0.000008 grad_norm: 0.7780 (0.7660) closs: 1.0474 (1.0366) time: 4.1322 data: 0.0002 max mem: 54684 +[09:58:30.935480] Epoch: [2] [1790/3229] lr: 0.000008 grad_norm: 0.7607 (0.7658) closs: 1.0474 (1.0366) time: 4.0961 data: 0.0002 max mem: 54684 +[09:59:12.191909] Epoch: [2] [1800/3229] lr: 0.000008 grad_norm: 0.7704 (0.7660) closs: 1.0390 (1.0366) time: 4.0916 data: 0.0002 max mem: 54684 +[09:59:53.188925] Epoch: [2] [1810/3229] lr: 0.000008 grad_norm: 0.7880 (0.7661) closs: 1.0390 (1.0365) time: 4.1126 data: 0.0002 max mem: 54684 +[10:00:34.288625] Epoch: [2] [1820/3229] lr: 0.000008 grad_norm: 0.8177 (0.7663) closs: 1.0470 (1.0366) time: 4.1048 data: 0.0002 max mem: 54684 +[10:01:15.216189] Epoch: [2] [1830/3229] lr: 0.000008 grad_norm: 0.7464 (0.7662) closs: 1.0408 (1.0365) time: 4.1013 data: 0.0002 max mem: 54684 +[10:01:56.484523] Epoch: [2] [1840/3229] lr: 0.000008 grad_norm: 0.7634 (0.7664) closs: 1.0408 (1.0367) time: 4.1097 data: 0.0002 max mem: 54684 +[10:02:36.820581] Epoch: [2] [1850/3229] lr: 0.000008 grad_norm: 0.7761 (0.7663) closs: 1.0366 (1.0366) time: 4.0802 data: 0.0002 max mem: 54684 +[10:03:17.820103] Epoch: [2] [1860/3229] lr: 0.000007 grad_norm: 0.7713 (0.7665) closs: 1.0248 (1.0365) time: 4.0667 data: 0.0002 max mem: 54684 +[10:03:58.463854] Epoch: [2] [1870/3229] lr: 0.000007 grad_norm: 0.7713 (0.7666) closs: 1.0134 (1.0362) time: 4.0821 data: 0.0002 max mem: 54684 +[10:04:39.335047] Epoch: [2] [1880/3229] lr: 0.000007 grad_norm: 0.7584 (0.7665) closs: 1.0266 (1.0362) time: 4.0757 data: 0.0002 max mem: 54684 +[10:05:20.773148] Epoch: [2] [1890/3229] lr: 0.000007 grad_norm: 0.7505 (0.7666) closs: 1.0527 (1.0364) time: 4.1154 data: 0.0002 max mem: 54684 +[10:06:01.774211] Epoch: [2] [1900/3229] lr: 0.000007 grad_norm: 0.7830 (0.7667) closs: 1.0807 (1.0365) time: 4.1219 data: 0.0002 max mem: 54684 +[10:06:43.015666] Epoch: [2] [1910/3229] lr: 0.000007 grad_norm: 0.7959 (0.7668) closs: 1.0580 (1.0366) time: 4.1121 data: 0.0002 max mem: 54684 +[10:07:22.898606] Epoch: [2] [1920/3229] lr: 0.000007 grad_norm: 0.7359 (0.7665) closs: 1.0269 (1.0364) time: 4.0562 data: 0.0002 max mem: 54684 +[10:08:03.733126] Epoch: [2] [1930/3229] lr: 0.000007 grad_norm: 0.7359 (0.7667) closs: 1.0022 (1.0362) time: 4.0358 data: 0.0002 max mem: 54684 +[10:08:44.009814] Epoch: [2] [1940/3229] lr: 0.000007 grad_norm: 0.7834 (0.7665) closs: 1.0467 (1.0361) time: 4.0555 data: 0.0002 max mem: 54684 +[10:09:24.934480] Epoch: [2] [1950/3229] lr: 0.000007 grad_norm: 0.7408 (0.7665) closs: 1.0467 (1.0360) time: 4.0600 data: 0.0002 max mem: 54684 +[10:10:05.513930] Epoch: [2] [1960/3229] lr: 0.000007 grad_norm: 0.7564 (0.7664) closs: 1.0360 (1.0360) time: 4.0751 data: 0.0002 max mem: 54684 +[10:10:47.318139] Epoch: [2] [1970/3229] lr: 0.000007 grad_norm: 0.7470 (0.7664) closs: 1.0260 (1.0361) time: 4.1191 data: 0.0002 max mem: 54684 +[10:11:28.590511] Epoch: [2] [1980/3229] lr: 0.000007 grad_norm: 0.7601 (0.7667) closs: 1.0797 (1.0363) time: 4.1538 data: 0.0002 max mem: 54684 +[10:12:09.496980] Epoch: [2] [1990/3229] lr: 0.000007 grad_norm: 0.7790 (0.7667) closs: 1.0679 (1.0363) time: 4.1089 data: 0.0002 max mem: 54684 +[10:12:50.748606] Epoch: [2] [2000/3229] lr: 0.000007 grad_norm: 0.7970 (0.7670) closs: 1.0297 (1.0364) time: 4.1078 data: 0.0002 max mem: 54684 +[10:13:32.182612] Epoch: [2] [2010/3229] lr: 0.000007 grad_norm: 0.8054 (0.7671) closs: 1.0496 (1.0365) time: 4.1342 data: 0.0002 max mem: 54684 +[10:14:13.410630] Epoch: [2] [2020/3229] lr: 0.000007 grad_norm: 0.7780 (0.7672) closs: 1.0542 (1.0367) time: 4.1330 data: 0.0002 max mem: 54684 +[10:14:54.361523] Epoch: [2] [2030/3229] lr: 0.000007 grad_norm: 0.8040 (0.7677) closs: 1.0542 (1.0368) time: 4.1089 data: 0.0002 max mem: 54684 +[10:15:35.931648] Epoch: [2] [2040/3229] lr: 0.000007 grad_norm: 0.8409 (0.7681) closs: 1.0486 (1.0369) time: 4.1260 data: 0.0002 max mem: 54684 +[10:16:16.746598] Epoch: [2] [2050/3229] lr: 0.000007 grad_norm: 0.7987 (0.7682) closs: 1.0486 (1.0368) time: 4.1192 data: 0.0002 max mem: 54684 +[10:16:58.012617] Epoch: [2] [2060/3229] lr: 0.000007 grad_norm: 0.8159 (0.7685) closs: 1.0473 (1.0369) time: 4.1040 data: 0.0002 max mem: 54684 +[10:17:38.676913] Epoch: [2] [2070/3229] lr: 0.000007 grad_norm: 0.7728 (0.7682) closs: 1.0073 (1.0366) time: 4.0965 data: 0.0002 max mem: 54684 +[10:18:20.035619] Epoch: [2] [2080/3229] lr: 0.000007 grad_norm: 0.7435 (0.7684) closs: 1.0386 (1.0368) time: 4.1011 data: 0.0002 max mem: 54684 +[10:19:01.109902] Epoch: [2] [2090/3229] lr: 0.000007 grad_norm: 0.7514 (0.7683) closs: 1.0462 (1.0367) time: 4.1216 data: 0.0002 max mem: 54684 +[10:19:42.145840] Epoch: [2] [2100/3229] lr: 0.000007 grad_norm: 0.7514 (0.7683) closs: 1.0461 (1.0368) time: 4.1054 data: 0.0002 max mem: 54684 +[10:20:21.912487] Epoch: [2] [2110/3229] lr: 0.000007 grad_norm: 0.7724 (0.7682) closs: 1.0394 (1.0367) time: 4.0401 data: 0.0002 max mem: 54684 +[10:21:02.609058] Epoch: [2] [2120/3229] lr: 0.000007 grad_norm: 0.7465 (0.7681) closs: 1.0394 (1.0367) time: 4.0231 data: 0.0002 max mem: 54684 +[10:21:43.356036] Epoch: [2] [2130/3229] lr: 0.000007 grad_norm: 0.7602 (0.7681) closs: 1.0324 (1.0366) time: 4.0721 data: 0.0002 max mem: 54684 +[10:22:24.153129] Epoch: [2] [2140/3229] lr: 0.000007 grad_norm: 0.7624 (0.7679) closs: 1.0118 (1.0365) time: 4.0771 data: 0.0002 max mem: 54684 +[10:23:04.987144] Epoch: [2] [2150/3229] lr: 0.000007 grad_norm: 0.7632 (0.7680) closs: 1.0690 (1.0368) time: 4.0815 data: 0.0002 max mem: 54684 +[10:23:46.301006] Epoch: [2] [2160/3229] lr: 0.000007 grad_norm: 0.7924 (0.7682) closs: 1.0757 (1.0370) time: 4.1073 data: 0.0002 max mem: 54684 +[10:24:27.430902] Epoch: [2] [2170/3229] lr: 0.000007 grad_norm: 0.7768 (0.7682) closs: 1.0597 (1.0370) time: 4.1221 data: 0.0002 max mem: 54684 +[10:25:08.536310] Epoch: [2] [2180/3229] lr: 0.000006 grad_norm: 0.7764 (0.7683) closs: 1.0400 (1.0370) time: 4.1117 data: 0.0002 max mem: 54684 +[10:25:49.054789] Epoch: [2] [2190/3229] lr: 0.000006 grad_norm: 0.7764 (0.7683) closs: 1.0429 (1.0370) time: 4.0811 data: 0.0002 max mem: 54684 +[10:26:28.974982] Epoch: [2] [2200/3229] lr: 0.000006 grad_norm: 0.7571 (0.7680) closs: 0.9973 (1.0367) time: 4.0219 data: 0.0002 max mem: 54684 +[10:27:10.250973] Epoch: [2] [2210/3229] lr: 0.000006 grad_norm: 0.7298 (0.7679) closs: 1.0033 (1.0367) time: 4.0597 data: 0.0002 max mem: 54684 +[10:27:51.496280] Epoch: [2] [2220/3229] lr: 0.000006 grad_norm: 0.7623 (0.7681) closs: 1.0618 (1.0369) time: 4.1260 data: 0.0002 max mem: 54684 +[10:28:32.694107] Epoch: [2] [2230/3229] lr: 0.000006 grad_norm: 0.8018 (0.7682) closs: 1.1010 (1.0371) time: 4.1221 data: 0.0002 max mem: 54684 +[10:29:13.690626] Epoch: [2] [2240/3229] lr: 0.000006 grad_norm: 0.7677 (0.7682) closs: 1.0752 (1.0372) time: 4.1097 data: 0.0002 max mem: 54684 +[10:29:55.103277] Epoch: [2] [2250/3229] lr: 0.000006 grad_norm: 0.7532 (0.7683) closs: 1.1006 (1.0373) time: 4.1204 data: 0.0002 max mem: 54684 +[10:30:36.017463] Epoch: [2] [2260/3229] lr: 0.000006 grad_norm: 0.7805 (0.7683) closs: 1.0913 (1.0374) time: 4.1163 data: 0.0002 max mem: 54684 +[10:31:17.232848] Epoch: [2] [2270/3229] lr: 0.000006 grad_norm: 0.7978 (0.7685) closs: 1.0719 (1.0376) time: 4.1064 data: 0.0002 max mem: 54684 +[10:31:57.554662] Epoch: [2] [2280/3229] lr: 0.000006 grad_norm: 0.7693 (0.7683) closs: 1.0401 (1.0375) time: 4.0768 data: 0.0002 max mem: 54684 +[10:32:38.982581] Epoch: [2] [2290/3229] lr: 0.000006 grad_norm: 0.7707 (0.7684) closs: 1.0389 (1.0375) time: 4.0874 data: 0.0002 max mem: 54684 +[10:33:19.900400] Epoch: [2] [2300/3229] lr: 0.000006 grad_norm: 0.7707 (0.7684) closs: 1.0546 (1.0376) time: 4.1172 data: 0.0002 max mem: 54684 +[10:34:00.422956] Epoch: [2] [2310/3229] lr: 0.000006 grad_norm: 0.7404 (0.7683) closs: 1.0432 (1.0375) time: 4.0720 data: 0.0002 max mem: 54684 +[10:34:41.838857] Epoch: [2] [2320/3229] lr: 0.000006 grad_norm: 0.7949 (0.7686) closs: 1.0168 (1.0374) time: 4.0969 data: 0.0002 max mem: 54684 +[10:35:22.205090] Epoch: [2] [2330/3229] lr: 0.000006 grad_norm: 0.7612 (0.7684) closs: 1.0087 (1.0372) time: 4.0890 data: 0.0002 max mem: 54684 +[10:36:03.158561] Epoch: [2] [2340/3229] lr: 0.000006 grad_norm: 0.7306 (0.7685) closs: 1.0268 (1.0373) time: 4.0659 data: 0.0002 max mem: 54684 +[10:36:44.005255] Epoch: [2] [2350/3229] lr: 0.000006 grad_norm: 0.7974 (0.7686) closs: 1.0339 (1.0373) time: 4.0899 data: 0.0002 max mem: 54684 +[10:37:24.060589] Epoch: [2] [2360/3229] lr: 0.000006 grad_norm: 0.7845 (0.7683) closs: 0.9994 (1.0371) time: 4.0450 data: 0.0002 max mem: 54684 +[10:38:04.953222] Epoch: [2] [2370/3229] lr: 0.000006 grad_norm: 0.7452 (0.7684) closs: 0.9928 (1.0371) time: 4.0473 data: 0.0002 max mem: 54684 +[10:38:45.575555] Epoch: [2] [2380/3229] lr: 0.000006 grad_norm: 0.7639 (0.7682) closs: 1.0037 (1.0371) time: 4.0757 data: 0.0002 max mem: 54684 +[10:39:26.779560] Epoch: [2] [2390/3229] lr: 0.000006 grad_norm: 0.7854 (0.7685) closs: 1.0731 (1.0372) time: 4.0913 data: 0.0002 max mem: 54684 +[10:40:07.277159] Epoch: [2] [2400/3229] lr: 0.000006 grad_norm: 0.7837 (0.7683) closs: 1.0746 (1.0371) time: 4.0850 data: 0.0002 max mem: 54684 +[10:40:47.656246] Epoch: [2] [2410/3229] lr: 0.000006 grad_norm: 0.7215 (0.7681) closs: 1.0545 (1.0370) time: 4.0438 data: 0.0002 max mem: 54684 +[10:41:28.261216] Epoch: [2] [2420/3229] lr: 0.000006 grad_norm: 0.7847 (0.7682) closs: 1.0446 (1.0370) time: 4.0491 data: 0.0002 max mem: 54684 +[10:42:09.186454] Epoch: [2] [2430/3229] lr: 0.000006 grad_norm: 0.7847 (0.7682) closs: 1.0480 (1.0370) time: 4.0764 data: 0.0002 max mem: 54684 +[10:42:50.581684] Epoch: [2] [2440/3229] lr: 0.000006 grad_norm: 0.7911 (0.7684) closs: 1.0551 (1.0370) time: 4.1160 data: 0.0002 max mem: 54684 +[10:43:31.660191] Epoch: [2] [2450/3229] lr: 0.000006 grad_norm: 0.7996 (0.7685) closs: 1.0322 (1.0370) time: 4.1236 data: 0.0002 max mem: 54684 +[10:44:12.906840] Epoch: [2] [2460/3229] lr: 0.000006 grad_norm: 0.7953 (0.7685) closs: 1.0322 (1.0370) time: 4.1162 data: 0.0002 max mem: 54684 +[10:44:54.081525] Epoch: [2] [2470/3229] lr: 0.000006 grad_norm: 0.7730 (0.7686) closs: 1.0670 (1.0371) time: 4.1210 data: 0.0002 max mem: 54684 +[10:45:35.195478] Epoch: [2] [2480/3229] lr: 0.000006 grad_norm: 0.7885 (0.7687) closs: 1.0536 (1.0371) time: 4.1144 data: 0.0002 max mem: 54684 +[10:46:16.528666] Epoch: [2] [2490/3229] lr: 0.000006 grad_norm: 0.7880 (0.7687) closs: 1.0536 (1.0373) time: 4.1223 data: 0.0002 max mem: 54684 +[10:46:56.834636] Epoch: [2] [2500/3229] lr: 0.000006 grad_norm: 0.7751 (0.7687) closs: 1.0654 (1.0372) time: 4.0819 data: 0.0002 max mem: 54684 +[10:47:38.101086] Epoch: [2] [2510/3229] lr: 0.000006 grad_norm: 0.7705 (0.7688) closs: 1.0349 (1.0373) time: 4.0786 data: 0.0002 max mem: 54684 +[10:48:18.776789] Epoch: [2] [2520/3229] lr: 0.000006 grad_norm: 0.7878 (0.7688) closs: 1.0416 (1.0373) time: 4.0970 data: 0.0002 max mem: 54684 +[10:48:59.471432] Epoch: [2] [2530/3229] lr: 0.000006 grad_norm: 0.7899 (0.7688) closs: 1.0064 (1.0373) time: 4.0685 data: 0.0002 max mem: 54684 +[10:49:40.706118] Epoch: [2] [2540/3229] lr: 0.000006 grad_norm: 0.7632 (0.7690) closs: 0.9999 (1.0372) time: 4.0964 data: 0.0002 max mem: 54684 +[10:50:21.879580] Epoch: [2] [2550/3229] lr: 0.000006 grad_norm: 0.8102 (0.7692) closs: 1.0065 (1.0372) time: 4.1203 data: 0.0002 max mem: 54684 +[10:51:03.280777] Epoch: [2] [2560/3229] lr: 0.000006 grad_norm: 0.7923 (0.7692) closs: 1.0337 (1.0373) time: 4.1287 data: 0.0002 max mem: 54684 +[10:51:44.513039] Epoch: [2] [2570/3229] lr: 0.000006 grad_norm: 0.7990 (0.7697) closs: 1.0381 (1.0373) time: 4.1316 data: 0.0002 max mem: 54684 +[10:52:25.106616] Epoch: [2] [2580/3229] lr: 0.000006 grad_norm: 0.8055 (0.7697) closs: 1.0194 (1.0372) time: 4.0912 data: 0.0002 max mem: 54684 +[10:53:05.456512] Epoch: [2] [2590/3229] lr: 0.000006 grad_norm: 0.7706 (0.7696) closs: 1.0125 (1.0371) time: 4.0471 data: 0.0002 max mem: 54684 +[10:53:46.140129] Epoch: [2] [2600/3229] lr: 0.000006 grad_norm: 0.7412 (0.7694) closs: 1.0323 (1.0371) time: 4.0516 data: 0.0002 max mem: 54684 +[10:54:27.649177] Epoch: [2] [2610/3229] lr: 0.000006 grad_norm: 0.7625 (0.7696) closs: 1.0566 (1.0371) time: 4.1096 data: 0.0002 max mem: 54684 +[10:55:08.432178] Epoch: [2] [2620/3229] lr: 0.000006 grad_norm: 0.8024 (0.7697) closs: 1.0176 (1.0371) time: 4.1145 data: 0.0002 max mem: 54684 +[10:55:49.037158] Epoch: [2] [2630/3229] lr: 0.000005 grad_norm: 0.7726 (0.7697) closs: 1.0176 (1.0369) time: 4.0693 data: 0.0002 max mem: 54684 +[10:56:30.399477] Epoch: [2] [2640/3229] lr: 0.000005 grad_norm: 0.7389 (0.7696) closs: 1.0279 (1.0371) time: 4.0983 data: 0.0002 max mem: 54684 +[10:57:11.914088] Epoch: [2] [2650/3229] lr: 0.000005 grad_norm: 0.7500 (0.7697) closs: 1.0760 (1.0372) time: 4.1438 data: 0.0002 max mem: 54684 +[10:57:53.051298] Epoch: [2] [2660/3229] lr: 0.000005 grad_norm: 0.7937 (0.7698) closs: 1.0704 (1.0373) time: 4.1325 data: 0.0002 max mem: 54684 +[10:58:34.322783] Epoch: [2] [2670/3229] lr: 0.000005 grad_norm: 0.8010 (0.7699) closs: 1.0704 (1.0374) time: 4.1204 data: 0.0002 max mem: 54684 +[10:59:15.014004] Epoch: [2] [2680/3229] lr: 0.000005 grad_norm: 0.8010 (0.7699) closs: 1.0494 (1.0374) time: 4.0981 data: 0.0002 max mem: 54684 +[10:59:55.439695] Epoch: [2] [2690/3229] lr: 0.000005 grad_norm: 0.7508 (0.7698) closs: 1.0343 (1.0373) time: 4.0558 data: 0.0002 max mem: 54684 +[11:00:36.264761] Epoch: [2] [2700/3229] lr: 0.000005 grad_norm: 0.7522 (0.7699) closs: 1.0379 (1.0374) time: 4.0625 data: 0.0002 max mem: 54684 +[11:01:17.279401] Epoch: [2] [2710/3229] lr: 0.000005 grad_norm: 0.7584 (0.7698) closs: 1.0813 (1.0375) time: 4.0919 data: 0.0002 max mem: 54684 +[11:01:58.168258] Epoch: [2] [2720/3229] lr: 0.000005 grad_norm: 0.7584 (0.7697) closs: 1.0343 (1.0375) time: 4.0951 data: 0.0002 max mem: 54684 +[11:02:39.456366] Epoch: [2] [2730/3229] lr: 0.000005 grad_norm: 0.7529 (0.7697) closs: 1.0343 (1.0375) time: 4.1088 data: 0.0002 max mem: 54684 +[11:03:20.315865] Epoch: [2] [2740/3229] lr: 0.000005 grad_norm: 0.7555 (0.7697) closs: 1.0462 (1.0375) time: 4.1073 data: 0.0002 max mem: 54684 +[11:04:02.063978] Epoch: [2] [2750/3229] lr: 0.000005 grad_norm: 0.7999 (0.7700) closs: 1.0785 (1.0377) time: 4.1303 data: 0.0002 max mem: 54684 +[11:04:42.801143] Epoch: [2] [2760/3229] lr: 0.000005 grad_norm: 0.7942 (0.7699) closs: 1.0842 (1.0378) time: 4.1242 data: 0.0002 max mem: 54684 +[11:05:23.770788] Epoch: [2] [2770/3229] lr: 0.000005 grad_norm: 0.7815 (0.7700) closs: 1.0259 (1.0377) time: 4.0853 data: 0.0002 max mem: 54684 +[11:06:04.580722] Epoch: [2] [2780/3229] lr: 0.000005 grad_norm: 0.7815 (0.7699) closs: 1.0562 (1.0378) time: 4.0889 data: 0.0002 max mem: 54684 +[11:06:45.682994] Epoch: [2] [2790/3229] lr: 0.000005 grad_norm: 0.7723 (0.7699) closs: 1.0843 (1.0379) time: 4.0955 data: 0.0002 max mem: 54684 +[11:07:26.664509] Epoch: [2] [2800/3229] lr: 0.000005 grad_norm: 0.7851 (0.7700) closs: 1.0863 (1.0381) time: 4.1041 data: 0.0002 max mem: 54684 +[11:08:07.660594] Epoch: [2] [2810/3229] lr: 0.000005 grad_norm: 0.7649 (0.7699) closs: 1.0497 (1.0380) time: 4.0988 data: 0.0002 max mem: 54684 +[11:08:48.441366] Epoch: [2] [2820/3229] lr: 0.000005 grad_norm: 0.7366 (0.7698) closs: 1.0300 (1.0380) time: 4.0888 data: 0.0002 max mem: 54684 +[11:09:29.171313] Epoch: [2] [2830/3229] lr: 0.000005 grad_norm: 0.7514 (0.7698) closs: 1.0300 (1.0378) time: 4.0755 data: 0.0002 max mem: 54684 +[11:10:09.985836] Epoch: [2] [2840/3229] lr: 0.000005 grad_norm: 0.7665 (0.7697) closs: 0.9997 (1.0377) time: 4.0772 data: 0.0002 max mem: 54684 +[11:10:51.230047] Epoch: [2] [2850/3229] lr: 0.000005 grad_norm: 0.7750 (0.7698) closs: 1.0333 (1.0378) time: 4.1029 data: 0.0002 max mem: 54684 +[11:11:31.685949] Epoch: [2] [2860/3229] lr: 0.000005 grad_norm: 0.7660 (0.7697) closs: 1.0579 (1.0378) time: 4.0849 data: 0.0002 max mem: 54684 +[11:12:12.974896] Epoch: [2] [2870/3229] lr: 0.000005 grad_norm: 0.7565 (0.7698) closs: 1.0466 (1.0378) time: 4.0872 data: 0.0002 max mem: 54684 +[11:12:54.373299] Epoch: [2] [2880/3229] lr: 0.000005 grad_norm: 0.7817 (0.7699) closs: 1.0407 (1.0379) time: 4.1343 data: 0.0002 max mem: 54684 +[11:13:35.283572] Epoch: [2] [2890/3229] lr: 0.000005 grad_norm: 0.7849 (0.7698) closs: 1.0329 (1.0377) time: 4.1154 data: 0.0002 max mem: 54684 +[11:14:16.118014] Epoch: [2] [2900/3229] lr: 0.000005 grad_norm: 0.7950 (0.7700) closs: 1.0221 (1.0376) time: 4.0872 data: 0.0002 max mem: 54684 +[11:14:57.161039] Epoch: [2] [2910/3229] lr: 0.000005 grad_norm: 0.8078 (0.7701) closs: 1.0416 (1.0376) time: 4.0938 data: 0.0002 max mem: 54684 +[11:15:38.521781] Epoch: [2] [2920/3229] lr: 0.000005 grad_norm: 0.7894 (0.7701) closs: 1.0502 (1.0377) time: 4.1201 data: 0.0002 max mem: 54684 +[11:16:19.106288] Epoch: [2] [2930/3229] lr: 0.000005 grad_norm: 0.7824 (0.7701) closs: 1.0595 (1.0377) time: 4.0972 data: 0.0002 max mem: 54684 +[11:17:00.252881] Epoch: [2] [2940/3229] lr: 0.000005 grad_norm: 0.7871 (0.7701) closs: 1.0469 (1.0376) time: 4.0865 data: 0.0002 max mem: 54684 +[11:17:40.642013] Epoch: [2] [2950/3229] lr: 0.000005 grad_norm: 0.7919 (0.7701) closs: 1.0446 (1.0375) time: 4.0767 data: 0.0002 max mem: 54684 +[11:18:20.871868] Epoch: [2] [2960/3229] lr: 0.000005 grad_norm: 0.7919 (0.7701) closs: 0.9901 (1.0374) time: 4.0309 data: 0.0002 max mem: 54684 +[11:19:01.539668] Epoch: [2] [2970/3229] lr: 0.000005 grad_norm: 0.8446 (0.7702) closs: 1.0398 (1.0373) time: 4.0448 data: 0.0002 max mem: 54684 +[11:19:42.663674] Epoch: [2] [2980/3229] lr: 0.000005 grad_norm: 0.7707 (0.7702) closs: 1.0450 (1.0374) time: 4.0895 data: 0.0002 max mem: 54684 +[11:20:23.730795] Epoch: [2] [2990/3229] lr: 0.000005 grad_norm: 0.7557 (0.7702) closs: 1.0745 (1.0375) time: 4.1095 data: 0.0002 max mem: 54684 +[11:21:04.740958] Epoch: [2] [3000/3229] lr: 0.000005 grad_norm: 0.7990 (0.7704) closs: 1.0864 (1.0375) time: 4.1038 data: 0.0002 max mem: 54684 +[11:21:45.020285] Epoch: [2] [3010/3229] lr: 0.000005 grad_norm: 0.8011 (0.7705) closs: 0.9966 (1.0374) time: 4.0644 data: 0.0002 max mem: 54684 +[11:22:25.881577] Epoch: [2] [3020/3229] lr: 0.000005 grad_norm: 0.7592 (0.7705) closs: 1.0149 (1.0375) time: 4.0570 data: 0.0002 max mem: 54684 +[11:23:07.305216] Epoch: [2] [3030/3229] lr: 0.000005 grad_norm: 0.7607 (0.7706) closs: 1.0597 (1.0376) time: 4.1142 data: 0.0002 max mem: 54684 +[11:23:48.348449] Epoch: [2] [3040/3229] lr: 0.000005 grad_norm: 0.7679 (0.7705) closs: 1.0597 (1.0375) time: 4.1233 data: 0.0002 max mem: 54684 +[11:24:29.596151] Epoch: [2] [3050/3229] lr: 0.000005 grad_norm: 0.7677 (0.7705) closs: 1.0172 (1.0375) time: 4.1145 data: 0.0002 max mem: 54684 +[11:25:10.442041] Epoch: [2] [3060/3229] lr: 0.000005 grad_norm: 0.7748 (0.7706) closs: 1.0172 (1.0375) time: 4.1046 data: 0.0002 max mem: 54684 +[11:25:51.521912] Epoch: [2] [3070/3229] lr: 0.000005 grad_norm: 0.8210 (0.7707) closs: 1.0409 (1.0374) time: 4.0962 data: 0.0002 max mem: 54684 +[11:26:32.713656] Epoch: [2] [3080/3229] lr: 0.000005 grad_norm: 0.7547 (0.7706) closs: 1.0294 (1.0373) time: 4.1135 data: 0.0002 max mem: 54684 +[11:27:13.511213] Epoch: [2] [3090/3229] lr: 0.000005 grad_norm: 0.7538 (0.7706) closs: 1.0294 (1.0374) time: 4.0994 data: 0.0002 max mem: 54684 +[11:27:54.680369] Epoch: [2] [3100/3229] lr: 0.000005 grad_norm: 0.7631 (0.7707) closs: 1.0731 (1.0376) time: 4.0983 data: 0.0002 max mem: 54684 +[11:28:35.049341] Epoch: [2] [3110/3229] lr: 0.000005 grad_norm: 0.7713 (0.7705) closs: 1.0357 (1.0373) time: 4.0768 data: 0.0002 max mem: 54684 +[11:29:15.418890] Epoch: [2] [3120/3229] lr: 0.000005 grad_norm: 0.7078 (0.7705) closs: 1.0218 (1.0374) time: 4.0369 data: 0.0002 max mem: 54684 +[11:29:56.061537] Epoch: [2] [3130/3229] lr: 0.000005 grad_norm: 0.7244 (0.7703) closs: 1.0440 (1.0374) time: 4.0505 data: 0.0002 max mem: 54684 +[11:30:36.293253] Epoch: [2] [3140/3229] lr: 0.000005 grad_norm: 0.7669 (0.7705) closs: 1.0339 (1.0373) time: 4.0437 data: 0.0002 max mem: 54684 +[11:31:16.325807] Epoch: [2] [3150/3229] lr: 0.000005 grad_norm: 0.7848 (0.7703) closs: 1.0084 (1.0372) time: 4.0131 data: 0.0004 max mem: 54684 +[11:31:57.799087] Epoch: [2] [3160/3229] lr: 0.000005 grad_norm: 0.7618 (0.7704) closs: 1.0084 (1.0372) time: 4.0752 data: 0.0003 max mem: 54684 +[11:32:38.264034] Epoch: [2] [3170/3229] lr: 0.000005 grad_norm: 0.7903 (0.7705) closs: 1.0556 (1.0371) time: 4.0968 data: 0.0002 max mem: 54684 +[11:33:18.885238] Epoch: [2] [3180/3229] lr: 0.000005 grad_norm: 0.7486 (0.7705) closs: 1.0379 (1.0372) time: 4.0542 data: 0.0002 max mem: 54684 +[11:33:59.955564] Epoch: [2] [3190/3229] lr: 0.000005 grad_norm: 0.7458 (0.7706) closs: 1.0379 (1.0372) time: 4.0845 data: 0.0002 max mem: 54684 +[11:34:41.295405] Epoch: [2] [3200/3229] lr: 0.000005 grad_norm: 0.7953 (0.7707) closs: 1.0546 (1.0373) time: 4.1204 data: 0.0002 max mem: 54684 +[11:35:22.419516] Epoch: [2] [3210/3229] lr: 0.000005 grad_norm: 0.7931 (0.7707) closs: 1.0572 (1.0373) time: 4.1231 data: 0.0002 max mem: 54684 +[11:36:02.526362] Epoch: [2] [3220/3229] lr: 0.000005 grad_norm: 0.7683 (0.7707) closs: 1.0178 (1.0372) time: 4.0615 data: 0.0001 max mem: 54684 +[11:36:35.973354] Epoch: [2] Total time: 3:40:17 +[11:36:35.974297] Averaged stats: lr: 0.000005 grad_norm: 0.7420 (0.7708) closs: 1.0624 (1.0381) +[11:36:36.337101] model saved +[11:36:38.023619] optimizer saved +[11:36:38.024212] other rank-common saved +[11:36:38.029179] rank-specific saved +[11:36:38.029388] Training time 11:00:31