diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.model.pth deleted file mode 100644 index 314b3ea31b149d73d691efd6f496b97f6a84e1d7..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:51942e1cdd4fc4bb9aa2a9649a26fdd2e3822e4eb924e4e55084b8e09a90ea29 -size 90952079 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 26d42273836e83896673b1fe4cd0c1c6fd644004..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:10f9cf84a8ccd5c8c1dfa19876aab016341e6c5c44a461108eb4423f345204ec -size 204403795 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.other.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.other.pth deleted file mode 100644 index 7e913b448dd37999501c9e98ec60f8d9d1ca1241..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d37ea8ab9cda46cc28c0964d75caf944b1305770a4bc789c738b99991c8672b8 -size 1815 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00000-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00001-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00002-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00003-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00004-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00005-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00006-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00007-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.model.pth deleted file mode 100644 index eeb2b771457f443b3d8654d4cb077da3e5cded14..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6ed39d3860a4c5b79b338c9f98875ba542b909b8bac0991be734db1360554e02 -size 90952079 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 83a3e825c1453e522c81944f203440d027874e82..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a26da94d90384063a4c8c4d7f9f630cd714d7eb07f25fce0a58ea54182cbb9b8 -size 204403795 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.other.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.other.pth deleted file mode 100644 index 8073af126d00c3679383f16a39784114786f2372..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:56b9b1f946e7d9a93adaab132ff2381cef31be146bae4ac5763f3249d98fa378 -size 1815 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00000-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00001-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00002-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00003-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00004-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00005-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00006-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00007-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.model.pth deleted file mode 100644 index 6d00377f554cfe2800301419cb1d888f10f800ab..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dcbe6ac8e5588bd2dbf6f2fa10826e9efd617af80fa5495358165d8dccfa19c9 -size 90952079 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 808c3563296dc0a7be82da6869671c500c55404f..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:db07aed2b90e65be9083d5431d37272028e06d4b92d6ba3067f059976e7ff591 -size 204403795 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.other.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.other.pth deleted file mode 100644 index 68ac231f9774f7a4a6254684400eb280cd69e9fa..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ac90e9e7304e3e220667308999a777273eaea82ea51cb28003c1fa9e40738a9 -size 1815 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00000-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00001-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00002-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00003-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00004-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00005-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00006-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00007-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/log.txt b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/log.txt deleted file mode 100644 index 6abb734a07c980ed975f724eb9dad68f2e62731a..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/log.txt +++ /dev/null @@ -1,3 +0,0 @@ -{"train_lr": 4.2672042852177435e-05, "train_grad_norm": 0.8576727652187944, "train_closs": 1.0961337126687645, "epoch": 0, "val_lr": 4.2672042852177435e-05, "val_grad_norm": 0.8576727652187944, "val_closs": 1.0961337126687645} -{"train_lr": 2.989280819774688e-05, "train_grad_norm": 0.7565592593381042, "train_closs": 1.0483260756908241, "epoch": 1, "val_lr": 2.989280819774688e-05, "val_grad_norm": 0.7565592593381042, "val_closs": 1.0483260756908241} -{"train_lr": 9.43437279837357e-06, "train_grad_norm": 0.7707539895124279, "train_closs": 1.0381248756278665, "epoch": 2, "val_lr": 9.43437279837357e-06, "val_grad_norm": 0.7707539895124279, "val_closs": 1.0381248756278665} diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/output.log b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/output.log deleted file mode 100644 index 3e8df40b9236cc509af3a1865e95527f2edb136a..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/output.log +++ /dev/null @@ -1,2801 +0,0 @@ -WARNING:torch.distributed.run: -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -| distributed init (rank 6): env://, gpu 6 -| distributed init (rank 3): env://, gpu 3 -| distributed init (rank 1): env://, gpu 1 -| distributed init (rank 7): env://, gpu 7 -| distributed init (rank 5): env://, gpu 5 -| distributed init (rank 2): env://, gpu 2 -| distributed init (rank 0): env://, gpu 0 -| distributed init (rank 4): env://, gpu 4 -[00:02:04.046928] > initializing model parallel with size 1 -[00:02:04.047015] > initializing ddp with size 8 -[00:02:04.047022] > initializing pipeline with size 1 -[00:02:04.226045] job dir: /data/liuyijiang/mmlab/LLaMA2-Accessory/accessory -[00:02:04.226122] Namespace(batch_size=16, -accum_iter=1, -llama_type='llama_qformerv2_peft', -llama_config=['../checkpoints/llama2/Llama-2-13b/params.json', -'configs/model/finetune/sg/llamaPeft_normBiasLora.json'], -no_visual=False, -tokenizer_path='../checkpoints/llama2/Llama-2-13b/tokenizer.model', -pretrained_path='../checkpoints/mm/lamaQformerv2_13b/finetuned/', -pretrained_type='consolidated', -weight_decay=0.02, -lr=5e-05, -min_lr=5e-06, -epochs=3, -warmup_epochs=0.2, -clip_grad=2, -max_words=512, -dialog=False, -data_config='configs/data/finetune/mm/alpaca_llava.yaml', -output_dir='output/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B', -log_dir='./output_dir', -save_interval=1, -only_save_trainable=True, -device='cuda', -seed=0, -resume='', -num_workers=16, -pin_mem=True, -world_size=8, -local_rank=-1, -dist_on_itp=False, -dist_url='env://', -model_parallel_size=1, -data_parallel='sdp', -precision='bf16', -checkpointing=True, -quant=True, -rank=0, -gpu=0, -distributed=True, -dist_backend='nccl') -[00:02:04.226978] Start initialization. -[00:02:04.227022] ## Processing on RANK 0. -[00:02:04.237574] Model Args: - ModelArgs(dim=5120, n_layers=40, n_heads=40, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=16, bias_tuning=True) -[00:03:36.399161] build llama model with qformerv2 -[00:03:36.779030] (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /Salesforce/blip2-opt-2.7b/resolve/main/config.json (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1007)')))"), '(Request ID: f000589d-f862-41f8-832e-73fc0c96ee6a)') - Loading checkpoint shards: 0%| | 0/2 [00:00 -[00:36:06.281575] Start training for 3 epochs -[00:36:06.296296] log_dir: ./output_dir -[00:36:22.985451] Epoch: [0] [0/3229] lr: 0.000000 grad_norm: 2.3647 (2.3647) closs: 1.5947 (1.5947) time: 16.6883 data: 8.5180 max mem: 36209 -[00:37:03.163928] Epoch: [0] [10/3229] lr: 0.000001 grad_norm: 2.2614 (2.1844) closs: 1.3985 (1.3614) time: 5.1696 data: 0.7746 max mem: 54683 -[00:37:43.938889] Epoch: [0] [20/3229] lr: 0.000002 grad_norm: 2.2614 (2.2117) closs: 1.4480 (1.4415) time: 4.0476 data: 0.0002 max mem: 54683 -[00:38:24.104529] Epoch: [0] [30/3229] lr: 0.000002 grad_norm: 2.2231 (2.2052) closs: 1.4753 (1.4253) time: 4.0470 data: 0.0002 max mem: 54683 -[00:39:05.082555] Epoch: [0] [40/3229] lr: 0.000003 grad_norm: 2.1752 (2.1914) closs: 1.4110 (1.4059) time: 4.0571 data: 0.0002 max mem: 54683 -[00:39:46.229231] Epoch: [0] [50/3229] lr: 0.000004 grad_norm: 2.0509 (2.2594) closs: 1.4161 (1.4234) time: 4.1061 data: 0.0002 max mem: 54683 -[00:40:26.733155] Epoch: [0] [60/3229] lr: 0.000005 grad_norm: 1.9159 (2.1852) closs: 1.4598 (1.4133) time: 4.0824 data: 0.0003 max mem: 54683 -[00:41:07.256178] Epoch: [0] [70/3229] lr: 0.000005 grad_norm: 1.7466 (2.1041) closs: 1.4069 (1.4059) time: 4.0513 data: 0.0003 max mem: 54683 -[00:41:48.461794] Epoch: [0] [80/3229] lr: 0.000006 grad_norm: 1.4995 (2.0192) closs: 1.3245 (1.3975) time: 4.0864 data: 0.0002 max mem: 54683 -[00:42:28.626897] Epoch: [0] [90/3229] lr: 0.000007 grad_norm: 1.2119 (1.9324) closs: 1.2737 (1.3830) time: 4.0684 data: 0.0002 max mem: 54683 -[00:43:09.428679] Epoch: [0] [100/3229] lr: 0.000008 grad_norm: 1.1636 (1.8590) closs: 1.2619 (1.3700) time: 4.0482 data: 0.0003 max mem: 54683 -[00:43:49.914083] Epoch: [0] [110/3229] lr: 0.000009 grad_norm: 1.0306 (1.7801) closs: 1.2915 (1.3614) time: 4.0643 data: 0.0003 max mem: 54683 -[00:44:31.063164] Epoch: [0] [120/3229] lr: 0.000009 grad_norm: 0.9462 (1.7095) closs: 1.3081 (1.3546) time: 4.0816 data: 0.0002 max mem: 54683 -[00:45:11.261841] Epoch: [0] [130/3229] lr: 0.000010 grad_norm: 0.9337 (1.6472) closs: 1.2631 (1.3397) time: 4.0673 data: 0.0002 max mem: 54683 -[00:45:52.112790] Epoch: [0] [140/3229] lr: 0.000011 grad_norm: 0.8573 (1.5931) closs: 1.2118 (1.3288) time: 4.0524 data: 0.0002 max mem: 54683 -[00:46:32.944599] Epoch: [0] [150/3229] lr: 0.000012 grad_norm: 0.8517 (1.5447) closs: 1.1676 (1.3182) time: 4.0841 data: 0.0003 max mem: 54683 -[00:47:14.287016] Epoch: [0] [160/3229] lr: 0.000012 grad_norm: 0.8240 (1.4999) closs: 1.1756 (1.3118) time: 4.1086 data: 0.0003 max mem: 54683 -[00:47:55.388906] Epoch: [0] [170/3229] lr: 0.000013 grad_norm: 0.8136 (1.4618) closs: 1.1849 (1.3054) time: 4.1221 data: 0.0003 max mem: 54683 -[00:48:36.512000] Epoch: [0] [180/3229] lr: 0.000014 grad_norm: 0.8954 (1.4305) closs: 1.2078 (1.3010) time: 4.1112 data: 0.0002 max mem: 54683 -[00:49:17.622250] Epoch: [0] [190/3229] lr: 0.000015 grad_norm: 0.8401 (1.3994) closs: 1.2296 (1.2966) time: 4.1116 data: 0.0002 max mem: 54683 -[00:49:58.917542] Epoch: [0] [200/3229] lr: 0.000015 grad_norm: 0.8401 (1.3746) closs: 1.2198 (1.2906) time: 4.1202 data: 0.0002 max mem: 54683 -[00:50:39.721214] Epoch: [0] [210/3229] lr: 0.000016 grad_norm: 0.8446 (1.3499) closs: 1.1936 (1.2863) time: 4.1049 data: 0.0002 max mem: 54684 -[00:51:20.209941] Epoch: [0] [220/3229] lr: 0.000017 grad_norm: 0.8360 (1.3275) closs: 1.2039 (1.2828) time: 4.0645 data: 0.0002 max mem: 54684 -[00:52:01.340571] Epoch: [0] [230/3229] lr: 0.000018 grad_norm: 0.8406 (1.3077) closs: 1.2159 (1.2796) time: 4.0809 data: 0.0002 max mem: 54684 -[00:52:42.293014] Epoch: [0] [240/3229] lr: 0.000019 grad_norm: 0.8406 (1.2881) closs: 1.2119 (1.2756) time: 4.1041 data: 0.0002 max mem: 54684 -[00:53:22.448171] Epoch: [0] [250/3229] lr: 0.000019 grad_norm: 0.8104 (1.2702) closs: 1.1467 (1.2704) time: 4.0553 data: 0.0002 max mem: 54684 -[00:54:02.261862] Epoch: [0] [260/3229] lr: 0.000020 grad_norm: 0.8084 (1.2538) closs: 1.1398 (1.2651) time: 3.9984 data: 0.0002 max mem: 54684 -[00:54:42.733423] Epoch: [0] [270/3229] lr: 0.000021 grad_norm: 0.8337 (1.2407) closs: 1.1322 (1.2590) time: 4.0142 data: 0.0002 max mem: 54684 -[00:55:23.786823] Epoch: [0] [280/3229] lr: 0.000022 grad_norm: 0.8337 (1.2265) closs: 1.1322 (1.2549) time: 4.0762 data: 0.0002 max mem: 54684 -[00:56:04.261686] Epoch: [0] [290/3229] lr: 0.000022 grad_norm: 0.8146 (1.2134) closs: 1.1835 (1.2526) time: 4.0763 data: 0.0002 max mem: 54684 -[00:56:45.065966] Epoch: [0] [300/3229] lr: 0.000023 grad_norm: 0.8291 (1.2034) closs: 1.2046 (1.2503) time: 4.0639 data: 0.0002 max mem: 54684 -[00:57:25.203092] Epoch: [0] [310/3229] lr: 0.000024 grad_norm: 0.8414 (1.1916) closs: 1.1724 (1.2467) time: 4.0470 data: 0.0002 max mem: 54684 -[00:58:05.237988] Epoch: [0] [320/3229] lr: 0.000025 grad_norm: 0.8399 (1.1806) closs: 1.1496 (1.2420) time: 4.0085 data: 0.0002 max mem: 54684 -[00:58:46.046197] Epoch: [0] [330/3229] lr: 0.000026 grad_norm: 0.8551 (1.1711) closs: 1.1496 (1.2392) time: 4.0421 data: 0.0002 max mem: 54684 -[00:59:27.204738] Epoch: [0] [340/3229] lr: 0.000026 grad_norm: 0.8768 (1.1625) closs: 1.1481 (1.2362) time: 4.0983 data: 0.0002 max mem: 54684 -[01:00:08.357924] Epoch: [0] [350/3229] lr: 0.000027 grad_norm: 0.8572 (1.1541) closs: 1.1341 (1.2344) time: 4.1155 data: 0.0002 max mem: 54684 -[01:00:49.082581] Epoch: [0] [360/3229] lr: 0.000028 grad_norm: 0.8620 (1.1473) closs: 1.1321 (1.2311) time: 4.0938 data: 0.0002 max mem: 54684 -[01:01:29.908089] Epoch: [0] [370/3229] lr: 0.000029 grad_norm: 0.9078 (1.1413) closs: 1.1322 (1.2292) time: 4.0774 data: 0.0002 max mem: 54684 -[01:02:10.408877] Epoch: [0] [380/3229] lr: 0.000029 grad_norm: 0.9115 (1.1385) closs: 1.1444 (1.2265) time: 4.0662 data: 0.0002 max mem: 54684 -[01:02:50.910726] Epoch: [0] [390/3229] lr: 0.000030 grad_norm: 0.8868 (1.1318) closs: 1.1205 (1.2241) time: 4.0501 data: 0.0002 max mem: 54684 -[01:03:32.368355] Epoch: [0] [400/3229] lr: 0.000031 grad_norm: 0.8768 (1.1270) closs: 1.1205 (1.2212) time: 4.0979 data: 0.0002 max mem: 54684 -[01:04:12.522689] Epoch: [0] [410/3229] lr: 0.000032 grad_norm: 0.8617 (1.1200) closs: 1.1138 (1.2181) time: 4.0805 data: 0.0002 max mem: 54684 -[01:04:53.670656] Epoch: [0] [420/3229] lr: 0.000033 grad_norm: 0.8747 (1.1175) closs: 1.0839 (1.2160) time: 4.0650 data: 0.0002 max mem: 54684 -[01:05:34.517336] Epoch: [0] [430/3229] lr: 0.000033 grad_norm: 0.9140 (1.1135) closs: 1.1140 (1.2136) time: 4.0997 data: 0.0002 max mem: 54684 -[01:06:15.357490] Epoch: [0] [440/3229] lr: 0.000034 grad_norm: 0.9000 (1.1083) closs: 1.1255 (1.2112) time: 4.0843 data: 0.0002 max mem: 54684 -[01:06:56.510385] Epoch: [0] [450/3229] lr: 0.000035 grad_norm: 0.9130 (1.1047) closs: 1.1443 (1.2102) time: 4.0996 data: 0.0002 max mem: 54684 -[01:07:37.338507] Epoch: [0] [460/3229] lr: 0.000036 grad_norm: 0.8889 (1.0996) closs: 1.1569 (1.2089) time: 4.0990 data: 0.0002 max mem: 54684 -[01:08:17.507539] Epoch: [0] [470/3229] lr: 0.000036 grad_norm: 0.8632 (1.0962) closs: 1.1353 (1.2070) time: 4.0498 data: 0.0002 max mem: 54684 -[01:08:58.904572] Epoch: [0] [480/3229] lr: 0.000037 grad_norm: 0.9061 (1.0920) closs: 1.0907 (1.2045) time: 4.0782 data: 0.0002 max mem: 54684 -[01:09:39.746010] Epoch: [0] [490/3229] lr: 0.000038 grad_norm: 0.9314 (1.0890) closs: 1.0907 (1.2028) time: 4.1118 data: 0.0002 max mem: 54684 -[01:10:20.231136] Epoch: [0] [500/3229] lr: 0.000039 grad_norm: 0.9235 (1.0854) closs: 1.1249 (1.2011) time: 4.0663 data: 0.0002 max mem: 54684 -[01:11:01.054523] Epoch: [0] [510/3229] lr: 0.000039 grad_norm: 0.9309 (1.0834) closs: 1.1249 (1.1995) time: 4.0654 data: 0.0002 max mem: 54684 -[01:11:42.308142] Epoch: [0] [520/3229] lr: 0.000040 grad_norm: 0.9541 (1.0806) closs: 1.1226 (1.1972) time: 4.1038 data: 0.0002 max mem: 54684 -[01:12:23.110317] Epoch: [0] [530/3229] lr: 0.000041 grad_norm: 0.9516 (1.0777) closs: 1.1269 (1.1965) time: 4.1027 data: 0.0002 max mem: 54684 -[01:13:03.936397] Epoch: [0] [540/3229] lr: 0.000042 grad_norm: 0.9204 (1.0814) closs: 1.1651 (1.1959) time: 4.0813 data: 0.0002 max mem: 54684 -[01:13:44.208566] Epoch: [0] [550/3229] lr: 0.000043 grad_norm: 0.9204 (1.0785) closs: 1.1192 (1.1931) time: 4.0548 data: 0.0002 max mem: 54684 -[01:14:25.497954] Epoch: [0] [560/3229] lr: 0.000043 grad_norm: 0.9295 (1.0758) closs: 1.0839 (1.1919) time: 4.0780 data: 0.0002 max mem: 54684 -[01:15:05.977736] Epoch: [0] [570/3229] lr: 0.000044 grad_norm: 0.9723 (1.0751) closs: 1.1325 (1.1915) time: 4.0884 data: 0.0002 max mem: 54684 -[01:15:47.138810] Epoch: [0] [580/3229] lr: 0.000045 grad_norm: 0.9485 (1.0728) closs: 1.1764 (1.1910) time: 4.0820 data: 0.0002 max mem: 54684 -[01:16:28.229266] Epoch: [0] [590/3229] lr: 0.000046 grad_norm: 0.9286 (1.0704) closs: 1.1431 (1.1902) time: 4.1125 data: 0.0002 max mem: 54684 -[01:17:08.494066] Epoch: [0] [600/3229] lr: 0.000046 grad_norm: 0.9124 (1.0680) closs: 1.1127 (1.1888) time: 4.0677 data: 0.0002 max mem: 54684 -[01:17:49.333146] Epoch: [0] [610/3229] lr: 0.000047 grad_norm: 0.8772 (1.0649) closs: 1.1387 (1.1877) time: 4.0551 data: 0.0002 max mem: 54684 -[01:18:30.513096] Epoch: [0] [620/3229] lr: 0.000048 grad_norm: 0.8946 (1.0631) closs: 1.1490 (1.1873) time: 4.1009 data: 0.0002 max mem: 54684 -[01:19:12.132382] Epoch: [0] [630/3229] lr: 0.000049 grad_norm: 0.8910 (1.0597) closs: 1.1475 (1.1862) time: 4.1399 data: 0.0002 max mem: 54684 -[01:19:52.876937] Epoch: [0] [640/3229] lr: 0.000050 grad_norm: 0.8884 (1.0579) closs: 1.1173 (1.1843) time: 4.1181 data: 0.0002 max mem: 54684 -[01:20:33.485645] Epoch: [0] [650/3229] lr: 0.000050 grad_norm: 0.8884 (1.0558) closs: 1.0704 (1.1821) time: 4.0676 data: 0.0002 max mem: 54684 -[01:21:14.337520] Epoch: [0] [660/3229] lr: 0.000050 grad_norm: 0.8974 (1.0531) closs: 1.0758 (1.1812) time: 4.0730 data: 0.0002 max mem: 54684 -[01:21:54.945867] Epoch: [0] [670/3229] lr: 0.000050 grad_norm: 0.9121 (1.0517) closs: 1.1193 (1.1797) time: 4.0729 data: 0.0002 max mem: 54684 -[01:22:36.533925] Epoch: [0] [680/3229] lr: 0.000050 grad_norm: 0.9163 (1.0500) closs: 1.1473 (1.1795) time: 4.1097 data: 0.0002 max mem: 54684 -[01:23:17.315950] Epoch: [0] [690/3229] lr: 0.000050 grad_norm: 0.9097 (1.0480) closs: 1.1498 (1.1785) time: 4.1184 data: 0.0002 max mem: 54684 -[01:23:57.791032] Epoch: [0] [700/3229] lr: 0.000050 grad_norm: 0.8856 (1.0454) closs: 1.1322 (1.1770) time: 4.0628 data: 0.0003 max mem: 54684 -[01:24:38.841536] Epoch: [0] [710/3229] lr: 0.000050 grad_norm: 0.8786 (1.0446) closs: 1.1357 (1.1766) time: 4.0762 data: 0.0003 max mem: 54684 -[01:25:19.414277] Epoch: [0] [720/3229] lr: 0.000050 grad_norm: 0.9131 (1.0427) closs: 1.1209 (1.1750) time: 4.0811 data: 0.0002 max mem: 54684 -[01:26:00.560290] Epoch: [0] [730/3229] lr: 0.000050 grad_norm: 0.9133 (1.0408) closs: 1.0723 (1.1737) time: 4.0859 data: 0.0002 max mem: 54684 -[01:26:41.063904] Epoch: [0] [740/3229] lr: 0.000050 grad_norm: 0.8809 (1.0382) closs: 1.0816 (1.1727) time: 4.0824 data: 0.0002 max mem: 54684 -[01:27:22.243420] Epoch: [0] [750/3229] lr: 0.000050 grad_norm: 0.8721 (1.0366) closs: 1.1144 (1.1718) time: 4.0841 data: 0.0002 max mem: 54684 -[01:28:03.061085] Epoch: [0] [760/3229] lr: 0.000050 grad_norm: 0.8563 (1.0340) closs: 1.1227 (1.1713) time: 4.0998 data: 0.0002 max mem: 54684 -[01:28:43.881750] Epoch: [0] [770/3229] lr: 0.000050 grad_norm: 0.8563 (1.0319) closs: 1.1247 (1.1701) time: 4.0818 data: 0.0002 max mem: 54684 -[01:29:24.382823] Epoch: [0] [780/3229] lr: 0.000050 grad_norm: 0.8666 (1.0297) closs: 1.0862 (1.1689) time: 4.0660 data: 0.0003 max mem: 54684 -[01:30:05.263440] Epoch: [0] [790/3229] lr: 0.000050 grad_norm: 0.8803 (1.0278) closs: 1.0773 (1.1675) time: 4.0690 data: 0.0002 max mem: 54684 -[01:30:46.132360] Epoch: [0] [800/3229] lr: 0.000050 grad_norm: 0.8803 (1.0263) closs: 1.0871 (1.1669) time: 4.0874 data: 0.0002 max mem: 54684 -[01:31:26.624450] Epoch: [0] [810/3229] lr: 0.000050 grad_norm: 0.8782 (1.0245) closs: 1.0958 (1.1659) time: 4.0680 data: 0.0002 max mem: 54684 -[01:32:07.126461] Epoch: [0] [820/3229] lr: 0.000050 grad_norm: 0.8762 (1.0225) closs: 1.0958 (1.1649) time: 4.0496 data: 0.0002 max mem: 54684 -[01:32:48.359187] Epoch: [0] [830/3229] lr: 0.000050 grad_norm: 0.8256 (1.0204) closs: 1.0959 (1.1644) time: 4.0867 data: 0.0002 max mem: 54684 -[01:33:29.469866] Epoch: [0] [840/3229] lr: 0.000050 grad_norm: 0.8423 (1.0185) closs: 1.1185 (1.1638) time: 4.1171 data: 0.0002 max mem: 54684 -[01:34:09.970107] Epoch: [0] [850/3229] lr: 0.000050 grad_norm: 0.8891 (1.0170) closs: 1.1185 (1.1627) time: 4.0805 data: 0.0002 max mem: 54684 -[01:34:50.161303] Epoch: [0] [860/3229] lr: 0.000050 grad_norm: 0.8891 (1.0154) closs: 1.0997 (1.1617) time: 4.0345 data: 0.0002 max mem: 54684 -[01:35:31.765849] Epoch: [0] [870/3229] lr: 0.000050 grad_norm: 0.8328 (1.0135) closs: 1.0972 (1.1610) time: 4.0897 data: 0.0002 max mem: 54684 -[01:36:12.904367] Epoch: [0] [880/3229] lr: 0.000050 grad_norm: 0.8507 (1.0124) closs: 1.0972 (1.1607) time: 4.1371 data: 0.0002 max mem: 54684 -[01:36:54.052581] Epoch: [0] [890/3229] lr: 0.000050 grad_norm: 0.8788 (1.0109) closs: 1.1339 (1.1602) time: 4.1143 data: 0.0002 max mem: 54684 -[01:37:34.868212] Epoch: [0] [900/3229] lr: 0.000050 grad_norm: 0.8585 (1.0094) closs: 1.1287 (1.1597) time: 4.0981 data: 0.0002 max mem: 54684 -[01:38:16.113165] Epoch: [0] [910/3229] lr: 0.000050 grad_norm: 0.8376 (1.0077) closs: 1.1211 (1.1589) time: 4.1030 data: 0.0002 max mem: 54684 -[01:38:56.583668] Epoch: [0] [920/3229] lr: 0.000050 grad_norm: 0.8361 (1.0058) closs: 1.0987 (1.1582) time: 4.0857 data: 0.0002 max mem: 54684 -[01:39:37.055401] Epoch: [0] [930/3229] lr: 0.000050 grad_norm: 0.8472 (1.0044) closs: 1.1219 (1.1577) time: 4.0470 data: 0.0002 max mem: 54684 -[01:40:18.207956] Epoch: [0] [940/3229] lr: 0.000050 grad_norm: 0.8701 (1.0031) closs: 1.1219 (1.1571) time: 4.0811 data: 0.0002 max mem: 54684 -[01:40:59.474520] Epoch: [0] [950/3229] lr: 0.000050 grad_norm: 0.8784 (1.0016) closs: 1.1027 (1.1563) time: 4.1209 data: 0.0002 max mem: 54684 -[01:41:40.642522] Epoch: [0] [960/3229] lr: 0.000050 grad_norm: 0.8791 (1.0010) closs: 1.0988 (1.1557) time: 4.1217 data: 0.0002 max mem: 54684 -[01:42:21.476520] Epoch: [0] [970/3229] lr: 0.000050 grad_norm: 0.8580 (0.9997) closs: 1.1107 (1.1550) time: 4.1000 data: 0.0002 max mem: 54684 -[01:43:01.957616] Epoch: [0] [980/3229] lr: 0.000050 grad_norm: 0.8710 (0.9985) closs: 1.1156 (1.1545) time: 4.0657 data: 0.0002 max mem: 54684 -[01:43:43.560118] Epoch: [0] [990/3229] lr: 0.000050 grad_norm: 0.8756 (0.9976) closs: 1.1236 (1.1544) time: 4.1041 data: 0.0002 max mem: 54684 -[01:44:24.011660] Epoch: [0] [1000/3229] lr: 0.000050 grad_norm: 0.8893 (0.9967) closs: 1.1164 (1.1533) time: 4.1026 data: 0.0002 max mem: 54684 -[01:45:04.795094] Epoch: [0] [1010/3229] lr: 0.000050 grad_norm: 0.8719 (0.9951) closs: 1.0624 (1.1522) time: 4.0617 data: 0.0002 max mem: 54684 -[01:45:45.580136] Epoch: [0] [1020/3229] lr: 0.000050 grad_norm: 0.8502 (0.9936) closs: 1.0902 (1.1518) time: 4.0784 data: 0.0002 max mem: 54684 -[01:46:27.178733] Epoch: [0] [1030/3229] lr: 0.000050 grad_norm: 0.8502 (0.9924) closs: 1.1499 (1.1517) time: 4.1191 data: 0.0002 max mem: 54684 -[01:47:07.978058] Epoch: [0] [1040/3229] lr: 0.000050 grad_norm: 0.8731 (0.9909) closs: 1.0968 (1.1510) time: 4.1198 data: 0.0002 max mem: 54684 -[01:47:49.138856] Epoch: [0] [1050/3229] lr: 0.000050 grad_norm: 0.8731 (0.9900) closs: 1.1148 (1.1508) time: 4.0979 data: 0.0002 max mem: 54684 -[01:48:30.427876] Epoch: [0] [1060/3229] lr: 0.000050 grad_norm: 0.8918 (0.9889) closs: 1.1191 (1.1504) time: 4.1224 data: 0.0002 max mem: 54684 -[01:49:10.546105] Epoch: [0] [1070/3229] lr: 0.000050 grad_norm: 0.8288 (0.9871) closs: 1.0949 (1.1494) time: 4.0703 data: 0.0002 max mem: 54684 -[01:49:51.703975] Epoch: [0] [1080/3229] lr: 0.000050 grad_norm: 0.8567 (0.9861) closs: 1.1071 (1.1494) time: 4.0637 data: 0.0002 max mem: 54684 -[01:50:32.529173] Epoch: [0] [1090/3229] lr: 0.000050 grad_norm: 0.8706 (0.9851) closs: 1.1351 (1.1491) time: 4.0991 data: 0.0002 max mem: 54684 -[01:51:12.582431] Epoch: [0] [1100/3229] lr: 0.000050 grad_norm: 0.8475 (0.9835) closs: 1.0957 (1.1482) time: 4.0438 data: 0.0002 max mem: 54684 -[01:51:53.016781] Epoch: [0] [1110/3229] lr: 0.000050 grad_norm: 0.8386 (0.9821) closs: 1.0904 (1.1474) time: 4.0243 data: 0.0002 max mem: 54684 -[01:52:33.815856] Epoch: [0] [1120/3229] lr: 0.000050 grad_norm: 0.8762 (0.9811) closs: 1.1067 (1.1470) time: 4.0616 data: 0.0002 max mem: 54684 -[01:53:14.619418] Epoch: [0] [1130/3229] lr: 0.000050 grad_norm: 0.8767 (0.9802) closs: 1.0902 (1.1461) time: 4.0801 data: 0.0002 max mem: 54684 -[01:53:55.215391] Epoch: [0] [1140/3229] lr: 0.000050 grad_norm: 0.8585 (0.9789) closs: 1.0626 (1.1455) time: 4.0699 data: 0.0002 max mem: 54684 -[01:54:36.057646] Epoch: [0] [1150/3229] lr: 0.000050 grad_norm: 0.8274 (0.9774) closs: 1.0626 (1.1444) time: 4.0718 data: 0.0002 max mem: 54684 -[01:55:16.560302] Epoch: [0] [1160/3229] lr: 0.000050 grad_norm: 0.8237 (0.9766) closs: 1.0592 (1.1436) time: 4.0672 data: 0.0002 max mem: 54684 -[01:55:57.057967] Epoch: [0] [1170/3229] lr: 0.000050 grad_norm: 0.8557 (0.9756) closs: 1.0592 (1.1425) time: 4.0499 data: 0.0002 max mem: 54684 -[01:56:37.362333] Epoch: [0] [1180/3229] lr: 0.000050 grad_norm: 0.8549 (0.9744) closs: 1.0497 (1.1417) time: 4.0400 data: 0.0002 max mem: 54684 -[01:57:18.113945] Epoch: [0] [1190/3229] lr: 0.000050 grad_norm: 0.8588 (0.9733) closs: 1.0712 (1.1410) time: 4.0527 data: 0.0002 max mem: 54684 -[01:57:58.946014] Epoch: [0] [1200/3229] lr: 0.000050 grad_norm: 0.8545 (0.9721) closs: 1.1016 (1.1406) time: 4.0791 data: 0.0002 max mem: 54684 -[01:58:39.470964] Epoch: [0] [1210/3229] lr: 0.000050 grad_norm: 0.8155 (0.9705) closs: 1.0832 (1.1398) time: 4.0678 data: 0.0002 max mem: 54684 -[01:59:20.231476] Epoch: [0] [1220/3229] lr: 0.000050 grad_norm: 0.8030 (0.9693) closs: 1.0567 (1.1392) time: 4.0642 data: 0.0002 max mem: 54684 -[02:00:01.565697] Epoch: [0] [1230/3229] lr: 0.000050 grad_norm: 0.8324 (0.9682) closs: 1.1310 (1.1394) time: 4.1047 data: 0.0002 max mem: 54684 -[02:00:42.411163] Epoch: [0] [1240/3229] lr: 0.000050 grad_norm: 0.8455 (0.9676) closs: 1.1277 (1.1390) time: 4.1089 data: 0.0002 max mem: 54684 -[02:01:22.913104] Epoch: [0] [1250/3229] lr: 0.000050 grad_norm: 0.8509 (0.9664) closs: 1.0848 (1.1386) time: 4.0673 data: 0.0002 max mem: 54684 -[02:02:03.681895] Epoch: [0] [1260/3229] lr: 0.000049 grad_norm: 0.8150 (0.9655) closs: 1.0657 (1.1378) time: 4.0635 data: 0.0002 max mem: 54684 -[02:02:44.382490] Epoch: [0] [1270/3229] lr: 0.000049 grad_norm: 0.8150 (0.9645) closs: 1.0536 (1.1371) time: 4.0734 data: 0.0002 max mem: 54684 -[02:03:25.548675] Epoch: [0] [1280/3229] lr: 0.000049 grad_norm: 0.8142 (0.9634) closs: 1.0764 (1.1366) time: 4.0933 data: 0.0002 max mem: 54684 -[02:04:05.764704] Epoch: [0] [1290/3229] lr: 0.000049 grad_norm: 0.8104 (0.9624) closs: 1.0764 (1.1362) time: 4.0690 data: 0.0002 max mem: 54684 -[02:04:46.308118] Epoch: [0] [1300/3229] lr: 0.000049 grad_norm: 0.8170 (0.9614) closs: 1.0782 (1.1357) time: 4.0379 data: 0.0002 max mem: 54684 -[02:05:27.268012] Epoch: [0] [1310/3229] lr: 0.000049 grad_norm: 0.8101 (0.9602) closs: 1.0872 (1.1352) time: 4.0751 data: 0.0002 max mem: 54684 -[02:06:07.751784] Epoch: [0] [1320/3229] lr: 0.000049 grad_norm: 0.8024 (0.9591) closs: 1.0757 (1.1345) time: 4.0721 data: 0.0002 max mem: 54684 -[02:06:48.603185] Epoch: [0] [1330/3229] lr: 0.000049 grad_norm: 0.8059 (0.9579) closs: 1.0757 (1.1343) time: 4.0667 data: 0.0002 max mem: 54684 -[02:07:29.989007] Epoch: [0] [1340/3229] lr: 0.000049 grad_norm: 0.8320 (0.9575) closs: 1.1105 (1.1342) time: 4.1118 data: 0.0002 max mem: 54684 -[02:08:10.751898] Epoch: [0] [1350/3229] lr: 0.000049 grad_norm: 0.8769 (0.9566) closs: 1.0763 (1.1338) time: 4.1074 data: 0.0002 max mem: 54684 -[02:08:51.277537] Epoch: [0] [1360/3229] lr: 0.000049 grad_norm: 0.8228 (0.9556) closs: 1.0611 (1.1332) time: 4.0644 data: 0.0002 max mem: 54684 -[02:09:32.452759] Epoch: [0] [1370/3229] lr: 0.000049 grad_norm: 0.8208 (0.9549) closs: 1.0753 (1.1329) time: 4.0850 data: 0.0002 max mem: 54684 -[02:10:13.889635] Epoch: [0] [1380/3229] lr: 0.000049 grad_norm: 0.8204 (0.9543) closs: 1.0725 (1.1325) time: 4.1305 data: 0.0002 max mem: 54684 -[02:10:54.494775] Epoch: [0] [1390/3229] lr: 0.000049 grad_norm: 0.8187 (0.9533) closs: 1.0725 (1.1321) time: 4.1020 data: 0.0002 max mem: 54684 -[02:11:35.317556] Epoch: [0] [1400/3229] lr: 0.000049 grad_norm: 0.8187 (0.9522) closs: 1.0902 (1.1317) time: 4.0713 data: 0.0002 max mem: 54684 -[02:12:16.125134] Epoch: [0] [1410/3229] lr: 0.000049 grad_norm: 0.7985 (0.9512) closs: 1.0906 (1.1314) time: 4.0814 data: 0.0002 max mem: 54684 -[02:12:57.277533] Epoch: [0] [1420/3229] lr: 0.000049 grad_norm: 0.8158 (0.9501) closs: 1.1104 (1.1314) time: 4.0979 data: 0.0002 max mem: 54684 -[02:13:37.904443] Epoch: [0] [1430/3229] lr: 0.000049 grad_norm: 0.8008 (0.9488) closs: 1.0989 (1.1309) time: 4.0889 data: 0.0002 max mem: 54684 -[02:14:18.743491] Epoch: [0] [1440/3229] lr: 0.000049 grad_norm: 0.8008 (0.9479) closs: 1.0963 (1.1307) time: 4.0732 data: 0.0002 max mem: 54684 -[02:14:59.571598] Epoch: [0] [1450/3229] lr: 0.000049 grad_norm: 0.8153 (0.9467) closs: 1.1322 (1.1308) time: 4.0833 data: 0.0002 max mem: 54684 -[02:15:40.772886] Epoch: [0] [1460/3229] lr: 0.000049 grad_norm: 0.8315 (0.9465) closs: 1.1439 (1.1306) time: 4.1014 data: 0.0002 max mem: 54684 -[02:16:21.736547] Epoch: [0] [1470/3229] lr: 0.000049 grad_norm: 0.8342 (0.9457) closs: 1.0850 (1.1301) time: 4.1082 data: 0.0002 max mem: 54684 -[02:17:02.910967] Epoch: [0] [1480/3229] lr: 0.000049 grad_norm: 0.7991 (0.9446) closs: 1.0827 (1.1298) time: 4.1068 data: 0.0002 max mem: 54684 -[02:17:44.083251] Epoch: [0] [1490/3229] lr: 0.000049 grad_norm: 0.7991 (0.9442) closs: 1.1022 (1.1297) time: 4.1173 data: 0.0002 max mem: 54684 -[02:18:26.042677] Epoch: [0] [1500/3229] lr: 0.000049 grad_norm: 0.8720 (0.9438) closs: 1.1096 (1.1297) time: 4.1565 data: 0.0002 max mem: 54684 -[02:19:06.874119] Epoch: [0] [1510/3229] lr: 0.000049 grad_norm: 0.8466 (0.9430) closs: 1.0920 (1.1293) time: 4.1395 data: 0.0002 max mem: 54684 -[02:19:48.004002] Epoch: [0] [1520/3229] lr: 0.000049 grad_norm: 0.8431 (0.9425) closs: 1.0898 (1.1291) time: 4.0980 data: 0.0002 max mem: 54684 -[02:20:29.271406] Epoch: [0] [1530/3229] lr: 0.000049 grad_norm: 0.8229 (0.9419) closs: 1.0907 (1.1289) time: 4.1198 data: 0.0002 max mem: 54684 -[02:21:10.763947] Epoch: [0] [1540/3229] lr: 0.000049 grad_norm: 0.8301 (0.9413) closs: 1.1024 (1.1285) time: 4.1379 data: 0.0002 max mem: 54684 -[02:21:50.930108] Epoch: [0] [1550/3229] lr: 0.000049 grad_norm: 0.8301 (0.9403) closs: 1.0550 (1.1277) time: 4.0829 data: 0.0002 max mem: 54684 -[02:22:31.739856] Epoch: [0] [1560/3229] lr: 0.000049 grad_norm: 0.7819 (0.9392) closs: 1.0473 (1.1272) time: 4.0487 data: 0.0002 max mem: 54684 -[02:23:12.906228] Epoch: [0] [1570/3229] lr: 0.000049 grad_norm: 0.7600 (0.9384) closs: 1.0405 (1.1268) time: 4.0987 data: 0.0002 max mem: 54684 -[02:23:54.080302] Epoch: [0] [1580/3229] lr: 0.000049 grad_norm: 0.8070 (0.9376) closs: 1.0689 (1.1264) time: 4.1169 data: 0.0002 max mem: 54684 -[02:24:34.900214] Epoch: [0] [1590/3229] lr: 0.000049 grad_norm: 0.8137 (0.9367) closs: 1.0689 (1.1259) time: 4.0996 data: 0.0002 max mem: 54684 -[02:25:14.745725] Epoch: [0] [1600/3229] lr: 0.000049 grad_norm: 0.7806 (0.9354) closs: 1.0465 (1.1251) time: 4.0332 data: 0.0002 max mem: 54684 -[02:25:55.704023] Epoch: [0] [1610/3229] lr: 0.000049 grad_norm: 0.8074 (0.9348) closs: 1.0935 (1.1250) time: 4.0401 data: 0.0002 max mem: 54684 -[02:26:37.197949] Epoch: [0] [1620/3229] lr: 0.000049 grad_norm: 0.8162 (0.9342) closs: 1.1148 (1.1250) time: 4.1225 data: 0.0002 max mem: 54684 -[02:27:18.326238] Epoch: [0] [1630/3229] lr: 0.000049 grad_norm: 0.8162 (0.9335) closs: 1.0970 (1.1247) time: 4.1310 data: 0.0002 max mem: 54684 -[02:27:59.451089] Epoch: [0] [1640/3229] lr: 0.000049 grad_norm: 0.8227 (0.9328) closs: 1.1007 (1.1248) time: 4.1126 data: 0.0002 max mem: 54684 -[02:28:40.359589] Epoch: [0] [1650/3229] lr: 0.000049 grad_norm: 0.8227 (0.9321) closs: 1.0987 (1.1244) time: 4.1016 data: 0.0002 max mem: 54684 -[02:29:21.499384] Epoch: [0] [1660/3229] lr: 0.000049 grad_norm: 0.7906 (0.9314) closs: 1.0815 (1.1243) time: 4.1023 data: 0.0002 max mem: 54684 -[02:30:01.961500] Epoch: [0] [1670/3229] lr: 0.000049 grad_norm: 0.7906 (0.9306) closs: 1.1051 (1.1242) time: 4.0800 data: 0.0002 max mem: 54684 -[02:30:43.093943] Epoch: [0] [1680/3229] lr: 0.000049 grad_norm: 0.7620 (0.9296) closs: 1.0947 (1.1241) time: 4.0797 data: 0.0002 max mem: 54684 -[02:31:23.378880] Epoch: [0] [1690/3229] lr: 0.000049 grad_norm: 0.7389 (0.9285) closs: 1.0770 (1.1237) time: 4.0708 data: 0.0002 max mem: 54684 -[02:32:04.494682] Epoch: [0] [1700/3229] lr: 0.000049 grad_norm: 0.7654 (0.9278) closs: 1.0655 (1.1234) time: 4.0700 data: 0.0002 max mem: 54684 -[02:32:44.682897] Epoch: [0] [1710/3229] lr: 0.000048 grad_norm: 0.7788 (0.9267) closs: 1.0655 (1.1229) time: 4.0651 data: 0.0002 max mem: 54684 -[02:33:25.176480] Epoch: [0] [1720/3229] lr: 0.000048 grad_norm: 0.7793 (0.9260) closs: 1.0609 (1.1226) time: 4.0340 data: 0.0002 max mem: 54684 -[02:34:06.615615] Epoch: [0] [1730/3229] lr: 0.000048 grad_norm: 0.7983 (0.9254) closs: 1.0603 (1.1224) time: 4.0966 data: 0.0002 max mem: 54684 -[02:34:47.629024] Epoch: [0] [1740/3229] lr: 0.000048 grad_norm: 0.7874 (0.9245) closs: 1.0761 (1.1222) time: 4.1226 data: 0.0002 max mem: 54684 -[02:35:28.439728] Epoch: [0] [1750/3229] lr: 0.000048 grad_norm: 0.8113 (0.9241) closs: 1.0861 (1.1220) time: 4.0911 data: 0.0002 max mem: 54684 -[02:36:09.569880] Epoch: [0] [1760/3229] lr: 0.000048 grad_norm: 0.8314 (0.9236) closs: 1.1081 (1.1218) time: 4.0970 data: 0.0002 max mem: 54684 -[02:36:50.672861] Epoch: [0] [1770/3229] lr: 0.000048 grad_norm: 0.8306 (0.9229) closs: 1.0888 (1.1214) time: 4.1116 data: 0.0002 max mem: 54684 -[02:37:31.667284] Epoch: [0] [1780/3229] lr: 0.000048 grad_norm: 0.8092 (0.9221) closs: 1.0832 (1.1211) time: 4.1048 data: 0.0002 max mem: 54684 -[02:38:12.473138] Epoch: [0] [1790/3229] lr: 0.000048 grad_norm: 0.7678 (0.9212) closs: 1.0395 (1.1207) time: 4.0899 data: 0.0002 max mem: 54684 -[02:38:52.942058] Epoch: [0] [1800/3229] lr: 0.000048 grad_norm: 0.7929 (0.9204) closs: 1.0356 (1.1204) time: 4.0637 data: 0.0002 max mem: 54684 -[02:39:33.666135] Epoch: [0] [1810/3229] lr: 0.000048 grad_norm: 0.7905 (0.9195) closs: 1.0807 (1.1202) time: 4.0596 data: 0.0002 max mem: 54684 -[02:40:14.381280] Epoch: [0] [1820/3229] lr: 0.000048 grad_norm: 0.7724 (0.9187) closs: 1.1055 (1.1200) time: 4.0719 data: 0.0002 max mem: 54684 -[02:40:55.197221] Epoch: [0] [1830/3229] lr: 0.000048 grad_norm: 0.7724 (0.9180) closs: 1.0897 (1.1199) time: 4.0765 data: 0.0002 max mem: 54684 -[02:41:35.043279] Epoch: [0] [1840/3229] lr: 0.000048 grad_norm: 0.7413 (0.9168) closs: 1.0573 (1.1195) time: 4.0330 data: 0.0002 max mem: 54684 -[02:42:16.132889] Epoch: [0] [1850/3229] lr: 0.000048 grad_norm: 0.7712 (0.9161) closs: 1.0468 (1.1191) time: 4.0467 data: 0.0002 max mem: 54684 -[02:42:56.518513] Epoch: [0] [1860/3229] lr: 0.000048 grad_norm: 0.7990 (0.9154) closs: 1.0714 (1.1188) time: 4.0737 data: 0.0002 max mem: 54684 -[02:43:37.655055] Epoch: [0] [1870/3229] lr: 0.000048 grad_norm: 0.8236 (0.9150) closs: 1.1020 (1.1187) time: 4.0760 data: 0.0002 max mem: 54684 -[02:44:18.501800] Epoch: [0] [1880/3229] lr: 0.000048 grad_norm: 0.8055 (0.9143) closs: 1.0773 (1.1183) time: 4.0991 data: 0.0002 max mem: 54684 -[02:44:59.327626] Epoch: [0] [1890/3229] lr: 0.000048 grad_norm: 0.7469 (0.9135) closs: 1.0612 (1.1180) time: 4.0836 data: 0.0002 max mem: 54684 -[02:45:40.957982] Epoch: [0] [1900/3229] lr: 0.000048 grad_norm: 0.8118 (0.9132) closs: 1.0947 (1.1179) time: 4.1227 data: 0.0002 max mem: 54684 -[02:46:21.456520] Epoch: [0] [1910/3229] lr: 0.000048 grad_norm: 0.8009 (0.9124) closs: 1.0516 (1.1173) time: 4.1064 data: 0.0002 max mem: 54684 -[02:47:01.609291] Epoch: [0] [1920/3229] lr: 0.000048 grad_norm: 0.8009 (0.9121) closs: 1.0265 (1.1168) time: 4.0325 data: 0.0002 max mem: 54684 -[02:47:43.618438] Epoch: [0] [1930/3229] lr: 0.000048 grad_norm: 0.8074 (0.9115) closs: 1.0395 (1.1167) time: 4.1080 data: 0.0002 max mem: 54684 -[02:48:24.870145] Epoch: [0] [1940/3229] lr: 0.000048 grad_norm: 0.7995 (0.9110) closs: 1.1143 (1.1167) time: 4.1630 data: 0.0002 max mem: 54684 -[02:49:06.019794] Epoch: [0] [1950/3229] lr: 0.000048 grad_norm: 0.8393 (0.9109) closs: 1.1143 (1.1166) time: 4.1200 data: 0.0001 max mem: 54684 -[02:49:46.491553] Epoch: [0] [1960/3229] lr: 0.000048 grad_norm: 0.8373 (0.9102) closs: 1.0866 (1.1165) time: 4.0810 data: 0.0002 max mem: 54684 -[02:50:27.704222] Epoch: [0] [1970/3229] lr: 0.000048 grad_norm: 0.8284 (0.9099) closs: 1.0866 (1.1163) time: 4.0841 data: 0.0002 max mem: 54684 -[02:51:08.271766] Epoch: [0] [1980/3229] lr: 0.000048 grad_norm: 0.7955 (0.9092) closs: 1.0799 (1.1161) time: 4.0890 data: 0.0002 max mem: 54684 -[02:51:49.381940] Epoch: [0] [1990/3229] lr: 0.000048 grad_norm: 0.7955 (0.9090) closs: 1.0799 (1.1159) time: 4.0838 data: 0.0002 max mem: 54684 -[02:52:29.511376] Epoch: [0] [2000/3229] lr: 0.000048 grad_norm: 0.8154 (0.9083) closs: 1.0683 (1.1154) time: 4.0619 data: 0.0002 max mem: 54684 -[02:53:10.897602] Epoch: [0] [2010/3229] lr: 0.000048 grad_norm: 0.8103 (0.9078) closs: 1.0683 (1.1152) time: 4.0757 data: 0.0002 max mem: 54684 -[02:53:51.562695] Epoch: [0] [2020/3229] lr: 0.000047 grad_norm: 0.8103 (0.9072) closs: 1.0827 (1.1151) time: 4.1025 data: 0.0002 max mem: 54684 -[02:54:31.690739] Epoch: [0] [2030/3229] lr: 0.000047 grad_norm: 0.7805 (0.9065) closs: 1.0595 (1.1145) time: 4.0396 data: 0.0002 max mem: 54684 -[02:55:12.459763] Epoch: [0] [2040/3229] lr: 0.000047 grad_norm: 0.7728 (0.9058) closs: 1.0613 (1.1143) time: 4.0448 data: 0.0002 max mem: 54684 -[02:55:54.276314] Epoch: [0] [2050/3229] lr: 0.000047 grad_norm: 0.7637 (0.9051) closs: 1.1000 (1.1144) time: 4.1292 data: 0.0002 max mem: 54684 -[02:56:35.163977] Epoch: [0] [2060/3229] lr: 0.000047 grad_norm: 0.7644 (0.9045) closs: 1.1134 (1.1143) time: 4.1351 data: 0.0002 max mem: 54684 -[02:57:15.269544] Epoch: [0] [2070/3229] lr: 0.000047 grad_norm: 0.7892 (0.9039) closs: 1.0588 (1.1138) time: 4.0496 data: 0.0002 max mem: 54684 -[02:57:56.040287] Epoch: [0] [2080/3229] lr: 0.000047 grad_norm: 0.8042 (0.9033) closs: 1.0447 (1.1136) time: 4.0438 data: 0.0002 max mem: 54684 -[02:58:36.828734] Epoch: [0] [2090/3229] lr: 0.000047 grad_norm: 0.7867 (0.9027) closs: 1.0914 (1.1133) time: 4.0779 data: 0.0002 max mem: 54684 -[02:59:17.056084] Epoch: [0] [2100/3229] lr: 0.000047 grad_norm: 0.7766 (0.9021) closs: 1.0796 (1.1131) time: 4.0507 data: 0.0002 max mem: 54684 -[02:59:57.850948] Epoch: [0] [2110/3229] lr: 0.000047 grad_norm: 0.7773 (0.9016) closs: 1.0748 (1.1129) time: 4.0510 data: 0.0002 max mem: 54684 -[03:00:38.445171] Epoch: [0] [2120/3229] lr: 0.000047 grad_norm: 0.8051 (0.9012) closs: 1.0689 (1.1126) time: 4.0694 data: 0.0002 max mem: 54684 -[03:01:19.387745] Epoch: [0] [2130/3229] lr: 0.000047 grad_norm: 0.8057 (0.9006) closs: 1.0761 (1.1126) time: 4.0768 data: 0.0002 max mem: 54684 -[03:01:59.964804] Epoch: [0] [2140/3229] lr: 0.000047 grad_norm: 0.8057 (0.9000) closs: 1.1012 (1.1124) time: 4.0759 data: 0.0002 max mem: 54684 -[03:02:40.770931] Epoch: [0] [2150/3229] lr: 0.000047 grad_norm: 0.7907 (0.8995) closs: 1.0643 (1.1121) time: 4.0691 data: 0.0002 max mem: 54684 -[03:03:21.686342] Epoch: [0] [2160/3229] lr: 0.000047 grad_norm: 0.7718 (0.8990) closs: 1.0844 (1.1122) time: 4.0860 data: 0.0002 max mem: 54684 -[03:04:02.991052] Epoch: [0] [2170/3229] lr: 0.000047 grad_norm: 0.7893 (0.8986) closs: 1.1509 (1.1124) time: 4.1109 data: 0.0002 max mem: 54684 -[03:04:44.216654] Epoch: [0] [2180/3229] lr: 0.000047 grad_norm: 0.8023 (0.8981) closs: 1.0995 (1.1123) time: 4.1264 data: 0.0002 max mem: 54684 -[03:05:24.677881] Epoch: [0] [2190/3229] lr: 0.000047 grad_norm: 0.8062 (0.8976) closs: 1.0876 (1.1122) time: 4.0843 data: 0.0002 max mem: 54684 -[03:06:05.666908] Epoch: [0] [2200/3229] lr: 0.000047 grad_norm: 0.8079 (0.8971) closs: 1.0597 (1.1120) time: 4.0724 data: 0.0002 max mem: 54684 -[03:06:46.537094] Epoch: [0] [2210/3229] lr: 0.000047 grad_norm: 0.8028 (0.8967) closs: 1.0597 (1.1120) time: 4.0929 data: 0.0002 max mem: 54684 -[03:07:27.457503] Epoch: [0] [2220/3229] lr: 0.000047 grad_norm: 0.8028 (0.8962) closs: 1.0845 (1.1117) time: 4.0895 data: 0.0002 max mem: 54684 -[03:08:08.278999] Epoch: [0] [2230/3229] lr: 0.000047 grad_norm: 0.7934 (0.8957) closs: 1.0507 (1.1114) time: 4.0870 data: 0.0002 max mem: 54684 -[03:08:48.891856] Epoch: [0] [2240/3229] lr: 0.000047 grad_norm: 0.7621 (0.8951) closs: 1.0717 (1.1111) time: 4.0717 data: 0.0002 max mem: 54684 -[03:09:29.597430] Epoch: [0] [2250/3229] lr: 0.000047 grad_norm: 0.7756 (0.8946) closs: 1.0717 (1.1110) time: 4.0659 data: 0.0002 max mem: 54684 -[03:10:10.515536] Epoch: [0] [2260/3229] lr: 0.000047 grad_norm: 0.7727 (0.8940) closs: 1.0694 (1.1108) time: 4.0811 data: 0.0002 max mem: 54684 -[03:10:51.637059] Epoch: [0] [2270/3229] lr: 0.000047 grad_norm: 0.7727 (0.8936) closs: 1.0694 (1.1106) time: 4.1019 data: 0.0002 max mem: 54684 -[03:11:32.595121] Epoch: [0] [2280/3229] lr: 0.000046 grad_norm: 0.7681 (0.8929) closs: 1.0451 (1.1103) time: 4.1039 data: 0.0002 max mem: 54684 -[03:12:13.624396] Epoch: [0] [2290/3229] lr: 0.000046 grad_norm: 0.7482 (0.8924) closs: 1.0431 (1.1103) time: 4.0993 data: 0.0002 max mem: 54684 -[03:12:54.747566] Epoch: [0] [2300/3229] lr: 0.000046 grad_norm: 0.7703 (0.8920) closs: 1.1258 (1.1103) time: 4.1076 data: 0.0002 max mem: 54684 -[03:13:34.899825] Epoch: [0] [2310/3229] lr: 0.000046 grad_norm: 0.7731 (0.8914) closs: 1.0817 (1.1101) time: 4.0637 data: 0.0002 max mem: 54684 -[03:14:15.886367] Epoch: [0] [2320/3229] lr: 0.000046 grad_norm: 0.8009 (0.8911) closs: 1.0688 (1.1099) time: 4.0569 data: 0.0002 max mem: 54684 -[03:14:56.926589] Epoch: [0] [2330/3229] lr: 0.000046 grad_norm: 0.7560 (0.8905) closs: 1.0458 (1.1094) time: 4.1013 data: 0.0002 max mem: 54684 -[03:15:37.678425] Epoch: [0] [2340/3229] lr: 0.000046 grad_norm: 0.7591 (0.8902) closs: 1.0475 (1.1094) time: 4.0895 data: 0.0002 max mem: 54684 -[03:16:18.128561] Epoch: [0] [2350/3229] lr: 0.000046 grad_norm: 0.8028 (0.8896) closs: 1.0808 (1.1091) time: 4.0600 data: 0.0002 max mem: 54684 -[03:16:59.115639] Epoch: [0] [2360/3229] lr: 0.000046 grad_norm: 0.7890 (0.8892) closs: 1.0613 (1.1090) time: 4.0718 data: 0.0002 max mem: 54684 -[03:17:39.826484] Epoch: [0] [2370/3229] lr: 0.000046 grad_norm: 0.7886 (0.8888) closs: 1.0808 (1.1091) time: 4.0848 data: 0.0002 max mem: 54684 -[03:18:20.921523] Epoch: [0] [2380/3229] lr: 0.000046 grad_norm: 0.7769 (0.8883) closs: 1.1109 (1.1091) time: 4.0902 data: 0.0002 max mem: 54684 -[03:19:01.354222] Epoch: [0] [2390/3229] lr: 0.000046 grad_norm: 0.7578 (0.8877) closs: 1.0823 (1.1089) time: 4.0763 data: 0.0002 max mem: 54684 -[03:19:42.341834] Epoch: [0] [2400/3229] lr: 0.000046 grad_norm: 0.7938 (0.8873) closs: 1.0722 (1.1086) time: 4.0710 data: 0.0002 max mem: 54684 -[03:20:23.981102] Epoch: [0] [2410/3229] lr: 0.000046 grad_norm: 0.8303 (0.8871) closs: 1.0663 (1.1085) time: 4.1313 data: 0.0002 max mem: 54684 -[03:21:04.723017] Epoch: [0] [2420/3229] lr: 0.000046 grad_norm: 0.8142 (0.8867) closs: 1.0663 (1.1083) time: 4.1190 data: 0.0002 max mem: 54684 -[03:21:45.161994] Epoch: [0] [2430/3229] lr: 0.000046 grad_norm: 0.7785 (0.8863) closs: 1.0653 (1.1080) time: 4.0590 data: 0.0002 max mem: 54684 -[03:22:26.409460] Epoch: [0] [2440/3229] lr: 0.000046 grad_norm: 0.7811 (0.8859) closs: 1.0728 (1.1080) time: 4.0843 data: 0.0002 max mem: 54684 -[03:23:07.737120] Epoch: [0] [2450/3229] lr: 0.000046 grad_norm: 0.7773 (0.8855) closs: 1.1207 (1.1079) time: 4.1287 data: 0.0002 max mem: 54684 -[03:23:48.818206] Epoch: [0] [2460/3229] lr: 0.000046 grad_norm: 0.7819 (0.8853) closs: 1.0838 (1.1078) time: 4.1204 data: 0.0002 max mem: 54684 -[03:24:29.929209] Epoch: [0] [2470/3229] lr: 0.000046 grad_norm: 0.8072 (0.8851) closs: 1.0518 (1.1075) time: 4.1095 data: 0.0002 max mem: 54684 -[03:25:10.886008] Epoch: [0] [2480/3229] lr: 0.000046 grad_norm: 0.7804 (0.8846) closs: 1.0573 (1.1073) time: 4.1033 data: 0.0002 max mem: 54684 -[03:25:52.557958] Epoch: [0] [2490/3229] lr: 0.000046 grad_norm: 0.7737 (0.8845) closs: 1.0763 (1.1072) time: 4.1314 data: 0.0002 max mem: 54684 -[03:26:33.671349] Epoch: [0] [2500/3229] lr: 0.000045 grad_norm: 0.7789 (0.8840) closs: 1.1176 (1.1073) time: 4.1392 data: 0.0002 max mem: 54684 -[03:27:14.135334] Epoch: [0] [2510/3229] lr: 0.000045 grad_norm: 0.7701 (0.8835) closs: 1.0906 (1.1070) time: 4.0788 data: 0.0002 max mem: 54684 -[03:27:55.432584] Epoch: [0] [2520/3229] lr: 0.000045 grad_norm: 0.7708 (0.8832) closs: 1.0723 (1.1069) time: 4.0880 data: 0.0002 max mem: 54684 -[03:28:36.466155] Epoch: [0] [2530/3229] lr: 0.000045 grad_norm: 0.7946 (0.8827) closs: 1.0810 (1.1069) time: 4.1165 data: 0.0002 max mem: 54684 -[03:29:17.239455] Epoch: [0] [2540/3229] lr: 0.000045 grad_norm: 0.7615 (0.8823) closs: 1.0746 (1.1067) time: 4.0903 data: 0.0002 max mem: 54684 -[03:29:58.019807] Epoch: [0] [2550/3229] lr: 0.000045 grad_norm: 0.7483 (0.8819) closs: 1.0516 (1.1065) time: 4.0776 data: 0.0002 max mem: 54684 -[03:30:39.329898] Epoch: [0] [2560/3229] lr: 0.000045 grad_norm: 0.7708 (0.8816) closs: 1.0516 (1.1063) time: 4.1045 data: 0.0002 max mem: 54684 -[03:31:20.317679] Epoch: [0] [2570/3229] lr: 0.000045 grad_norm: 0.7788 (0.8812) closs: 1.0652 (1.1062) time: 4.1148 data: 0.0002 max mem: 54684 -[03:32:01.406782] Epoch: [0] [2580/3229] lr: 0.000045 grad_norm: 0.7574 (0.8808) closs: 1.0688 (1.1062) time: 4.1038 data: 0.0002 max mem: 54684 -[03:32:41.538400] Epoch: [0] [2590/3229] lr: 0.000045 grad_norm: 0.7515 (0.8802) closs: 1.0843 (1.1059) time: 4.0610 data: 0.0002 max mem: 54684 -[03:33:22.786415] Epoch: [0] [2600/3229] lr: 0.000045 grad_norm: 0.8014 (0.8799) closs: 1.0804 (1.1058) time: 4.0689 data: 0.0002 max mem: 54684 -[03:34:03.508058] Epoch: [0] [2610/3229] lr: 0.000045 grad_norm: 0.8014 (0.8794) closs: 1.0795 (1.1055) time: 4.0984 data: 0.0002 max mem: 54684 -[03:34:43.958992] Epoch: [0] [2620/3229] lr: 0.000045 grad_norm: 0.7646 (0.8789) closs: 1.0806 (1.1054) time: 4.0586 data: 0.0002 max mem: 54684 -[03:35:25.187896] Epoch: [0] [2630/3229] lr: 0.000045 grad_norm: 0.7696 (0.8785) closs: 1.1043 (1.1055) time: 4.0839 data: 0.0002 max mem: 54684 -[03:36:06.087296] Epoch: [0] [2640/3229] lr: 0.000045 grad_norm: 0.8024 (0.8782) closs: 1.1064 (1.1054) time: 4.1063 data: 0.0002 max mem: 54684 -[03:36:46.644106] Epoch: [0] [2650/3229] lr: 0.000045 grad_norm: 0.7676 (0.8777) closs: 1.0427 (1.1051) time: 4.0727 data: 0.0002 max mem: 54684 -[03:37:27.758262] Epoch: [0] [2660/3229] lr: 0.000045 grad_norm: 0.7810 (0.8776) closs: 1.0405 (1.1049) time: 4.0835 data: 0.0002 max mem: 54684 -[03:38:08.680275] Epoch: [0] [2670/3229] lr: 0.000045 grad_norm: 0.8024 (0.8774) closs: 1.0546 (1.1047) time: 4.1017 data: 0.0002 max mem: 54684 -[03:38:49.498507] Epoch: [0] [2680/3229] lr: 0.000045 grad_norm: 0.7959 (0.8770) closs: 1.0579 (1.1046) time: 4.0869 data: 0.0002 max mem: 54684 -[03:39:30.230437] Epoch: [0] [2690/3229] lr: 0.000045 grad_norm: 0.7603 (0.8766) closs: 1.0688 (1.1044) time: 4.0774 data: 0.0002 max mem: 54684 -[03:40:10.699217] Epoch: [0] [2700/3229] lr: 0.000045 grad_norm: 0.7214 (0.8762) closs: 1.0769 (1.1043) time: 4.0600 data: 0.0002 max mem: 54684 -[03:40:51.627601] Epoch: [0] [2710/3229] lr: 0.000044 grad_norm: 0.7282 (0.8757) closs: 1.0952 (1.1042) time: 4.0698 data: 0.0002 max mem: 54684 -[03:41:32.962290] Epoch: [0] [2720/3229] lr: 0.000044 grad_norm: 0.7610 (0.8755) closs: 1.0952 (1.1042) time: 4.1131 data: 0.0002 max mem: 54684 -[03:42:13.881942] Epoch: [0] [2730/3229] lr: 0.000044 grad_norm: 0.7852 (0.8752) closs: 1.1198 (1.1041) time: 4.1127 data: 0.0002 max mem: 54684 -[03:42:54.354106] Epoch: [0] [2740/3229] lr: 0.000044 grad_norm: 0.7901 (0.8749) closs: 1.0710 (1.1039) time: 4.0695 data: 0.0002 max mem: 54684 -[03:43:35.270721] Epoch: [0] [2750/3229] lr: 0.000044 grad_norm: 0.7921 (0.8746) closs: 1.0635 (1.1037) time: 4.0694 data: 0.0002 max mem: 54684 -[03:44:16.560153] Epoch: [0] [2760/3229] lr: 0.000044 grad_norm: 0.7653 (0.8742) closs: 1.0744 (1.1036) time: 4.1102 data: 0.0002 max mem: 54684 -[03:44:57.121862] Epoch: [0] [2770/3229] lr: 0.000044 grad_norm: 0.7653 (0.8738) closs: 1.0760 (1.1034) time: 4.0925 data: 0.0002 max mem: 54684 -[03:45:37.257683] Epoch: [0] [2780/3229] lr: 0.000044 grad_norm: 0.7473 (0.8732) closs: 1.0355 (1.1030) time: 4.0348 data: 0.0002 max mem: 54684 -[03:46:17.901019] Epoch: [0] [2790/3229] lr: 0.000044 grad_norm: 0.7473 (0.8729) closs: 1.0248 (1.1028) time: 4.0389 data: 0.0002 max mem: 54684 -[03:46:58.454326] Epoch: [0] [2800/3229] lr: 0.000044 grad_norm: 0.7700 (0.8725) closs: 1.0784 (1.1027) time: 4.0598 data: 0.0002 max mem: 54684 -[03:47:39.337936] Epoch: [0] [2810/3229] lr: 0.000044 grad_norm: 0.7546 (0.8720) closs: 1.0590 (1.1026) time: 4.0718 data: 0.0002 max mem: 54684 -[03:48:20.119515] Epoch: [0] [2820/3229] lr: 0.000044 grad_norm: 0.7763 (0.8717) closs: 1.0590 (1.1024) time: 4.0832 data: 0.0002 max mem: 54684 -[03:49:01.136368] Epoch: [0] [2830/3229] lr: 0.000044 grad_norm: 0.7699 (0.8713) closs: 1.0643 (1.1023) time: 4.0899 data: 0.0002 max mem: 54684 -[03:49:41.768883] Epoch: [0] [2840/3229] lr: 0.000044 grad_norm: 0.7428 (0.8708) closs: 1.0609 (1.1022) time: 4.0824 data: 0.0002 max mem: 54684 -[03:50:22.978721] Epoch: [0] [2850/3229] lr: 0.000044 grad_norm: 0.7529 (0.8704) closs: 1.0452 (1.1021) time: 4.0921 data: 0.0002 max mem: 54684 -[03:51:03.103441] Epoch: [0] [2860/3229] lr: 0.000044 grad_norm: 0.7534 (0.8699) closs: 1.0985 (1.1020) time: 4.0667 data: 0.0002 max mem: 54684 -[03:51:44.422934] Epoch: [0] [2870/3229] lr: 0.000044 grad_norm: 0.7563 (0.8695) closs: 1.0943 (1.1019) time: 4.0721 data: 0.0002 max mem: 54684 -[03:52:25.402059] Epoch: [0] [2880/3229] lr: 0.000044 grad_norm: 0.7777 (0.8691) closs: 1.1125 (1.1020) time: 4.1149 data: 0.0002 max mem: 54684 -[03:53:06.008254] Epoch: [0] [2890/3229] lr: 0.000043 grad_norm: 0.7777 (0.8688) closs: 1.1062 (1.1018) time: 4.0792 data: 0.0002 max mem: 54684 -[03:53:47.142690] Epoch: [0] [2900/3229] lr: 0.000043 grad_norm: 0.7781 (0.8685) closs: 1.0867 (1.1017) time: 4.0870 data: 0.0002 max mem: 54684 -[03:54:28.114121] Epoch: [0] [2910/3229] lr: 0.000043 grad_norm: 0.7930 (0.8682) closs: 1.0739 (1.1015) time: 4.1052 data: 0.0002 max mem: 54684 -[03:55:09.721076] Epoch: [0] [2920/3229] lr: 0.000043 grad_norm: 0.7677 (0.8679) closs: 1.0869 (1.1015) time: 4.1289 data: 0.0002 max mem: 54684 -[03:55:50.967003] Epoch: [0] [2930/3229] lr: 0.000043 grad_norm: 0.8109 (0.8678) closs: 1.0987 (1.1014) time: 4.1426 data: 0.0002 max mem: 54684 -[03:56:31.795882] Epoch: [0] [2940/3229] lr: 0.000043 grad_norm: 0.8109 (0.8675) closs: 1.0884 (1.1015) time: 4.1037 data: 0.0002 max mem: 54684 -[03:57:13.073814] Epoch: [0] [2950/3229] lr: 0.000043 grad_norm: 0.7720 (0.8672) closs: 1.0852 (1.1014) time: 4.1053 data: 0.0002 max mem: 54684 -[03:57:54.032832] Epoch: [0] [2960/3229] lr: 0.000043 grad_norm: 0.7005 (0.8666) closs: 1.0610 (1.1012) time: 4.1118 data: 0.0002 max mem: 54684 -[03:58:34.931320] Epoch: [0] [2970/3229] lr: 0.000043 grad_norm: 0.7079 (0.8663) closs: 1.0292 (1.1009) time: 4.0928 data: 0.0002 max mem: 54684 -[03:59:16.050440] Epoch: [0] [2980/3229] lr: 0.000043 grad_norm: 0.7676 (0.8661) closs: 1.0698 (1.1009) time: 4.1008 data: 0.0002 max mem: 54684 -[03:59:57.343357] Epoch: [0] [2990/3229] lr: 0.000043 grad_norm: 0.7941 (0.8659) closs: 1.0382 (1.1006) time: 4.1205 data: 0.0002 max mem: 54684 -[04:00:38.675134] Epoch: [0] [3000/3229] lr: 0.000043 grad_norm: 0.8105 (0.8657) closs: 1.0089 (1.1003) time: 4.1312 data: 0.0002 max mem: 54684 -[04:01:19.570077] Epoch: [0] [3010/3229] lr: 0.000043 grad_norm: 0.7900 (0.8653) closs: 1.0515 (1.1003) time: 4.1113 data: 0.0002 max mem: 54684 -[04:02:00.671184] Epoch: [0] [3020/3229] lr: 0.000043 grad_norm: 0.7517 (0.8650) closs: 1.0734 (1.1003) time: 4.0997 data: 0.0002 max mem: 54684 -[04:02:40.993797] Epoch: [0] [3030/3229] lr: 0.000043 grad_norm: 0.7689 (0.8646) closs: 1.0596 (1.1000) time: 4.0711 data: 0.0002 max mem: 54684 -[04:03:22.299784] Epoch: [0] [3040/3229] lr: 0.000043 grad_norm: 0.7714 (0.8643) closs: 1.0596 (1.0998) time: 4.0814 data: 0.0002 max mem: 54684 -[04:04:03.204281] Epoch: [0] [3050/3229] lr: 0.000043 grad_norm: 0.7578 (0.8639) closs: 1.0455 (1.0997) time: 4.1105 data: 0.0002 max mem: 54684 -[04:04:44.303035] Epoch: [0] [3060/3229] lr: 0.000043 grad_norm: 0.7642 (0.8637) closs: 1.0497 (1.0996) time: 4.1001 data: 0.0002 max mem: 54684 -[04:05:25.256193] Epoch: [0] [3070/3229] lr: 0.000042 grad_norm: 0.7642 (0.8633) closs: 1.0778 (1.0996) time: 4.1025 data: 0.0002 max mem: 54684 -[04:06:05.981901] Epoch: [0] [3080/3229] lr: 0.000042 grad_norm: 0.7319 (0.8628) closs: 1.0596 (1.0994) time: 4.0839 data: 0.0002 max mem: 54684 -[04:06:46.434233] Epoch: [0] [3090/3229] lr: 0.000042 grad_norm: 0.7442 (0.8624) closs: 1.0718 (1.0993) time: 4.0588 data: 0.0002 max mem: 54684 -[04:07:27.009193] Epoch: [0] [3100/3229] lr: 0.000042 grad_norm: 0.7568 (0.8619) closs: 1.1194 (1.0992) time: 4.0513 data: 0.0002 max mem: 54684 -[04:08:07.508250] Epoch: [0] [3110/3229] lr: 0.000042 grad_norm: 0.7233 (0.8615) closs: 1.0551 (1.0991) time: 4.0536 data: 0.0002 max mem: 54684 -[04:08:48.595599] Epoch: [0] [3120/3229] lr: 0.000042 grad_norm: 0.7674 (0.8612) closs: 1.0551 (1.0989) time: 4.0793 data: 0.0002 max mem: 54684 -[04:09:28.728428] Epoch: [0] [3130/3229] lr: 0.000042 grad_norm: 0.7612 (0.8607) closs: 1.0629 (1.0987) time: 4.0609 data: 0.0002 max mem: 54684 -[04:10:08.523858] Epoch: [0] [3140/3229] lr: 0.000042 grad_norm: 0.6953 (0.8602) closs: 1.0381 (1.0985) time: 3.9964 data: 0.0002 max mem: 54684 -[04:10:49.845722] Epoch: [0] [3150/3229] lr: 0.000042 grad_norm: 0.7319 (0.8599) closs: 1.0381 (1.0984) time: 4.0558 data: 0.0002 max mem: 54684 -[04:11:30.298486] Epoch: [0] [3160/3229] lr: 0.000042 grad_norm: 0.7591 (0.8595) closs: 1.0370 (1.0981) time: 4.0887 data: 0.0002 max mem: 54684 -[04:12:11.068402] Epoch: [0] [3170/3229] lr: 0.000042 grad_norm: 0.7661 (0.8592) closs: 1.0615 (1.0980) time: 4.0611 data: 0.0002 max mem: 54684 -[04:12:51.939751] Epoch: [0] [3180/3229] lr: 0.000042 grad_norm: 0.8030 (0.8590) closs: 1.0806 (1.0979) time: 4.0820 data: 0.0002 max mem: 54684 -[04:13:32.816037] Epoch: [0] [3190/3229] lr: 0.000042 grad_norm: 0.7835 (0.8587) closs: 1.0738 (1.0978) time: 4.0873 data: 0.0002 max mem: 54684 -[04:14:14.251177] Epoch: [0] [3200/3229] lr: 0.000042 grad_norm: 0.7516 (0.8583) closs: 1.0760 (1.0977) time: 4.1155 data: 0.0003 max mem: 54684 -[04:14:55.041953] Epoch: [0] [3210/3229] lr: 0.000042 grad_norm: 0.7438 (0.8580) closs: 1.0863 (1.0976) time: 4.1112 data: 0.0003 max mem: 54684 -[04:15:36.390692] Epoch: [0] [3220/3229] lr: 0.000042 grad_norm: 0.7658 (0.8579) closs: 1.0767 (1.0975) time: 4.1069 data: 0.0001 max mem: 54684 -[04:16:09.561812] Epoch: [0] Total time: 3:40:03 -[04:16:09.562785] Averaged stats: lr: 0.000042 grad_norm: 0.7787 (0.8577) closs: 1.0483 (1.0961) -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -[04:16:09.927151] model saved -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -[04:16:11.631099] optimizer saved -[04:16:11.631728] other rank-common saved -[04:16:11.636736] rank-specific saved -[04:16:11.651089] log_dir: ./output_dir -[04:16:23.993611] Epoch: [1] [0/3229] lr: 0.000042 grad_norm: 0.8670 (0.8670) closs: 1.0075 (1.0075) time: 12.3415 data: 7.9810 max mem: 54684 -[04:17:06.440076] Epoch: [1] [10/3229] lr: 0.000041 grad_norm: 0.7389 (0.8121) closs: 1.0826 (1.0946) time: 4.9807 data: 0.7257 max mem: 54684 -[04:17:46.869458] Epoch: [1] [20/3229] lr: 0.000041 grad_norm: 0.7389 (0.7906) closs: 1.0572 (1.0652) time: 4.1437 data: 0.0002 max mem: 54684 -[04:18:28.213204] Epoch: [1] [30/3229] lr: 0.000041 grad_norm: 0.7856 (0.7980) closs: 1.0665 (1.0783) time: 4.0886 data: 0.0002 max mem: 54684 -[04:19:09.261576] Epoch: [1] [40/3229] lr: 0.000041 grad_norm: 0.7665 (0.7884) closs: 1.0941 (1.0814) time: 4.1195 data: 0.0002 max mem: 54684 -[04:19:50.348250] Epoch: [1] [50/3229] lr: 0.000041 grad_norm: 0.7463 (0.7844) closs: 1.0827 (1.0739) time: 4.1067 data: 0.0002 max mem: 54684 -[04:20:30.454474] Epoch: [1] [60/3229] lr: 0.000041 grad_norm: 0.7801 (0.7795) closs: 1.0234 (1.0633) time: 4.0596 data: 0.0002 max mem: 54684 -[04:21:11.357215] Epoch: [1] [70/3229] lr: 0.000041 grad_norm: 0.8025 (0.7812) closs: 1.0314 (1.0602) time: 4.0504 data: 0.0002 max mem: 54684 -[04:21:51.356188] Epoch: [1] [80/3229] lr: 0.000041 grad_norm: 0.7522 (0.7691) closs: 1.0226 (1.0522) time: 4.0450 data: 0.0002 max mem: 54684 -[04:22:33.043953] Epoch: [1] [90/3229] lr: 0.000041 grad_norm: 0.7018 (0.7681) closs: 1.0252 (1.0538) time: 4.0843 data: 0.0002 max mem: 54684 -[04:23:13.831663] Epoch: [1] [100/3229] lr: 0.000041 grad_norm: 0.7546 (0.7674) closs: 1.0535 (1.0529) time: 4.1237 data: 0.0002 max mem: 54684 -[04:23:55.181411] Epoch: [1] [110/3229] lr: 0.000041 grad_norm: 0.7521 (0.7679) closs: 1.0461 (1.0515) time: 4.1068 data: 0.0002 max mem: 54684 -[04:24:35.870155] Epoch: [1] [120/3229] lr: 0.000041 grad_norm: 0.7521 (0.7650) closs: 1.0400 (1.0525) time: 4.1019 data: 0.0002 max mem: 54684 -[04:25:17.325692] Epoch: [1] [130/3229] lr: 0.000041 grad_norm: 0.7635 (0.7660) closs: 1.0768 (1.0555) time: 4.1071 data: 0.0002 max mem: 54684 -[04:25:58.111171] Epoch: [1] [140/3229] lr: 0.000041 grad_norm: 0.7635 (0.7666) closs: 1.0801 (1.0569) time: 4.1120 data: 0.0002 max mem: 54684 -[04:26:38.709088] Epoch: [1] [150/3229] lr: 0.000041 grad_norm: 0.7391 (0.7641) closs: 1.0635 (1.0556) time: 4.0691 data: 0.0002 max mem: 54684 -[04:27:19.437075] Epoch: [1] [160/3229] lr: 0.000041 grad_norm: 0.7284 (0.7647) closs: 1.0512 (1.0563) time: 4.0662 data: 0.0002 max mem: 54684 -[04:28:00.075659] Epoch: [1] [170/3229] lr: 0.000040 grad_norm: 0.7635 (0.7653) closs: 1.0589 (1.0533) time: 4.0683 data: 0.0002 max mem: 54684 -[04:28:40.520775] Epoch: [1] [180/3229] lr: 0.000040 grad_norm: 0.7911 (0.7696) closs: 1.0460 (1.0532) time: 4.0541 data: 0.0002 max mem: 54684 -[04:29:21.220750] Epoch: [1] [190/3229] lr: 0.000040 grad_norm: 0.7859 (0.7694) closs: 1.0297 (1.0513) time: 4.0572 data: 0.0002 max mem: 54684 -[04:30:02.479262] Epoch: [1] [200/3229] lr: 0.000040 grad_norm: 0.7721 (0.7706) closs: 1.0362 (1.0522) time: 4.0979 data: 0.0002 max mem: 54684 -[04:30:43.921995] Epoch: [1] [210/3229] lr: 0.000040 grad_norm: 0.8007 (0.7734) closs: 1.0904 (1.0546) time: 4.1350 data: 0.0002 max mem: 54684 -[04:31:24.392183] Epoch: [1] [220/3229] lr: 0.000040 grad_norm: 0.8126 (0.7731) closs: 1.0870 (1.0548) time: 4.0956 data: 0.0002 max mem: 54684 -[04:32:04.710459] Epoch: [1] [230/3229] lr: 0.000040 grad_norm: 0.7601 (0.7722) closs: 1.0787 (1.0528) time: 4.0394 data: 0.0002 max mem: 54684 -[04:32:45.662972] Epoch: [1] [240/3229] lr: 0.000040 grad_norm: 0.7548 (0.7715) closs: 1.0161 (1.0511) time: 4.0635 data: 0.0002 max mem: 54684 -[04:33:27.043779] Epoch: [1] [250/3229] lr: 0.000040 grad_norm: 0.7622 (0.7718) closs: 1.0228 (1.0510) time: 4.1166 data: 0.0002 max mem: 54684 -[04:34:08.159916] Epoch: [1] [260/3229] lr: 0.000040 grad_norm: 0.7596 (0.7716) closs: 1.0571 (1.0517) time: 4.1248 data: 0.0002 max mem: 54684 -[04:34:49.118869] Epoch: [1] [270/3229] lr: 0.000040 grad_norm: 0.7411 (0.7711) closs: 1.0854 (1.0523) time: 4.1037 data: 0.0002 max mem: 54684 -[04:35:29.743500] Epoch: [1] [280/3229] lr: 0.000040 grad_norm: 0.7411 (0.7708) closs: 1.0320 (1.0520) time: 4.0791 data: 0.0002 max mem: 54684 -[04:36:10.896285] Epoch: [1] [290/3229] lr: 0.000040 grad_norm: 0.7698 (0.7711) closs: 1.0320 (1.0519) time: 4.0888 data: 0.0002 max mem: 54684 -[04:36:50.693561] Epoch: [1] [300/3229] lr: 0.000040 grad_norm: 0.7667 (0.7693) closs: 1.0176 (1.0507) time: 4.0474 data: 0.0002 max mem: 54684 -[04:37:31.371110] Epoch: [1] [310/3229] lr: 0.000040 grad_norm: 0.7233 (0.7689) closs: 1.0032 (1.0496) time: 4.0237 data: 0.0002 max mem: 54684 -[04:38:12.621107] Epoch: [1] [320/3229] lr: 0.000039 grad_norm: 0.7637 (0.7696) closs: 1.0102 (1.0494) time: 4.0963 data: 0.0002 max mem: 54684 -[04:38:53.280696] Epoch: [1] [330/3229] lr: 0.000039 grad_norm: 0.7538 (0.7692) closs: 1.0404 (1.0488) time: 4.0954 data: 0.0002 max mem: 54684 -[04:39:34.384153] Epoch: [1] [340/3229] lr: 0.000039 grad_norm: 0.7223 (0.7678) closs: 1.0712 (1.0507) time: 4.0881 data: 0.0002 max mem: 54684 -[04:40:14.745485] Epoch: [1] [350/3229] lr: 0.000039 grad_norm: 0.7186 (0.7656) closs: 1.0880 (1.0494) time: 4.0732 data: 0.0002 max mem: 54684 -[04:40:55.725764] Epoch: [1] [360/3229] lr: 0.000039 grad_norm: 0.7281 (0.7657) closs: 1.0522 (1.0491) time: 4.0670 data: 0.0002 max mem: 54684 -[04:41:37.083851] Epoch: [1] [370/3229] lr: 0.000039 grad_norm: 0.7441 (0.7654) closs: 1.0619 (1.0497) time: 4.1169 data: 0.0002 max mem: 54684 -[04:42:18.173848] Epoch: [1] [380/3229] lr: 0.000039 grad_norm: 0.7505 (0.7652) closs: 1.0880 (1.0510) time: 4.1223 data: 0.0002 max mem: 54684 -[04:42:58.469739] Epoch: [1] [390/3229] lr: 0.000039 grad_norm: 0.7717 (0.7642) closs: 1.0462 (1.0496) time: 4.0692 data: 0.0002 max mem: 54684 -[04:43:39.241200] Epoch: [1] [400/3229] lr: 0.000039 grad_norm: 0.7749 (0.7649) closs: 0.9971 (1.0484) time: 4.0533 data: 0.0002 max mem: 54684 -[04:44:20.120315] Epoch: [1] [410/3229] lr: 0.000039 grad_norm: 0.8080 (0.7657) closs: 1.0498 (1.0489) time: 4.0825 data: 0.0002 max mem: 54684 -[04:45:00.557566] Epoch: [1] [420/3229] lr: 0.000039 grad_norm: 0.7665 (0.7651) closs: 1.0759 (1.0488) time: 4.0658 data: 0.0002 max mem: 54684 -[04:45:41.828412] Epoch: [1] [430/3229] lr: 0.000039 grad_norm: 0.7265 (0.7650) closs: 1.0759 (1.0498) time: 4.0853 data: 0.0002 max mem: 54684 -[04:46:22.509399] Epoch: [1] [440/3229] lr: 0.000039 grad_norm: 0.7462 (0.7646) closs: 1.0742 (1.0497) time: 4.0975 data: 0.0002 max mem: 54684 -[04:47:03.187153] Epoch: [1] [450/3229] lr: 0.000039 grad_norm: 0.7618 (0.7645) closs: 1.0617 (1.0497) time: 4.0679 data: 0.0002 max mem: 54684 -[04:47:44.104151] Epoch: [1] [460/3229] lr: 0.000039 grad_norm: 0.7541 (0.7639) closs: 1.0582 (1.0501) time: 4.0797 data: 0.0002 max mem: 54684 -[04:48:24.641461] Epoch: [1] [470/3229] lr: 0.000038 grad_norm: 0.7528 (0.7635) closs: 1.0582 (1.0494) time: 4.0727 data: 0.0002 max mem: 54684 -[04:49:05.751178] Epoch: [1] [480/3229] lr: 0.000038 grad_norm: 0.7481 (0.7628) closs: 1.0947 (1.0506) time: 4.0823 data: 0.0002 max mem: 54684 -[04:49:46.537424] Epoch: [1] [490/3229] lr: 0.000038 grad_norm: 0.7552 (0.7629) closs: 1.0837 (1.0503) time: 4.0947 data: 0.0002 max mem: 54684 -[04:50:27.314255] Epoch: [1] [500/3229] lr: 0.000038 grad_norm: 0.7712 (0.7634) closs: 1.0294 (1.0500) time: 4.0781 data: 0.0002 max mem: 54684 -[04:51:08.310823] Epoch: [1] [510/3229] lr: 0.000038 grad_norm: 0.8222 (0.7643) closs: 1.0444 (1.0500) time: 4.0886 data: 0.0002 max mem: 54684 -[04:51:49.708115] Epoch: [1] [520/3229] lr: 0.000038 grad_norm: 0.7931 (0.7644) closs: 1.0786 (1.0511) time: 4.1196 data: 0.0002 max mem: 54684 -[04:52:30.929265] Epoch: [1] [530/3229] lr: 0.000038 grad_norm: 0.7668 (0.7648) closs: 1.0823 (1.0516) time: 4.1309 data: 0.0002 max mem: 54684 -[04:53:11.799214] Epoch: [1] [540/3229] lr: 0.000038 grad_norm: 0.7484 (0.7644) closs: 1.0851 (1.0518) time: 4.1045 data: 0.0002 max mem: 54684 -[04:53:52.617648] Epoch: [1] [550/3229] lr: 0.000038 grad_norm: 0.7576 (0.7642) closs: 1.0762 (1.0517) time: 4.0844 data: 0.0002 max mem: 54684 -[04:54:33.511754] Epoch: [1] [560/3229] lr: 0.000038 grad_norm: 0.7606 (0.7637) closs: 1.0481 (1.0516) time: 4.0856 data: 0.0002 max mem: 54684 -[04:55:14.276726] Epoch: [1] [570/3229] lr: 0.000038 grad_norm: 0.7616 (0.7649) closs: 1.0444 (1.0516) time: 4.0829 data: 0.0002 max mem: 54684 -[04:55:54.841137] Epoch: [1] [580/3229] lr: 0.000038 grad_norm: 0.7616 (0.7646) closs: 1.0423 (1.0511) time: 4.0664 data: 0.0002 max mem: 54684 -[04:56:35.646680] Epoch: [1] [590/3229] lr: 0.000038 grad_norm: 0.7542 (0.7649) closs: 1.0440 (1.0512) time: 4.0684 data: 0.0002 max mem: 54684 -[04:57:16.334209] Epoch: [1] [600/3229] lr: 0.000038 grad_norm: 0.7437 (0.7638) closs: 1.0745 (1.0512) time: 4.0746 data: 0.0002 max mem: 54684 -[04:57:57.448181] Epoch: [1] [610/3229] lr: 0.000038 grad_norm: 0.7439 (0.7642) closs: 1.0596 (1.0512) time: 4.0900 data: 0.0002 max mem: 54684 -[04:58:38.714028] Epoch: [1] [620/3229] lr: 0.000037 grad_norm: 0.7602 (0.7641) closs: 1.0594 (1.0513) time: 4.1189 data: 0.0002 max mem: 54684 -[04:59:19.266620] Epoch: [1] [630/3229] lr: 0.000037 grad_norm: 0.7325 (0.7639) closs: 1.0401 (1.0502) time: 4.0909 data: 0.0002 max mem: 54684 -[05:00:00.368157] Epoch: [1] [640/3229] lr: 0.000037 grad_norm: 0.7513 (0.7638) closs: 1.0248 (1.0503) time: 4.0826 data: 0.0002 max mem: 54684 -[05:00:41.160239] Epoch: [1] [650/3229] lr: 0.000037 grad_norm: 0.7513 (0.7635) closs: 1.0358 (1.0502) time: 4.0946 data: 0.0002 max mem: 54684 -[05:01:22.138831] Epoch: [1] [660/3229] lr: 0.000037 grad_norm: 0.7456 (0.7637) closs: 1.0590 (1.0504) time: 4.0885 data: 0.0002 max mem: 54684 -[05:02:02.946232] Epoch: [1] [670/3229] lr: 0.000037 grad_norm: 0.7457 (0.7634) closs: 1.0762 (1.0507) time: 4.0892 data: 0.0002 max mem: 54684 -[05:02:44.502278] Epoch: [1] [680/3229] lr: 0.000037 grad_norm: 0.7440 (0.7634) closs: 1.0678 (1.0509) time: 4.1181 data: 0.0002 max mem: 54684 -[05:03:24.951117] Epoch: [1] [690/3229] lr: 0.000037 grad_norm: 0.7456 (0.7630) closs: 1.0585 (1.0513) time: 4.1002 data: 0.0002 max mem: 54684 -[05:04:05.578068] Epoch: [1] [700/3229] lr: 0.000037 grad_norm: 0.7712 (0.7629) closs: 1.0648 (1.0509) time: 4.0537 data: 0.0002 max mem: 54684 -[05:04:46.382125] Epoch: [1] [710/3229] lr: 0.000037 grad_norm: 0.7536 (0.7623) closs: 1.0831 (1.0508) time: 4.0715 data: 0.0002 max mem: 54684 -[05:05:27.836191] Epoch: [1] [720/3229] lr: 0.000037 grad_norm: 0.7533 (0.7623) closs: 1.0761 (1.0516) time: 4.1128 data: 0.0002 max mem: 54684 -[05:06:08.635002] Epoch: [1] [730/3229] lr: 0.000037 grad_norm: 0.7805 (0.7629) closs: 1.0763 (1.0519) time: 4.1126 data: 0.0002 max mem: 54684 -[05:06:48.634490] Epoch: [1] [740/3229] lr: 0.000037 grad_norm: 0.7186 (0.7620) closs: 1.0197 (1.0513) time: 4.0398 data: 0.0002 max mem: 54684 -[05:07:29.445391] Epoch: [1] [750/3229] lr: 0.000037 grad_norm: 0.7025 (0.7616) closs: 1.0359 (1.0516) time: 4.0405 data: 0.0002 max mem: 54684 -[05:08:11.035741] Epoch: [1] [760/3229] lr: 0.000036 grad_norm: 0.7647 (0.7623) closs: 1.0791 (1.0519) time: 4.1200 data: 0.0002 max mem: 54684 -[05:08:51.808203] Epoch: [1] [770/3229] lr: 0.000036 grad_norm: 0.8157 (0.7630) closs: 1.0874 (1.0524) time: 4.1181 data: 0.0002 max mem: 54684 -[05:09:32.821381] Epoch: [1] [780/3229] lr: 0.000036 grad_norm: 0.7640 (0.7628) closs: 1.0954 (1.0530) time: 4.0892 data: 0.0002 max mem: 54684 -[05:10:13.634195] Epoch: [1] [790/3229] lr: 0.000036 grad_norm: 0.7289 (0.7624) closs: 1.0748 (1.0527) time: 4.0912 data: 0.0002 max mem: 54684 -[05:10:55.053924] Epoch: [1] [800/3229] lr: 0.000036 grad_norm: 0.7343 (0.7624) closs: 1.0321 (1.0528) time: 4.1116 data: 0.0002 max mem: 54684 -[05:11:35.196793] Epoch: [1] [810/3229] lr: 0.000036 grad_norm: 0.7367 (0.7620) closs: 1.0322 (1.0524) time: 4.0781 data: 0.0002 max mem: 54684 -[05:12:16.522962] Epoch: [1] [820/3229] lr: 0.000036 grad_norm: 0.7401 (0.7621) closs: 1.0246 (1.0522) time: 4.0734 data: 0.0002 max mem: 54684 -[05:12:57.137015] Epoch: [1] [830/3229] lr: 0.000036 grad_norm: 0.7595 (0.7617) closs: 1.0450 (1.0524) time: 4.0969 data: 0.0002 max mem: 54684 -[05:13:38.237630] Epoch: [1] [840/3229] lr: 0.000036 grad_norm: 0.7652 (0.7616) closs: 1.0833 (1.0530) time: 4.0857 data: 0.0002 max mem: 54684 -[05:14:19.021316] Epoch: [1] [850/3229] lr: 0.000036 grad_norm: 0.7652 (0.7614) closs: 1.0680 (1.0532) time: 4.0941 data: 0.0002 max mem: 54684 -[05:14:59.987219] Epoch: [1] [860/3229] lr: 0.000036 grad_norm: 0.7362 (0.7610) closs: 1.0479 (1.0534) time: 4.0874 data: 0.0002 max mem: 54684 -[05:15:40.669534] Epoch: [1] [870/3229] lr: 0.000036 grad_norm: 0.7348 (0.7608) closs: 1.0472 (1.0530) time: 4.0823 data: 0.0002 max mem: 54684 -[05:16:21.345203] Epoch: [1] [880/3229] lr: 0.000036 grad_norm: 0.7769 (0.7607) closs: 1.0219 (1.0529) time: 4.0678 data: 0.0002 max mem: 54684 -[05:17:02.174965] Epoch: [1] [890/3229] lr: 0.000036 grad_norm: 0.7393 (0.7603) closs: 1.0763 (1.0534) time: 4.0752 data: 0.0002 max mem: 54684 -[05:17:43.153839] Epoch: [1] [900/3229] lr: 0.000035 grad_norm: 0.7229 (0.7600) closs: 1.0763 (1.0535) time: 4.0904 data: 0.0002 max mem: 54684 -[05:18:23.532375] Epoch: [1] [910/3229] lr: 0.000035 grad_norm: 0.7346 (0.7597) closs: 1.0722 (1.0536) time: 4.0678 data: 0.0002 max mem: 54684 -[05:19:04.844983] Epoch: [1] [920/3229] lr: 0.000035 grad_norm: 0.7679 (0.7600) closs: 1.0613 (1.0536) time: 4.0845 data: 0.0002 max mem: 54684 -[05:19:45.960496] Epoch: [1] [930/3229] lr: 0.000035 grad_norm: 0.7816 (0.7600) closs: 1.0613 (1.0537) time: 4.1213 data: 0.0002 max mem: 54684 -[05:20:26.646864] Epoch: [1] [940/3229] lr: 0.000035 grad_norm: 0.7816 (0.7601) closs: 1.0160 (1.0532) time: 4.0900 data: 0.0002 max mem: 54684 -[05:21:07.563052] Epoch: [1] [950/3229] lr: 0.000035 grad_norm: 0.7621 (0.7605) closs: 0.9814 (1.0528) time: 4.0801 data: 0.0002 max mem: 54684 -[05:21:48.451416] Epoch: [1] [960/3229] lr: 0.000035 grad_norm: 0.7508 (0.7605) closs: 0.9968 (1.0529) time: 4.0902 data: 0.0002 max mem: 54684 -[05:22:29.552875] Epoch: [1] [970/3229] lr: 0.000035 grad_norm: 0.7519 (0.7605) closs: 1.0570 (1.0530) time: 4.0994 data: 0.0002 max mem: 54684 -[05:23:10.530952] Epoch: [1] [980/3229] lr: 0.000035 grad_norm: 0.7625 (0.7606) closs: 1.0570 (1.0532) time: 4.1039 data: 0.0002 max mem: 54684 -[05:23:51.878010] Epoch: [1] [990/3229] lr: 0.000035 grad_norm: 0.7625 (0.7608) closs: 1.0764 (1.0534) time: 4.1162 data: 0.0002 max mem: 54684 -[05:24:31.849857] Epoch: [1] [1000/3229] lr: 0.000035 grad_norm: 0.7446 (0.7599) closs: 1.0764 (1.0532) time: 4.0659 data: 0.0002 max mem: 54684 -[05:25:12.639207] Epoch: [1] [1010/3229] lr: 0.000035 grad_norm: 0.7020 (0.7597) closs: 1.0242 (1.0529) time: 4.0380 data: 0.0002 max mem: 54684 -[05:25:52.977496] Epoch: [1] [1020/3229] lr: 0.000035 grad_norm: 0.7052 (0.7594) closs: 1.0555 (1.0527) time: 4.0563 data: 0.0002 max mem: 54684 -[05:26:34.003363] Epoch: [1] [1030/3229] lr: 0.000034 grad_norm: 0.7147 (0.7588) closs: 1.0503 (1.0527) time: 4.0681 data: 0.0002 max mem: 54684 -[05:27:14.573525] Epoch: [1] [1040/3229] lr: 0.000034 grad_norm: 0.7147 (0.7586) closs: 1.0450 (1.0524) time: 4.0797 data: 0.0002 max mem: 54684 -[05:27:55.498440] Epoch: [1] [1050/3229] lr: 0.000034 grad_norm: 0.7522 (0.7589) closs: 1.0486 (1.0523) time: 4.0747 data: 0.0002 max mem: 54684 -[05:28:36.363003] Epoch: [1] [1060/3229] lr: 0.000034 grad_norm: 0.7522 (0.7588) closs: 1.0553 (1.0523) time: 4.0894 data: 0.0002 max mem: 54684 -[05:29:17.566434] Epoch: [1] [1070/3229] lr: 0.000034 grad_norm: 0.7544 (0.7588) closs: 1.0536 (1.0523) time: 4.1033 data: 0.0002 max mem: 54684 -[05:29:57.687127] Epoch: [1] [1080/3229] lr: 0.000034 grad_norm: 0.7544 (0.7586) closs: 1.0454 (1.0519) time: 4.0661 data: 0.0002 max mem: 54684 -[05:30:38.261585] Epoch: [1] [1090/3229] lr: 0.000034 grad_norm: 0.7317 (0.7584) closs: 1.0487 (1.0519) time: 4.0347 data: 0.0002 max mem: 54684 -[05:31:19.146600] Epoch: [1] [1100/3229] lr: 0.000034 grad_norm: 0.7461 (0.7585) closs: 1.0237 (1.0518) time: 4.0729 data: 0.0002 max mem: 54684 -[05:32:00.230890] Epoch: [1] [1110/3229] lr: 0.000034 grad_norm: 0.7654 (0.7586) closs: 1.0237 (1.0518) time: 4.0984 data: 0.0002 max mem: 54684 -[05:32:40.323167] Epoch: [1] [1120/3229] lr: 0.000034 grad_norm: 0.7725 (0.7589) closs: 1.0261 (1.0517) time: 4.0588 data: 0.0002 max mem: 54684 -[05:33:21.572530] Epoch: [1] [1130/3229] lr: 0.000034 grad_norm: 0.7690 (0.7592) closs: 1.0308 (1.0517) time: 4.0670 data: 0.0002 max mem: 54684 -[05:34:02.762302] Epoch: [1] [1140/3229] lr: 0.000034 grad_norm: 0.7708 (0.7595) closs: 1.0746 (1.0522) time: 4.1219 data: 0.0002 max mem: 54684 -[05:34:43.703986] Epoch: [1] [1150/3229] lr: 0.000034 grad_norm: 0.7656 (0.7593) closs: 1.0766 (1.0521) time: 4.1065 data: 0.0003 max mem: 54684 -[05:35:24.484939] Epoch: [1] [1160/3229] lr: 0.000034 grad_norm: 0.7151 (0.7591) closs: 1.0380 (1.0517) time: 4.0861 data: 0.0003 max mem: 54684 -[05:36:05.401846] Epoch: [1] [1170/3229] lr: 0.000033 grad_norm: 0.7522 (0.7591) closs: 1.0499 (1.0519) time: 4.0848 data: 0.0002 max mem: 54684 -[05:36:45.957264] Epoch: [1] [1180/3229] lr: 0.000033 grad_norm: 0.7522 (0.7588) closs: 1.0587 (1.0517) time: 4.0735 data: 0.0002 max mem: 54684 -[05:37:27.507925] Epoch: [1] [1190/3229] lr: 0.000033 grad_norm: 0.7522 (0.7589) closs: 1.0445 (1.0518) time: 4.1052 data: 0.0002 max mem: 54684 -[05:38:08.323255] Epoch: [1] [1200/3229] lr: 0.000033 grad_norm: 0.8299 (0.7594) closs: 1.0666 (1.0517) time: 4.1182 data: 0.0002 max mem: 54684 -[05:38:49.657429] Epoch: [1] [1210/3229] lr: 0.000033 grad_norm: 0.8322 (0.7596) closs: 1.0805 (1.0520) time: 4.1074 data: 0.0002 max mem: 54684 -[05:39:30.796365] Epoch: [1] [1220/3229] lr: 0.000033 grad_norm: 0.7634 (0.7597) closs: 1.0942 (1.0524) time: 4.1236 data: 0.0002 max mem: 54684 -[05:40:12.472149] Epoch: [1] [1230/3229] lr: 0.000033 grad_norm: 0.7564 (0.7597) closs: 1.0754 (1.0523) time: 4.1407 data: 0.0002 max mem: 54684 -[05:40:53.596788] Epoch: [1] [1240/3229] lr: 0.000033 grad_norm: 0.7499 (0.7610) closs: 1.0346 (1.0525) time: 4.1400 data: 0.0002 max mem: 54684 -[05:41:34.343746] Epoch: [1] [1250/3229] lr: 0.000033 grad_norm: 0.7309 (0.7605) closs: 1.0346 (1.0524) time: 4.0935 data: 0.0002 max mem: 54684 -[05:42:14.850396] Epoch: [1] [1260/3229] lr: 0.000033 grad_norm: 0.7357 (0.7604) closs: 1.0046 (1.0521) time: 4.0626 data: 0.0002 max mem: 54684 -[05:42:55.902233] Epoch: [1] [1270/3229] lr: 0.000033 grad_norm: 0.7459 (0.7603) closs: 0.9939 (1.0517) time: 4.0779 data: 0.0002 max mem: 54684 -[05:43:36.700285] Epoch: [1] [1280/3229] lr: 0.000033 grad_norm: 0.7424 (0.7602) closs: 1.0316 (1.0518) time: 4.0924 data: 0.0002 max mem: 54684 -[05:44:18.029631] Epoch: [1] [1290/3229] lr: 0.000033 grad_norm: 0.7584 (0.7604) closs: 1.0437 (1.0518) time: 4.1063 data: 0.0002 max mem: 54684 -[05:44:58.460022] Epoch: [1] [1300/3229] lr: 0.000032 grad_norm: 0.7471 (0.7602) closs: 1.0371 (1.0516) time: 4.0879 data: 0.0002 max mem: 54684 -[05:45:39.434680] Epoch: [1] [1310/3229] lr: 0.000032 grad_norm: 0.7390 (0.7601) closs: 1.0832 (1.0518) time: 4.0702 data: 0.0002 max mem: 54684 -[05:46:19.533138] Epoch: [1] [1320/3229] lr: 0.000032 grad_norm: 0.7437 (0.7601) closs: 1.0843 (1.0517) time: 4.0536 data: 0.0002 max mem: 54684 -[05:46:59.861164] Epoch: [1] [1330/3229] lr: 0.000032 grad_norm: 0.7416 (0.7597) closs: 1.0420 (1.0514) time: 4.0213 data: 0.0002 max mem: 54684 -[05:47:39.762335] Epoch: [1] [1340/3229] lr: 0.000032 grad_norm: 0.7236 (0.7592) closs: 1.0353 (1.0512) time: 4.0114 data: 0.0002 max mem: 54684 -[05:48:21.193611] Epoch: [1] [1350/3229] lr: 0.000032 grad_norm: 0.7413 (0.7594) closs: 1.0614 (1.0513) time: 4.0666 data: 0.0002 max mem: 54684 -[05:49:01.979501] Epoch: [1] [1360/3229] lr: 0.000032 grad_norm: 0.7393 (0.7591) closs: 1.0725 (1.0512) time: 4.1108 data: 0.0002 max mem: 54684 -[05:49:42.964934] Epoch: [1] [1370/3229] lr: 0.000032 grad_norm: 0.7161 (0.7588) closs: 1.0531 (1.0512) time: 4.0885 data: 0.0002 max mem: 54684 -[05:50:23.751298] Epoch: [1] [1380/3229] lr: 0.000032 grad_norm: 0.7334 (0.7589) closs: 1.0633 (1.0516) time: 4.0885 data: 0.0002 max mem: 54684 -[05:51:05.208897] Epoch: [1] [1390/3229] lr: 0.000032 grad_norm: 0.7776 (0.7591) closs: 1.0852 (1.0519) time: 4.1121 data: 0.0002 max mem: 54684 -[05:51:46.307285] Epoch: [1] [1400/3229] lr: 0.000032 grad_norm: 0.7727 (0.7591) closs: 1.0727 (1.0521) time: 4.1277 data: 0.0002 max mem: 54684 -[05:52:27.639855] Epoch: [1] [1410/3229] lr: 0.000032 grad_norm: 0.7589 (0.7592) closs: 1.0593 (1.0520) time: 4.1215 data: 0.0002 max mem: 54684 -[05:53:08.510464] Epoch: [1] [1420/3229] lr: 0.000032 grad_norm: 0.7454 (0.7591) closs: 1.0521 (1.0518) time: 4.1101 data: 0.0002 max mem: 54684 -[05:53:49.534800] Epoch: [1] [1430/3229] lr: 0.000031 grad_norm: 0.7239 (0.7589) closs: 1.0510 (1.0515) time: 4.0947 data: 0.0002 max mem: 54684 -[05:54:30.634251] Epoch: [1] [1440/3229] lr: 0.000031 grad_norm: 0.7376 (0.7591) closs: 1.0525 (1.0516) time: 4.1061 data: 0.0002 max mem: 54684 -[05:55:11.319746] Epoch: [1] [1450/3229] lr: 0.000031 grad_norm: 0.7381 (0.7590) closs: 1.0587 (1.0514) time: 4.0892 data: 0.0002 max mem: 54684 -[05:55:52.498426] Epoch: [1] [1460/3229] lr: 0.000031 grad_norm: 0.7446 (0.7591) closs: 1.0462 (1.0513) time: 4.0931 data: 0.0002 max mem: 54684 -[05:56:33.716084] Epoch: [1] [1470/3229] lr: 0.000031 grad_norm: 0.7441 (0.7589) closs: 1.0490 (1.0514) time: 4.1198 data: 0.0002 max mem: 54684 -[05:57:14.183953] Epoch: [1] [1480/3229] lr: 0.000031 grad_norm: 0.7353 (0.7586) closs: 1.0471 (1.0514) time: 4.0842 data: 0.0002 max mem: 54684 -[05:57:55.151729] Epoch: [1] [1490/3229] lr: 0.000031 grad_norm: 0.7442 (0.7585) closs: 1.0415 (1.0514) time: 4.0717 data: 0.0002 max mem: 54684 -[05:58:36.455544] Epoch: [1] [1500/3229] lr: 0.000031 grad_norm: 0.7452 (0.7585) closs: 1.0665 (1.0515) time: 4.1135 data: 0.0002 max mem: 54684 -[05:59:17.089096] Epoch: [1] [1510/3229] lr: 0.000031 grad_norm: 0.7419 (0.7581) closs: 1.0889 (1.0517) time: 4.0968 data: 0.0002 max mem: 54684 -[05:59:57.877708] Epoch: [1] [1520/3229] lr: 0.000031 grad_norm: 0.7463 (0.7583) closs: 1.0725 (1.0517) time: 4.0710 data: 0.0002 max mem: 54684 -[06:00:39.181341] Epoch: [1] [1530/3229] lr: 0.000031 grad_norm: 0.7860 (0.7584) closs: 1.0767 (1.0519) time: 4.1045 data: 0.0002 max mem: 54684 -[06:01:20.218941] Epoch: [1] [1540/3229] lr: 0.000031 grad_norm: 0.7508 (0.7584) closs: 1.0767 (1.0517) time: 4.1170 data: 0.0002 max mem: 54684 -[06:02:01.186493] Epoch: [1] [1550/3229] lr: 0.000031 grad_norm: 0.7369 (0.7583) closs: 1.0714 (1.0517) time: 4.1002 data: 0.0002 max mem: 54684 -[06:02:42.438164] Epoch: [1] [1560/3229] lr: 0.000030 grad_norm: 0.7650 (0.7584) closs: 1.0850 (1.0520) time: 4.1109 data: 0.0002 max mem: 54684 -[06:03:23.689124] Epoch: [1] [1570/3229] lr: 0.000030 grad_norm: 0.7622 (0.7584) closs: 1.0709 (1.0521) time: 4.1251 data: 0.0002 max mem: 54684 -[06:04:04.719315] Epoch: [1] [1580/3229] lr: 0.000030 grad_norm: 0.7378 (0.7583) closs: 1.0553 (1.0520) time: 4.1140 data: 0.0002 max mem: 54684 -[06:04:45.698136] Epoch: [1] [1590/3229] lr: 0.000030 grad_norm: 0.7576 (0.7584) closs: 1.0282 (1.0517) time: 4.1004 data: 0.0002 max mem: 54684 -[06:05:27.010602] Epoch: [1] [1600/3229] lr: 0.000030 grad_norm: 0.7794 (0.7588) closs: 1.0155 (1.0516) time: 4.1145 data: 0.0002 max mem: 54684 -[06:06:08.217639] Epoch: [1] [1610/3229] lr: 0.000030 grad_norm: 0.7703 (0.7590) closs: 1.0618 (1.0518) time: 4.1259 data: 0.0002 max mem: 54684 -[06:06:49.708388] Epoch: [1] [1620/3229] lr: 0.000030 grad_norm: 0.7704 (0.7592) closs: 1.0795 (1.0519) time: 4.1348 data: 0.0002 max mem: 54684 -[06:07:30.802140] Epoch: [1] [1630/3229] lr: 0.000030 grad_norm: 0.7723 (0.7593) closs: 1.0570 (1.0521) time: 4.1292 data: 0.0002 max mem: 54684 -[06:08:12.042914] Epoch: [1] [1640/3229] lr: 0.000030 grad_norm: 0.7690 (0.7593) closs: 1.0839 (1.0523) time: 4.1167 data: 0.0002 max mem: 54684 -[06:08:52.552185] Epoch: [1] [1650/3229] lr: 0.000030 grad_norm: 0.7620 (0.7592) closs: 1.0666 (1.0522) time: 4.0874 data: 0.0002 max mem: 54684 -[06:09:34.109821] Epoch: [1] [1660/3229] lr: 0.000030 grad_norm: 0.7443 (0.7592) closs: 1.0719 (1.0526) time: 4.1033 data: 0.0002 max mem: 54684 -[06:10:15.226675] Epoch: [1] [1670/3229] lr: 0.000030 grad_norm: 0.7640 (0.7592) closs: 1.0737 (1.0524) time: 4.1337 data: 0.0002 max mem: 54684 -[06:10:55.893166] Epoch: [1] [1680/3229] lr: 0.000030 grad_norm: 0.7557 (0.7591) closs: 1.0457 (1.0524) time: 4.0891 data: 0.0002 max mem: 54684 -[06:11:37.000880] Epoch: [1] [1690/3229] lr: 0.000029 grad_norm: 0.7449 (0.7591) closs: 1.0646 (1.0525) time: 4.0886 data: 0.0002 max mem: 54684 -[06:12:18.347873] Epoch: [1] [1700/3229] lr: 0.000029 grad_norm: 0.7646 (0.7592) closs: 1.0581 (1.0523) time: 4.1227 data: 0.0002 max mem: 54684 -[06:12:59.455957] Epoch: [1] [1710/3229] lr: 0.000029 grad_norm: 0.7725 (0.7593) closs: 1.0392 (1.0523) time: 4.1227 data: 0.0002 max mem: 54684 -[06:13:40.686492] Epoch: [1] [1720/3229] lr: 0.000029 grad_norm: 0.7731 (0.7595) closs: 1.0769 (1.0526) time: 4.1169 data: 0.0002 max mem: 54684 -[06:14:21.266645] Epoch: [1] [1730/3229] lr: 0.000029 grad_norm: 0.7730 (0.7594) closs: 1.0708 (1.0525) time: 4.0905 data: 0.0002 max mem: 54684 -[06:15:02.058571] Epoch: [1] [1740/3229] lr: 0.000029 grad_norm: 0.7642 (0.7593) closs: 1.0508 (1.0524) time: 4.0685 data: 0.0002 max mem: 54684 -[06:15:43.158278] Epoch: [1] [1750/3229] lr: 0.000029 grad_norm: 0.7606 (0.7594) closs: 1.0358 (1.0525) time: 4.0945 data: 0.0002 max mem: 54684 -[06:16:24.131577] Epoch: [1] [1760/3229] lr: 0.000029 grad_norm: 0.7606 (0.7593) closs: 1.0631 (1.0525) time: 4.1036 data: 0.0002 max mem: 54684 -[06:17:04.589751] Epoch: [1] [1770/3229] lr: 0.000029 grad_norm: 0.7370 (0.7591) closs: 1.0766 (1.0524) time: 4.0715 data: 0.0002 max mem: 54684 -[06:17:46.162753] Epoch: [1] [1780/3229] lr: 0.000029 grad_norm: 0.7053 (0.7589) closs: 1.0610 (1.0525) time: 4.1015 data: 0.0002 max mem: 54684 -[06:18:27.288021] Epoch: [1] [1790/3229] lr: 0.000029 grad_norm: 0.7304 (0.7589) closs: 1.0519 (1.0525) time: 4.1348 data: 0.0002 max mem: 54684 -[06:19:08.597875] Epoch: [1] [1800/3229] lr: 0.000029 grad_norm: 0.7718 (0.7593) closs: 1.0343 (1.0524) time: 4.1217 data: 0.0002 max mem: 54684 -[06:19:48.096823] Epoch: [1] [1810/3229] lr: 0.000028 grad_norm: 0.7355 (0.7589) closs: 0.9675 (1.0521) time: 4.0404 data: 0.0002 max mem: 54684 -[06:20:29.329365] Epoch: [1] [1820/3229] lr: 0.000028 grad_norm: 0.7154 (0.7589) closs: 1.0411 (1.0522) time: 4.0365 data: 0.0002 max mem: 54684 -[06:21:10.086608] Epoch: [1] [1830/3229] lr: 0.000028 grad_norm: 0.7412 (0.7589) closs: 1.0599 (1.0522) time: 4.0994 data: 0.0002 max mem: 54684 -[06:21:51.381549] Epoch: [1] [1840/3229] lr: 0.000028 grad_norm: 0.7621 (0.7589) closs: 1.0664 (1.0523) time: 4.1025 data: 0.0002 max mem: 54684 -[06:22:32.162442] Epoch: [1] [1850/3229] lr: 0.000028 grad_norm: 0.7662 (0.7590) closs: 1.0554 (1.0522) time: 4.1037 data: 0.0002 max mem: 54684 -[06:23:13.433902] Epoch: [1] [1860/3229] lr: 0.000028 grad_norm: 0.7852 (0.7590) closs: 1.0432 (1.0521) time: 4.1025 data: 0.0002 max mem: 54684 -[06:23:54.209307] Epoch: [1] [1870/3229] lr: 0.000028 grad_norm: 0.7674 (0.7591) closs: 1.0409 (1.0521) time: 4.1023 data: 0.0002 max mem: 54684 -[06:24:35.546365] Epoch: [1] [1880/3229] lr: 0.000028 grad_norm: 0.7487 (0.7591) closs: 1.0409 (1.0522) time: 4.1056 data: 0.0002 max mem: 54684 -[06:25:15.982687] Epoch: [1] [1890/3229] lr: 0.000028 grad_norm: 0.7683 (0.7591) closs: 1.0359 (1.0521) time: 4.0886 data: 0.0002 max mem: 54684 -[06:25:57.440510] Epoch: [1] [1900/3229] lr: 0.000028 grad_norm: 0.7670 (0.7592) closs: 1.0557 (1.0522) time: 4.0946 data: 0.0002 max mem: 54684 -[06:26:38.556623] Epoch: [1] [1910/3229] lr: 0.000028 grad_norm: 0.7661 (0.7592) closs: 1.0564 (1.0521) time: 4.1286 data: 0.0002 max mem: 54684 -[06:27:19.228566] Epoch: [1] [1920/3229] lr: 0.000028 grad_norm: 0.7542 (0.7590) closs: 1.0294 (1.0521) time: 4.0893 data: 0.0002 max mem: 54684 -[06:27:59.439243] Epoch: [1] [1930/3229] lr: 0.000028 grad_norm: 0.7197 (0.7587) closs: 1.0121 (1.0517) time: 4.0441 data: 0.0002 max mem: 54684 -[06:28:40.298280] Epoch: [1] [1940/3229] lr: 0.000027 grad_norm: 0.7343 (0.7586) closs: 1.0121 (1.0518) time: 4.0534 data: 0.0002 max mem: 54684 -[06:29:21.418176] Epoch: [1] [1950/3229] lr: 0.000027 grad_norm: 0.7666 (0.7588) closs: 1.0867 (1.0519) time: 4.0989 data: 0.0002 max mem: 54684 -[06:30:01.756178] Epoch: [1] [1960/3229] lr: 0.000027 grad_norm: 0.7619 (0.7587) closs: 1.0618 (1.0519) time: 4.0728 data: 0.0002 max mem: 54684 -[06:30:43.062087] Epoch: [1] [1970/3229] lr: 0.000027 grad_norm: 0.7588 (0.7588) closs: 1.0484 (1.0518) time: 4.0821 data: 0.0002 max mem: 54684 -[06:31:23.143763] Epoch: [1] [1980/3229] lr: 0.000027 grad_norm: 0.7496 (0.7584) closs: 1.0332 (1.0516) time: 4.0693 data: 0.0002 max mem: 54684 -[06:32:04.271038] Epoch: [1] [1990/3229] lr: 0.000027 grad_norm: 0.7496 (0.7585) closs: 1.0258 (1.0515) time: 4.0604 data: 0.0002 max mem: 54684 -[06:32:45.254334] Epoch: [1] [2000/3229] lr: 0.000027 grad_norm: 0.7825 (0.7587) closs: 1.0567 (1.0515) time: 4.1055 data: 0.0002 max mem: 54684 -[06:33:26.464223] Epoch: [1] [2010/3229] lr: 0.000027 grad_norm: 0.7723 (0.7588) closs: 1.0622 (1.0516) time: 4.1096 data: 0.0002 max mem: 54684 -[06:34:07.977608] Epoch: [1] [2020/3229] lr: 0.000027 grad_norm: 0.7569 (0.7588) closs: 1.0776 (1.0517) time: 4.1361 data: 0.0002 max mem: 54684 -[06:34:48.895957] Epoch: [1] [2030/3229] lr: 0.000027 grad_norm: 0.7560 (0.7586) closs: 1.0790 (1.0517) time: 4.1215 data: 0.0002 max mem: 54684 -[06:35:29.451828] Epoch: [1] [2040/3229] lr: 0.000027 grad_norm: 0.7256 (0.7584) closs: 1.0240 (1.0514) time: 4.0736 data: 0.0002 max mem: 54684 -[06:36:10.091914] Epoch: [1] [2050/3229] lr: 0.000027 grad_norm: 0.7247 (0.7581) closs: 1.0561 (1.0513) time: 4.0597 data: 0.0002 max mem: 54684 -[06:36:50.691448] Epoch: [1] [2060/3229] lr: 0.000027 grad_norm: 0.7255 (0.7579) closs: 1.0204 (1.0512) time: 4.0619 data: 0.0002 max mem: 54684 -[06:37:31.150888] Epoch: [1] [2070/3229] lr: 0.000026 grad_norm: 0.7327 (0.7578) closs: 1.0344 (1.0512) time: 4.0529 data: 0.0002 max mem: 54684 -[06:38:11.548200] Epoch: [1] [2080/3229] lr: 0.000026 grad_norm: 0.7327 (0.7576) closs: 1.0414 (1.0511) time: 4.0428 data: 0.0002 max mem: 54684 -[06:38:51.240169] Epoch: [1] [2090/3229] lr: 0.000026 grad_norm: 0.7045 (0.7574) closs: 0.9855 (1.0507) time: 4.0044 data: 0.0002 max mem: 54684 -[06:39:32.307282] Epoch: [1] [2100/3229] lr: 0.000026 grad_norm: 0.7700 (0.7575) closs: 0.9816 (1.0506) time: 4.0379 data: 0.0002 max mem: 54684 -[06:40:12.900721] Epoch: [1] [2110/3229] lr: 0.000026 grad_norm: 0.7683 (0.7572) closs: 1.0346 (1.0504) time: 4.0830 data: 0.0002 max mem: 54684 -[06:40:54.073838] Epoch: [1] [2120/3229] lr: 0.000026 grad_norm: 0.7175 (0.7572) closs: 1.0233 (1.0503) time: 4.0883 data: 0.0002 max mem: 54684 -[06:41:34.494204] Epoch: [1] [2130/3229] lr: 0.000026 grad_norm: 0.7567 (0.7575) closs: 1.0420 (1.0504) time: 4.0796 data: 0.0002 max mem: 54684 -[06:42:14.786527] Epoch: [1] [2140/3229] lr: 0.000026 grad_norm: 0.7746 (0.7574) closs: 1.0572 (1.0504) time: 4.0356 data: 0.0002 max mem: 54684 -[06:42:56.042219] Epoch: [1] [2150/3229] lr: 0.000026 grad_norm: 0.7436 (0.7574) closs: 1.0572 (1.0506) time: 4.0773 data: 0.0002 max mem: 54684 -[06:43:36.873461] Epoch: [1] [2160/3229] lr: 0.000026 grad_norm: 0.7565 (0.7574) closs: 1.1052 (1.0509) time: 4.1043 data: 0.0002 max mem: 54684 -[06:44:18.036372] Epoch: [1] [2170/3229] lr: 0.000026 grad_norm: 0.7559 (0.7573) closs: 1.0642 (1.0508) time: 4.0996 data: 0.0002 max mem: 54684 -[06:44:58.954633] Epoch: [1] [2180/3229] lr: 0.000026 grad_norm: 0.7512 (0.7574) closs: 1.0519 (1.0507) time: 4.1040 data: 0.0002 max mem: 54684 -[06:45:39.550709] Epoch: [1] [2190/3229] lr: 0.000026 grad_norm: 0.7721 (0.7580) closs: 1.0434 (1.0508) time: 4.0757 data: 0.0002 max mem: 54684 -[06:46:20.109102] Epoch: [1] [2200/3229] lr: 0.000025 grad_norm: 0.7721 (0.7579) closs: 1.0396 (1.0507) time: 4.0577 data: 0.0002 max mem: 54684 -[06:47:01.387992] Epoch: [1] [2210/3229] lr: 0.000025 grad_norm: 0.7774 (0.7582) closs: 1.0393 (1.0508) time: 4.0918 data: 0.0002 max mem: 54684 -[06:47:42.292906] Epoch: [1] [2220/3229] lr: 0.000025 grad_norm: 0.7702 (0.7580) closs: 1.0655 (1.0509) time: 4.1091 data: 0.0002 max mem: 54684 -[06:48:22.615690] Epoch: [1] [2230/3229] lr: 0.000025 grad_norm: 0.7125 (0.7578) closs: 1.0602 (1.0509) time: 4.0613 data: 0.0002 max mem: 54684 -[06:49:03.122150] Epoch: [1] [2240/3229] lr: 0.000025 grad_norm: 0.7564 (0.7580) closs: 1.0392 (1.0507) time: 4.0414 data: 0.0002 max mem: 54684 -[06:49:43.913988] Epoch: [1] [2250/3229] lr: 0.000025 grad_norm: 0.7275 (0.7587) closs: 1.0392 (1.0508) time: 4.0649 data: 0.0002 max mem: 54684 -[06:50:24.836181] Epoch: [1] [2260/3229] lr: 0.000025 grad_norm: 0.7275 (0.7588) closs: 1.0988 (1.0510) time: 4.0856 data: 0.0002 max mem: 54684 -[06:51:05.820250] Epoch: [1] [2270/3229] lr: 0.000025 grad_norm: 0.7384 (0.7587) closs: 1.0513 (1.0508) time: 4.0953 data: 0.0002 max mem: 54684 -[06:51:46.269588] Epoch: [1] [2280/3229] lr: 0.000025 grad_norm: 0.7730 (0.7588) closs: 1.0402 (1.0509) time: 4.0716 data: 0.0002 max mem: 54684 -[06:52:27.410955] Epoch: [1] [2290/3229] lr: 0.000025 grad_norm: 0.7238 (0.7586) closs: 1.0385 (1.0507) time: 4.0795 data: 0.0002 max mem: 54684 -[06:53:08.358905] Epoch: [1] [2300/3229] lr: 0.000025 grad_norm: 0.7204 (0.7586) closs: 1.0344 (1.0506) time: 4.1044 data: 0.0002 max mem: 54684 -[06:53:49.351820] Epoch: [1] [2310/3229] lr: 0.000025 grad_norm: 0.7408 (0.7585) closs: 1.0498 (1.0506) time: 4.0970 data: 0.0002 max mem: 54684 -[06:54:30.150638] Epoch: [1] [2320/3229] lr: 0.000025 grad_norm: 0.7644 (0.7586) closs: 1.0665 (1.0507) time: 4.0895 data: 0.0002 max mem: 54684 -[06:55:11.609155] Epoch: [1] [2330/3229] lr: 0.000024 grad_norm: 0.7644 (0.7586) closs: 1.0783 (1.0508) time: 4.1128 data: 0.0002 max mem: 54684 -[06:55:52.526582] Epoch: [1] [2340/3229] lr: 0.000024 grad_norm: 0.7481 (0.7587) closs: 1.0944 (1.0510) time: 4.1187 data: 0.0002 max mem: 54684 -[06:56:33.528322] Epoch: [1] [2350/3229] lr: 0.000024 grad_norm: 0.7847 (0.7587) closs: 1.1101 (1.0511) time: 4.0959 data: 0.0002 max mem: 54684 -[06:57:13.657474] Epoch: [1] [2360/3229] lr: 0.000024 grad_norm: 0.7256 (0.7585) closs: 1.0694 (1.0511) time: 4.0565 data: 0.0002 max mem: 54684 -[06:57:54.348834] Epoch: [1] [2370/3229] lr: 0.000024 grad_norm: 0.7192 (0.7584) closs: 1.0375 (1.0509) time: 4.0410 data: 0.0002 max mem: 54684 -[06:58:35.300759] Epoch: [1] [2380/3229] lr: 0.000024 grad_norm: 0.7204 (0.7584) closs: 1.0375 (1.0509) time: 4.0821 data: 0.0002 max mem: 54684 -[06:59:15.644906] Epoch: [1] [2390/3229] lr: 0.000024 grad_norm: 0.7417 (0.7583) closs: 1.0504 (1.0508) time: 4.0647 data: 0.0002 max mem: 54684 -[06:59:56.423928] Epoch: [1] [2400/3229] lr: 0.000024 grad_norm: 0.7580 (0.7583) closs: 1.0390 (1.0508) time: 4.0561 data: 0.0002 max mem: 54684 -[07:00:36.732262] Epoch: [1] [2410/3229] lr: 0.000024 grad_norm: 0.7551 (0.7580) closs: 1.0390 (1.0508) time: 4.0543 data: 0.0002 max mem: 54684 -[07:01:17.509223] Epoch: [1] [2420/3229] lr: 0.000024 grad_norm: 0.7224 (0.7579) closs: 1.0253 (1.0507) time: 4.0542 data: 0.0002 max mem: 54684 -[07:01:59.103175] Epoch: [1] [2430/3229] lr: 0.000024 grad_norm: 0.7534 (0.7580) closs: 1.0289 (1.0507) time: 4.1185 data: 0.0002 max mem: 54684 -[07:02:39.633371] Epoch: [1] [2440/3229] lr: 0.000024 grad_norm: 0.7523 (0.7579) closs: 1.0303 (1.0506) time: 4.1061 data: 0.0002 max mem: 54684 -[07:03:21.099615] Epoch: [1] [2450/3229] lr: 0.000024 grad_norm: 0.7399 (0.7580) closs: 1.0666 (1.0508) time: 4.0998 data: 0.0002 max mem: 54684 -[07:04:01.876483] Epoch: [1] [2460/3229] lr: 0.000023 grad_norm: 0.7601 (0.7597) closs: 1.0710 (1.0509) time: 4.1121 data: 0.0002 max mem: 54684 -[07:04:42.177832] Epoch: [1] [2470/3229] lr: 0.000023 grad_norm: 0.7070 (0.7594) closs: 1.0346 (1.0508) time: 4.0538 data: 0.0002 max mem: 54684 -[07:05:22.730871] Epoch: [1] [2480/3229] lr: 0.000023 grad_norm: 0.6798 (0.7592) closs: 1.0346 (1.0508) time: 4.0427 data: 0.0002 max mem: 54684 -[07:06:04.230691] Epoch: [1] [2490/3229] lr: 0.000023 grad_norm: 0.7520 (0.7593) closs: 1.0195 (1.0507) time: 4.1026 data: 0.0002 max mem: 54684 -[07:06:44.671239] Epoch: [1] [2500/3229] lr: 0.000023 grad_norm: 0.7559 (0.7592) closs: 1.0066 (1.0506) time: 4.0970 data: 0.0002 max mem: 54684 -[07:07:25.306096] Epoch: [1] [2510/3229] lr: 0.000023 grad_norm: 0.7394 (0.7590) closs: 1.0125 (1.0504) time: 4.0537 data: 0.0002 max mem: 54684 -[07:08:05.672440] Epoch: [1] [2520/3229] lr: 0.000023 grad_norm: 0.7341 (0.7589) closs: 1.0304 (1.0502) time: 4.0500 data: 0.0002 max mem: 54684 -[07:08:46.342159] Epoch: [1] [2530/3229] lr: 0.000023 grad_norm: 0.7582 (0.7590) closs: 1.0381 (1.0503) time: 4.0517 data: 0.0002 max mem: 54684 -[07:09:27.086959] Epoch: [1] [2540/3229] lr: 0.000023 grad_norm: 0.7684 (0.7590) closs: 1.0815 (1.0503) time: 4.0707 data: 0.0002 max mem: 54684 -[07:10:08.439021] Epoch: [1] [2550/3229] lr: 0.000023 grad_norm: 0.7684 (0.7590) closs: 1.0815 (1.0505) time: 4.1048 data: 0.0002 max mem: 54684 -[07:10:49.035524] Epoch: [1] [2560/3229] lr: 0.000023 grad_norm: 0.7595 (0.7590) closs: 1.0433 (1.0504) time: 4.0974 data: 0.0002 max mem: 54684 -[07:11:30.100693] Epoch: [1] [2570/3229] lr: 0.000023 grad_norm: 0.7435 (0.7588) closs: 1.0562 (1.0506) time: 4.0830 data: 0.0002 max mem: 54684 -[07:12:11.188418] Epoch: [1] [2580/3229] lr: 0.000023 grad_norm: 0.7404 (0.7589) closs: 1.1022 (1.0507) time: 4.1076 data: 0.0002 max mem: 54684 -[07:12:52.475637] Epoch: [1] [2590/3229] lr: 0.000022 grad_norm: 0.7772 (0.7591) closs: 1.0955 (1.0509) time: 4.1187 data: 0.0002 max mem: 54684 -[07:13:33.482943] Epoch: [1] [2600/3229] lr: 0.000022 grad_norm: 0.7876 (0.7590) closs: 1.0745 (1.0508) time: 4.1147 data: 0.0002 max mem: 54684 -[07:14:14.212298] Epoch: [1] [2610/3229] lr: 0.000022 grad_norm: 0.7625 (0.7589) closs: 1.0001 (1.0507) time: 4.0868 data: 0.0002 max mem: 54684 -[07:14:55.108401] Epoch: [1] [2620/3229] lr: 0.000022 grad_norm: 0.7431 (0.7588) closs: 1.0001 (1.0506) time: 4.0812 data: 0.0002 max mem: 54684 -[07:15:36.195488] Epoch: [1] [2630/3229] lr: 0.000022 grad_norm: 0.7435 (0.7590) closs: 1.0573 (1.0508) time: 4.0991 data: 0.0002 max mem: 54684 -[07:16:17.647209] Epoch: [1] [2640/3229] lr: 0.000022 grad_norm: 0.7787 (0.7590) closs: 1.0782 (1.0508) time: 4.1269 data: 0.0002 max mem: 54684 -[07:16:58.892659] Epoch: [1] [2650/3229] lr: 0.000022 grad_norm: 0.7769 (0.7590) closs: 1.0465 (1.0508) time: 4.1348 data: 0.0002 max mem: 54684 -[07:17:39.799115] Epoch: [1] [2660/3229] lr: 0.000022 grad_norm: 0.7668 (0.7590) closs: 1.0389 (1.0509) time: 4.1075 data: 0.0002 max mem: 54684 -[07:18:20.650599] Epoch: [1] [2670/3229] lr: 0.000022 grad_norm: 0.7837 (0.7592) closs: 1.0408 (1.0508) time: 4.0878 data: 0.0002 max mem: 54684 -[07:19:01.954445] Epoch: [1] [2680/3229] lr: 0.000022 grad_norm: 0.7896 (0.7591) closs: 1.0526 (1.0508) time: 4.1077 data: 0.0002 max mem: 54684 -[07:19:42.555933] Epoch: [1] [2690/3229] lr: 0.000022 grad_norm: 0.7348 (0.7591) closs: 1.0526 (1.0508) time: 4.0952 data: 0.0002 max mem: 54684 -[07:20:23.489822] Epoch: [1] [2700/3229] lr: 0.000022 grad_norm: 0.7487 (0.7592) closs: 1.0357 (1.0508) time: 4.0767 data: 0.0002 max mem: 54684 -[07:21:03.709034] Epoch: [1] [2710/3229] lr: 0.000022 grad_norm: 0.7536 (0.7593) closs: 1.0253 (1.0506) time: 4.0576 data: 0.0002 max mem: 54684 -[07:21:44.808335] Epoch: [1] [2720/3229] lr: 0.000021 grad_norm: 0.7750 (0.7593) closs: 1.0362 (1.0505) time: 4.0659 data: 0.0002 max mem: 54684 -[07:22:26.050486] Epoch: [1] [2730/3229] lr: 0.000021 grad_norm: 0.7431 (0.7593) closs: 1.0379 (1.0505) time: 4.1170 data: 0.0002 max mem: 54684 -[07:23:06.929188] Epoch: [1] [2740/3229] lr: 0.000021 grad_norm: 0.7314 (0.7592) closs: 1.0382 (1.0505) time: 4.1060 data: 0.0002 max mem: 54684 -[07:23:47.376913] Epoch: [1] [2750/3229] lr: 0.000021 grad_norm: 0.7727 (0.7592) closs: 1.0259 (1.0503) time: 4.0663 data: 0.0002 max mem: 54684 -[07:24:28.781962] Epoch: [1] [2760/3229] lr: 0.000021 grad_norm: 0.7543 (0.7591) closs: 1.0343 (1.0504) time: 4.0926 data: 0.0002 max mem: 54684 -[07:25:09.722107] Epoch: [1] [2770/3229] lr: 0.000021 grad_norm: 0.7543 (0.7592) closs: 1.0747 (1.0505) time: 4.1172 data: 0.0002 max mem: 54684 -[07:25:50.621826] Epoch: [1] [2780/3229] lr: 0.000021 grad_norm: 0.7577 (0.7591) closs: 1.0669 (1.0505) time: 4.0919 data: 0.0002 max mem: 54684 -[07:26:30.847780] Epoch: [1] [2790/3229] lr: 0.000021 grad_norm: 0.7133 (0.7590) closs: 1.0389 (1.0505) time: 4.0562 data: 0.0002 max mem: 54684 -[07:27:12.004919] Epoch: [1] [2800/3229] lr: 0.000021 grad_norm: 0.7479 (0.7588) closs: 1.0389 (1.0505) time: 4.0691 data: 0.0002 max mem: 54684 -[07:27:52.590244] Epoch: [1] [2810/3229] lr: 0.000021 grad_norm: 0.7538 (0.7590) closs: 1.0261 (1.0503) time: 4.0871 data: 0.0002 max mem: 54684 -[07:28:33.829143] Epoch: [1] [2820/3229] lr: 0.000021 grad_norm: 0.7608 (0.7591) closs: 1.0348 (1.0505) time: 4.0911 data: 0.0002 max mem: 54684 -[07:29:14.344969] Epoch: [1] [2830/3229] lr: 0.000021 grad_norm: 0.7575 (0.7589) closs: 1.0923 (1.0506) time: 4.0877 data: 0.0002 max mem: 54684 -[07:29:54.993752] Epoch: [1] [2840/3229] lr: 0.000021 grad_norm: 0.7351 (0.7589) closs: 1.0629 (1.0506) time: 4.0582 data: 0.0002 max mem: 54684 -[07:30:35.251355] Epoch: [1] [2850/3229] lr: 0.000020 grad_norm: 0.7529 (0.7587) closs: 1.0003 (1.0504) time: 4.0453 data: 0.0002 max mem: 54684 -[07:31:15.852435] Epoch: [1] [2860/3229] lr: 0.000020 grad_norm: 0.7370 (0.7586) closs: 1.0153 (1.0504) time: 4.0429 data: 0.0002 max mem: 54684 -[07:31:56.405734] Epoch: [1] [2870/3229] lr: 0.000020 grad_norm: 0.7265 (0.7583) closs: 1.0422 (1.0503) time: 4.0577 data: 0.0002 max mem: 54684 -[07:32:37.536609] Epoch: [1] [2880/3229] lr: 0.000020 grad_norm: 0.7202 (0.7582) closs: 1.0430 (1.0504) time: 4.0841 data: 0.0002 max mem: 54684 -[07:33:18.137999] Epoch: [1] [2890/3229] lr: 0.000020 grad_norm: 0.7069 (0.7580) closs: 1.0299 (1.0504) time: 4.0866 data: 0.0002 max mem: 54684 -[07:33:59.054025] Epoch: [1] [2900/3229] lr: 0.000020 grad_norm: 0.7126 (0.7579) closs: 0.9991 (1.0503) time: 4.0758 data: 0.0002 max mem: 54684 -[07:34:40.302514] Epoch: [1] [2910/3229] lr: 0.000020 grad_norm: 0.7618 (0.7580) closs: 1.0246 (1.0504) time: 4.1082 data: 0.0002 max mem: 54684 -[07:35:21.613397] Epoch: [1] [2920/3229] lr: 0.000020 grad_norm: 0.7726 (0.7581) closs: 1.0246 (1.0504) time: 4.1279 data: 0.0002 max mem: 54684 -[07:36:01.885012] Epoch: [1] [2930/3229] lr: 0.000020 grad_norm: 0.7795 (0.7580) closs: 1.0148 (1.0502) time: 4.0791 data: 0.0002 max mem: 54684 -[07:36:42.517521] Epoch: [1] [2940/3229] lr: 0.000020 grad_norm: 0.7532 (0.7580) closs: 1.0222 (1.0501) time: 4.0451 data: 0.0002 max mem: 54684 -[07:37:22.983411] Epoch: [1] [2950/3229] lr: 0.000020 grad_norm: 0.7556 (0.7579) closs: 1.0219 (1.0500) time: 4.0549 data: 0.0002 max mem: 54684 -[07:38:03.810435] Epoch: [1] [2960/3229] lr: 0.000020 grad_norm: 0.7329 (0.7577) closs: 1.0213 (1.0499) time: 4.0646 data: 0.0002 max mem: 54684 -[07:38:44.734197] Epoch: [1] [2970/3229] lr: 0.000020 grad_norm: 0.7669 (0.7578) closs: 1.0230 (1.0499) time: 4.0875 data: 0.0002 max mem: 54684 -[07:39:25.067815] Epoch: [1] [2980/3229] lr: 0.000020 grad_norm: 0.7443 (0.7575) closs: 1.0170 (1.0497) time: 4.0628 data: 0.0002 max mem: 54684 -[07:40:06.277140] Epoch: [1] [2990/3229] lr: 0.000019 grad_norm: 0.7237 (0.7575) closs: 1.0334 (1.0498) time: 4.0771 data: 0.0002 max mem: 54684 -[07:40:47.518619] Epoch: [1] [3000/3229] lr: 0.000019 grad_norm: 0.7443 (0.7576) closs: 1.0808 (1.0499) time: 4.1225 data: 0.0002 max mem: 54684 -[07:41:28.756394] Epoch: [1] [3010/3229] lr: 0.000019 grad_norm: 0.7306 (0.7576) closs: 1.0731 (1.0499) time: 4.1239 data: 0.0002 max mem: 54684 -[07:42:09.820428] Epoch: [1] [3020/3229] lr: 0.000019 grad_norm: 0.7376 (0.7576) closs: 1.0600 (1.0498) time: 4.1150 data: 0.0002 max mem: 54684 -[07:42:50.371264] Epoch: [1] [3030/3229] lr: 0.000019 grad_norm: 0.7764 (0.7576) closs: 1.0239 (1.0497) time: 4.0807 data: 0.0002 max mem: 54684 -[07:43:31.735769] Epoch: [1] [3040/3229] lr: 0.000019 grad_norm: 0.8111 (0.7577) closs: 1.0480 (1.0498) time: 4.0957 data: 0.0002 max mem: 54684 -[07:44:12.340935] Epoch: [1] [3050/3229] lr: 0.000019 grad_norm: 0.7614 (0.7576) closs: 1.0513 (1.0498) time: 4.0984 data: 0.0002 max mem: 54684 -[07:44:53.338278] Epoch: [1] [3060/3229] lr: 0.000019 grad_norm: 0.7472 (0.7577) closs: 1.0586 (1.0498) time: 4.0801 data: 0.0002 max mem: 54684 -[07:45:33.983619] Epoch: [1] [3070/3229] lr: 0.000019 grad_norm: 0.7472 (0.7576) closs: 1.0348 (1.0497) time: 4.0821 data: 0.0002 max mem: 54684 -[07:46:14.880787] Epoch: [1] [3080/3229] lr: 0.000019 grad_norm: 0.7548 (0.7576) closs: 1.0463 (1.0498) time: 4.0771 data: 0.0002 max mem: 54684 -[07:46:55.135508] Epoch: [1] [3090/3229] lr: 0.000019 grad_norm: 0.7275 (0.7574) closs: 1.0621 (1.0497) time: 4.0575 data: 0.0002 max mem: 54684 -[07:47:35.489073] Epoch: [1] [3100/3229] lr: 0.000019 grad_norm: 0.7098 (0.7573) closs: 0.9939 (1.0496) time: 4.0303 data: 0.0002 max mem: 54684 -[07:48:16.842485] Epoch: [1] [3110/3229] lr: 0.000019 grad_norm: 0.7606 (0.7574) closs: 1.0400 (1.0497) time: 4.0853 data: 0.0002 max mem: 54684 -[07:48:58.060831] Epoch: [1] [3120/3229] lr: 0.000019 grad_norm: 0.7567 (0.7574) closs: 1.0607 (1.0498) time: 4.1285 data: 0.0002 max mem: 54684 -[07:49:39.119706] Epoch: [1] [3130/3229] lr: 0.000018 grad_norm: 0.7286 (0.7572) closs: 1.0710 (1.0499) time: 4.1138 data: 0.0002 max mem: 54684 -[07:50:19.961668] Epoch: [1] [3140/3229] lr: 0.000018 grad_norm: 0.7239 (0.7572) closs: 1.0326 (1.0498) time: 4.0950 data: 0.0002 max mem: 54684 -[07:51:00.627490] Epoch: [1] [3150/3229] lr: 0.000018 grad_norm: 0.7701 (0.7572) closs: 1.0245 (1.0498) time: 4.0753 data: 0.0002 max mem: 54684 -[07:51:41.096988] Epoch: [1] [3160/3229] lr: 0.000018 grad_norm: 0.7701 (0.7572) closs: 1.0417 (1.0497) time: 4.0567 data: 0.0002 max mem: 54684 -[07:52:21.540824] Epoch: [1] [3170/3229] lr: 0.000018 grad_norm: 0.7199 (0.7571) closs: 1.0092 (1.0496) time: 4.0456 data: 0.0002 max mem: 54684 -[07:53:01.782122] Epoch: [1] [3180/3229] lr: 0.000018 grad_norm: 0.7290 (0.7570) closs: 1.0158 (1.0495) time: 4.0342 data: 0.0002 max mem: 54684 -[07:53:42.520581] Epoch: [1] [3190/3229] lr: 0.000018 grad_norm: 0.7290 (0.7569) closs: 1.0676 (1.0495) time: 4.0489 data: 0.0002 max mem: 54684 -[07:54:22.768996] Epoch: [1] [3200/3229] lr: 0.000018 grad_norm: 0.7260 (0.7568) closs: 1.0775 (1.0495) time: 4.0493 data: 0.0002 max mem: 54684 -[07:55:03.308755] Epoch: [1] [3210/3229] lr: 0.000018 grad_norm: 0.7512 (0.7568) closs: 1.0424 (1.0494) time: 4.0393 data: 0.0002 max mem: 54684 -[07:55:44.129491] Epoch: [1] [3220/3229] lr: 0.000018 grad_norm: 0.7508 (0.7567) closs: 1.0736 (1.0494) time: 4.0680 data: 0.0001 max mem: 54684 -[07:56:16.699747] Epoch: [1] Total time: 3:40:05 -[07:56:16.736123] Averaged stats: lr: 0.000018 grad_norm: 0.7281 (0.7566) closs: 1.0383 (1.0483) -[07:56:17.102213] model saved -[07:56:18.915517] optimizer saved -[07:56:18.916157] other rank-common saved -[07:56:18.922035] rank-specific saved -[07:56:18.936513] log_dir: ./output_dir -[07:56:31.268719] Epoch: [2] [0/3229] lr: 0.000018 grad_norm: 0.8262 (0.8262) closs: 1.0084 (1.0084) time: 12.3313 data: 8.2426 max mem: 54684 -[07:57:12.183294] Epoch: [2] [10/3229] lr: 0.000018 grad_norm: 0.7358 (0.7439) closs: 1.0242 (1.0370) time: 4.8405 data: 0.7495 max mem: 54684 -[07:57:52.839749] Epoch: [2] [20/3229] lr: 0.000018 grad_norm: 0.7358 (0.7614) closs: 1.0264 (1.0467) time: 4.0785 data: 0.0002 max mem: 54684 -[07:58:33.435224] Epoch: [2] [30/3229] lr: 0.000018 grad_norm: 0.7811 (0.7651) closs: 1.0351 (1.0354) time: 4.0625 data: 0.0002 max mem: 54684 -[07:59:15.317025] Epoch: [2] [40/3229] lr: 0.000017 grad_norm: 0.7708 (0.7695) closs: 1.0460 (1.0453) time: 4.1238 data: 0.0002 max mem: 54684 -[07:59:56.906136] Epoch: [2] [50/3229] lr: 0.000017 grad_norm: 0.7742 (0.7745) closs: 1.0633 (1.0491) time: 4.1735 data: 0.0002 max mem: 54684 -[08:00:37.793068] Epoch: [2] [60/3229] lr: 0.000017 grad_norm: 0.7780 (0.7750) closs: 1.0396 (1.0518) time: 4.1237 data: 0.0002 max mem: 54684 -[08:01:17.989880] Epoch: [2] [70/3229] lr: 0.000017 grad_norm: 0.7406 (0.7669) closs: 1.0464 (1.0460) time: 4.0541 data: 0.0002 max mem: 54684 -[08:01:59.026748] Epoch: [2] [80/3229] lr: 0.000017 grad_norm: 0.7321 (0.7684) closs: 1.0689 (1.0462) time: 4.0616 data: 0.0002 max mem: 54684 -[08:02:39.476084] Epoch: [2] [90/3229] lr: 0.000017 grad_norm: 0.7710 (0.7672) closs: 1.0697 (1.0457) time: 4.0742 data: 0.0002 max mem: 54684 -[08:03:20.882001] Epoch: [2] [100/3229] lr: 0.000017 grad_norm: 0.7731 (0.7683) closs: 1.0697 (1.0507) time: 4.0927 data: 0.0002 max mem: 54684 -[08:04:01.732738] Epoch: [2] [110/3229] lr: 0.000017 grad_norm: 0.7636 (0.7693) closs: 1.0633 (1.0496) time: 4.1128 data: 0.0002 max mem: 54684 -[08:04:43.243616] Epoch: [2] [120/3229] lr: 0.000017 grad_norm: 0.7653 (0.7695) closs: 1.0444 (1.0471) time: 4.1180 data: 0.0002 max mem: 54684 -[08:05:24.640326] Epoch: [2] [130/3229] lr: 0.000017 grad_norm: 0.7895 (0.7719) closs: 1.0530 (1.0490) time: 4.1453 data: 0.0002 max mem: 54684 -[08:06:04.919803] Epoch: [2] [140/3229] lr: 0.000017 grad_norm: 0.7767 (0.7695) closs: 1.0625 (1.0491) time: 4.0837 data: 0.0002 max mem: 54684 -[08:06:45.485984] Epoch: [2] [150/3229] lr: 0.000017 grad_norm: 0.7487 (0.7669) closs: 1.0756 (1.0476) time: 4.0422 data: 0.0002 max mem: 54684 -[08:07:26.589317] Epoch: [2] [160/3229] lr: 0.000017 grad_norm: 0.7543 (0.7668) closs: 1.0570 (1.0491) time: 4.0834 data: 0.0002 max mem: 54684 -[08:08:07.809141] Epoch: [2] [170/3229] lr: 0.000017 grad_norm: 0.7705 (0.7656) closs: 1.0286 (1.0475) time: 4.1161 data: 0.0002 max mem: 54684 -[08:08:48.382644] Epoch: [2] [180/3229] lr: 0.000016 grad_norm: 0.7462 (0.7654) closs: 1.0195 (1.0463) time: 4.0896 data: 0.0002 max mem: 54684 -[08:09:29.267697] Epoch: [2] [190/3229] lr: 0.000016 grad_norm: 0.7566 (0.7639) closs: 1.0265 (1.0465) time: 4.0729 data: 0.0002 max mem: 54684 -[08:10:09.377776] Epoch: [2] [200/3229] lr: 0.000016 grad_norm: 0.7441 (0.7628) closs: 1.0265 (1.0435) time: 4.0497 data: 0.0002 max mem: 54684 -[08:10:49.302139] Epoch: [2] [210/3229] lr: 0.000016 grad_norm: 0.6802 (0.7596) closs: 0.9949 (1.0425) time: 4.0017 data: 0.0002 max mem: 54684 -[08:11:30.519323] Epoch: [2] [220/3229] lr: 0.000016 grad_norm: 0.7500 (0.7609) closs: 1.0888 (1.0456) time: 4.0570 data: 0.0002 max mem: 54684 -[08:12:11.704337] Epoch: [2] [230/3229] lr: 0.000016 grad_norm: 0.7773 (0.7609) closs: 1.0791 (1.0452) time: 4.1200 data: 0.0002 max mem: 54684 -[08:12:52.393373] Epoch: [2] [240/3229] lr: 0.000016 grad_norm: 0.7641 (0.7595) closs: 1.0536 (1.0445) time: 4.0936 data: 0.0002 max mem: 54684 -[08:13:33.064691] Epoch: [2] [250/3229] lr: 0.000016 grad_norm: 0.7542 (0.7585) closs: 1.0588 (1.0449) time: 4.0680 data: 0.0002 max mem: 54684 -[08:14:14.045473] Epoch: [2] [260/3229] lr: 0.000016 grad_norm: 0.7569 (0.7583) closs: 1.0562 (1.0460) time: 4.0825 data: 0.0002 max mem: 54684 -[08:14:54.905661] Epoch: [2] [270/3229] lr: 0.000016 grad_norm: 0.7574 (0.7579) closs: 1.0383 (1.0455) time: 4.0920 data: 0.0002 max mem: 54684 -[08:15:35.926717] Epoch: [2] [280/3229] lr: 0.000016 grad_norm: 0.7558 (0.7577) closs: 1.0300 (1.0455) time: 4.0940 data: 0.0002 max mem: 54684 -[08:16:16.814567] Epoch: [2] [290/3229] lr: 0.000016 grad_norm: 0.7416 (0.7564) closs: 1.0680 (1.0467) time: 4.0954 data: 0.0002 max mem: 54684 -[08:16:58.068460] Epoch: [2] [300/3229] lr: 0.000016 grad_norm: 0.7514 (0.7573) closs: 1.0594 (1.0468) time: 4.1070 data: 0.0002 max mem: 54684 -[08:17:38.927633] Epoch: [2] [310/3229] lr: 0.000016 grad_norm: 0.7797 (0.7577) closs: 1.0474 (1.0460) time: 4.1056 data: 0.0002 max mem: 54684 -[08:18:19.652536] Epoch: [2] [320/3229] lr: 0.000016 grad_norm: 0.7461 (0.7563) closs: 1.0455 (1.0458) time: 4.0791 data: 0.0002 max mem: 54684 -[08:19:00.882256] Epoch: [2] [330/3229] lr: 0.000015 grad_norm: 0.7702 (0.7581) closs: 1.0466 (1.0463) time: 4.0977 data: 0.0002 max mem: 54684 -[08:19:40.850985] Epoch: [2] [340/3229] lr: 0.000015 grad_norm: 0.7544 (0.7554) closs: 1.0466 (1.0460) time: 4.0599 data: 0.0002 max mem: 54684 -[08:20:21.704908] Epoch: [2] [350/3229] lr: 0.000015 grad_norm: 0.7372 (0.7563) closs: 1.0080 (1.0453) time: 4.0411 data: 0.0002 max mem: 54684 -[08:21:02.268285] Epoch: [2] [360/3229] lr: 0.000015 grad_norm: 0.8174 (0.7568) closs: 0.9763 (1.0430) time: 4.0708 data: 0.0002 max mem: 54684 -[08:21:42.525189] Epoch: [2] [370/3229] lr: 0.000015 grad_norm: 0.7369 (0.7568) closs: 0.9641 (1.0418) time: 4.0410 data: 0.0002 max mem: 54684 -[08:22:23.507510] Epoch: [2] [380/3229] lr: 0.000015 grad_norm: 0.7944 (0.7578) closs: 1.0548 (1.0421) time: 4.0619 data: 0.0002 max mem: 54684 -[08:23:04.788986] Epoch: [2] [390/3229] lr: 0.000015 grad_norm: 0.7995 (0.7582) closs: 1.0502 (1.0420) time: 4.1131 data: 0.0002 max mem: 54684 -[08:23:45.036107] Epoch: [2] [400/3229] lr: 0.000015 grad_norm: 0.7850 (0.7579) closs: 1.0068 (1.0413) time: 4.0764 data: 0.0002 max mem: 54684 -[08:24:25.660061] Epoch: [2] [410/3229] lr: 0.000015 grad_norm: 0.7850 (0.7576) closs: 1.0540 (1.0410) time: 4.0435 data: 0.0002 max mem: 54684 -[08:25:06.015694] Epoch: [2] [420/3229] lr: 0.000015 grad_norm: 0.7293 (0.7570) closs: 1.0468 (1.0396) time: 4.0489 data: 0.0002 max mem: 54684 -[08:25:46.999427] Epoch: [2] [430/3229] lr: 0.000015 grad_norm: 0.7238 (0.7570) closs: 1.0225 (1.0398) time: 4.0669 data: 0.0002 max mem: 54684 -[08:26:27.949780] Epoch: [2] [440/3229] lr: 0.000015 grad_norm: 0.7509 (0.7575) closs: 1.0472 (1.0399) time: 4.0966 data: 0.0002 max mem: 54684 -[08:27:08.849435] Epoch: [2] [450/3229] lr: 0.000015 grad_norm: 0.7973 (0.7579) closs: 1.0547 (1.0396) time: 4.0924 data: 0.0002 max mem: 54684 -[08:27:49.906838] Epoch: [2] [460/3229] lr: 0.000015 grad_norm: 0.7565 (0.7575) closs: 0.9997 (1.0391) time: 4.0978 data: 0.0002 max mem: 54684 -[08:28:30.907563] Epoch: [2] [470/3229] lr: 0.000015 grad_norm: 0.7846 (0.7587) closs: 1.0305 (1.0388) time: 4.1028 data: 0.0002 max mem: 54684 -[08:29:11.501494] Epoch: [2] [480/3229] lr: 0.000015 grad_norm: 0.7925 (0.7583) closs: 1.0507 (1.0382) time: 4.0797 data: 0.0002 max mem: 54684 -[08:29:52.450393] Epoch: [2] [490/3229] lr: 0.000014 grad_norm: 0.7658 (0.7582) closs: 1.0217 (1.0378) time: 4.0771 data: 0.0002 max mem: 54684 -[08:30:33.131092] Epoch: [2] [500/3229] lr: 0.000014 grad_norm: 0.7658 (0.7582) closs: 1.0195 (1.0377) time: 4.0814 data: 0.0002 max mem: 54684 -[08:31:13.731758] Epoch: [2] [510/3229] lr: 0.000014 grad_norm: 0.7490 (0.7574) closs: 1.0083 (1.0372) time: 4.0640 data: 0.0002 max mem: 54684 -[08:31:54.606631] Epoch: [2] [520/3229] lr: 0.000014 grad_norm: 0.7661 (0.7578) closs: 1.0242 (1.0374) time: 4.0737 data: 0.0002 max mem: 54684 -[08:32:35.652465] Epoch: [2] [530/3229] lr: 0.000014 grad_norm: 0.7661 (0.7574) closs: 1.0650 (1.0373) time: 4.0960 data: 0.0002 max mem: 54684 -[08:33:16.811929] Epoch: [2] [540/3229] lr: 0.000014 grad_norm: 0.7808 (0.7580) closs: 1.0536 (1.0379) time: 4.1102 data: 0.0002 max mem: 54684 -[08:33:58.098430] Epoch: [2] [550/3229] lr: 0.000014 grad_norm: 0.7890 (0.7584) closs: 1.0505 (1.0379) time: 4.1222 data: 0.0002 max mem: 54684 -[08:34:39.012102] Epoch: [2] [560/3229] lr: 0.000014 grad_norm: 0.7536 (0.7579) closs: 1.0541 (1.0380) time: 4.1099 data: 0.0002 max mem: 54684 -[08:35:20.411837] Epoch: [2] [570/3229] lr: 0.000014 grad_norm: 0.7575 (0.7583) closs: 1.0281 (1.0376) time: 4.1156 data: 0.0002 max mem: 54684 -[08:36:01.585232] Epoch: [2] [580/3229] lr: 0.000014 grad_norm: 0.7735 (0.7588) closs: 1.0281 (1.0379) time: 4.1286 data: 0.0002 max mem: 54684 -[08:36:42.866022] Epoch: [2] [590/3229] lr: 0.000014 grad_norm: 0.7978 (0.7597) closs: 1.0419 (1.0385) time: 4.1226 data: 0.0002 max mem: 54684 -[08:37:24.134897] Epoch: [2] [600/3229] lr: 0.000014 grad_norm: 0.7944 (0.7605) closs: 1.0606 (1.0390) time: 4.1274 data: 0.0002 max mem: 54684 -[08:38:05.564805] Epoch: [2] [610/3229] lr: 0.000014 grad_norm: 0.7800 (0.7610) closs: 1.0638 (1.0396) time: 4.1349 data: 0.0002 max mem: 54684 -[08:38:47.052757] Epoch: [2] [620/3229] lr: 0.000014 grad_norm: 0.7830 (0.7618) closs: 1.0701 (1.0400) time: 4.1458 data: 0.0002 max mem: 54684 -[08:39:28.070271] Epoch: [2] [630/3229] lr: 0.000014 grad_norm: 0.7867 (0.7620) closs: 1.0398 (1.0394) time: 4.1252 data: 0.0002 max mem: 54684 -[08:40:09.431657] Epoch: [2] [640/3229] lr: 0.000014 grad_norm: 0.7636 (0.7620) closs: 1.0375 (1.0393) time: 4.1189 data: 0.0002 max mem: 54684 -[08:40:50.029316] Epoch: [2] [650/3229] lr: 0.000013 grad_norm: 0.7721 (0.7620) closs: 1.0376 (1.0392) time: 4.0979 data: 0.0002 max mem: 54684 -[08:41:31.253698] Epoch: [2] [660/3229] lr: 0.000013 grad_norm: 0.8149 (0.7639) closs: 1.0376 (1.0390) time: 4.0910 data: 0.0002 max mem: 54684 -[08:42:11.674957] Epoch: [2] [670/3229] lr: 0.000013 grad_norm: 0.7910 (0.7640) closs: 0.9759 (1.0377) time: 4.0822 data: 0.0002 max mem: 54684 -[08:42:52.587904] Epoch: [2] [680/3229] lr: 0.000013 grad_norm: 0.8076 (0.7644) closs: 1.0142 (1.0376) time: 4.0666 data: 0.0002 max mem: 54684 -[08:43:33.204090] Epoch: [2] [690/3229] lr: 0.000013 grad_norm: 0.7571 (0.7643) closs: 1.0510 (1.0373) time: 4.0764 data: 0.0002 max mem: 54684 -[08:44:13.752622] Epoch: [2] [700/3229] lr: 0.000013 grad_norm: 0.7474 (0.7643) closs: 1.0141 (1.0369) time: 4.0582 data: 0.0002 max mem: 54684 -[08:44:54.180675] Epoch: [2] [710/3229] lr: 0.000013 grad_norm: 0.7605 (0.7641) closs: 1.0175 (1.0365) time: 4.0488 data: 0.0002 max mem: 54684 -[08:45:34.801694] Epoch: [2] [720/3229] lr: 0.000013 grad_norm: 0.7678 (0.7645) closs: 1.0230 (1.0360) time: 4.0524 data: 0.0002 max mem: 54684 -[08:46:15.397922] Epoch: [2] [730/3229] lr: 0.000013 grad_norm: 0.7292 (0.7638) closs: 1.0149 (1.0362) time: 4.0608 data: 0.0002 max mem: 54684 -[08:46:56.241797] Epoch: [2] [740/3229] lr: 0.000013 grad_norm: 0.7162 (0.7634) closs: 1.0468 (1.0362) time: 4.0719 data: 0.0002 max mem: 54684 -[08:47:37.401308] Epoch: [2] [750/3229] lr: 0.000013 grad_norm: 0.7125 (0.7625) closs: 1.0468 (1.0361) time: 4.1001 data: 0.0002 max mem: 54684 -[08:48:17.975003] Epoch: [2] [760/3229] lr: 0.000013 grad_norm: 0.7230 (0.7623) closs: 0.9952 (1.0357) time: 4.0866 data: 0.0002 max mem: 54684 -[08:48:58.599971] Epoch: [2] [770/3229] lr: 0.000013 grad_norm: 0.7391 (0.7618) closs: 0.9999 (1.0354) time: 4.0599 data: 0.0002 max mem: 54684 -[08:49:39.723248] Epoch: [2] [780/3229] lr: 0.000013 grad_norm: 0.7642 (0.7620) closs: 1.0226 (1.0358) time: 4.0873 data: 0.0002 max mem: 54684 -[08:50:20.513740] Epoch: [2] [790/3229] lr: 0.000013 grad_norm: 0.7684 (0.7622) closs: 1.0461 (1.0352) time: 4.0956 data: 0.0002 max mem: 54684 -[08:51:00.788007] Epoch: [2] [800/3229] lr: 0.000013 grad_norm: 0.7667 (0.7621) closs: 1.0416 (1.0349) time: 4.0532 data: 0.0002 max mem: 54684 -[08:51:42.084938] Epoch: [2] [810/3229] lr: 0.000012 grad_norm: 0.7514 (0.7623) closs: 1.0492 (1.0351) time: 4.0785 data: 0.0002 max mem: 54684 -[08:52:22.683852] Epoch: [2] [820/3229] lr: 0.000012 grad_norm: 0.7899 (0.7626) closs: 1.0387 (1.0346) time: 4.0947 data: 0.0002 max mem: 54684 -[08:53:03.136708] Epoch: [2] [830/3229] lr: 0.000012 grad_norm: 0.7319 (0.7623) closs: 1.0123 (1.0341) time: 4.0525 data: 0.0002 max mem: 54684 -[08:53:44.061472] Epoch: [2] [840/3229] lr: 0.000012 grad_norm: 0.7243 (0.7622) closs: 1.0273 (1.0344) time: 4.0688 data: 0.0002 max mem: 54684 -[08:54:24.982996] Epoch: [2] [850/3229] lr: 0.000012 grad_norm: 0.7494 (0.7625) closs: 1.0501 (1.0346) time: 4.0923 data: 0.0002 max mem: 54684 -[08:55:05.938568] Epoch: [2] [860/3229] lr: 0.000012 grad_norm: 0.7728 (0.7625) closs: 1.0500 (1.0344) time: 4.0938 data: 0.0002 max mem: 54684 -[08:55:47.280626] Epoch: [2] [870/3229] lr: 0.000012 grad_norm: 0.7828 (0.7630) closs: 1.0542 (1.0347) time: 4.1148 data: 0.0002 max mem: 54684 -[08:56:27.932374] Epoch: [2] [880/3229] lr: 0.000012 grad_norm: 0.7772 (0.7629) closs: 1.0480 (1.0344) time: 4.0996 data: 0.0002 max mem: 54684 -[08:57:09.157992] Epoch: [2] [890/3229] lr: 0.000012 grad_norm: 0.7710 (0.7629) closs: 1.0352 (1.0346) time: 4.0938 data: 0.0002 max mem: 54684 -[08:57:50.048167] Epoch: [2] [900/3229] lr: 0.000012 grad_norm: 0.7654 (0.7630) closs: 1.0552 (1.0348) time: 4.1057 data: 0.0002 max mem: 54684 -[08:58:31.441896] Epoch: [2] [910/3229] lr: 0.000012 grad_norm: 0.7748 (0.7631) closs: 1.0552 (1.0352) time: 4.1141 data: 0.0002 max mem: 54684 -[08:59:12.358818] Epoch: [2] [920/3229] lr: 0.000012 grad_norm: 0.7834 (0.7633) closs: 1.0587 (1.0353) time: 4.1155 data: 0.0002 max mem: 54684 -[08:59:53.384497] Epoch: [2] [930/3229] lr: 0.000012 grad_norm: 0.7600 (0.7639) closs: 1.0610 (1.0358) time: 4.0971 data: 0.0002 max mem: 54684 -[09:00:34.296386] Epoch: [2] [940/3229] lr: 0.000012 grad_norm: 0.7367 (0.7634) closs: 1.0361 (1.0355) time: 4.0968 data: 0.0002 max mem: 54684 -[09:01:15.300491] Epoch: [2] [950/3229] lr: 0.000012 grad_norm: 0.7532 (0.7637) closs: 1.0273 (1.0357) time: 4.0957 data: 0.0002 max mem: 54684 -[09:01:56.214466] Epoch: [2] [960/3229] lr: 0.000012 grad_norm: 0.8101 (0.7637) closs: 1.0385 (1.0358) time: 4.0958 data: 0.0002 max mem: 54684 -[09:02:37.461375] Epoch: [2] [970/3229] lr: 0.000012 grad_norm: 0.7784 (0.7640) closs: 1.0385 (1.0359) time: 4.1080 data: 0.0002 max mem: 54684 -[09:03:18.133192] Epoch: [2] [980/3229] lr: 0.000012 grad_norm: 0.7784 (0.7641) closs: 1.0226 (1.0356) time: 4.0959 data: 0.0002 max mem: 54684 -[09:03:58.519062] Epoch: [2] [990/3229] lr: 0.000011 grad_norm: 0.7619 (0.7637) closs: 1.0127 (1.0353) time: 4.0528 data: 0.0002 max mem: 54684 -[09:04:39.761660] Epoch: [2] [1000/3229] lr: 0.000011 grad_norm: 0.7440 (0.7638) closs: 1.0497 (1.0358) time: 4.0814 data: 0.0002 max mem: 54684 -[09:05:20.726355] Epoch: [2] [1010/3229] lr: 0.000011 grad_norm: 0.7440 (0.7637) closs: 1.0649 (1.0361) time: 4.1103 data: 0.0002 max mem: 54684 -[09:06:00.952110] Epoch: [2] [1020/3229] lr: 0.000011 grad_norm: 0.7606 (0.7640) closs: 1.0374 (1.0356) time: 4.0595 data: 0.0002 max mem: 54684 -[09:06:41.558460] Epoch: [2] [1030/3229] lr: 0.000011 grad_norm: 0.7384 (0.7636) closs: 1.0224 (1.0354) time: 4.0415 data: 0.0002 max mem: 54684 -[09:07:22.293637] Epoch: [2] [1040/3229] lr: 0.000011 grad_norm: 0.7438 (0.7637) closs: 1.0008 (1.0349) time: 4.0670 data: 0.0002 max mem: 54684 -[09:08:03.484133] Epoch: [2] [1050/3229] lr: 0.000011 grad_norm: 0.7482 (0.7634) closs: 1.0428 (1.0352) time: 4.0962 data: 0.0002 max mem: 54684 -[09:08:44.382701] Epoch: [2] [1060/3229] lr: 0.000011 grad_norm: 0.7485 (0.7636) closs: 1.0560 (1.0352) time: 4.1044 data: 0.0002 max mem: 54684 -[09:09:25.808204] Epoch: [2] [1070/3229] lr: 0.000011 grad_norm: 0.7534 (0.7633) closs: 1.0265 (1.0352) time: 4.1161 data: 0.0002 max mem: 54684 -[09:10:06.913588] Epoch: [2] [1080/3229] lr: 0.000011 grad_norm: 0.7569 (0.7634) closs: 1.0402 (1.0353) time: 4.1265 data: 0.0002 max mem: 54684 -[09:10:47.784050] Epoch: [2] [1090/3229] lr: 0.000011 grad_norm: 0.7794 (0.7635) closs: 1.0821 (1.0357) time: 4.0987 data: 0.0002 max mem: 54684 -[09:11:29.039121] Epoch: [2] [1100/3229] lr: 0.000011 grad_norm: 0.7814 (0.7636) closs: 1.0679 (1.0358) time: 4.1062 data: 0.0002 max mem: 54684 -[09:12:10.373961] Epoch: [2] [1110/3229] lr: 0.000011 grad_norm: 0.7643 (0.7639) closs: 1.0508 (1.0360) time: 4.1294 data: 0.0002 max mem: 54684 -[09:12:51.449473] Epoch: [2] [1120/3229] lr: 0.000011 grad_norm: 0.7562 (0.7638) closs: 1.0466 (1.0358) time: 4.1205 data: 0.0002 max mem: 54684 -[09:13:32.329462] Epoch: [2] [1130/3229] lr: 0.000011 grad_norm: 0.7943 (0.7646) closs: 1.0470 (1.0359) time: 4.0977 data: 0.0002 max mem: 54684 -[09:14:12.321437] Epoch: [2] [1140/3229] lr: 0.000011 grad_norm: 0.8189 (0.7647) closs: 1.0094 (1.0354) time: 4.0435 data: 0.0002 max mem: 54684 -[09:14:53.353944] Epoch: [2] [1150/3229] lr: 0.000011 grad_norm: 0.8029 (0.7651) closs: 0.9823 (1.0353) time: 4.0512 data: 0.0002 max mem: 54684 -[09:15:34.257366] Epoch: [2] [1160/3229] lr: 0.000011 grad_norm: 0.8163 (0.7653) closs: 1.0455 (1.0354) time: 4.0967 data: 0.0002 max mem: 54684 -[09:16:14.798544] Epoch: [2] [1170/3229] lr: 0.000011 grad_norm: 0.8128 (0.7652) closs: 1.0318 (1.0353) time: 4.0722 data: 0.0002 max mem: 54684 -[09:16:55.872961] Epoch: [2] [1180/3229] lr: 0.000010 grad_norm: 0.7566 (0.7651) closs: 1.0318 (1.0353) time: 4.0807 data: 0.0002 max mem: 54684 -[09:17:36.505833] Epoch: [2] [1190/3229] lr: 0.000010 grad_norm: 0.7433 (0.7650) closs: 1.0522 (1.0355) time: 4.0853 data: 0.0002 max mem: 54684 -[09:18:17.759241] Epoch: [2] [1200/3229] lr: 0.000010 grad_norm: 0.7621 (0.7651) closs: 1.0598 (1.0356) time: 4.0942 data: 0.0002 max mem: 54684 -[09:18:59.258398] Epoch: [2] [1210/3229] lr: 0.000010 grad_norm: 0.7694 (0.7652) closs: 1.0676 (1.0360) time: 4.1376 data: 0.0002 max mem: 54684 -[09:19:40.458187] Epoch: [2] [1220/3229] lr: 0.000010 grad_norm: 0.7466 (0.7650) closs: 1.0545 (1.0359) time: 4.1349 data: 0.0002 max mem: 54684 -[09:20:21.687691] Epoch: [2] [1230/3229] lr: 0.000010 grad_norm: 0.7782 (0.7653) closs: 1.0469 (1.0362) time: 4.1214 data: 0.0002 max mem: 54684 -[09:21:02.306360] Epoch: [2] [1240/3229] lr: 0.000010 grad_norm: 0.7785 (0.7651) closs: 1.0469 (1.0361) time: 4.0923 data: 0.0002 max mem: 54684 -[09:21:43.494418] Epoch: [2] [1250/3229] lr: 0.000010 grad_norm: 0.7669 (0.7654) closs: 1.0804 (1.0366) time: 4.0903 data: 0.0002 max mem: 54684 -[09:22:24.223450] Epoch: [2] [1260/3229] lr: 0.000010 grad_norm: 0.7949 (0.7653) closs: 1.0789 (1.0365) time: 4.0958 data: 0.0002 max mem: 54684 -[09:23:05.137178] Epoch: [2] [1270/3229] lr: 0.000010 grad_norm: 0.7839 (0.7655) closs: 1.0398 (1.0366) time: 4.0821 data: 0.0002 max mem: 54684 -[09:23:45.713305] Epoch: [2] [1280/3229] lr: 0.000010 grad_norm: 0.7820 (0.7654) closs: 1.0345 (1.0364) time: 4.0744 data: 0.0002 max mem: 54684 -[09:24:26.883899] Epoch: [2] [1290/3229] lr: 0.000010 grad_norm: 0.7848 (0.7655) closs: 1.0622 (1.0368) time: 4.0873 data: 0.0002 max mem: 54684 -[09:25:07.882033] Epoch: [2] [1300/3229] lr: 0.000010 grad_norm: 0.7626 (0.7653) closs: 1.0601 (1.0370) time: 4.1084 data: 0.0002 max mem: 54684 -[09:25:48.788547] Epoch: [2] [1310/3229] lr: 0.000010 grad_norm: 0.7626 (0.7655) closs: 1.0317 (1.0368) time: 4.0952 data: 0.0002 max mem: 54684 -[09:26:29.719803] Epoch: [2] [1320/3229] lr: 0.000010 grad_norm: 0.7593 (0.7652) closs: 1.0151 (1.0369) time: 4.0918 data: 0.0002 max mem: 54684 -[09:27:10.299624] Epoch: [2] [1330/3229] lr: 0.000010 grad_norm: 0.7602 (0.7652) closs: 1.0215 (1.0365) time: 4.0755 data: 0.0002 max mem: 54684 -[09:27:50.751310] Epoch: [2] [1340/3229] lr: 0.000010 grad_norm: 0.7643 (0.7651) closs: 0.9875 (1.0362) time: 4.0515 data: 0.0002 max mem: 54684 -[09:28:31.398595] Epoch: [2] [1350/3229] lr: 0.000010 grad_norm: 0.7591 (0.7650) closs: 0.9875 (1.0358) time: 4.0549 data: 0.0002 max mem: 54684 -[09:29:12.305782] Epoch: [2] [1360/3229] lr: 0.000010 grad_norm: 0.7741 (0.7651) closs: 1.0085 (1.0361) time: 4.0777 data: 0.0001 max mem: 54684 -[09:29:53.571306] Epoch: [2] [1370/3229] lr: 0.000010 grad_norm: 0.7792 (0.7653) closs: 1.0583 (1.0362) time: 4.1086 data: 0.0002 max mem: 54684 -[09:30:34.513851] Epoch: [2] [1380/3229] lr: 0.000009 grad_norm: 0.8100 (0.7656) closs: 1.0583 (1.0366) time: 4.1103 data: 0.0001 max mem: 54684 -[09:31:15.446793] Epoch: [2] [1390/3229] lr: 0.000009 grad_norm: 0.8007 (0.7656) closs: 1.0600 (1.0367) time: 4.0937 data: 0.0001 max mem: 54684 -[09:31:56.718293] Epoch: [2] [1400/3229] lr: 0.000009 grad_norm: 0.7785 (0.7658) closs: 1.0538 (1.0368) time: 4.1101 data: 0.0001 max mem: 54684 -[09:32:37.066477] Epoch: [2] [1410/3229] lr: 0.000009 grad_norm: 0.7777 (0.7654) closs: 1.0459 (1.0365) time: 4.0809 data: 0.0002 max mem: 54684 -[09:33:18.072485] Epoch: [2] [1420/3229] lr: 0.000009 grad_norm: 0.7777 (0.7656) closs: 1.0269 (1.0366) time: 4.0676 data: 0.0001 max mem: 54684 -[09:33:58.688418] Epoch: [2] [1430/3229] lr: 0.000009 grad_norm: 0.7950 (0.7658) closs: 1.0548 (1.0367) time: 4.0810 data: 0.0002 max mem: 54684 -[09:34:40.028938] Epoch: [2] [1440/3229] lr: 0.000009 grad_norm: 0.7547 (0.7655) closs: 1.0359 (1.0365) time: 4.0977 data: 0.0002 max mem: 54684 -[09:35:20.652994] Epoch: [2] [1450/3229] lr: 0.000009 grad_norm: 0.7292 (0.7654) closs: 1.0334 (1.0363) time: 4.0982 data: 0.0001 max mem: 54684 -[09:36:00.716699] Epoch: [2] [1460/3229] lr: 0.000009 grad_norm: 0.7434 (0.7651) closs: 0.9868 (1.0360) time: 4.0343 data: 0.0002 max mem: 54684 -[09:36:41.682870] Epoch: [2] [1470/3229] lr: 0.000009 grad_norm: 0.7452 (0.7652) closs: 0.9904 (1.0359) time: 4.0514 data: 0.0001 max mem: 54684 -[09:37:22.665178] Epoch: [2] [1480/3229] lr: 0.000009 grad_norm: 0.7568 (0.7653) closs: 1.0227 (1.0358) time: 4.0974 data: 0.0001 max mem: 54684 -[09:38:03.979174] Epoch: [2] [1490/3229] lr: 0.000009 grad_norm: 0.7775 (0.7653) closs: 1.0474 (1.0360) time: 4.1147 data: 0.0002 max mem: 54684 -[09:38:45.030747] Epoch: [2] [1500/3229] lr: 0.000009 grad_norm: 0.7560 (0.7653) closs: 1.0474 (1.0360) time: 4.1182 data: 0.0002 max mem: 54684 -[09:39:25.831863] Epoch: [2] [1510/3229] lr: 0.000009 grad_norm: 0.7644 (0.7652) closs: 1.0295 (1.0358) time: 4.0926 data: 0.0001 max mem: 54684 -[09:40:06.337863] Epoch: [2] [1520/3229] lr: 0.000009 grad_norm: 0.7842 (0.7653) closs: 1.0512 (1.0360) time: 4.0653 data: 0.0001 max mem: 54684 -[09:40:46.718752] Epoch: [2] [1530/3229] lr: 0.000009 grad_norm: 0.7452 (0.7650) closs: 1.0411 (1.0359) time: 4.0443 data: 0.0002 max mem: 54684 -[09:41:27.479284] Epoch: [2] [1540/3229] lr: 0.000009 grad_norm: 0.7451 (0.7647) closs: 1.0411 (1.0359) time: 4.0570 data: 0.0002 max mem: 54684 -[09:42:08.444014] Epoch: [2] [1550/3229] lr: 0.000009 grad_norm: 0.7502 (0.7648) closs: 1.0511 (1.0361) time: 4.0862 data: 0.0002 max mem: 54684 -[09:42:49.492626] Epoch: [2] [1560/3229] lr: 0.000009 grad_norm: 0.7626 (0.7648) closs: 1.0354 (1.0359) time: 4.1006 data: 0.0002 max mem: 54684 -[09:43:30.218498] Epoch: [2] [1570/3229] lr: 0.000009 grad_norm: 0.7822 (0.7649) closs: 1.0210 (1.0362) time: 4.0886 data: 0.0002 max mem: 54684 -[09:44:11.014656] Epoch: [2] [1580/3229] lr: 0.000009 grad_norm: 0.7599 (0.7648) closs: 1.0582 (1.0360) time: 4.0760 data: 0.0001 max mem: 54684 -[09:44:52.430295] Epoch: [2] [1590/3229] lr: 0.000009 grad_norm: 0.7556 (0.7649) closs: 1.0126 (1.0362) time: 4.1105 data: 0.0001 max mem: 54684 -[09:45:32.708025] Epoch: [2] [1600/3229] lr: 0.000009 grad_norm: 0.7689 (0.7650) closs: 1.0061 (1.0360) time: 4.0846 data: 0.0002 max mem: 54684 -[09:46:13.311218] Epoch: [2] [1610/3229] lr: 0.000008 grad_norm: 0.7788 (0.7652) closs: 1.0035 (1.0361) time: 4.0440 data: 0.0002 max mem: 54684 -[09:46:55.090230] Epoch: [2] [1620/3229] lr: 0.000008 grad_norm: 0.7534 (0.7651) closs: 1.0395 (1.0361) time: 4.1190 data: 0.0002 max mem: 54684 -[09:47:36.090209] Epoch: [2] [1630/3229] lr: 0.000008 grad_norm: 0.7519 (0.7651) closs: 1.0562 (1.0361) time: 4.1389 data: 0.0002 max mem: 54684 -[09:48:17.317549] Epoch: [2] [1640/3229] lr: 0.000008 grad_norm: 0.7637 (0.7651) closs: 1.0442 (1.0362) time: 4.1113 data: 0.0002 max mem: 54684 -[09:48:58.147416] Epoch: [2] [1650/3229] lr: 0.000008 grad_norm: 0.7460 (0.7650) closs: 1.0442 (1.0361) time: 4.1028 data: 0.0002 max mem: 54684 -[09:49:38.823915] Epoch: [2] [1660/3229] lr: 0.000008 grad_norm: 0.7707 (0.7653) closs: 0.9922 (1.0360) time: 4.0752 data: 0.0002 max mem: 54684 -[09:50:20.328398] Epoch: [2] [1670/3229] lr: 0.000008 grad_norm: 0.7983 (0.7656) closs: 1.0671 (1.0362) time: 4.1090 data: 0.0002 max mem: 54684 -[09:51:01.223284] Epoch: [2] [1680/3229] lr: 0.000008 grad_norm: 0.8044 (0.7656) closs: 1.0319 (1.0360) time: 4.1199 data: 0.0001 max mem: 54684 -[09:51:41.770661] Epoch: [2] [1690/3229] lr: 0.000008 grad_norm: 0.7965 (0.7656) closs: 1.0218 (1.0362) time: 4.0720 data: 0.0002 max mem: 54684 -[09:52:23.631344] Epoch: [2] [1700/3229] lr: 0.000008 grad_norm: 0.7809 (0.7658) closs: 1.0307 (1.0362) time: 4.1203 data: 0.0002 max mem: 54684 -[09:53:03.971107] Epoch: [2] [1710/3229] lr: 0.000008 grad_norm: 0.7531 (0.7655) closs: 1.0542 (1.0362) time: 4.1099 data: 0.0001 max mem: 54684 -[09:53:44.185439] Epoch: [2] [1720/3229] lr: 0.000008 grad_norm: 0.7503 (0.7655) closs: 1.0655 (1.0362) time: 4.0276 data: 0.0002 max mem: 54684 -[09:54:25.528421] Epoch: [2] [1730/3229] lr: 0.000008 grad_norm: 0.7650 (0.7655) closs: 1.0793 (1.0364) time: 4.0778 data: 0.0002 max mem: 54684 -[09:55:06.203734] Epoch: [2] [1740/3229] lr: 0.000008 grad_norm: 0.7650 (0.7657) closs: 1.0568 (1.0362) time: 4.1008 data: 0.0002 max mem: 54684 -[09:55:46.816613] Epoch: [2] [1750/3229] lr: 0.000008 grad_norm: 0.7542 (0.7655) closs: 1.0362 (1.0364) time: 4.0644 data: 0.0002 max mem: 54684 -[09:56:27.712286] Epoch: [2] [1760/3229] lr: 0.000008 grad_norm: 0.7409 (0.7656) closs: 1.0798 (1.0365) time: 4.0754 data: 0.0002 max mem: 54684 -[09:57:09.011822] Epoch: [2] [1770/3229] lr: 0.000008 grad_norm: 0.7625 (0.7657) closs: 1.0369 (1.0366) time: 4.1097 data: 0.0002 max mem: 54684 -[09:57:50.358048] Epoch: [2] [1780/3229] lr: 0.000008 grad_norm: 0.7780 (0.7660) closs: 1.0474 (1.0366) time: 4.1322 data: 0.0002 max mem: 54684 -[09:58:30.935480] Epoch: [2] [1790/3229] lr: 0.000008 grad_norm: 0.7607 (0.7658) closs: 1.0474 (1.0366) time: 4.0961 data: 0.0002 max mem: 54684 -[09:59:12.191909] Epoch: [2] [1800/3229] lr: 0.000008 grad_norm: 0.7704 (0.7660) closs: 1.0390 (1.0366) time: 4.0916 data: 0.0002 max mem: 54684 -[09:59:53.188925] Epoch: [2] [1810/3229] lr: 0.000008 grad_norm: 0.7880 (0.7661) closs: 1.0390 (1.0365) time: 4.1126 data: 0.0002 max mem: 54684 -[10:00:34.288625] Epoch: [2] [1820/3229] lr: 0.000008 grad_norm: 0.8177 (0.7663) closs: 1.0470 (1.0366) time: 4.1048 data: 0.0002 max mem: 54684 -[10:01:15.216189] Epoch: [2] [1830/3229] lr: 0.000008 grad_norm: 0.7464 (0.7662) closs: 1.0408 (1.0365) time: 4.1013 data: 0.0002 max mem: 54684 -[10:01:56.484523] Epoch: [2] [1840/3229] lr: 0.000008 grad_norm: 0.7634 (0.7664) closs: 1.0408 (1.0367) time: 4.1097 data: 0.0002 max mem: 54684 -[10:02:36.820581] Epoch: [2] [1850/3229] lr: 0.000008 grad_norm: 0.7761 (0.7663) closs: 1.0366 (1.0366) time: 4.0802 data: 0.0002 max mem: 54684 -[10:03:17.820103] Epoch: [2] [1860/3229] lr: 0.000007 grad_norm: 0.7713 (0.7665) closs: 1.0248 (1.0365) time: 4.0667 data: 0.0002 max mem: 54684 -[10:03:58.463854] Epoch: [2] [1870/3229] lr: 0.000007 grad_norm: 0.7713 (0.7666) closs: 1.0134 (1.0362) time: 4.0821 data: 0.0002 max mem: 54684 -[10:04:39.335047] Epoch: [2] [1880/3229] lr: 0.000007 grad_norm: 0.7584 (0.7665) closs: 1.0266 (1.0362) time: 4.0757 data: 0.0002 max mem: 54684 -[10:05:20.773148] Epoch: [2] [1890/3229] lr: 0.000007 grad_norm: 0.7505 (0.7666) closs: 1.0527 (1.0364) time: 4.1154 data: 0.0002 max mem: 54684 -[10:06:01.774211] Epoch: [2] [1900/3229] lr: 0.000007 grad_norm: 0.7830 (0.7667) closs: 1.0807 (1.0365) time: 4.1219 data: 0.0002 max mem: 54684 -[10:06:43.015666] Epoch: [2] [1910/3229] lr: 0.000007 grad_norm: 0.7959 (0.7668) closs: 1.0580 (1.0366) time: 4.1121 data: 0.0002 max mem: 54684 -[10:07:22.898606] Epoch: [2] [1920/3229] lr: 0.000007 grad_norm: 0.7359 (0.7665) closs: 1.0269 (1.0364) time: 4.0562 data: 0.0002 max mem: 54684 -[10:08:03.733126] Epoch: [2] [1930/3229] lr: 0.000007 grad_norm: 0.7359 (0.7667) closs: 1.0022 (1.0362) time: 4.0358 data: 0.0002 max mem: 54684 -[10:08:44.009814] Epoch: [2] [1940/3229] lr: 0.000007 grad_norm: 0.7834 (0.7665) closs: 1.0467 (1.0361) time: 4.0555 data: 0.0002 max mem: 54684 -[10:09:24.934480] Epoch: [2] [1950/3229] lr: 0.000007 grad_norm: 0.7408 (0.7665) closs: 1.0467 (1.0360) time: 4.0600 data: 0.0002 max mem: 54684 -[10:10:05.513930] Epoch: [2] [1960/3229] lr: 0.000007 grad_norm: 0.7564 (0.7664) closs: 1.0360 (1.0360) time: 4.0751 data: 0.0002 max mem: 54684 -[10:10:47.318139] Epoch: [2] [1970/3229] lr: 0.000007 grad_norm: 0.7470 (0.7664) closs: 1.0260 (1.0361) time: 4.1191 data: 0.0002 max mem: 54684 -[10:11:28.590511] Epoch: [2] [1980/3229] lr: 0.000007 grad_norm: 0.7601 (0.7667) closs: 1.0797 (1.0363) time: 4.1538 data: 0.0002 max mem: 54684 -[10:12:09.496980] Epoch: [2] [1990/3229] lr: 0.000007 grad_norm: 0.7790 (0.7667) closs: 1.0679 (1.0363) time: 4.1089 data: 0.0002 max mem: 54684 -[10:12:50.748606] Epoch: [2] [2000/3229] lr: 0.000007 grad_norm: 0.7970 (0.7670) closs: 1.0297 (1.0364) time: 4.1078 data: 0.0002 max mem: 54684 -[10:13:32.182612] Epoch: [2] [2010/3229] lr: 0.000007 grad_norm: 0.8054 (0.7671) closs: 1.0496 (1.0365) time: 4.1342 data: 0.0002 max mem: 54684 -[10:14:13.410630] Epoch: [2] [2020/3229] lr: 0.000007 grad_norm: 0.7780 (0.7672) closs: 1.0542 (1.0367) time: 4.1330 data: 0.0002 max mem: 54684 -[10:14:54.361523] Epoch: [2] [2030/3229] lr: 0.000007 grad_norm: 0.8040 (0.7677) closs: 1.0542 (1.0368) time: 4.1089 data: 0.0002 max mem: 54684 -[10:15:35.931648] Epoch: [2] [2040/3229] lr: 0.000007 grad_norm: 0.8409 (0.7681) closs: 1.0486 (1.0369) time: 4.1260 data: 0.0002 max mem: 54684 -[10:16:16.746598] Epoch: [2] [2050/3229] lr: 0.000007 grad_norm: 0.7987 (0.7682) closs: 1.0486 (1.0368) time: 4.1192 data: 0.0002 max mem: 54684 -[10:16:58.012617] Epoch: [2] [2060/3229] lr: 0.000007 grad_norm: 0.8159 (0.7685) closs: 1.0473 (1.0369) time: 4.1040 data: 0.0002 max mem: 54684 -[10:17:38.676913] Epoch: [2] [2070/3229] lr: 0.000007 grad_norm: 0.7728 (0.7682) closs: 1.0073 (1.0366) time: 4.0965 data: 0.0002 max mem: 54684 -[10:18:20.035619] Epoch: [2] [2080/3229] lr: 0.000007 grad_norm: 0.7435 (0.7684) closs: 1.0386 (1.0368) time: 4.1011 data: 0.0002 max mem: 54684 -[10:19:01.109902] Epoch: [2] [2090/3229] lr: 0.000007 grad_norm: 0.7514 (0.7683) closs: 1.0462 (1.0367) time: 4.1216 data: 0.0002 max mem: 54684 -[10:19:42.145840] Epoch: [2] [2100/3229] lr: 0.000007 grad_norm: 0.7514 (0.7683) closs: 1.0461 (1.0368) time: 4.1054 data: 0.0002 max mem: 54684 -[10:20:21.912487] Epoch: [2] [2110/3229] lr: 0.000007 grad_norm: 0.7724 (0.7682) closs: 1.0394 (1.0367) time: 4.0401 data: 0.0002 max mem: 54684 -[10:21:02.609058] Epoch: [2] [2120/3229] lr: 0.000007 grad_norm: 0.7465 (0.7681) closs: 1.0394 (1.0367) time: 4.0231 data: 0.0002 max mem: 54684 -[10:21:43.356036] Epoch: [2] [2130/3229] lr: 0.000007 grad_norm: 0.7602 (0.7681) closs: 1.0324 (1.0366) time: 4.0721 data: 0.0002 max mem: 54684 -[10:22:24.153129] Epoch: [2] [2140/3229] lr: 0.000007 grad_norm: 0.7624 (0.7679) closs: 1.0118 (1.0365) time: 4.0771 data: 0.0002 max mem: 54684 -[10:23:04.987144] Epoch: [2] [2150/3229] lr: 0.000007 grad_norm: 0.7632 (0.7680) closs: 1.0690 (1.0368) time: 4.0815 data: 0.0002 max mem: 54684 -[10:23:46.301006] Epoch: [2] [2160/3229] lr: 0.000007 grad_norm: 0.7924 (0.7682) closs: 1.0757 (1.0370) time: 4.1073 data: 0.0002 max mem: 54684 -[10:24:27.430902] Epoch: [2] [2170/3229] lr: 0.000007 grad_norm: 0.7768 (0.7682) closs: 1.0597 (1.0370) time: 4.1221 data: 0.0002 max mem: 54684 -[10:25:08.536310] Epoch: [2] [2180/3229] lr: 0.000006 grad_norm: 0.7764 (0.7683) closs: 1.0400 (1.0370) time: 4.1117 data: 0.0002 max mem: 54684 -[10:25:49.054789] Epoch: [2] [2190/3229] lr: 0.000006 grad_norm: 0.7764 (0.7683) closs: 1.0429 (1.0370) time: 4.0811 data: 0.0002 max mem: 54684 -[10:26:28.974982] Epoch: [2] [2200/3229] lr: 0.000006 grad_norm: 0.7571 (0.7680) closs: 0.9973 (1.0367) time: 4.0219 data: 0.0002 max mem: 54684 -[10:27:10.250973] Epoch: [2] [2210/3229] lr: 0.000006 grad_norm: 0.7298 (0.7679) closs: 1.0033 (1.0367) time: 4.0597 data: 0.0002 max mem: 54684 -[10:27:51.496280] Epoch: [2] [2220/3229] lr: 0.000006 grad_norm: 0.7623 (0.7681) closs: 1.0618 (1.0369) time: 4.1260 data: 0.0002 max mem: 54684 -[10:28:32.694107] Epoch: [2] [2230/3229] lr: 0.000006 grad_norm: 0.8018 (0.7682) closs: 1.1010 (1.0371) time: 4.1221 data: 0.0002 max mem: 54684 -[10:29:13.690626] Epoch: [2] [2240/3229] lr: 0.000006 grad_norm: 0.7677 (0.7682) closs: 1.0752 (1.0372) time: 4.1097 data: 0.0002 max mem: 54684 -[10:29:55.103277] Epoch: [2] [2250/3229] lr: 0.000006 grad_norm: 0.7532 (0.7683) closs: 1.1006 (1.0373) time: 4.1204 data: 0.0002 max mem: 54684 -[10:30:36.017463] Epoch: [2] [2260/3229] lr: 0.000006 grad_norm: 0.7805 (0.7683) closs: 1.0913 (1.0374) time: 4.1163 data: 0.0002 max mem: 54684 -[10:31:17.232848] Epoch: [2] [2270/3229] lr: 0.000006 grad_norm: 0.7978 (0.7685) closs: 1.0719 (1.0376) time: 4.1064 data: 0.0002 max mem: 54684 -[10:31:57.554662] Epoch: [2] [2280/3229] lr: 0.000006 grad_norm: 0.7693 (0.7683) closs: 1.0401 (1.0375) time: 4.0768 data: 0.0002 max mem: 54684 -[10:32:38.982581] Epoch: [2] [2290/3229] lr: 0.000006 grad_norm: 0.7707 (0.7684) closs: 1.0389 (1.0375) time: 4.0874 data: 0.0002 max mem: 54684 -[10:33:19.900400] Epoch: [2] [2300/3229] lr: 0.000006 grad_norm: 0.7707 (0.7684) closs: 1.0546 (1.0376) time: 4.1172 data: 0.0002 max mem: 54684 -[10:34:00.422956] Epoch: [2] [2310/3229] lr: 0.000006 grad_norm: 0.7404 (0.7683) closs: 1.0432 (1.0375) time: 4.0720 data: 0.0002 max mem: 54684 -[10:34:41.838857] Epoch: [2] [2320/3229] lr: 0.000006 grad_norm: 0.7949 (0.7686) closs: 1.0168 (1.0374) time: 4.0969 data: 0.0002 max mem: 54684 -[10:35:22.205090] Epoch: [2] [2330/3229] lr: 0.000006 grad_norm: 0.7612 (0.7684) closs: 1.0087 (1.0372) time: 4.0890 data: 0.0002 max mem: 54684 -[10:36:03.158561] Epoch: [2] [2340/3229] lr: 0.000006 grad_norm: 0.7306 (0.7685) closs: 1.0268 (1.0373) time: 4.0659 data: 0.0002 max mem: 54684 -[10:36:44.005255] Epoch: [2] [2350/3229] lr: 0.000006 grad_norm: 0.7974 (0.7686) closs: 1.0339 (1.0373) time: 4.0899 data: 0.0002 max mem: 54684 -[10:37:24.060589] Epoch: [2] [2360/3229] lr: 0.000006 grad_norm: 0.7845 (0.7683) closs: 0.9994 (1.0371) time: 4.0450 data: 0.0002 max mem: 54684 -[10:38:04.953222] Epoch: [2] [2370/3229] lr: 0.000006 grad_norm: 0.7452 (0.7684) closs: 0.9928 (1.0371) time: 4.0473 data: 0.0002 max mem: 54684 -[10:38:45.575555] Epoch: [2] [2380/3229] lr: 0.000006 grad_norm: 0.7639 (0.7682) closs: 1.0037 (1.0371) time: 4.0757 data: 0.0002 max mem: 54684 -[10:39:26.779560] Epoch: [2] [2390/3229] lr: 0.000006 grad_norm: 0.7854 (0.7685) closs: 1.0731 (1.0372) time: 4.0913 data: 0.0002 max mem: 54684 -[10:40:07.277159] Epoch: [2] [2400/3229] lr: 0.000006 grad_norm: 0.7837 (0.7683) closs: 1.0746 (1.0371) time: 4.0850 data: 0.0002 max mem: 54684 -[10:40:47.656246] Epoch: [2] [2410/3229] lr: 0.000006 grad_norm: 0.7215 (0.7681) closs: 1.0545 (1.0370) time: 4.0438 data: 0.0002 max mem: 54684 -[10:41:28.261216] Epoch: [2] [2420/3229] lr: 0.000006 grad_norm: 0.7847 (0.7682) closs: 1.0446 (1.0370) time: 4.0491 data: 0.0002 max mem: 54684 -[10:42:09.186454] Epoch: [2] [2430/3229] lr: 0.000006 grad_norm: 0.7847 (0.7682) closs: 1.0480 (1.0370) time: 4.0764 data: 0.0002 max mem: 54684 -[10:42:50.581684] Epoch: [2] [2440/3229] lr: 0.000006 grad_norm: 0.7911 (0.7684) closs: 1.0551 (1.0370) time: 4.1160 data: 0.0002 max mem: 54684 -[10:43:31.660191] Epoch: [2] [2450/3229] lr: 0.000006 grad_norm: 0.7996 (0.7685) closs: 1.0322 (1.0370) time: 4.1236 data: 0.0002 max mem: 54684 -[10:44:12.906840] Epoch: [2] [2460/3229] lr: 0.000006 grad_norm: 0.7953 (0.7685) closs: 1.0322 (1.0370) time: 4.1162 data: 0.0002 max mem: 54684 -[10:44:54.081525] Epoch: [2] [2470/3229] lr: 0.000006 grad_norm: 0.7730 (0.7686) closs: 1.0670 (1.0371) time: 4.1210 data: 0.0002 max mem: 54684 -[10:45:35.195478] Epoch: [2] [2480/3229] lr: 0.000006 grad_norm: 0.7885 (0.7687) closs: 1.0536 (1.0371) time: 4.1144 data: 0.0002 max mem: 54684 -[10:46:16.528666] Epoch: [2] [2490/3229] lr: 0.000006 grad_norm: 0.7880 (0.7687) closs: 1.0536 (1.0373) time: 4.1223 data: 0.0002 max mem: 54684 -[10:46:56.834636] Epoch: [2] [2500/3229] lr: 0.000006 grad_norm: 0.7751 (0.7687) closs: 1.0654 (1.0372) time: 4.0819 data: 0.0002 max mem: 54684 -[10:47:38.101086] Epoch: [2] [2510/3229] lr: 0.000006 grad_norm: 0.7705 (0.7688) closs: 1.0349 (1.0373) time: 4.0786 data: 0.0002 max mem: 54684 -[10:48:18.776789] Epoch: [2] [2520/3229] lr: 0.000006 grad_norm: 0.7878 (0.7688) closs: 1.0416 (1.0373) time: 4.0970 data: 0.0002 max mem: 54684 -[10:48:59.471432] Epoch: [2] [2530/3229] lr: 0.000006 grad_norm: 0.7899 (0.7688) closs: 1.0064 (1.0373) time: 4.0685 data: 0.0002 max mem: 54684 -[10:49:40.706118] Epoch: [2] [2540/3229] lr: 0.000006 grad_norm: 0.7632 (0.7690) closs: 0.9999 (1.0372) time: 4.0964 data: 0.0002 max mem: 54684 -[10:50:21.879580] Epoch: [2] [2550/3229] lr: 0.000006 grad_norm: 0.8102 (0.7692) closs: 1.0065 (1.0372) time: 4.1203 data: 0.0002 max mem: 54684 -[10:51:03.280777] Epoch: [2] [2560/3229] lr: 0.000006 grad_norm: 0.7923 (0.7692) closs: 1.0337 (1.0373) time: 4.1287 data: 0.0002 max mem: 54684 -[10:51:44.513039] Epoch: [2] [2570/3229] lr: 0.000006 grad_norm: 0.7990 (0.7697) closs: 1.0381 (1.0373) time: 4.1316 data: 0.0002 max mem: 54684 -[10:52:25.106616] Epoch: [2] [2580/3229] lr: 0.000006 grad_norm: 0.8055 (0.7697) closs: 1.0194 (1.0372) time: 4.0912 data: 0.0002 max mem: 54684 -[10:53:05.456512] Epoch: [2] [2590/3229] lr: 0.000006 grad_norm: 0.7706 (0.7696) closs: 1.0125 (1.0371) time: 4.0471 data: 0.0002 max mem: 54684 -[10:53:46.140129] Epoch: [2] [2600/3229] lr: 0.000006 grad_norm: 0.7412 (0.7694) closs: 1.0323 (1.0371) time: 4.0516 data: 0.0002 max mem: 54684 -[10:54:27.649177] Epoch: [2] [2610/3229] lr: 0.000006 grad_norm: 0.7625 (0.7696) closs: 1.0566 (1.0371) time: 4.1096 data: 0.0002 max mem: 54684 -[10:55:08.432178] Epoch: [2] [2620/3229] lr: 0.000006 grad_norm: 0.8024 (0.7697) closs: 1.0176 (1.0371) time: 4.1145 data: 0.0002 max mem: 54684 -[10:55:49.037158] Epoch: [2] [2630/3229] lr: 0.000005 grad_norm: 0.7726 (0.7697) closs: 1.0176 (1.0369) time: 4.0693 data: 0.0002 max mem: 54684 -[10:56:30.399477] Epoch: [2] [2640/3229] lr: 0.000005 grad_norm: 0.7389 (0.7696) closs: 1.0279 (1.0371) time: 4.0983 data: 0.0002 max mem: 54684 -[10:57:11.914088] Epoch: [2] [2650/3229] lr: 0.000005 grad_norm: 0.7500 (0.7697) closs: 1.0760 (1.0372) time: 4.1438 data: 0.0002 max mem: 54684 -[10:57:53.051298] Epoch: [2] [2660/3229] lr: 0.000005 grad_norm: 0.7937 (0.7698) closs: 1.0704 (1.0373) time: 4.1325 data: 0.0002 max mem: 54684 -[10:58:34.322783] Epoch: [2] [2670/3229] lr: 0.000005 grad_norm: 0.8010 (0.7699) closs: 1.0704 (1.0374) time: 4.1204 data: 0.0002 max mem: 54684 -[10:59:15.014004] Epoch: [2] [2680/3229] lr: 0.000005 grad_norm: 0.8010 (0.7699) closs: 1.0494 (1.0374) time: 4.0981 data: 0.0002 max mem: 54684 -[10:59:55.439695] Epoch: [2] [2690/3229] lr: 0.000005 grad_norm: 0.7508 (0.7698) closs: 1.0343 (1.0373) time: 4.0558 data: 0.0002 max mem: 54684 -[11:00:36.264761] Epoch: [2] [2700/3229] lr: 0.000005 grad_norm: 0.7522 (0.7699) closs: 1.0379 (1.0374) time: 4.0625 data: 0.0002 max mem: 54684 -[11:01:17.279401] Epoch: [2] [2710/3229] lr: 0.000005 grad_norm: 0.7584 (0.7698) closs: 1.0813 (1.0375) time: 4.0919 data: 0.0002 max mem: 54684 -[11:01:58.168258] Epoch: [2] [2720/3229] lr: 0.000005 grad_norm: 0.7584 (0.7697) closs: 1.0343 (1.0375) time: 4.0951 data: 0.0002 max mem: 54684 -[11:02:39.456366] Epoch: [2] [2730/3229] lr: 0.000005 grad_norm: 0.7529 (0.7697) closs: 1.0343 (1.0375) time: 4.1088 data: 0.0002 max mem: 54684 -[11:03:20.315865] Epoch: [2] [2740/3229] lr: 0.000005 grad_norm: 0.7555 (0.7697) closs: 1.0462 (1.0375) time: 4.1073 data: 0.0002 max mem: 54684 -[11:04:02.063978] Epoch: [2] [2750/3229] lr: 0.000005 grad_norm: 0.7999 (0.7700) closs: 1.0785 (1.0377) time: 4.1303 data: 0.0002 max mem: 54684 -[11:04:42.801143] Epoch: [2] [2760/3229] lr: 0.000005 grad_norm: 0.7942 (0.7699) closs: 1.0842 (1.0378) time: 4.1242 data: 0.0002 max mem: 54684 -[11:05:23.770788] Epoch: [2] [2770/3229] lr: 0.000005 grad_norm: 0.7815 (0.7700) closs: 1.0259 (1.0377) time: 4.0853 data: 0.0002 max mem: 54684 -[11:06:04.580722] Epoch: [2] [2780/3229] lr: 0.000005 grad_norm: 0.7815 (0.7699) closs: 1.0562 (1.0378) time: 4.0889 data: 0.0002 max mem: 54684 -[11:06:45.682994] Epoch: [2] [2790/3229] lr: 0.000005 grad_norm: 0.7723 (0.7699) closs: 1.0843 (1.0379) time: 4.0955 data: 0.0002 max mem: 54684 -[11:07:26.664509] Epoch: [2] [2800/3229] lr: 0.000005 grad_norm: 0.7851 (0.7700) closs: 1.0863 (1.0381) time: 4.1041 data: 0.0002 max mem: 54684 -[11:08:07.660594] Epoch: [2] [2810/3229] lr: 0.000005 grad_norm: 0.7649 (0.7699) closs: 1.0497 (1.0380) time: 4.0988 data: 0.0002 max mem: 54684 -[11:08:48.441366] Epoch: [2] [2820/3229] lr: 0.000005 grad_norm: 0.7366 (0.7698) closs: 1.0300 (1.0380) time: 4.0888 data: 0.0002 max mem: 54684 -[11:09:29.171313] Epoch: [2] [2830/3229] lr: 0.000005 grad_norm: 0.7514 (0.7698) closs: 1.0300 (1.0378) time: 4.0755 data: 0.0002 max mem: 54684 -[11:10:09.985836] Epoch: [2] [2840/3229] lr: 0.000005 grad_norm: 0.7665 (0.7697) closs: 0.9997 (1.0377) time: 4.0772 data: 0.0002 max mem: 54684 -[11:10:51.230047] Epoch: [2] [2850/3229] lr: 0.000005 grad_norm: 0.7750 (0.7698) closs: 1.0333 (1.0378) time: 4.1029 data: 0.0002 max mem: 54684 -[11:11:31.685949] Epoch: [2] [2860/3229] lr: 0.000005 grad_norm: 0.7660 (0.7697) closs: 1.0579 (1.0378) time: 4.0849 data: 0.0002 max mem: 54684 -[11:12:12.974896] Epoch: [2] [2870/3229] lr: 0.000005 grad_norm: 0.7565 (0.7698) closs: 1.0466 (1.0378) time: 4.0872 data: 0.0002 max mem: 54684 -[11:12:54.373299] Epoch: [2] [2880/3229] lr: 0.000005 grad_norm: 0.7817 (0.7699) closs: 1.0407 (1.0379) time: 4.1343 data: 0.0002 max mem: 54684 -[11:13:35.283572] Epoch: [2] [2890/3229] lr: 0.000005 grad_norm: 0.7849 (0.7698) closs: 1.0329 (1.0377) time: 4.1154 data: 0.0002 max mem: 54684 -[11:14:16.118014] Epoch: [2] [2900/3229] lr: 0.000005 grad_norm: 0.7950 (0.7700) closs: 1.0221 (1.0376) time: 4.0872 data: 0.0002 max mem: 54684 -[11:14:57.161039] Epoch: [2] [2910/3229] lr: 0.000005 grad_norm: 0.8078 (0.7701) closs: 1.0416 (1.0376) time: 4.0938 data: 0.0002 max mem: 54684 -[11:15:38.521781] Epoch: [2] [2920/3229] lr: 0.000005 grad_norm: 0.7894 (0.7701) closs: 1.0502 (1.0377) time: 4.1201 data: 0.0002 max mem: 54684 -[11:16:19.106288] Epoch: [2] [2930/3229] lr: 0.000005 grad_norm: 0.7824 (0.7701) closs: 1.0595 (1.0377) time: 4.0972 data: 0.0002 max mem: 54684 -[11:17:00.252881] Epoch: [2] [2940/3229] lr: 0.000005 grad_norm: 0.7871 (0.7701) closs: 1.0469 (1.0376) time: 4.0865 data: 0.0002 max mem: 54684 -[11:17:40.642013] Epoch: [2] [2950/3229] lr: 0.000005 grad_norm: 0.7919 (0.7701) closs: 1.0446 (1.0375) time: 4.0767 data: 0.0002 max mem: 54684 -[11:18:20.871868] Epoch: [2] [2960/3229] lr: 0.000005 grad_norm: 0.7919 (0.7701) closs: 0.9901 (1.0374) time: 4.0309 data: 0.0002 max mem: 54684 -[11:19:01.539668] Epoch: [2] [2970/3229] lr: 0.000005 grad_norm: 0.8446 (0.7702) closs: 1.0398 (1.0373) time: 4.0448 data: 0.0002 max mem: 54684 -[11:19:42.663674] Epoch: [2] [2980/3229] lr: 0.000005 grad_norm: 0.7707 (0.7702) closs: 1.0450 (1.0374) time: 4.0895 data: 0.0002 max mem: 54684 -[11:20:23.730795] Epoch: [2] [2990/3229] lr: 0.000005 grad_norm: 0.7557 (0.7702) closs: 1.0745 (1.0375) time: 4.1095 data: 0.0002 max mem: 54684 -[11:21:04.740958] Epoch: [2] [3000/3229] lr: 0.000005 grad_norm: 0.7990 (0.7704) closs: 1.0864 (1.0375) time: 4.1038 data: 0.0002 max mem: 54684 -[11:21:45.020285] Epoch: [2] [3010/3229] lr: 0.000005 grad_norm: 0.8011 (0.7705) closs: 0.9966 (1.0374) time: 4.0644 data: 0.0002 max mem: 54684 -[11:22:25.881577] Epoch: [2] [3020/3229] lr: 0.000005 grad_norm: 0.7592 (0.7705) closs: 1.0149 (1.0375) time: 4.0570 data: 0.0002 max mem: 54684 -[11:23:07.305216] Epoch: [2] [3030/3229] lr: 0.000005 grad_norm: 0.7607 (0.7706) closs: 1.0597 (1.0376) time: 4.1142 data: 0.0002 max mem: 54684 -[11:23:48.348449] Epoch: [2] [3040/3229] lr: 0.000005 grad_norm: 0.7679 (0.7705) closs: 1.0597 (1.0375) time: 4.1233 data: 0.0002 max mem: 54684 -[11:24:29.596151] Epoch: [2] [3050/3229] lr: 0.000005 grad_norm: 0.7677 (0.7705) closs: 1.0172 (1.0375) time: 4.1145 data: 0.0002 max mem: 54684 -[11:25:10.442041] Epoch: [2] [3060/3229] lr: 0.000005 grad_norm: 0.7748 (0.7706) closs: 1.0172 (1.0375) time: 4.1046 data: 0.0002 max mem: 54684 -[11:25:51.521912] Epoch: [2] [3070/3229] lr: 0.000005 grad_norm: 0.8210 (0.7707) closs: 1.0409 (1.0374) time: 4.0962 data: 0.0002 max mem: 54684 -[11:26:32.713656] Epoch: [2] [3080/3229] lr: 0.000005 grad_norm: 0.7547 (0.7706) closs: 1.0294 (1.0373) time: 4.1135 data: 0.0002 max mem: 54684 -[11:27:13.511213] Epoch: [2] [3090/3229] lr: 0.000005 grad_norm: 0.7538 (0.7706) closs: 1.0294 (1.0374) time: 4.0994 data: 0.0002 max mem: 54684 -[11:27:54.680369] Epoch: [2] [3100/3229] lr: 0.000005 grad_norm: 0.7631 (0.7707) closs: 1.0731 (1.0376) time: 4.0983 data: 0.0002 max mem: 54684 -[11:28:35.049341] Epoch: [2] [3110/3229] lr: 0.000005 grad_norm: 0.7713 (0.7705) closs: 1.0357 (1.0373) time: 4.0768 data: 0.0002 max mem: 54684 -[11:29:15.418890] Epoch: [2] [3120/3229] lr: 0.000005 grad_norm: 0.7078 (0.7705) closs: 1.0218 (1.0374) time: 4.0369 data: 0.0002 max mem: 54684 -[11:29:56.061537] Epoch: [2] [3130/3229] lr: 0.000005 grad_norm: 0.7244 (0.7703) closs: 1.0440 (1.0374) time: 4.0505 data: 0.0002 max mem: 54684 -[11:30:36.293253] Epoch: [2] [3140/3229] lr: 0.000005 grad_norm: 0.7669 (0.7705) closs: 1.0339 (1.0373) time: 4.0437 data: 0.0002 max mem: 54684 -[11:31:16.325807] Epoch: [2] [3150/3229] lr: 0.000005 grad_norm: 0.7848 (0.7703) closs: 1.0084 (1.0372) time: 4.0131 data: 0.0004 max mem: 54684 -[11:31:57.799087] Epoch: [2] [3160/3229] lr: 0.000005 grad_norm: 0.7618 (0.7704) closs: 1.0084 (1.0372) time: 4.0752 data: 0.0003 max mem: 54684 -[11:32:38.264034] Epoch: [2] [3170/3229] lr: 0.000005 grad_norm: 0.7903 (0.7705) closs: 1.0556 (1.0371) time: 4.0968 data: 0.0002 max mem: 54684 -[11:33:18.885238] Epoch: [2] [3180/3229] lr: 0.000005 grad_norm: 0.7486 (0.7705) closs: 1.0379 (1.0372) time: 4.0542 data: 0.0002 max mem: 54684 -[11:33:59.955564] Epoch: [2] [3190/3229] lr: 0.000005 grad_norm: 0.7458 (0.7706) closs: 1.0379 (1.0372) time: 4.0845 data: 0.0002 max mem: 54684 -[11:34:41.295405] Epoch: [2] [3200/3229] lr: 0.000005 grad_norm: 0.7953 (0.7707) closs: 1.0546 (1.0373) time: 4.1204 data: 0.0002 max mem: 54684 -[11:35:22.419516] Epoch: [2] [3210/3229] lr: 0.000005 grad_norm: 0.7931 (0.7707) closs: 1.0572 (1.0373) time: 4.1231 data: 0.0002 max mem: 54684 -[11:36:02.526362] Epoch: [2] [3220/3229] lr: 0.000005 grad_norm: 0.7683 (0.7707) closs: 1.0178 (1.0372) time: 4.0615 data: 0.0001 max mem: 54684 -[11:36:35.973354] Epoch: [2] Total time: 3:40:17 -[11:36:35.974297] Averaged stats: lr: 0.000005 grad_norm: 0.7420 (0.7708) closs: 1.0624 (1.0381) -[11:36:36.337101] model saved -[11:36:38.023619] optimizer saved -[11:36:38.024212] other rank-common saved -[11:36:38.029179] rank-specific saved -[11:36:38.029388] Training time 11:00:31 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.model.pth deleted file mode 100644 index e6ab1a9f90df742fbbeaaff57f991959b09536a9..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5c2bc3748c8253570b7f39e83b85b98efc58dcc9d578c70d964eff23c0758e79 -size 90952079 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 873c2d5e2a0e5d11184ce80750cb6df975ae3062..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:47c860eebadd34a2340393d7937edd7b8f916dc12b6cae7486098680d99b71ca -size 204403795 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.other.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.other.pth deleted file mode 100644 index 669cd774b329a01f638aeb2c9f895a064804cad8..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:11832166d9d4ff336967be7a3443fe621f59525b7288833d8f702a8055db6a4b -size 1815 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00000-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00001-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00002-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00003-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00004-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00005-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00006-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00007-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch0/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.model.pth deleted file mode 100644 index f4fa7f11285853eb02a4d8cde1c2b03fa67d7f60..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c4abed9d7d9977796a1f57677a44751bf53987df68bcd07387f76938dd12242b -size 90952079 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 47050520ca1f588b641aae8a1fad6667ce2e3d52..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d849554de5927f9602e5c338b3f5129ede71c30c49491ce3c80a38db97b97b6a -size 204403795 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.other.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.other.pth deleted file mode 100644 index 9a72b8f0c8ae208a697aa2072777c23dfac1e589..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:95adb9a94ac0f196f54dab790f14bfc69fd17b28e0ac05fc63d5330038b53d1f -size 1815 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00000-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00001-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00002-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00003-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00004-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00005-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00006-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00007-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch1/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.model.pth deleted file mode 100644 index 79297a203c019abd4e9dee80083eaa1bd809ddec..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2ef65a1c31775b30f1ca6689ee1775bc46797f24774e3ba5f6c519191b300041 -size 90952079 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 99234b7cc0f09d9206b5cfdf92cf59852bd365e5..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f027a5cb40b85d2ad50263a5170c8cf5124b3298b2d9f3f2881679345b27be5b -size 204403795 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.other.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.other.pth deleted file mode 100644 index eff1ae059eabc81c95811daac0c373442a99a167..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66313368fa7978c2341a231255af3cd5880b57c79fe18bd08e682f33a319765b -size 1815 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00000-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00001-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00002-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00003-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00004-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00005-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00006-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00007-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/epoch2/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/log.txt b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/log.txt deleted file mode 100644 index 250ab85f95ff9f0eae9e25c9b73f8a650ff8dd69..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/log.txt +++ /dev/null @@ -1,3 +0,0 @@ -{"train_lr": 8.508628427162843e-05, "train_grad_norm": 0.7172298170107, "train_closs": 1.0847456962903907, "epoch": 0, "val_lr": 8.508628427162843e-05, "val_grad_norm": 0.7172298170107, "val_closs": 1.0847456962903907} -{"train_lr": 5.755148397302138e-05, "train_grad_norm": 0.5609518145953734, "train_closs": 1.0388896298569104, "epoch": 1, "val_lr": 5.755148397302138e-05, "val_grad_norm": 0.5609518145953734, "val_closs": 1.0388896298569104} -{"train_lr": 1.4361453685455227e-05, "train_grad_norm": 0.5708354987866298, "train_closs": 1.0262249000958887, "epoch": 2, "val_lr": 1.4361453685455227e-05, "val_grad_norm": 0.5708354987866298, "val_closs": 1.0262249000958887} diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/output.log b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/output.log deleted file mode 100644 index d5e339d7f62f18caf95300266bf0613e86bc4fbd..0000000000000000000000000000000000000000 --- a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4/output.log +++ /dev/null @@ -1,4738 +0,0 @@ -WARNING:torch.distributed.run: -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -| distributed init (rank 5): env://, gpu 5 -| distributed init (rank 4): env://, gpu 4 -| distributed init (rank 7): env://, gpu 7 -| distributed init (rank 1): env://, gpu 1 -| distributed init (rank 6): env://, gpu 6 -| distributed init (rank 2): env://, gpu 2 -| distributed init (rank 3): env://, gpu 3 -| distributed init (rank 0): env://, gpu 0 -[22:49:15.994849] > initializing model parallel with size 1 -[22:49:15.994911] > initializing ddp with size 8 -[22:49:15.994917] > initializing pipeline with size 1 -[22:49:16.163185] job dir: /data/liuyijiang/mmlab/LLaMA2-Accessory/accessory -[22:49:16.163260] Namespace(batch_size=16, -accum_iter=1, -llama_type='llama_qformerv2_peft', -llama_config=['../checkpoints/llama2/Llama-2-13b/params.json', -'configs/model/finetune/sg/llamaPeft_normBiasLora.json'], -no_visual=False, -tokenizer_path='../checkpoints/llama2/Llama-2-13b/tokenizer.model', -pretrained_path='../checkpoints/mm/lamaQformerv2_13b/finetuned/', -pretrained_type='consolidated', -weight_decay=0.02, -lr=0.0001, -min_lr=5e-06, -epochs=3, -warmup_epochs=0.2, -clip_grad=2, -max_words=512, -dialog=False, -data_config='configs/data/finetune/mm/alpaca_llava.yaml', -output_dir='output/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B_lr1e-4', -log_dir='./output_dir', -save_interval=1, -only_save_trainable=True, -device='cuda', -seed=0, -resume='', -num_workers=16, -pin_mem=True, -world_size=8, -local_rank=-1, -dist_on_itp=False, -dist_url='env://', -model_parallel_size=1, -data_parallel='sdp', -precision='bf16', -checkpointing=True, -quant=True, -rank=0, -gpu=0, -distributed=True, -dist_backend='nccl') -[22:49:16.163895] Start initialization. -[22:49:16.163930] ## Processing on RANK 0. -[22:49:16.173919] Model Args: - ModelArgs(dim=5120, n_layers=40, n_heads=40, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, rope_scaling=None, lora_rank=16, bias_tuning=True) -[22:50:48.722048] build llama model with qformerv2 - Loading checkpoint shards: 0%| | 0/2 [00:00 -[23:22:27.864578] Start training for 3 epochs -[23:22:27.878956] log_dir: ./output_dir -[23:22:43.639857] Epoch: [0] [0/3229] lr: 0.000000 grad_norm: 2.3643 (2.3643) closs: 1.5947 (1.5947) time: 15.7601 data: 8.9894 max mem: 36209 -[23:23:23.795186] Epoch: [0] [10/3229] lr: 0.000002 grad_norm: 2.2598 (2.1838) closs: 1.3996 (1.3615) time: 5.0832 data: 0.8175 max mem: 54683 -[23:24:04.512812] Epoch: [0] [20/3229] lr: 0.000003 grad_norm: 2.2598 (2.2052) closs: 1.4486 (1.4404) time: 4.0436 data: 0.0002 max mem: 54683 -[23:24:44.625235] Epoch: [0] [30/3229] lr: 0.000005 grad_norm: 2.2136 (2.1731) closs: 1.4720 (1.4210) time: 4.0414 data: 0.0002 max mem: 54683 -[23:25:25.084742] Epoch: [0] [40/3229] lr: 0.000006 grad_norm: 2.0006 (2.0774) closs: 1.3830 (1.3968) time: 4.0285 data: 0.0002 max mem: 54683 -[23:26:06.204362] Epoch: [0] [50/3229] lr: 0.000008 grad_norm: 1.6879 (1.9920) closs: 1.3858 (1.4054) time: 4.0789 data: 0.0002 max mem: 54683 -[23:26:46.695153] Epoch: [0] [60/3229] lr: 0.000009 grad_norm: 1.5039 (1.8916) closs: 1.3927 (1.3882) time: 4.0804 data: 0.0002 max mem: 54683 -[23:27:27.187491] Epoch: [0] [70/3229] lr: 0.000011 grad_norm: 1.2135 (1.7928) closs: 1.2954 (1.3740) time: 4.0491 data: 0.0002 max mem: 54683 -[23:28:08.005236] Epoch: [0] [80/3229] lr: 0.000012 grad_norm: 1.0767 (1.7056) closs: 1.2613 (1.3602) time: 4.0654 data: 0.0002 max mem: 54683 -[23:28:48.149547] Epoch: [0] [90/3229] lr: 0.000014 grad_norm: 0.9057 (1.6181) closs: 1.2135 (1.3429) time: 4.0480 data: 0.0002 max mem: 54683 -[23:29:28.932772] Epoch: [0] [100/3229] lr: 0.000015 grad_norm: 0.8791 (1.5484) closs: 1.1981 (1.3292) time: 4.0463 data: 0.0002 max mem: 54683 -[23:30:09.405095] Epoch: [0] [110/3229] lr: 0.000017 grad_norm: 0.8511 (1.4835) closs: 1.2455 (1.3201) time: 4.0627 data: 0.0002 max mem: 54683 -[23:30:50.534293] Epoch: [0] [120/3229] lr: 0.000019 grad_norm: 0.8250 (1.4295) closs: 1.2488 (1.3134) time: 4.0800 data: 0.0002 max mem: 54683 -[23:31:30.701816] Epoch: [0] [130/3229] lr: 0.000020 grad_norm: 0.8259 (1.3840) closs: 1.2241 (1.2992) time: 4.0648 data: 0.0002 max mem: 54683 -[23:32:11.513620] Epoch: [0] [140/3229] lr: 0.000022 grad_norm: 0.8054 (1.3437) closs: 1.1745 (1.2890) time: 4.0489 data: 0.0002 max mem: 54683 -[23:32:52.318143] Epoch: [0] [150/3229] lr: 0.000023 grad_norm: 0.8054 (1.3094) closs: 1.1397 (1.2791) time: 4.0807 data: 0.0002 max mem: 54683 -[23:33:33.496187] Epoch: [0] [160/3229] lr: 0.000025 grad_norm: 0.7959 (1.2774) closs: 1.1462 (1.2732) time: 4.0991 data: 0.0002 max mem: 54683 -[23:34:14.627260] Epoch: [0] [170/3229] lr: 0.000026 grad_norm: 0.8027 (1.2542) closs: 1.1510 (1.2673) time: 4.1154 data: 0.0002 max mem: 54683 -[23:34:55.768947] Epoch: [0] [180/3229] lr: 0.000028 grad_norm: 0.8896 (1.2328) closs: 1.1858 (1.2634) time: 4.1136 data: 0.0002 max mem: 54683 -[23:35:36.915854] Epoch: [0] [190/3229] lr: 0.000029 grad_norm: 0.8623 (1.2141) closs: 1.2039 (1.2594) time: 4.1144 data: 0.0002 max mem: 54683 -[23:36:18.384153] Epoch: [0] [200/3229] lr: 0.000031 grad_norm: 0.8326 (1.1966) closs: 1.1849 (1.2539) time: 4.1307 data: 0.0002 max mem: 54683 -[23:36:59.200348] Epoch: [0] [210/3229] lr: 0.000033 grad_norm: 0.8326 (1.1812) closs: 1.1680 (1.2500) time: 4.1141 data: 0.0002 max mem: 54684 -[23:37:39.707942] Epoch: [0] [220/3229] lr: 0.000034 grad_norm: 0.8574 (1.1681) closs: 1.1718 (1.2469) time: 4.0661 data: 0.0002 max mem: 54684 -[23:38:20.825766] Epoch: [0] [230/3229] lr: 0.000036 grad_norm: 0.8711 (1.1558) closs: 1.1853 (1.2439) time: 4.0812 data: 0.0002 max mem: 54684 -[23:39:01.980002] Epoch: [0] [240/3229] lr: 0.000037 grad_norm: 0.8711 (1.1440) closs: 1.1805 (1.2402) time: 4.1135 data: 0.0002 max mem: 54684 -[23:39:42.136851] Epoch: [0] [250/3229] lr: 0.000039 grad_norm: 0.8573 (1.1322) closs: 1.1219 (1.2355) time: 4.0655 data: 0.0002 max mem: 54684 -[23:40:21.973104] Epoch: [0] [260/3229] lr: 0.000040 grad_norm: 0.8358 (1.1208) closs: 1.1161 (1.2306) time: 3.9996 data: 0.0002 max mem: 54684 -[23:41:02.441963] Epoch: [0] [270/3229] lr: 0.000042 grad_norm: 0.8472 (1.1130) closs: 1.1082 (1.2250) time: 4.0152 data: 0.0002 max mem: 54684 -[23:41:43.505784] Epoch: [0] [280/3229] lr: 0.000043 grad_norm: 0.8502 (1.1037) closs: 1.1082 (1.2214) time: 4.0766 data: 0.0002 max mem: 54684 -[23:42:23.986322] Epoch: [0] [290/3229] lr: 0.000045 grad_norm: 0.8341 (1.0958) closs: 1.1663 (1.2195) time: 4.0771 data: 0.0002 max mem: 54684 -[23:43:04.792000] Epoch: [0] [300/3229] lr: 0.000046 grad_norm: 0.9007 (1.0917) closs: 1.1762 (1.2175) time: 4.0642 data: 0.0002 max mem: 54684 -[23:43:44.938269] Epoch: [0] [310/3229] lr: 0.000048 grad_norm: 0.8587 (1.0835) closs: 1.1480 (1.2142) time: 4.0475 data: 0.0002 max mem: 54684 -[23:44:24.933818] Epoch: [0] [320/3229] lr: 0.000050 grad_norm: 0.8564 (1.0763) closs: 1.1218 (1.2100) time: 4.0070 data: 0.0002 max mem: 54684 -[23:45:05.719682] Epoch: [0] [330/3229] lr: 0.000051 grad_norm: 0.8780 (1.0704) closs: 1.1218 (1.2077) time: 4.0390 data: 0.0002 max mem: 54684 -[23:45:46.851537] Epoch: [0] [340/3229] lr: 0.000053 grad_norm: 0.8750 (1.0652) closs: 1.1309 (1.2051) time: 4.0958 data: 0.0002 max mem: 54684 -[23:46:27.990122] Epoch: [0] [350/3229] lr: 0.000054 grad_norm: 0.8628 (1.0599) closs: 1.1119 (1.2035) time: 4.1134 data: 0.0002 max mem: 54684 -[23:47:08.873601] Epoch: [0] [360/3229] lr: 0.000056 grad_norm: 0.8519 (1.0544) closs: 1.1044 (1.2005) time: 4.1010 data: 0.0002 max mem: 54684 -[23:47:49.692722] Epoch: [0] [370/3229] lr: 0.000057 grad_norm: 0.8766 (1.0503) closs: 1.1115 (1.1989) time: 4.0851 data: 0.0002 max mem: 54684 -[23:48:30.164462] Epoch: [0] [380/3229] lr: 0.000059 grad_norm: 0.8993 (1.0490) closs: 1.1255 (1.1965) time: 4.0645 data: 0.0002 max mem: 54684 -[23:49:10.664172] Epoch: [0] [390/3229] lr: 0.000060 grad_norm: 0.8927 (1.0446) closs: 1.1042 (1.1944) time: 4.0485 data: 0.0002 max mem: 54684 -[23:49:51.854914] Epoch: [0] [400/3229] lr: 0.000062 grad_norm: 0.8927 (1.0418) closs: 1.1041 (1.1918) time: 4.0844 data: 0.0002 max mem: 54684 -[23:50:32.019553] Epoch: [0] [410/3229] lr: 0.000063 grad_norm: 0.8438 (1.0362) closs: 1.0953 (1.1888) time: 4.0677 data: 0.0002 max mem: 54684 -[23:51:13.159641] Epoch: [0] [420/3229] lr: 0.000065 grad_norm: 0.8438 (1.0345) closs: 1.0719 (1.1871) time: 4.0652 data: 0.0002 max mem: 54684 -[23:51:53.971366] Epoch: [0] [430/3229] lr: 0.000067 grad_norm: 0.8894 (1.0317) closs: 1.0982 (1.1850) time: 4.0975 data: 0.0003 max mem: 54684 -[23:52:34.857552] Epoch: [0] [440/3229] lr: 0.000068 grad_norm: 0.8894 (1.0279) closs: 1.1149 (1.1830) time: 4.0848 data: 0.0003 max mem: 54684 -[23:53:15.996672] Epoch: [0] [450/3229] lr: 0.000070 grad_norm: 0.8831 (1.0249) closs: 1.1271 (1.1822) time: 4.1012 data: 0.0002 max mem: 54684 -[23:53:56.818198] Epoch: [0] [460/3229] lr: 0.000071 grad_norm: 0.8629 (1.0207) closs: 1.1466 (1.1812) time: 4.0980 data: 0.0002 max mem: 54684 -[23:54:36.960756] Epoch: [0] [470/3229] lr: 0.000073 grad_norm: 0.8488 (1.0179) closs: 1.1201 (1.1796) time: 4.0481 data: 0.0002 max mem: 54684 -[23:55:18.098024] Epoch: [0] [480/3229] lr: 0.000074 grad_norm: 0.8674 (1.0151) closs: 1.0820 (1.1774) time: 4.0639 data: 0.0002 max mem: 54684 -[23:55:58.929103] Epoch: [0] [490/3229] lr: 0.000076 grad_norm: 0.8972 (1.0126) closs: 1.0761 (1.1759) time: 4.0983 data: 0.0002 max mem: 54684 -[23:56:39.402425] Epoch: [0] [500/3229] lr: 0.000077 grad_norm: 0.8575 (1.0095) closs: 1.1150 (1.1745) time: 4.0651 data: 0.0002 max mem: 54684 -[23:57:20.334121] Epoch: [0] [510/3229] lr: 0.000079 grad_norm: 0.8890 (1.0078) closs: 1.1150 (1.1731) time: 4.0702 data: 0.0002 max mem: 54684 -[23:58:01.372021] Epoch: [0] [520/3229] lr: 0.000081 grad_norm: 0.8890 (1.0051) closs: 1.1070 (1.1710) time: 4.0984 data: 0.0002 max mem: 54684 -[23:58:42.144275] Epoch: [0] [530/3229] lr: 0.000082 grad_norm: 0.8428 (1.0018) closs: 1.1070 (1.1706) time: 4.0904 data: 0.0002 max mem: 54684 -[23:59:22.926638] Epoch: [0] [540/3229] lr: 0.000084 grad_norm: 0.8212 (0.9987) closs: 1.1551 (1.1702) time: 4.0777 data: 0.0002 max mem: 54684 -[00:00:03.149794] Epoch: [0] [550/3229] lr: 0.000085 grad_norm: 0.8337 (0.9958) closs: 1.1042 (1.1677) time: 4.0502 data: 0.0002 max mem: 54684 -[00:00:44.238609] Epoch: [0] [560/3229] lr: 0.000087 grad_norm: 0.8550 (0.9929) closs: 1.0702 (1.1667) time: 4.0655 data: 0.0002 max mem: 54684 -[00:01:24.695417] Epoch: [0] [570/3229] lr: 0.000088 grad_norm: 0.8658 (0.9915) closs: 1.1175 (1.1665) time: 4.0772 data: 0.0002 max mem: 54684 -[00:02:05.811689] Epoch: [0] [580/3229] lr: 0.000090 grad_norm: 0.8485 (0.9889) closs: 1.1632 (1.1661) time: 4.0786 data: 0.0002 max mem: 54684 -[00:02:46.823399] Epoch: [0] [590/3229] lr: 0.000091 grad_norm: 0.8317 (0.9862) closs: 1.1259 (1.1654) time: 4.1063 data: 0.0002 max mem: 54684 -[00:03:26.797835] Epoch: [0] [600/3229] lr: 0.000093 grad_norm: 0.7965 (0.9830) closs: 1.1026 (1.1643) time: 4.0492 data: 0.0002 max mem: 54684 -[00:04:07.579881] Epoch: [0] [610/3229] lr: 0.000094 grad_norm: 0.7848 (0.9798) closs: 1.1218 (1.1633) time: 4.0378 data: 0.0002 max mem: 54684 -[00:04:48.700597] Epoch: [0] [620/3229] lr: 0.000096 grad_norm: 0.8221 (0.9776) closs: 1.1324 (1.1630) time: 4.0951 data: 0.0002 max mem: 54684 -[00:05:29.642483] Epoch: [0] [630/3229] lr: 0.000098 grad_norm: 0.8110 (0.9744) closs: 1.1305 (1.1621) time: 4.1031 data: 0.0002 max mem: 54684 -[00:06:10.262537] Epoch: [0] [640/3229] lr: 0.000099 grad_norm: 0.8096 (0.9728) closs: 1.1009 (1.1604) time: 4.0780 data: 0.0002 max mem: 54684 -[00:06:50.715955] Epoch: [0] [650/3229] lr: 0.000100 grad_norm: 0.8278 (0.9702) closs: 1.0615 (1.1584) time: 4.0536 data: 0.0002 max mem: 54684 -[00:07:31.507808] Epoch: [0] [660/3229] lr: 0.000100 grad_norm: 0.7748 (0.9673) closs: 1.0615 (1.1577) time: 4.0622 data: 0.0002 max mem: 54684 -[00:08:12.189092] Epoch: [0] [670/3229] lr: 0.000100 grad_norm: 0.7953 (0.9656) closs: 1.1140 (1.1564) time: 4.0736 data: 0.0002 max mem: 54684 -[00:08:53.457197] Epoch: [0] [680/3229] lr: 0.000100 grad_norm: 0.8219 (0.9638) closs: 1.1346 (1.1563) time: 4.0974 data: 0.0002 max mem: 54684 -[00:09:34.230467] Epoch: [0] [690/3229] lr: 0.000100 grad_norm: 0.8043 (0.9615) closs: 1.1368 (1.1555) time: 4.1020 data: 0.0002 max mem: 54684 -[00:10:14.702838] Epoch: [0] [700/3229] lr: 0.000100 grad_norm: 0.7837 (0.9586) closs: 1.1245 (1.1542) time: 4.0622 data: 0.0002 max mem: 54684 -[00:10:55.922485] Epoch: [0] [710/3229] lr: 0.000100 grad_norm: 0.7837 (0.9568) closs: 1.1252 (1.1540) time: 4.0845 data: 0.0002 max mem: 54684 -[00:11:36.370423] Epoch: [0] [720/3229] lr: 0.000100 grad_norm: 0.7956 (0.9543) closs: 1.1058 (1.1526) time: 4.0833 data: 0.0002 max mem: 54684 -[00:12:17.473440] Epoch: [0] [730/3229] lr: 0.000100 grad_norm: 0.8050 (0.9522) closs: 1.0626 (1.1514) time: 4.0775 data: 0.0002 max mem: 54684 -[00:12:57.932552] Epoch: [0] [740/3229] lr: 0.000100 grad_norm: 0.7773 (0.9494) closs: 1.0728 (1.1505) time: 4.0780 data: 0.0002 max mem: 54684 -[00:13:39.080143] Epoch: [0] [750/3229] lr: 0.000100 grad_norm: 0.7764 (0.9473) closs: 1.0979 (1.1498) time: 4.0803 data: 0.0002 max mem: 54684 -[00:14:19.877475] Epoch: [0] [760/3229] lr: 0.000100 grad_norm: 0.7764 (0.9447) closs: 1.1119 (1.1494) time: 4.0972 data: 0.0002 max mem: 54684 -[00:15:00.672644] Epoch: [0] [770/3229] lr: 0.000100 grad_norm: 0.7562 (0.9422) closs: 1.1187 (1.1483) time: 4.0796 data: 0.0002 max mem: 54684 -[00:15:41.143591] Epoch: [0] [780/3229] lr: 0.000100 grad_norm: 0.7531 (0.9400) closs: 1.0727 (1.1473) time: 4.0632 data: 0.0002 max mem: 54684 -[00:16:21.920284] Epoch: [0] [790/3229] lr: 0.000100 grad_norm: 0.7644 (0.9379) closs: 1.0704 (1.1460) time: 4.0623 data: 0.0002 max mem: 54684 -[00:17:02.702494] Epoch: [0] [800/3229] lr: 0.000100 grad_norm: 0.7653 (0.9362) closs: 1.0833 (1.1455) time: 4.0779 data: 0.0002 max mem: 54684 -[00:17:43.158066] Epoch: [0] [810/3229] lr: 0.000100 grad_norm: 0.7914 (0.9341) closs: 1.0861 (1.1447) time: 4.0618 data: 0.0002 max mem: 54684 -[00:18:23.620826] Epoch: [0] [820/3229] lr: 0.000100 grad_norm: 0.7539 (0.9321) closs: 1.0861 (1.1438) time: 4.0459 data: 0.0002 max mem: 54684 -[00:19:04.787142] Epoch: [0] [830/3229] lr: 0.000100 grad_norm: 0.7176 (0.9296) closs: 1.0843 (1.1434) time: 4.0814 data: 0.0002 max mem: 54684 -[00:19:45.891691] Epoch: [0] [840/3229] lr: 0.000100 grad_norm: 0.7310 (0.9275) closs: 1.0950 (1.1429) time: 4.1135 data: 0.0002 max mem: 54684 -[00:20:26.345278] Epoch: [0] [850/3229] lr: 0.000100 grad_norm: 0.7597 (0.9258) closs: 1.0950 (1.1420) time: 4.0778 data: 0.0002 max mem: 54684 -[00:21:06.501382] Epoch: [0] [860/3229] lr: 0.000100 grad_norm: 0.7321 (0.9235) closs: 1.0948 (1.1410) time: 4.0304 data: 0.0002 max mem: 54684 -[00:21:48.059783] Epoch: [0] [870/3229] lr: 0.000100 grad_norm: 0.7269 (0.9213) closs: 1.0870 (1.1405) time: 4.0857 data: 0.0002 max mem: 54684 -[00:22:29.175074] Epoch: [0] [880/3229] lr: 0.000100 grad_norm: 0.7318 (0.9197) closs: 1.0882 (1.1403) time: 4.1336 data: 0.0002 max mem: 54684 -[00:23:10.331842] Epoch: [0] [890/3229] lr: 0.000100 grad_norm: 0.7552 (0.9178) closs: 1.1258 (1.1399) time: 4.1135 data: 0.0002 max mem: 54684 -[00:23:51.116622] Epoch: [0] [900/3229] lr: 0.000100 grad_norm: 0.7552 (0.9160) closs: 1.1166 (1.1395) time: 4.0970 data: 0.0002 max mem: 54684 -[00:24:32.291626] Epoch: [0] [910/3229] lr: 0.000100 grad_norm: 0.7255 (0.9140) closs: 1.1076 (1.1388) time: 4.0979 data: 0.0002 max mem: 54684 -[00:25:12.749474] Epoch: [0] [920/3229] lr: 0.000100 grad_norm: 0.7110 (0.9118) closs: 1.0895 (1.1382) time: 4.0816 data: 0.0002 max mem: 54684 -[00:25:53.206260] Epoch: [0] [930/3229] lr: 0.000100 grad_norm: 0.7317 (0.9100) closs: 1.1072 (1.1377) time: 4.0457 data: 0.0002 max mem: 54684 -[00:26:34.341822] Epoch: [0] [940/3229] lr: 0.000100 grad_norm: 0.7409 (0.9084) closs: 1.1083 (1.1372) time: 4.0796 data: 0.0002 max mem: 54684 -[00:27:15.438692] Epoch: [0] [950/3229] lr: 0.000100 grad_norm: 0.7488 (0.9067) closs: 1.0973 (1.1366) time: 4.1116 data: 0.0002 max mem: 54684 -[00:27:56.584356] Epoch: [0] [960/3229] lr: 0.000100 grad_norm: 0.7426 (0.9057) closs: 1.0873 (1.1361) time: 4.1121 data: 0.0002 max mem: 54684 -[00:28:37.382221] Epoch: [0] [970/3229] lr: 0.000100 grad_norm: 0.7408 (0.9040) closs: 1.0993 (1.1355) time: 4.0971 data: 0.0002 max mem: 54684 -[00:29:17.838797] Epoch: [0] [980/3229] lr: 0.000100 grad_norm: 0.7385 (0.9023) closs: 1.1009 (1.1351) time: 4.0627 data: 0.0002 max mem: 54684 -[00:29:59.329393] Epoch: [0] [990/3229] lr: 0.000100 grad_norm: 0.7272 (0.9007) closs: 1.1175 (1.1350) time: 4.0973 data: 0.0002 max mem: 54684 -[00:30:39.787315] Epoch: [0] [1000/3229] lr: 0.000100 grad_norm: 0.7447 (0.8993) closs: 1.1062 (1.1341) time: 4.0974 data: 0.0002 max mem: 54684 -[00:31:20.577977] Epoch: [0] [1010/3229] lr: 0.000100 grad_norm: 0.7520 (0.8976) closs: 1.0464 (1.1330) time: 4.0624 data: 0.0002 max mem: 54684 -[00:32:01.381502] Epoch: [0] [1020/3229] lr: 0.000100 grad_norm: 0.7217 (0.8957) closs: 1.0770 (1.1327) time: 4.0796 data: 0.0002 max mem: 54684 -[00:32:42.943123] Epoch: [0] [1030/3229] lr: 0.000100 grad_norm: 0.7217 (0.8942) closs: 1.1380 (1.1326) time: 4.1182 data: 0.0002 max mem: 54684 -[00:33:23.714134] Epoch: [0] [1040/3229] lr: 0.000100 grad_norm: 0.7318 (0.8925) closs: 1.0825 (1.1320) time: 4.1166 data: 0.0002 max mem: 54684 -[00:34:04.838184] Epoch: [0] [1050/3229] lr: 0.000100 grad_norm: 0.7374 (0.8912) closs: 1.0988 (1.1318) time: 4.0947 data: 0.0002 max mem: 54684 -[00:34:46.091225] Epoch: [0] [1060/3229] lr: 0.000100 grad_norm: 0.7631 (0.8897) closs: 1.1092 (1.1316) time: 4.1188 data: 0.0002 max mem: 54684 -[00:35:26.212506] Epoch: [0] [1070/3229] lr: 0.000099 grad_norm: 0.7038 (0.8877) closs: 1.0837 (1.1307) time: 4.0686 data: 0.0002 max mem: 54684 -[00:36:07.340784] Epoch: [0] [1080/3229] lr: 0.000099 grad_norm: 0.7197 (0.8864) closs: 1.0947 (1.1307) time: 4.0624 data: 0.0002 max mem: 54684 -[00:36:48.165191] Epoch: [0] [1090/3229] lr: 0.000099 grad_norm: 0.7339 (0.8849) closs: 1.1309 (1.1305) time: 4.0976 data: 0.0002 max mem: 54684 -[00:37:28.183396] Epoch: [0] [1100/3229] lr: 0.000099 grad_norm: 0.7004 (0.8833) closs: 1.0832 (1.1297) time: 4.0421 data: 0.0002 max mem: 54684 -[00:38:08.517357] Epoch: [0] [1110/3229] lr: 0.000099 grad_norm: 0.7150 (0.8816) closs: 1.0764 (1.1290) time: 4.0175 data: 0.0002 max mem: 54684 -[00:38:49.323103] Epoch: [0] [1120/3229] lr: 0.000099 grad_norm: 0.7361 (0.8804) closs: 1.0986 (1.1287) time: 4.0569 data: 0.0002 max mem: 54684 -[00:39:30.123571] Epoch: [0] [1130/3229] lr: 0.000099 grad_norm: 0.7361 (0.8792) closs: 1.0772 (1.1279) time: 4.0802 data: 0.0002 max mem: 54684 -[00:40:10.803257] Epoch: [0] [1140/3229] lr: 0.000099 grad_norm: 0.7173 (0.8776) closs: 1.0506 (1.1273) time: 4.0739 data: 0.0002 max mem: 54684 -[00:40:51.438742] Epoch: [0] [1150/3229] lr: 0.000099 grad_norm: 0.6859 (0.8758) closs: 1.0506 (1.1263) time: 4.0657 data: 0.0002 max mem: 54684 -[00:41:31.922675] Epoch: [0] [1160/3229] lr: 0.000099 grad_norm: 0.6937 (0.8745) closs: 1.0536 (1.1256) time: 4.0559 data: 0.0002 max mem: 54684 -[00:42:12.389017] Epoch: [0] [1170/3229] lr: 0.000099 grad_norm: 0.7231 (0.8731) closs: 1.0545 (1.1245) time: 4.0474 data: 0.0002 max mem: 54684 -[00:42:52.804008] Epoch: [0] [1180/3229] lr: 0.000099 grad_norm: 0.7039 (0.8716) closs: 1.0439 (1.1238) time: 4.0440 data: 0.0002 max mem: 54684 -[00:43:33.442370] Epoch: [0] [1190/3229] lr: 0.000099 grad_norm: 0.7185 (0.8703) closs: 1.0626 (1.1232) time: 4.0526 data: 0.0002 max mem: 54684 -[00:44:14.248252] Epoch: [0] [1200/3229] lr: 0.000099 grad_norm: 0.7064 (0.8688) closs: 1.0911 (1.1229) time: 4.0721 data: 0.0002 max mem: 54684 -[00:44:54.730599] Epoch: [0] [1210/3229] lr: 0.000099 grad_norm: 0.6838 (0.8671) closs: 1.0769 (1.1222) time: 4.0643 data: 0.0002 max mem: 54684 -[00:45:35.439411] Epoch: [0] [1220/3229] lr: 0.000099 grad_norm: 0.6745 (0.8657) closs: 1.0487 (1.1217) time: 4.0595 data: 0.0002 max mem: 54684 -[00:46:16.740734] Epoch: [0] [1230/3229] lr: 0.000099 grad_norm: 0.6783 (0.8642) closs: 1.1197 (1.1218) time: 4.1004 data: 0.0002 max mem: 54684 -[00:46:57.546801] Epoch: [0] [1240/3229] lr: 0.000099 grad_norm: 0.7019 (0.8631) closs: 1.1150 (1.1215) time: 4.1053 data: 0.0002 max mem: 54684 -[00:47:38.021108] Epoch: [0] [1250/3229] lr: 0.000099 grad_norm: 0.7065 (0.8617) closs: 1.0681 (1.1212) time: 4.0640 data: 0.0002 max mem: 54684 -[00:48:18.636920] Epoch: [0] [1260/3229] lr: 0.000099 grad_norm: 0.6828 (0.8605) closs: 1.0594 (1.1205) time: 4.0544 data: 0.0002 max mem: 54684 -[00:48:59.298551] Epoch: [0] [1270/3229] lr: 0.000099 grad_norm: 0.6828 (0.8593) closs: 1.0436 (1.1198) time: 4.0638 data: 0.0002 max mem: 54684 -[00:49:40.435891] Epoch: [0] [1280/3229] lr: 0.000099 grad_norm: 0.6738 (0.8580) closs: 1.0650 (1.1194) time: 4.0899 data: 0.0002 max mem: 54684 -[00:50:20.609485] Epoch: [0] [1290/3229] lr: 0.000099 grad_norm: 0.6694 (0.8565) closs: 1.0689 (1.1191) time: 4.0655 data: 0.0002 max mem: 54684 -[00:51:00.959404] Epoch: [0] [1300/3229] lr: 0.000099 grad_norm: 0.6727 (0.8552) closs: 1.0693 (1.1186) time: 4.0261 data: 0.0002 max mem: 54684 -[00:51:41.919049] Epoch: [0] [1310/3229] lr: 0.000099 grad_norm: 0.6736 (0.8538) closs: 1.0793 (1.1182) time: 4.0654 data: 0.0002 max mem: 54684 -[00:52:22.383012] Epoch: [0] [1320/3229] lr: 0.000099 grad_norm: 0.6669 (0.8526) closs: 1.0662 (1.1176) time: 4.0711 data: 0.0002 max mem: 54684 -[00:53:03.188621] Epoch: [0] [1330/3229] lr: 0.000099 grad_norm: 0.6665 (0.8512) closs: 1.0662 (1.1174) time: 4.0634 data: 0.0002 max mem: 54684 -[00:53:44.567629] Epoch: [0] [1340/3229] lr: 0.000099 grad_norm: 0.6947 (0.8502) closs: 1.1050 (1.1174) time: 4.1092 data: 0.0002 max mem: 54684 -[00:54:25.195105] Epoch: [0] [1350/3229] lr: 0.000099 grad_norm: 0.7112 (0.8491) closs: 1.0731 (1.1170) time: 4.1003 data: 0.0002 max mem: 54684 -[00:55:05.687079] Epoch: [0] [1360/3229] lr: 0.000099 grad_norm: 0.6784 (0.8480) closs: 1.0562 (1.1165) time: 4.0559 data: 0.0002 max mem: 54684 -[00:55:46.805581] Epoch: [0] [1370/3229] lr: 0.000099 grad_norm: 0.6794 (0.8470) closs: 1.0625 (1.1162) time: 4.0805 data: 0.0002 max mem: 54684 -[00:56:28.138612] Epoch: [0] [1380/3229] lr: 0.000098 grad_norm: 0.6794 (0.8460) closs: 1.0625 (1.1159) time: 4.1225 data: 0.0002 max mem: 54684 -[00:57:08.790631] Epoch: [0] [1390/3229] lr: 0.000098 grad_norm: 0.6731 (0.8447) closs: 1.0694 (1.1155) time: 4.0992 data: 0.0002 max mem: 54684 -[00:57:49.591828] Epoch: [0] [1400/3229] lr: 0.000098 grad_norm: 0.6726 (0.8434) closs: 1.0810 (1.1152) time: 4.0726 data: 0.0002 max mem: 54684 -[00:58:30.383532] Epoch: [0] [1410/3229] lr: 0.000098 grad_norm: 0.6579 (0.8422) closs: 1.0879 (1.1149) time: 4.0796 data: 0.0002 max mem: 54684 -[00:59:11.318426] Epoch: [0] [1420/3229] lr: 0.000098 grad_norm: 0.6579 (0.8409) closs: 1.0965 (1.1149) time: 4.0863 data: 0.0002 max mem: 54684 -[00:59:51.955092] Epoch: [0] [1430/3229] lr: 0.000098 grad_norm: 0.6626 (0.8394) closs: 1.0851 (1.1145) time: 4.0785 data: 0.0002 max mem: 54684 -[01:00:32.756437] Epoch: [0] [1440/3229] lr: 0.000098 grad_norm: 0.6626 (0.8383) closs: 1.0851 (1.1144) time: 4.0718 data: 0.0002 max mem: 54684 -[01:01:13.548959] Epoch: [0] [1450/3229] lr: 0.000098 grad_norm: 0.6736 (0.8370) closs: 1.1211 (1.1145) time: 4.0796 data: 0.0002 max mem: 54684 -[01:01:54.540506] Epoch: [0] [1460/3229] lr: 0.000098 grad_norm: 0.6815 (0.8361) closs: 1.1259 (1.1144) time: 4.0891 data: 0.0002 max mem: 54684 -[01:02:35.496041] Epoch: [0] [1470/3229] lr: 0.000098 grad_norm: 0.7130 (0.8352) closs: 1.0750 (1.1140) time: 4.0973 data: 0.0002 max mem: 54684 -[01:03:16.615434] Epoch: [0] [1480/3229] lr: 0.000098 grad_norm: 0.6600 (0.8339) closs: 1.0744 (1.1137) time: 4.1037 data: 0.0002 max mem: 54684 -[01:03:57.734129] Epoch: [0] [1490/3229] lr: 0.000098 grad_norm: 0.6600 (0.8330) closs: 1.0960 (1.1136) time: 4.1118 data: 0.0002 max mem: 54684 -[01:04:39.265592] Epoch: [0] [1500/3229] lr: 0.000098 grad_norm: 0.6993 (0.8322) closs: 1.0960 (1.1136) time: 4.1324 data: 0.0002 max mem: 54684 -[01:05:20.047181] Epoch: [0] [1510/3229] lr: 0.000098 grad_norm: 0.6993 (0.8313) closs: 1.0891 (1.1133) time: 4.1156 data: 0.0002 max mem: 54684 -[01:06:01.147748] Epoch: [0] [1520/3229] lr: 0.000098 grad_norm: 0.6925 (0.8304) closs: 1.0764 (1.1131) time: 4.0940 data: 0.0002 max mem: 54684 -[01:06:42.349323] Epoch: [0] [1530/3229] lr: 0.000098 grad_norm: 0.6751 (0.8295) closs: 1.0807 (1.1129) time: 4.1150 data: 0.0002 max mem: 54684 -[01:07:23.760475] Epoch: [0] [1540/3229] lr: 0.000098 grad_norm: 0.6925 (0.8287) closs: 1.0949 (1.1126) time: 4.1306 data: 0.0002 max mem: 54684 -[01:08:03.939766] Epoch: [0] [1550/3229] lr: 0.000098 grad_norm: 0.6828 (0.8275) closs: 1.0497 (1.1118) time: 4.0795 data: 0.0002 max mem: 54684 -[01:08:44.774063] Epoch: [0] [1560/3229] lr: 0.000098 grad_norm: 0.6374 (0.8264) closs: 1.0350 (1.1114) time: 4.0506 data: 0.0002 max mem: 54684 -[01:09:25.978765] Epoch: [0] [1570/3229] lr: 0.000098 grad_norm: 0.6246 (0.8253) closs: 1.0372 (1.1111) time: 4.1019 data: 0.0002 max mem: 54684 -[01:10:06.789325] Epoch: [0] [1580/3229] lr: 0.000098 grad_norm: 0.6425 (0.8243) closs: 1.0611 (1.1106) time: 4.1007 data: 0.0002 max mem: 54684 -[01:10:47.602967] Epoch: [0] [1590/3229] lr: 0.000097 grad_norm: 0.6555 (0.8233) closs: 1.0611 (1.1102) time: 4.0811 data: 0.0002 max mem: 54684 -[01:11:27.430855] Epoch: [0] [1600/3229] lr: 0.000097 grad_norm: 0.6335 (0.8220) closs: 1.0391 (1.1095) time: 4.0320 data: 0.0002 max mem: 54684 -[01:12:08.472840] Epoch: [0] [1610/3229] lr: 0.000097 grad_norm: 0.6397 (0.8210) closs: 1.0789 (1.1094) time: 4.0434 data: 0.0002 max mem: 54684 -[01:12:49.720690] Epoch: [0] [1620/3229] lr: 0.000097 grad_norm: 0.6731 (0.8203) closs: 1.1052 (1.1095) time: 4.1144 data: 0.0002 max mem: 54684 -[01:13:30.798702] Epoch: [0] [1630/3229] lr: 0.000097 grad_norm: 0.6657 (0.8194) closs: 1.0879 (1.1092) time: 4.1162 data: 0.0002 max mem: 54684 -[01:14:11.875270] Epoch: [0] [1640/3229] lr: 0.000097 grad_norm: 0.6634 (0.8185) closs: 1.0959 (1.1093) time: 4.1077 data: 0.0002 max mem: 54684 -[01:14:52.887302] Epoch: [0] [1650/3229] lr: 0.000097 grad_norm: 0.6733 (0.8176) closs: 1.0812 (1.1090) time: 4.1044 data: 0.0002 max mem: 54684 -[01:15:33.822367] Epoch: [0] [1660/3229] lr: 0.000097 grad_norm: 0.6436 (0.8166) closs: 1.0691 (1.1089) time: 4.0973 data: 0.0002 max mem: 54684 -[01:16:14.261803] Epoch: [0] [1670/3229] lr: 0.000097 grad_norm: 0.6324 (0.8155) closs: 1.0902 (1.1088) time: 4.0687 data: 0.0002 max mem: 54684 -[01:16:55.379849] Epoch: [0] [1680/3229] lr: 0.000097 grad_norm: 0.6285 (0.8143) closs: 1.0896 (1.1088) time: 4.0778 data: 0.0002 max mem: 54684 -[01:17:35.740413] Epoch: [0] [1690/3229] lr: 0.000097 grad_norm: 0.6079 (0.8132) closs: 1.0718 (1.1084) time: 4.0739 data: 0.0002 max mem: 54684 -[01:18:16.750968] Epoch: [0] [1700/3229] lr: 0.000097 grad_norm: 0.6269 (0.8122) closs: 1.0566 (1.1082) time: 4.0685 data: 0.0002 max mem: 54684 -[01:18:56.907298] Epoch: [0] [1710/3229] lr: 0.000097 grad_norm: 0.6192 (0.8112) closs: 1.0566 (1.1077) time: 4.0583 data: 0.0002 max mem: 54684 -[01:19:37.360724] Epoch: [0] [1720/3229] lr: 0.000097 grad_norm: 0.6271 (0.8102) closs: 1.0554 (1.1074) time: 4.0304 data: 0.0002 max mem: 54684 -[01:20:18.619365] Epoch: [0] [1730/3229] lr: 0.000097 grad_norm: 0.6375 (0.8094) closs: 1.0484 (1.1072) time: 4.0855 data: 0.0002 max mem: 54684 -[01:20:59.595694] Epoch: [0] [1740/3229] lr: 0.000097 grad_norm: 0.6375 (0.8083) closs: 1.0680 (1.1071) time: 4.1117 data: 0.0002 max mem: 54684 -[01:21:40.383403] Epoch: [0] [1750/3229] lr: 0.000097 grad_norm: 0.6502 (0.8075) closs: 1.0761 (1.1069) time: 4.0881 data: 0.0002 max mem: 54684 -[01:22:21.505953] Epoch: [0] [1760/3229] lr: 0.000096 grad_norm: 0.6573 (0.8066) closs: 1.1000 (1.1068) time: 4.0954 data: 0.0002 max mem: 54684 -[01:23:02.622161] Epoch: [0] [1770/3229] lr: 0.000096 grad_norm: 0.6351 (0.8057) closs: 1.0783 (1.1064) time: 4.1119 data: 0.0002 max mem: 54684 -[01:23:43.593447] Epoch: [0] [1780/3229] lr: 0.000096 grad_norm: 0.6351 (0.8047) closs: 1.0777 (1.1061) time: 4.1043 data: 0.0002 max mem: 54684 -[01:24:24.461912] Epoch: [0] [1790/3229] lr: 0.000096 grad_norm: 0.6306 (0.8036) closs: 1.0259 (1.1058) time: 4.0919 data: 0.0002 max mem: 54684 -[01:25:05.008923] Epoch: [0] [1800/3229] lr: 0.000096 grad_norm: 0.6409 (0.8027) closs: 1.0266 (1.1055) time: 4.0707 data: 0.0002 max mem: 54684 -[01:25:46.132901] Epoch: [0] [1810/3229] lr: 0.000096 grad_norm: 0.6404 (0.8017) closs: 1.0714 (1.1053) time: 4.0835 data: 0.0002 max mem: 54684 -[01:26:26.759139] Epoch: [0] [1820/3229] lr: 0.000096 grad_norm: 0.6292 (0.8007) closs: 1.0945 (1.1052) time: 4.0874 data: 0.0002 max mem: 54684 -[01:27:07.555099] Epoch: [0] [1830/3229] lr: 0.000096 grad_norm: 0.6392 (0.7998) closs: 1.0834 (1.1051) time: 4.0710 data: 0.0002 max mem: 54684 -[01:27:47.364766] Epoch: [0] [1840/3229] lr: 0.000096 grad_norm: 0.5939 (0.7986) closs: 1.0471 (1.1047) time: 4.0302 data: 0.0002 max mem: 54684 -[01:28:28.031123] Epoch: [0] [1850/3229] lr: 0.000096 grad_norm: 0.6241 (0.7977) closs: 1.0452 (1.1044) time: 4.0237 data: 0.0002 max mem: 54684 -[01:29:08.346102] Epoch: [0] [1860/3229] lr: 0.000096 grad_norm: 0.6386 (0.7967) closs: 1.0565 (1.1041) time: 4.0490 data: 0.0002 max mem: 54684 -[01:29:49.454480] Epoch: [0] [1870/3229] lr: 0.000096 grad_norm: 0.6596 (0.7961) closs: 1.0934 (1.1040) time: 4.0711 data: 0.0002 max mem: 54684 -[01:30:30.259488] Epoch: [0] [1880/3229] lr: 0.000096 grad_norm: 0.6550 (0.7952) closs: 1.0679 (1.1036) time: 4.0956 data: 0.0002 max mem: 54684 -[01:31:11.247510] Epoch: [0] [1890/3229] lr: 0.000096 grad_norm: 0.6164 (0.7943) closs: 1.0539 (1.1034) time: 4.0896 data: 0.0002 max mem: 54684 -[01:31:52.539214] Epoch: [0] [1900/3229] lr: 0.000096 grad_norm: 0.6490 (0.7938) closs: 1.0805 (1.1033) time: 4.1139 data: 0.0002 max mem: 54684 -[01:32:33.012926] Epoch: [0] [1910/3229] lr: 0.000095 grad_norm: 0.6397 (0.7929) closs: 1.0446 (1.1028) time: 4.0882 data: 0.0002 max mem: 54684 -[01:33:13.158983] Epoch: [0] [1920/3229] lr: 0.000095 grad_norm: 0.6288 (0.7920) closs: 1.0213 (1.1023) time: 4.0309 data: 0.0002 max mem: 54684 -[01:33:54.586231] Epoch: [0] [1930/3229] lr: 0.000095 grad_norm: 0.6317 (0.7912) closs: 1.0280 (1.1022) time: 4.0786 data: 0.0002 max mem: 54684 -[01:34:35.894563] Epoch: [0] [1940/3229] lr: 0.000095 grad_norm: 0.6358 (0.7905) closs: 1.1057 (1.1023) time: 4.1367 data: 0.0002 max mem: 54684 -[01:35:17.015653] Epoch: [0] [1950/3229] lr: 0.000095 grad_norm: 0.6502 (0.7900) closs: 1.0936 (1.1022) time: 4.1214 data: 0.0002 max mem: 54684 -[01:35:57.478499] Epoch: [0] [1960/3229] lr: 0.000095 grad_norm: 0.6502 (0.7891) closs: 1.0743 (1.1020) time: 4.0791 data: 0.0002 max mem: 54684 -[01:36:38.315306] Epoch: [0] [1970/3229] lr: 0.000095 grad_norm: 0.6454 (0.7885) closs: 1.0743 (1.1019) time: 4.0649 data: 0.0002 max mem: 54684 -[01:37:18.937387] Epoch: [0] [1980/3229] lr: 0.000095 grad_norm: 0.6482 (0.7877) closs: 1.0680 (1.1017) time: 4.0729 data: 0.0002 max mem: 54684 -[01:38:00.087656] Epoch: [0] [1990/3229] lr: 0.000095 grad_norm: 0.6580 (0.7872) closs: 1.0680 (1.1015) time: 4.0885 data: 0.0002 max mem: 54684 -[01:38:40.255048] Epoch: [0] [2000/3229] lr: 0.000095 grad_norm: 0.6547 (0.7863) closs: 1.0624 (1.1011) time: 4.0658 data: 0.0002 max mem: 54684 -[01:39:22.009369] Epoch: [0] [2010/3229] lr: 0.000095 grad_norm: 0.6437 (0.7856) closs: 1.0624 (1.1009) time: 4.0960 data: 0.0002 max mem: 54684 -[01:40:02.651703] Epoch: [0] [2020/3229] lr: 0.000095 grad_norm: 0.6437 (0.7849) closs: 1.0730 (1.1008) time: 4.1198 data: 0.0002 max mem: 54684 -[01:40:42.813224] Epoch: [0] [2030/3229] lr: 0.000095 grad_norm: 0.6359 (0.7840) closs: 1.0540 (1.1003) time: 4.0401 data: 0.0002 max mem: 54684 -[01:41:23.609220] Epoch: [0] [2040/3229] lr: 0.000095 grad_norm: 0.6075 (0.7831) closs: 1.0542 (1.1001) time: 4.0478 data: 0.0002 max mem: 54684 -[01:42:04.985880] Epoch: [0] [2050/3229] lr: 0.000094 grad_norm: 0.6034 (0.7823) closs: 1.0908 (1.1002) time: 4.1086 data: 0.0002 max mem: 54684 -[01:42:45.962473] Epoch: [0] [2060/3229] lr: 0.000094 grad_norm: 0.6032 (0.7815) closs: 1.1014 (1.1001) time: 4.1176 data: 0.0002 max mem: 54684 -[01:43:26.108426] Epoch: [0] [2070/3229] lr: 0.000094 grad_norm: 0.6324 (0.7809) closs: 1.0512 (1.0997) time: 4.0561 data: 0.0002 max mem: 54684 -[01:44:07.013619] Epoch: [0] [2080/3229] lr: 0.000094 grad_norm: 0.6366 (0.7801) closs: 1.0372 (1.0995) time: 4.0525 data: 0.0002 max mem: 54684 -[01:44:48.024718] Epoch: [0] [2090/3229] lr: 0.000094 grad_norm: 0.6275 (0.7794) closs: 1.0783 (1.0992) time: 4.0957 data: 0.0002 max mem: 54684 -[01:45:28.323605] Epoch: [0] [2100/3229] lr: 0.000094 grad_norm: 0.6155 (0.7786) closs: 1.0706 (1.0991) time: 4.0654 data: 0.0002 max mem: 54684 -[01:46:09.119391] Epoch: [0] [2110/3229] lr: 0.000094 grad_norm: 0.6284 (0.7779) closs: 1.0700 (1.0989) time: 4.0547 data: 0.0002 max mem: 54684 -[01:46:49.684021] Epoch: [0] [2120/3229] lr: 0.000094 grad_norm: 0.6332 (0.7772) closs: 1.0607 (1.0986) time: 4.0680 data: 0.0002 max mem: 54684 -[01:47:30.511414] Epoch: [0] [2130/3229] lr: 0.000094 grad_norm: 0.6332 (0.7764) closs: 1.0698 (1.0986) time: 4.0695 data: 0.0002 max mem: 54684 -[01:48:11.156731] Epoch: [0] [2140/3229] lr: 0.000094 grad_norm: 0.6349 (0.7757) closs: 1.0916 (1.0985) time: 4.0736 data: 0.0002 max mem: 54684 -[01:48:51.920791] Epoch: [0] [2150/3229] lr: 0.000094 grad_norm: 0.6271 (0.7751) closs: 1.0552 (1.0983) time: 4.0704 data: 0.0002 max mem: 54684 -[01:49:32.907279] Epoch: [0] [2160/3229] lr: 0.000094 grad_norm: 0.6144 (0.7744) closs: 1.0773 (1.0983) time: 4.0875 data: 0.0002 max mem: 54684 -[01:50:14.443893] Epoch: [0] [2170/3229] lr: 0.000093 grad_norm: 0.6341 (0.7737) closs: 1.1436 (1.0986) time: 4.1261 data: 0.0002 max mem: 54684 -[01:50:55.743917] Epoch: [0] [2180/3229] lr: 0.000093 grad_norm: 0.6366 (0.7730) closs: 1.0898 (1.0985) time: 4.1418 data: 0.0002 max mem: 54684 -[01:51:36.202814] Epoch: [0] [2190/3229] lr: 0.000093 grad_norm: 0.6366 (0.7725) closs: 1.0797 (1.0984) time: 4.0879 data: 0.0002 max mem: 54684 -[01:52:17.113173] Epoch: [0] [2200/3229] lr: 0.000093 grad_norm: 0.6388 (0.7718) closs: 1.0543 (1.0983) time: 4.0684 data: 0.0002 max mem: 54684 -[01:52:58.036227] Epoch: [0] [2210/3229] lr: 0.000093 grad_norm: 0.6292 (0.7711) closs: 1.0506 (1.0982) time: 4.0916 data: 0.0002 max mem: 54684 -[01:53:38.979227] Epoch: [0] [2220/3229] lr: 0.000093 grad_norm: 0.6303 (0.7705) closs: 1.0667 (1.0979) time: 4.0932 data: 0.0002 max mem: 54684 -[01:54:19.782478] Epoch: [0] [2230/3229] lr: 0.000093 grad_norm: 0.6258 (0.7698) closs: 1.0397 (1.0977) time: 4.0872 data: 0.0002 max mem: 54684 -[01:55:00.467224] Epoch: [0] [2240/3229] lr: 0.000093 grad_norm: 0.6217 (0.7693) closs: 1.0617 (1.0974) time: 4.0743 data: 0.0002 max mem: 54684 -[01:55:41.351470] Epoch: [0] [2250/3229] lr: 0.000093 grad_norm: 0.6183 (0.7686) closs: 1.0617 (1.0973) time: 4.0784 data: 0.0002 max mem: 54684 -[01:56:22.358347] Epoch: [0] [2260/3229] lr: 0.000093 grad_norm: 0.6090 (0.7679) closs: 1.0609 (1.0971) time: 4.0945 data: 0.0002 max mem: 54684 -[01:57:03.481704] Epoch: [0] [2270/3229] lr: 0.000093 grad_norm: 0.6140 (0.7673) closs: 1.0597 (1.0969) time: 4.1064 data: 0.0002 max mem: 54684 -[01:57:44.512674] Epoch: [0] [2280/3229] lr: 0.000093 grad_norm: 0.6077 (0.7666) closs: 1.0421 (1.0968) time: 4.1076 data: 0.0002 max mem: 54684 -[01:58:25.617467] Epoch: [0] [2290/3229] lr: 0.000092 grad_norm: 0.5894 (0.7659) closs: 1.0421 (1.0967) time: 4.1067 data: 0.0002 max mem: 54684 -[01:59:06.748646] Epoch: [0] [2300/3229] lr: 0.000092 grad_norm: 0.6177 (0.7654) closs: 1.1147 (1.0967) time: 4.1117 data: 0.0002 max mem: 54684 -[01:59:46.929884] Epoch: [0] [2310/3229] lr: 0.000092 grad_norm: 0.6242 (0.7647) closs: 1.0786 (1.0965) time: 4.0656 data: 0.0002 max mem: 54684 -[02:00:27.941038] Epoch: [0] [2320/3229] lr: 0.000092 grad_norm: 0.6335 (0.7641) closs: 1.0572 (1.0963) time: 4.0596 data: 0.0002 max mem: 54684 -[02:01:09.062412] Epoch: [0] [2330/3229] lr: 0.000092 grad_norm: 0.5965 (0.7634) closs: 1.0347 (1.0959) time: 4.1066 data: 0.0002 max mem: 54684 -[02:01:49.850718] Epoch: [0] [2340/3229] lr: 0.000092 grad_norm: 0.5994 (0.7628) closs: 1.0367 (1.0959) time: 4.0954 data: 0.0002 max mem: 54684 -[02:02:30.342915] Epoch: [0] [2350/3229] lr: 0.000092 grad_norm: 0.6362 (0.7621) closs: 1.0736 (1.0957) time: 4.0640 data: 0.0002 max mem: 54684 -[02:03:11.287984] Epoch: [0] [2360/3229] lr: 0.000092 grad_norm: 0.6227 (0.7616) closs: 1.0493 (1.0956) time: 4.0718 data: 0.0002 max mem: 54684 -[02:03:52.343325] Epoch: [0] [2370/3229] lr: 0.000092 grad_norm: 0.6227 (0.7610) closs: 1.0744 (1.0957) time: 4.0999 data: 0.0002 max mem: 54684 -[02:04:33.448316] Epoch: [0] [2380/3229] lr: 0.000092 grad_norm: 0.5984 (0.7603) closs: 1.1016 (1.0957) time: 4.1079 data: 0.0002 max mem: 54684 -[02:05:13.893563] Epoch: [0] [2390/3229] lr: 0.000092 grad_norm: 0.5984 (0.7597) closs: 1.0778 (1.0955) time: 4.0774 data: 0.0002 max mem: 54684 -[02:05:54.988858] Epoch: [0] [2400/3229] lr: 0.000091 grad_norm: 0.6343 (0.7591) closs: 1.0643 (1.0953) time: 4.0770 data: 0.0002 max mem: 54684 -[02:06:36.309038] Epoch: [0] [2410/3229] lr: 0.000091 grad_norm: 0.6401 (0.7586) closs: 1.0573 (1.0952) time: 4.1207 data: 0.0002 max mem: 54684 -[02:07:17.084821] Epoch: [0] [2420/3229] lr: 0.000091 grad_norm: 0.6369 (0.7581) closs: 1.0577 (1.0950) time: 4.1047 data: 0.0002 max mem: 54684 -[02:07:57.540598] Epoch: [0] [2430/3229] lr: 0.000091 grad_norm: 0.6204 (0.7575) closs: 1.0577 (1.0947) time: 4.0615 data: 0.0002 max mem: 54684 -[02:08:39.007315] Epoch: [0] [2440/3229] lr: 0.000091 grad_norm: 0.6199 (0.7569) closs: 1.0658 (1.0947) time: 4.0961 data: 0.0002 max mem: 54684 -[02:09:20.629948] Epoch: [0] [2450/3229] lr: 0.000091 grad_norm: 0.6166 (0.7564) closs: 1.1124 (1.0947) time: 4.1544 data: 0.0002 max mem: 54684 -[02:10:01.711671] Epoch: [0] [2460/3229] lr: 0.000091 grad_norm: 0.6152 (0.7559) closs: 1.0793 (1.0946) time: 4.1352 data: 0.0002 max mem: 54684 -[02:10:42.828111] Epoch: [0] [2470/3229] lr: 0.000091 grad_norm: 0.6222 (0.7554) closs: 1.0420 (1.0943) time: 4.1098 data: 0.0002 max mem: 54684 -[02:11:23.897608] Epoch: [0] [2480/3229] lr: 0.000091 grad_norm: 0.6073 (0.7548) closs: 1.0518 (1.0941) time: 4.1092 data: 0.0002 max mem: 54684 -[02:12:05.274949] Epoch: [0] [2490/3229] lr: 0.000091 grad_norm: 0.6073 (0.7543) closs: 1.0648 (1.0941) time: 4.1223 data: 0.0002 max mem: 54684 -[02:12:46.406795] Epoch: [0] [2500/3229] lr: 0.000090 grad_norm: 0.6085 (0.7537) closs: 1.1052 (1.0941) time: 4.1254 data: 0.0002 max mem: 54684 -[02:13:26.863288] Epoch: [0] [2510/3229] lr: 0.000090 grad_norm: 0.5947 (0.7531) closs: 1.0760 (1.0939) time: 4.0794 data: 0.0002 max mem: 54684 -[02:14:08.186519] Epoch: [0] [2520/3229] lr: 0.000090 grad_norm: 0.5880 (0.7526) closs: 1.0614 (1.0938) time: 4.0889 data: 0.0002 max mem: 54684 -[02:14:49.191285] Epoch: [0] [2530/3229] lr: 0.000090 grad_norm: 0.6019 (0.7520) closs: 1.0730 (1.0938) time: 4.1163 data: 0.0002 max mem: 54684 -[02:15:29.975783] Epoch: [0] [2540/3229] lr: 0.000090 grad_norm: 0.5906 (0.7514) closs: 1.0601 (1.0936) time: 4.0894 data: 0.0002 max mem: 54684 -[02:16:10.768922] Epoch: [0] [2550/3229] lr: 0.000090 grad_norm: 0.5906 (0.7508) closs: 1.0423 (1.0934) time: 4.0788 data: 0.0002 max mem: 54684 -[02:16:52.174050] Epoch: [0] [2560/3229] lr: 0.000090 grad_norm: 0.5961 (0.7503) closs: 1.0423 (1.0933) time: 4.1098 data: 0.0002 max mem: 54684 -[02:17:33.116524] Epoch: [0] [2570/3229] lr: 0.000090 grad_norm: 0.5933 (0.7497) closs: 1.0581 (1.0932) time: 4.1173 data: 0.0002 max mem: 54684 -[02:18:14.196101] Epoch: [0] [2580/3229] lr: 0.000090 grad_norm: 0.5860 (0.7491) closs: 1.0619 (1.0931) time: 4.1010 data: 0.0002 max mem: 54684 -[02:18:54.324378] Epoch: [0] [2590/3229] lr: 0.000090 grad_norm: 0.5765 (0.7484) closs: 1.0785 (1.0929) time: 4.0603 data: 0.0002 max mem: 54684 -[02:19:35.831532] Epoch: [0] [2600/3229] lr: 0.000089 grad_norm: 0.5866 (0.7479) closs: 1.0738 (1.0928) time: 4.0817 data: 0.0002 max mem: 54684 -[02:20:16.483023] Epoch: [0] [2610/3229] lr: 0.000089 grad_norm: 0.6122 (0.7473) closs: 1.0686 (1.0925) time: 4.1079 data: 0.0002 max mem: 54684 -[02:20:56.936228] Epoch: [0] [2620/3229] lr: 0.000089 grad_norm: 0.5854 (0.7467) closs: 1.0702 (1.0924) time: 4.0552 data: 0.0002 max mem: 54684 -[02:21:38.121722] Epoch: [0] [2630/3229] lr: 0.000089 grad_norm: 0.5962 (0.7461) closs: 1.0889 (1.0925) time: 4.0819 data: 0.0002 max mem: 54684 -[02:22:19.198229] Epoch: [0] [2640/3229] lr: 0.000089 grad_norm: 0.6168 (0.7457) closs: 1.0931 (1.0924) time: 4.1130 data: 0.0002 max mem: 54684 -[02:22:59.823945] Epoch: [0] [2650/3229] lr: 0.000089 grad_norm: 0.5995 (0.7450) closs: 1.0365 (1.0922) time: 4.0850 data: 0.0002 max mem: 54684 -[02:23:40.933433] Epoch: [0] [2660/3229] lr: 0.000089 grad_norm: 0.5974 (0.7446) closs: 1.0331 (1.0920) time: 4.0867 data: 0.0002 max mem: 54684 -[02:24:21.838608] Epoch: [0] [2670/3229] lr: 0.000089 grad_norm: 0.6172 (0.7441) closs: 1.0431 (1.0918) time: 4.1007 data: 0.0002 max mem: 54684 -[02:25:02.759550] Epoch: [0] [2680/3229] lr: 0.000089 grad_norm: 0.6097 (0.7436) closs: 1.0506 (1.0917) time: 4.0912 data: 0.0002 max mem: 54684 -[02:25:43.386770] Epoch: [0] [2690/3229] lr: 0.000089 grad_norm: 0.5930 (0.7430) closs: 1.0564 (1.0916) time: 4.0773 data: 0.0002 max mem: 54684 -[02:26:23.853609] Epoch: [0] [2700/3229] lr: 0.000088 grad_norm: 0.5717 (0.7423) closs: 1.0674 (1.0915) time: 4.0546 data: 0.0002 max mem: 54684 -[02:27:04.832163] Epoch: [0] [2710/3229] lr: 0.000088 grad_norm: 0.5725 (0.7417) closs: 1.0830 (1.0914) time: 4.0722 data: 0.0002 max mem: 54684 -[02:27:46.111218] Epoch: [0] [2720/3229] lr: 0.000088 grad_norm: 0.5860 (0.7414) closs: 1.0830 (1.0914) time: 4.1128 data: 0.0002 max mem: 54684 -[02:28:27.035746] Epoch: [0] [2730/3229] lr: 0.000088 grad_norm: 0.5996 (0.7410) closs: 1.1146 (1.0913) time: 4.1101 data: 0.0002 max mem: 54684 -[02:29:07.490028] Epoch: [0] [2740/3229] lr: 0.000088 grad_norm: 0.6072 (0.7405) closs: 1.0595 (1.0911) time: 4.0689 data: 0.0002 max mem: 54684 -[02:29:48.509847] Epoch: [0] [2750/3229] lr: 0.000088 grad_norm: 0.6080 (0.7400) closs: 1.0567 (1.0910) time: 4.0736 data: 0.0002 max mem: 54684 -[02:30:29.784566] Epoch: [0] [2760/3229] lr: 0.000088 grad_norm: 0.5941 (0.7394) closs: 1.0694 (1.0909) time: 4.1147 data: 0.0002 max mem: 54684 -[02:31:10.397341] Epoch: [0] [2770/3229] lr: 0.000088 grad_norm: 0.5914 (0.7388) closs: 1.0694 (1.0907) time: 4.0943 data: 0.0002 max mem: 54684 -[02:31:50.521320] Epoch: [0] [2780/3229] lr: 0.000088 grad_norm: 0.5933 (0.7382) closs: 1.0263 (1.0903) time: 4.0368 data: 0.0002 max mem: 54684 -[02:32:31.199358] Epoch: [0] [2790/3229] lr: 0.000087 grad_norm: 0.5933 (0.7377) closs: 1.0229 (1.0902) time: 4.0400 data: 0.0002 max mem: 54684 -[02:33:11.823438] Epoch: [0] [2800/3229] lr: 0.000087 grad_norm: 0.5871 (0.7372) closs: 1.0686 (1.0900) time: 4.0650 data: 0.0002 max mem: 54684 -[02:33:52.788200] Epoch: [0] [2810/3229] lr: 0.000087 grad_norm: 0.5766 (0.7369) closs: 1.0498 (1.0899) time: 4.0794 data: 0.0002 max mem: 54684 -[02:34:33.570995] Epoch: [0] [2820/3229] lr: 0.000087 grad_norm: 0.5867 (0.7364) closs: 1.0498 (1.0897) time: 4.0873 data: 0.0002 max mem: 54684 -[02:35:14.475184] Epoch: [0] [2830/3229] lr: 0.000087 grad_norm: 0.5867 (0.7360) closs: 1.0506 (1.0897) time: 4.0843 data: 0.0002 max mem: 54684 -[02:35:55.072862] Epoch: [0] [2840/3229] lr: 0.000087 grad_norm: 0.5796 (0.7354) closs: 1.0448 (1.0896) time: 4.0750 data: 0.0002 max mem: 54684 -[02:36:36.343452] Epoch: [0] [2850/3229] lr: 0.000087 grad_norm: 0.5795 (0.7349) closs: 1.0341 (1.0895) time: 4.0934 data: 0.0002 max mem: 54684 -[02:37:16.453657] Epoch: [0] [2860/3229] lr: 0.000087 grad_norm: 0.5890 (0.7343) closs: 1.0884 (1.0894) time: 4.0690 data: 0.0002 max mem: 54684 -[02:37:57.847510] Epoch: [0] [2870/3229] lr: 0.000087 grad_norm: 0.6032 (0.7338) closs: 1.0884 (1.0893) time: 4.0751 data: 0.0002 max mem: 54684 -[02:38:38.666224] Epoch: [0] [2880/3229] lr: 0.000086 grad_norm: 0.6020 (0.7333) closs: 1.1037 (1.0894) time: 4.1106 data: 0.0002 max mem: 54684 -[02:39:19.313654] Epoch: [0] [2890/3229] lr: 0.000086 grad_norm: 0.5895 (0.7328) closs: 1.0991 (1.0892) time: 4.0732 data: 0.0002 max mem: 54684 -[02:40:00.426834] Epoch: [0] [2900/3229] lr: 0.000086 grad_norm: 0.5895 (0.7323) closs: 1.0807 (1.0892) time: 4.0880 data: 0.0002 max mem: 54684 -[02:40:41.449641] Epoch: [0] [2910/3229] lr: 0.000086 grad_norm: 0.6071 (0.7318) closs: 1.0677 (1.0890) time: 4.1067 data: 0.0002 max mem: 54684 -[02:41:22.717088] Epoch: [0] [2920/3229] lr: 0.000086 grad_norm: 0.5769 (0.7313) closs: 1.0795 (1.0890) time: 4.1145 data: 0.0002 max mem: 54684 -[02:42:03.989019] Epoch: [0] [2930/3229] lr: 0.000086 grad_norm: 0.6046 (0.7310) closs: 1.0829 (1.0889) time: 4.1269 data: 0.0002 max mem: 54684 -[02:42:44.765771] Epoch: [0] [2940/3229] lr: 0.000086 grad_norm: 0.6112 (0.7306) closs: 1.0782 (1.0890) time: 4.1024 data: 0.0002 max mem: 54684 -[02:43:26.121816] Epoch: [0] [2950/3229] lr: 0.000086 grad_norm: 0.5910 (0.7301) closs: 1.0738 (1.0889) time: 4.1066 data: 0.0002 max mem: 54684 -[02:44:06.980865] Epoch: [0] [2960/3229] lr: 0.000085 grad_norm: 0.5530 (0.7294) closs: 1.0462 (1.0887) time: 4.1107 data: 0.0002 max mem: 54684 -[02:44:47.929066] Epoch: [0] [2970/3229] lr: 0.000085 grad_norm: 0.5539 (0.7290) closs: 1.0193 (1.0885) time: 4.0903 data: 0.0002 max mem: 54684 -[02:45:29.038596] Epoch: [0] [2980/3229] lr: 0.000085 grad_norm: 0.5897 (0.7286) closs: 1.0596 (1.0885) time: 4.1028 data: 0.0002 max mem: 54684 -[02:46:10.337131] Epoch: [0] [2990/3229] lr: 0.000085 grad_norm: 0.6092 (0.7282) closs: 1.0325 (1.0882) time: 4.1203 data: 0.0002 max mem: 54684 -[02:46:51.160184] Epoch: [0] [3000/3229] lr: 0.000085 grad_norm: 0.6092 (0.7278) closs: 1.0017 (1.0880) time: 4.1060 data: 0.0002 max mem: 54684 -[02:47:32.092042] Epoch: [0] [3010/3229] lr: 0.000085 grad_norm: 0.5830 (0.7272) closs: 1.0432 (1.0880) time: 4.0877 data: 0.0002 max mem: 54684 -[02:48:13.191664] Epoch: [0] [3020/3229] lr: 0.000085 grad_norm: 0.5738 (0.7268) closs: 1.0643 (1.0879) time: 4.1015 data: 0.0002 max mem: 54684 -[02:48:53.607366] Epoch: [0] [3030/3229] lr: 0.000085 grad_norm: 0.5865 (0.7262) closs: 1.0520 (1.0876) time: 4.0757 data: 0.0002 max mem: 54684 -[02:49:34.770223] Epoch: [0] [3040/3229] lr: 0.000084 grad_norm: 0.5963 (0.7258) closs: 1.0520 (1.0875) time: 4.0789 data: 0.0002 max mem: 54684 -[02:50:15.718625] Epoch: [0] [3050/3229] lr: 0.000084 grad_norm: 0.5844 (0.7253) closs: 1.0391 (1.0874) time: 4.1055 data: 0.0002 max mem: 54684 -[02:50:56.816806] Epoch: [0] [3060/3229] lr: 0.000084 grad_norm: 0.5844 (0.7249) closs: 1.0391 (1.0873) time: 4.1023 data: 0.0002 max mem: 54684 -[02:51:38.020430] Epoch: [0] [3070/3229] lr: 0.000084 grad_norm: 0.5790 (0.7244) closs: 1.0698 (1.0873) time: 4.1150 data: 0.0002 max mem: 54684 -[02:52:18.623922] Epoch: [0] [3080/3229] lr: 0.000084 grad_norm: 0.5625 (0.7238) closs: 1.0568 (1.0871) time: 4.0903 data: 0.0002 max mem: 54684 -[02:52:59.070119] Epoch: [0] [3090/3229] lr: 0.000084 grad_norm: 0.5895 (0.7234) closs: 1.0600 (1.0870) time: 4.0524 data: 0.0002 max mem: 54684 -[02:53:39.601820] Epoch: [0] [3100/3229] lr: 0.000084 grad_norm: 0.5724 (0.7228) closs: 1.1135 (1.0870) time: 4.0488 data: 0.0002 max mem: 54684 -[02:54:20.332654] Epoch: [0] [3110/3229] lr: 0.000084 grad_norm: 0.5543 (0.7223) closs: 1.0445 (1.0869) time: 4.0631 data: 0.0002 max mem: 54684 -[02:55:01.284722] Epoch: [0] [3120/3229] lr: 0.000084 grad_norm: 0.5879 (0.7219) closs: 1.0445 (1.0867) time: 4.0841 data: 0.0002 max mem: 54684 -[02:55:41.419647] Epoch: [0] [3130/3229] lr: 0.000083 grad_norm: 0.5835 (0.7213) closs: 1.0551 (1.0865) time: 4.0543 data: 0.0002 max mem: 54684 -[02:56:21.204464] Epoch: [0] [3140/3229] lr: 0.000083 grad_norm: 0.5494 (0.7208) closs: 1.0305 (1.0863) time: 3.9959 data: 0.0002 max mem: 54684 -[02:57:02.592698] Epoch: [0] [3150/3229] lr: 0.000083 grad_norm: 0.5644 (0.7203) closs: 1.0305 (1.0862) time: 4.0586 data: 0.0002 max mem: 54684 -[02:57:42.889503] Epoch: [0] [3160/3229] lr: 0.000083 grad_norm: 0.5874 (0.7199) closs: 1.0279 (1.0860) time: 4.0842 data: 0.0002 max mem: 54684 -[02:58:23.673238] Epoch: [0] [3170/3229] lr: 0.000083 grad_norm: 0.5838 (0.7196) closs: 1.0648 (1.0859) time: 4.0540 data: 0.0002 max mem: 54684 -[02:59:04.635512] Epoch: [0] [3180/3229] lr: 0.000083 grad_norm: 0.6047 (0.7193) closs: 1.0742 (1.0858) time: 4.0872 data: 0.0002 max mem: 54684 -[02:59:45.556337] Epoch: [0] [3190/3229] lr: 0.000083 grad_norm: 0.5944 (0.7188) closs: 1.0606 (1.0857) time: 4.0941 data: 0.0002 max mem: 54684 -[03:00:26.818291] Epoch: [0] [3200/3229] lr: 0.000082 grad_norm: 0.5647 (0.7183) closs: 1.0673 (1.0856) time: 4.1091 data: 0.0003 max mem: 54684 -[03:01:07.594946] Epoch: [0] [3210/3229] lr: 0.000082 grad_norm: 0.5647 (0.7179) closs: 1.0773 (1.0855) time: 4.1019 data: 0.0003 max mem: 54684 -[03:01:48.926992] Epoch: [0] [3220/3229] lr: 0.000082 grad_norm: 0.5837 (0.7176) closs: 1.0710 (1.0855) time: 4.1054 data: 0.0001 max mem: 54684 -[03:02:22.158116] Epoch: [0] Total time: 3:39:54 -[03:02:22.159151] Averaged stats: lr: 0.000082 grad_norm: 0.5870 (0.7172) closs: 1.0331 (1.0847) -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -[03:02:22.501825] model saved -/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -[03:02:24.206463] optimizer saved -[03:02:24.207016] other rank-common saved -[03:02:24.212083] rank-specific saved -[03:02:24.225861] log_dir: ./output_dir -[03:02:37.059687] Epoch: [1] [0/3229] lr: 0.000082 grad_norm: 0.6132 (0.6132) closs: 0.9999 (0.9999) time: 12.8331 data: 8.7530 max mem: 54684 -[03:03:18.632210] Epoch: [1] [10/3229] lr: 0.000082 grad_norm: 0.5647 (0.5912) closs: 1.0765 (1.0837) time: 4.9459 data: 0.7958 max mem: 54684 -[03:03:59.051367] Epoch: [1] [20/3229] lr: 0.000082 grad_norm: 0.5647 (0.5861) closs: 1.0516 (1.0566) time: 4.0995 data: 0.0001 max mem: 54684 -[03:04:40.503119] Epoch: [1] [30/3229] lr: 0.000082 grad_norm: 0.5915 (0.5939) closs: 1.0516 (1.0691) time: 4.0935 data: 0.0002 max mem: 54684 -[03:05:22.097537] Epoch: [1] [40/3229] lr: 0.000082 grad_norm: 0.5850 (0.5910) closs: 1.0820 (1.0720) time: 4.1522 data: 0.0002 max mem: 54684 -[03:06:03.090896] Epoch: [1] [50/3229] lr: 0.000081 grad_norm: 0.5815 (0.5900) closs: 1.0677 (1.0644) time: 4.1293 data: 0.0002 max mem: 54684 -[03:06:43.196597] Epoch: [1] [60/3229] lr: 0.000081 grad_norm: 0.5899 (0.5894) closs: 1.0078 (1.0537) time: 4.0549 data: 0.0002 max mem: 54684 -[03:07:24.613357] Epoch: [1] [70/3229] lr: 0.000081 grad_norm: 0.6045 (0.5897) closs: 1.0219 (1.0506) time: 4.0761 data: 0.0002 max mem: 54684 -[03:08:04.515157] Epoch: [1] [80/3229] lr: 0.000081 grad_norm: 0.5799 (0.5831) closs: 1.0161 (1.0428) time: 4.0659 data: 0.0002 max mem: 54684 -[03:08:45.467975] Epoch: [1] [90/3229] lr: 0.000081 grad_norm: 0.5556 (0.5835) closs: 1.0144 (1.0444) time: 4.0427 data: 0.0002 max mem: 54684 -[03:09:26.243171] Epoch: [1] [100/3229] lr: 0.000081 grad_norm: 0.5795 (0.5835) closs: 1.0435 (1.0434) time: 4.0863 data: 0.0002 max mem: 54684 -[03:10:08.038819] Epoch: [1] [110/3229] lr: 0.000081 grad_norm: 0.5723 (0.5835) closs: 1.0376 (1.0418) time: 4.1285 data: 0.0002 max mem: 54684 -[03:10:48.939404] Epoch: [1] [120/3229] lr: 0.000081 grad_norm: 0.5722 (0.5819) closs: 1.0377 (1.0430) time: 4.1347 data: 0.0002 max mem: 54684 -[03:11:30.055410] Epoch: [1] [130/3229] lr: 0.000080 grad_norm: 0.5803 (0.5823) closs: 1.0633 (1.0459) time: 4.1008 data: 0.0002 max mem: 54684 -[03:12:10.774513] Epoch: [1] [140/3229] lr: 0.000080 grad_norm: 0.5809 (0.5827) closs: 1.0694 (1.0474) time: 4.0917 data: 0.0002 max mem: 54684 -[03:12:51.481838] Epoch: [1] [150/3229] lr: 0.000080 grad_norm: 0.5672 (0.5807) closs: 1.0533 (1.0461) time: 4.0713 data: 0.0002 max mem: 54684 -[03:13:32.160702] Epoch: [1] [160/3229] lr: 0.000080 grad_norm: 0.5565 (0.5796) closs: 1.0420 (1.0467) time: 4.0692 data: 0.0001 max mem: 54684 -[03:14:12.777131] Epoch: [1] [170/3229] lr: 0.000080 grad_norm: 0.5744 (0.5800) closs: 1.0506 (1.0438) time: 4.0647 data: 0.0001 max mem: 54684 -[03:14:53.223464] Epoch: [1] [180/3229] lr: 0.000080 grad_norm: 0.6011 (0.5837) closs: 1.0397 (1.0437) time: 4.0531 data: 0.0002 max mem: 54684 -[03:15:34.236634] Epoch: [1] [190/3229] lr: 0.000080 grad_norm: 0.5986 (0.5832) closs: 1.0226 (1.0418) time: 4.0729 data: 0.0002 max mem: 54684 -[03:16:15.503140] Epoch: [1] [200/3229] lr: 0.000079 grad_norm: 0.5903 (0.5841) closs: 1.0284 (1.0425) time: 4.1139 data: 0.0002 max mem: 54684 -[03:16:56.870515] Epoch: [1] [210/3229] lr: 0.000079 grad_norm: 0.5972 (0.5856) closs: 1.0789 (1.0449) time: 4.1316 data: 0.0002 max mem: 54684 -[03:17:37.377683] Epoch: [1] [220/3229] lr: 0.000079 grad_norm: 0.6051 (0.5856) closs: 1.0734 (1.0451) time: 4.0937 data: 0.0002 max mem: 54684 -[03:18:17.897913] Epoch: [1] [230/3229] lr: 0.000079 grad_norm: 0.5783 (0.5854) closs: 1.0634 (1.0431) time: 4.0513 data: 0.0002 max mem: 54684 -[03:18:58.845292] Epoch: [1] [240/3229] lr: 0.000079 grad_norm: 0.5760 (0.5856) closs: 1.0062 (1.0415) time: 4.0733 data: 0.0002 max mem: 54684 -[03:19:40.143287] Epoch: [1] [250/3229] lr: 0.000079 grad_norm: 0.5830 (0.5858) closs: 1.0155 (1.0413) time: 4.1122 data: 0.0002 max mem: 54684 -[03:20:21.253868] Epoch: [1] [260/3229] lr: 0.000079 grad_norm: 0.5724 (0.5857) closs: 1.0472 (1.0420) time: 4.1204 data: 0.0002 max mem: 54684 -[03:21:02.310900] Epoch: [1] [270/3229] lr: 0.000079 grad_norm: 0.5693 (0.5852) closs: 1.0750 (1.0425) time: 4.1083 data: 0.0002 max mem: 54684 -[03:21:42.925981] Epoch: [1] [280/3229] lr: 0.000078 grad_norm: 0.5739 (0.5850) closs: 1.0208 (1.0422) time: 4.0835 data: 0.0002 max mem: 54684 -[03:22:23.870461] Epoch: [1] [290/3229] lr: 0.000078 grad_norm: 0.5811 (0.5850) closs: 1.0208 (1.0421) time: 4.0779 data: 0.0002 max mem: 54684 -[03:23:03.649607] Epoch: [1] [300/3229] lr: 0.000078 grad_norm: 0.5807 (0.5838) closs: 1.0100 (1.0409) time: 4.0361 data: 0.0002 max mem: 54684 -[03:23:44.397242] Epoch: [1] [310/3229] lr: 0.000078 grad_norm: 0.5614 (0.5834) closs: 0.9924 (1.0399) time: 4.0263 data: 0.0002 max mem: 54684 -[03:24:25.692114] Epoch: [1] [320/3229] lr: 0.000078 grad_norm: 0.5705 (0.5838) closs: 1.0024 (1.0397) time: 4.1021 data: 0.0002 max mem: 54684 -[03:25:06.316947] Epoch: [1] [330/3229] lr: 0.000078 grad_norm: 0.5706 (0.5832) closs: 1.0348 (1.0391) time: 4.0959 data: 0.0002 max mem: 54684 -[03:25:47.408916] Epoch: [1] [340/3229] lr: 0.000078 grad_norm: 0.5612 (0.5823) closs: 1.0617 (1.0411) time: 4.0858 data: 0.0002 max mem: 54684 -[03:26:27.973524] Epoch: [1] [350/3229] lr: 0.000077 grad_norm: 0.5532 (0.5812) closs: 1.0858 (1.0398) time: 4.0828 data: 0.0002 max mem: 54684 -[03:27:08.836154] Epoch: [1] [360/3229] lr: 0.000077 grad_norm: 0.5532 (0.5813) closs: 1.0425 (1.0395) time: 4.0713 data: 0.0002 max mem: 54684 -[03:27:50.188867] Epoch: [1] [370/3229] lr: 0.000077 grad_norm: 0.5685 (0.5813) closs: 1.0543 (1.0401) time: 4.1107 data: 0.0002 max mem: 54684 -[03:28:31.322423] Epoch: [1] [380/3229] lr: 0.000077 grad_norm: 0.5652 (0.5814) closs: 1.0788 (1.0413) time: 4.1243 data: 0.0002 max mem: 54684 -[03:29:11.945890] Epoch: [1] [390/3229] lr: 0.000077 grad_norm: 0.5715 (0.5806) closs: 1.0404 (1.0400) time: 4.0878 data: 0.0002 max mem: 54684 -[03:29:52.453441] Epoch: [1] [400/3229] lr: 0.000077 grad_norm: 0.5882 (0.5809) closs: 0.9902 (1.0388) time: 4.0565 data: 0.0002 max mem: 54684 -[03:30:33.398665] Epoch: [1] [410/3229] lr: 0.000077 grad_norm: 0.5950 (0.5812) closs: 1.0371 (1.0393) time: 4.0726 data: 0.0002 max mem: 54684 -[03:31:13.860269] Epoch: [1] [420/3229] lr: 0.000076 grad_norm: 0.5764 (0.5808) closs: 1.0707 (1.0392) time: 4.0703 data: 0.0002 max mem: 54684 -[03:31:55.414571] Epoch: [1] [430/3229] lr: 0.000076 grad_norm: 0.5494 (0.5805) closs: 1.0707 (1.0403) time: 4.1007 data: 0.0002 max mem: 54684 -[03:32:35.855299] Epoch: [1] [440/3229] lr: 0.000076 grad_norm: 0.5548 (0.5802) closs: 1.0658 (1.0402) time: 4.0997 data: 0.0002 max mem: 54684 -[03:33:16.452210] Epoch: [1] [450/3229] lr: 0.000076 grad_norm: 0.5614 (0.5798) closs: 1.0492 (1.0401) time: 4.0518 data: 0.0004 max mem: 54684 -[03:33:57.234098] Epoch: [1] [460/3229] lr: 0.000076 grad_norm: 0.5710 (0.5794) closs: 1.0546 (1.0406) time: 4.0689 data: 0.0004 max mem: 54684 -[03:34:38.211753] Epoch: [1] [470/3229] lr: 0.000076 grad_norm: 0.5683 (0.5792) closs: 1.0546 (1.0399) time: 4.0879 data: 0.0002 max mem: 54684 -[03:35:19.148555] Epoch: [1] [480/3229] lr: 0.000076 grad_norm: 0.5668 (0.5788) closs: 1.0823 (1.0410) time: 4.0957 data: 0.0002 max mem: 54684 -[03:35:59.926822] Epoch: [1] [490/3229] lr: 0.000075 grad_norm: 0.5872 (0.5795) closs: 1.0733 (1.0407) time: 4.0857 data: 0.0002 max mem: 54684 -[03:36:40.816831] Epoch: [1] [500/3229] lr: 0.000075 grad_norm: 0.5882 (0.5796) closs: 1.0178 (1.0404) time: 4.0833 data: 0.0002 max mem: 54684 -[03:37:21.882692] Epoch: [1] [510/3229] lr: 0.000075 grad_norm: 0.5975 (0.5799) closs: 1.0400 (1.0405) time: 4.0977 data: 0.0002 max mem: 54684 -[03:38:02.986718] Epoch: [1] [520/3229] lr: 0.000075 grad_norm: 0.5864 (0.5798) closs: 1.0698 (1.0415) time: 4.1084 data: 0.0002 max mem: 54684 -[03:38:44.292702] Epoch: [1] [530/3229] lr: 0.000075 grad_norm: 0.5751 (0.5799) closs: 1.0699 (1.0420) time: 4.1204 data: 0.0002 max mem: 54684 -[03:39:25.188063] Epoch: [1] [540/3229] lr: 0.000075 grad_norm: 0.5619 (0.5795) closs: 1.0750 (1.0422) time: 4.1100 data: 0.0002 max mem: 54684 -[03:40:06.326097] Epoch: [1] [550/3229] lr: 0.000075 grad_norm: 0.5697 (0.5793) closs: 1.0631 (1.0421) time: 4.1016 data: 0.0002 max mem: 54684 -[03:40:46.941861] Epoch: [1] [560/3229] lr: 0.000074 grad_norm: 0.5769 (0.5789) closs: 1.0369 (1.0421) time: 4.0876 data: 0.0002 max mem: 54684 -[03:41:27.715194] Epoch: [1] [570/3229] lr: 0.000074 grad_norm: 0.5715 (0.5790) closs: 1.0353 (1.0420) time: 4.0694 data: 0.0002 max mem: 54684 -[03:42:08.311161] Epoch: [1] [580/3229] lr: 0.000074 grad_norm: 0.5662 (0.5787) closs: 1.0276 (1.0416) time: 4.0684 data: 0.0002 max mem: 54684 -[03:42:49.459226] Epoch: [1] [590/3229] lr: 0.000074 grad_norm: 0.5645 (0.5789) closs: 1.0383 (1.0416) time: 4.0871 data: 0.0002 max mem: 54684 -[03:43:29.764469] Epoch: [1] [600/3229] lr: 0.000074 grad_norm: 0.5632 (0.5782) closs: 1.0646 (1.0416) time: 4.0726 data: 0.0002 max mem: 54684 -[03:44:10.863831] Epoch: [1] [610/3229] lr: 0.000074 grad_norm: 0.5650 (0.5784) closs: 1.0527 (1.0416) time: 4.0702 data: 0.0002 max mem: 54684 -[03:44:52.100000] Epoch: [1] [620/3229] lr: 0.000073 grad_norm: 0.5675 (0.5784) closs: 1.0475 (1.0417) time: 4.1167 data: 0.0002 max mem: 54684 -[03:45:32.780967] Epoch: [1] [630/3229] lr: 0.000073 grad_norm: 0.5615 (0.5781) closs: 1.0306 (1.0406) time: 4.0958 data: 0.0002 max mem: 54684 -[03:46:13.717184] Epoch: [1] [640/3229] lr: 0.000073 grad_norm: 0.5615 (0.5779) closs: 1.0113 (1.0406) time: 4.0808 data: 0.0002 max mem: 54684 -[03:46:54.518077] Epoch: [1] [650/3229] lr: 0.000073 grad_norm: 0.5599 (0.5776) closs: 1.0253 (1.0406) time: 4.0868 data: 0.0002 max mem: 54684 -[03:47:35.653148] Epoch: [1] [660/3229] lr: 0.000073 grad_norm: 0.5541 (0.5776) closs: 1.0473 (1.0408) time: 4.0967 data: 0.0002 max mem: 54684 -[03:48:16.527626] Epoch: [1] [670/3229] lr: 0.000073 grad_norm: 0.5589 (0.5774) closs: 1.0642 (1.0410) time: 4.1004 data: 0.0002 max mem: 54684 -[03:48:57.838597] Epoch: [1] [680/3229] lr: 0.000073 grad_norm: 0.5645 (0.5773) closs: 1.0592 (1.0413) time: 4.1092 data: 0.0002 max mem: 54684 -[03:49:38.297230] Epoch: [1] [690/3229] lr: 0.000072 grad_norm: 0.5645 (0.5770) closs: 1.0436 (1.0416) time: 4.0884 data: 0.0002 max mem: 54684 -[03:50:19.056069] Epoch: [1] [700/3229] lr: 0.000072 grad_norm: 0.5649 (0.5768) closs: 1.0510 (1.0413) time: 4.0608 data: 0.0002 max mem: 54684 -[03:50:59.983545] Epoch: [1] [710/3229] lr: 0.000072 grad_norm: 0.5638 (0.5763) closs: 1.0680 (1.0411) time: 4.0843 data: 0.0002 max mem: 54684 -[03:51:41.244969] Epoch: [1] [720/3229] lr: 0.000072 grad_norm: 0.5580 (0.5763) closs: 1.0682 (1.0419) time: 4.1094 data: 0.0002 max mem: 54684 -[03:52:22.017980] Epoch: [1] [730/3229] lr: 0.000072 grad_norm: 0.5673 (0.5762) closs: 1.0718 (1.0422) time: 4.1017 data: 0.0002 max mem: 54684 -[03:53:02.115581] Epoch: [1] [740/3229] lr: 0.000072 grad_norm: 0.5340 (0.5757) closs: 1.0090 (1.0416) time: 4.0435 data: 0.0002 max mem: 54684 -[03:53:43.063539] Epoch: [1] [750/3229] lr: 0.000072 grad_norm: 0.5403 (0.5754) closs: 1.0249 (1.0419) time: 4.0522 data: 0.0002 max mem: 54684 -[03:54:24.299312] Epoch: [1] [760/3229] lr: 0.000071 grad_norm: 0.5616 (0.5762) closs: 1.0686 (1.0422) time: 4.1091 data: 0.0002 max mem: 54684 -[03:55:05.072533] Epoch: [1] [770/3229] lr: 0.000071 grad_norm: 0.6088 (0.5766) closs: 1.0797 (1.0428) time: 4.1004 data: 0.0002 max mem: 54684 -[03:55:46.171867] Epoch: [1] [780/3229] lr: 0.000071 grad_norm: 0.5813 (0.5764) closs: 1.0868 (1.0434) time: 4.0936 data: 0.0002 max mem: 54684 -[03:56:27.138912] Epoch: [1] [790/3229] lr: 0.000071 grad_norm: 0.5429 (0.5760) closs: 1.0689 (1.0430) time: 4.1033 data: 0.0002 max mem: 54684 -[03:57:08.406933] Epoch: [1] [800/3229] lr: 0.000071 grad_norm: 0.5489 (0.5760) closs: 1.0240 (1.0432) time: 4.1117 data: 0.0002 max mem: 54684 -[03:57:48.557636] Epoch: [1] [810/3229] lr: 0.000071 grad_norm: 0.5489 (0.5756) closs: 1.0244 (1.0427) time: 4.0709 data: 0.0002 max mem: 54684 -[03:58:29.984938] Epoch: [1] [820/3229] lr: 0.000070 grad_norm: 0.5609 (0.5756) closs: 1.0117 (1.0426) time: 4.0788 data: 0.0002 max mem: 54684 -[03:59:10.607826] Epoch: [1] [830/3229] lr: 0.000070 grad_norm: 0.5660 (0.5753) closs: 1.0388 (1.0428) time: 4.1024 data: 0.0002 max mem: 54684 -[03:59:51.579481] Epoch: [1] [840/3229] lr: 0.000070 grad_norm: 0.5696 (0.5752) closs: 1.0748 (1.0434) time: 4.0797 data: 0.0002 max mem: 54684 -[04:00:32.354623] Epoch: [1] [850/3229] lr: 0.000070 grad_norm: 0.5696 (0.5751) closs: 1.0576 (1.0436) time: 4.0873 data: 0.0002 max mem: 54684 -[04:01:13.476385] Epoch: [1] [860/3229] lr: 0.000070 grad_norm: 0.5586 (0.5748) closs: 1.0391 (1.0437) time: 4.0948 data: 0.0002 max mem: 54684 -[04:01:54.062613] Epoch: [1] [870/3229] lr: 0.000070 grad_norm: 0.5624 (0.5746) closs: 1.0365 (1.0434) time: 4.0853 data: 0.0002 max mem: 54684 -[04:02:34.708901] Epoch: [1] [880/3229] lr: 0.000070 grad_norm: 0.5675 (0.5744) closs: 1.0142 (1.0432) time: 4.0616 data: 0.0002 max mem: 54684 -[04:03:15.525860] Epoch: [1] [890/3229] lr: 0.000069 grad_norm: 0.5504 (0.5742) closs: 1.0676 (1.0438) time: 4.0731 data: 0.0002 max mem: 54684 -[04:03:56.736301] Epoch: [1] [900/3229] lr: 0.000069 grad_norm: 0.5477 (0.5740) closs: 1.0676 (1.0439) time: 4.1013 data: 0.0002 max mem: 54684 -[04:04:36.920488] Epoch: [1] [910/3229] lr: 0.000069 grad_norm: 0.5477 (0.5738) closs: 1.0670 (1.0441) time: 4.0697 data: 0.0002 max mem: 54684 -[04:05:18.168891] Epoch: [1] [920/3229] lr: 0.000069 grad_norm: 0.5736 (0.5739) closs: 1.0477 (1.0440) time: 4.0716 data: 0.0002 max mem: 54684 -[04:05:59.267874] Epoch: [1] [930/3229] lr: 0.000069 grad_norm: 0.5780 (0.5739) closs: 1.0477 (1.0441) time: 4.1173 data: 0.0002 max mem: 54684 -[04:06:40.055537] Epoch: [1] [940/3229] lr: 0.000069 grad_norm: 0.5780 (0.5739) closs: 1.0047 (1.0436) time: 4.0943 data: 0.0002 max mem: 54684 -[04:07:21.013845] Epoch: [1] [950/3229] lr: 0.000068 grad_norm: 0.5719 (0.5737) closs: 0.9697 (1.0432) time: 4.0872 data: 0.0002 max mem: 54684 -[04:08:01.949372] Epoch: [1] [960/3229] lr: 0.000068 grad_norm: 0.5583 (0.5738) closs: 0.9883 (1.0433) time: 4.0946 data: 0.0002 max mem: 54684 -[04:08:43.032157] Epoch: [1] [970/3229] lr: 0.000068 grad_norm: 0.5604 (0.5737) closs: 1.0494 (1.0434) time: 4.1009 data: 0.0002 max mem: 54684 -[04:09:24.176825] Epoch: [1] [980/3229] lr: 0.000068 grad_norm: 0.5677 (0.5737) closs: 1.0494 (1.0436) time: 4.1113 data: 0.0002 max mem: 54684 -[04:10:05.360266] Epoch: [1] [990/3229] lr: 0.000068 grad_norm: 0.5701 (0.5737) closs: 1.0626 (1.0438) time: 4.1163 data: 0.0002 max mem: 54684 -[04:10:45.379904] Epoch: [1] [1000/3229] lr: 0.000068 grad_norm: 0.5526 (0.5731) closs: 1.0626 (1.0436) time: 4.0601 data: 0.0002 max mem: 54684 -[04:11:26.290114] Epoch: [1] [1010/3229] lr: 0.000068 grad_norm: 0.5378 (0.5730) closs: 1.0114 (1.0433) time: 4.0464 data: 0.0002 max mem: 54684 -[04:12:06.766731] Epoch: [1] [1020/3229] lr: 0.000067 grad_norm: 0.5378 (0.5726) closs: 1.0514 (1.0431) time: 4.0693 data: 0.0002 max mem: 54684 -[04:12:47.630579] Epoch: [1] [1030/3229] lr: 0.000067 grad_norm: 0.5303 (0.5722) closs: 1.0395 (1.0431) time: 4.0670 data: 0.0002 max mem: 54684 -[04:13:28.259482] Epoch: [1] [1040/3229] lr: 0.000067 grad_norm: 0.5331 (0.5720) closs: 1.0344 (1.0428) time: 4.0746 data: 0.0002 max mem: 54684 -[04:14:09.205642] Epoch: [1] [1050/3229] lr: 0.000067 grad_norm: 0.5576 (0.5721) closs: 1.0379 (1.0427) time: 4.0787 data: 0.0002 max mem: 54684 -[04:14:50.247334] Epoch: [1] [1060/3229] lr: 0.000067 grad_norm: 0.5656 (0.5720) closs: 1.0449 (1.0427) time: 4.0993 data: 0.0002 max mem: 54684 -[04:15:31.279333] Epoch: [1] [1070/3229] lr: 0.000067 grad_norm: 0.5656 (0.5720) closs: 1.0465 (1.0427) time: 4.1036 data: 0.0002 max mem: 54684 -[04:16:11.385132] Epoch: [1] [1080/3229] lr: 0.000066 grad_norm: 0.5653 (0.5719) closs: 1.0382 (1.0424) time: 4.0568 data: 0.0002 max mem: 54684 -[04:16:51.949602] Epoch: [1] [1090/3229] lr: 0.000066 grad_norm: 0.5558 (0.5717) closs: 1.0446 (1.0424) time: 4.0334 data: 0.0002 max mem: 54684 -[04:17:33.042818] Epoch: [1] [1100/3229] lr: 0.000066 grad_norm: 0.5585 (0.5716) closs: 1.0113 (1.0422) time: 4.0828 data: 0.0002 max mem: 54684 -[04:18:14.099932] Epoch: [1] [1110/3229] lr: 0.000066 grad_norm: 0.5712 (0.5716) closs: 1.0113 (1.0422) time: 4.1074 data: 0.0002 max mem: 54684 -[04:18:54.199882] Epoch: [1] [1120/3229] lr: 0.000066 grad_norm: 0.5757 (0.5718) closs: 1.0226 (1.0422) time: 4.0578 data: 0.0002 max mem: 54684 -[04:19:35.557680] Epoch: [1] [1130/3229] lr: 0.000066 grad_norm: 0.5680 (0.5719) closs: 1.0245 (1.0422) time: 4.0728 data: 0.0002 max mem: 54684 -[04:20:16.893984] Epoch: [1] [1140/3229] lr: 0.000065 grad_norm: 0.5726 (0.5720) closs: 1.0635 (1.0426) time: 4.1346 data: 0.0002 max mem: 54684 -[04:20:57.545187] Epoch: [1] [1150/3229] lr: 0.000065 grad_norm: 0.5726 (0.5718) closs: 1.0640 (1.0426) time: 4.0993 data: 0.0002 max mem: 54684 -[04:21:38.378059] Epoch: [1] [1160/3229] lr: 0.000065 grad_norm: 0.5435 (0.5716) closs: 1.0238 (1.0422) time: 4.0741 data: 0.0002 max mem: 54684 -[04:22:19.327764] Epoch: [1] [1170/3229] lr: 0.000065 grad_norm: 0.5553 (0.5715) closs: 1.0354 (1.0423) time: 4.0891 data: 0.0002 max mem: 54684 -[04:23:00.226177] Epoch: [1] [1180/3229] lr: 0.000065 grad_norm: 0.5553 (0.5713) closs: 1.0479 (1.0421) time: 4.0923 data: 0.0002 max mem: 54684 -[04:23:41.482756] Epoch: [1] [1190/3229] lr: 0.000065 grad_norm: 0.5714 (0.5714) closs: 1.0327 (1.0423) time: 4.1077 data: 0.0002 max mem: 54684 -[04:24:22.236750] Epoch: [1] [1200/3229] lr: 0.000065 grad_norm: 0.5960 (0.5716) closs: 1.0605 (1.0422) time: 4.1005 data: 0.0002 max mem: 54684 -[04:25:03.615465] Epoch: [1] [1210/3229] lr: 0.000064 grad_norm: 0.6099 (0.5717) closs: 1.0683 (1.0424) time: 4.1066 data: 0.0002 max mem: 54684 -[04:25:44.975618] Epoch: [1] [1220/3229] lr: 0.000064 grad_norm: 0.5627 (0.5715) closs: 1.0795 (1.0429) time: 4.1369 data: 0.0002 max mem: 54684 -[04:26:26.237903] Epoch: [1] [1230/3229] lr: 0.000064 grad_norm: 0.5499 (0.5714) closs: 1.0667 (1.0427) time: 4.1311 data: 0.0002 max mem: 54684 -[04:27:07.332364] Epoch: [1] [1240/3229] lr: 0.000064 grad_norm: 0.5514 (0.5713) closs: 1.0283 (1.0430) time: 4.1178 data: 0.0002 max mem: 54684 -[04:27:48.026409] Epoch: [1] [1250/3229] lr: 0.000064 grad_norm: 0.5443 (0.5710) closs: 1.0283 (1.0428) time: 4.0894 data: 0.0002 max mem: 54684 -[04:28:28.843640] Epoch: [1] [1260/3229] lr: 0.000064 grad_norm: 0.5485 (0.5708) closs: 1.0062 (1.0426) time: 4.0755 data: 0.0002 max mem: 54684 -[04:29:09.817124] Epoch: [1] [1270/3229] lr: 0.000063 grad_norm: 0.5490 (0.5707) closs: 0.9909 (1.0422) time: 4.0895 data: 0.0002 max mem: 54684 -[04:29:50.601796] Epoch: [1] [1280/3229] lr: 0.000063 grad_norm: 0.5495 (0.5705) closs: 1.0278 (1.0423) time: 4.0878 data: 0.0002 max mem: 54684 -[04:30:31.934810] Epoch: [1] [1290/3229] lr: 0.000063 grad_norm: 0.5482 (0.5705) closs: 1.0337 (1.0423) time: 4.1058 data: 0.0002 max mem: 54684 -[04:31:12.615588] Epoch: [1] [1300/3229] lr: 0.000063 grad_norm: 0.5511 (0.5707) closs: 1.0266 (1.0421) time: 4.1006 data: 0.0002 max mem: 54684 -[04:31:53.248997] Epoch: [1] [1310/3229] lr: 0.000063 grad_norm: 0.5585 (0.5706) closs: 1.0733 (1.0423) time: 4.0656 data: 0.0002 max mem: 54684 -[04:32:33.350563] Epoch: [1] [1320/3229] lr: 0.000063 grad_norm: 0.5479 (0.5704) closs: 1.0733 (1.0422) time: 4.0367 data: 0.0002 max mem: 54684 -[04:33:13.792288] Epoch: [1] [1330/3229] lr: 0.000062 grad_norm: 0.5475 (0.5701) closs: 1.0293 (1.0419) time: 4.0271 data: 0.0002 max mem: 54684 -[04:33:53.862733] Epoch: [1] [1340/3229] lr: 0.000062 grad_norm: 0.5442 (0.5697) closs: 1.0232 (1.0417) time: 4.0255 data: 0.0002 max mem: 54684 -[04:34:35.166000] Epoch: [1] [1350/3229] lr: 0.000062 grad_norm: 0.5631 (0.5697) closs: 1.0530 (1.0418) time: 4.0686 data: 0.0002 max mem: 54684 -[04:35:15.962952] Epoch: [1] [1360/3229] lr: 0.000062 grad_norm: 0.5428 (0.5694) closs: 1.0668 (1.0418) time: 4.1049 data: 0.0002 max mem: 54684 -[04:35:57.016930] Epoch: [1] [1370/3229] lr: 0.000062 grad_norm: 0.5385 (0.5692) closs: 1.0461 (1.0418) time: 4.0925 data: 0.0002 max mem: 54684 -[04:36:37.994351] Epoch: [1] [1380/3229] lr: 0.000062 grad_norm: 0.5438 (0.5692) closs: 1.0512 (1.0421) time: 4.1015 data: 0.0002 max mem: 54684 -[04:37:19.248670] Epoch: [1] [1390/3229] lr: 0.000061 grad_norm: 0.5749 (0.5692) closs: 1.0790 (1.0425) time: 4.1115 data: 0.0002 max mem: 54684 -[04:38:00.339847] Epoch: [1] [1400/3229] lr: 0.000061 grad_norm: 0.5749 (0.5692) closs: 1.0582 (1.0426) time: 4.1172 data: 0.0002 max mem: 54684 -[04:38:41.671763] Epoch: [1] [1410/3229] lr: 0.000061 grad_norm: 0.5611 (0.5692) closs: 1.0523 (1.0426) time: 4.1211 data: 0.0002 max mem: 54684 -[04:39:22.741937] Epoch: [1] [1420/3229] lr: 0.000061 grad_norm: 0.5466 (0.5694) closs: 1.0461 (1.0424) time: 4.1200 data: 0.0002 max mem: 54684 -[04:40:03.672318] Epoch: [1] [1430/3229] lr: 0.000061 grad_norm: 0.5480 (0.5693) closs: 1.0340 (1.0421) time: 4.1000 data: 0.0002 max mem: 54684 -[04:40:44.764299] Epoch: [1] [1440/3229] lr: 0.000061 grad_norm: 0.5512 (0.5694) closs: 1.0340 (1.0421) time: 4.1011 data: 0.0002 max mem: 54684 -[04:41:25.545847] Epoch: [1] [1450/3229] lr: 0.000061 grad_norm: 0.5475 (0.5692) closs: 1.0492 (1.0420) time: 4.0936 data: 0.0002 max mem: 54684 -[04:42:06.794631] Epoch: [1] [1460/3229] lr: 0.000060 grad_norm: 0.5468 (0.5692) closs: 1.0348 (1.0419) time: 4.1015 data: 0.0002 max mem: 54684 -[04:42:47.814991] Epoch: [1] [1470/3229] lr: 0.000060 grad_norm: 0.5492 (0.5690) closs: 1.0385 (1.0419) time: 4.1134 data: 0.0002 max mem: 54684 -[04:43:28.390855] Epoch: [1] [1480/3229] lr: 0.000060 grad_norm: 0.5438 (0.5687) closs: 1.0385 (1.0419) time: 4.0797 data: 0.0002 max mem: 54684 -[04:44:09.427647] Epoch: [1] [1490/3229] lr: 0.000060 grad_norm: 0.5306 (0.5687) closs: 1.0286 (1.0419) time: 4.0806 data: 0.0002 max mem: 54684 -[04:44:50.977883] Epoch: [1] [1500/3229] lr: 0.000060 grad_norm: 0.5577 (0.5687) closs: 1.0527 (1.0421) time: 4.1293 data: 0.0002 max mem: 54684 -[04:45:31.600069] Epoch: [1] [1510/3229] lr: 0.000060 grad_norm: 0.5509 (0.5684) closs: 1.0796 (1.0422) time: 4.1085 data: 0.0002 max mem: 54684 -[04:46:12.441511] Epoch: [1] [1520/3229] lr: 0.000059 grad_norm: 0.5530 (0.5684) closs: 1.0560 (1.0422) time: 4.0731 data: 0.0002 max mem: 54684 -[04:46:54.036109] Epoch: [1] [1530/3229] lr: 0.000059 grad_norm: 0.5716 (0.5684) closs: 1.0673 (1.0424) time: 4.1217 data: 0.0002 max mem: 54684 -[04:47:34.994855] Epoch: [1] [1540/3229] lr: 0.000059 grad_norm: 0.5716 (0.5684) closs: 1.0673 (1.0423) time: 4.1276 data: 0.0002 max mem: 54684 -[04:48:16.034968] Epoch: [1] [1550/3229] lr: 0.000059 grad_norm: 0.5361 (0.5683) closs: 1.0618 (1.0423) time: 4.0999 data: 0.0002 max mem: 54684 -[04:48:57.320961] Epoch: [1] [1560/3229] lr: 0.000059 grad_norm: 0.5683 (0.5682) closs: 1.0718 (1.0426) time: 4.1162 data: 0.0002 max mem: 54684 -[04:49:39.100283] Epoch: [1] [1570/3229] lr: 0.000059 grad_norm: 0.5620 (0.5682) closs: 1.0528 (1.0426) time: 4.1532 data: 0.0002 max mem: 54684 -[04:50:20.220872] Epoch: [1] [1580/3229] lr: 0.000058 grad_norm: 0.5433 (0.5680) closs: 1.0445 (1.0425) time: 4.1449 data: 0.0002 max mem: 54684 -[04:51:01.075841] Epoch: [1] [1590/3229] lr: 0.000058 grad_norm: 0.5502 (0.5680) closs: 1.0167 (1.0423) time: 4.0987 data: 0.0002 max mem: 54684 -[04:51:42.368473] Epoch: [1] [1600/3229] lr: 0.000058 grad_norm: 0.5734 (0.5682) closs: 1.0031 (1.0421) time: 4.1073 data: 0.0002 max mem: 54684 -[04:52:23.762144] Epoch: [1] [1610/3229] lr: 0.000058 grad_norm: 0.5806 (0.5683) closs: 1.0540 (1.0423) time: 4.1342 data: 0.0002 max mem: 54684 -[04:53:05.341366] Epoch: [1] [1620/3229] lr: 0.000058 grad_norm: 0.5798 (0.5684) closs: 1.0742 (1.0424) time: 4.1486 data: 0.0002 max mem: 54684 -[04:53:46.502603] Epoch: [1] [1630/3229] lr: 0.000058 grad_norm: 0.5766 (0.5685) closs: 1.0438 (1.0426) time: 4.1370 data: 0.0002 max mem: 54684 -[04:54:27.790824] Epoch: [1] [1640/3229] lr: 0.000057 grad_norm: 0.5684 (0.5684) closs: 1.0736 (1.0429) time: 4.1224 data: 0.0002 max mem: 54684 -[04:55:08.620731] Epoch: [1] [1650/3229] lr: 0.000057 grad_norm: 0.5621 (0.5683) closs: 1.0539 (1.0427) time: 4.1058 data: 0.0002 max mem: 54684 -[04:55:50.049173] Epoch: [1] [1660/3229] lr: 0.000057 grad_norm: 0.5620 (0.5683) closs: 1.0615 (1.0431) time: 4.1129 data: 0.0002 max mem: 54684 -[04:56:31.228990] Epoch: [1] [1670/3229] lr: 0.000057 grad_norm: 0.5627 (0.5682) closs: 1.0626 (1.0430) time: 4.1303 data: 0.0002 max mem: 54684 -[04:57:11.929105] Epoch: [1] [1680/3229] lr: 0.000057 grad_norm: 0.5532 (0.5681) closs: 1.0347 (1.0429) time: 4.0939 data: 0.0002 max mem: 54684 -[04:57:53.331536] Epoch: [1] [1690/3229] lr: 0.000057 grad_norm: 0.5532 (0.5681) closs: 1.0592 (1.0430) time: 4.1051 data: 0.0002 max mem: 54684 -[04:58:34.426701] Epoch: [1] [1700/3229] lr: 0.000056 grad_norm: 0.5586 (0.5680) closs: 1.0485 (1.0429) time: 4.1248 data: 0.0002 max mem: 54684 -[04:59:15.593463] Epoch: [1] [1710/3229] lr: 0.000056 grad_norm: 0.5681 (0.5680) closs: 1.0279 (1.0429) time: 4.1130 data: 0.0002 max mem: 54684 -[04:59:56.978578] Epoch: [1] [1720/3229] lr: 0.000056 grad_norm: 0.5599 (0.5681) closs: 1.0635 (1.0431) time: 4.1275 data: 0.0002 max mem: 54684 -[05:00:37.607286] Epoch: [1] [1730/3229] lr: 0.000056 grad_norm: 0.5583 (0.5679) closs: 1.0566 (1.0430) time: 4.1006 data: 0.0002 max mem: 54684 -[05:01:18.379952] Epoch: [1] [1740/3229] lr: 0.000056 grad_norm: 0.5404 (0.5678) closs: 1.0427 (1.0430) time: 4.0700 data: 0.0002 max mem: 54684 -[05:01:59.543844] Epoch: [1] [1750/3229] lr: 0.000056 grad_norm: 0.5516 (0.5678) closs: 1.0297 (1.0430) time: 4.0968 data: 0.0002 max mem: 54684 -[05:02:40.638109] Epoch: [1] [1760/3229] lr: 0.000055 grad_norm: 0.5584 (0.5677) closs: 1.0583 (1.0430) time: 4.1128 data: 0.0002 max mem: 54684 -[05:03:21.341435] Epoch: [1] [1770/3229] lr: 0.000055 grad_norm: 0.5533 (0.5677) closs: 1.0667 (1.0430) time: 4.0898 data: 0.0002 max mem: 54684 -[05:04:02.807448] Epoch: [1] [1780/3229] lr: 0.000055 grad_norm: 0.5304 (0.5675) closs: 1.0489 (1.0430) time: 4.1084 data: 0.0002 max mem: 54684 -[05:04:43.977861] Epoch: [1] [1790/3229] lr: 0.000055 grad_norm: 0.5479 (0.5675) closs: 1.0389 (1.0430) time: 4.1318 data: 0.0002 max mem: 54684 -[05:05:25.455837] Epoch: [1] [1800/3229] lr: 0.000055 grad_norm: 0.5531 (0.5675) closs: 1.0299 (1.0429) time: 4.1324 data: 0.0002 max mem: 54684 -[05:06:05.153859] Epoch: [1] [1810/3229] lr: 0.000055 grad_norm: 0.5444 (0.5672) closs: 0.9641 (1.0427) time: 4.0587 data: 0.0002 max mem: 54684 -[05:06:46.267998] Epoch: [1] [1820/3229] lr: 0.000054 grad_norm: 0.5334 (0.5672) closs: 1.0317 (1.0428) time: 4.0405 data: 0.0002 max mem: 54684 -[05:07:27.102663] Epoch: [1] [1830/3229] lr: 0.000054 grad_norm: 0.5543 (0.5671) closs: 1.0532 (1.0428) time: 4.0974 data: 0.0002 max mem: 54684 -[05:08:08.570305] Epoch: [1] [1840/3229] lr: 0.000054 grad_norm: 0.5617 (0.5671) closs: 1.0540 (1.0428) time: 4.1151 data: 0.0002 max mem: 54684 -[05:08:49.571553] Epoch: [1] [1850/3229] lr: 0.000054 grad_norm: 0.5632 (0.5671) closs: 1.0409 (1.0427) time: 4.1234 data: 0.0002 max mem: 54684 -[05:09:30.680753] Epoch: [1] [1860/3229] lr: 0.000054 grad_norm: 0.5681 (0.5671) closs: 1.0322 (1.0426) time: 4.1055 data: 0.0002 max mem: 54684 -[05:10:11.522470] Epoch: [1] [1870/3229] lr: 0.000054 grad_norm: 0.5681 (0.5671) closs: 1.0278 (1.0426) time: 4.0975 data: 0.0002 max mem: 54684 -[05:10:52.916261] Epoch: [1] [1880/3229] lr: 0.000053 grad_norm: 0.5520 (0.5670) closs: 1.0321 (1.0427) time: 4.1117 data: 0.0002 max mem: 54684 -[05:11:33.551352] Epoch: [1] [1890/3229] lr: 0.000053 grad_norm: 0.5682 (0.5670) closs: 1.0321 (1.0426) time: 4.1014 data: 0.0002 max mem: 54684 -[05:12:14.981387] Epoch: [1] [1900/3229] lr: 0.000053 grad_norm: 0.5668 (0.5670) closs: 1.0400 (1.0427) time: 4.1032 data: 0.0002 max mem: 54684 -[05:12:56.144297] Epoch: [1] [1910/3229] lr: 0.000053 grad_norm: 0.5629 (0.5670) closs: 1.0446 (1.0426) time: 4.1296 data: 0.0002 max mem: 54684 -[05:13:36.914042] Epoch: [1] [1920/3229] lr: 0.000053 grad_norm: 0.5629 (0.5671) closs: 1.0190 (1.0425) time: 4.0966 data: 0.0002 max mem: 54684 -[05:14:17.425182] Epoch: [1] [1930/3229] lr: 0.000053 grad_norm: 0.5425 (0.5668) closs: 1.0006 (1.0422) time: 4.0640 data: 0.0002 max mem: 54684 -[05:14:58.117995] Epoch: [1] [1940/3229] lr: 0.000052 grad_norm: 0.5504 (0.5667) closs: 1.0006 (1.0422) time: 4.0601 data: 0.0002 max mem: 54684 -[05:15:39.283905] Epoch: [1] [1950/3229] lr: 0.000052 grad_norm: 0.5645 (0.5668) closs: 1.0802 (1.0424) time: 4.0929 data: 0.0002 max mem: 54684 -[05:16:19.758558] Epoch: [1] [1960/3229] lr: 0.000052 grad_norm: 0.5571 (0.5666) closs: 1.0615 (1.0424) time: 4.0820 data: 0.0002 max mem: 54684 -[05:17:01.117349] Epoch: [1] [1970/3229] lr: 0.000052 grad_norm: 0.5513 (0.5667) closs: 1.0395 (1.0423) time: 4.0916 data: 0.0002 max mem: 54684 -[05:17:41.172799] Epoch: [1] [1980/3229] lr: 0.000052 grad_norm: 0.5372 (0.5664) closs: 1.0187 (1.0421) time: 4.0706 data: 0.0002 max mem: 54684 -[05:18:22.369121] Epoch: [1] [1990/3229] lr: 0.000052 grad_norm: 0.5372 (0.5664) closs: 1.0199 (1.0420) time: 4.0625 data: 0.0002 max mem: 54684 -[05:19:03.491186] Epoch: [1] [2000/3229] lr: 0.000051 grad_norm: 0.5705 (0.5664) closs: 1.0474 (1.0420) time: 4.1159 data: 0.0002 max mem: 54684 -[05:19:44.836678] Epoch: [1] [2010/3229] lr: 0.000051 grad_norm: 0.5739 (0.5665) closs: 1.0563 (1.0420) time: 4.1233 data: 0.0002 max mem: 54684 -[05:20:26.174525] Epoch: [1] [2020/3229] lr: 0.000051 grad_norm: 0.5739 (0.5664) closs: 1.0676 (1.0422) time: 4.1341 data: 0.0002 max mem: 54684 -[05:21:07.005141] Epoch: [1] [2030/3229] lr: 0.000051 grad_norm: 0.5539 (0.5664) closs: 1.0687 (1.0421) time: 4.1084 data: 0.0002 max mem: 54684 -[05:21:47.805581] Epoch: [1] [2040/3229] lr: 0.000051 grad_norm: 0.5457 (0.5662) closs: 1.0106 (1.0419) time: 4.0815 data: 0.0002 max mem: 54684 -[05:22:28.761748] Epoch: [1] [2050/3229] lr: 0.000051 grad_norm: 0.5360 (0.5659) closs: 1.0411 (1.0418) time: 4.0878 data: 0.0002 max mem: 54684 -[05:23:09.259309] Epoch: [1] [2060/3229] lr: 0.000050 grad_norm: 0.5417 (0.5658) closs: 1.0135 (1.0417) time: 4.0726 data: 0.0002 max mem: 54684 -[05:23:49.900804] Epoch: [1] [2070/3229] lr: 0.000050 grad_norm: 0.5499 (0.5657) closs: 1.0281 (1.0417) time: 4.0569 data: 0.0002 max mem: 54684 -[05:24:30.222114] Epoch: [1] [2080/3229] lr: 0.000050 grad_norm: 0.5455 (0.5656) closs: 1.0377 (1.0416) time: 4.0481 data: 0.0002 max mem: 54684 -[05:25:10.002823] Epoch: [1] [2090/3229] lr: 0.000050 grad_norm: 0.5285 (0.5654) closs: 0.9802 (1.0412) time: 4.0050 data: 0.0002 max mem: 54684 -[05:25:51.027399] Epoch: [1] [2100/3229] lr: 0.000050 grad_norm: 0.5517 (0.5654) closs: 0.9688 (1.0411) time: 4.0402 data: 0.0002 max mem: 54684 -[05:26:31.682298] Epoch: [1] [2110/3229] lr: 0.000050 grad_norm: 0.5534 (0.5652) closs: 1.0242 (1.0409) time: 4.0839 data: 0.0002 max mem: 54684 -[05:27:13.027787] Epoch: [1] [2120/3229] lr: 0.000049 grad_norm: 0.5357 (0.5652) closs: 1.0134 (1.0408) time: 4.1000 data: 0.0002 max mem: 54684 -[05:27:53.584846] Epoch: [1] [2130/3229] lr: 0.000049 grad_norm: 0.5528 (0.5651) closs: 1.0308 (1.0409) time: 4.0951 data: 0.0002 max mem: 54684 -[05:28:33.774170] Epoch: [1] [2140/3229] lr: 0.000049 grad_norm: 0.5575 (0.5650) closs: 1.0479 (1.0409) time: 4.0373 data: 0.0002 max mem: 54684 -[05:29:15.125615] Epoch: [1] [2150/3229] lr: 0.000049 grad_norm: 0.5506 (0.5650) closs: 1.0479 (1.0411) time: 4.0770 data: 0.0002 max mem: 54684 -[05:29:56.131252] Epoch: [1] [2160/3229] lr: 0.000049 grad_norm: 0.5534 (0.5650) closs: 1.0885 (1.0414) time: 4.1178 data: 0.0002 max mem: 54684 -[05:30:37.305299] Epoch: [1] [2170/3229] lr: 0.000049 grad_norm: 0.5663 (0.5649) closs: 1.0510 (1.0414) time: 4.1089 data: 0.0002 max mem: 54684 -[05:31:18.140992] Epoch: [1] [2180/3229] lr: 0.000049 grad_norm: 0.5663 (0.5650) closs: 1.0392 (1.0412) time: 4.1004 data: 0.0002 max mem: 54684 -[05:31:58.773232] Epoch: [1] [2190/3229] lr: 0.000048 grad_norm: 0.5678 (0.5649) closs: 1.0366 (1.0413) time: 4.0733 data: 0.0002 max mem: 54684 -[05:32:39.494385] Epoch: [1] [2200/3229] lr: 0.000048 grad_norm: 0.5526 (0.5648) closs: 1.0355 (1.0412) time: 4.0676 data: 0.0002 max mem: 54684 -[05:33:20.916278] Epoch: [1] [2210/3229] lr: 0.000048 grad_norm: 0.5766 (0.5649) closs: 1.0259 (1.0413) time: 4.1071 data: 0.0002 max mem: 54684 -[05:34:01.793686] Epoch: [1] [2220/3229] lr: 0.000048 grad_norm: 0.5669 (0.5648) closs: 1.0546 (1.0414) time: 4.1149 data: 0.0002 max mem: 54684 -[05:34:42.236506] Epoch: [1] [2230/3229] lr: 0.000048 grad_norm: 0.5328 (0.5647) closs: 1.0481 (1.0414) time: 4.0659 data: 0.0002 max mem: 54684 -[05:35:22.940509] Epoch: [1] [2240/3229] lr: 0.000048 grad_norm: 0.5445 (0.5646) closs: 1.0309 (1.0413) time: 4.0573 data: 0.0002 max mem: 54684 -[05:36:03.732476] Epoch: [1] [2250/3229] lr: 0.000047 grad_norm: 0.5293 (0.5645) closs: 1.0312 (1.0413) time: 4.0747 data: 0.0002 max mem: 54684 -[05:36:44.583014] Epoch: [1] [2260/3229] lr: 0.000047 grad_norm: 0.5293 (0.5645) closs: 1.0900 (1.0415) time: 4.0821 data: 0.0002 max mem: 54684 -[05:37:25.753741] Epoch: [1] [2270/3229] lr: 0.000047 grad_norm: 0.5454 (0.5645) closs: 1.0398 (1.0414) time: 4.1010 data: 0.0002 max mem: 54684 -[05:38:06.354374] Epoch: [1] [2280/3229] lr: 0.000047 grad_norm: 0.5742 (0.5645) closs: 1.0278 (1.0414) time: 4.0885 data: 0.0002 max mem: 54684 -[05:38:47.481707] Epoch: [1] [2290/3229] lr: 0.000047 grad_norm: 0.5408 (0.5643) closs: 1.0250 (1.0412) time: 4.0863 data: 0.0002 max mem: 54684 -[05:39:28.305137] Epoch: [1] [2300/3229] lr: 0.000047 grad_norm: 0.5408 (0.5643) closs: 1.0250 (1.0412) time: 4.0975 data: 0.0002 max mem: 54684 -[05:40:09.396276] Epoch: [1] [2310/3229] lr: 0.000046 grad_norm: 0.5551 (0.5642) closs: 1.0423 (1.0412) time: 4.0957 data: 0.0002 max mem: 54684 -[05:40:50.316973] Epoch: [1] [2320/3229] lr: 0.000046 grad_norm: 0.5588 (0.5643) closs: 1.0553 (1.0412) time: 4.1005 data: 0.0002 max mem: 54684 -[05:41:31.693870] Epoch: [1] [2330/3229] lr: 0.000046 grad_norm: 0.5540 (0.5642) closs: 1.0652 (1.0413) time: 4.1148 data: 0.0002 max mem: 54684 -[05:42:12.473663] Epoch: [1] [2340/3229] lr: 0.000046 grad_norm: 0.5477 (0.5642) closs: 1.0847 (1.0415) time: 4.1078 data: 0.0002 max mem: 54684 -[05:42:53.480264] Epoch: [1] [2350/3229] lr: 0.000046 grad_norm: 0.5727 (0.5644) closs: 1.0970 (1.0416) time: 4.0893 data: 0.0002 max mem: 54684 -[05:43:33.754469] Epoch: [1] [2360/3229] lr: 0.000046 grad_norm: 0.5439 (0.5642) closs: 1.0598 (1.0416) time: 4.0640 data: 0.0002 max mem: 54684 -[05:44:14.489959] Epoch: [1] [2370/3229] lr: 0.000045 grad_norm: 0.5382 (0.5642) closs: 1.0300 (1.0414) time: 4.0504 data: 0.0002 max mem: 54684 -[05:44:55.310165] Epoch: [1] [2380/3229] lr: 0.000045 grad_norm: 0.5399 (0.5642) closs: 1.0297 (1.0415) time: 4.0777 data: 0.0002 max mem: 54684 -[05:45:35.666164] Epoch: [1] [2390/3229] lr: 0.000045 grad_norm: 0.5510 (0.5642) closs: 1.0400 (1.0414) time: 4.0587 data: 0.0002 max mem: 54684 -[05:46:16.639176] Epoch: [1] [2400/3229] lr: 0.000045 grad_norm: 0.5578 (0.5642) closs: 1.0323 (1.0413) time: 4.0664 data: 0.0002 max mem: 54684 -[05:46:56.787546] Epoch: [1] [2410/3229] lr: 0.000045 grad_norm: 0.5509 (0.5640) closs: 1.0328 (1.0414) time: 4.0560 data: 0.0002 max mem: 54684 -[05:47:37.581569] Epoch: [1] [2420/3229] lr: 0.000045 grad_norm: 0.5329 (0.5640) closs: 1.0172 (1.0412) time: 4.0471 data: 0.0002 max mem: 54684 -[05:48:18.952327] Epoch: [1] [2430/3229] lr: 0.000044 grad_norm: 0.5562 (0.5640) closs: 1.0190 (1.0412) time: 4.1082 data: 0.0002 max mem: 54684 -[05:48:59.548816] Epoch: [1] [2440/3229] lr: 0.000044 grad_norm: 0.5517 (0.5639) closs: 1.0202 (1.0412) time: 4.0983 data: 0.0002 max mem: 54684 -[05:49:40.816318] Epoch: [1] [2450/3229] lr: 0.000044 grad_norm: 0.5512 (0.5639) closs: 1.0606 (1.0414) time: 4.0931 data: 0.0002 max mem: 54684 -[05:50:21.612829] Epoch: [1] [2460/3229] lr: 0.000044 grad_norm: 0.5661 (0.5639) closs: 1.0619 (1.0415) time: 4.1031 data: 0.0002 max mem: 54684 -[05:51:02.032606] Epoch: [1] [2470/3229] lr: 0.000044 grad_norm: 0.5228 (0.5637) closs: 1.0373 (1.0414) time: 4.0607 data: 0.0002 max mem: 54684 -[05:51:42.669343] Epoch: [1] [2480/3229] lr: 0.000044 grad_norm: 0.5141 (0.5637) closs: 1.0318 (1.0414) time: 4.0528 data: 0.0002 max mem: 54684 -[05:52:23.947666] Epoch: [1] [2490/3229] lr: 0.000043 grad_norm: 0.5556 (0.5637) closs: 1.0080 (1.0413) time: 4.0957 data: 0.0002 max mem: 54684 -[05:53:04.419032] Epoch: [1] [2500/3229] lr: 0.000043 grad_norm: 0.5681 (0.5636) closs: 0.9925 (1.0411) time: 4.0874 data: 0.0002 max mem: 54684 -[05:53:44.810109] Epoch: [1] [2510/3229] lr: 0.000043 grad_norm: 0.5536 (0.5635) closs: 1.0025 (1.0409) time: 4.0431 data: 0.0002 max mem: 54684 -[05:54:25.457976] Epoch: [1] [2520/3229] lr: 0.000043 grad_norm: 0.5589 (0.5635) closs: 1.0145 (1.0408) time: 4.0519 data: 0.0002 max mem: 54684 -[05:55:06.177805] Epoch: [1] [2530/3229] lr: 0.000043 grad_norm: 0.5589 (0.5634) closs: 1.0310 (1.0409) time: 4.0683 data: 0.0002 max mem: 54684 -[05:55:46.968680] Epoch: [1] [2540/3229] lr: 0.000043 grad_norm: 0.5613 (0.5634) closs: 1.0699 (1.0409) time: 4.0755 data: 0.0002 max mem: 54684 -[05:56:28.310551] Epoch: [1] [2550/3229] lr: 0.000042 grad_norm: 0.5632 (0.5634) closs: 1.0699 (1.0410) time: 4.1066 data: 0.0002 max mem: 54684 -[05:57:08.977358] Epoch: [1] [2560/3229] lr: 0.000042 grad_norm: 0.5632 (0.5634) closs: 1.0260 (1.0409) time: 4.1004 data: 0.0002 max mem: 54684 -[05:57:49.989591] Epoch: [1] [2570/3229] lr: 0.000042 grad_norm: 0.5523 (0.5633) closs: 1.0473 (1.0411) time: 4.0839 data: 0.0002 max mem: 54684 -[05:58:31.203250] Epoch: [1] [2580/3229] lr: 0.000042 grad_norm: 0.5427 (0.5633) closs: 1.0885 (1.0413) time: 4.1112 data: 0.0002 max mem: 54684 -[05:59:12.459151] Epoch: [1] [2590/3229] lr: 0.000042 grad_norm: 0.5640 (0.5634) closs: 1.0807 (1.0414) time: 4.1234 data: 0.0002 max mem: 54684 -[05:59:53.419598] Epoch: [1] [2600/3229] lr: 0.000042 grad_norm: 0.5771 (0.5633) closs: 1.0600 (1.0413) time: 4.1108 data: 0.0002 max mem: 54684 -[06:00:34.153034] Epoch: [1] [2610/3229] lr: 0.000041 grad_norm: 0.5546 (0.5632) closs: 0.9869 (1.0412) time: 4.0846 data: 0.0002 max mem: 54684 -[06:01:15.086879] Epoch: [1] [2620/3229] lr: 0.000041 grad_norm: 0.5557 (0.5632) closs: 0.9869 (1.0411) time: 4.0833 data: 0.0002 max mem: 54684 -[06:01:56.283580] Epoch: [1] [2630/3229] lr: 0.000041 grad_norm: 0.5558 (0.5632) closs: 1.0490 (1.0413) time: 4.1065 data: 0.0002 max mem: 54684 -[06:02:37.719061] Epoch: [1] [2640/3229] lr: 0.000041 grad_norm: 0.5613 (0.5632) closs: 1.0643 (1.0413) time: 4.1315 data: 0.0002 max mem: 54684 -[06:03:18.915077] Epoch: [1] [2650/3229] lr: 0.000041 grad_norm: 0.5604 (0.5632) closs: 1.0350 (1.0413) time: 4.1315 data: 0.0002 max mem: 54684 -[06:03:59.811949] Epoch: [1] [2660/3229] lr: 0.000041 grad_norm: 0.5654 (0.5632) closs: 1.0307 (1.0414) time: 4.1046 data: 0.0002 max mem: 54684 -[06:04:40.786635] Epoch: [1] [2670/3229] lr: 0.000041 grad_norm: 0.5775 (0.5633) closs: 1.0293 (1.0413) time: 4.0935 data: 0.0002 max mem: 54684 -[06:05:22.161936] Epoch: [1] [2680/3229] lr: 0.000040 grad_norm: 0.5595 (0.5632) closs: 1.0402 (1.0413) time: 4.1174 data: 0.0002 max mem: 54684 -[06:06:02.707096] Epoch: [1] [2690/3229] lr: 0.000040 grad_norm: 0.5369 (0.5631) closs: 1.0413 (1.0413) time: 4.0960 data: 0.0002 max mem: 54684 -[06:06:43.753741] Epoch: [1] [2700/3229] lr: 0.000040 grad_norm: 0.5548 (0.5631) closs: 1.0285 (1.0413) time: 4.0795 data: 0.0002 max mem: 54684 -[06:07:23.992092] Epoch: [1] [2710/3229] lr: 0.000040 grad_norm: 0.5629 (0.5632) closs: 1.0146 (1.0411) time: 4.0642 data: 0.0002 max mem: 54684 -[06:08:05.152339] Epoch: [1] [2720/3229] lr: 0.000040 grad_norm: 0.5591 (0.5632) closs: 1.0251 (1.0410) time: 4.0699 data: 0.0002 max mem: 54684 -[06:08:46.239502] Epoch: [1] [2730/3229] lr: 0.000040 grad_norm: 0.5534 (0.5631) closs: 1.0251 (1.0410) time: 4.1123 data: 0.0002 max mem: 54684 -[06:09:27.133028] Epoch: [1] [2740/3229] lr: 0.000039 grad_norm: 0.5534 (0.5631) closs: 1.0251 (1.0410) time: 4.0990 data: 0.0002 max mem: 54684 -[06:10:07.799998] Epoch: [1] [2750/3229] lr: 0.000039 grad_norm: 0.5710 (0.5630) closs: 1.0134 (1.0409) time: 4.0780 data: 0.0002 max mem: 54684 -[06:10:49.245922] Epoch: [1] [2760/3229] lr: 0.000039 grad_norm: 0.5520 (0.5630) closs: 1.0222 (1.0409) time: 4.1056 data: 0.0002 max mem: 54684 -[06:11:30.008146] Epoch: [1] [2770/3229] lr: 0.000039 grad_norm: 0.5520 (0.5630) closs: 1.0567 (1.0410) time: 4.1103 data: 0.0002 max mem: 54684 -[06:12:11.059504] Epoch: [1] [2780/3229] lr: 0.000039 grad_norm: 0.5508 (0.5629) closs: 1.0601 (1.0410) time: 4.0906 data: 0.0002 max mem: 54684 -[06:12:51.244841] Epoch: [1] [2790/3229] lr: 0.000039 grad_norm: 0.5376 (0.5628) closs: 1.0292 (1.0410) time: 4.0618 data: 0.0002 max mem: 54684 -[06:13:32.442464] Epoch: [1] [2800/3229] lr: 0.000038 grad_norm: 0.5376 (0.5627) closs: 1.0258 (1.0410) time: 4.0691 data: 0.0002 max mem: 54684 -[06:14:12.900525] Epoch: [1] [2810/3229] lr: 0.000038 grad_norm: 0.5395 (0.5626) closs: 1.0165 (1.0408) time: 4.0827 data: 0.0002 max mem: 54684 -[06:14:54.239092] Epoch: [1] [2820/3229] lr: 0.000038 grad_norm: 0.5645 (0.5626) closs: 1.0285 (1.0410) time: 4.0898 data: 0.0002 max mem: 54684 -[06:15:34.778065] Epoch: [1] [2830/3229] lr: 0.000038 grad_norm: 0.5645 (0.5627) closs: 1.0790 (1.0411) time: 4.0938 data: 0.0002 max mem: 54684 -[06:16:15.572869] Epoch: [1] [2840/3229] lr: 0.000038 grad_norm: 0.5464 (0.5627) closs: 1.0548 (1.0411) time: 4.0666 data: 0.0002 max mem: 54684 -[06:16:55.693103] Epoch: [1] [2850/3229] lr: 0.000038 grad_norm: 0.5409 (0.5625) closs: 0.9916 (1.0409) time: 4.0457 data: 0.0002 max mem: 54684 -[06:17:36.370544] Epoch: [1] [2860/3229] lr: 0.000038 grad_norm: 0.5450 (0.5624) closs: 1.0093 (1.0409) time: 4.0398 data: 0.0002 max mem: 54684 -[06:18:16.935227] Epoch: [1] [2870/3229] lr: 0.000037 grad_norm: 0.5436 (0.5623) closs: 1.0331 (1.0408) time: 4.0620 data: 0.0002 max mem: 54684 -[06:18:58.140683] Epoch: [1] [2880/3229] lr: 0.000037 grad_norm: 0.5241 (0.5622) closs: 1.0383 (1.0409) time: 4.0884 data: 0.0002 max mem: 54684 -[06:19:38.588481] Epoch: [1] [2890/3229] lr: 0.000037 grad_norm: 0.5241 (0.5620) closs: 1.0275 (1.0409) time: 4.0826 data: 0.0002 max mem: 54684 -[06:20:19.641183] Epoch: [1] [2900/3229] lr: 0.000037 grad_norm: 0.5396 (0.5620) closs: 0.9919 (1.0407) time: 4.0750 data: 0.0002 max mem: 54684 -[06:21:00.790118] Epoch: [1] [2910/3229] lr: 0.000037 grad_norm: 0.5550 (0.5620) closs: 1.0125 (1.0409) time: 4.1100 data: 0.0002 max mem: 54684 -[06:21:42.244795] Epoch: [1] [2920/3229] lr: 0.000037 grad_norm: 0.5740 (0.5621) closs: 1.0125 (1.0408) time: 4.1301 data: 0.0002 max mem: 54684 -[06:22:22.364951] Epoch: [1] [2930/3229] lr: 0.000036 grad_norm: 0.5768 (0.5620) closs: 1.0067 (1.0407) time: 4.0787 data: 0.0002 max mem: 54684 -[06:23:03.086429] Epoch: [1] [2940/3229] lr: 0.000036 grad_norm: 0.5621 (0.5620) closs: 1.0128 (1.0406) time: 4.0420 data: 0.0002 max mem: 54684 -[06:23:43.579564] Epoch: [1] [2950/3229] lr: 0.000036 grad_norm: 0.5615 (0.5620) closs: 1.0128 (1.0405) time: 4.0607 data: 0.0002 max mem: 54684 -[06:24:24.403706] Epoch: [1] [2960/3229] lr: 0.000036 grad_norm: 0.5591 (0.5619) closs: 1.0059 (1.0404) time: 4.0658 data: 0.0002 max mem: 54684 -[06:25:05.177195] Epoch: [1] [2970/3229] lr: 0.000036 grad_norm: 0.5515 (0.5619) closs: 1.0162 (1.0403) time: 4.0798 data: 0.0002 max mem: 54684 -[06:25:45.583856] Epoch: [1] [2980/3229] lr: 0.000036 grad_norm: 0.5408 (0.5618) closs: 1.0081 (1.0402) time: 4.0589 data: 0.0002 max mem: 54684 -[06:26:26.764185] Epoch: [1] [2990/3229] lr: 0.000036 grad_norm: 0.5408 (0.5618) closs: 1.0228 (1.0403) time: 4.0793 data: 0.0002 max mem: 54684 -[06:27:08.232014] Epoch: [1] [3000/3229] lr: 0.000035 grad_norm: 0.5497 (0.5618) closs: 1.0729 (1.0404) time: 4.1323 data: 0.0002 max mem: 54684 -[06:27:49.342316] Epoch: [1] [3010/3229] lr: 0.000035 grad_norm: 0.5438 (0.5618) closs: 1.0628 (1.0404) time: 4.1288 data: 0.0002 max mem: 54684 -[06:28:30.414609] Epoch: [1] [3020/3229] lr: 0.000035 grad_norm: 0.5503 (0.5618) closs: 1.0482 (1.0403) time: 4.1091 data: 0.0002 max mem: 54684 -[06:29:11.081220] Epoch: [1] [3030/3229] lr: 0.000035 grad_norm: 0.5568 (0.5618) closs: 1.0190 (1.0402) time: 4.0869 data: 0.0002 max mem: 54684 -[06:29:52.489647] Epoch: [1] [3040/3229] lr: 0.000035 grad_norm: 0.5868 (0.5618) closs: 1.0277 (1.0403) time: 4.1037 data: 0.0002 max mem: 54684 -[06:30:33.073169] Epoch: [1] [3050/3229] lr: 0.000035 grad_norm: 0.5631 (0.5618) closs: 1.0394 (1.0403) time: 4.0995 data: 0.0002 max mem: 54684 -[06:31:14.013273] Epoch: [1] [3060/3229] lr: 0.000034 grad_norm: 0.5527 (0.5618) closs: 1.0443 (1.0403) time: 4.0761 data: 0.0002 max mem: 54684 -[06:31:54.597721] Epoch: [1] [3070/3229] lr: 0.000034 grad_norm: 0.5434 (0.5617) closs: 1.0250 (1.0402) time: 4.0762 data: 0.0001 max mem: 54684 -[06:32:35.628475] Epoch: [1] [3080/3229] lr: 0.000034 grad_norm: 0.5593 (0.5617) closs: 1.0317 (1.0403) time: 4.0807 data: 0.0002 max mem: 54684 -[06:33:15.743784] Epoch: [1] [3090/3229] lr: 0.000034 grad_norm: 0.5484 (0.5615) closs: 1.0518 (1.0402) time: 4.0572 data: 0.0002 max mem: 54684 -[06:33:56.228495] Epoch: [1] [3100/3229] lr: 0.000034 grad_norm: 0.5205 (0.5615) closs: 0.9813 (1.0401) time: 4.0299 data: 0.0002 max mem: 54684 -[06:34:37.480013] Epoch: [1] [3110/3229] lr: 0.000034 grad_norm: 0.5548 (0.5615) closs: 1.0324 (1.0402) time: 4.0867 data: 0.0002 max mem: 54684 -[06:35:18.864626] Epoch: [1] [3120/3229] lr: 0.000034 grad_norm: 0.5548 (0.5615) closs: 1.0525 (1.0403) time: 4.1317 data: 0.0002 max mem: 54684 -[06:35:59.777377] Epoch: [1] [3130/3229] lr: 0.000033 grad_norm: 0.5428 (0.5614) closs: 1.0628 (1.0403) time: 4.1148 data: 0.0002 max mem: 54684 -[06:36:40.713675] Epoch: [1] [3140/3229] lr: 0.000033 grad_norm: 0.5397 (0.5614) closs: 1.0246 (1.0402) time: 4.0924 data: 0.0002 max mem: 54684 -[06:37:21.338622] Epoch: [1] [3150/3229] lr: 0.000033 grad_norm: 0.5575 (0.5614) closs: 1.0182 (1.0403) time: 4.0780 data: 0.0002 max mem: 54684 -[06:38:01.899004] Epoch: [1] [3160/3229] lr: 0.000033 grad_norm: 0.5606 (0.5613) closs: 1.0391 (1.0402) time: 4.0592 data: 0.0002 max mem: 54684 -[06:38:42.174203] Epoch: [1] [3170/3229] lr: 0.000033 grad_norm: 0.5272 (0.5613) closs: 0.9944 (1.0401) time: 4.0417 data: 0.0002 max mem: 54684 -[06:39:22.446835] Epoch: [1] [3180/3229] lr: 0.000033 grad_norm: 0.5411 (0.5612) closs: 1.0080 (1.0400) time: 4.0273 data: 0.0002 max mem: 54684 -[06:40:03.306382] Epoch: [1] [3190/3229] lr: 0.000032 grad_norm: 0.5411 (0.5612) closs: 1.0599 (1.0399) time: 4.0565 data: 0.0002 max mem: 54684 -[06:40:43.545456] Epoch: [1] [3200/3229] lr: 0.000032 grad_norm: 0.5426 (0.5612) closs: 1.0677 (1.0399) time: 4.0549 data: 0.0003 max mem: 54684 -[06:41:24.089321] Epoch: [1] [3210/3229] lr: 0.000032 grad_norm: 0.5565 (0.5611) closs: 1.0318 (1.0398) time: 4.0391 data: 0.0003 max mem: 54684 -[06:42:05.015907] Epoch: [1] [3220/3229] lr: 0.000032 grad_norm: 0.5565 (0.5611) closs: 1.0518 (1.0399) time: 4.0735 data: 0.0001 max mem: 54684 -[06:42:37.506577] Epoch: [1] Total time: 3:40:13 -[06:42:37.507422] Averaged stats: lr: 0.000032 grad_norm: 0.5404 (0.5610) closs: 1.0248 (1.0389) -[06:42:37.848477] model saved -[06:42:39.524269] optimizer saved -[06:42:39.524869] other rank-common saved -[06:42:39.529811] rank-specific saved -[06:42:39.543909] log_dir: ./output_dir -[06:42:52.332922] Epoch: [2] [0/3229] lr: 0.000032 grad_norm: 0.6074 (0.6074) closs: 0.9961 (0.9961) time: 12.7882 data: 8.6913 max mem: 54684 -[06:43:33.251059] Epoch: [2] [10/3229] lr: 0.000032 grad_norm: 0.5549 (0.5515) closs: 1.0108 (1.0250) time: 4.8823 data: 0.7903 max mem: 54684 -[06:44:14.062725] Epoch: [2] [20/3229] lr: 0.000032 grad_norm: 0.5604 (0.5634) closs: 1.0132 (1.0331) time: 4.0864 data: 0.0002 max mem: 54684 -[06:44:54.680230] Epoch: [2] [30/3229] lr: 0.000031 grad_norm: 0.5646 (0.5620) closs: 1.0207 (1.0222) time: 4.0714 data: 0.0002 max mem: 54684 -[06:45:36.094248] Epoch: [2] [40/3229] lr: 0.000031 grad_norm: 0.5666 (0.5645) closs: 1.0272 (1.0325) time: 4.1015 data: 0.0002 max mem: 54684 -[06:46:17.586387] Epoch: [2] [50/3229] lr: 0.000031 grad_norm: 0.5721 (0.5678) closs: 1.0488 (1.0357) time: 4.1452 data: 0.0002 max mem: 54684 -[06:46:58.623930] Epoch: [2] [60/3229] lr: 0.000031 grad_norm: 0.5721 (0.5684) closs: 1.0254 (1.0385) time: 4.1264 data: 0.0002 max mem: 54684 -[06:47:38.804594] Epoch: [2] [70/3229] lr: 0.000031 grad_norm: 0.5414 (0.5640) closs: 1.0238 (1.0330) time: 4.0608 data: 0.0002 max mem: 54684 -[06:48:19.867348] Epoch: [2] [80/3229] lr: 0.000031 grad_norm: 0.5383 (0.5644) closs: 1.0562 (1.0336) time: 4.0621 data: 0.0002 max mem: 54684 -[06:49:00.413656] Epoch: [2] [90/3229] lr: 0.000031 grad_norm: 0.5648 (0.5635) closs: 1.0639 (1.0332) time: 4.0804 data: 0.0002 max mem: 54684 -[06:49:41.951060] Epoch: [2] [100/3229] lr: 0.000030 grad_norm: 0.5660 (0.5647) closs: 1.0607 (1.0380) time: 4.1041 data: 0.0002 max mem: 54684 -[06:50:22.979532] Epoch: [2] [110/3229] lr: 0.000030 grad_norm: 0.5676 (0.5650) closs: 1.0512 (1.0370) time: 4.1282 data: 0.0002 max mem: 54684 -[06:51:04.182535] Epoch: [2] [120/3229] lr: 0.000030 grad_norm: 0.5679 (0.5652) closs: 1.0298 (1.0344) time: 4.1115 data: 0.0002 max mem: 54684 -[06:51:45.596931] Epoch: [2] [130/3229] lr: 0.000030 grad_norm: 0.5720 (0.5668) closs: 1.0407 (1.0365) time: 4.1308 data: 0.0002 max mem: 54684 -[06:52:25.981391] Epoch: [2] [140/3229] lr: 0.000030 grad_norm: 0.5643 (0.5656) closs: 1.0547 (1.0368) time: 4.0899 data: 0.0002 max mem: 54684 -[06:53:06.534877] Epoch: [2] [150/3229] lr: 0.000030 grad_norm: 0.5585 (0.5647) closs: 1.0643 (1.0353) time: 4.0468 data: 0.0002 max mem: 54684 -[06:53:47.561712] Epoch: [2] [160/3229] lr: 0.000030 grad_norm: 0.5707 (0.5644) closs: 1.0438 (1.0367) time: 4.0789 data: 0.0002 max mem: 54684 -[06:54:28.735079] Epoch: [2] [170/3229] lr: 0.000029 grad_norm: 0.5660 (0.5639) closs: 1.0206 (1.0351) time: 4.1099 data: 0.0002 max mem: 54684 -[06:55:09.714577] Epoch: [2] [180/3229] lr: 0.000029 grad_norm: 0.5598 (0.5634) closs: 1.0089 (1.0340) time: 4.1076 data: 0.0002 max mem: 54684 -[06:55:50.561442] Epoch: [2] [190/3229] lr: 0.000029 grad_norm: 0.5587 (0.5624) closs: 1.0149 (1.0343) time: 4.0913 data: 0.0002 max mem: 54684 -[06:56:30.629863] Epoch: [2] [200/3229] lr: 0.000029 grad_norm: 0.5501 (0.5617) closs: 1.0149 (1.0314) time: 4.0457 data: 0.0002 max mem: 54684 -[06:57:10.515610] Epoch: [2] [210/3229] lr: 0.000029 grad_norm: 0.5364 (0.5595) closs: 0.9756 (1.0304) time: 3.9976 data: 0.0002 max mem: 54684 -[06:57:51.901450] Epoch: [2] [220/3229] lr: 0.000029 grad_norm: 0.5558 (0.5605) closs: 1.0786 (1.0334) time: 4.0635 data: 0.0002 max mem: 54684 -[06:58:33.059748] Epoch: [2] [230/3229] lr: 0.000029 grad_norm: 0.5713 (0.5605) closs: 1.0664 (1.0330) time: 4.1271 data: 0.0002 max mem: 54684 -[06:59:13.767459] Epoch: [2] [240/3229] lr: 0.000028 grad_norm: 0.5606 (0.5594) closs: 1.0463 (1.0324) time: 4.0932 data: 0.0002 max mem: 54684 -[06:59:54.366626] Epoch: [2] [250/3229] lr: 0.000028 grad_norm: 0.5597 (0.5591) closs: 1.0473 (1.0327) time: 4.0653 data: 0.0002 max mem: 54684 -[07:00:35.448278] Epoch: [2] [260/3229] lr: 0.000028 grad_norm: 0.5615 (0.5590) closs: 1.0473 (1.0337) time: 4.0840 data: 0.0002 max mem: 54684 -[07:01:16.282495] Epoch: [2] [270/3229] lr: 0.000028 grad_norm: 0.5692 (0.5587) closs: 1.0175 (1.0332) time: 4.0957 data: 0.0002 max mem: 54684 -[07:01:57.347029] Epoch: [2] [280/3229] lr: 0.000028 grad_norm: 0.5577 (0.5587) closs: 1.0175 (1.0331) time: 4.0949 data: 0.0002 max mem: 54684 -[07:02:38.198946] Epoch: [2] [290/3229] lr: 0.000028 grad_norm: 0.5458 (0.5578) closs: 1.0612 (1.0344) time: 4.0958 data: 0.0002 max mem: 54684 -[07:03:19.538016] Epoch: [2] [300/3229] lr: 0.000028 grad_norm: 0.5543 (0.5585) closs: 1.0482 (1.0345) time: 4.1095 data: 0.0002 max mem: 54684 -[07:04:00.398339] Epoch: [2] [310/3229] lr: 0.000027 grad_norm: 0.5669 (0.5587) closs: 1.0378 (1.0337) time: 4.1099 data: 0.0002 max mem: 54684 -[07:04:41.202079] Epoch: [2] [320/3229] lr: 0.000027 grad_norm: 0.5484 (0.5583) closs: 1.0331 (1.0335) time: 4.0831 data: 0.0002 max mem: 54684 -[07:05:22.294422] Epoch: [2] [330/3229] lr: 0.000027 grad_norm: 0.5676 (0.5594) closs: 1.0367 (1.0340) time: 4.0947 data: 0.0002 max mem: 54684 -[07:06:02.338520] Epoch: [2] [340/3229] lr: 0.000027 grad_norm: 0.5636 (0.5582) closs: 1.0367 (1.0338) time: 4.0568 data: 0.0002 max mem: 54684 -[07:06:43.190555] Epoch: [2] [350/3229] lr: 0.000027 grad_norm: 0.5512 (0.5587) closs: 1.0080 (1.0331) time: 4.0447 data: 0.0002 max mem: 54684 -[07:07:23.981238] Epoch: [2] [360/3229] lr: 0.000027 grad_norm: 0.5835 (0.5591) closs: 0.9637 (1.0310) time: 4.0821 data: 0.0002 max mem: 54684 -[07:08:04.100084] Epoch: [2] [370/3229] lr: 0.000027 grad_norm: 0.5483 (0.5592) closs: 0.9577 (1.0297) time: 4.0454 data: 0.0002 max mem: 54684 -[07:08:45.172933] Epoch: [2] [380/3229] lr: 0.000026 grad_norm: 0.5730 (0.5599) closs: 1.0405 (1.0300) time: 4.0595 data: 0.0002 max mem: 54684 -[07:09:26.437556] Epoch: [2] [390/3229] lr: 0.000026 grad_norm: 0.5819 (0.5601) closs: 1.0400 (1.0298) time: 4.1168 data: 0.0002 max mem: 54684 -[07:10:06.949904] Epoch: [2] [400/3229] lr: 0.000026 grad_norm: 0.5788 (0.5598) closs: 0.9957 (1.0292) time: 4.0888 data: 0.0002 max mem: 54684 -[07:10:47.514701] Epoch: [2] [410/3229] lr: 0.000026 grad_norm: 0.5763 (0.5596) closs: 1.0425 (1.0290) time: 4.0538 data: 0.0002 max mem: 54684 -[07:11:28.002800] Epoch: [2] [420/3229] lr: 0.000026 grad_norm: 0.5434 (0.5590) closs: 1.0287 (1.0276) time: 4.0526 data: 0.0002 max mem: 54684 -[07:12:08.954598] Epoch: [2] [430/3229] lr: 0.000026 grad_norm: 0.5331 (0.5590) closs: 1.0064 (1.0278) time: 4.0719 data: 0.0002 max mem: 54684 -[07:12:50.182692] Epoch: [2] [440/3229] lr: 0.000026 grad_norm: 0.5630 (0.5593) closs: 1.0357 (1.0279) time: 4.1089 data: 0.0002 max mem: 54684 -[07:13:31.159720] Epoch: [2] [450/3229] lr: 0.000025 grad_norm: 0.5743 (0.5594) closs: 1.0429 (1.0276) time: 4.1102 data: 0.0002 max mem: 54684 -[07:14:12.156189] Epoch: [2] [460/3229] lr: 0.000025 grad_norm: 0.5510 (0.5592) closs: 0.9867 (1.0272) time: 4.0986 data: 0.0002 max mem: 54684 -[07:14:53.144867] Epoch: [2] [470/3229] lr: 0.000025 grad_norm: 0.5739 (0.5619) closs: 1.0229 (1.0268) time: 4.0992 data: 0.0002 max mem: 54684 -[07:15:33.994729] Epoch: [2] [480/3229] lr: 0.000025 grad_norm: 0.5799 (0.5615) closs: 1.0344 (1.0262) time: 4.0919 data: 0.0002 max mem: 54684 -[07:16:14.919123] Epoch: [2] [490/3229] lr: 0.000025 grad_norm: 0.5602 (0.5613) closs: 1.0123 (1.0258) time: 4.0887 data: 0.0002 max mem: 54684 -[07:16:55.482144] Epoch: [2] [500/3229] lr: 0.000025 grad_norm: 0.5650 (0.5613) closs: 1.0011 (1.0257) time: 4.0743 data: 0.0002 max mem: 54684 -[07:17:36.009652] Epoch: [2] [510/3229] lr: 0.000025 grad_norm: 0.5666 (0.5608) closs: 1.0011 (1.0253) time: 4.0545 data: 0.0002 max mem: 54684 -[07:18:17.007812] Epoch: [2] [520/3229] lr: 0.000025 grad_norm: 0.5666 (0.5610) closs: 1.0167 (1.0254) time: 4.0762 data: 0.0002 max mem: 54684 -[07:18:57.928751] Epoch: [2] [530/3229] lr: 0.000024 grad_norm: 0.5620 (0.5609) closs: 1.0517 (1.0253) time: 4.0959 data: 0.0002 max mem: 54684 -[07:19:38.814132] Epoch: [2] [540/3229] lr: 0.000024 grad_norm: 0.5700 (0.5613) closs: 1.0398 (1.0259) time: 4.0903 data: 0.0002 max mem: 54684 -[07:20:19.972410] Epoch: [2] [550/3229] lr: 0.000024 grad_norm: 0.5808 (0.5616) closs: 1.0295 (1.0258) time: 4.1021 data: 0.0002 max mem: 54684 -[07:21:01.186855] Epoch: [2] [560/3229] lr: 0.000024 grad_norm: 0.5597 (0.5613) closs: 1.0380 (1.0259) time: 4.1186 data: 0.0002 max mem: 54684 -[07:21:42.387976] Epoch: [2] [570/3229] lr: 0.000024 grad_norm: 0.5589 (0.5615) closs: 1.0135 (1.0256) time: 4.1207 data: 0.0002 max mem: 54684 -[07:22:23.711719] Epoch: [2] [580/3229] lr: 0.000024 grad_norm: 0.5725 (0.5619) closs: 1.0135 (1.0259) time: 4.1262 data: 0.0002 max mem: 54684 -[07:23:04.803921] Epoch: [2] [590/3229] lr: 0.000024 grad_norm: 0.5810 (0.5624) closs: 1.0295 (1.0264) time: 4.1207 data: 0.0002 max mem: 54684 -[07:23:46.170461] Epoch: [2] [600/3229] lr: 0.000023 grad_norm: 0.5884 (0.5628) closs: 1.0485 (1.0269) time: 4.1229 data: 0.0002 max mem: 54684 -[07:24:27.382259] Epoch: [2] [610/3229] lr: 0.000023 grad_norm: 0.5673 (0.5631) closs: 1.0487 (1.0275) time: 4.1289 data: 0.0002 max mem: 54684 -[07:25:08.711019] Epoch: [2] [620/3229] lr: 0.000023 grad_norm: 0.5802 (0.5636) closs: 1.0487 (1.0279) time: 4.1270 data: 0.0002 max mem: 54684 -[07:25:49.496061] Epoch: [2] [630/3229] lr: 0.000023 grad_norm: 0.5873 (0.5637) closs: 1.0284 (1.0272) time: 4.1056 data: 0.0002 max mem: 54684 -[07:26:30.840270] Epoch: [2] [640/3229] lr: 0.000023 grad_norm: 0.5644 (0.5637) closs: 1.0242 (1.0272) time: 4.1064 data: 0.0002 max mem: 54684 -[07:27:11.536611] Epoch: [2] [650/3229] lr: 0.000023 grad_norm: 0.5738 (0.5637) closs: 1.0264 (1.0271) time: 4.1020 data: 0.0002 max mem: 54684 -[07:27:52.721716] Epoch: [2] [660/3229] lr: 0.000023 grad_norm: 0.5873 (0.5642) closs: 1.0264 (1.0268) time: 4.0940 data: 0.0002 max mem: 54684 -[07:28:33.018219] Epoch: [2] [670/3229] lr: 0.000023 grad_norm: 0.5817 (0.5643) closs: 0.9670 (1.0256) time: 4.0740 data: 0.0002 max mem: 54684 -[07:29:13.959783] Epoch: [2] [680/3229] lr: 0.000022 grad_norm: 0.5840 (0.5644) closs: 1.0050 (1.0255) time: 4.0618 data: 0.0002 max mem: 54684 -[07:29:54.691528] Epoch: [2] [690/3229] lr: 0.000022 grad_norm: 0.5650 (0.5646) closs: 1.0360 (1.0252) time: 4.0836 data: 0.0002 max mem: 54684 -[07:30:35.214294] Epoch: [2] [700/3229] lr: 0.000022 grad_norm: 0.5549 (0.5646) closs: 0.9915 (1.0249) time: 4.0627 data: 0.0002 max mem: 54684 -[07:31:15.579049] Epoch: [2] [710/3229] lr: 0.000022 grad_norm: 0.5666 (0.5645) closs: 0.9990 (1.0244) time: 4.0443 data: 0.0002 max mem: 54684 -[07:31:56.182963] Epoch: [2] [720/3229] lr: 0.000022 grad_norm: 0.5709 (0.5647) closs: 1.0062 (1.0239) time: 4.0484 data: 0.0002 max mem: 54684 -[07:32:36.940307] Epoch: [2] [730/3229] lr: 0.000022 grad_norm: 0.5456 (0.5644) closs: 1.0022 (1.0242) time: 4.0680 data: 0.0002 max mem: 54684 -[07:33:17.790404] Epoch: [2] [740/3229] lr: 0.000022 grad_norm: 0.5405 (0.5641) closs: 1.0311 (1.0242) time: 4.0803 data: 0.0002 max mem: 54684 -[07:33:58.876257] Epoch: [2] [750/3229] lr: 0.000022 grad_norm: 0.5392 (0.5636) closs: 1.0301 (1.0240) time: 4.0967 data: 0.0002 max mem: 54684 -[07:34:39.404388] Epoch: [2] [760/3229] lr: 0.000021 grad_norm: 0.5370 (0.5635) closs: 0.9803 (1.0237) time: 4.0806 data: 0.0002 max mem: 54684 -[07:35:20.103499] Epoch: [2] [770/3229] lr: 0.000021 grad_norm: 0.5533 (0.5636) closs: 0.9830 (1.0234) time: 4.0613 data: 0.0002 max mem: 54684 -[07:36:01.311015] Epoch: [2] [780/3229] lr: 0.000021 grad_norm: 0.5643 (0.5637) closs: 1.0087 (1.0238) time: 4.0953 data: 0.0002 max mem: 54684 -[07:36:42.010206] Epoch: [2] [790/3229] lr: 0.000021 grad_norm: 0.5717 (0.5642) closs: 1.0332 (1.0232) time: 4.0953 data: 0.0002 max mem: 54684 -[07:37:22.202073] Epoch: [2] [800/3229] lr: 0.000021 grad_norm: 0.5769 (0.5641) closs: 1.0252 (1.0229) time: 4.0445 data: 0.0002 max mem: 54684 -[07:38:03.544700] Epoch: [2] [810/3229] lr: 0.000021 grad_norm: 0.5595 (0.5642) closs: 1.0364 (1.0230) time: 4.0767 data: 0.0002 max mem: 54684 -[07:38:44.043917] Epoch: [2] [820/3229] lr: 0.000021 grad_norm: 0.5705 (0.5643) closs: 1.0233 (1.0226) time: 4.0920 data: 0.0002 max mem: 54684 -[07:39:24.427927] Epoch: [2] [830/3229] lr: 0.000021 grad_norm: 0.5705 (0.5641) closs: 1.0029 (1.0221) time: 4.0441 data: 0.0002 max mem: 54684 -[07:40:05.213763] Epoch: [2] [840/3229] lr: 0.000020 grad_norm: 0.5476 (0.5641) closs: 1.0162 (1.0224) time: 4.0584 data: 0.0002 max mem: 54684 -[07:40:46.257738] Epoch: [2] [850/3229] lr: 0.000020 grad_norm: 0.5579 (0.5643) closs: 1.0359 (1.0226) time: 4.0914 data: 0.0002 max mem: 54684 -[07:41:27.073907] Epoch: [2] [860/3229] lr: 0.000020 grad_norm: 0.5654 (0.5644) closs: 1.0328 (1.0223) time: 4.0929 data: 0.0002 max mem: 54684 -[07:42:08.541626] Epoch: [2] [870/3229] lr: 0.000020 grad_norm: 0.5884 (0.5648) closs: 1.0390 (1.0226) time: 4.1141 data: 0.0002 max mem: 54684 -[07:42:49.010976] Epoch: [2] [880/3229] lr: 0.000020 grad_norm: 0.5884 (0.5647) closs: 1.0361 (1.0224) time: 4.0968 data: 0.0002 max mem: 54684 -[07:43:30.376441] Epoch: [2] [890/3229] lr: 0.000020 grad_norm: 0.5687 (0.5648) closs: 1.0268 (1.0226) time: 4.0917 data: 0.0002 max mem: 54684 -[07:44:11.214842] Epoch: [2] [900/3229] lr: 0.000020 grad_norm: 0.5649 (0.5650) closs: 1.0391 (1.0227) time: 4.1101 data: 0.0002 max mem: 54684 -[07:44:52.760157] Epoch: [2] [910/3229] lr: 0.000020 grad_norm: 0.5771 (0.5651) closs: 1.0420 (1.0231) time: 4.1191 data: 0.0002 max mem: 54684 -[07:45:33.530222] Epoch: [2] [920/3229] lr: 0.000019 grad_norm: 0.5793 (0.5652) closs: 1.0535 (1.0232) time: 4.1157 data: 0.0002 max mem: 54684 -[07:46:14.506847] Epoch: [2] [930/3229] lr: 0.000019 grad_norm: 0.5620 (0.5652) closs: 1.0553 (1.0238) time: 4.0873 data: 0.0002 max mem: 54684 -[07:46:55.368426] Epoch: [2] [940/3229] lr: 0.000019 grad_norm: 0.5562 (0.5652) closs: 1.0217 (1.0234) time: 4.0918 data: 0.0002 max mem: 54684 -[07:47:36.461145] Epoch: [2] [950/3229] lr: 0.000019 grad_norm: 0.5638 (0.5654) closs: 1.0119 (1.0236) time: 4.0976 data: 0.0002 max mem: 54684 -[07:48:17.376220] Epoch: [2] [960/3229] lr: 0.000019 grad_norm: 0.5859 (0.5654) closs: 1.0316 (1.0237) time: 4.1003 data: 0.0002 max mem: 54684 -[07:48:58.590641] Epoch: [2] [970/3229] lr: 0.000019 grad_norm: 0.5773 (0.5656) closs: 1.0316 (1.0239) time: 4.1064 data: 0.0002 max mem: 54684 -[07:49:39.107966] Epoch: [2] [980/3229] lr: 0.000019 grad_norm: 0.5773 (0.5657) closs: 1.0133 (1.0236) time: 4.0865 data: 0.0002 max mem: 54684 -[07:50:19.601649] Epoch: [2] [990/3229] lr: 0.000019 grad_norm: 0.5666 (0.5655) closs: 1.0012 (1.0233) time: 4.0505 data: 0.0002 max mem: 54684 -[07:51:00.873589] Epoch: [2] [1000/3229] lr: 0.000019 grad_norm: 0.5538 (0.5656) closs: 1.0434 (1.0238) time: 4.0882 data: 0.0002 max mem: 54684 -[07:51:41.761900] Epoch: [2] [1010/3229] lr: 0.000018 grad_norm: 0.5582 (0.5656) closs: 1.0501 (1.0241) time: 4.1079 data: 0.0002 max mem: 54684 -[07:52:21.925593] Epoch: [2] [1020/3229] lr: 0.000018 grad_norm: 0.5700 (0.5658) closs: 1.0206 (1.0236) time: 4.0525 data: 0.0002 max mem: 54684 -[07:53:02.807691] Epoch: [2] [1030/3229] lr: 0.000018 grad_norm: 0.5521 (0.5656) closs: 1.0060 (1.0235) time: 4.0522 data: 0.0002 max mem: 54684 -[07:53:43.361018] Epoch: [2] [1040/3229] lr: 0.000018 grad_norm: 0.5521 (0.5656) closs: 0.9824 (1.0230) time: 4.0717 data: 0.0002 max mem: 54684 -[07:54:24.671047] Epoch: [2] [1050/3229] lr: 0.000018 grad_norm: 0.5499 (0.5655) closs: 1.0254 (1.0232) time: 4.0931 data: 0.0002 max mem: 54684 -[07:55:05.437645] Epoch: [2] [1060/3229] lr: 0.000018 grad_norm: 0.5612 (0.5655) closs: 1.0412 (1.0233) time: 4.1038 data: 0.0002 max mem: 54684 -[07:55:46.843185] Epoch: [2] [1070/3229] lr: 0.000018 grad_norm: 0.5612 (0.5653) closs: 1.0080 (1.0233) time: 4.1085 data: 0.0002 max mem: 54684 -[07:56:27.719997] Epoch: [2] [1080/3229] lr: 0.000018 grad_norm: 0.5554 (0.5654) closs: 1.0295 (1.0234) time: 4.1140 data: 0.0002 max mem: 54684 -[07:57:08.643703] Epoch: [2] [1090/3229] lr: 0.000018 grad_norm: 0.5717 (0.5654) closs: 1.0642 (1.0237) time: 4.0900 data: 0.0002 max mem: 54684 -[07:57:49.855063] Epoch: [2] [1100/3229] lr: 0.000017 grad_norm: 0.5673 (0.5654) closs: 1.0575 (1.0238) time: 4.1067 data: 0.0002 max mem: 54684 -[07:58:31.249901] Epoch: [2] [1110/3229] lr: 0.000017 grad_norm: 0.5670 (0.5656) closs: 1.0418 (1.0240) time: 4.1302 data: 0.0002 max mem: 54684 -[07:59:12.171662] Epoch: [2] [1120/3229] lr: 0.000017 grad_norm: 0.5640 (0.5655) closs: 1.0369 (1.0238) time: 4.1158 data: 0.0002 max mem: 54684 -[07:59:53.140104] Epoch: [2] [1130/3229] lr: 0.000017 grad_norm: 0.5731 (0.5658) closs: 1.0386 (1.0239) time: 4.0944 data: 0.0002 max mem: 54684 -[08:00:32.932831] Epoch: [2] [1140/3229] lr: 0.000017 grad_norm: 0.6018 (0.5660) closs: 0.9926 (1.0234) time: 4.0380 data: 0.0002 max mem: 54684 -[08:01:14.046816] Epoch: [2] [1150/3229] lr: 0.000017 grad_norm: 0.5912 (0.5662) closs: 0.9690 (1.0232) time: 4.0453 data: 0.0002 max mem: 54684 -[08:01:55.095228] Epoch: [2] [1160/3229] lr: 0.000017 grad_norm: 0.6100 (0.5664) closs: 1.0358 (1.0234) time: 4.1081 data: 0.0002 max mem: 54684 -[08:02:35.605443] Epoch: [2] [1170/3229] lr: 0.000017 grad_norm: 0.6100 (0.5663) closs: 1.0243 (1.0233) time: 4.0779 data: 0.0002 max mem: 54684 -[08:03:16.565669] Epoch: [2] [1180/3229] lr: 0.000017 grad_norm: 0.5632 (0.5663) closs: 1.0243 (1.0233) time: 4.0735 data: 0.0002 max mem: 54684 -[08:03:57.204562] Epoch: [2] [1190/3229] lr: 0.000016 grad_norm: 0.5631 (0.5662) closs: 1.0358 (1.0235) time: 4.0799 data: 0.0002 max mem: 54684 -[08:04:38.643319] Epoch: [2] [1200/3229] lr: 0.000016 grad_norm: 0.5646 (0.5663) closs: 1.0465 (1.0236) time: 4.1038 data: 0.0002 max mem: 54684 -[08:05:19.875642] Epoch: [2] [1210/3229] lr: 0.000016 grad_norm: 0.5646 (0.5663) closs: 1.0653 (1.0240) time: 4.1335 data: 0.0002 max mem: 54684 -[08:06:00.968858] Epoch: [2] [1220/3229] lr: 0.000016 grad_norm: 0.5569 (0.5661) closs: 1.0412 (1.0239) time: 4.1162 data: 0.0002 max mem: 54684 -[08:06:42.285568] Epoch: [2] [1230/3229] lr: 0.000016 grad_norm: 0.5760 (0.5663) closs: 1.0333 (1.0241) time: 4.1204 data: 0.0002 max mem: 54684 -[08:07:23.000807] Epoch: [2] [1240/3229] lr: 0.000016 grad_norm: 0.5760 (0.5662) closs: 1.0333 (1.0241) time: 4.1015 data: 0.0002 max mem: 54684 -[08:08:04.264643] Epoch: [2] [1250/3229] lr: 0.000016 grad_norm: 0.5775 (0.5664) closs: 1.0675 (1.0246) time: 4.0989 data: 0.0002 max mem: 54684 -[08:08:44.969858] Epoch: [2] [1260/3229] lr: 0.000016 grad_norm: 0.5808 (0.5663) closs: 1.0700 (1.0245) time: 4.0984 data: 0.0002 max mem: 54684 -[08:09:25.973545] Epoch: [2] [1270/3229] lr: 0.000016 grad_norm: 0.5808 (0.5664) closs: 1.0291 (1.0246) time: 4.0854 data: 0.0002 max mem: 54684 -[08:10:06.727519] Epoch: [2] [1280/3229] lr: 0.000015 grad_norm: 0.5845 (0.5664) closs: 1.0281 (1.0244) time: 4.0878 data: 0.0002 max mem: 54684 -[08:10:47.690909] Epoch: [2] [1290/3229] lr: 0.000015 grad_norm: 0.5669 (0.5663) closs: 1.0443 (1.0248) time: 4.0858 data: 0.0002 max mem: 54684 -[08:11:28.781544] Epoch: [2] [1300/3229] lr: 0.000015 grad_norm: 0.5669 (0.5663) closs: 1.0462 (1.0250) time: 4.1026 data: 0.0002 max mem: 54684 -[08:12:09.710793] Epoch: [2] [1310/3229] lr: 0.000015 grad_norm: 0.5716 (0.5664) closs: 1.0190 (1.0248) time: 4.1009 data: 0.0002 max mem: 54684 -[08:12:50.835464] Epoch: [2] [1320/3229] lr: 0.000015 grad_norm: 0.5617 (0.5662) closs: 1.0010 (1.0248) time: 4.1026 data: 0.0002 max mem: 54684 -[08:13:31.399909] Epoch: [2] [1330/3229] lr: 0.000015 grad_norm: 0.5617 (0.5662) closs: 1.0131 (1.0245) time: 4.0844 data: 0.0002 max mem: 54684 -[08:14:11.778149] Epoch: [2] [1340/3229] lr: 0.000015 grad_norm: 0.5725 (0.5661) closs: 0.9745 (1.0241) time: 4.0471 data: 0.0002 max mem: 54684 -[08:14:52.478894] Epoch: [2] [1350/3229] lr: 0.000015 grad_norm: 0.5570 (0.5660) closs: 0.9745 (1.0238) time: 4.0539 data: 0.0002 max mem: 54684 -[08:15:33.588932] Epoch: [2] [1360/3229] lr: 0.000015 grad_norm: 0.5678 (0.5661) closs: 0.9983 (1.0241) time: 4.0905 data: 0.0002 max mem: 54684 -[08:16:14.787838] Epoch: [2] [1370/3229] lr: 0.000015 grad_norm: 0.5731 (0.5662) closs: 1.0447 (1.0241) time: 4.1154 data: 0.0002 max mem: 54684 -[08:16:55.897936] Epoch: [2] [1380/3229] lr: 0.000014 grad_norm: 0.5857 (0.5663) closs: 1.0447 (1.0246) time: 4.1154 data: 0.0002 max mem: 54684 -[08:17:36.835149] Epoch: [2] [1390/3229] lr: 0.000014 grad_norm: 0.5857 (0.5664) closs: 1.0514 (1.0247) time: 4.1023 data: 0.0002 max mem: 54684 -[08:18:18.203506] Epoch: [2] [1400/3229] lr: 0.000014 grad_norm: 0.5794 (0.5665) closs: 1.0346 (1.0247) time: 4.1152 data: 0.0002 max mem: 54684 -[08:18:58.505738] Epoch: [2] [1410/3229] lr: 0.000014 grad_norm: 0.5650 (0.5664) closs: 1.0340 (1.0245) time: 4.0835 data: 0.0002 max mem: 54684 -[08:19:39.586158] Epoch: [2] [1420/3229] lr: 0.000014 grad_norm: 0.5650 (0.5665) closs: 1.0154 (1.0246) time: 4.0691 data: 0.0002 max mem: 54684 -[08:20:20.294950] Epoch: [2] [1430/3229] lr: 0.000014 grad_norm: 0.5863 (0.5666) closs: 1.0423 (1.0247) time: 4.0894 data: 0.0002 max mem: 54684 -[08:21:01.589354] Epoch: [2] [1440/3229] lr: 0.000014 grad_norm: 0.5637 (0.5665) closs: 1.0237 (1.0245) time: 4.1001 data: 0.0002 max mem: 54684 -[08:21:42.141787] Epoch: [2] [1450/3229] lr: 0.000014 grad_norm: 0.5447 (0.5664) closs: 1.0227 (1.0243) time: 4.0923 data: 0.0002 max mem: 54684 -[08:22:22.256203] Epoch: [2] [1460/3229] lr: 0.000014 grad_norm: 0.5513 (0.5663) closs: 0.9757 (1.0240) time: 4.0333 data: 0.0002 max mem: 54684 -[08:23:03.179340] Epoch: [2] [1470/3229] lr: 0.000014 grad_norm: 0.5653 (0.5663) closs: 0.9796 (1.0239) time: 4.0518 data: 0.0002 max mem: 54684 -[08:23:44.284317] Epoch: [2] [1480/3229] lr: 0.000014 grad_norm: 0.5706 (0.5664) closs: 1.0076 (1.0239) time: 4.1013 data: 0.0002 max mem: 54684 -[08:24:25.519566] Epoch: [2] [1490/3229] lr: 0.000013 grad_norm: 0.5768 (0.5664) closs: 1.0317 (1.0240) time: 4.1169 data: 0.0002 max mem: 54684 -[08:25:06.727554] Epoch: [2] [1500/3229] lr: 0.000013 grad_norm: 0.5675 (0.5664) closs: 1.0317 (1.0240) time: 4.1221 data: 0.0002 max mem: 54684 -[08:25:47.335026] Epoch: [2] [1510/3229] lr: 0.000013 grad_norm: 0.5676 (0.5664) closs: 1.0193 (1.0239) time: 4.0907 data: 0.0002 max mem: 54684 -[08:26:28.009053] Epoch: [2] [1520/3229] lr: 0.000013 grad_norm: 0.5730 (0.5664) closs: 1.0532 (1.0241) time: 4.0640 data: 0.0002 max mem: 54684 -[08:27:08.283681] Epoch: [2] [1530/3229] lr: 0.000013 grad_norm: 0.5565 (0.5662) closs: 1.0299 (1.0239) time: 4.0474 data: 0.0002 max mem: 54684 -[08:27:49.232202] Epoch: [2] [1540/3229] lr: 0.000013 grad_norm: 0.5565 (0.5661) closs: 1.0269 (1.0240) time: 4.0611 data: 0.0002 max mem: 54684 -[08:28:30.229388] Epoch: [2] [1550/3229] lr: 0.000013 grad_norm: 0.5616 (0.5662) closs: 1.0446 (1.0241) time: 4.0972 data: 0.0002 max mem: 54684 -[08:29:11.247931] Epoch: [2] [1560/3229] lr: 0.000013 grad_norm: 0.5649 (0.5661) closs: 1.0224 (1.0240) time: 4.1007 data: 0.0002 max mem: 54684 -[08:29:51.889980] Epoch: [2] [1570/3229] lr: 0.000013 grad_norm: 0.5717 (0.5662) closs: 1.0066 (1.0243) time: 4.0830 data: 0.0002 max mem: 54684 -[08:30:32.727479] Epoch: [2] [1580/3229] lr: 0.000013 grad_norm: 0.5624 (0.5661) closs: 1.0446 (1.0241) time: 4.0739 data: 0.0002 max mem: 54684 -[08:31:13.949901] Epoch: [2] [1590/3229] lr: 0.000012 grad_norm: 0.5624 (0.5662) closs: 0.9921 (1.0243) time: 4.1029 data: 0.0002 max mem: 54684 -[08:31:54.201566] Epoch: [2] [1600/3229] lr: 0.000012 grad_norm: 0.5771 (0.5663) closs: 0.9877 (1.0241) time: 4.0736 data: 0.0002 max mem: 54684 -[08:32:34.735954] Epoch: [2] [1610/3229] lr: 0.000012 grad_norm: 0.5812 (0.5664) closs: 0.9910 (1.0241) time: 4.0392 data: 0.0002 max mem: 54684 -[08:33:16.428448] Epoch: [2] [1620/3229] lr: 0.000012 grad_norm: 0.5737 (0.5665) closs: 1.0327 (1.0241) time: 4.1113 data: 0.0002 max mem: 54684 -[08:33:57.468499] Epoch: [2] [1630/3229] lr: 0.000012 grad_norm: 0.5634 (0.5664) closs: 1.0435 (1.0241) time: 4.1366 data: 0.0002 max mem: 54684 -[08:34:38.562935] Epoch: [2] [1640/3229] lr: 0.000012 grad_norm: 0.5634 (0.5664) closs: 1.0311 (1.0243) time: 4.1067 data: 0.0002 max mem: 54684 -[08:35:19.378294] Epoch: [2] [1650/3229] lr: 0.000012 grad_norm: 0.5610 (0.5663) closs: 1.0311 (1.0242) time: 4.0954 data: 0.0002 max mem: 54684 -[08:36:00.271634] Epoch: [2] [1660/3229] lr: 0.000012 grad_norm: 0.5621 (0.5664) closs: 0.9766 (1.0241) time: 4.0854 data: 0.0002 max mem: 54684 -[08:36:41.474243] Epoch: [2] [1670/3229] lr: 0.000012 grad_norm: 0.5854 (0.5666) closs: 1.0585 (1.0243) time: 4.1047 data: 0.0002 max mem: 54684 -[08:37:22.515951] Epoch: [2] [1680/3229] lr: 0.000012 grad_norm: 0.5781 (0.5666) closs: 1.0171 (1.0241) time: 4.1122 data: 0.0002 max mem: 54684 -[08:38:02.968774] Epoch: [2] [1690/3229] lr: 0.000012 grad_norm: 0.5756 (0.5666) closs: 1.0070 (1.0242) time: 4.0747 data: 0.0002 max mem: 54684 -[08:38:44.713956] Epoch: [2] [1700/3229] lr: 0.000012 grad_norm: 0.5763 (0.5666) closs: 1.0158 (1.0242) time: 4.1098 data: 0.0002 max mem: 54684 -[08:39:25.060084] Epoch: [2] [1710/3229] lr: 0.000011 grad_norm: 0.5616 (0.5665) closs: 1.0406 (1.0243) time: 4.1045 data: 0.0002 max mem: 54684 -[08:40:05.596502] Epoch: [2] [1720/3229] lr: 0.000011 grad_norm: 0.5514 (0.5665) closs: 1.0451 (1.0243) time: 4.0441 data: 0.0002 max mem: 54684 -[08:40:46.724610] Epoch: [2] [1730/3229] lr: 0.000011 grad_norm: 0.5627 (0.5665) closs: 1.0642 (1.0244) time: 4.0832 data: 0.0002 max mem: 54684 -[08:41:27.461413] Epoch: [2] [1740/3229] lr: 0.000011 grad_norm: 0.5660 (0.5666) closs: 1.0424 (1.0243) time: 4.0932 data: 0.0002 max mem: 54684 -[08:42:08.157448] Epoch: [2] [1750/3229] lr: 0.000011 grad_norm: 0.5676 (0.5665) closs: 1.0295 (1.0244) time: 4.0716 data: 0.0002 max mem: 54684 -[08:42:48.966233] Epoch: [2] [1760/3229] lr: 0.000011 grad_norm: 0.5615 (0.5666) closs: 1.0657 (1.0245) time: 4.0752 data: 0.0002 max mem: 54684 -[08:43:30.179827] Epoch: [2] [1770/3229] lr: 0.000011 grad_norm: 0.5615 (0.5667) closs: 1.0207 (1.0247) time: 4.1010 data: 0.0002 max mem: 54684 -[08:44:11.623138] Epoch: [2] [1780/3229] lr: 0.000011 grad_norm: 0.5781 (0.5668) closs: 1.0357 (1.0247) time: 4.1328 data: 0.0002 max mem: 54684 -[08:44:52.356211] Epoch: [2] [1790/3229] lr: 0.000011 grad_norm: 0.5796 (0.5668) closs: 1.0357 (1.0246) time: 4.1088 data: 0.0002 max mem: 54684 -[08:45:33.510164] Epoch: [2] [1800/3229] lr: 0.000011 grad_norm: 0.5773 (0.5670) closs: 1.0281 (1.0247) time: 4.0943 data: 0.0002 max mem: 54684 -[08:46:14.313566] Epoch: [2] [1810/3229] lr: 0.000011 grad_norm: 0.5773 (0.5670) closs: 1.0281 (1.0246) time: 4.0978 data: 0.0002 max mem: 54684 -[08:46:55.460563] Epoch: [2] [1820/3229] lr: 0.000011 grad_norm: 0.5854 (0.5671) closs: 1.0385 (1.0247) time: 4.0975 data: 0.0002 max mem: 54684 -[08:47:36.509580] Epoch: [2] [1830/3229] lr: 0.000011 grad_norm: 0.5626 (0.5671) closs: 1.0241 (1.0246) time: 4.1097 data: 0.0002 max mem: 54684 -[08:48:17.677647] Epoch: [2] [1840/3229] lr: 0.000010 grad_norm: 0.5677 (0.5672) closs: 1.0266 (1.0247) time: 4.1108 data: 0.0002 max mem: 54684 -[08:48:57.971461] Epoch: [2] [1850/3229] lr: 0.000010 grad_norm: 0.5741 (0.5672) closs: 1.0266 (1.0246) time: 4.0730 data: 0.0002 max mem: 54684 -[08:49:38.910848] Epoch: [2] [1860/3229] lr: 0.000010 grad_norm: 0.5630 (0.5673) closs: 1.0133 (1.0246) time: 4.0616 data: 0.0002 max mem: 54684 -[08:50:19.579669] Epoch: [2] [1870/3229] lr: 0.000010 grad_norm: 0.5608 (0.5674) closs: 1.0030 (1.0243) time: 4.0803 data: 0.0002 max mem: 54684 -[08:51:00.472658] Epoch: [2] [1880/3229] lr: 0.000010 grad_norm: 0.5605 (0.5673) closs: 1.0132 (1.0243) time: 4.0780 data: 0.0002 max mem: 54684 -[08:51:41.817338] Epoch: [2] [1890/3229] lr: 0.000010 grad_norm: 0.5564 (0.5675) closs: 1.0360 (1.0245) time: 4.1118 data: 0.0002 max mem: 54684 -[08:52:22.696669] Epoch: [2] [1900/3229] lr: 0.000010 grad_norm: 0.5788 (0.5675) closs: 1.0678 (1.0246) time: 4.1111 data: 0.0002 max mem: 54684 -[08:53:04.048673] Epoch: [2] [1910/3229] lr: 0.000010 grad_norm: 0.5831 (0.5676) closs: 1.0454 (1.0247) time: 4.1115 data: 0.0002 max mem: 54684 -[08:53:44.004385] Epoch: [2] [1920/3229] lr: 0.000010 grad_norm: 0.5480 (0.5675) closs: 1.0063 (1.0245) time: 4.0653 data: 0.0002 max mem: 54684 -[08:54:24.721597] Epoch: [2] [1930/3229] lr: 0.000010 grad_norm: 0.5612 (0.5676) closs: 0.9927 (1.0243) time: 4.0336 data: 0.0002 max mem: 54684 -[08:55:04.941963] Epoch: [2] [1940/3229] lr: 0.000010 grad_norm: 0.5813 (0.5675) closs: 1.0320 (1.0241) time: 4.0468 data: 0.0002 max mem: 54684 -[08:55:45.992823] Epoch: [2] [1950/3229] lr: 0.000010 grad_norm: 0.5605 (0.5675) closs: 1.0352 (1.0240) time: 4.0635 data: 0.0002 max mem: 54684 -[08:56:26.486944] Epoch: [2] [1960/3229] lr: 0.000010 grad_norm: 0.5619 (0.5674) closs: 1.0194 (1.0241) time: 4.0772 data: 0.0002 max mem: 54684 -[08:57:07.906961] Epoch: [2] [1970/3229] lr: 0.000009 grad_norm: 0.5508 (0.5674) closs: 1.0129 (1.0241) time: 4.0956 data: 0.0002 max mem: 54684 -[08:57:49.115637] Epoch: [2] [1980/3229] lr: 0.000009 grad_norm: 0.5616 (0.5675) closs: 1.0654 (1.0244) time: 4.1314 data: 0.0002 max mem: 54684 -[08:58:30.163889] Epoch: [2] [1990/3229] lr: 0.000009 grad_norm: 0.5789 (0.5676) closs: 1.0512 (1.0244) time: 4.1128 data: 0.0002 max mem: 54684 -[08:59:11.320270] Epoch: [2] [2000/3229] lr: 0.000009 grad_norm: 0.5798 (0.5678) closs: 1.0228 (1.0245) time: 4.1102 data: 0.0002 max mem: 54684 -[08:59:52.707044] Epoch: [2] [2010/3229] lr: 0.000009 grad_norm: 0.5890 (0.5679) closs: 1.0398 (1.0246) time: 4.1271 data: 0.0002 max mem: 54684 -[09:00:34.019560] Epoch: [2] [2020/3229] lr: 0.000009 grad_norm: 0.5755 (0.5679) closs: 1.0402 (1.0248) time: 4.1349 data: 0.0002 max mem: 54684 -[09:01:14.914969] Epoch: [2] [2030/3229] lr: 0.000009 grad_norm: 0.5881 (0.5682) closs: 1.0402 (1.0248) time: 4.1103 data: 0.0002 max mem: 54684 -[09:01:56.056629] Epoch: [2] [2040/3229] lr: 0.000009 grad_norm: 0.5943 (0.5683) closs: 1.0367 (1.0249) time: 4.1018 data: 0.0002 max mem: 54684 -[09:02:36.707902] Epoch: [2] [2050/3229] lr: 0.000009 grad_norm: 0.5845 (0.5684) closs: 1.0345 (1.0248) time: 4.0896 data: 0.0002 max mem: 54684 -[09:03:17.997205] Epoch: [2] [2060/3229] lr: 0.000009 grad_norm: 0.5942 (0.5686) closs: 1.0345 (1.0249) time: 4.0970 data: 0.0002 max mem: 54684 -[09:03:58.576728] Epoch: [2] [2070/3229] lr: 0.000009 grad_norm: 0.5693 (0.5684) closs: 0.9970 (1.0246) time: 4.0934 data: 0.0002 max mem: 54684 -[09:04:39.774170] Epoch: [2] [2080/3229] lr: 0.000009 grad_norm: 0.5613 (0.5685) closs: 1.0260 (1.0248) time: 4.0888 data: 0.0002 max mem: 54684 -[09:05:20.813724] Epoch: [2] [2090/3229] lr: 0.000009 grad_norm: 0.5638 (0.5684) closs: 1.0313 (1.0248) time: 4.1118 data: 0.0002 max mem: 54684 -[09:06:01.815018] Epoch: [2] [2100/3229] lr: 0.000009 grad_norm: 0.5638 (0.5684) closs: 1.0313 (1.0249) time: 4.1020 data: 0.0002 max mem: 54684 -[09:06:41.722289] Epoch: [2] [2110/3229] lr: 0.000009 grad_norm: 0.5757 (0.5684) closs: 1.0256 (1.0247) time: 4.0454 data: 0.0002 max mem: 54684 -[09:07:22.224103] Epoch: [2] [2120/3229] lr: 0.000008 grad_norm: 0.5549 (0.5683) closs: 1.0256 (1.0247) time: 4.0204 data: 0.0002 max mem: 54684 -[09:08:02.949927] Epoch: [2] [2130/3229] lr: 0.000008 grad_norm: 0.5599 (0.5683) closs: 1.0180 (1.0246) time: 4.0613 data: 0.0002 max mem: 54684 -[09:08:43.598145] Epoch: [2] [2140/3229] lr: 0.000008 grad_norm: 0.5644 (0.5682) closs: 1.0041 (1.0246) time: 4.0686 data: 0.0002 max mem: 54684 -[09:09:24.526465] Epoch: [2] [2150/3229] lr: 0.000008 grad_norm: 0.5686 (0.5683) closs: 1.0608 (1.0248) time: 4.0788 data: 0.0002 max mem: 54684 -[09:10:05.683146] Epoch: [2] [2160/3229] lr: 0.000008 grad_norm: 0.5769 (0.5684) closs: 1.0634 (1.0250) time: 4.1042 data: 0.0002 max mem: 54684 -[09:10:46.714264] Epoch: [2] [2170/3229] lr: 0.000008 grad_norm: 0.5744 (0.5684) closs: 1.0447 (1.0250) time: 4.1093 data: 0.0002 max mem: 54684 -[09:11:27.676475] Epoch: [2] [2180/3229] lr: 0.000008 grad_norm: 0.5801 (0.5686) closs: 1.0257 (1.0251) time: 4.0996 data: 0.0002 max mem: 54684 -[09:12:08.191970] Epoch: [2] [2190/3229] lr: 0.000008 grad_norm: 0.5781 (0.5687) closs: 1.0314 (1.0250) time: 4.0738 data: 0.0002 max mem: 54684 -[09:12:48.089750] Epoch: [2] [2200/3229] lr: 0.000008 grad_norm: 0.5526 (0.5685) closs: 0.9832 (1.0247) time: 4.0206 data: 0.0002 max mem: 54684 -[09:13:29.107901] Epoch: [2] [2210/3229] lr: 0.000008 grad_norm: 0.5469 (0.5685) closs: 0.9921 (1.0247) time: 4.0457 data: 0.0002 max mem: 54684 -[09:14:10.541589] Epoch: [2] [2220/3229] lr: 0.000008 grad_norm: 0.5703 (0.5686) closs: 1.0479 (1.0249) time: 4.1225 data: 0.0002 max mem: 54684 -[09:14:51.639545] Epoch: [2] [2230/3229] lr: 0.000008 grad_norm: 0.5834 (0.5687) closs: 1.0858 (1.0251) time: 4.1265 data: 0.0002 max mem: 54684 -[09:15:32.541681] Epoch: [2] [2240/3229] lr: 0.000008 grad_norm: 0.5796 (0.5688) closs: 1.0672 (1.0252) time: 4.0999 data: 0.0002 max mem: 54684 -[09:16:13.933486] Epoch: [2] [2250/3229] lr: 0.000008 grad_norm: 0.5624 (0.5688) closs: 1.0850 (1.0254) time: 4.1146 data: 0.0002 max mem: 54684 -[09:16:55.107667] Epoch: [2] [2260/3229] lr: 0.000008 grad_norm: 0.5624 (0.5688) closs: 1.0799 (1.0254) time: 4.1282 data: 0.0002 max mem: 54684 -[09:17:36.230920] Epoch: [2] [2270/3229] lr: 0.000008 grad_norm: 0.5839 (0.5690) closs: 1.0627 (1.0256) time: 4.1148 data: 0.0002 max mem: 54684 -[09:18:16.446570] Epoch: [2] [2280/3229] lr: 0.000008 grad_norm: 0.5705 (0.5688) closs: 1.0307 (1.0255) time: 4.0669 data: 0.0002 max mem: 54684 -[09:18:57.941452] Epoch: [2] [2290/3229] lr: 0.000008 grad_norm: 0.5705 (0.5689) closs: 1.0202 (1.0255) time: 4.0855 data: 0.0002 max mem: 54684 -[09:19:38.974231] Epoch: [2] [2300/3229] lr: 0.000007 grad_norm: 0.5739 (0.5689) closs: 1.0446 (1.0256) time: 4.1263 data: 0.0002 max mem: 54684 -[09:20:19.518699] Epoch: [2] [2310/3229] lr: 0.000007 grad_norm: 0.5646 (0.5688) closs: 1.0285 (1.0256) time: 4.0788 data: 0.0002 max mem: 54684 -[09:21:00.784491] Epoch: [2] [2320/3229] lr: 0.000007 grad_norm: 0.5741 (0.5690) closs: 1.0014 (1.0254) time: 4.0904 data: 0.0002 max mem: 54684 -[09:21:41.070228] Epoch: [2] [2330/3229] lr: 0.000007 grad_norm: 0.5626 (0.5689) closs: 0.9974 (1.0253) time: 4.0775 data: 0.0002 max mem: 54684 -[09:22:22.069016] Epoch: [2] [2340/3229] lr: 0.000007 grad_norm: 0.5483 (0.5690) closs: 1.0038 (1.0253) time: 4.0642 data: 0.0002 max mem: 54684 -[09:23:02.881018] Epoch: [2] [2350/3229] lr: 0.000007 grad_norm: 0.5861 (0.5690) closs: 1.0259 (1.0253) time: 4.0905 data: 0.0002 max mem: 54684 -[09:23:42.648379] Epoch: [2] [2360/3229] lr: 0.000007 grad_norm: 0.5786 (0.5688) closs: 0.9870 (1.0251) time: 4.0289 data: 0.0002 max mem: 54684 -[09:24:23.453167] Epoch: [2] [2370/3229] lr: 0.000007 grad_norm: 0.5539 (0.5689) closs: 0.9815 (1.0251) time: 4.0285 data: 0.0002 max mem: 54684 -[09:25:04.136256] Epoch: [2] [2380/3229] lr: 0.000007 grad_norm: 0.5700 (0.5688) closs: 0.9940 (1.0251) time: 4.0743 data: 0.0002 max mem: 54684 -[09:25:45.337800] Epoch: [2] [2390/3229] lr: 0.000007 grad_norm: 0.5788 (0.5690) closs: 1.0564 (1.0252) time: 4.0942 data: 0.0002 max mem: 54684 -[09:26:25.739172] Epoch: [2] [2400/3229] lr: 0.000007 grad_norm: 0.5754 (0.5689) closs: 1.0595 (1.0252) time: 4.0801 data: 0.0002 max mem: 54684 -[09:27:05.964659] Epoch: [2] [2410/3229] lr: 0.000007 grad_norm: 0.5509 (0.5688) closs: 1.0436 (1.0250) time: 4.0313 data: 0.0002 max mem: 54684 -[09:27:46.656831] Epoch: [2] [2420/3229] lr: 0.000007 grad_norm: 0.5834 (0.5689) closs: 1.0413 (1.0250) time: 4.0458 data: 0.0002 max mem: 54684 -[09:28:27.493028] Epoch: [2] [2430/3229] lr: 0.000007 grad_norm: 0.5834 (0.5689) closs: 1.0413 (1.0250) time: 4.0764 data: 0.0002 max mem: 54684 -[09:29:08.900562] Epoch: [2] [2440/3229] lr: 0.000007 grad_norm: 0.5791 (0.5690) closs: 1.0370 (1.0250) time: 4.1121 data: 0.0002 max mem: 54684 -[09:29:49.772338] Epoch: [2] [2450/3229] lr: 0.000007 grad_norm: 0.5795 (0.5690) closs: 1.0208 (1.0250) time: 4.1139 data: 0.0002 max mem: 54684 -[09:30:31.137605] Epoch: [2] [2460/3229] lr: 0.000007 grad_norm: 0.5857 (0.5691) closs: 1.0189 (1.0250) time: 4.1118 data: 0.0002 max mem: 54684 -[09:31:12.282940] Epoch: [2] [2470/3229] lr: 0.000007 grad_norm: 0.5806 (0.5691) closs: 1.0523 (1.0251) time: 4.1255 data: 0.0002 max mem: 54684 -[09:31:53.328571] Epoch: [2] [2480/3229] lr: 0.000007 grad_norm: 0.5814 (0.5691) closs: 1.0437 (1.0251) time: 4.1095 data: 0.0002 max mem: 54684 -[09:32:34.487775] Epoch: [2] [2490/3229] lr: 0.000007 grad_norm: 0.5810 (0.5692) closs: 1.0437 (1.0253) time: 4.1102 data: 0.0002 max mem: 54684 -[09:33:14.819132] Epoch: [2] [2500/3229] lr: 0.000007 grad_norm: 0.5729 (0.5691) closs: 1.0522 (1.0252) time: 4.0745 data: 0.0002 max mem: 54684 -[09:33:55.981325] Epoch: [2] [2510/3229] lr: 0.000006 grad_norm: 0.5781 (0.5692) closs: 1.0177 (1.0253) time: 4.0746 data: 0.0002 max mem: 54684 -[09:34:36.645545] Epoch: [2] [2520/3229] lr: 0.000006 grad_norm: 0.5797 (0.5692) closs: 1.0270 (1.0253) time: 4.0913 data: 0.0002 max mem: 54684 -[09:35:17.317073] Epoch: [2] [2530/3229] lr: 0.000006 grad_norm: 0.5780 (0.5692) closs: 0.9929 (1.0253) time: 4.0667 data: 0.0002 max mem: 54684 -[09:35:58.540016] Epoch: [2] [2540/3229] lr: 0.000006 grad_norm: 0.5648 (0.5693) closs: 0.9905 (1.0252) time: 4.0947 data: 0.0002 max mem: 54684 -[09:36:39.741723] Epoch: [2] [2550/3229] lr: 0.000006 grad_norm: 0.6054 (0.5695) closs: 0.9905 (1.0251) time: 4.1212 data: 0.0002 max mem: 54684 -[09:37:21.111863] Epoch: [2] [2560/3229] lr: 0.000006 grad_norm: 0.5947 (0.5695) closs: 1.0207 (1.0253) time: 4.1285 data: 0.0002 max mem: 54684 -[09:38:02.411083] Epoch: [2] [2570/3229] lr: 0.000006 grad_norm: 0.6009 (0.5698) closs: 1.0260 (1.0253) time: 4.1334 data: 0.0002 max mem: 54684 -[09:38:43.003989] Epoch: [2] [2580/3229] lr: 0.000006 grad_norm: 0.6009 (0.5698) closs: 1.0059 (1.0252) time: 4.0945 data: 0.0002 max mem: 54684 -[09:39:23.209991] Epoch: [2] [2590/3229] lr: 0.000006 grad_norm: 0.5756 (0.5697) closs: 0.9988 (1.0251) time: 4.0399 data: 0.0002 max mem: 54684 -[09:40:03.959666] Epoch: [2] [2600/3229] lr: 0.000006 grad_norm: 0.5520 (0.5696) closs: 1.0161 (1.0251) time: 4.0477 data: 0.0002 max mem: 54684 -[09:40:45.247321] Epoch: [2] [2610/3229] lr: 0.000006 grad_norm: 0.5601 (0.5697) closs: 1.0427 (1.0251) time: 4.1018 data: 0.0002 max mem: 54684 -[09:41:26.150398] Epoch: [2] [2620/3229] lr: 0.000006 grad_norm: 0.5791 (0.5698) closs: 1.0075 (1.0251) time: 4.1095 data: 0.0002 max mem: 54684 -[09:42:06.649226] Epoch: [2] [2630/3229] lr: 0.000006 grad_norm: 0.5703 (0.5697) closs: 1.0049 (1.0249) time: 4.0700 data: 0.0002 max mem: 54684 -[09:42:48.023850] Epoch: [2] [2640/3229] lr: 0.000006 grad_norm: 0.5535 (0.5697) closs: 1.0153 (1.0251) time: 4.0936 data: 0.0002 max mem: 54684 -[09:43:29.335232] Epoch: [2] [2650/3229] lr: 0.000006 grad_norm: 0.5492 (0.5698) closs: 1.0620 (1.0252) time: 4.1342 data: 0.0002 max mem: 54684 -[09:44:10.557760] Epoch: [2] [2660/3229] lr: 0.000006 grad_norm: 0.5899 (0.5698) closs: 1.0605 (1.0253) time: 4.1266 data: 0.0002 max mem: 54684 -[09:44:51.395990] Epoch: [2] [2670/3229] lr: 0.000006 grad_norm: 0.5899 (0.5699) closs: 1.0605 (1.0254) time: 4.1030 data: 0.0002 max mem: 54684 -[09:45:32.073560] Epoch: [2] [2680/3229] lr: 0.000006 grad_norm: 0.5913 (0.5699) closs: 1.0339 (1.0254) time: 4.0757 data: 0.0002 max mem: 54684 -[09:46:12.446721] Epoch: [2] [2690/3229] lr: 0.000006 grad_norm: 0.5587 (0.5698) closs: 1.0229 (1.0253) time: 4.0525 data: 0.0002 max mem: 54684 -[09:46:53.342194] Epoch: [2] [2700/3229] lr: 0.000006 grad_norm: 0.5619 (0.5699) closs: 1.0266 (1.0254) time: 4.0634 data: 0.0002 max mem: 54684 -[09:47:34.230607] Epoch: [2] [2710/3229] lr: 0.000006 grad_norm: 0.5640 (0.5699) closs: 1.0674 (1.0255) time: 4.0891 data: 0.0002 max mem: 54684 -[09:48:14.935975] Epoch: [2] [2720/3229] lr: 0.000006 grad_norm: 0.5622 (0.5698) closs: 1.0250 (1.0254) time: 4.0796 data: 0.0002 max mem: 54684 -[09:48:56.405424] Epoch: [2] [2730/3229] lr: 0.000006 grad_norm: 0.5622 (0.5699) closs: 1.0256 (1.0255) time: 4.1087 data: 0.0002 max mem: 54684 -[09:49:37.167283] Epoch: [2] [2740/3229] lr: 0.000006 grad_norm: 0.5708 (0.5699) closs: 1.0323 (1.0255) time: 4.1115 data: 0.0002 max mem: 54684 -[09:50:18.316921] Epoch: [2] [2750/3229] lr: 0.000006 grad_norm: 0.5862 (0.5700) closs: 1.0592 (1.0257) time: 4.0955 data: 0.0002 max mem: 54684 -[09:50:59.109573] Epoch: [2] [2760/3229] lr: 0.000006 grad_norm: 0.5722 (0.5700) closs: 1.0711 (1.0257) time: 4.0970 data: 0.0002 max mem: 54684 -[09:51:40.256779] Epoch: [2] [2770/3229] lr: 0.000006 grad_norm: 0.5834 (0.5701) closs: 1.0126 (1.0257) time: 4.0969 data: 0.0002 max mem: 54684 -[09:52:21.033361] Epoch: [2] [2780/3229] lr: 0.000006 grad_norm: 0.5853 (0.5701) closs: 1.0494 (1.0258) time: 4.0961 data: 0.0002 max mem: 54684 -[09:53:01.853593] Epoch: [2] [2790/3229] lr: 0.000006 grad_norm: 0.5853 (0.5701) closs: 1.0702 (1.0259) time: 4.0798 data: 0.0002 max mem: 54684 -[09:53:42.906057] Epoch: [2] [2800/3229] lr: 0.000006 grad_norm: 0.5772 (0.5701) closs: 1.0605 (1.0260) time: 4.0936 data: 0.0002 max mem: 54684 -[09:54:24.030976] Epoch: [2] [2810/3229] lr: 0.000006 grad_norm: 0.5667 (0.5702) closs: 1.0430 (1.0260) time: 4.1088 data: 0.0002 max mem: 54684 -[09:55:04.890562] Epoch: [2] [2820/3229] lr: 0.000005 grad_norm: 0.5547 (0.5701) closs: 1.0131 (1.0259) time: 4.0992 data: 0.0002 max mem: 54684 -[09:55:45.343232] Epoch: [2] [2830/3229] lr: 0.000005 grad_norm: 0.5592 (0.5701) closs: 1.0131 (1.0258) time: 4.0655 data: 0.0002 max mem: 54684 -[09:56:26.023640] Epoch: [2] [2840/3229] lr: 0.000005 grad_norm: 0.5666 (0.5700) closs: 0.9882 (1.0257) time: 4.0566 data: 0.0002 max mem: 54684 -[09:57:07.375630] Epoch: [2] [2850/3229] lr: 0.000005 grad_norm: 0.5781 (0.5701) closs: 1.0202 (1.0258) time: 4.1016 data: 0.0002 max mem: 54684 -[09:57:47.932124] Epoch: [2] [2860/3229] lr: 0.000005 grad_norm: 0.5781 (0.5701) closs: 1.0452 (1.0258) time: 4.0954 data: 0.0002 max mem: 54684 -[09:58:28.805268] Epoch: [2] [2870/3229] lr: 0.000005 grad_norm: 0.5698 (0.5702) closs: 1.0353 (1.0257) time: 4.0714 data: 0.0002 max mem: 54684 -[09:59:10.101929] Epoch: [2] [2880/3229] lr: 0.000005 grad_norm: 0.5847 (0.5703) closs: 1.0293 (1.0259) time: 4.1084 data: 0.0002 max mem: 54684 -[09:59:51.257363] Epoch: [2] [2890/3229] lr: 0.000005 grad_norm: 0.5847 (0.5703) closs: 1.0182 (1.0256) time: 4.1225 data: 0.0002 max mem: 54684 -[10:00:32.076254] Epoch: [2] [2900/3229] lr: 0.000005 grad_norm: 0.5885 (0.5703) closs: 1.0088 (1.0256) time: 4.0987 data: 0.0002 max mem: 54684 -[10:01:13.043669] Epoch: [2] [2910/3229] lr: 0.000005 grad_norm: 0.5893 (0.5704) closs: 1.0251 (1.0256) time: 4.0892 data: 0.0002 max mem: 54684 -[10:01:54.261319] Epoch: [2] [2920/3229] lr: 0.000005 grad_norm: 0.5820 (0.5704) closs: 1.0368 (1.0257) time: 4.1092 data: 0.0002 max mem: 54684 -[10:02:35.084970] Epoch: [2] [2930/3229] lr: 0.000005 grad_norm: 0.5802 (0.5704) closs: 1.0487 (1.0256) time: 4.1020 data: 0.0002 max mem: 54684 -[10:03:16.214095] Epoch: [2] [2940/3229] lr: 0.000005 grad_norm: 0.5631 (0.5704) closs: 1.0370 (1.0256) time: 4.0976 data: 0.0002 max mem: 54684 -[10:03:56.615821] Epoch: [2] [2950/3229] lr: 0.000005 grad_norm: 0.5883 (0.5704) closs: 1.0335 (1.0255) time: 4.0765 data: 0.0002 max mem: 54684 -[10:04:36.729356] Epoch: [2] [2960/3229] lr: 0.000005 grad_norm: 0.5883 (0.5704) closs: 0.9811 (1.0254) time: 4.0257 data: 0.0002 max mem: 54684 -[10:05:17.405065] Epoch: [2] [2970/3229] lr: 0.000005 grad_norm: 0.6052 (0.5705) closs: 1.0261 (1.0253) time: 4.0394 data: 0.0002 max mem: 54684 -[10:05:58.588228] Epoch: [2] [2980/3229] lr: 0.000005 grad_norm: 0.5752 (0.5705) closs: 1.0282 (1.0253) time: 4.0929 data: 0.0002 max mem: 54684 -[10:06:39.618475] Epoch: [2] [2990/3229] lr: 0.000005 grad_norm: 0.5626 (0.5706) closs: 1.0620 (1.0255) time: 4.1106 data: 0.0002 max mem: 54684 -[10:07:20.501679] Epoch: [2] [3000/3229] lr: 0.000005 grad_norm: 0.5966 (0.5707) closs: 1.0762 (1.0255) time: 4.0956 data: 0.0002 max mem: 54684 -[10:08:00.845965] Epoch: [2] [3010/3229] lr: 0.000005 grad_norm: 0.6088 (0.5708) closs: 0.9854 (1.0254) time: 4.0613 data: 0.0002 max mem: 54684 -[10:08:41.987050] Epoch: [2] [3020/3229] lr: 0.000005 grad_norm: 0.5670 (0.5708) closs: 1.0015 (1.0254) time: 4.0742 data: 0.0002 max mem: 54684 -[10:09:23.349997] Epoch: [2] [3030/3229] lr: 0.000005 grad_norm: 0.5670 (0.5708) closs: 1.0505 (1.0256) time: 4.1251 data: 0.0002 max mem: 54684 -[10:10:04.141407] Epoch: [2] [3040/3229] lr: 0.000005 grad_norm: 0.5687 (0.5708) closs: 1.0496 (1.0255) time: 4.1076 data: 0.0002 max mem: 54684 -[10:10:45.555383] Epoch: [2] [3050/3229] lr: 0.000005 grad_norm: 0.5653 (0.5708) closs: 1.0073 (1.0255) time: 4.1102 data: 0.0002 max mem: 54684 -[10:11:26.441571] Epoch: [2] [3060/3229] lr: 0.000005 grad_norm: 0.5727 (0.5708) closs: 1.0073 (1.0255) time: 4.1149 data: 0.0002 max mem: 54684 -[10:12:07.543100] Epoch: [2] [3070/3229] lr: 0.000005 grad_norm: 0.5985 (0.5709) closs: 1.0305 (1.0254) time: 4.0993 data: 0.0002 max mem: 54684 -[10:12:48.448900] Epoch: [2] [3080/3229] lr: 0.000005 grad_norm: 0.5755 (0.5709) closs: 1.0231 (1.0253) time: 4.1003 data: 0.0002 max mem: 54684 -[10:13:29.389636] Epoch: [2] [3090/3229] lr: 0.000005 grad_norm: 0.5588 (0.5708) closs: 1.0231 (1.0254) time: 4.0923 data: 0.0002 max mem: 54684 -[10:14:10.537731] Epoch: [2] [3100/3229] lr: 0.000005 grad_norm: 0.5669 (0.5709) closs: 1.0588 (1.0256) time: 4.1044 data: 0.0002 max mem: 54684 -[10:14:50.940345] Epoch: [2] [3110/3229] lr: 0.000005 grad_norm: 0.5674 (0.5708) closs: 1.0216 (1.0253) time: 4.0775 data: 0.0002 max mem: 54684 -[10:15:31.303846] Epoch: [2] [3120/3229] lr: 0.000005 grad_norm: 0.5456 (0.5708) closs: 1.0082 (1.0254) time: 4.0382 data: 0.0002 max mem: 54684 -[10:16:11.884304] Epoch: [2] [3130/3229] lr: 0.000005 grad_norm: 0.5562 (0.5707) closs: 1.0350 (1.0254) time: 4.0471 data: 0.0002 max mem: 54684 -[10:16:52.052398] Epoch: [2] [3140/3229] lr: 0.000005 grad_norm: 0.5667 (0.5708) closs: 1.0199 (1.0254) time: 4.0374 data: 0.0002 max mem: 54684 -[10:17:32.075073] Epoch: [2] [3150/3229] lr: 0.000005 grad_norm: 0.5711 (0.5707) closs: 0.9967 (1.0252) time: 4.0095 data: 0.0002 max mem: 54684 -[10:18:13.395303] Epoch: [2] [3160/3229] lr: 0.000005 grad_norm: 0.5789 (0.5708) closs: 0.9967 (1.0252) time: 4.0671 data: 0.0002 max mem: 54684 -[10:18:53.970480] Epoch: [2] [3170/3229] lr: 0.000005 grad_norm: 0.5914 (0.5708) closs: 1.0415 (1.0252) time: 4.0947 data: 0.0002 max mem: 54684 -[10:19:34.476943] Epoch: [2] [3180/3229] lr: 0.000005 grad_norm: 0.5590 (0.5708) closs: 1.0227 (1.0252) time: 4.0540 data: 0.0002 max mem: 54684 -[10:20:15.519690] Epoch: [2] [3190/3229] lr: 0.000005 grad_norm: 0.5594 (0.5708) closs: 1.0227 (1.0252) time: 4.0774 data: 0.0002 max mem: 54684 -[10:20:56.962083] Epoch: [2] [3200/3229] lr: 0.000005 grad_norm: 0.5860 (0.5709) closs: 1.0387 (1.0253) time: 4.1242 data: 0.0003 max mem: 54684 -[10:21:38.056015] Epoch: [2] [3210/3229] lr: 0.000005 grad_norm: 0.5810 (0.5709) closs: 1.0467 (1.0253) time: 4.1268 data: 0.0003 max mem: 54684 -[10:22:17.979929] Epoch: [2] [3220/3229] lr: 0.000005 grad_norm: 0.5591 (0.5708) closs: 1.0067 (1.0252) time: 4.0508 data: 0.0001 max mem: 54684 -[10:22:51.264502] Epoch: [2] Total time: 3:40:11 -[10:22:51.265330] Averaged stats: lr: 0.000005 grad_norm: 0.5545 (0.5708) closs: 1.0458 (1.0262) -[10:22:51.602101] model saved -[10:22:53.305325] optimizer saved -[10:22:53.305942] other rank-common saved -[10:22:53.310857] rank-specific saved -[10:22:53.311056] Training time 11:00:25 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.model.pth deleted file mode 100644 index 76baca48914c9ea8944b781bd72bdcc53122ffd6..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e0e65a2cbc92bd926b57acf3f986bccab80793cafe0e2e8b0f18566fafb58cc9 -size 90930987 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 60fcd5447400048b70bd12781a7ac2ff3f591896..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e474f0071285386afbebdb6c6fb925be5ef3e9f3349a22c36fbffe48d77ea7c9 -size 204320439 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.other.pth deleted file mode 100644 index 75738418626b6e3173bda41a86ec5da7dc593ab8..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ac78a8280454755c93a56b940b311201a0a8911b1b5f05c35d04486388b998fe -size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.model.pth deleted file mode 100644 index 111c6c89cd3eab9b2e2e78b3c93b55e42b7179cc..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d8fc318f143f893d3305b5abda9853ef6ba090d582b39122bfe0ef61ef2620d5 -size 90930987 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index b4645edcecd610180b6946730ba37462aaf675f3..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b2fdf6406551f7380e09fda1858fac263c8d47f335664c8f1f26058d416bb70e -size 204320439 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.other.pth deleted file mode 100644 index 50bd574ffb5cad6148a8e51c9ab05e3d80027b40..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0f168ff92d4b19bd4c714a344bfcdc36e1203ddcff5c9504a63acf3bdfb043b3 -size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.model.pth deleted file mode 100644 index d9e5d36c34d582a7b6e6b39a2d4a9186a4e338d4..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c07fd4364e9b806715c985cd4a4905c02bc2c67014227d4322effc78e87d42f4 -size 90930987 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 92472b6308ca7910f824edde08c94c6fc4473912..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9b3ffb824593175223cc1cc760088a45e55bb33fcdd62e37e42249c03c9c9d36 -size 204320439 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.other.pth deleted file mode 100644 index d31ff745058632b1ad1e193fae6bdeb5d0a8a172..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d63f55f7f82591356144cd8ce6e7acba5e908efe4759350a3af15f31d4015567 -size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.model.pth deleted file mode 100644 index f9dfdab397433ffafcd9aaaa95d61dea3ffa4e9b..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4092488fa46d10b7694923c74532faf866919d6c39c9a6d52bc4ab1e505bcb45 -size 90930987 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 42bf93bb5ab71a68c015eabd2d345ca309073368..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8abcacc8b53a2d27740c8543bb5adcb88e2ab89e9aa5cd403e30ed2d9563dc1c -size 204320439 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.other.pth deleted file mode 100644 index 8c07662de2e6d6147924de4c5427234211f7d032..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:27afc0d923f1973f23206d7bbc02b246e988a3fe94eebdedc4e14b31fa3b5801 -size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/log.txt b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/log.txt deleted file mode 100644 index 63c991be766a6e7b09f62eb27253c47ba558cf0e..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/log.txt +++ /dev/null @@ -1,4 +0,0 @@ -{"train_lr": 2.49692118226601e-05, "train_grad_norm": 1.03953114467595, "train_closs": 0.8988287092961849, "epoch": 0, "val_lr": 2.49692118226601e-05, "val_grad_norm": 1.03953114467595, "val_closs": 0.8988287092961849} -{"train_lr": 4.6114274981403966e-05, "train_grad_norm": 0.9151975991837497, "train_closs": 0.854513919164468, "epoch": 1, "val_lr": 4.6114274981403966e-05, "val_grad_norm": 0.9151975991837497, "val_closs": 0.854513919164468} -{"train_lr": 2.751385467980297e-05, "train_grad_norm": 0.8807328767670787, "train_closs": 0.8423879001418064, "epoch": 2, "val_lr": 2.751385467980297e-05, "val_grad_norm": 0.8807328767670787, "val_closs": 0.8423879001418064} -{"train_lr": 8.899579698398978e-06, "train_grad_norm": 0.9092244758394551, "train_closs": 0.8364003172804937, "epoch": 3, "val_lr": 8.899579698398978e-06, "val_grad_norm": 0.9092244758394551, "val_closs": 0.8364003172804937} diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/output.log b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/output.log deleted file mode 100644 index 3ce76611f6b1af197c27e3fedae6374f65ada327..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/output.log +++ /dev/null @@ -1,648 +0,0 @@ -WARNING:torch.distributed.run: -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -| distributed init (rank 1): env://, gpu 1 -| distributed init (rank 6): env://, gpu 6 -| distributed init (rank 0): env://, gpu 0 -| distributed init (rank 5): env://, gpu 5 -| distributed init (rank 7): env://, gpu 7 -| distributed init (rank 3): env://, gpu 3 -| distributed init (rank 4): env://, gpu 4 -| distributed init (rank 2): env://, gpu 2 -[05:56:44.962935] > initializing model parallel with size 1 -[05:56:44.963012] > initializing ddp with size 8 -[05:56:44.963019] > initializing pipeline with size 1 -[05:56:45.119503] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory -[05:56:45.119584] Namespace(batch_size=8, -accum_iter=1, -llama_type='llama_peft', -llama_config=['../checkpoints/llama2/Llama-2-13b/params.json', -'configs/model/finetune/sg/llamaPeft_normBiasLora.json'], -no_visual=True, -tokenizer_path='../checkpoints/llama2/Llama-2-13b/tokenizer.model', -pretrained_path='../checkpoints/llama2/Llama-2-13b/', -pretrained_type='meta_ori', -weight_decay=0.02, -lr=5e-05, -min_lr=5e-06, -epochs=4, -warmup_epochs=1.0, -clip_grad=2, -max_words=512, -dialog=False, -data_config='configs/data/finetune/sg/alpaca.yaml', -output_dir='output/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B', -log_dir='./output_dir', -save_interval=1, -only_save_trainable=True, -device='cuda', -seed=0, -resume='', -num_workers=24, -pin_mem=True, -world_size=8, -local_rank=-1, -dist_on_itp=False, -dist_url='env://', -model_parallel_size=1, -data_parallel='sdp', -precision='bf16', -checkpointing=True, -quant=True, -rank=0, -gpu=0, -distributed=True, -dist_backend='nccl') -[05:56:45.120384] Start initialization. -[05:56:45.120416] ## Processing on RANK 0. -[05:56:45.129261] Model Args: - ModelArgs(dim=5120, n_layers=40, n_heads=40, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=16, bias_tuning=True) -[05:58:19.701205] Model is Peft: True -[05:58:19.709591] Trainable parameter count : 65131520 (local rank), 65131520 (all). -[05:58:19.822258] ## Load pretrained from ../checkpoints/llama2/Llama-2-13b/ -[05:58:52.688570] ## Quantizing model to 4bit! - Qunatization Process: 0%| | 0/1047 [00:00 -[06:27:21.584239] Start training for 4 epochs -[06:27:21.595470] log_dir: ./output_dir -[06:27:27.291530] Epoch: [0] [0/812] lr: 0.000000 grad_norm: 1.9510 (1.9510) closs: 1.0616 (1.0616) time: 5.6952 data: 1.5912 max mem: 18825 -[06:27:48.233874] Epoch: [0] [10/812] lr: 0.000001 grad_norm: 2.1544 (2.1439) closs: 1.0616 (1.0547) time: 2.4215 data: 0.1448 max mem: 28042 -[06:28:09.492825] Epoch: [0] [20/812] lr: 0.000001 grad_norm: 2.0091 (2.0561) closs: 0.9999 (1.0350) time: 2.1100 data: 0.0002 max mem: 28042 -[06:28:30.750007] Epoch: [0] [30/812] lr: 0.000002 grad_norm: 1.9742 (2.0773) closs: 1.0489 (1.0550) time: 2.1257 data: 0.0002 max mem: 28042 -[06:28:51.915590] Epoch: [0] [40/812] lr: 0.000002 grad_norm: 1.9236 (2.0170) closs: 1.0628 (1.0608) time: 2.1210 data: 0.0002 max mem: 28042 -[06:29:13.261311] Epoch: [0] [50/812] lr: 0.000003 grad_norm: 1.6794 (1.9642) closs: 1.0594 (1.0599) time: 2.1255 data: 0.0002 max mem: 28042 -[06:29:34.413120] Epoch: [0] [60/812] lr: 0.000004 grad_norm: 1.5823 (1.8870) closs: 1.0342 (1.0539) time: 2.1248 data: 0.0002 max mem: 28042 -[06:29:55.626442] Epoch: [0] [70/812] lr: 0.000004 grad_norm: 1.4000 (1.8062) closs: 1.0269 (1.0558) time: 2.1181 data: 0.0002 max mem: 28042 -[06:30:16.918661] Epoch: [0] [80/812] lr: 0.000005 grad_norm: 1.2024 (1.7370) closs: 1.0211 (1.0531) time: 2.1252 data: 0.0002 max mem: 28042 -[06:30:38.196705] Epoch: [0] [90/812] lr: 0.000006 grad_norm: 1.1287 (1.6695) closs: 0.9826 (1.0414) time: 2.1284 data: 0.0002 max mem: 28042 -[06:30:59.467468] Epoch: [0] [100/812] lr: 0.000006 grad_norm: 1.0506 (1.6085) closs: 0.9512 (1.0363) time: 2.1274 data: 0.0002 max mem: 28042 -[06:31:20.727971] Epoch: [0] [110/812] lr: 0.000007 grad_norm: 0.9873 (1.5522) closs: 0.9416 (1.0241) time: 2.1265 data: 0.0002 max mem: 28042 -[06:31:41.936817] Epoch: [0] [120/812] lr: 0.000007 grad_norm: 0.9233 (1.5061) closs: 0.9447 (1.0219) time: 2.1234 data: 0.0002 max mem: 28042 -[06:32:03.041717] Epoch: [0] [130/812] lr: 0.000008 grad_norm: 0.9836 (1.4640) closs: 0.9609 (1.0167) time: 2.1156 data: 0.0002 max mem: 28042 -[06:32:24.295834] Epoch: [0] [140/812] lr: 0.000009 grad_norm: 0.9415 (1.4258) closs: 0.8978 (1.0079) time: 2.1179 data: 0.0002 max mem: 28042 -[06:32:45.543342] Epoch: [0] [150/812] lr: 0.000009 grad_norm: 0.8967 (1.3994) closs: 0.8940 (1.0025) time: 2.1250 data: 0.0002 max mem: 28042 -[06:33:06.837739] Epoch: [0] [160/812] lr: 0.000010 grad_norm: 0.9221 (1.3720) closs: 0.9130 (0.9991) time: 2.1270 data: 0.0002 max mem: 28042 -[06:33:28.086917] Epoch: [0] [170/812] lr: 0.000010 grad_norm: 0.9860 (1.3484) closs: 0.9068 (0.9924) time: 2.1271 data: 0.0002 max mem: 28042 -[06:33:49.368902] Epoch: [0] [180/812] lr: 0.000011 grad_norm: 0.9860 (1.3266) closs: 0.8552 (0.9843) time: 2.1265 data: 0.0002 max mem: 28042 -[06:34:10.555964] Epoch: [0] [190/812] lr: 0.000012 grad_norm: 0.9269 (1.3048) closs: 0.8552 (0.9786) time: 2.1234 data: 0.0002 max mem: 28042 -[06:34:31.919181] Epoch: [0] [200/812] lr: 0.000012 grad_norm: 0.9269 (1.2870) closs: 0.9133 (0.9764) time: 2.1274 data: 0.0002 max mem: 28042 -[06:34:53.191153] Epoch: [0] [210/812] lr: 0.000013 grad_norm: 0.9518 (1.2708) closs: 0.9159 (0.9724) time: 2.1317 data: 0.0002 max mem: 28042 -[06:35:14.531503] Epoch: [0] [220/812] lr: 0.000014 grad_norm: 0.9473 (1.2543) closs: 0.9002 (0.9697) time: 2.1305 data: 0.0002 max mem: 28042 -[06:35:35.775538] Epoch: [0] [230/812] lr: 0.000014 grad_norm: 0.8726 (1.2400) closs: 0.8788 (0.9638) time: 2.1291 data: 0.0002 max mem: 28042 -[06:35:57.064040] Epoch: [0] [240/812] lr: 0.000015 grad_norm: 0.8857 (1.2265) closs: 0.8438 (0.9603) time: 2.1266 data: 0.0002 max mem: 28042 -[06:36:18.204677] Epoch: [0] [250/812] lr: 0.000015 grad_norm: 0.8589 (1.2119) closs: 0.9044 (0.9582) time: 2.1214 data: 0.0002 max mem: 28042 -[06:36:39.553053] Epoch: [0] [260/812] lr: 0.000016 grad_norm: 0.8594 (1.2025) closs: 0.8971 (0.9549) time: 2.1244 data: 0.0002 max mem: 28042 -[06:37:00.855213] Epoch: [0] [270/812] lr: 0.000017 grad_norm: 0.9139 (1.1965) closs: 0.8980 (0.9543) time: 2.1324 data: 0.0002 max mem: 28042 -[06:37:22.140492] Epoch: [0] [280/812] lr: 0.000017 grad_norm: 0.9011 (1.1889) closs: 0.9115 (0.9515) time: 2.1293 data: 0.0002 max mem: 28042 -[06:37:43.447171] Epoch: [0] [290/812] lr: 0.000018 grad_norm: 0.9554 (1.1825) closs: 0.8680 (0.9484) time: 2.1295 data: 0.0002 max mem: 28042 -[06:38:04.736791] Epoch: [0] [300/812] lr: 0.000018 grad_norm: 0.9554 (1.1737) closs: 0.8583 (0.9459) time: 2.1297 data: 0.0002 max mem: 28042 -[06:38:25.924120] Epoch: [0] [310/812] lr: 0.000019 grad_norm: 0.8799 (1.1661) closs: 0.8818 (0.9452) time: 2.1238 data: 0.0002 max mem: 28042 -[06:38:47.257933] Epoch: [0] [320/812] lr: 0.000020 grad_norm: 0.9149 (1.1595) closs: 0.9034 (0.9449) time: 2.1260 data: 0.0002 max mem: 28042 -[06:39:08.559953] Epoch: [0] [330/812] lr: 0.000020 grad_norm: 0.9245 (1.1522) closs: 0.8678 (0.9414) time: 2.1317 data: 0.0002 max mem: 28042 -[06:39:29.852361] Epoch: [0] [340/812] lr: 0.000021 grad_norm: 0.9209 (1.1468) closs: 0.8413 (0.9386) time: 2.1296 data: 0.0002 max mem: 28042 -[06:39:51.145032] Epoch: [0] [350/812] lr: 0.000022 grad_norm: 0.9571 (1.1421) closs: 0.8830 (0.9398) time: 2.1292 data: 0.0002 max mem: 28042 -[06:40:12.364246] Epoch: [0] [360/812] lr: 0.000022 grad_norm: 0.9706 (1.1378) closs: 0.8992 (0.9382) time: 2.1255 data: 0.0002 max mem: 28042 -[06:40:33.511071] Epoch: [0] [370/812] lr: 0.000023 grad_norm: 0.8997 (1.1309) closs: 0.8396 (0.9361) time: 2.1182 data: 0.0002 max mem: 28042 -[06:40:54.788872] Epoch: [0] [380/812] lr: 0.000023 grad_norm: 0.8940 (1.1266) closs: 0.8361 (0.9348) time: 2.1212 data: 0.0002 max mem: 28042 -[06:41:16.033192] Epoch: [0] [390/812] lr: 0.000024 grad_norm: 0.9331 (1.1217) closs: 0.8695 (0.9335) time: 2.1260 data: 0.0002 max mem: 28042 -[06:41:37.222343] Epoch: [0] [400/812] lr: 0.000025 grad_norm: 0.9583 (1.1184) closs: 0.8604 (0.9323) time: 2.1216 data: 0.0002 max mem: 28042 -[06:41:58.490518] Epoch: [0] [410/812] lr: 0.000025 grad_norm: 0.8561 (1.1128) closs: 0.8604 (0.9309) time: 2.1228 data: 0.0002 max mem: 28042 -[06:42:19.739979] Epoch: [0] [420/812] lr: 0.000026 grad_norm: 0.8646 (1.1088) closs: 0.8581 (0.9291) time: 2.1258 data: 0.0002 max mem: 28042 -[06:42:40.820073] Epoch: [0] [430/812] lr: 0.000026 grad_norm: 0.8915 (1.1065) closs: 0.8364 (0.9265) time: 2.1164 data: 0.0002 max mem: 28042 -[06:43:02.062800] Epoch: [0] [440/812] lr: 0.000027 grad_norm: 0.8808 (1.1014) closs: 0.7983 (0.9241) time: 2.1161 data: 0.0002 max mem: 28042 -[06:43:23.326377] Epoch: [0] [450/812] lr: 0.000028 grad_norm: 0.8808 (1.0998) closs: 0.8111 (0.9218) time: 2.1252 data: 0.0002 max mem: 28042 -[06:43:44.565401] Epoch: [0] [460/812] lr: 0.000028 grad_norm: 0.8849 (1.0957) closs: 0.8498 (0.9205) time: 2.1250 data: 0.0002 max mem: 28042 -[06:44:05.812308] Epoch: [0] [470/812] lr: 0.000029 grad_norm: 0.8793 (1.0957) closs: 0.8498 (0.9200) time: 2.1242 data: 0.0002 max mem: 28042 -[06:44:27.069455] Epoch: [0] [480/812] lr: 0.000030 grad_norm: 0.9377 (1.0929) closs: 0.9008 (0.9193) time: 2.1251 data: 0.0002 max mem: 28042 -[06:44:48.177755] Epoch: [0] [490/812] lr: 0.000030 grad_norm: 0.9086 (1.0893) closs: 0.8924 (0.9187) time: 2.1182 data: 0.0002 max mem: 28042 -[06:45:09.416351] Epoch: [0] [500/812] lr: 0.000031 grad_norm: 0.8987 (1.0874) closs: 0.8698 (0.9174) time: 2.1173 data: 0.0002 max mem: 28042 -[06:45:30.630868] Epoch: [0] [510/812] lr: 0.000031 grad_norm: 0.8835 (1.0869) closs: 0.8536 (0.9153) time: 2.1226 data: 0.0002 max mem: 28042 -[06:45:51.904027] Epoch: [0] [520/812] lr: 0.000032 grad_norm: 0.9220 (1.0837) closs: 0.7934 (0.9142) time: 2.1243 data: 0.0002 max mem: 28042 -[06:46:13.161036] Epoch: [0] [530/812] lr: 0.000033 grad_norm: 0.9652 (1.0826) closs: 0.8786 (0.9135) time: 2.1264 data: 0.0002 max mem: 28042 -[06:46:34.432979] Epoch: [0] [540/812] lr: 0.000033 grad_norm: 0.9935 (1.0810) closs: 0.9075 (0.9130) time: 2.1264 data: 0.0002 max mem: 28042 -[06:46:55.524047] Epoch: [0] [550/812] lr: 0.000034 grad_norm: 0.9487 (1.0826) closs: 0.8710 (0.9118) time: 2.1181 data: 0.0002 max mem: 28042 -[06:47:16.807565] Epoch: [0] [560/812] lr: 0.000034 grad_norm: 0.9651 (1.0816) closs: 0.8440 (0.9113) time: 2.1186 data: 0.0002 max mem: 28042 -[06:47:38.094895] Epoch: [0] [570/812] lr: 0.000035 grad_norm: 0.9088 (1.0788) closs: 0.8497 (0.9105) time: 2.1285 data: 0.0002 max mem: 28042 -[06:47:59.329657] Epoch: [0] [580/812] lr: 0.000036 grad_norm: 0.9026 (1.0786) closs: 0.8642 (0.9103) time: 2.1260 data: 0.0002 max mem: 28042 -[06:48:20.589047] Epoch: [0] [590/812] lr: 0.000036 grad_norm: 0.9026 (1.0764) closs: 0.8642 (0.9095) time: 2.1246 data: 0.0002 max mem: 28042 -[06:48:41.877780] Epoch: [0] [600/812] lr: 0.000037 grad_norm: 0.8705 (1.0731) closs: 0.8507 (0.9088) time: 2.1273 data: 0.0002 max mem: 28042 -[06:49:03.085078] Epoch: [0] [610/812] lr: 0.000038 grad_norm: 0.8442 (1.0703) closs: 0.8255 (0.9068) time: 2.1247 data: 0.0002 max mem: 28042 -[06:49:24.290564] Epoch: [0] [620/812] lr: 0.000038 grad_norm: 0.8969 (1.0716) closs: 0.7882 (0.9062) time: 2.1206 data: 0.0002 max mem: 28042 -[06:49:45.503873] Epoch: [0] [630/812] lr: 0.000039 grad_norm: 0.9548 (1.0711) closs: 0.8524 (0.9050) time: 2.1209 data: 0.0002 max mem: 28042 -[06:50:06.759717] Epoch: [0] [640/812] lr: 0.000039 grad_norm: 0.9548 (1.0689) closs: 0.8578 (0.9048) time: 2.1234 data: 0.0002 max mem: 28042 -[06:50:28.061959] Epoch: [0] [650/812] lr: 0.000040 grad_norm: 0.8640 (1.0660) closs: 0.8476 (0.9035) time: 2.1278 data: 0.0002 max mem: 28042 -[06:50:49.325746] Epoch: [0] [660/812] lr: 0.000041 grad_norm: 0.8640 (1.0636) closs: 0.8116 (0.9030) time: 2.1282 data: 0.0002 max mem: 28042 -[06:51:10.485211] Epoch: [0] [670/812] lr: 0.000041 grad_norm: 0.8712 (1.0605) closs: 0.8405 (0.9025) time: 2.1211 data: 0.0002 max mem: 28042 -[06:51:31.762440] Epoch: [0] [680/812] lr: 0.000042 grad_norm: 0.8712 (1.0583) closs: 0.8536 (0.9021) time: 2.1218 data: 0.0002 max mem: 28042 -[06:51:53.108436] Epoch: [0] [690/812] lr: 0.000042 grad_norm: 0.8425 (1.0562) closs: 0.8619 (0.9016) time: 2.1311 data: 0.0002 max mem: 28042 -[06:52:14.345086] Epoch: [0] [700/812] lr: 0.000043 grad_norm: 0.8661 (1.0545) closs: 0.8556 (0.9012) time: 2.1291 data: 0.0002 max mem: 28042 -[06:52:35.562580] Epoch: [0] [710/812] lr: 0.000044 grad_norm: 0.9346 (1.0539) closs: 0.8271 (0.9002) time: 2.1226 data: 0.0002 max mem: 28042 -[06:52:56.788771] Epoch: [0] [720/812] lr: 0.000044 grad_norm: 0.9284 (1.0529) closs: 0.8355 (0.8998) time: 2.1221 data: 0.0002 max mem: 28042 -[06:53:17.889187] Epoch: [0] [730/812] lr: 0.000045 grad_norm: 0.9407 (1.0518) closs: 0.8588 (0.8990) time: 2.1163 data: 0.0002 max mem: 28042 -[06:53:39.137183] Epoch: [0] [740/812] lr: 0.000046 grad_norm: 0.9572 (1.0510) closs: 0.8287 (0.8978) time: 2.1173 data: 0.0002 max mem: 28042 -[06:54:00.392567] Epoch: [0] [750/812] lr: 0.000046 grad_norm: 0.8869 (1.0493) closs: 0.8349 (0.8985) time: 2.1251 data: 0.0002 max mem: 28042 -[06:54:21.698073] Epoch: [0] [760/812] lr: 0.000047 grad_norm: 0.8591 (1.0473) closs: 0.8724 (0.8979) time: 2.1280 data: 0.0002 max mem: 28042 -[06:54:42.998694] Epoch: [0] [770/812] lr: 0.000047 grad_norm: 0.8692 (1.0454) closs: 0.8389 (0.8970) time: 2.1302 data: 0.0002 max mem: 28042 -[06:55:04.236494] Epoch: [0] [780/812] lr: 0.000048 grad_norm: 0.9315 (1.0442) closs: 0.8393 (0.8966) time: 2.1268 data: 0.0002 max mem: 28042 -[06:55:25.352416] Epoch: [0] [790/812] lr: 0.000049 grad_norm: 0.8830 (1.0420) closs: 0.8393 (0.8957) time: 2.1176 data: 0.0001 max mem: 28042 -[06:55:46.583325] Epoch: [0] [800/812] lr: 0.000049 grad_norm: 0.8569 (1.0408) closs: 0.8439 (0.8953) time: 2.1173 data: 0.0001 max mem: 28042 -[06:56:07.855749] Epoch: [0] [810/812] lr: 0.000050 grad_norm: 0.8814 (1.0395) closs: 0.8229 (0.8941) time: 2.1251 data: 0.0001 max mem: 28042 -[06:56:10.209403] Epoch: [0] Total time: 0:28:48 -[06:56:10.212512] Averaged stats: lr: 0.000050 grad_norm: 0.8814 (1.0395) closs: 0.8177 (0.8988) -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -[06:56:10.615675] model saved -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -[06:56:12.374769] optimizer saved -[06:56:12.375482] other rank-common saved -[06:56:12.381202] rank-specific saved -[06:56:12.391881] log_dir: ./output_dir -[06:56:15.877787] Epoch: [1] [0/812] lr: 0.000050 grad_norm: 0.7418 (0.7418) closs: 0.6434 (0.6434) time: 3.4849 data: 1.3481 max mem: 28042 -[06:56:37.172127] Epoch: [1] [10/812] lr: 0.000050 grad_norm: 0.9125 (0.9429) closs: 0.8020 (0.7882) time: 2.2526 data: 0.1227 max mem: 28042 -[06:56:58.553564] Epoch: [1] [20/812] lr: 0.000050 grad_norm: 0.9125 (0.9737) closs: 0.8134 (0.8498) time: 2.1337 data: 0.0002 max mem: 28042 -[06:57:19.841517] Epoch: [1] [30/812] lr: 0.000050 grad_norm: 0.8671 (0.9519) closs: 0.8293 (0.8457) time: 2.1334 data: 0.0002 max mem: 28042 -[06:57:41.138608] Epoch: [1] [40/812] lr: 0.000050 grad_norm: 0.8634 (0.9318) closs: 0.8733 (0.8656) time: 2.1292 data: 0.0002 max mem: 28042 -[06:58:02.227816] Epoch: [1] [50/812] lr: 0.000050 grad_norm: 0.8460 (0.9328) closs: 0.8800 (0.8613) time: 2.1192 data: 0.0002 max mem: 28042 -[06:58:23.466948] Epoch: [1] [60/812] lr: 0.000050 grad_norm: 0.8895 (0.9279) closs: 0.8352 (0.8591) time: 2.1163 data: 0.0002 max mem: 28042 -[06:58:44.680097] Epoch: [1] [70/812] lr: 0.000050 grad_norm: 0.9388 (0.9401) closs: 0.8626 (0.8619) time: 2.1225 data: 0.0002 max mem: 28042 -[06:59:05.983269] Epoch: [1] [80/812] lr: 0.000050 grad_norm: 0.9289 (0.9347) closs: 0.8646 (0.8577) time: 2.1257 data: 0.0002 max mem: 28042 -[06:59:27.380557] Epoch: [1] [90/812] lr: 0.000050 grad_norm: 0.8478 (0.9265) closs: 0.8646 (0.8618) time: 2.1349 data: 0.0002 max mem: 28042 -[06:59:48.640207] Epoch: [1] [100/812] lr: 0.000050 grad_norm: 0.8774 (0.9255) closs: 0.8923 (0.8640) time: 2.1328 data: 0.0002 max mem: 28042 -[07:00:09.758341] Epoch: [1] [110/812] lr: 0.000050 grad_norm: 0.8900 (0.9251) closs: 0.8698 (0.8612) time: 2.1188 data: 0.0002 max mem: 28042 -[07:00:30.981964] Epoch: [1] [120/812] lr: 0.000050 grad_norm: 0.9182 (0.9269) closs: 0.8698 (0.8617) time: 2.1170 data: 0.0002 max mem: 28042 -[07:00:52.245385] Epoch: [1] [130/812] lr: 0.000050 grad_norm: 0.9182 (0.9259) closs: 0.8665 (0.8617) time: 2.1243 data: 0.0002 max mem: 28042 -[07:01:13.558548] Epoch: [1] [140/812] lr: 0.000050 grad_norm: 0.8672 (0.9211) closs: 0.8531 (0.8614) time: 2.1287 data: 0.0002 max mem: 28042 -[07:01:34.846520] Epoch: [1] [150/812] lr: 0.000050 grad_norm: 0.8460 (0.9249) closs: 0.8726 (0.8611) time: 2.1300 data: 0.0002 max mem: 28042 -[07:01:56.152223] Epoch: [1] [160/812] lr: 0.000050 grad_norm: 0.9007 (0.9226) closs: 0.8726 (0.8635) time: 2.1296 data: 0.0002 max mem: 28042 -[07:02:17.282750] Epoch: [1] [170/812] lr: 0.000049 grad_norm: 0.8338 (0.9195) closs: 0.9167 (0.8660) time: 2.1217 data: 0.0002 max mem: 28042 -[07:02:38.524204] Epoch: [1] [180/812] lr: 0.000049 grad_norm: 0.8527 (0.9223) closs: 0.8779 (0.8662) time: 2.1185 data: 0.0002 max mem: 28042 -[07:02:59.827288] Epoch: [1] [190/812] lr: 0.000049 grad_norm: 0.8880 (0.9228) closs: 0.8079 (0.8646) time: 2.1271 data: 0.0002 max mem: 28042 -[07:03:21.168075] Epoch: [1] [200/812] lr: 0.000049 grad_norm: 0.8588 (0.9203) closs: 0.8388 (0.8627) time: 2.1321 data: 0.0002 max mem: 28042 -[07:03:42.467849] Epoch: [1] [210/812] lr: 0.000049 grad_norm: 0.8416 (0.9177) closs: 0.8623 (0.8648) time: 2.1319 data: 0.0002 max mem: 28042 -[07:04:03.731453] Epoch: [1] [220/812] lr: 0.000049 grad_norm: 0.8283 (0.9154) closs: 0.8735 (0.8637) time: 2.1281 data: 0.0002 max mem: 28042 -[07:04:24.911070] Epoch: [1] [230/812] lr: 0.000049 grad_norm: 0.8495 (0.9127) closs: 0.8099 (0.8620) time: 2.1221 data: 0.0002 max mem: 28042 -[07:04:46.205620] Epoch: [1] [240/812] lr: 0.000049 grad_norm: 0.8495 (0.9115) closs: 0.8409 (0.8631) time: 2.1236 data: 0.0002 max mem: 28042 -[07:05:07.516767] Epoch: [1] [250/812] lr: 0.000049 grad_norm: 0.8600 (0.9131) closs: 0.8848 (0.8622) time: 2.1302 data: 0.0002 max mem: 28042 -[07:05:28.819912] Epoch: [1] [260/812] lr: 0.000049 grad_norm: 0.8600 (0.9108) closs: 0.8859 (0.8627) time: 2.1306 data: 0.0002 max mem: 28042 -[07:05:50.052732] Epoch: [1] [270/812] lr: 0.000049 grad_norm: 0.8403 (0.9102) closs: 0.8677 (0.8615) time: 2.1267 data: 0.0002 max mem: 28042 -[07:06:11.298621] Epoch: [1] [280/812] lr: 0.000049 grad_norm: 0.9042 (0.9123) closs: 0.8677 (0.8617) time: 2.1239 data: 0.0002 max mem: 28042 -[07:06:32.419845] Epoch: [1] [290/812] lr: 0.000048 grad_norm: 0.9813 (0.9163) closs: 0.8537 (0.8620) time: 2.1183 data: 0.0002 max mem: 28042 -[07:06:53.663078] Epoch: [1] [300/812] lr: 0.000048 grad_norm: 0.8997 (0.9148) closs: 0.8405 (0.8609) time: 2.1181 data: 0.0002 max mem: 28042 -[07:07:14.922464] Epoch: [1] [310/812] lr: 0.000048 grad_norm: 0.8997 (0.9155) closs: 0.8405 (0.8602) time: 2.1251 data: 0.0002 max mem: 28042 -[07:07:36.151134] Epoch: [1] [320/812] lr: 0.000048 grad_norm: 0.8916 (0.9157) closs: 0.8480 (0.8609) time: 2.1243 data: 0.0002 max mem: 28042 -[07:07:57.365292] Epoch: [1] [330/812] lr: 0.000048 grad_norm: 0.8389 (0.9149) closs: 0.8297 (0.8603) time: 2.1221 data: 0.0002 max mem: 28042 -[07:08:18.608023] Epoch: [1] [340/812] lr: 0.000048 grad_norm: 0.8646 (0.9134) closs: 0.7981 (0.8584) time: 2.1228 data: 0.0002 max mem: 28042 -[07:08:39.766957] Epoch: [1] [350/812] lr: 0.000048 grad_norm: 0.8646 (0.9123) closs: 0.7981 (0.8579) time: 2.1200 data: 0.0002 max mem: 28042 -[07:09:01.054096] Epoch: [1] [360/812] lr: 0.000048 grad_norm: 0.8671 (0.9124) closs: 0.7807 (0.8556) time: 2.1222 data: 0.0002 max mem: 28042 -[07:09:22.343988] Epoch: [1] [370/812] lr: 0.000047 grad_norm: 0.9250 (0.9165) closs: 0.7688 (0.8546) time: 2.1288 data: 0.0002 max mem: 28042 -[07:09:43.668019] Epoch: [1] [380/812] lr: 0.000047 grad_norm: 0.9275 (0.9156) closs: 0.8322 (0.8548) time: 2.1306 data: 0.0003 max mem: 28042 -[07:10:04.936534] Epoch: [1] [390/812] lr: 0.000047 grad_norm: 0.8581 (0.9324) closs: 0.8617 (0.8550) time: 2.1296 data: 0.0003 max mem: 28042 -[07:10:26.269641] Epoch: [1] [400/812] lr: 0.000047 grad_norm: 0.8561 (0.9311) closs: 0.8443 (0.8548) time: 2.1300 data: 0.0002 max mem: 28042 -[07:10:47.398303] Epoch: [1] [410/812] lr: 0.000047 grad_norm: 0.8857 (0.9329) closs: 0.8655 (0.8563) time: 2.1230 data: 0.0002 max mem: 28042 -[07:11:08.680671] Epoch: [1] [420/812] lr: 0.000047 grad_norm: 0.9295 (0.9312) closs: 0.9093 (0.8567) time: 2.1205 data: 0.0002 max mem: 28042 -[07:11:29.894053] Epoch: [1] [430/812] lr: 0.000047 grad_norm: 0.8829 (0.9323) closs: 0.8460 (0.8558) time: 2.1247 data: 0.0002 max mem: 28042 -[07:11:51.120078] Epoch: [1] [440/812] lr: 0.000046 grad_norm: 0.8876 (0.9315) closs: 0.8460 (0.8558) time: 2.1219 data: 0.0002 max mem: 28042 -[07:12:12.348027] Epoch: [1] [450/812] lr: 0.000046 grad_norm: 0.8868 (0.9304) closs: 0.8481 (0.8554) time: 2.1226 data: 0.0002 max mem: 28042 -[07:12:33.569320] Epoch: [1] [460/812] lr: 0.000046 grad_norm: 0.9059 (0.9318) closs: 0.8101 (0.8548) time: 2.1224 data: 0.0002 max mem: 28042 -[07:12:54.684647] Epoch: [1] [470/812] lr: 0.000046 grad_norm: 0.9218 (0.9311) closs: 0.8178 (0.8549) time: 2.1168 data: 0.0002 max mem: 28042 -[07:13:15.972916] Epoch: [1] [480/812] lr: 0.000046 grad_norm: 0.8772 (0.9306) closs: 0.8478 (0.8555) time: 2.1201 data: 0.0002 max mem: 28042 -[07:13:37.223460] Epoch: [1] [490/812] lr: 0.000046 grad_norm: 0.8772 (0.9295) closs: 0.8584 (0.8561) time: 2.1269 data: 0.0002 max mem: 28042 -[07:13:58.457753] Epoch: [1] [500/812] lr: 0.000045 grad_norm: 0.8557 (0.9276) closs: 0.8375 (0.8556) time: 2.1242 data: 0.0002 max mem: 28042 -[07:14:19.699206] Epoch: [1] [510/812] lr: 0.000045 grad_norm: 0.7918 (0.9274) closs: 0.8128 (0.8553) time: 2.1237 data: 0.0002 max mem: 28042 -[07:14:41.023843] Epoch: [1] [520/812] lr: 0.000045 grad_norm: 0.8657 (0.9260) closs: 0.8301 (0.8552) time: 2.1282 data: 0.0002 max mem: 28042 -[07:15:02.146542] Epoch: [1] [530/812] lr: 0.000045 grad_norm: 0.8365 (0.9253) closs: 0.8398 (0.8553) time: 2.1223 data: 0.0003 max mem: 28042 -[07:15:23.393129] Epoch: [1] [540/812] lr: 0.000045 grad_norm: 0.8390 (0.9242) closs: 0.8310 (0.8553) time: 2.1184 data: 0.0003 max mem: 28042 -[07:15:44.662588] Epoch: [1] [550/812] lr: 0.000045 grad_norm: 0.8286 (0.9227) closs: 0.8088 (0.8546) time: 2.1257 data: 0.0002 max mem: 28042 -[07:16:05.961040] Epoch: [1] [560/812] lr: 0.000044 grad_norm: 0.8162 (0.9208) closs: 0.8267 (0.8545) time: 2.1283 data: 0.0002 max mem: 28042 -[07:16:27.247815] Epoch: [1] [570/812] lr: 0.000044 grad_norm: 0.8102 (0.9196) closs: 0.8561 (0.8549) time: 2.1292 data: 0.0002 max mem: 28042 -[07:16:48.505828] Epoch: [1] [580/812] lr: 0.000044 grad_norm: 0.8414 (0.9193) closs: 0.8574 (0.8542) time: 2.1272 data: 0.0002 max mem: 28042 -[07:17:09.646613] Epoch: [1] [590/812] lr: 0.000044 grad_norm: 0.9224 (0.9192) closs: 0.8602 (0.8545) time: 2.1199 data: 0.0002 max mem: 28042 -[07:17:30.846001] Epoch: [1] [600/812] lr: 0.000044 grad_norm: 0.9177 (0.9188) closs: 0.8599 (0.8548) time: 2.1169 data: 0.0002 max mem: 28042 -[07:17:52.084387] Epoch: [1] [610/812] lr: 0.000043 grad_norm: 0.8469 (0.9183) closs: 0.8411 (0.8548) time: 2.1218 data: 0.0002 max mem: 28042 -[07:18:13.385659] Epoch: [1] [620/812] lr: 0.000043 grad_norm: 0.8469 (0.9170) closs: 0.8457 (0.8548) time: 2.1269 data: 0.0002 max mem: 28042 -[07:18:34.632870] Epoch: [1] [630/812] lr: 0.000043 grad_norm: 0.8292 (0.9164) closs: 0.8625 (0.8553) time: 2.1273 data: 0.0002 max mem: 28042 -[07:18:55.965669] Epoch: [1] [640/812] lr: 0.000043 grad_norm: 0.8478 (0.9155) closs: 0.8567 (0.8553) time: 2.1289 data: 0.0002 max mem: 28042 -[07:19:17.086640] Epoch: [1] [650/812] lr: 0.000043 grad_norm: 0.8298 (0.9147) closs: 0.8326 (0.8550) time: 2.1226 data: 0.0004 max mem: 28042 -[07:19:38.331115] Epoch: [1] [660/812] lr: 0.000042 grad_norm: 0.8200 (0.9143) closs: 0.8794 (0.8560) time: 2.1182 data: 0.0004 max mem: 28042 -[07:19:59.582006] Epoch: [1] [670/812] lr: 0.000042 grad_norm: 0.8277 (0.9136) closs: 0.9154 (0.8567) time: 2.1247 data: 0.0002 max mem: 28042 -[07:20:20.825085] Epoch: [1] [680/812] lr: 0.000042 grad_norm: 0.8543 (0.9137) closs: 0.9115 (0.8573) time: 2.1246 data: 0.0002 max mem: 28042 -[07:20:42.055735] Epoch: [1] [690/812] lr: 0.000042 grad_norm: 0.9597 (0.9188) closs: 0.9179 (0.8578) time: 2.1236 data: 0.0003 max mem: 28042 -[07:21:03.290761] Epoch: [1] [700/812] lr: 0.000041 grad_norm: 0.8645 (0.9179) closs: 0.8496 (0.8577) time: 2.1232 data: 0.0002 max mem: 28042 -[07:21:24.434877] Epoch: [1] [710/812] lr: 0.000041 grad_norm: 0.8440 (0.9172) closs: 0.8131 (0.8574) time: 2.1189 data: 0.0002 max mem: 28042 -[07:21:45.655921] Epoch: [1] [720/812] lr: 0.000041 grad_norm: 0.8724 (0.9173) closs: 0.7997 (0.8569) time: 2.1182 data: 0.0002 max mem: 28042 -[07:22:06.935228] Epoch: [1] [730/812] lr: 0.000041 grad_norm: 0.8932 (0.9170) closs: 0.8529 (0.8571) time: 2.1249 data: 0.0002 max mem: 28042 -[07:22:28.172880] Epoch: [1] [740/812] lr: 0.000041 grad_norm: 0.8932 (0.9174) closs: 0.8546 (0.8569) time: 2.1258 data: 0.0002 max mem: 28042 -[07:22:49.386611] Epoch: [1] [750/812] lr: 0.000040 grad_norm: 0.8588 (0.9167) closs: 0.8256 (0.8568) time: 2.1225 data: 0.0002 max mem: 28042 -[07:23:10.577933] Epoch: [1] [760/812] lr: 0.000040 grad_norm: 0.8566 (0.9159) closs: 0.8375 (0.8571) time: 2.1202 data: 0.0002 max mem: 28042 -[07:23:31.706111] Epoch: [1] [770/812] lr: 0.000040 grad_norm: 0.8276 (0.9146) closs: 0.8624 (0.8566) time: 2.1159 data: 0.0002 max mem: 28042 -[07:23:53.046796] Epoch: [1] [780/812] lr: 0.000040 grad_norm: 0.8238 (0.9141) closs: 0.8051 (0.8564) time: 2.1234 data: 0.0002 max mem: 28042 -[07:24:14.279679] Epoch: [1] [790/812] lr: 0.000039 grad_norm: 0.8885 (0.9144) closs: 0.8512 (0.8566) time: 2.1286 data: 0.0002 max mem: 28042 -[07:24:35.475030] Epoch: [1] [800/812] lr: 0.000039 grad_norm: 0.8738 (0.9139) closs: 0.8451 (0.8562) time: 2.1213 data: 0.0002 max mem: 28042 -[07:24:56.734344] Epoch: [1] [810/812] lr: 0.000039 grad_norm: 0.8738 (0.9154) closs: 0.8451 (0.8562) time: 2.1227 data: 0.0002 max mem: 28042 -[07:24:59.165687] Epoch: [1] Total time: 0:28:46 -[07:24:59.168439] Averaged stats: lr: 0.000039 grad_norm: 0.8738 (0.9152) closs: 0.8673 (0.8545) -[07:24:59.501598] model saved -[07:25:01.228520] optimizer saved -[07:25:01.229115] other rank-common saved -[07:25:01.232745] rank-specific saved -[07:25:01.242649] log_dir: ./output_dir -[07:25:04.605755] Epoch: [2] [0/812] lr: 0.000039 grad_norm: 0.8173 (0.8173) closs: 0.8037 (0.8037) time: 3.3621 data: 1.2087 max mem: 28042 -[07:25:25.938470] Epoch: [2] [10/812] lr: 0.000038 grad_norm: 0.7978 (0.8008) closs: 0.8347 (0.8409) time: 2.2449 data: 0.1101 max mem: 28042 -[07:25:47.222423] Epoch: [2] [20/812] lr: 0.000038 grad_norm: 0.8004 (0.8113) closs: 0.8347 (0.8317) time: 2.1308 data: 0.0002 max mem: 28042 -[07:26:08.393469] Epoch: [2] [30/812] lr: 0.000038 grad_norm: 0.8724 (0.8343) closs: 0.8318 (0.8414) time: 2.1227 data: 0.0002 max mem: 28042 -[07:26:29.541814] Epoch: [2] [40/812] lr: 0.000038 grad_norm: 0.8632 (0.8308) closs: 0.8318 (0.8407) time: 2.1159 data: 0.0002 max mem: 28042 -[07:26:50.705595] Epoch: [2] [50/812] lr: 0.000037 grad_norm: 0.8126 (0.8463) closs: 0.8506 (0.8366) time: 2.1155 data: 0.0002 max mem: 28042 -[07:27:11.930514] Epoch: [2] [60/812] lr: 0.000037 grad_norm: 0.8214 (0.8468) closs: 0.8533 (0.8395) time: 2.1194 data: 0.0002 max mem: 28042 -[07:27:33.224038] Epoch: [2] [70/812] lr: 0.000037 grad_norm: 0.8380 (0.8558) closs: 0.8522 (0.8441) time: 2.1258 data: 0.0002 max mem: 28042 -[07:27:54.451669] Epoch: [2] [80/812] lr: 0.000037 grad_norm: 0.8503 (0.8651) closs: 0.8207 (0.8425) time: 2.1260 data: 0.0002 max mem: 28042 -[07:28:15.680781] Epoch: [2] [90/812] lr: 0.000036 grad_norm: 0.8135 (0.8623) closs: 0.8026 (0.8396) time: 2.1228 data: 0.0002 max mem: 28042 -[07:28:37.066597] Epoch: [2] [100/812] lr: 0.000036 grad_norm: 0.8527 (0.8702) closs: 0.8127 (0.8409) time: 2.1307 data: 0.0002 max mem: 28042 -[07:28:58.274244] Epoch: [2] [110/812] lr: 0.000036 grad_norm: 0.8377 (0.8658) closs: 0.8242 (0.8359) time: 2.1296 data: 0.0002 max mem: 28042 -[07:29:19.632451] Epoch: [2] [120/812] lr: 0.000036 grad_norm: 0.8040 (0.8644) closs: 0.8393 (0.8376) time: 2.1282 data: 0.0002 max mem: 28042 -[07:29:41.020148] Epoch: [2] [130/812] lr: 0.000035 grad_norm: 0.8352 (0.8648) closs: 0.8316 (0.8367) time: 2.1372 data: 0.0002 max mem: 28042 -[07:30:02.395455] Epoch: [2] [140/812] lr: 0.000035 grad_norm: 0.8786 (0.8695) closs: 0.8135 (0.8362) time: 2.1381 data: 0.0002 max mem: 28042 -[07:30:23.808228] Epoch: [2] [150/812] lr: 0.000035 grad_norm: 0.8822 (0.8688) closs: 0.8453 (0.8374) time: 2.1393 data: 0.0002 max mem: 28042 -[07:30:45.164764] Epoch: [2] [160/812] lr: 0.000035 grad_norm: 0.8822 (0.8746) closs: 0.8842 (0.8382) time: 2.1384 data: 0.0002 max mem: 28042 -[07:31:06.315458] Epoch: [2] [170/812] lr: 0.000034 grad_norm: 0.8529 (0.8701) closs: 0.8118 (0.8371) time: 2.1253 data: 0.0002 max mem: 28042 -[07:31:27.656439] Epoch: [2] [180/812] lr: 0.000034 grad_norm: 0.8679 (0.8743) closs: 0.8118 (0.8369) time: 2.1245 data: 0.0002 max mem: 28042 -[07:31:49.023759] Epoch: [2] [190/812] lr: 0.000034 grad_norm: 0.8803 (0.8730) closs: 0.8327 (0.8381) time: 2.1353 data: 0.0002 max mem: 28042 -[07:32:10.391465] Epoch: [2] [200/812] lr: 0.000033 grad_norm: 0.8540 (0.8725) closs: 0.8327 (0.8392) time: 2.1367 data: 0.0002 max mem: 28042 -[07:32:31.801438] Epoch: [2] [210/812] lr: 0.000033 grad_norm: 0.8493 (0.8724) closs: 0.8227 (0.8396) time: 2.1388 data: 0.0002 max mem: 28042 -[07:32:53.101914] Epoch: [2] [220/812] lr: 0.000033 grad_norm: 0.8557 (0.8723) closs: 0.8429 (0.8411) time: 2.1354 data: 0.0003 max mem: 28042 -[07:33:14.125610] Epoch: [2] [230/812] lr: 0.000033 grad_norm: 0.8792 (0.8748) closs: 0.8429 (0.8417) time: 2.1161 data: 0.0003 max mem: 28042 -[07:33:35.442382] Epoch: [2] [240/812] lr: 0.000032 grad_norm: 0.8701 (0.8736) closs: 0.8309 (0.8416) time: 2.1169 data: 0.0002 max mem: 28042 -[07:33:56.828157] Epoch: [2] [250/812] lr: 0.000032 grad_norm: 0.8521 (0.8742) closs: 0.8317 (0.8422) time: 2.1351 data: 0.0002 max mem: 28042 -[07:34:18.204872] Epoch: [2] [260/812] lr: 0.000032 grad_norm: 0.8521 (0.8736) closs: 0.7921 (0.8415) time: 2.1380 data: 0.0002 max mem: 28042 -[07:34:39.611093] Epoch: [2] [270/812] lr: 0.000031 grad_norm: 0.7877 (0.8718) closs: 0.7921 (0.8428) time: 2.1391 data: 0.0002 max mem: 28042 -[07:35:00.999360] Epoch: [2] [280/812] lr: 0.000031 grad_norm: 0.8320 (0.8718) closs: 0.8277 (0.8442) time: 2.1396 data: 0.0002 max mem: 28042 -[07:35:22.112658] Epoch: [2] [290/812] lr: 0.000031 grad_norm: 0.8661 (0.8723) closs: 0.8074 (0.8410) time: 2.1250 data: 0.0002 max mem: 28042 -[07:35:43.467764] Epoch: [2] [300/812] lr: 0.000031 grad_norm: 0.8647 (0.8730) closs: 0.8054 (0.8405) time: 2.1233 data: 0.0002 max mem: 28042 -[07:36:04.824780] Epoch: [2] [310/812] lr: 0.000030 grad_norm: 0.8826 (0.8761) closs: 0.8440 (0.8406) time: 2.1355 data: 0.0002 max mem: 28042 -[07:36:26.170930] Epoch: [2] [320/812] lr: 0.000030 grad_norm: 0.8778 (0.8759) closs: 0.8422 (0.8418) time: 2.1351 data: 0.0002 max mem: 28042 -[07:36:47.504093] Epoch: [2] [330/812] lr: 0.000030 grad_norm: 0.8626 (0.8788) closs: 0.8727 (0.8434) time: 2.1339 data: 0.0002 max mem: 28042 -[07:37:08.880216] Epoch: [2] [340/812] lr: 0.000029 grad_norm: 0.8626 (0.8780) closs: 0.8866 (0.8442) time: 2.1354 data: 0.0002 max mem: 28042 -[07:37:30.026320] Epoch: [2] [350/812] lr: 0.000029 grad_norm: 0.8242 (0.8770) closs: 0.8389 (0.8438) time: 2.1260 data: 0.0002 max mem: 28042 -[07:37:51.342100] Epoch: [2] [360/812] lr: 0.000029 grad_norm: 0.8242 (0.8763) closs: 0.8495 (0.8446) time: 2.1230 data: 0.0002 max mem: 28042 -[07:38:12.672288] Epoch: [2] [370/812] lr: 0.000029 grad_norm: 0.8642 (0.8774) closs: 0.8439 (0.8445) time: 2.1322 data: 0.0002 max mem: 28042 -[07:38:34.038903] Epoch: [2] [380/812] lr: 0.000028 grad_norm: 0.8655 (0.8792) closs: 0.8376 (0.8437) time: 2.1348 data: 0.0002 max mem: 28042 -[07:38:55.416450] Epoch: [2] [390/812] lr: 0.000028 grad_norm: 0.8655 (0.8799) closs: 0.8200 (0.8424) time: 2.1371 data: 0.0002 max mem: 28042 -[07:39:16.805648] Epoch: [2] [400/812] lr: 0.000028 grad_norm: 0.9245 (0.8820) closs: 0.8217 (0.8425) time: 2.1383 data: 0.0002 max mem: 28042 -[07:39:37.947526] Epoch: [2] [410/812] lr: 0.000027 grad_norm: 0.8268 (0.8809) closs: 0.8217 (0.8423) time: 2.1265 data: 0.0002 max mem: 28042 -[07:39:59.300724] Epoch: [2] [420/812] lr: 0.000027 grad_norm: 0.7987 (0.8815) closs: 0.8043 (0.8423) time: 2.1247 data: 0.0004 max mem: 28042 -[07:40:20.670550] Epoch: [2] [430/812] lr: 0.000027 grad_norm: 0.8388 (0.8824) closs: 0.8112 (0.8429) time: 2.1361 data: 0.0004 max mem: 28042 -[07:40:42.075112] Epoch: [2] [440/812] lr: 0.000027 grad_norm: 0.8653 (0.8825) closs: 0.8614 (0.8427) time: 2.1386 data: 0.0002 max mem: 28042 -[07:41:03.398003] Epoch: [2] [450/812] lr: 0.000026 grad_norm: 0.8892 (0.8833) closs: 0.8563 (0.8438) time: 2.1363 data: 0.0002 max mem: 28042 -[07:41:24.767838] Epoch: [2] [460/812] lr: 0.000026 grad_norm: 0.8508 (0.8832) closs: 0.9083 (0.8458) time: 2.1346 data: 0.0002 max mem: 28042 -[07:41:45.899249] Epoch: [2] [470/812] lr: 0.000026 grad_norm: 0.8296 (0.8824) closs: 0.8864 (0.8453) time: 2.1250 data: 0.0002 max mem: 28042 -[07:42:07.234898] Epoch: [2] [480/812] lr: 0.000025 grad_norm: 0.8476 (0.8834) closs: 0.8653 (0.8465) time: 2.1233 data: 0.0002 max mem: 28042 -[07:42:28.526171] Epoch: [2] [490/812] lr: 0.000025 grad_norm: 0.8476 (0.8827) closs: 0.8379 (0.8457) time: 2.1313 data: 0.0002 max mem: 28042 -[07:42:49.834652] Epoch: [2] [500/812] lr: 0.000025 grad_norm: 0.8437 (0.8834) closs: 0.8202 (0.8453) time: 2.1299 data: 0.0002 max mem: 28042 -[07:43:11.253722] Epoch: [2] [510/812] lr: 0.000024 grad_norm: 0.8629 (0.8832) closs: 0.8411 (0.8454) time: 2.1363 data: 0.0002 max mem: 28042 -[07:43:32.569350] Epoch: [2] [520/812] lr: 0.000024 grad_norm: 0.8694 (0.8837) closs: 0.8411 (0.8454) time: 2.1366 data: 0.0002 max mem: 28042 -[07:43:53.723009] Epoch: [2] [530/812] lr: 0.000024 grad_norm: 0.8694 (0.8828) closs: 0.8377 (0.8456) time: 2.1234 data: 0.0002 max mem: 28042 -[07:44:15.068725] Epoch: [2] [540/812] lr: 0.000024 grad_norm: 0.8512 (0.8832) closs: 0.8377 (0.8454) time: 2.1249 data: 0.0002 max mem: 28042 -[07:44:36.418823] Epoch: [2] [550/812] lr: 0.000023 grad_norm: 0.8541 (0.8821) closs: 0.8485 (0.8453) time: 2.1347 data: 0.0002 max mem: 28042 -[07:44:57.735533] Epoch: [2] [560/812] lr: 0.000023 grad_norm: 0.7920 (0.8821) closs: 0.8278 (0.8453) time: 2.1333 data: 0.0002 max mem: 28042 -[07:45:19.044480] Epoch: [2] [570/812] lr: 0.000023 grad_norm: 0.8350 (0.8831) closs: 0.8197 (0.8447) time: 2.1312 data: 0.0002 max mem: 28042 -[07:45:40.336507] Epoch: [2] [580/812] lr: 0.000022 grad_norm: 0.9544 (0.8832) closs: 0.8045 (0.8436) time: 2.1300 data: 0.0002 max mem: 28042 -[07:46:01.445076] Epoch: [2] [590/812] lr: 0.000022 grad_norm: 0.8539 (0.8832) closs: 0.8222 (0.8426) time: 2.1200 data: 0.0002 max mem: 28042 -[07:46:22.758448] Epoch: [2] [600/812] lr: 0.000022 grad_norm: 0.8545 (0.8826) closs: 0.8357 (0.8426) time: 2.1210 data: 0.0002 max mem: 28042 -[07:46:43.975732] Epoch: [2] [610/812] lr: 0.000022 grad_norm: 0.8283 (0.8824) closs: 0.8244 (0.8424) time: 2.1265 data: 0.0002 max mem: 28042 -[07:47:05.349197] Epoch: [2] [620/812] lr: 0.000021 grad_norm: 0.8456 (0.8825) closs: 0.8244 (0.8423) time: 2.1295 data: 0.0002 max mem: 28042 -[07:47:26.735767] Epoch: [2] [630/812] lr: 0.000021 grad_norm: 0.8456 (0.8814) closs: 0.8121 (0.8423) time: 2.1379 data: 0.0002 max mem: 28042 -[07:47:48.056890] Epoch: [2] [640/812] lr: 0.000021 grad_norm: 0.8785 (0.8820) closs: 0.8153 (0.8423) time: 2.1353 data: 0.0002 max mem: 28042 -[07:48:09.177512] Epoch: [2] [650/812] lr: 0.000021 grad_norm: 0.8796 (0.8835) closs: 0.7759 (0.8408) time: 2.1220 data: 0.0002 max mem: 28042 -[07:48:30.537949] Epoch: [2] [660/812] lr: 0.000020 grad_norm: 0.8254 (0.8828) closs: 0.7714 (0.8401) time: 2.1240 data: 0.0002 max mem: 28042 -[07:48:51.921824] Epoch: [2] [670/812] lr: 0.000020 grad_norm: 0.8632 (0.8835) closs: 0.7948 (0.8396) time: 2.1371 data: 0.0002 max mem: 28042 -[07:49:13.251435] Epoch: [2] [680/812] lr: 0.000020 grad_norm: 0.8754 (0.8834) closs: 0.8484 (0.8402) time: 2.1356 data: 0.0002 max mem: 28042 -[07:49:34.579325] Epoch: [2] [690/812] lr: 0.000019 grad_norm: 0.8018 (0.8829) closs: 0.8826 (0.8405) time: 2.1328 data: 0.0002 max mem: 28042 -[07:49:55.977559] Epoch: [2] [700/812] lr: 0.000019 grad_norm: 0.8168 (0.8827) closs: 0.8405 (0.8403) time: 2.1362 data: 0.0002 max mem: 28042 -[07:50:17.140172] Epoch: [2] [710/812] lr: 0.000019 grad_norm: 0.8241 (0.8818) closs: 0.7813 (0.8396) time: 2.1280 data: 0.0002 max mem: 28042 -[07:50:38.489617] Epoch: [2] [720/812] lr: 0.000019 grad_norm: 0.7747 (0.8815) closs: 0.7818 (0.8397) time: 2.1255 data: 0.0002 max mem: 28042 -[07:50:59.854958] Epoch: [2] [730/812] lr: 0.000018 grad_norm: 0.8660 (0.8817) closs: 0.8288 (0.8405) time: 2.1357 data: 0.0002 max mem: 28042 -[07:51:21.193447] Epoch: [2] [740/812] lr: 0.000018 grad_norm: 0.8745 (0.8813) closs: 0.8010 (0.8404) time: 2.1351 data: 0.0002 max mem: 28042 -[07:51:42.504050] Epoch: [2] [750/812] lr: 0.000018 grad_norm: 0.8117 (0.8810) closs: 0.8010 (0.8404) time: 2.1324 data: 0.0002 max mem: 28042 -[07:52:03.789461] Epoch: [2] [760/812] lr: 0.000018 grad_norm: 0.8396 (0.8808) closs: 0.8350 (0.8399) time: 2.1297 data: 0.0002 max mem: 28042 -[07:52:24.867624] Epoch: [2] [770/812] lr: 0.000017 grad_norm: 0.8502 (0.8811) closs: 0.8475 (0.8405) time: 2.1181 data: 0.0002 max mem: 28042 -[07:52:46.212501] Epoch: [2] [780/812] lr: 0.000017 grad_norm: 0.8794 (0.8809) closs: 0.8875 (0.8410) time: 2.1211 data: 0.0002 max mem: 28042 -[07:53:07.550117] Epoch: [2] [790/812] lr: 0.000017 grad_norm: 0.7999 (0.8805) closs: 0.8674 (0.8414) time: 2.1341 data: 0.0001 max mem: 28042 -[07:53:28.932999] Epoch: [2] [800/812] lr: 0.000017 grad_norm: 0.8289 (0.8808) closs: 0.8283 (0.8416) time: 2.1359 data: 0.0001 max mem: 28042 -[07:53:50.278871] Epoch: [2] [810/812] lr: 0.000016 grad_norm: 0.8302 (0.8809) closs: 0.8504 (0.8421) time: 2.1364 data: 0.0001 max mem: 28042 -[07:53:52.714588] Epoch: [2] Total time: 0:28:51 -[07:53:52.730137] Averaged stats: lr: 0.000016 grad_norm: 0.8289 (0.8807) closs: 0.8552 (0.8424) -[07:53:53.203661] model saved -[07:53:54.901871] optimizer saved -[07:53:54.902645] other rank-common saved -[07:53:54.906903] rank-specific saved -[07:53:54.917520] log_dir: ./output_dir -[07:53:58.191492] Epoch: [3] [0/812] lr: 0.000016 grad_norm: 1.0490 (1.0490) closs: 0.6441 (0.6441) time: 3.2729 data: 1.1646 max mem: 28042 -[07:54:19.479554] Epoch: [3] [10/812] lr: 0.000016 grad_norm: 0.8902 (0.9635) closs: 0.9038 (0.8853) time: 2.2327 data: 0.1061 max mem: 28042 -[07:54:40.626378] Epoch: [3] [20/812] lr: 0.000016 grad_norm: 0.8645 (0.9616) closs: 0.8700 (0.8582) time: 2.1217 data: 0.0002 max mem: 28042 -[07:55:01.731665] Epoch: [3] [30/812] lr: 0.000016 grad_norm: 0.8400 (0.9256) closs: 0.8400 (0.8562) time: 2.1125 data: 0.0002 max mem: 28042 -[07:55:22.885001] Epoch: [3] [40/812] lr: 0.000015 grad_norm: 0.8370 (0.9113) closs: 0.8383 (0.8448) time: 2.1129 data: 0.0002 max mem: 28042 -[07:55:44.040122] Epoch: [3] [50/812] lr: 0.000015 grad_norm: 0.8304 (0.8985) closs: 0.8214 (0.8448) time: 2.1153 data: 0.0002 max mem: 28042 -[07:56:05.256778] Epoch: [3] [60/812] lr: 0.000015 grad_norm: 0.8304 (0.9031) closs: 0.8439 (0.8496) time: 2.1185 data: 0.0002 max mem: 28042 -[07:56:26.408434] Epoch: [3] [70/812] lr: 0.000015 grad_norm: 0.8670 (0.9090) closs: 0.8136 (0.8462) time: 2.1183 data: 0.0002 max mem: 28042 -[07:56:47.651858] Epoch: [3] [80/812] lr: 0.000014 grad_norm: 0.8294 (0.9039) closs: 0.8295 (0.8458) time: 2.1197 data: 0.0002 max mem: 28042 -[07:57:08.789407] Epoch: [3] [90/812] lr: 0.000014 grad_norm: 0.8278 (0.9153) closs: 0.8492 (0.8447) time: 2.1190 data: 0.0002 max mem: 28042 -[07:57:29.972419] Epoch: [3] [100/812] lr: 0.000014 grad_norm: 0.8774 (0.9115) closs: 0.8209 (0.8421) time: 2.1159 data: 0.0002 max mem: 28042 -[07:57:51.294111] Epoch: [3] [110/812] lr: 0.000014 grad_norm: 0.8553 (0.9095) closs: 0.7868 (0.8363) time: 2.1252 data: 0.0002 max mem: 28042 -[07:58:12.671601] Epoch: [3] [120/812] lr: 0.000013 grad_norm: 0.8556 (0.9050) closs: 0.7868 (0.8408) time: 2.1349 data: 0.0003 max mem: 28042 -[07:58:33.964903] Epoch: [3] [130/812] lr: 0.000013 grad_norm: 0.8585 (0.9045) closs: 0.8151 (0.8379) time: 2.1335 data: 0.0003 max mem: 28042 -[07:58:55.294678] Epoch: [3] [140/812] lr: 0.000013 grad_norm: 0.8542 (0.9024) closs: 0.8471 (0.8414) time: 2.1311 data: 0.0002 max mem: 28042 -[07:59:16.596797] Epoch: [3] [150/812] lr: 0.000013 grad_norm: 0.8433 (0.9080) closs: 0.8486 (0.8407) time: 2.1315 data: 0.0002 max mem: 28042 -[07:59:37.813975] Epoch: [3] [160/812] lr: 0.000012 grad_norm: 0.8266 (0.9031) closs: 0.7934 (0.8408) time: 2.1259 data: 0.0002 max mem: 28042 -[07:59:59.101368] Epoch: [3] [170/812] lr: 0.000012 grad_norm: 0.8266 (0.9000) closs: 0.8247 (0.8395) time: 2.1252 data: 0.0002 max mem: 28042 -[08:00:20.346641] Epoch: [3] [180/812] lr: 0.000012 grad_norm: 0.8709 (0.9001) closs: 0.8101 (0.8379) time: 2.1266 data: 0.0002 max mem: 28042 -[08:00:41.591960] Epoch: [3] [190/812] lr: 0.000012 grad_norm: 0.9020 (0.9010) closs: 0.8010 (0.8377) time: 2.1245 data: 0.0002 max mem: 28042 -[08:01:02.880467] Epoch: [3] [200/812] lr: 0.000012 grad_norm: 0.8726 (0.8965) closs: 0.8493 (0.8391) time: 2.1266 data: 0.0002 max mem: 28042 -[08:01:24.071233] Epoch: [3] [210/812] lr: 0.000011 grad_norm: 0.7843 (0.8941) closs: 0.8166 (0.8381) time: 2.1239 data: 0.0002 max mem: 28042 -[08:01:45.236262] Epoch: [3] [220/812] lr: 0.000011 grad_norm: 0.8370 (0.8924) closs: 0.8164 (0.8396) time: 2.1177 data: 0.0002 max mem: 28042 -[08:02:06.489504] Epoch: [3] [230/812] lr: 0.000011 grad_norm: 0.8645 (0.8946) closs: 0.8308 (0.8402) time: 2.1208 data: 0.0002 max mem: 28042 -[08:02:27.784203] Epoch: [3] [240/812] lr: 0.000011 grad_norm: 0.8777 (0.8973) closs: 0.8308 (0.8405) time: 2.1273 data: 0.0002 max mem: 28042 -[08:02:49.078010] Epoch: [3] [250/812] lr: 0.000011 grad_norm: 0.8627 (0.8957) closs: 0.8112 (0.8399) time: 2.1293 data: 0.0002 max mem: 28042 -[08:03:10.423309] Epoch: [3] [260/812] lr: 0.000010 grad_norm: 0.7976 (0.8924) closs: 0.7705 (0.8383) time: 2.1319 data: 0.0002 max mem: 28042 -[08:03:31.704948] Epoch: [3] [270/812] lr: 0.000010 grad_norm: 0.8373 (0.8915) closs: 0.7730 (0.8388) time: 2.1313 data: 0.0002 max mem: 28042 -[08:03:52.854770] Epoch: [3] [280/812] lr: 0.000010 grad_norm: 0.8283 (0.8906) closs: 0.7952 (0.8401) time: 2.1215 data: 0.0002 max mem: 28042 -[08:04:14.113366] Epoch: [3] [290/812] lr: 0.000010 grad_norm: 0.8470 (0.8935) closs: 0.7852 (0.8392) time: 2.1203 data: 0.0002 max mem: 28042 -[08:04:35.342838] Epoch: [3] [300/812] lr: 0.000010 grad_norm: 0.9499 (0.8946) closs: 0.7852 (0.8366) time: 2.1243 data: 0.0003 max mem: 28042 -[08:04:56.618987] Epoch: [3] [310/812] lr: 0.000010 grad_norm: 0.9211 (0.8958) closs: 0.8185 (0.8384) time: 2.1252 data: 0.0003 max mem: 28042 -[08:05:17.910768] Epoch: [3] [320/812] lr: 0.000009 grad_norm: 0.8922 (0.8954) closs: 0.8641 (0.8389) time: 2.1283 data: 0.0003 max mem: 28042 -[08:05:39.221330] Epoch: [3] [330/812] lr: 0.000009 grad_norm: 0.8283 (0.8942) closs: 0.8408 (0.8387) time: 2.1300 data: 0.0002 max mem: 28042 -[08:06:00.377654] Epoch: [3] [340/812] lr: 0.000009 grad_norm: 0.8666 (0.8945) closs: 0.8742 (0.8393) time: 2.1233 data: 0.0002 max mem: 28042 -[08:06:21.664630] Epoch: [3] [350/812] lr: 0.000009 grad_norm: 0.8586 (0.8937) closs: 0.8742 (0.8399) time: 2.1221 data: 0.0002 max mem: 28042 -[08:06:42.936458] Epoch: [3] [360/812] lr: 0.000009 grad_norm: 0.8542 (0.8931) closs: 0.8490 (0.8404) time: 2.1279 data: 0.0002 max mem: 28042 -[08:07:04.181192] Epoch: [3] [370/812] lr: 0.000009 grad_norm: 0.8234 (0.8933) closs: 0.8577 (0.8406) time: 2.1257 data: 0.0002 max mem: 28042 -[08:07:25.478197] Epoch: [3] [380/812] lr: 0.000008 grad_norm: 0.8222 (0.8926) closs: 0.8238 (0.8396) time: 2.1270 data: 0.0002 max mem: 28042 -[08:07:46.790628] Epoch: [3] [390/812] lr: 0.000008 grad_norm: 0.8357 (0.8927) closs: 0.7723 (0.8386) time: 2.1304 data: 0.0002 max mem: 28042 -[08:08:07.856380] Epoch: [3] [400/812] lr: 0.000008 grad_norm: 0.8908 (0.8945) closs: 0.8394 (0.8390) time: 2.1188 data: 0.0002 max mem: 28042 -[08:08:29.129808] Epoch: [3] [410/812] lr: 0.000008 grad_norm: 0.8908 (0.8952) closs: 0.8187 (0.8376) time: 2.1169 data: 0.0002 max mem: 28042 -[08:08:50.389711] Epoch: [3] [420/812] lr: 0.000008 grad_norm: 0.8459 (0.8954) closs: 0.7789 (0.8370) time: 2.1266 data: 0.0002 max mem: 28042 -[08:09:11.665127] Epoch: [3] [430/812] lr: 0.000008 grad_norm: 0.8785 (0.8963) closs: 0.7809 (0.8356) time: 2.1267 data: 0.0002 max mem: 28042 -[08:09:32.923390] Epoch: [3] [440/812] lr: 0.000008 grad_norm: 0.8872 (0.8970) closs: 0.7809 (0.8346) time: 2.1266 data: 0.0002 max mem: 28042 -[08:09:54.151497] Epoch: [3] [450/812] lr: 0.000007 grad_norm: 0.8938 (0.8975) closs: 0.7968 (0.8343) time: 2.1242 data: 0.0002 max mem: 28042 -[08:10:15.305060] Epoch: [3] [460/812] lr: 0.000007 grad_norm: 0.8952 (0.9105) closs: 0.8061 (0.8337) time: 2.1190 data: 0.0002 max mem: 28042 -[08:10:36.576963] Epoch: [3] [470/812] lr: 0.000007 grad_norm: 0.8742 (0.9107) closs: 0.7998 (0.8334) time: 2.1212 data: 0.0002 max mem: 28042 -[08:10:57.844902] Epoch: [3] [480/812] lr: 0.000007 grad_norm: 0.8667 (0.9107) closs: 0.7924 (0.8329) time: 2.1269 data: 0.0002 max mem: 28042 -[08:11:19.115588] Epoch: [3] [490/812] lr: 0.000007 grad_norm: 0.8655 (0.9100) closs: 0.7802 (0.8330) time: 2.1269 data: 0.0002 max mem: 28042 -[08:11:40.448298] Epoch: [3] [500/812] lr: 0.000007 grad_norm: 0.8168 (0.9101) closs: 0.8021 (0.8326) time: 2.1301 data: 0.0005 max mem: 28042 -[08:12:01.736491] Epoch: [3] [510/812] lr: 0.000007 grad_norm: 0.8157 (0.9084) closs: 0.8268 (0.8339) time: 2.1310 data: 0.0004 max mem: 28042 -[08:12:22.852744] Epoch: [3] [520/812] lr: 0.000007 grad_norm: 0.8698 (0.9092) closs: 0.9299 (0.8346) time: 2.1201 data: 0.0002 max mem: 28042 -[08:12:44.116451] Epoch: [3] [530/812] lr: 0.000006 grad_norm: 0.8853 (0.9092) closs: 0.7947 (0.8340) time: 2.1189 data: 0.0002 max mem: 28042 -[08:13:05.415617] Epoch: [3] [540/812] lr: 0.000006 grad_norm: 0.8769 (0.9088) closs: 0.7747 (0.8337) time: 2.1281 data: 0.0002 max mem: 28042 -[08:13:26.702689] Epoch: [3] [550/812] lr: 0.000006 grad_norm: 0.8467 (0.9087) closs: 0.8074 (0.8329) time: 2.1292 data: 0.0002 max mem: 28042 -[08:13:47.965371] Epoch: [3] [560/812] lr: 0.000006 grad_norm: 0.8314 (0.9073) closs: 0.8062 (0.8323) time: 2.1274 data: 0.0002 max mem: 28042 -[08:14:09.281838] Epoch: [3] [570/812] lr: 0.000006 grad_norm: 0.8763 (0.9081) closs: 0.8191 (0.8329) time: 2.1289 data: 0.0002 max mem: 28042 -[08:14:30.423561] Epoch: [3] [580/812] lr: 0.000006 grad_norm: 0.9098 (0.9078) closs: 0.8414 (0.8333) time: 2.1228 data: 0.0002 max mem: 28042 -[08:14:51.732903] Epoch: [3] [590/812] lr: 0.000006 grad_norm: 0.8236 (0.9058) closs: 0.8080 (0.8328) time: 2.1225 data: 0.0002 max mem: 28042 -[08:15:13.032032] Epoch: [3] [600/812] lr: 0.000006 grad_norm: 0.8071 (0.9055) closs: 0.7610 (0.8326) time: 2.1303 data: 0.0002 max mem: 28042 -[08:15:34.338684] Epoch: [3] [610/812] lr: 0.000006 grad_norm: 0.8387 (0.9068) closs: 0.8489 (0.8333) time: 2.1302 data: 0.0002 max mem: 28042 -[08:15:55.610535] Epoch: [3] [620/812] lr: 0.000006 grad_norm: 0.8382 (0.9067) closs: 0.8438 (0.8335) time: 2.1289 data: 0.0002 max mem: 28042 -[08:16:16.889308] Epoch: [3] [630/812] lr: 0.000006 grad_norm: 0.8850 (0.9083) closs: 0.7955 (0.8327) time: 2.1275 data: 0.0002 max mem: 28042 -[08:16:38.059748] Epoch: [3] [640/812] lr: 0.000006 grad_norm: 0.8985 (0.9078) closs: 0.7955 (0.8326) time: 2.1224 data: 0.0002 max mem: 28042 -[08:16:59.340361] Epoch: [3] [650/812] lr: 0.000005 grad_norm: 0.8872 (0.9074) closs: 0.8339 (0.8337) time: 2.1225 data: 0.0002 max mem: 28042 -[08:17:20.632265] Epoch: [3] [660/812] lr: 0.000005 grad_norm: 0.8939 (0.9068) closs: 0.8938 (0.8344) time: 2.1286 data: 0.0002 max mem: 28042 -[08:17:41.842573] Epoch: [3] [670/812] lr: 0.000005 grad_norm: 0.8939 (0.9067) closs: 0.8269 (0.8338) time: 2.1250 data: 0.0002 max mem: 28042 -[08:18:03.079831] Epoch: [3] [680/812] lr: 0.000005 grad_norm: 0.8363 (0.9068) closs: 0.8251 (0.8344) time: 2.1223 data: 0.0002 max mem: 28042 -[08:18:24.338829] Epoch: [3] [690/812] lr: 0.000005 grad_norm: 0.8847 (0.9081) closs: 0.8305 (0.8341) time: 2.1247 data: 0.0002 max mem: 28042 -[08:18:45.483996] Epoch: [3] [700/812] lr: 0.000005 grad_norm: 0.8847 (0.9070) closs: 0.8262 (0.8343) time: 2.1201 data: 0.0002 max mem: 28042 -[08:19:06.675603] Epoch: [3] [710/812] lr: 0.000005 grad_norm: 0.8213 (0.9068) closs: 0.8601 (0.8347) time: 2.1168 data: 0.0002 max mem: 28042 -[08:19:27.902064] Epoch: [3] [720/812] lr: 0.000005 grad_norm: 0.8174 (0.9057) closs: 0.8440 (0.8350) time: 2.1208 data: 0.0002 max mem: 28042 -[08:19:49.131969] Epoch: [3] [730/812] lr: 0.000005 grad_norm: 0.8260 (0.9056) closs: 0.8061 (0.8343) time: 2.1227 data: 0.0002 max mem: 28042 -[08:20:10.427471] Epoch: [3] [740/812] lr: 0.000005 grad_norm: 0.8716 (0.9058) closs: 0.8154 (0.8347) time: 2.1262 data: 0.0002 max mem: 28042 -[08:20:31.811740] Epoch: [3] [750/812] lr: 0.000005 grad_norm: 0.8902 (0.9072) closs: 0.8183 (0.8343) time: 2.1339 data: 0.0002 max mem: 28042 -[08:20:52.966245] Epoch: [3] [760/812] lr: 0.000005 grad_norm: 0.8902 (0.9094) closs: 0.8189 (0.8348) time: 2.1269 data: 0.0002 max mem: 28042 -[08:21:14.257344] Epoch: [3] [770/812] lr: 0.000005 grad_norm: 0.8943 (0.9086) closs: 0.8266 (0.8348) time: 2.1222 data: 0.0002 max mem: 28042 -[08:21:35.557339] Epoch: [3] [780/812] lr: 0.000005 grad_norm: 0.8404 (0.9086) closs: 0.8143 (0.8347) time: 2.1295 data: 0.0002 max mem: 28042 -[08:21:56.831315] Epoch: [3] [790/812] lr: 0.000005 grad_norm: 0.8876 (0.9092) closs: 0.8193 (0.8347) time: 2.1286 data: 0.0002 max mem: 28042 -[08:22:18.140914] Epoch: [3] [800/812] lr: 0.000005 grad_norm: 0.8628 (0.9097) closs: 0.8552 (0.8350) time: 2.1291 data: 0.0002 max mem: 28042 -[08:22:39.368185] Epoch: [3] [810/812] lr: 0.000005 grad_norm: 0.8627 (0.9094) closs: 0.9012 (0.8357) time: 2.1268 data: 0.0002 max mem: 28042 -[08:22:41.821347] Epoch: [3] Total time: 0:28:46 -[08:22:41.825202] Averaged stats: lr: 0.000005 grad_norm: 0.8627 (0.9092) closs: 0.9007 (0.8364) -[08:22:42.226562] model saved -[08:22:43.924087] optimizer saved -[08:22:43.925003] other rank-common saved -[08:22:43.931834] rank-specific saved -[08:22:43.932142] Training time 1:55:22 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.model.pth deleted file mode 100644 index f050e55b07facfaec1e5a03de296bde911239050..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7d5c15a2808412d27835f14c0fc05ef1fc310923302af1fb9c02c445a59b7304 -size 58162939 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 6aa599035681e620d1e8181b9ffd84a7bf151194..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f565b9183e251c048fae68754e0b1a015c651fd43b0cd71a3982f9b4d1077dd -size 130819127 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.other.pth deleted file mode 100644 index e317414ddacd9d596ca17754b1fb347e8c620137..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:11ad37201918bfc6a749b9c14caf28e560ac54b89a2d0fa3be30e50c49fab9cd -size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.model.pth deleted file mode 100644 index 331afc6d009c4c4bdce415e1e05fe0a7c6fd04f8..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4c997936cd82af0a6012259bafc3d2e9e2a6c0acd90f8762ec953c18c052c8c4 -size 58162939 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 9f5c4bf62d29d14a7652cf8b73c31183e5b6b245..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:586da76697344d1409f6d0331176114e72fd47b98091fd6c258a4792eea01a82 -size 130819127 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.other.pth deleted file mode 100644 index 2c36cacd3ff477892660b4bdf6361cc4ab5bf40a..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e6d32e2b269680e43657c7c735c0be6f8fd7672e839232b8381fccdf09d36792 -size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.model.pth deleted file mode 100644 index e4a9509a579916c44cb1ef97e02d070092a0fe9a..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c9b7acb10eef238d30c123f47b15af4c9c443ef414ab5f8737674883bf3d5189 -size 58162939 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 90e77163e87c30cafa46def5bdb669dc533a6005..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:57e071734f60c90e438181c8ae0d2dd00a0cf7081ac674445bbeec548315a98a -size 130819127 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.other.pth deleted file mode 100644 index e415f2ef15b0b4d08b397865c99738db07bf0a07..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3da9395be70ad1f7631e92aec6d4efc745e19ca78afde1004e4193976aea056d -size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.model.pth deleted file mode 100644 index e71af1b7c315b37a4aea851461f8dcefbd6e27d7..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.model.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:71aece4919e4237bb2920f0dfd74e23ac51a04fd74a56ab8efcab06962a6d3cd -size 58162939 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.optimizer.pth deleted file mode 100644 index 2da15d2c95c60871c58625ee883ee9fff7857cc6..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.optimizer.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9f8c23cd50d34ab5d423af68a8e4a2fa3ba11191f6cbd5146638cc478d9f3bb6 -size 130819127 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.other.pth deleted file mode 100644 index b9125500778ab300ca76c74d5da10a548c4985af..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.other.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f931fea34d05be0494c0ab9718bce37ad922d058b64b820c329cd71d918f3a70 -size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00000-of-00008.pth deleted file mode 100644 index 52b71af1a9ce3ed182e1185cac54dc42f12a5fb6..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00000-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00001-of-00008.pth deleted file mode 100644 index 20d239dfd49c5dfac4b0e9262df10a199c383e22..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00001-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00002-of-00008.pth deleted file mode 100644 index 44d15a9615f46731b4d1be2302ed11c2e22c5889..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00002-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00003-of-00008.pth deleted file mode 100644 index c02a05b764b46a3e2ea7f50bab8449d0128a76d9..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00003-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00004-of-00008.pth deleted file mode 100644 index f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00004-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00005-of-00008.pth deleted file mode 100644 index 93470a083d27c6e079dfb735e0a4fa8b7f6b0249..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00005-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00006-of-00008.pth deleted file mode 100644 index 90e3ca8659ab49b709193c41ea8923e9f7217d09..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00006-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00007-of-00008.pth deleted file mode 100644 index 6530350b10d02e206562d6d0b29a46a26d742899..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00007-of-00008.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce -size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/log.txt b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/log.txt deleted file mode 100644 index 641deafc203ae506ccbc5ac592b83c4532ba0041..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/log.txt +++ /dev/null @@ -1,4 +0,0 @@ -{"train_lr": 2.49692118226601e-05, "train_grad_norm": 0.6733022602687916, "train_closs": 0.9640464621499695, "epoch": 0, "val_lr": 2.49692118226601e-05, "val_grad_norm": 0.6733022602687916, "val_closs": 0.9640464621499695} -{"train_lr": 4.6114274981403966e-05, "train_grad_norm": 0.5561584743299508, "train_closs": 0.9136834735638035, "epoch": 1, "val_lr": 4.6114274981403966e-05, "val_grad_norm": 0.5561584743299508, "val_closs": 0.9136834735638035} -{"train_lr": 2.751385467980297e-05, "train_grad_norm": 0.5519019591764276, "train_closs": 0.9010383364437102, "epoch": 2, "val_lr": 2.751385467980297e-05, "val_grad_norm": 0.5519019591764276, "val_closs": 0.9010383364437102} -{"train_lr": 8.899579698398978e-06, "train_grad_norm": 0.5554111024796082, "train_closs": 0.8950173630897561, "epoch": 3, "val_lr": 8.899579698398978e-06, "val_grad_norm": 0.5554111024796082, "val_closs": 0.8950173630897561} diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/output.log b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/output.log deleted file mode 100644 index 9d6d14604302702cadfc48b3ace8125e28a27c43..0000000000000000000000000000000000000000 --- a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/output.log +++ /dev/null @@ -1,648 +0,0 @@ -WARNING:torch.distributed.run: -***************************************** -Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -***************************************** -| distributed init (rank 1): env://, gpu 1 -| distributed init (rank 6): env://, gpu 6 -| distributed init (rank 4): env://, gpu 4 -| distributed init (rank 7): env://, gpu 7 -| distributed init (rank 3): env://, gpu 3 -| distributed init (rank 0): env://, gpu 0 -| distributed init (rank 2): env://, gpu 2 -| distributed init (rank 5): env://, gpu 5 -[04:34:34.123451] > initializing model parallel with size 1 -[04:34:34.123679] > initializing ddp with size 8 -[04:34:34.123687] > initializing pipeline with size 1 -[04:34:34.273785] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory -[04:34:34.273908] Namespace(batch_size=8, -accum_iter=1, -llama_type='llama_peft', -llama_config=['../checkpoints/llama2/Llama-2-7b/params.json', -'configs/model/finetune/sg/llamaPeft_normBiasLora.json'], -no_visual=True, -tokenizer_path='../checkpoints/llama2/Llama-2-7b/tokenizer.model', -pretrained_path='../checkpoints/llama2/Llama-2-7b/', -pretrained_type='meta_ori', -weight_decay=0.02, -lr=5e-05, -min_lr=5e-06, -epochs=4, -warmup_epochs=1.0, -clip_grad=2, -max_words=512, -dialog=False, -data_config='configs/data/finetune/sg/alpaca.yaml', -output_dir='output/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B', -log_dir='./output_dir', -save_interval=1, -only_save_trainable=True, -device='cuda', -seed=0, -resume='', -num_workers=24, -pin_mem=True, -world_size=8, -local_rank=-1, -dist_on_itp=False, -dist_url='env://', -model_parallel_size=1, -data_parallel='sdp', -precision='bf16', -checkpointing=True, -quant=True, -rank=0, -gpu=0, -distributed=True, -dist_backend='nccl') -[04:34:34.276112] Start initialization. -[04:34:34.276172] ## Processing on RANK 0. -[04:34:34.285967] Model Args: - ModelArgs(dim=4096, n_layers=32, n_heads=32, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=16, bias_tuning=True) -[04:35:23.173165] Model is Peft: True -[04:35:23.179946] Trainable parameter count : 41603072 (local rank), 41603072 (all). -[04:35:23.288029] ## Load pretrained from ../checkpoints/llama2/Llama-2-7b/ -[04:35:40.542782] ## Quantizing model to 4bit! - Qunatization Process: 0%| | 0/839 [00:00 -[04:49:56.134049] Start training for 4 epochs -[04:49:56.141961] log_dir: ./output_dir -[04:50:00.993524] Epoch: [0] [0/812] lr: 0.000000 grad_norm: 1.6694 (1.6694) closs: 1.1100 (1.1100) time: 4.8504 data: 1.6775 max mem: 11767 -[04:50:13.467172] Epoch: [0] [10/812] lr: 0.000001 grad_norm: 1.7142 (1.6843) closs: 1.1138 (1.1233) time: 1.5749 data: 0.1527 max mem: 17666 -[04:50:25.645879] Epoch: [0] [20/812] lr: 0.000001 grad_norm: 1.6067 (1.6129) closs: 1.0717 (1.1035) time: 1.2325 data: 0.0002 max mem: 17666 -[04:50:37.815063] Epoch: [0] [30/812] lr: 0.000002 grad_norm: 1.6067 (1.6224) closs: 1.1138 (1.1220) time: 1.2173 data: 0.0002 max mem: 17666 -[04:50:50.097974] Epoch: [0] [40/812] lr: 0.000002 grad_norm: 1.5736 (1.5833) closs: 1.1291 (1.1294) time: 1.2225 data: 0.0001 max mem: 17666 -[04:51:02.303652] Epoch: [0] [50/812] lr: 0.000003 grad_norm: 1.3909 (1.5353) closs: 1.1291 (1.1312) time: 1.2244 data: 0.0002 max mem: 17666 -[04:51:14.652585] Epoch: [0] [60/812] lr: 0.000004 grad_norm: 1.2569 (1.4798) closs: 1.1374 (1.1263) time: 1.2277 data: 0.0002 max mem: 17666 -[04:51:26.911672] Epoch: [0] [70/812] lr: 0.000004 grad_norm: 1.1196 (1.4223) closs: 1.0760 (1.1289) time: 1.2303 data: 0.0002 max mem: 17666 -[04:51:39.110388] Epoch: [0] [80/812] lr: 0.000005 grad_norm: 0.9717 (1.3646) closs: 1.0850 (1.1260) time: 1.2228 data: 0.0002 max mem: 17666 -[04:51:51.365652] Epoch: [0] [90/812] lr: 0.000006 grad_norm: 0.8888 (1.3056) closs: 1.0433 (1.1139) time: 1.2226 data: 0.0002 max mem: 17666 -[04:52:03.587350] Epoch: [0] [100/812] lr: 0.000006 grad_norm: 0.7781 (1.2548) closs: 1.0227 (1.1095) time: 1.2238 data: 0.0002 max mem: 17666 -[04:52:15.847329] Epoch: [0] [110/812] lr: 0.000007 grad_norm: 0.7166 (1.2090) closs: 1.0220 (1.0972) time: 1.2240 data: 0.0002 max mem: 17666 -[04:52:28.053228] Epoch: [0] [120/812] lr: 0.000007 grad_norm: 0.7216 (1.1685) closs: 1.0220 (1.0963) time: 1.2232 data: 0.0002 max mem: 17666 -[04:52:40.306358] Epoch: [0] [130/812] lr: 0.000008 grad_norm: 0.7216 (1.1348) closs: 1.0442 (1.0918) time: 1.2229 data: 0.0002 max mem: 17666 -[04:52:52.564879] Epoch: [0] [140/812] lr: 0.000009 grad_norm: 0.6922 (1.1040) closs: 0.9522 (1.0824) time: 1.2255 data: 0.0002 max mem: 17666 -[04:53:04.777107] Epoch: [0] [150/812] lr: 0.000009 grad_norm: 0.6689 (1.0747) closs: 0.9576 (1.0768) time: 1.2235 data: 0.0002 max mem: 17666 -[04:53:17.031527] Epoch: [0] [160/812] lr: 0.000010 grad_norm: 0.6162 (1.0461) closs: 0.9987 (1.0734) time: 1.2233 data: 0.0002 max mem: 17666 -[04:53:29.214924] Epoch: [0] [170/812] lr: 0.000010 grad_norm: 0.6162 (1.0222) closs: 0.9767 (1.0659) time: 1.2218 data: 0.0002 max mem: 17666 -[04:53:41.435558] Epoch: [0] [180/812] lr: 0.000011 grad_norm: 0.6274 (0.9998) closs: 0.9133 (1.0574) time: 1.2201 data: 0.0002 max mem: 17666 -[04:53:53.641153] Epoch: [0] [190/812] lr: 0.000012 grad_norm: 0.5874 (0.9789) closs: 0.9174 (1.0514) time: 1.2212 data: 0.0002 max mem: 17666 -[04:54:05.868672] Epoch: [0] [200/812] lr: 0.000012 grad_norm: 0.5874 (0.9605) closs: 0.9700 (1.0485) time: 1.2216 data: 0.0002 max mem: 17666 -[04:54:18.107142] Epoch: [0] [210/812] lr: 0.000013 grad_norm: 0.5872 (0.9425) closs: 0.9847 (1.0443) time: 1.2232 data: 0.0002 max mem: 17666 -[04:54:30.330679] Epoch: [0] [220/812] lr: 0.000014 grad_norm: 0.5699 (0.9259) closs: 0.9661 (1.0414) time: 1.2230 data: 0.0001 max mem: 17666 -[04:54:42.605734] Epoch: [0] [230/812] lr: 0.000014 grad_norm: 0.5536 (0.9112) closs: 0.9191 (1.0351) time: 1.2249 data: 0.0001 max mem: 17666 -[04:54:54.840846] Epoch: [0] [240/812] lr: 0.000015 grad_norm: 0.5524 (0.8976) closs: 0.9078 (1.0315) time: 1.2254 data: 0.0002 max mem: 17666 -[04:55:07.114906] Epoch: [0] [250/812] lr: 0.000015 grad_norm: 0.5982 (0.8887) closs: 0.9491 (1.0288) time: 1.2254 data: 0.0002 max mem: 17666 -[04:55:19.357896] Epoch: [0] [260/812] lr: 0.000016 grad_norm: 0.5592 (0.8752) closs: 0.9431 (1.0253) time: 1.2258 data: 0.0002 max mem: 17666 -[04:55:31.788387] Epoch: [0] [270/812] lr: 0.000017 grad_norm: 0.5482 (0.8643) closs: 0.9608 (1.0244) time: 1.2336 data: 0.0002 max mem: 17666 -[04:55:44.053664] Epoch: [0] [280/812] lr: 0.000017 grad_norm: 0.5581 (0.8548) closs: 0.9608 (1.0214) time: 1.2347 data: 0.0001 max mem: 17666 -[04:55:56.289359] Epoch: [0] [290/812] lr: 0.000018 grad_norm: 0.5606 (0.8459) closs: 0.9531 (1.0183) time: 1.2250 data: 0.0001 max mem: 17666 -[04:56:08.574032] Epoch: [0] [300/812] lr: 0.000018 grad_norm: 0.5730 (0.8372) closs: 0.9457 (1.0158) time: 1.2260 data: 0.0001 max mem: 17666 -[04:56:20.819959] Epoch: [0] [310/812] lr: 0.000019 grad_norm: 0.5730 (0.8287) closs: 0.9613 (1.0152) time: 1.2265 data: 0.0002 max mem: 17666 -[04:56:33.077045] Epoch: [0] [320/812] lr: 0.000020 grad_norm: 0.5720 (0.8213) closs: 0.9613 (1.0149) time: 1.2251 data: 0.0002 max mem: 17666 -[04:56:45.304449] Epoch: [0] [330/812] lr: 0.000020 grad_norm: 0.5427 (0.8132) closs: 0.9252 (1.0109) time: 1.2241 data: 0.0002 max mem: 17666 -[04:56:57.563745] Epoch: [0] [340/812] lr: 0.000021 grad_norm: 0.5427 (0.8059) closs: 0.9062 (1.0078) time: 1.2242 data: 0.0002 max mem: 17666 -[04:57:09.823556] Epoch: [0] [350/812] lr: 0.000022 grad_norm: 0.5474 (0.7994) closs: 0.9406 (1.0088) time: 1.2259 data: 0.0002 max mem: 17666 -[04:57:22.044133] Epoch: [0] [360/812] lr: 0.000022 grad_norm: 0.5634 (0.7934) closs: 0.9744 (1.0071) time: 1.2240 data: 0.0002 max mem: 17666 -[04:57:34.303848] Epoch: [0] [370/812] lr: 0.000023 grad_norm: 0.5715 (0.7872) closs: 0.9075 (1.0046) time: 1.2239 data: 0.0002 max mem: 17666 -[04:57:46.512890] Epoch: [0] [380/812] lr: 0.000023 grad_norm: 0.5607 (0.7815) closs: 0.9075 (1.0032) time: 1.2234 data: 0.0002 max mem: 17666 -[04:57:58.775654] Epoch: [0] [390/812] lr: 0.000024 grad_norm: 0.5731 (0.7765) closs: 0.9216 (1.0017) time: 1.2235 data: 0.0002 max mem: 17666 -[04:58:11.005032] Epoch: [0] [400/812] lr: 0.000025 grad_norm: 0.5846 (0.7721) closs: 0.9061 (1.0003) time: 1.2245 data: 0.0002 max mem: 17666 -[04:58:23.252387] Epoch: [0] [410/812] lr: 0.000025 grad_norm: 0.5909 (0.7688) closs: 0.9044 (0.9990) time: 1.2238 data: 0.0002 max mem: 17666 -[04:58:35.540811] Epoch: [0] [420/812] lr: 0.000026 grad_norm: 0.5704 (0.7638) closs: 0.8909 (0.9968) time: 1.2267 data: 0.0002 max mem: 17666 -[04:58:47.754683] Epoch: [0] [430/812] lr: 0.000026 grad_norm: 0.5641 (0.7602) closs: 0.8689 (0.9940) time: 1.2250 data: 0.0002 max mem: 17666 -[04:59:00.034244] Epoch: [0] [440/812] lr: 0.000027 grad_norm: 0.5588 (0.7557) closs: 0.8496 (0.9914) time: 1.2246 data: 0.0002 max mem: 17666 -[04:59:12.243723] Epoch: [0] [450/812] lr: 0.000028 grad_norm: 0.5588 (0.7518) closs: 0.8680 (0.9887) time: 1.2244 data: 0.0002 max mem: 17666 -[04:59:24.510184] Epoch: [0] [460/812] lr: 0.000028 grad_norm: 0.5679 (0.7476) closs: 0.8964 (0.9874) time: 1.2237 data: 0.0002 max mem: 17666 -[04:59:36.736038] Epoch: [0] [470/812] lr: 0.000029 grad_norm: 0.5676 (0.7444) closs: 0.9041 (0.9869) time: 1.2245 data: 0.0002 max mem: 17666 -[04:59:48.991058] Epoch: [0] [480/812] lr: 0.000030 grad_norm: 0.5570 (0.7411) closs: 0.9610 (0.9865) time: 1.2240 data: 0.0002 max mem: 17666 -[05:00:01.241427] Epoch: [0] [490/812] lr: 0.000030 grad_norm: 0.5518 (0.7368) closs: 0.9518 (0.9857) time: 1.2252 data: 0.0002 max mem: 17666 -[05:00:13.456707] Epoch: [0] [500/812] lr: 0.000031 grad_norm: 0.5348 (0.7332) closs: 0.9462 (0.9844) time: 1.2232 data: 0.0002 max mem: 17666 -[05:00:25.704700] Epoch: [0] [510/812] lr: 0.000031 grad_norm: 0.5677 (0.7322) closs: 0.9125 (0.9823) time: 1.2231 data: 0.0002 max mem: 17666 -[05:00:37.923520] Epoch: [0] [520/812] lr: 0.000032 grad_norm: 0.5883 (0.7326) closs: 0.8610 (0.9811) time: 1.2233 data: 0.0002 max mem: 17666 -[05:00:50.181172] Epoch: [0] [530/812] lr: 0.000033 grad_norm: 0.5724 (0.7299) closs: 0.9259 (0.9802) time: 1.2238 data: 0.0002 max mem: 17666 -[05:01:02.403659] Epoch: [0] [540/812] lr: 0.000033 grad_norm: 0.5683 (0.7269) closs: 0.9560 (0.9795) time: 1.2239 data: 0.0002 max mem: 17666 -[05:01:14.663952] Epoch: [0] [550/812] lr: 0.000034 grad_norm: 0.5575 (0.7241) closs: 0.9308 (0.9781) time: 1.2241 data: 0.0002 max mem: 17666 -[05:01:26.950895] Epoch: [0] [560/812] lr: 0.000034 grad_norm: 0.5575 (0.7212) closs: 0.8756 (0.9775) time: 1.2273 data: 0.0003 max mem: 17666 -[05:01:39.165892] Epoch: [0] [570/812] lr: 0.000035 grad_norm: 0.5441 (0.7187) closs: 0.9025 (0.9766) time: 1.2250 data: 0.0003 max mem: 17666 -[05:01:51.405903] Epoch: [0] [580/812] lr: 0.000036 grad_norm: 0.5507 (0.7162) closs: 0.9051 (0.9764) time: 1.2227 data: 0.0002 max mem: 17666 -[05:02:03.610466] Epoch: [0] [590/812] lr: 0.000036 grad_norm: 0.5490 (0.7134) closs: 0.9051 (0.9755) time: 1.2222 data: 0.0002 max mem: 17666 -[05:02:15.849292] Epoch: [0] [600/812] lr: 0.000037 grad_norm: 0.5563 (0.7107) closs: 0.9230 (0.9747) time: 1.2221 data: 0.0002 max mem: 17666 -[05:02:28.073108] Epoch: [0] [610/812] lr: 0.000038 grad_norm: 0.5737 (0.7087) closs: 0.8677 (0.9726) time: 1.2231 data: 0.0003 max mem: 17666 -[05:02:40.301764] Epoch: [0] [620/812] lr: 0.000038 grad_norm: 0.5858 (0.7066) closs: 0.8455 (0.9718) time: 1.2226 data: 0.0003 max mem: 17666 -[05:02:52.540493] Epoch: [0] [630/812] lr: 0.000039 grad_norm: 0.5850 (0.7049) closs: 0.8993 (0.9707) time: 1.2233 data: 0.0002 max mem: 17666 -[05:03:04.773003] Epoch: [0] [640/812] lr: 0.000039 grad_norm: 0.5764 (0.7030) closs: 0.9169 (0.9703) time: 1.2235 data: 0.0002 max mem: 17666 -[05:03:17.077033] Epoch: [0] [650/812] lr: 0.000040 grad_norm: 0.5656 (0.7012) closs: 0.9087 (0.9689) time: 1.2268 data: 0.0002 max mem: 17666 -[05:03:29.323502] Epoch: [0] [660/812] lr: 0.000041 grad_norm: 0.5623 (0.6993) closs: 0.8749 (0.9684) time: 1.2274 data: 0.0002 max mem: 17666 -[05:03:41.591619] Epoch: [0] [670/812] lr: 0.000041 grad_norm: 0.5259 (0.6967) closs: 0.8953 (0.9678) time: 1.2257 data: 0.0002 max mem: 17666 -[05:03:53.817646] Epoch: [0] [680/812] lr: 0.000042 grad_norm: 0.5327 (0.6945) closs: 0.9142 (0.9673) time: 1.2246 data: 0.0002 max mem: 17666 -[05:04:06.075349] Epoch: [0] [690/812] lr: 0.000042 grad_norm: 0.5405 (0.6927) closs: 0.9243 (0.9669) time: 1.2241 data: 0.0002 max mem: 17666 -[05:04:18.348610] Epoch: [0] [700/812] lr: 0.000043 grad_norm: 0.5495 (0.6909) closs: 0.9243 (0.9666) time: 1.2265 data: 0.0002 max mem: 17666 -[05:04:30.558726] Epoch: [0] [710/812] lr: 0.000044 grad_norm: 0.5495 (0.6890) closs: 0.8956 (0.9654) time: 1.2241 data: 0.0002 max mem: 17666 -[05:04:42.827196] Epoch: [0] [720/812] lr: 0.000044 grad_norm: 0.5697 (0.6876) closs: 0.8870 (0.9649) time: 1.2239 data: 0.0002 max mem: 17666 -[05:04:55.071487] Epoch: [0] [730/812] lr: 0.000045 grad_norm: 0.5697 (0.6858) closs: 0.8934 (0.9640) time: 1.2256 data: 0.0002 max mem: 17666 -[05:05:07.340719] Epoch: [0] [740/812] lr: 0.000046 grad_norm: 0.5596 (0.6840) closs: 0.8871 (0.9627) time: 1.2256 data: 0.0002 max mem: 17666 -[05:05:19.598714] Epoch: [0] [750/812] lr: 0.000046 grad_norm: 0.5451 (0.6823) closs: 0.9205 (0.9636) time: 1.2263 data: 0.0002 max mem: 17666 -[05:05:31.859047] Epoch: [0] [760/812] lr: 0.000047 grad_norm: 0.5443 (0.6806) closs: 0.9429 (0.9627) time: 1.2258 data: 0.0002 max mem: 17666 -[05:05:44.113014] Epoch: [0] [770/812] lr: 0.000047 grad_norm: 0.5571 (0.6793) closs: 0.9180 (0.9618) time: 1.2256 data: 0.0002 max mem: 17666 -[05:05:56.310853] Epoch: [0] [780/812] lr: 0.000048 grad_norm: 0.5778 (0.6778) closs: 0.9180 (0.9614) time: 1.2225 data: 0.0001 max mem: 17666 -[05:06:08.553058] Epoch: [0] [790/812] lr: 0.000049 grad_norm: 0.5687 (0.6765) closs: 0.8819 (0.9604) time: 1.2219 data: 0.0001 max mem: 17666 -[05:06:20.746249] Epoch: [0] [800/812] lr: 0.000049 grad_norm: 0.5436 (0.6748) closs: 0.8819 (0.9599) time: 1.2217 data: 0.0001 max mem: 17666 -[05:06:33.006020] Epoch: [0] [810/812] lr: 0.000050 grad_norm: 0.5421 (0.6735) closs: 0.8738 (0.9586) time: 1.2226 data: 0.0001 max mem: 17666 -[05:06:34.434061] Epoch: [0] Total time: 0:16:38 -[05:06:34.435967] Averaged stats: lr: 0.000050 grad_norm: 0.5358 (0.6733) closs: 0.8738 (0.9640) -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -[05:06:34.685369] model saved -/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. - warnings.warn( -[05:06:36.004693] optimizer saved -[05:06:36.005298] other rank-common saved -[05:06:36.008307] rank-specific saved -[05:06:36.016492] log_dir: ./output_dir -[05:06:38.409445] Epoch: [1] [0/812] lr: 0.000050 grad_norm: 0.5716 (0.5716) closs: 0.7068 (0.7068) time: 2.3920 data: 1.1581 max mem: 17666 -[05:06:50.657381] Epoch: [1] [10/812] lr: 0.000050 grad_norm: 0.5716 (0.5743) closs: 0.8450 (0.8412) time: 1.3308 data: 0.1054 max mem: 17666 -[05:07:02.826034] Epoch: [1] [20/812] lr: 0.000050 grad_norm: 0.5748 (0.5730) closs: 0.8714 (0.9120) time: 1.2208 data: 0.0001 max mem: 17666 -[05:07:14.934821] Epoch: [1] [30/812] lr: 0.000050 grad_norm: 0.5622 (0.5677) closs: 0.9029 (0.9080) time: 1.2138 data: 0.0001 max mem: 17666 -[05:07:27.078058] Epoch: [1] [40/812] lr: 0.000050 grad_norm: 0.5479 (0.5654) closs: 0.9393 (0.9264) time: 1.2125 data: 0.0001 max mem: 17666 -[05:07:39.214844] Epoch: [1] [50/812] lr: 0.000050 grad_norm: 0.5380 (0.5637) closs: 0.9393 (0.9220) time: 1.2139 data: 0.0001 max mem: 17666 -[05:07:51.352136] Epoch: [1] [60/812] lr: 0.000050 grad_norm: 0.5607 (0.5649) closs: 0.9001 (0.9198) time: 1.2136 data: 0.0001 max mem: 17666 -[05:08:03.517114] Epoch: [1] [70/812] lr: 0.000050 grad_norm: 0.5700 (0.5683) closs: 0.9085 (0.9224) time: 1.2150 data: 0.0001 max mem: 17666 -[05:08:15.729009] Epoch: [1] [80/812] lr: 0.000050 grad_norm: 0.5498 (0.5672) closs: 0.9337 (0.9186) time: 1.2188 data: 0.0001 max mem: 17666 -[05:08:27.942413] Epoch: [1] [90/812] lr: 0.000050 grad_norm: 0.5372 (0.5618) closs: 0.9337 (0.9233) time: 1.2212 data: 0.0002 max mem: 17666 -[05:08:40.177210] Epoch: [1] [100/812] lr: 0.000050 grad_norm: 0.5399 (0.5635) closs: 0.9525 (0.9257) time: 1.2223 data: 0.0002 max mem: 17666 -[05:08:52.395851] Epoch: [1] [110/812] lr: 0.000050 grad_norm: 0.5470 (0.5630) closs: 0.9445 (0.9224) time: 1.2226 data: 0.0002 max mem: 17666 -[05:09:04.620910] Epoch: [1] [120/812] lr: 0.000050 grad_norm: 0.5660 (0.5649) closs: 0.9445 (0.9235) time: 1.2221 data: 0.0002 max mem: 17666 -[05:09:16.823910] Epoch: [1] [130/812] lr: 0.000050 grad_norm: 0.5666 (0.5642) closs: 0.9266 (0.9232) time: 1.2213 data: 0.0002 max mem: 17666 -[05:09:29.044814] Epoch: [1] [140/812] lr: 0.000050 grad_norm: 0.5469 (0.5636) closs: 0.9191 (0.9222) time: 1.2211 data: 0.0002 max mem: 17666 -[05:09:41.250901] Epoch: [1] [150/812] lr: 0.000050 grad_norm: 0.5427 (0.5631) closs: 0.9318 (0.9220) time: 1.2213 data: 0.0002 max mem: 17666 -[05:09:53.461830] Epoch: [1] [160/812] lr: 0.000050 grad_norm: 0.5499 (0.5632) closs: 0.9413 (0.9241) time: 1.2208 data: 0.0002 max mem: 17666 -[05:10:05.704359] Epoch: [1] [170/812] lr: 0.000049 grad_norm: 0.5499 (0.5628) closs: 0.9694 (0.9263) time: 1.2226 data: 0.0002 max mem: 17666 -[05:10:17.913859] Epoch: [1] [180/812] lr: 0.000049 grad_norm: 0.5187 (0.5618) closs: 0.9341 (0.9267) time: 1.2225 data: 0.0002 max mem: 17666 -[05:10:30.152053] Epoch: [1] [190/812] lr: 0.000049 grad_norm: 0.5289 (0.5600) closs: 0.8866 (0.9248) time: 1.2223 data: 0.0002 max mem: 17666 -[05:10:42.370346] Epoch: [1] [200/812] lr: 0.000049 grad_norm: 0.5296 (0.5589) closs: 0.8866 (0.9232) time: 1.2228 data: 0.0002 max mem: 17666 -[05:10:54.621739] Epoch: [1] [210/812] lr: 0.000049 grad_norm: 0.5406 (0.5576) closs: 0.9429 (0.9253) time: 1.2234 data: 0.0002 max mem: 17666 -[05:11:06.854883] Epoch: [1] [220/812] lr: 0.000049 grad_norm: 0.5362 (0.5573) closs: 0.9438 (0.9241) time: 1.2242 data: 0.0002 max mem: 17666 -[05:11:19.064849] Epoch: [1] [230/812] lr: 0.000049 grad_norm: 0.5260 (0.5562) closs: 0.8665 (0.9228) time: 1.2221 data: 0.0002 max mem: 17666 -[05:11:31.300169] Epoch: [1] [240/812] lr: 0.000049 grad_norm: 0.5371 (0.5563) closs: 0.8964 (0.9237) time: 1.2222 data: 0.0001 max mem: 17666 -[05:11:43.496321] Epoch: [1] [250/812] lr: 0.000049 grad_norm: 0.5503 (0.5565) closs: 0.9451 (0.9228) time: 1.2215 data: 0.0001 max mem: 17666 -[05:11:55.764297] Epoch: [1] [260/812] lr: 0.000049 grad_norm: 0.5451 (0.5569) closs: 0.9342 (0.9233) time: 1.2231 data: 0.0002 max mem: 17666 -[05:12:07.971728] Epoch: [1] [270/812] lr: 0.000049 grad_norm: 0.5451 (0.5568) closs: 0.9317 (0.9219) time: 1.2237 data: 0.0002 max mem: 17666 -[05:12:20.187157] Epoch: [1] [280/812] lr: 0.000049 grad_norm: 0.5521 (0.5574) closs: 0.9182 (0.9220) time: 1.2211 data: 0.0002 max mem: 17666 -[05:12:32.426547] Epoch: [1] [290/812] lr: 0.000048 grad_norm: 0.5328 (0.5572) closs: 0.9268 (0.9225) time: 1.2227 data: 0.0002 max mem: 17666 -[05:12:44.641825] Epoch: [1] [300/812] lr: 0.000048 grad_norm: 0.5328 (0.5566) closs: 0.8957 (0.9212) time: 1.2227 data: 0.0002 max mem: 17666 -[05:12:56.872549] Epoch: [1] [310/812] lr: 0.000048 grad_norm: 0.5379 (0.5574) closs: 0.8875 (0.9203) time: 1.2222 data: 0.0002 max mem: 17666 -[05:13:09.087618] Epoch: [1] [320/812] lr: 0.000048 grad_norm: 0.5465 (0.5576) closs: 0.8875 (0.9207) time: 1.2222 data: 0.0002 max mem: 17666 -[05:13:21.321873] Epoch: [1] [330/812] lr: 0.000048 grad_norm: 0.5207 (0.5569) closs: 0.8802 (0.9200) time: 1.2224 data: 0.0002 max mem: 17666 -[05:13:33.555776] Epoch: [1] [340/812] lr: 0.000048 grad_norm: 0.5206 (0.5569) closs: 0.8556 (0.9179) time: 1.2233 data: 0.0002 max mem: 17666 -[05:13:45.822566] Epoch: [1] [350/812] lr: 0.000048 grad_norm: 0.5441 (0.5565) closs: 0.8780 (0.9175) time: 1.2250 data: 0.0001 max mem: 17666 -[05:13:58.058935] Epoch: [1] [360/812] lr: 0.000048 grad_norm: 0.5441 (0.5563) closs: 0.8524 (0.9153) time: 1.2251 data: 0.0001 max mem: 17666 -[05:14:10.306817] Epoch: [1] [370/812] lr: 0.000047 grad_norm: 0.5542 (0.5571) closs: 0.8262 (0.9143) time: 1.2241 data: 0.0002 max mem: 17666 -[05:14:22.557391] Epoch: [1] [380/812] lr: 0.000047 grad_norm: 0.5502 (0.5566) closs: 0.8911 (0.9145) time: 1.2249 data: 0.0001 max mem: 17666 -[05:14:34.775480] Epoch: [1] [390/812] lr: 0.000047 grad_norm: 0.5379 (0.5564) closs: 0.9057 (0.9150) time: 1.2234 data: 0.0001 max mem: 17666 -[05:14:47.019726] Epoch: [1] [400/812] lr: 0.000047 grad_norm: 0.5379 (0.5560) closs: 0.9057 (0.9147) time: 1.2231 data: 0.0001 max mem: 17666 -[05:14:59.254412] Epoch: [1] [410/812] lr: 0.000047 grad_norm: 0.5308 (0.5556) closs: 0.9153 (0.9160) time: 1.2239 data: 0.0001 max mem: 17666 -[05:15:11.483844] Epoch: [1] [420/812] lr: 0.000047 grad_norm: 0.5308 (0.5551) closs: 0.9592 (0.9165) time: 1.2231 data: 0.0001 max mem: 17666 -[05:15:23.696028] Epoch: [1] [430/812] lr: 0.000047 grad_norm: 0.5458 (0.5561) closs: 0.8909 (0.9157) time: 1.2220 data: 0.0002 max mem: 17666 -[05:15:35.910601] Epoch: [1] [440/812] lr: 0.000046 grad_norm: 0.5484 (0.5561) closs: 0.9059 (0.9157) time: 1.2213 data: 0.0002 max mem: 17666 -[05:15:48.141787] Epoch: [1] [450/812] lr: 0.000046 grad_norm: 0.5484 (0.5563) closs: 0.9138 (0.9153) time: 1.2222 data: 0.0002 max mem: 17666 -[05:16:00.351582] Epoch: [1] [460/812] lr: 0.000046 grad_norm: 0.5362 (0.5557) closs: 0.8589 (0.9147) time: 1.2220 data: 0.0002 max mem: 17666 -[05:16:12.585800] Epoch: [1] [470/812] lr: 0.000046 grad_norm: 0.5292 (0.5555) closs: 0.8808 (0.9147) time: 1.2221 data: 0.0002 max mem: 17666 -[05:16:24.813078] Epoch: [1] [480/812] lr: 0.000046 grad_norm: 0.5494 (0.5564) closs: 0.8981 (0.9153) time: 1.2230 data: 0.0002 max mem: 17666 -[05:16:37.044404] Epoch: [1] [490/812] lr: 0.000046 grad_norm: 0.5662 (0.5563) closs: 0.9437 (0.9160) time: 1.2229 data: 0.0002 max mem: 17666 -[05:16:49.306306] Epoch: [1] [500/812] lr: 0.000045 grad_norm: 0.5418 (0.5562) closs: 0.9002 (0.9156) time: 1.2246 data: 0.0002 max mem: 17666 -[05:17:01.504233] Epoch: [1] [510/812] lr: 0.000045 grad_norm: 0.5569 (0.5596) closs: 0.8913 (0.9152) time: 1.2229 data: 0.0001 max mem: 17666 -[05:17:13.732894] Epoch: [1] [520/812] lr: 0.000045 grad_norm: 0.5569 (0.5594) closs: 0.9053 (0.9152) time: 1.2213 data: 0.0002 max mem: 17666 -[05:17:25.950468] Epoch: [1] [530/812] lr: 0.000045 grad_norm: 0.5472 (0.5591) closs: 0.8929 (0.9153) time: 1.2222 data: 0.0001 max mem: 17666 -[05:17:38.187123] Epoch: [1] [540/812] lr: 0.000045 grad_norm: 0.5325 (0.5590) closs: 0.8836 (0.9154) time: 1.2226 data: 0.0001 max mem: 17666 -[05:17:50.416363] Epoch: [1] [550/812] lr: 0.000045 grad_norm: 0.5393 (0.5585) closs: 0.8734 (0.9148) time: 1.2232 data: 0.0002 max mem: 17666 -[05:18:02.631787] Epoch: [1] [560/812] lr: 0.000044 grad_norm: 0.5416 (0.5586) closs: 0.9068 (0.9150) time: 1.2222 data: 0.0002 max mem: 17666 -[05:18:14.860124] Epoch: [1] [570/812] lr: 0.000044 grad_norm: 0.5634 (0.5589) closs: 0.9202 (0.9154) time: 1.2221 data: 0.0002 max mem: 17666 -[05:18:27.083101] Epoch: [1] [580/812] lr: 0.000044 grad_norm: 0.5520 (0.5588) closs: 0.9180 (0.9145) time: 1.2225 data: 0.0002 max mem: 17666 -[05:18:39.321277] Epoch: [1] [590/812] lr: 0.000044 grad_norm: 0.5561 (0.5592) closs: 0.9180 (0.9147) time: 1.2230 data: 0.0002 max mem: 17666 -[05:18:51.538167] Epoch: [1] [600/812] lr: 0.000044 grad_norm: 0.5392 (0.5588) closs: 0.9240 (0.9150) time: 1.2227 data: 0.0002 max mem: 17666 -[05:19:03.786880] Epoch: [1] [610/812] lr: 0.000043 grad_norm: 0.5296 (0.5585) closs: 0.8944 (0.9151) time: 1.2232 data: 0.0001 max mem: 17666 -[05:19:16.029554] Epoch: [1] [620/812] lr: 0.000043 grad_norm: 0.5438 (0.5580) closs: 0.9104 (0.9152) time: 1.2245 data: 0.0002 max mem: 17666 -[05:19:28.287030] Epoch: [1] [630/812] lr: 0.000043 grad_norm: 0.5312 (0.5574) closs: 0.9339 (0.9155) time: 1.2249 data: 0.0002 max mem: 17666 -[05:19:40.534557] Epoch: [1] [640/812] lr: 0.000043 grad_norm: 0.5312 (0.5572) closs: 0.9009 (0.9155) time: 1.2252 data: 0.0002 max mem: 17666 -[05:19:52.747120] Epoch: [1] [650/812] lr: 0.000043 grad_norm: 0.5282 (0.5566) closs: 0.8875 (0.9153) time: 1.2229 data: 0.0001 max mem: 17666 -[05:20:04.986132] Epoch: [1] [660/812] lr: 0.000042 grad_norm: 0.5202 (0.5564) closs: 0.9420 (0.9163) time: 1.2225 data: 0.0001 max mem: 17666 -[05:20:17.207197] Epoch: [1] [670/812] lr: 0.000042 grad_norm: 0.5224 (0.5563) closs: 0.9829 (0.9170) time: 1.2229 data: 0.0002 max mem: 17666 -[05:20:29.408267] Epoch: [1] [680/812] lr: 0.000042 grad_norm: 0.5301 (0.5566) closs: 0.9759 (0.9177) time: 1.2210 data: 0.0002 max mem: 17666 -[05:20:41.615333] Epoch: [1] [690/812] lr: 0.000042 grad_norm: 0.5389 (0.5570) closs: 0.9759 (0.9181) time: 1.2203 data: 0.0002 max mem: 17666 -[05:20:53.843553] Epoch: [1] [700/812] lr: 0.000041 grad_norm: 0.5477 (0.5568) closs: 0.9354 (0.9181) time: 1.2217 data: 0.0002 max mem: 17666 -[05:21:06.076892] Epoch: [1] [710/812] lr: 0.000041 grad_norm: 0.5477 (0.5567) closs: 0.8810 (0.9178) time: 1.2230 data: 0.0002 max mem: 17666 -[05:21:18.276846] Epoch: [1] [720/812] lr: 0.000041 grad_norm: 0.5319 (0.5565) closs: 0.8641 (0.9173) time: 1.2216 data: 0.0001 max mem: 17666 -[05:21:30.499197] Epoch: [1] [730/812] lr: 0.000041 grad_norm: 0.5319 (0.5562) closs: 0.9143 (0.9173) time: 1.2211 data: 0.0001 max mem: 17666 -[05:21:42.697532] Epoch: [1] [740/812] lr: 0.000041 grad_norm: 0.5363 (0.5563) closs: 0.9228 (0.9173) time: 1.2210 data: 0.0001 max mem: 17666 -[05:21:54.912903] Epoch: [1] [750/812] lr: 0.000040 grad_norm: 0.5423 (0.5559) closs: 0.8785 (0.9170) time: 1.2206 data: 0.0001 max mem: 17666 -[05:22:07.135892] Epoch: [1] [760/812] lr: 0.000040 grad_norm: 0.5280 (0.5560) closs: 0.8854 (0.9174) time: 1.2219 data: 0.0002 max mem: 17666 -[05:22:19.333692] Epoch: [1] [770/812] lr: 0.000040 grad_norm: 0.5221 (0.5555) closs: 0.9338 (0.9169) time: 1.2210 data: 0.0001 max mem: 17666 -[05:22:31.551464] Epoch: [1] [780/812] lr: 0.000040 grad_norm: 0.5360 (0.5559) closs: 0.8506 (0.9166) time: 1.2207 data: 0.0001 max mem: 17666 -[05:22:43.738476] Epoch: [1] [790/812] lr: 0.000039 grad_norm: 0.5617 (0.5560) closs: 0.8934 (0.9169) time: 1.2202 data: 0.0001 max mem: 17666 -[05:22:55.936330] Epoch: [1] [800/812] lr: 0.000039 grad_norm: 0.5488 (0.5560) closs: 0.9056 (0.9165) time: 1.2192 data: 0.0001 max mem: 17666 -[05:23:08.115449] Epoch: [1] [810/812] lr: 0.000039 grad_norm: 0.5498 (0.5563) closs: 0.9065 (0.9165) time: 1.2188 data: 0.0001 max mem: 17666 -[05:23:09.610496] Epoch: [1] Total time: 0:16:33 -[05:23:09.633077] Averaged stats: lr: 0.000039 grad_norm: 0.5498 (0.5562) closs: 0.9187 (0.9137) -[05:23:09.936052] model saved -[05:23:11.238391] optimizer saved -[05:23:11.238903] other rank-common saved -[05:23:11.241868] rank-specific saved -[05:23:11.249988] log_dir: ./output_dir -[05:23:13.663887] Epoch: [2] [0/812] lr: 0.000039 grad_norm: 0.7125 (0.7125) closs: 0.8745 (0.8745) time: 2.4131 data: 1.1672 max mem: 17666 -[05:23:25.915941] Epoch: [2] [10/812] lr: 0.000038 grad_norm: 0.5159 (0.5531) closs: 0.8802 (0.8976) time: 1.3331 data: 0.1062 max mem: 17666 -[05:23:38.174870] Epoch: [2] [20/812] lr: 0.000038 grad_norm: 0.5206 (0.5430) closs: 0.8802 (0.8902) time: 1.2255 data: 0.0002 max mem: 17666 -[05:23:50.412310] Epoch: [2] [30/812] lr: 0.000038 grad_norm: 0.5260 (0.5434) closs: 0.8835 (0.8971) time: 1.2247 data: 0.0002 max mem: 17666 -[05:24:02.710838] Epoch: [2] [40/812] lr: 0.000038 grad_norm: 0.5309 (0.5425) closs: 0.8968 (0.8971) time: 1.2267 data: 0.0002 max mem: 17666 -[05:24:15.054535] Epoch: [2] [50/812] lr: 0.000037 grad_norm: 0.5321 (0.5424) closs: 0.9001 (0.8951) time: 1.2320 data: 0.0004 max mem: 17666 -[05:24:27.318243] Epoch: [2] [60/812] lr: 0.000037 grad_norm: 0.5432 (0.5419) closs: 0.9220 (0.8985) time: 1.2303 data: 0.0004 max mem: 17666 -[05:24:39.554150] Epoch: [2] [70/812] lr: 0.000037 grad_norm: 0.5432 (0.5457) closs: 0.9125 (0.9035) time: 1.2249 data: 0.0002 max mem: 17666 -[05:24:51.781126] Epoch: [2] [80/812] lr: 0.000037 grad_norm: 0.5337 (0.5448) closs: 0.8991 (0.9013) time: 1.2231 data: 0.0002 max mem: 17666 -[05:25:03.998186] Epoch: [2] [90/812] lr: 0.000036 grad_norm: 0.5328 (0.5431) closs: 0.8633 (0.8985) time: 1.2221 data: 0.0002 max mem: 17666 -[05:25:16.195929] Epoch: [2] [100/812] lr: 0.000036 grad_norm: 0.5328 (0.5437) closs: 0.8633 (0.8997) time: 1.2207 data: 0.0002 max mem: 17666 -[05:25:28.433549] Epoch: [2] [110/812] lr: 0.000036 grad_norm: 0.5378 (0.5440) closs: 0.8652 (0.8944) time: 1.2217 data: 0.0002 max mem: 17666 -[05:25:40.630768] Epoch: [2] [120/812] lr: 0.000036 grad_norm: 0.5295 (0.5427) closs: 0.8916 (0.8959) time: 1.2217 data: 0.0002 max mem: 17666 -[05:25:52.846608] Epoch: [2] [130/812] lr: 0.000035 grad_norm: 0.5252 (0.5422) closs: 0.8785 (0.8946) time: 1.2206 data: 0.0002 max mem: 17666 -[05:26:05.118006] Epoch: [2] [140/812] lr: 0.000035 grad_norm: 0.5381 (0.5433) closs: 0.8633 (0.8939) time: 1.2243 data: 0.0002 max mem: 17666 -[05:26:17.382923] Epoch: [2] [150/812] lr: 0.000035 grad_norm: 0.5409 (0.5437) closs: 0.8702 (0.8949) time: 1.2267 data: 0.0002 max mem: 17666 -[05:26:29.624515] Epoch: [2] [160/812] lr: 0.000035 grad_norm: 0.5378 (0.5439) closs: 0.9324 (0.8958) time: 1.2253 data: 0.0002 max mem: 17666 -[05:26:41.866362] Epoch: [2] [170/812] lr: 0.000034 grad_norm: 0.5373 (0.5433) closs: 0.8786 (0.8948) time: 1.2241 data: 0.0002 max mem: 17666 -[05:26:54.115542] Epoch: [2] [180/812] lr: 0.000034 grad_norm: 0.5291 (0.5426) closs: 0.8532 (0.8941) time: 1.2245 data: 0.0002 max mem: 17666 -[05:27:06.354062] Epoch: [2] [190/812] lr: 0.000034 grad_norm: 0.5375 (0.5448) closs: 0.8871 (0.8957) time: 1.2243 data: 0.0002 max mem: 17666 -[05:27:18.583612] Epoch: [2] [200/812] lr: 0.000033 grad_norm: 0.5450 (0.5459) closs: 0.9092 (0.8974) time: 1.2233 data: 0.0002 max mem: 17666 -[05:27:30.827297] Epoch: [2] [210/812] lr: 0.000033 grad_norm: 0.5450 (0.5465) closs: 0.8947 (0.8979) time: 1.2236 data: 0.0002 max mem: 17666 -[05:27:43.099277] Epoch: [2] [220/812] lr: 0.000033 grad_norm: 0.5412 (0.5470) closs: 0.9164 (0.8990) time: 1.2257 data: 0.0002 max mem: 17666 -[05:27:55.311466] Epoch: [2] [230/812] lr: 0.000033 grad_norm: 0.5342 (0.5467) closs: 0.9164 (0.8996) time: 1.2241 data: 0.0002 max mem: 17666 -[05:28:07.527387] Epoch: [2] [240/812] lr: 0.000032 grad_norm: 0.5269 (0.5467) closs: 0.8845 (0.8995) time: 1.2213 data: 0.0002 max mem: 17666 -[05:28:19.757509] Epoch: [2] [250/812] lr: 0.000032 grad_norm: 0.5346 (0.5467) closs: 0.8747 (0.9001) time: 1.2222 data: 0.0002 max mem: 17666 -[05:28:31.953868] Epoch: [2] [260/812] lr: 0.000032 grad_norm: 0.5462 (0.5475) closs: 0.8479 (0.8997) time: 1.2213 data: 0.0002 max mem: 17666 -[05:28:44.170627] Epoch: [2] [270/812] lr: 0.000031 grad_norm: 0.5350 (0.5466) closs: 0.8574 (0.9012) time: 1.2206 data: 0.0002 max mem: 17666 -[05:28:56.370480] Epoch: [2] [280/812] lr: 0.000031 grad_norm: 0.5331 (0.5468) closs: 0.8961 (0.9028) time: 1.2208 data: 0.0002 max mem: 17666 -[05:29:08.686787] Epoch: [2] [290/812] lr: 0.000031 grad_norm: 0.5416 (0.5475) closs: 0.8706 (0.8995) time: 1.2257 data: 0.0002 max mem: 17666 -[05:29:20.929953] Epoch: [2] [300/812] lr: 0.000031 grad_norm: 0.5308 (0.5467) closs: 0.8766 (0.8992) time: 1.2279 data: 0.0002 max mem: 17666 -[05:29:33.133327] Epoch: [2] [310/812] lr: 0.000030 grad_norm: 0.5419 (0.5486) closs: 0.9146 (0.8993) time: 1.2223 data: 0.0002 max mem: 17666 -[05:29:45.380974] Epoch: [2] [320/812] lr: 0.000030 grad_norm: 0.5416 (0.5480) closs: 0.9215 (0.9009) time: 1.2225 data: 0.0002 max mem: 17666 -[05:29:57.586882] Epoch: [2] [330/812] lr: 0.000030 grad_norm: 0.5708 (0.5498) closs: 0.9306 (0.9025) time: 1.2226 data: 0.0002 max mem: 17666 -[05:30:09.877899] Epoch: [2] [340/812] lr: 0.000029 grad_norm: 0.5762 (0.5500) closs: 0.9558 (0.9035) time: 1.2248 data: 0.0002 max mem: 17666 -[05:30:22.088804] Epoch: [2] [350/812] lr: 0.000029 grad_norm: 0.5395 (0.5498) closs: 0.9168 (0.9032) time: 1.2250 data: 0.0002 max mem: 17666 -[05:30:34.326928] Epoch: [2] [360/812] lr: 0.000029 grad_norm: 0.5362 (0.5499) closs: 0.8943 (0.9040) time: 1.2224 data: 0.0002 max mem: 17666 -[05:30:46.554197] Epoch: [2] [370/812] lr: 0.000029 grad_norm: 0.5362 (0.5499) closs: 0.8968 (0.9041) time: 1.2232 data: 0.0002 max mem: 17666 -[05:30:58.787894] Epoch: [2] [380/812] lr: 0.000028 grad_norm: 0.5605 (0.5500) closs: 0.8968 (0.9033) time: 1.2230 data: 0.0002 max mem: 17666 -[05:31:11.026268] Epoch: [2] [390/812] lr: 0.000028 grad_norm: 0.5654 (0.5508) closs: 0.8928 (0.9019) time: 1.2235 data: 0.0002 max mem: 17666 -[05:31:23.250055] Epoch: [2] [400/812] lr: 0.000028 grad_norm: 0.5627 (0.5509) closs: 0.8756 (0.9017) time: 1.2230 data: 0.0002 max mem: 17666 -[05:31:35.499135] Epoch: [2] [410/812] lr: 0.000027 grad_norm: 0.5489 (0.5520) closs: 0.8756 (0.9014) time: 1.2236 data: 0.0002 max mem: 17666 -[05:31:47.722078] Epoch: [2] [420/812] lr: 0.000027 grad_norm: 0.5539 (0.5521) closs: 0.8572 (0.9015) time: 1.2235 data: 0.0002 max mem: 17666 -[05:32:00.005353] Epoch: [2] [430/812] lr: 0.000027 grad_norm: 0.5353 (0.5520) closs: 0.8631 (0.9021) time: 1.2252 data: 0.0002 max mem: 17666 -[05:32:12.261554] Epoch: [2] [440/812] lr: 0.000027 grad_norm: 0.5384 (0.5529) closs: 0.9113 (0.9020) time: 1.2269 data: 0.0002 max mem: 17666 -[05:32:24.476152] Epoch: [2] [450/812] lr: 0.000026 grad_norm: 0.5707 (0.5529) closs: 0.9142 (0.9032) time: 1.2235 data: 0.0002 max mem: 17666 -[05:32:36.730553] Epoch: [2] [460/812] lr: 0.000026 grad_norm: 0.5230 (0.5534) closs: 0.9659 (0.9053) time: 1.2234 data: 0.0002 max mem: 17666 -[05:32:48.963624] Epoch: [2] [470/812] lr: 0.000026 grad_norm: 0.5223 (0.5531) closs: 0.9463 (0.9048) time: 1.2243 data: 0.0002 max mem: 17666 -[05:33:01.221874] Epoch: [2] [480/812] lr: 0.000025 grad_norm: 0.5196 (0.5530) closs: 0.9217 (0.9058) time: 1.2245 data: 0.0002 max mem: 17666 -[05:33:13.454427] Epoch: [2] [490/812] lr: 0.000025 grad_norm: 0.5165 (0.5525) closs: 0.9051 (0.9051) time: 1.2245 data: 0.0002 max mem: 17666 -[05:33:25.723568] Epoch: [2] [500/812] lr: 0.000025 grad_norm: 0.5228 (0.5525) closs: 0.8909 (0.9047) time: 1.2250 data: 0.0002 max mem: 17666 -[05:33:38.061671] Epoch: [2] [510/812] lr: 0.000024 grad_norm: 0.5380 (0.5523) closs: 0.8909 (0.9047) time: 1.2303 data: 0.0002 max mem: 17666 -[05:33:50.262136] Epoch: [2] [520/812] lr: 0.000024 grad_norm: 0.5317 (0.5519) closs: 0.8782 (0.9047) time: 1.2268 data: 0.0002 max mem: 17666 -[05:34:02.535828] Epoch: [2] [530/812] lr: 0.000024 grad_norm: 0.5358 (0.5519) closs: 0.8886 (0.9047) time: 1.2236 data: 0.0002 max mem: 17666 -[05:34:14.922873] Epoch: [2] [540/812] lr: 0.000024 grad_norm: 0.5384 (0.5516) closs: 0.8886 (0.9045) time: 1.2330 data: 0.0002 max mem: 17666 -[05:34:27.171951] Epoch: [2] [550/812] lr: 0.000023 grad_norm: 0.5198 (0.5508) closs: 0.8887 (0.9045) time: 1.2317 data: 0.0002 max mem: 17666 -[05:34:39.396660] Epoch: [2] [560/812] lr: 0.000023 grad_norm: 0.5260 (0.5509) closs: 0.8877 (0.9046) time: 1.2236 data: 0.0002 max mem: 17666 -[05:34:51.640988] Epoch: [2] [570/812] lr: 0.000023 grad_norm: 0.5627 (0.5513) closs: 0.8820 (0.9039) time: 1.2234 data: 0.0001 max mem: 17666 -[05:35:03.912231] Epoch: [2] [580/812] lr: 0.000022 grad_norm: 0.5364 (0.5511) closs: 0.8524 (0.9029) time: 1.2257 data: 0.0002 max mem: 17666 -[05:35:16.149707] Epoch: [2] [590/812] lr: 0.000022 grad_norm: 0.5502 (0.5518) closs: 0.8858 (0.9019) time: 1.2254 data: 0.0002 max mem: 17666 -[05:35:28.402157] Epoch: [2] [600/812] lr: 0.000022 grad_norm: 0.5616 (0.5521) closs: 0.8858 (0.9020) time: 1.2244 data: 0.0001 max mem: 17666 -[05:35:40.600967] Epoch: [2] [610/812] lr: 0.000022 grad_norm: 0.5462 (0.5522) closs: 0.8935 (0.9018) time: 1.2225 data: 0.0001 max mem: 17666 -[05:35:52.829356] Epoch: [2] [620/812] lr: 0.000021 grad_norm: 0.5462 (0.5521) closs: 0.8935 (0.9016) time: 1.2213 data: 0.0002 max mem: 17666 -[05:36:05.050971] Epoch: [2] [630/812] lr: 0.000021 grad_norm: 0.5440 (0.5521) closs: 0.8583 (0.9015) time: 1.2224 data: 0.0002 max mem: 17666 -[05:36:17.274985] Epoch: [2] [640/812] lr: 0.000021 grad_norm: 0.5375 (0.5518) closs: 0.8728 (0.9015) time: 1.2222 data: 0.0002 max mem: 17666 -[05:36:29.516934] Epoch: [2] [650/812] lr: 0.000021 grad_norm: 0.5418 (0.5523) closs: 0.8445 (0.8999) time: 1.2232 data: 0.0002 max mem: 17666 -[05:36:41.764958] Epoch: [2] [660/812] lr: 0.000020 grad_norm: 0.5287 (0.5519) closs: 0.8310 (0.8993) time: 1.2244 data: 0.0002 max mem: 17666 -[05:36:54.012876] Epoch: [2] [670/812] lr: 0.000020 grad_norm: 0.5246 (0.5520) closs: 0.8673 (0.8988) time: 1.2247 data: 0.0002 max mem: 17666 -[05:37:06.248293] Epoch: [2] [680/812] lr: 0.000020 grad_norm: 0.5323 (0.5519) closs: 0.9063 (0.8993) time: 1.2241 data: 0.0002 max mem: 17666 -[05:37:18.499422] Epoch: [2] [690/812] lr: 0.000019 grad_norm: 0.5320 (0.5520) closs: 0.9313 (0.8995) time: 1.2243 data: 0.0002 max mem: 17666 -[05:37:30.723896] Epoch: [2] [700/812] lr: 0.000019 grad_norm: 0.5400 (0.5520) closs: 0.8904 (0.8992) time: 1.2237 data: 0.0002 max mem: 17666 -[05:37:42.959287] Epoch: [2] [710/812] lr: 0.000019 grad_norm: 0.5535 (0.5522) closs: 0.8538 (0.8986) time: 1.2229 data: 0.0002 max mem: 17666 -[05:37:55.217857] Epoch: [2] [720/812] lr: 0.000019 grad_norm: 0.5448 (0.5520) closs: 0.8520 (0.8987) time: 1.2246 data: 0.0002 max mem: 17666 -[05:38:07.427583] Epoch: [2] [730/812] lr: 0.000018 grad_norm: 0.5490 (0.5522) closs: 0.8863 (0.8994) time: 1.2233 data: 0.0002 max mem: 17666 -[05:38:19.686100] Epoch: [2] [740/812] lr: 0.000018 grad_norm: 0.5623 (0.5524) closs: 0.8536 (0.8994) time: 1.2233 data: 0.0002 max mem: 17666 -[05:38:31.903113] Epoch: [2] [750/812] lr: 0.000018 grad_norm: 0.5355 (0.5521) closs: 0.8654 (0.8993) time: 1.2237 data: 0.0002 max mem: 17666 -[05:38:44.115597] Epoch: [2] [760/812] lr: 0.000018 grad_norm: 0.5332 (0.5522) closs: 0.8976 (0.8987) time: 1.2214 data: 0.0002 max mem: 17666 -[05:38:56.321914] Epoch: [2] [770/812] lr: 0.000017 grad_norm: 0.5436 (0.5522) closs: 0.9065 (0.8994) time: 1.2209 data: 0.0002 max mem: 17666 -[05:39:08.539215] Epoch: [2] [780/812] lr: 0.000017 grad_norm: 0.5429 (0.5522) closs: 0.9551 (0.8999) time: 1.2211 data: 0.0002 max mem: 17666 -[05:39:20.776202] Epoch: [2] [790/812] lr: 0.000017 grad_norm: 0.5429 (0.5520) closs: 0.9152 (0.9003) time: 1.2226 data: 0.0001 max mem: 17666 -[05:39:33.004979] Epoch: [2] [800/812] lr: 0.000017 grad_norm: 0.5364 (0.5520) closs: 0.8958 (0.9006) time: 1.2232 data: 0.0001 max mem: 17666 -[05:39:45.263298] Epoch: [2] [810/812] lr: 0.000016 grad_norm: 0.5421 (0.5519) closs: 0.9132 (0.9010) time: 1.2243 data: 0.0001 max mem: 17666 -[05:39:46.761821] Epoch: [2] Total time: 0:16:35 -[05:39:46.769483] Averaged stats: lr: 0.000016 grad_norm: 0.5364 (0.5519) closs: 0.9132 (0.9010) -[05:39:47.092441] model saved -[05:39:48.487045] optimizer saved -[05:39:48.487749] other rank-common saved -[05:39:48.491225] rank-specific saved -[05:39:48.499785] log_dir: ./output_dir -[05:39:50.944123] Epoch: [3] [0/812] lr: 0.000016 grad_norm: 0.5562 (0.5562) closs: 0.6677 (0.6677) time: 2.4434 data: 1.2020 max mem: 17666 -[05:40:03.157106] Epoch: [3] [10/812] lr: 0.000016 grad_norm: 0.5337 (0.5507) closs: 0.9688 (0.9449) time: 1.3323 data: 0.1094 max mem: 17666 -[05:40:15.422338] Epoch: [3] [20/812] lr: 0.000016 grad_norm: 0.5337 (0.5547) closs: 0.9130 (0.9159) time: 1.2238 data: 0.0002 max mem: 17666 -[05:40:27.617120] Epoch: [3] [30/812] lr: 0.000016 grad_norm: 0.5153 (0.5406) closs: 0.9044 (0.9138) time: 1.2229 data: 0.0002 max mem: 17666 -[05:40:39.859947] Epoch: [3] [40/812] lr: 0.000015 grad_norm: 0.5153 (0.5433) closs: 0.8974 (0.9028) time: 1.2218 data: 0.0002 max mem: 17666 -[05:40:52.133298] Epoch: [3] [50/812] lr: 0.000015 grad_norm: 0.5365 (0.5437) closs: 0.9004 (0.9038) time: 1.2257 data: 0.0002 max mem: 17666 -[05:41:04.316686] Epoch: [3] [60/812] lr: 0.000015 grad_norm: 0.5300 (0.5426) closs: 0.8967 (0.9080) time: 1.2228 data: 0.0002 max mem: 17666 -[05:41:16.709159] Epoch: [3] [70/812] lr: 0.000015 grad_norm: 0.5389 (0.5437) closs: 0.8772 (0.9053) time: 1.2287 data: 0.0002 max mem: 17666 -[05:41:28.981080] Epoch: [3] [80/812] lr: 0.000014 grad_norm: 0.5358 (0.5439) closs: 0.8995 (0.9040) time: 1.2331 data: 0.0002 max mem: 17666 -[05:41:41.218043] Epoch: [3] [90/812] lr: 0.000014 grad_norm: 0.5358 (0.5482) closs: 0.9016 (0.9026) time: 1.2254 data: 0.0002 max mem: 17666 -[05:41:53.459622] Epoch: [3] [100/812] lr: 0.000014 grad_norm: 0.5407 (0.5488) closs: 0.8973 (0.9010) time: 1.2238 data: 0.0002 max mem: 17666 -[05:42:05.684572] Epoch: [3] [110/812] lr: 0.000014 grad_norm: 0.5407 (0.5518) closs: 0.8370 (0.8944) time: 1.2233 data: 0.0002 max mem: 17666 -[05:42:17.955761] Epoch: [3] [120/812] lr: 0.000013 grad_norm: 0.5543 (0.5515) closs: 0.8370 (0.8983) time: 1.2247 data: 0.0002 max mem: 17666 -[05:42:30.181846] Epoch: [3] [130/812] lr: 0.000013 grad_norm: 0.5379 (0.5507) closs: 0.8520 (0.8947) time: 1.2248 data: 0.0002 max mem: 17666 -[05:42:42.456814] Epoch: [3] [140/812] lr: 0.000013 grad_norm: 0.5309 (0.5496) closs: 0.8633 (0.8987) time: 1.2250 data: 0.0002 max mem: 17666 -[05:42:54.678890] Epoch: [3] [150/812] lr: 0.000013 grad_norm: 0.5271 (0.5505) closs: 0.9299 (0.8988) time: 1.2248 data: 0.0002 max mem: 17666 -[05:43:06.935982] Epoch: [3] [160/812] lr: 0.000012 grad_norm: 0.5488 (0.5496) closs: 0.8740 (0.8989) time: 1.2239 data: 0.0002 max mem: 17666 -[05:43:19.181914] Epoch: [3] [170/812] lr: 0.000012 grad_norm: 0.5464 (0.5495) closs: 0.8753 (0.8977) time: 1.2251 data: 0.0002 max mem: 17666 -[05:43:31.400721] Epoch: [3] [180/812] lr: 0.000012 grad_norm: 0.5487 (0.5500) closs: 0.8691 (0.8962) time: 1.2232 data: 0.0002 max mem: 17666 -[05:43:43.639535] Epoch: [3] [190/812] lr: 0.000012 grad_norm: 0.5439 (0.5500) closs: 0.8474 (0.8957) time: 1.2228 data: 0.0002 max mem: 17666 -[05:43:55.888343] Epoch: [3] [200/812] lr: 0.000012 grad_norm: 0.5147 (0.5471) closs: 0.9034 (0.8973) time: 1.2243 data: 0.0002 max mem: 17666 -[05:44:08.136204] Epoch: [3] [210/812] lr: 0.000011 grad_norm: 0.5083 (0.5460) closs: 0.8917 (0.8965) time: 1.2248 data: 0.0002 max mem: 17666 -[05:44:20.360724] Epoch: [3] [220/812] lr: 0.000011 grad_norm: 0.5294 (0.5456) closs: 0.8673 (0.8980) time: 1.2235 data: 0.0002 max mem: 17666 -[05:44:32.597597] Epoch: [3] [230/812] lr: 0.000011 grad_norm: 0.5309 (0.5452) closs: 0.8934 (0.8987) time: 1.2230 data: 0.0002 max mem: 17666 -[05:44:44.822067] Epoch: [3] [240/812] lr: 0.000011 grad_norm: 0.5445 (0.5462) closs: 0.8918 (0.8988) time: 1.2230 data: 0.0002 max mem: 17666 -[05:44:57.071616] Epoch: [3] [250/812] lr: 0.000011 grad_norm: 0.5577 (0.5471) closs: 0.8841 (0.8983) time: 1.2236 data: 0.0002 max mem: 17666 -[05:45:09.334382] Epoch: [3] [260/812] lr: 0.000010 grad_norm: 0.5525 (0.5469) closs: 0.8411 (0.8965) time: 1.2255 data: 0.0002 max mem: 17666 -[05:45:21.576233] Epoch: [3] [270/812] lr: 0.000010 grad_norm: 0.5297 (0.5460) closs: 0.8316 (0.8970) time: 1.2252 data: 0.0002 max mem: 17666 -[05:45:33.792847] Epoch: [3] [280/812] lr: 0.000010 grad_norm: 0.5256 (0.5458) closs: 0.8528 (0.8986) time: 1.2228 data: 0.0002 max mem: 17666 -[05:45:45.985161] Epoch: [3] [290/812] lr: 0.000010 grad_norm: 0.5459 (0.5469) closs: 0.8481 (0.8978) time: 1.2204 data: 0.0002 max mem: 17666 -[05:45:58.246023] Epoch: [3] [300/812] lr: 0.000010 grad_norm: 0.5559 (0.5478) closs: 0.8454 (0.8950) time: 1.2226 data: 0.0002 max mem: 17666 -[05:46:10.538520] Epoch: [3] [310/812] lr: 0.000010 grad_norm: 0.5453 (0.5486) closs: 0.8714 (0.8966) time: 1.2276 data: 0.0002 max mem: 17666 -[05:46:22.758193] Epoch: [3] [320/812] lr: 0.000009 grad_norm: 0.5333 (0.5487) closs: 0.9090 (0.8970) time: 1.2255 data: 0.0002 max mem: 17666 -[05:46:35.009523] Epoch: [3] [330/812] lr: 0.000009 grad_norm: 0.5240 (0.5482) closs: 0.9090 (0.8967) time: 1.2235 data: 0.0002 max mem: 17666 -[05:46:47.248170] Epoch: [3] [340/812] lr: 0.000009 grad_norm: 0.5495 (0.5481) closs: 0.9205 (0.8972) time: 1.2244 data: 0.0002 max mem: 17666 -[05:46:59.513079] Epoch: [3] [350/812] lr: 0.000009 grad_norm: 0.5428 (0.5478) closs: 0.9117 (0.8979) time: 1.2251 data: 0.0002 max mem: 17666 -[05:47:11.756475] Epoch: [3] [360/812] lr: 0.000009 grad_norm: 0.5528 (0.5483) closs: 0.9040 (0.8985) time: 1.2253 data: 0.0002 max mem: 17666 -[05:47:24.001759] Epoch: [3] [370/812] lr: 0.000009 grad_norm: 0.5583 (0.5497) closs: 0.9428 (0.8988) time: 1.2244 data: 0.0002 max mem: 17666 -[05:47:36.258369] Epoch: [3] [380/812] lr: 0.000008 grad_norm: 0.5532 (0.5498) closs: 0.8826 (0.8977) time: 1.2250 data: 0.0002 max mem: 17666 -[05:47:48.534397] Epoch: [3] [390/812] lr: 0.000008 grad_norm: 0.5464 (0.5498) closs: 0.8169 (0.8967) time: 1.2266 data: 0.0002 max mem: 17666 -[05:48:00.757119] Epoch: [3] [400/812] lr: 0.000008 grad_norm: 0.5397 (0.5499) closs: 0.8881 (0.8972) time: 1.2249 data: 0.0002 max mem: 17666 -[05:48:12.965384] Epoch: [3] [410/812] lr: 0.000008 grad_norm: 0.5424 (0.5500) closs: 0.8712 (0.8959) time: 1.2215 data: 0.0002 max mem: 17666 -[05:48:25.205411] Epoch: [3] [420/812] lr: 0.000008 grad_norm: 0.5482 (0.5500) closs: 0.8673 (0.8952) time: 1.2223 data: 0.0002 max mem: 17666 -[05:48:37.398291] Epoch: [3] [430/812] lr: 0.000008 grad_norm: 0.5379 (0.5502) closs: 0.8604 (0.8939) time: 1.2216 data: 0.0002 max mem: 17666 -[05:48:49.634277] Epoch: [3] [440/812] lr: 0.000008 grad_norm: 0.5367 (0.5501) closs: 0.8604 (0.8932) time: 1.2214 data: 0.0002 max mem: 17666 -[05:49:01.842468] Epoch: [3] [450/812] lr: 0.000007 grad_norm: 0.5338 (0.5509) closs: 0.8707 (0.8928) time: 1.2221 data: 0.0002 max mem: 17666 -[05:49:14.048276] Epoch: [3] [460/812] lr: 0.000007 grad_norm: 0.5654 (0.5517) closs: 0.8607 (0.8921) time: 1.2206 data: 0.0002 max mem: 17666 -[05:49:26.270248] Epoch: [3] [470/812] lr: 0.000007 grad_norm: 0.5506 (0.5519) closs: 0.8607 (0.8917) time: 1.2213 data: 0.0002 max mem: 17666 -[05:49:38.467867] Epoch: [3] [480/812] lr: 0.000007 grad_norm: 0.5433 (0.5522) closs: 0.8556 (0.8913) time: 1.2209 data: 0.0002 max mem: 17666 -[05:49:50.683011] Epoch: [3] [490/812] lr: 0.000007 grad_norm: 0.5388 (0.5518) closs: 0.8579 (0.8916) time: 1.2206 data: 0.0002 max mem: 17666 -[05:50:02.890194] Epoch: [3] [500/812] lr: 0.000007 grad_norm: 0.5285 (0.5519) closs: 0.8663 (0.8911) time: 1.2210 data: 0.0002 max mem: 17666 -[05:50:15.194581] Epoch: [3] [510/812] lr: 0.000007 grad_norm: 0.5267 (0.5519) closs: 0.8912 (0.8925) time: 1.2255 data: 0.0002 max mem: 17666 -[05:50:27.434194] Epoch: [3] [520/812] lr: 0.000007 grad_norm: 0.5485 (0.5521) closs: 0.9753 (0.8933) time: 1.2271 data: 0.0002 max mem: 17666 -[05:50:39.651081] Epoch: [3] [530/812] lr: 0.000006 grad_norm: 0.5609 (0.5526) closs: 0.8542 (0.8926) time: 1.2228 data: 0.0002 max mem: 17666 -[05:50:51.885523] Epoch: [3] [540/812] lr: 0.000006 grad_norm: 0.5575 (0.5528) closs: 0.8232 (0.8925) time: 1.2225 data: 0.0002 max mem: 17666 -[05:51:04.114756] Epoch: [3] [550/812] lr: 0.000006 grad_norm: 0.5378 (0.5530) closs: 0.8856 (0.8916) time: 1.2231 data: 0.0002 max mem: 17666 -[05:51:16.348968] Epoch: [3] [560/812] lr: 0.000006 grad_norm: 0.5246 (0.5528) closs: 0.8477 (0.8911) time: 1.2231 data: 0.0002 max mem: 17666 -[05:51:28.542827] Epoch: [3] [570/812] lr: 0.000006 grad_norm: 0.5556 (0.5531) closs: 0.8762 (0.8917) time: 1.2213 data: 0.0002 max mem: 17666 -[05:51:40.755871] Epoch: [3] [580/812] lr: 0.000006 grad_norm: 0.5559 (0.5532) closs: 0.9160 (0.8922) time: 1.2203 data: 0.0002 max mem: 17666 -[05:51:52.996942] Epoch: [3] [590/812] lr: 0.000006 grad_norm: 0.5523 (0.5531) closs: 0.8803 (0.8920) time: 1.2226 data: 0.0002 max mem: 17666 -[05:52:05.218398] Epoch: [3] [600/812] lr: 0.000006 grad_norm: 0.5592 (0.5532) closs: 0.8175 (0.8915) time: 1.2231 data: 0.0002 max mem: 17666 -[05:52:17.508068] Epoch: [3] [610/812] lr: 0.000006 grad_norm: 0.5655 (0.5533) closs: 0.9026 (0.8923) time: 1.2255 data: 0.0002 max mem: 17666 -[05:52:29.722507] Epoch: [3] [620/812] lr: 0.000006 grad_norm: 0.5540 (0.5537) closs: 0.8860 (0.8926) time: 1.2251 data: 0.0002 max mem: 17666 -[05:52:41.951732] Epoch: [3] [630/812] lr: 0.000006 grad_norm: 0.5512 (0.5543) closs: 0.8709 (0.8918) time: 1.2221 data: 0.0002 max mem: 17666 -[05:52:54.184550] Epoch: [3] [640/812] lr: 0.000006 grad_norm: 0.5541 (0.5544) closs: 0.8467 (0.8915) time: 1.2230 data: 0.0002 max mem: 17666 -[05:53:06.428821] Epoch: [3] [650/812] lr: 0.000005 grad_norm: 0.5534 (0.5547) closs: 0.8995 (0.8926) time: 1.2238 data: 0.0002 max mem: 17666 -[05:53:18.690090] Epoch: [3] [660/812] lr: 0.000005 grad_norm: 0.5558 (0.5549) closs: 0.9336 (0.8932) time: 1.2252 data: 0.0002 max mem: 17666 -[05:53:30.911141] Epoch: [3] [670/812] lr: 0.000005 grad_norm: 0.5558 (0.5548) closs: 0.8835 (0.8925) time: 1.2240 data: 0.0002 max mem: 17666 -[05:53:43.161458] Epoch: [3] [680/812] lr: 0.000005 grad_norm: 0.5321 (0.5547) closs: 0.8819 (0.8931) time: 1.2235 data: 0.0002 max mem: 17666 -[05:53:55.371169] Epoch: [3] [690/812] lr: 0.000005 grad_norm: 0.5240 (0.5545) closs: 0.8819 (0.8929) time: 1.2229 data: 0.0002 max mem: 17666 -[05:54:07.586578] Epoch: [3] [700/812] lr: 0.000005 grad_norm: 0.5243 (0.5545) closs: 0.8890 (0.8933) time: 1.2212 data: 0.0002 max mem: 17666 -[05:54:19.795256] Epoch: [3] [710/812] lr: 0.000005 grad_norm: 0.5534 (0.5550) closs: 0.9166 (0.8937) time: 1.2211 data: 0.0002 max mem: 17666 -[05:54:32.015414] Epoch: [3] [720/812] lr: 0.000005 grad_norm: 0.5337 (0.5548) closs: 0.9047 (0.8940) time: 1.2214 data: 0.0002 max mem: 17666 -[05:54:44.241974] Epoch: [3] [730/812] lr: 0.000005 grad_norm: 0.5269 (0.5549) closs: 0.8744 (0.8932) time: 1.2223 data: 0.0002 max mem: 17666 -[05:54:56.452329] Epoch: [3] [740/812] lr: 0.000005 grad_norm: 0.5342 (0.5546) closs: 0.8736 (0.8936) time: 1.2218 data: 0.0002 max mem: 17666 -[05:55:08.685255] Epoch: [3] [750/812] lr: 0.000005 grad_norm: 0.5347 (0.5546) closs: 0.8736 (0.8932) time: 1.2221 data: 0.0002 max mem: 17666 -[05:55:20.895362] Epoch: [3] [760/812] lr: 0.000005 grad_norm: 0.5396 (0.5548) closs: 0.8711 (0.8937) time: 1.2221 data: 0.0002 max mem: 17666 -[05:55:33.133468] Epoch: [3] [770/812] lr: 0.000005 grad_norm: 0.5376 (0.5548) closs: 0.8961 (0.8938) time: 1.2223 data: 0.0002 max mem: 17666 -[05:55:45.329927] Epoch: [3] [780/812] lr: 0.000005 grad_norm: 0.5376 (0.5551) closs: 0.8806 (0.8935) time: 1.2217 data: 0.0002 max mem: 17666 -[05:55:57.537681] Epoch: [3] [790/812] lr: 0.000005 grad_norm: 0.5535 (0.5551) closs: 0.8785 (0.8936) time: 1.2201 data: 0.0001 max mem: 17666 -[05:56:09.778726] Epoch: [3] [800/812] lr: 0.000005 grad_norm: 0.5479 (0.5551) closs: 0.9185 (0.8940) time: 1.2224 data: 0.0001 max mem: 17666 -[05:56:21.990260] Epoch: [3] [810/812] lr: 0.000005 grad_norm: 0.5479 (0.5552) closs: 0.9699 (0.8947) time: 1.2226 data: 0.0001 max mem: 17666 -[05:56:23.509702] Epoch: [3] Total time: 0:16:35 -[05:56:23.512943] Averaged stats: lr: 0.000005 grad_norm: 0.5555 (0.5554) closs: 0.9666 (0.8950) -[05:56:23.780574] model saved -[05:56:25.200026] optimizer saved -[05:56:25.200659] other rank-common saved -[05:56:25.203616] rank-specific saved -[05:56:25.203811] Training time 1:06:29