diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6d00377f554cfe2800301419cb1d888f10f800ab
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcbe6ac8e5588bd2dbf6f2fa10826e9efd617af80fa5495358165d8dccfa19c9
+size 90952079
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..314b3ea31b149d73d691efd6f496b97f6a84e1d7
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51942e1cdd4fc4bb9aa2a9649a26fdd2e3822e4eb924e4e55084b8e09a90ea29
+size 90952079
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..26d42273836e83896673b1fe4cd0c1c6fd644004
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10f9cf84a8ccd5c8c1dfa19876aab016341e6c5c44a461108eb4423f345204ec
+size 204403795
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.other.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..7e913b448dd37999501c9e98ec60f8d9d1ca1241
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d37ea8ab9cda46cc28c0964d75caf944b1305770a4bc789c738b99991c8672b8
+size 1815
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00000-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00001-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00002-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00003-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00004-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00005-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00006-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00007-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch0/rank-specific-00007-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..eeb2b771457f443b3d8654d4cb077da3e5cded14
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ed39d3860a4c5b79b338c9f98875ba542b909b8bac0991be734db1360554e02
+size 90952079
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..83a3e825c1453e522c81944f203440d027874e82
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a26da94d90384063a4c8c4d7f9f630cd714d7eb07f25fce0a58ea54182cbb9b8
+size 204403795
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.other.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..8073af126d00c3679383f16a39784114786f2372
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56b9b1f946e7d9a93adaab132ff2381cef31be146bae4ac5763f3249d98fa378
+size 1815
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00000-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00001-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00002-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00003-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00004-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00005-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00006-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00007-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch1/rank-specific-00007-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.model.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6d00377f554cfe2800301419cb1d888f10f800ab
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcbe6ac8e5588bd2dbf6f2fa10826e9efd617af80fa5495358165d8dccfa19c9
+size 90952079
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..808c3563296dc0a7be82da6869671c500c55404f
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db07aed2b90e65be9083d5431d37272028e06d4b92d6ba3067f059976e7ff591
+size 204403795
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.other.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..68ac231f9774f7a4a6254684400eb280cd69e9fa
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ac90e9e7304e3e220667308999a777273eaea82ea51cb28003c1fa9e40738a9
+size 1815
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00000-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00001-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00002-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00003-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00004-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00005-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00006-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00007-of-00008.pth b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/epoch2/rank-specific-00007-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce
+size 537
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/log.txt b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/log.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6abb734a07c980ed975f724eb9dad68f2e62731a
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/log.txt
@@ -0,0 +1,3 @@
+{"train_lr": 4.2672042852177435e-05, "train_grad_norm": 0.8576727652187944, "train_closs": 1.0961337126687645, "epoch": 0, "val_lr": 4.2672042852177435e-05, "val_grad_norm": 0.8576727652187944, "val_closs": 1.0961337126687645}
+{"train_lr": 2.989280819774688e-05, "train_grad_norm": 0.7565592593381042, "train_closs": 1.0483260756908241, "epoch": 1, "val_lr": 2.989280819774688e-05, "val_grad_norm": 0.7565592593381042, "val_closs": 1.0483260756908241}
+{"train_lr": 9.43437279837357e-06, "train_grad_norm": 0.7707539895124279, "train_closs": 1.0381248756278665, "epoch": 2, "val_lr": 9.43437279837357e-06, "val_grad_norm": 0.7707539895124279, "val_closs": 1.0381248756278665}
diff --git a/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/output.log b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..3e8df40b9236cc509af3a1865e95527f2edb136a
--- /dev/null
+++ b/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B/output.log
@@ -0,0 +1,2801 @@
+WARNING:torch.distributed.run:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+| distributed init (rank 6): env://, gpu 6
+| distributed init (rank 3): env://, gpu 3
+| distributed init (rank 1): env://, gpu 1
+| distributed init (rank 7): env://, gpu 7
+| distributed init (rank 5): env://, gpu 5
+| distributed init (rank 2): env://, gpu 2
+| distributed init (rank 0): env://, gpu 0
+| distributed init (rank 4): env://, gpu 4
+[00:02:04.046928] > initializing model parallel with size 1
+[00:02:04.047015] > initializing ddp with size 8
+[00:02:04.047022] > initializing pipeline with size 1
+[00:02:04.226045] job dir: /data/liuyijiang/mmlab/LLaMA2-Accessory/accessory
+[00:02:04.226122] Namespace(batch_size=16,
+accum_iter=1,
+llama_type='llama_qformerv2_peft',
+llama_config=['../checkpoints/llama2/Llama-2-13b/params.json',
+'configs/model/finetune/sg/llamaPeft_normBiasLora.json'],
+no_visual=False,
+tokenizer_path='../checkpoints/llama2/Llama-2-13b/tokenizer.model',
+pretrained_path='../checkpoints/mm/lamaQformerv2_13b/finetuned/',
+pretrained_type='consolidated',
+weight_decay=0.02,
+lr=5e-05,
+min_lr=5e-06,
+epochs=3,
+warmup_epochs=0.2,
+clip_grad=2,
+max_words=512,
+dialog=False,
+data_config='configs/data/finetune/mm/alpaca_llava.yaml',
+output_dir='output/finetune/mm/alpacaLlava_llamaQformerv2Peft_QF_13B',
+log_dir='./output_dir',
+save_interval=1,
+only_save_trainable=True,
+device='cuda',
+seed=0,
+resume='',
+num_workers=16,
+pin_mem=True,
+world_size=8,
+local_rank=-1,
+dist_on_itp=False,
+dist_url='env://',
+model_parallel_size=1,
+data_parallel='sdp',
+precision='bf16',
+checkpointing=True,
+quant=True,
+rank=0,
+gpu=0,
+distributed=True,
+dist_backend='nccl')
+[00:02:04.226978] Start initialization.
+[00:02:04.227022] ## Processing on RANK 0.
+[00:02:04.237574] Model Args:
+ ModelArgs(dim=5120, n_layers=40, n_heads=40, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=16, bias_tuning=True)
+[00:03:36.399161] build llama model with qformerv2
+[00:03:36.779030] (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /Salesforce/blip2-opt-2.7b/resolve/main/config.json (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1007)')))"), '(Request ID: f000589d-f862-41f8-832e-73fc0c96ee6a)')
+ Loading checkpoint shards: 0%| | 0/2 [00:00
+[00:36:06.281575] Start training for 3 epochs
+[00:36:06.296296] log_dir: ./output_dir
+[00:36:22.985451] Epoch: [0]  [0/3229]  lr: 0.000000  grad_norm: 2.3647 (2.3647)  closs: 1.5947 (1.5947)  time: 16.6883  data: 8.5180  max mem: 36209
+[00:37:03.163928] Epoch: [0]  [10/3229]  lr: 0.000001  grad_norm: 2.2614 (2.1844)  closs: 1.3985 (1.3614)  time: 5.1696  data: 0.7746  max mem: 54683
+[00:37:43.938889] Epoch: [0]  [20/3229]  lr: 0.000002  grad_norm: 2.2614 (2.2117)  closs: 1.4480 (1.4415)  time: 4.0476  data: 0.0002  max mem: 54683
+[00:38:24.104529] Epoch: [0]  [30/3229]  lr: 0.000002  grad_norm: 2.2231 (2.2052)  closs: 1.4753 (1.4253)  time: 4.0470  data: 0.0002  max mem: 54683
+[00:39:05.082555] Epoch: [0]  [40/3229]  lr: 0.000003  grad_norm: 2.1752 (2.1914)  closs: 1.4110 (1.4059)  time: 4.0571  data: 0.0002  max mem: 54683
+[00:39:46.229231] Epoch: [0]  [50/3229]  lr: 0.000004  grad_norm: 2.0509 (2.2594)  closs: 1.4161 (1.4234)  time: 4.1061  data: 0.0002  max mem: 54683
+[00:40:26.733155] Epoch: [0]  [60/3229]  lr: 0.000005  grad_norm: 1.9159 (2.1852)  closs: 1.4598 (1.4133)  time: 4.0824  data: 0.0003  max mem: 54683
+[00:41:07.256178] Epoch: [0]  [70/3229]  lr: 0.000005  grad_norm: 1.7466 (2.1041)  closs: 1.4069 (1.4059)  time: 4.0513  data: 0.0003  max mem: 54683
+[00:41:48.461794] Epoch: [0]  [80/3229]  lr: 0.000006  grad_norm: 1.4995 (2.0192)  closs: 1.3245 (1.3975)  time: 4.0864  data: 0.0002  max mem: 54683
+[00:42:28.626897] Epoch: [0]  [90/3229]  lr: 0.000007  grad_norm: 1.2119 (1.9324)  closs: 1.2737 (1.3830)  time: 4.0684  data: 0.0002  max mem: 54683
+[00:43:09.428679] Epoch: [0]  [100/3229]  lr: 0.000008  grad_norm: 1.1636 (1.8590)  closs: 1.2619 (1.3700)  time: 4.0482  data: 0.0003  max mem: 54683
+[00:43:49.914083] Epoch: [0]  [110/3229]  lr: 0.000009  grad_norm: 1.0306 (1.7801)  closs: 1.2915 (1.3614)  time: 4.0643  data: 0.0003  max mem: 54683
+[00:44:31.063164] Epoch: [0]  [120/3229]  lr: 0.000009  grad_norm: 0.9462 (1.7095)  closs: 1.3081 (1.3546)  time: 4.0816  data: 0.0002  max mem: 54683
+[00:45:11.261841] Epoch: [0]  [130/3229]  lr: 0.000010  grad_norm: 0.9337 (1.6472)  closs: 1.2631 (1.3397)  time: 4.0673  data: 0.0002  max mem: 54683
+[00:45:52.112790] Epoch: [0]  [140/3229]  lr: 0.000011  grad_norm: 0.8573 (1.5931)  closs: 1.2118 (1.3288)  time: 4.0524  data: 0.0002  max mem: 54683
+[00:46:32.944599] Epoch: [0]  [150/3229]  lr: 0.000012  grad_norm: 0.8517 (1.5447)  closs: 1.1676 (1.3182)  time: 4.0841  data: 0.0003  max mem: 54683
+[00:47:14.287016] Epoch: [0]  [160/3229]  lr: 0.000012  grad_norm: 0.8240 (1.4999)  closs: 1.1756 (1.3118)  time: 4.1086  data: 0.0003  max mem: 54683
+[00:47:55.388906] Epoch: [0]  [170/3229]  lr: 0.000013  grad_norm: 0.8136 (1.4618)  closs: 1.1849 (1.3054)  time: 4.1221  data: 0.0003  max mem: 54683
+[00:48:36.512000] Epoch: [0]  [180/3229]  lr: 0.000014  grad_norm: 0.8954 (1.4305)  closs: 1.2078 (1.3010)  time: 4.1112  data: 0.0002  max mem: 54683
+[00:49:17.622250] Epoch: [0]  [190/3229]  lr: 0.000015  grad_norm: 0.8401 (1.3994)  closs: 1.2296 (1.2966)  time: 4.1116  data: 0.0002  max mem: 54683
+[00:49:58.917542] Epoch: [0]  [200/3229]  lr: 0.000015  grad_norm: 0.8401 (1.3746)  closs: 1.2198 (1.2906)  time: 4.1202  data: 0.0002  max mem: 54683
+[00:50:39.721214] Epoch: [0]  [210/3229]  lr: 0.000016  grad_norm: 0.8446 (1.3499)  closs: 1.1936 (1.2863)  time: 4.1049  data: 0.0002  max mem: 54684
+[00:51:20.209941] Epoch: [0]  [220/3229]  lr: 0.000017  grad_norm: 0.8360 (1.3275)  closs: 1.2039 (1.2828)  time: 4.0645  data: 0.0002  max mem: 54684
+[00:52:01.340571] Epoch: [0]  [230/3229]  lr: 0.000018  grad_norm: 0.8406 (1.3077)  closs: 1.2159 (1.2796)  time: 4.0809  data: 0.0002  max mem: 54684
+[00:52:42.293014] Epoch: [0]  [240/3229]  lr: 0.000019  grad_norm: 0.8406 (1.2881)  closs: 1.2119 (1.2756)  time: 4.1041  data: 0.0002  max mem: 54684
+[00:53:22.448171] Epoch: [0]  [250/3229]  lr: 0.000019  grad_norm: 0.8104 (1.2702)  closs: 1.1467 (1.2704)  time: 4.0553  data: 0.0002  max mem: 54684
+[00:54:02.261862] Epoch: [0]  [260/3229]  lr: 0.000020  grad_norm: 0.8084 (1.2538)  closs: 1.1398 (1.2651)  time: 3.9984  data: 0.0002  max mem: 54684
+[00:54:42.733423] Epoch: [0]  [270/3229]  lr: 0.000021  grad_norm: 0.8337 (1.2407)  closs: 1.1322 (1.2590)  time: 4.0142  data: 0.0002  max mem: 54684
+[00:55:23.786823] Epoch: [0]  [280/3229]  lr: 0.000022  grad_norm: 0.8337 (1.2265)  closs: 1.1322 (1.2549)  time: 4.0762  data: 0.0002  max mem: 54684
+[00:56:04.261686] Epoch: [0]  [290/3229]  lr: 0.000022  grad_norm: 0.8146 (1.2134)  closs: 1.1835 (1.2526)  time: 4.0763  data: 0.0002  max mem: 54684
+[00:56:45.065966] Epoch: [0]  [300/3229]  lr: 0.000023  grad_norm: 0.8291 (1.2034)  closs: 1.2046 (1.2503)  time: 4.0639  data: 0.0002  max mem: 54684
+[00:57:25.203092] Epoch: [0]  [310/3229]  lr: 0.000024  grad_norm: 0.8414 (1.1916)  closs: 1.1724 (1.2467)  time: 4.0470  data: 0.0002  max mem: 54684
+[00:58:05.237988] Epoch: [0]  [320/3229]  lr: 0.000025  grad_norm: 0.8399 (1.1806)  closs: 1.1496 (1.2420)  time: 4.0085  data: 0.0002  max mem: 54684
+[00:58:46.046197] Epoch: [0]  [330/3229]  lr: 0.000026  grad_norm: 0.8551 (1.1711)  closs: 1.1496 (1.2392)  time: 4.0421  data: 0.0002  max mem: 54684
+[00:59:27.204738] Epoch: [0]  [340/3229]  lr: 0.000026  grad_norm: 0.8768 (1.1625)  closs: 1.1481 (1.2362)  time: 4.0983  data: 0.0002  max mem: 54684
+[01:00:08.357924] Epoch: [0]  [350/3229]  lr: 0.000027  grad_norm: 0.8572 (1.1541)  closs: 1.1341 (1.2344)  time: 4.1155  data: 0.0002  max mem: 54684
+[01:00:49.082581] Epoch: [0]  [360/3229]  lr: 0.000028  grad_norm: 0.8620 (1.1473)  closs: 1.1321 (1.2311)  time: 4.0938  data: 0.0002  max mem: 54684
+[01:01:29.908089] Epoch: [0]  [370/3229]  lr: 0.000029  grad_norm: 0.9078 (1.1413)  closs: 1.1322 (1.2292)  time: 4.0774  data: 0.0002  max mem: 54684
+[01:02:10.408877] Epoch: [0]  [380/3229]  lr: 0.000029  grad_norm: 0.9115 (1.1385)  closs: 1.1444 (1.2265)  time: 4.0662  data: 0.0002  max mem: 54684
+[01:02:50.910726] Epoch: [0]  [390/3229]  lr: 0.000030  grad_norm: 0.8868 (1.1318)  closs: 1.1205 (1.2241)  time: 4.0501  data: 0.0002  max mem: 54684
+[01:03:32.368355] Epoch: [0]  [400/3229]  lr: 0.000031  grad_norm: 0.8768 (1.1270)  closs: 1.1205 (1.2212)  time: 4.0979  data: 0.0002  max mem: 54684
+[01:04:12.522689] Epoch: [0]  [410/3229]  lr: 0.000032  grad_norm: 0.8617 (1.1200)  closs: 1.1138 (1.2181)  time: 4.0805  data: 0.0002  max mem: 54684
+[01:04:53.670656] Epoch: [0]  [420/3229]  lr: 0.000033  grad_norm: 0.8747 (1.1175)  closs: 1.0839 (1.2160)  time: 4.0650  data: 0.0002  max mem: 54684
+[01:05:34.517336] Epoch: [0]  [430/3229]  lr: 0.000033  grad_norm: 0.9140 (1.1135)  closs: 1.1140 (1.2136)  time: 4.0997  data: 0.0002  max mem: 54684
+[01:06:15.357490] Epoch: [0]  [440/3229]  lr: 0.000034  grad_norm: 0.9000 (1.1083)  closs: 1.1255 (1.2112)  time: 4.0843  data: 0.0002  max mem: 54684
+[01:06:56.510385] Epoch: [0]  [450/3229]  lr: 0.000035  grad_norm: 0.9130 (1.1047)  closs: 1.1443 (1.2102)  time: 4.0996  data: 0.0002  max mem: 54684
+[01:07:37.338507] Epoch: [0]  [460/3229]  lr: 0.000036  grad_norm: 0.8889 (1.0996)  closs: 1.1569 (1.2089)  time: 4.0990  data: 0.0002  max mem: 54684
+[01:08:17.507539] Epoch: [0]  [470/3229]  lr: 0.000036  grad_norm: 0.8632 (1.0962)  closs: 1.1353 (1.2070)  time: 4.0498  data: 0.0002  max mem: 54684
+[01:08:58.904572] Epoch: [0]  [480/3229]  lr: 0.000037  grad_norm: 0.9061 (1.0920)  closs: 1.0907 (1.2045)  time: 4.0782  data: 0.0002  max mem: 54684
+[01:09:39.746010] Epoch: [0]  [490/3229]  lr: 0.000038  grad_norm: 0.9314 (1.0890)  closs: 1.0907 (1.2028)  time: 4.1118  data: 0.0002  max mem: 54684
+[01:10:20.231136] Epoch: [0]  [500/3229]  lr: 0.000039  grad_norm: 0.9235 (1.0854)  closs: 1.1249 (1.2011)  time: 4.0663  data: 0.0002  max mem: 54684
+[01:11:01.054523] Epoch: [0]  [510/3229]  lr: 0.000039  grad_norm: 0.9309 (1.0834)  closs: 1.1249 (1.1995)  time: 4.0654  data: 0.0002  max mem: 54684
+[01:11:42.308142] Epoch: [0]  [520/3229]  lr: 0.000040  grad_norm: 0.9541 (1.0806)  closs: 1.1226 (1.1972)  time: 4.1038  data: 0.0002  max mem: 54684
+[01:12:23.110317] Epoch: [0]  [530/3229]  lr: 0.000041  grad_norm: 0.9516 (1.0777)  closs: 1.1269 (1.1965)  time: 4.1027  data: 0.0002  max mem: 54684
+[01:13:03.936397] Epoch: [0]  [540/3229]  lr: 0.000042  grad_norm: 0.9204 (1.0814)  closs: 1.1651 (1.1959)  time: 4.0813  data: 0.0002  max mem: 54684
+[01:13:44.208566] Epoch: [0]  [550/3229]  lr: 0.000043  grad_norm: 0.9204 (1.0785)  closs: 1.1192 (1.1931)  time: 4.0548  data: 0.0002  max mem: 54684
+[01:14:25.497954] Epoch: [0]  [560/3229]  lr: 0.000043  grad_norm: 0.9295 (1.0758)  closs: 1.0839 (1.1919)  time: 4.0780  data: 0.0002  max mem: 54684
+[01:15:05.977736] Epoch: [0]  [570/3229]  lr: 0.000044  grad_norm: 0.9723 (1.0751)  closs: 1.1325 (1.1915)  time: 4.0884  data: 0.0002  max mem: 54684
+[01:15:47.138810] Epoch: [0]  [580/3229]  lr: 0.000045  grad_norm: 0.9485 (1.0728)  closs: 1.1764 (1.1910)  time: 4.0820  data: 0.0002  max mem: 54684
+[01:16:28.229266] Epoch: [0]  [590/3229]  lr: 0.000046  grad_norm: 0.9286 (1.0704)  closs: 1.1431 (1.1902)  time: 4.1125  data: 0.0002  max mem: 54684
+[01:17:08.494066] Epoch: [0]  [600/3229]  lr: 0.000046  grad_norm: 0.9124 (1.0680)  closs: 1.1127 (1.1888)  time: 4.0677  data: 0.0002  max mem: 54684
+[01:17:49.333146] Epoch: [0]  [610/3229]  lr: 0.000047  grad_norm: 0.8772 (1.0649)  closs: 1.1387 (1.1877)  time: 4.0551  data: 0.0002  max mem: 54684
+[01:18:30.513096] Epoch: [0]  [620/3229]  lr: 0.000048  grad_norm: 0.8946 (1.0631)  closs: 1.1490 (1.1873)  time: 4.1009  data: 0.0002  max mem: 54684
+[01:19:12.132382] Epoch: [0]  [630/3229]  lr: 0.000049  grad_norm: 0.8910 (1.0597)  closs: 1.1475 (1.1862)  time: 4.1399  data: 0.0002  max mem: 54684
+[01:19:52.876937] Epoch: [0]  [640/3229]  lr: 0.000050  grad_norm: 0.8884 (1.0579)  closs: 1.1173 (1.1843)  time: 4.1181  data: 0.0002  max mem: 54684
+[01:20:33.485645] Epoch: [0]  [650/3229]  lr: 0.000050  grad_norm: 0.8884 (1.0558)  closs: 1.0704 (1.1821)  time: 4.0676  data: 0.0002  max mem: 54684
+[01:21:14.337520] Epoch: [0]  [660/3229]  lr: 0.000050  grad_norm: 0.8974 (1.0531)  closs: 1.0758 (1.1812)  time: 4.0730  data: 0.0002  max mem: 54684
+[01:21:54.945867] Epoch: [0]  [670/3229]  lr: 0.000050  grad_norm: 0.9121 (1.0517)  closs: 1.1193 (1.1797)  time: 4.0729  data: 0.0002  max mem: 54684
+[01:22:36.533925] Epoch: [0]  [680/3229]  lr: 0.000050  grad_norm: 0.9163 (1.0500)  closs: 1.1473 (1.1795)  time: 4.1097  data: 0.0002  max mem: 54684
+[01:23:17.315950] Epoch: [0]  [690/3229]  lr: 0.000050  grad_norm: 0.9097 (1.0480)  closs: 1.1498 (1.1785)  time: 4.1184  data: 0.0002  max mem: 54684
+[01:23:57.791032] Epoch: [0]  [700/3229]  lr: 0.000050  grad_norm: 0.8856 (1.0454)  closs: 1.1322 (1.1770)  time: 4.0628  data: 0.0003  max mem: 54684
+[01:24:38.841536] Epoch: [0]  [710/3229]  lr: 0.000050  grad_norm: 0.8786 (1.0446)  closs: 1.1357 (1.1766)  time: 4.0762  data: 0.0003  max mem: 54684
+[01:25:19.414277] Epoch: [0]  [720/3229]  lr: 0.000050  grad_norm: 0.9131 (1.0427)  closs: 1.1209 (1.1750)  time: 4.0811  data: 0.0002  max mem: 54684
+[01:26:00.560290] Epoch: [0]  [730/3229]  lr: 0.000050  grad_norm: 0.9133 (1.0408)  closs: 1.0723 (1.1737)  time: 4.0859  data: 0.0002  max mem: 54684
+[01:26:41.063904] Epoch: [0]  [740/3229]  lr: 0.000050  grad_norm: 0.8809 (1.0382)  closs: 1.0816 (1.1727)  time: 4.0824  data: 0.0002  max mem: 54684
+[01:27:22.243420] Epoch: [0]  [750/3229]  lr: 0.000050  grad_norm: 0.8721 (1.0366)  closs: 1.1144 (1.1718)  time: 4.0841  data: 0.0002  max mem: 54684
+[01:28:03.061085] Epoch: [0]  [760/3229]  lr: 0.000050  grad_norm: 0.8563 (1.0340)  closs: 1.1227 (1.1713)  time: 4.0998  data: 0.0002  max mem: 54684
+[01:28:43.881750] Epoch: [0]  [770/3229]  lr: 0.000050  grad_norm: 0.8563 (1.0319)  closs: 1.1247 (1.1701)  time: 4.0818  data: 0.0002  max mem: 54684
+[01:29:24.382823] Epoch: [0]  [780/3229]  lr: 0.000050  grad_norm: 0.8666 (1.0297)  closs: 1.0862 (1.1689)  time: 4.0660  data: 0.0003  max mem: 54684
+[01:30:05.263440] Epoch: [0]  [790/3229]  lr: 0.000050  grad_norm: 0.8803 (1.0278)  closs: 1.0773 (1.1675)  time: 4.0690  data: 0.0002  max mem: 54684
+[01:30:46.132360] Epoch: [0]  [800/3229]  lr: 0.000050  grad_norm: 0.8803 (1.0263)  closs: 1.0871 (1.1669)  time: 4.0874  data: 0.0002  max mem: 54684
+[01:31:26.624450] Epoch: [0]  [810/3229]  lr: 0.000050  grad_norm: 0.8782 (1.0245)  closs: 1.0958 (1.1659)  time: 4.0680  data: 0.0002  max mem: 54684
+[01:32:07.126461] Epoch: [0]  [820/3229]  lr: 0.000050  grad_norm: 0.8762 (1.0225)  closs: 1.0958 (1.1649)  time: 4.0496  data: 0.0002  max mem: 54684
+[01:32:48.359187] Epoch: [0]  [830/3229]  lr: 0.000050  grad_norm: 0.8256 (1.0204)  closs: 1.0959 (1.1644)  time: 4.0867  data: 0.0002  max mem: 54684
+[01:33:29.469866] Epoch: [0]  [840/3229]  lr: 0.000050  grad_norm: 0.8423 (1.0185)  closs: 1.1185 (1.1638)  time: 4.1171  data: 0.0002  max mem: 54684
+[01:34:09.970107] Epoch: [0]  [850/3229]  lr: 0.000050  grad_norm: 0.8891 (1.0170)  closs: 1.1185 (1.1627)  time: 4.0805  data: 0.0002  max mem: 54684
+[01:34:50.161303] Epoch: [0]  [860/3229]  lr: 0.000050  grad_norm: 0.8891 (1.0154)  closs: 1.0997 (1.1617)  time: 4.0345  data: 0.0002  max mem: 54684
+[01:35:31.765849] Epoch: [0]  [870/3229]  lr: 0.000050  grad_norm: 0.8328 (1.0135)  closs: 1.0972 (1.1610)  time: 4.0897  data: 0.0002  max mem: 54684
+[01:36:12.904367] Epoch: [0]  [880/3229]  lr: 0.000050  grad_norm: 0.8507 (1.0124)  closs: 1.0972 (1.1607)  time: 4.1371  data: 0.0002  max mem: 54684
+[01:36:54.052581] Epoch: [0]  [890/3229]  lr: 0.000050  grad_norm: 0.8788 (1.0109)  closs: 1.1339 (1.1602)  time: 4.1143  data: 0.0002  max mem: 54684
+[01:37:34.868212] Epoch: [0]  [900/3229]  lr: 0.000050  grad_norm: 0.8585 (1.0094)  closs: 1.1287 (1.1597)  time: 4.0981  data: 0.0002  max mem: 54684
+[01:38:16.113165] Epoch: [0]  [910/3229]  lr: 0.000050  grad_norm: 0.8376 (1.0077)  closs: 1.1211 (1.1589)  time: 4.1030  data: 0.0002  max mem: 54684
+[01:38:56.583668] Epoch: [0]  [920/3229]  lr: 0.000050  grad_norm: 0.8361 (1.0058)  closs: 1.0987 (1.1582)  time: 4.0857  data: 0.0002  max mem: 54684
+[01:39:37.055401] Epoch: [0]  [930/3229]  lr: 0.000050  grad_norm: 0.8472 (1.0044)  closs: 1.1219 (1.1577)  time: 4.0470  data: 0.0002  max mem: 54684
+[01:40:18.207956] Epoch: [0]  [940/3229]  lr: 0.000050  grad_norm: 0.8701 (1.0031)  closs: 1.1219 (1.1571)  time: 4.0811  data: 0.0002  max mem: 54684
+[01:40:59.474520] Epoch: [0]  [950/3229]  lr: 0.000050  grad_norm: 0.8784 (1.0016)  closs: 1.1027 (1.1563)  time: 4.1209  data: 0.0002  max mem: 54684
+[01:41:40.642522] Epoch: [0]  [960/3229]  lr: 0.000050  grad_norm: 0.8791 (1.0010)  closs: 1.0988 (1.1557)  time: 4.1217  data: 0.0002  max mem: 54684
+[01:42:21.476520] Epoch: [0]  [970/3229]  lr: 0.000050  grad_norm: 0.8580 (0.9997)  closs: 1.1107 (1.1550)  time: 4.1000  data: 0.0002  max mem: 54684
+[01:43:01.957616] Epoch: [0]  [980/3229]  lr: 0.000050  grad_norm: 0.8710 (0.9985)  closs: 1.1156 (1.1545)  time: 4.0657  data: 0.0002  max mem: 54684
+[01:43:43.560118] Epoch: [0]  [990/3229]  lr: 0.000050  grad_norm: 0.8756 (0.9976)  closs: 1.1236 (1.1544)  time: 4.1041  data: 0.0002  max mem: 54684
+[01:44:24.011660] Epoch: [0]  [1000/3229]  lr: 0.000050  grad_norm: 0.8893 (0.9967)  closs: 1.1164 (1.1533)  time: 4.1026  data: 0.0002  max mem: 54684
+[01:45:04.795094] Epoch: [0]  [1010/3229]  lr: 0.000050  grad_norm: 0.8719 (0.9951)  closs: 1.0624 (1.1522)  time: 4.0617  data: 0.0002  max mem: 54684
+[01:45:45.580136] Epoch: [0]  [1020/3229]  lr: 0.000050  grad_norm: 0.8502 (0.9936)  closs: 1.0902 (1.1518)  time: 4.0784  data: 0.0002  max mem: 54684
+[01:46:27.178733] Epoch: [0]  [1030/3229]  lr: 0.000050  grad_norm: 0.8502 (0.9924)  closs: 1.1499 (1.1517)  time: 4.1191  data: 0.0002  max mem: 54684
+[01:47:07.978058] Epoch: [0]  [1040/3229]  lr: 0.000050  grad_norm: 0.8731 (0.9909)  closs: 1.0968 (1.1510)  time: 4.1198  data: 0.0002  max mem: 54684
+[01:47:49.138856] Epoch: [0]  [1050/3229]  lr: 0.000050  grad_norm: 0.8731 (0.9900)  closs: 1.1148 (1.1508)  time: 4.0979  data: 0.0002  max mem: 54684
+[01:48:30.427876] Epoch: [0]  [1060/3229]  lr: 0.000050  grad_norm: 0.8918 (0.9889)  closs: 1.1191 (1.1504)  time: 4.1224  data: 0.0002  max mem: 54684
+[01:49:10.546105] Epoch: [0]  [1070/3229]  lr: 0.000050  grad_norm: 0.8288 (0.9871)  closs: 1.0949 (1.1494)  time: 4.0703  data: 0.0002  max mem: 54684
+[01:49:51.703975] Epoch: [0]  [1080/3229]  lr: 0.000050  grad_norm: 0.8567 (0.9861)  closs: 1.1071 (1.1494)  time: 4.0637  data: 0.0002  max mem: 54684
+[01:50:32.529173] Epoch: [0]  [1090/3229]  lr: 0.000050  grad_norm: 0.8706 (0.9851)  closs: 1.1351 (1.1491)  time: 4.0991  data: 0.0002  max mem: 54684
+[01:51:12.582431] Epoch: [0]  [1100/3229]  lr: 0.000050  grad_norm: 0.8475 (0.9835)  closs: 1.0957 (1.1482)  time: 4.0438  data: 0.0002  max mem: 54684
+[01:51:53.016781] Epoch: [0]  [1110/3229]  lr: 0.000050  grad_norm: 0.8386 (0.9821)  closs: 1.0904 (1.1474)  time: 4.0243  data: 0.0002  max mem: 54684
+[01:52:33.815856] Epoch: [0]  [1120/3229]  lr: 0.000050  grad_norm: 0.8762 (0.9811)  closs: 1.1067 (1.1470)  time: 4.0616  data: 0.0002  max mem: 54684
+[01:53:14.619418] Epoch: [0]  [1130/3229]  lr: 0.000050  grad_norm: 0.8767 (0.9802)  closs: 1.0902 (1.1461)  time: 4.0801  data: 0.0002  max mem: 54684
+[01:53:55.215391] Epoch: [0]  [1140/3229]  lr: 0.000050  grad_norm: 0.8585 (0.9789)  closs: 1.0626 (1.1455)  time: 4.0699  data: 0.0002  max mem: 54684
+[01:54:36.057646] Epoch: [0]  [1150/3229]  lr: 0.000050  grad_norm: 0.8274 (0.9774)  closs: 1.0626 (1.1444)  time: 4.0718  data: 0.0002  max mem: 54684
+[01:55:16.560302] Epoch: [0]  [1160/3229]  lr: 0.000050  grad_norm: 0.8237 (0.9766)  closs: 1.0592 (1.1436)  time: 4.0672  data: 0.0002  max mem: 54684
+[01:55:57.057967] Epoch: [0]  [1170/3229]  lr: 0.000050  grad_norm: 0.8557 (0.9756)  closs: 1.0592 (1.1425)  time: 4.0499  data: 0.0002  max mem: 54684
+[01:56:37.362333] Epoch: [0]  [1180/3229]  lr: 0.000050  grad_norm: 0.8549 (0.9744)  closs: 1.0497 (1.1417)  time: 4.0400  data: 0.0002  max mem: 54684
+[01:57:18.113945] Epoch: [0]  [1190/3229]  lr: 0.000050  grad_norm: 0.8588 (0.9733)  closs: 1.0712 (1.1410)  time: 4.0527  data: 0.0002  max mem: 54684
+[01:57:58.946014] Epoch: [0]  [1200/3229]  lr: 0.000050  grad_norm: 0.8545 (0.9721)  closs: 1.1016 (1.1406)  time: 4.0791  data: 0.0002  max mem: 54684
+[01:58:39.470964] Epoch: [0]  [1210/3229]  lr: 0.000050  grad_norm: 0.8155 (0.9705)  closs: 1.0832 (1.1398)  time: 4.0678  data: 0.0002  max mem: 54684
+[01:59:20.231476] Epoch: [0]  [1220/3229]  lr: 0.000050  grad_norm: 0.8030 (0.9693)  closs: 1.0567 (1.1392)  time: 4.0642  data: 0.0002  max mem: 54684
+[02:00:01.565697] Epoch: [0]  [1230/3229]  lr: 0.000050  grad_norm: 0.8324 (0.9682)  closs: 1.1310 (1.1394)  time: 4.1047  data: 0.0002  max mem: 54684
+[02:00:42.411163] Epoch: [0]  [1240/3229]  lr: 0.000050  grad_norm: 0.8455 (0.9676)  closs: 1.1277 (1.1390)  time: 4.1089  data: 0.0002  max mem: 54684
+[02:01:22.913104] Epoch: [0]  [1250/3229]  lr: 0.000050  grad_norm: 0.8509 (0.9664)  closs: 1.0848 (1.1386)  time: 4.0673  data: 0.0002  max mem: 54684
+[02:02:03.681895] Epoch: [0]  [1260/3229]  lr: 0.000049  grad_norm: 0.8150 (0.9655)  closs: 1.0657 (1.1378)  time: 4.0635  data: 0.0002  max mem: 54684
+[02:02:44.382490] Epoch: [0]  [1270/3229]  lr: 0.000049  grad_norm: 0.8150 (0.9645)  closs: 1.0536 (1.1371)  time: 4.0734  data: 0.0002  max mem: 54684
+[02:03:25.548675] Epoch: [0]  [1280/3229]  lr: 0.000049  grad_norm: 0.8142 (0.9634)  closs: 1.0764 (1.1366)  time: 4.0933  data: 0.0002  max mem: 54684
+[02:04:05.764704] Epoch: [0]  [1290/3229]  lr: 0.000049  grad_norm: 0.8104 (0.9624)  closs: 1.0764 (1.1362)  time: 4.0690  data: 0.0002  max mem: 54684
+[02:04:46.308118] Epoch: [0]  [1300/3229]  lr: 0.000049  grad_norm: 0.8170 (0.9614)  closs: 1.0782 (1.1357)  time: 4.0379  data: 0.0002  max mem: 54684
+[02:05:27.268012] Epoch: [0]  [1310/3229]  lr: 0.000049  grad_norm: 0.8101 (0.9602)  closs: 1.0872 (1.1352)  time: 4.0751  data: 0.0002  max mem: 54684
+[02:06:07.751784] Epoch: [0]  [1320/3229]  lr: 0.000049  grad_norm: 0.8024 (0.9591)  closs: 1.0757 (1.1345)  time: 4.0721  data: 0.0002  max mem: 54684
+[02:06:48.603185] Epoch: [0]  [1330/3229]  lr: 0.000049  grad_norm: 0.8059 (0.9579)  closs: 1.0757 (1.1343)  time: 4.0667  data: 0.0002  max mem: 54684
+[02:07:29.989007] Epoch: [0]  [1340/3229]  lr: 0.000049  grad_norm: 0.8320 (0.9575)  closs: 1.1105 (1.1342)  time: 4.1118  data: 0.0002  max mem: 54684
+[02:08:10.751898] Epoch: [0]  [1350/3229]  lr: 0.000049  grad_norm: 0.8769 (0.9566)  closs: 1.0763 (1.1338)  time: 4.1074  data: 0.0002  max mem: 54684
+[02:08:51.277537] Epoch: [0]  [1360/3229]  lr: 0.000049  grad_norm: 0.8228 (0.9556)  closs: 1.0611 (1.1332)  time: 4.0644  data: 0.0002  max mem: 54684
+[02:09:32.452759] Epoch: [0]  [1370/3229]  lr: 0.000049  grad_norm: 0.8208 (0.9549)  closs: 1.0753 (1.1329)  time: 4.0850  data: 0.0002  max mem: 54684
+[02:10:13.889635] Epoch: [0]  [1380/3229]  lr: 0.000049  grad_norm: 0.8204 (0.9543)  closs: 1.0725 (1.1325)  time: 4.1305  data: 0.0002  max mem: 54684
+[02:10:54.494775] Epoch: [0]  [1390/3229]  lr: 0.000049  grad_norm: 0.8187 (0.9533)  closs: 1.0725 (1.1321)  time: 4.1020  data: 0.0002  max mem: 54684
+[02:11:35.317556] Epoch: [0]  [1400/3229]  lr: 0.000049  grad_norm: 0.8187 (0.9522)  closs: 1.0902 (1.1317)  time: 4.0713  data: 0.0002  max mem: 54684
+[02:12:16.125134] Epoch: [0]  [1410/3229]  lr: 0.000049  grad_norm: 0.7985 (0.9512)  closs: 1.0906 (1.1314)  time: 4.0814  data: 0.0002  max mem: 54684
+[02:12:57.277533] Epoch: [0]  [1420/3229]  lr: 0.000049  grad_norm: 0.8158 (0.9501)  closs: 1.1104 (1.1314)  time: 4.0979  data: 0.0002  max mem: 54684
+[02:13:37.904443] Epoch: [0]  [1430/3229]  lr: 0.000049  grad_norm: 0.8008 (0.9488)  closs: 1.0989 (1.1309)  time: 4.0889  data: 0.0002  max mem: 54684
+[02:14:18.743491] Epoch: [0]  [1440/3229]  lr: 0.000049  grad_norm: 0.8008 (0.9479)  closs: 1.0963 (1.1307)  time: 4.0732  data: 0.0002  max mem: 54684
+[02:14:59.571598] Epoch: [0]  [1450/3229]  lr: 0.000049  grad_norm: 0.8153 (0.9467)  closs: 1.1322 (1.1308)  time: 4.0833  data: 0.0002  max mem: 54684
+[02:15:40.772886] Epoch: [0]  [1460/3229]  lr: 0.000049  grad_norm: 0.8315 (0.9465)  closs: 1.1439 (1.1306)  time: 4.1014  data: 0.0002  max mem: 54684
+[02:16:21.736547] Epoch: [0]  [1470/3229]  lr: 0.000049  grad_norm: 0.8342 (0.9457)  closs: 1.0850 (1.1301)  time: 4.1082  data: 0.0002  max mem: 54684
+[02:17:02.910967] Epoch: [0]  [1480/3229]  lr: 0.000049  grad_norm: 0.7991 (0.9446)  closs: 1.0827 (1.1298)  time: 4.1068  data: 0.0002  max mem: 54684
+[02:17:44.083251] Epoch: [0]  [1490/3229]  lr: 0.000049  grad_norm: 0.7991 (0.9442)  closs: 1.1022 (1.1297)  time: 4.1173  data: 0.0002  max mem: 54684
+[02:18:26.042677] Epoch: [0]  [1500/3229]  lr: 0.000049  grad_norm: 0.8720 (0.9438)  closs: 1.1096 (1.1297)  time: 4.1565  data: 0.0002  max mem: 54684
+[02:19:06.874119] Epoch: [0]  [1510/3229]  lr: 0.000049  grad_norm: 0.8466 (0.9430)  closs: 1.0920 (1.1293)  time: 4.1395  data: 0.0002  max mem: 54684
+[02:19:48.004002] Epoch: [0]  [1520/3229]  lr: 0.000049  grad_norm: 0.8431 (0.9425)  closs: 1.0898 (1.1291)  time: 4.0980  data: 0.0002  max mem: 54684
+[02:20:29.271406] Epoch: [0]  [1530/3229]  lr: 0.000049  grad_norm: 0.8229 (0.9419)  closs: 1.0907 (1.1289)  time: 4.1198  data: 0.0002  max mem: 54684
+[02:21:10.763947] Epoch: [0]  [1540/3229]  lr: 0.000049  grad_norm: 0.8301 (0.9413)  closs: 1.1024 (1.1285)  time: 4.1379  data: 0.0002  max mem: 54684
+[02:21:50.930108] Epoch: [0]  [1550/3229]  lr: 0.000049  grad_norm: 0.8301 (0.9403)  closs: 1.0550 (1.1277)  time: 4.0829  data: 0.0002  max mem: 54684
+[02:22:31.739856] Epoch: [0]  [1560/3229]  lr: 0.000049  grad_norm: 0.7819 (0.9392)  closs: 1.0473 (1.1272)  time: 4.0487  data: 0.0002  max mem: 54684
+[02:23:12.906228] Epoch: [0]  [1570/3229]  lr: 0.000049  grad_norm: 0.7600 (0.9384)  closs: 1.0405 (1.1268)  time: 4.0987  data: 0.0002  max mem: 54684
+[02:23:54.080302] Epoch: [0]  [1580/3229]  lr: 0.000049  grad_norm: 0.8070 (0.9376)  closs: 1.0689 (1.1264)  time: 4.1169  data: 0.0002  max mem: 54684
+[02:24:34.900214] Epoch: [0]  [1590/3229]  lr: 0.000049  grad_norm: 0.8137 (0.9367)  closs: 1.0689 (1.1259)  time: 4.0996  data: 0.0002  max mem: 54684
+[02:25:14.745725] Epoch: [0]  [1600/3229]  lr: 0.000049  grad_norm: 0.7806 (0.9354)  closs: 1.0465 (1.1251)  time: 4.0332  data: 0.0002  max mem: 54684
+[02:25:55.704023] Epoch: [0]  [1610/3229]  lr: 0.000049  grad_norm: 0.8074 (0.9348)  closs: 1.0935 (1.1250)  time: 4.0401  data: 0.0002  max mem: 54684
+[02:26:37.197949] Epoch: [0]  [1620/3229]  lr: 0.000049  grad_norm: 0.8162 (0.9342)  closs: 1.1148 (1.1250)  time: 4.1225  data: 0.0002  max mem: 54684
+[02:27:18.326238] Epoch: [0]  [1630/3229]  lr: 0.000049  grad_norm: 0.8162 (0.9335)  closs: 1.0970 (1.1247)  time: 4.1310  data: 0.0002  max mem: 54684
+[02:27:59.451089] Epoch: [0]  [1640/3229]  lr: 0.000049  grad_norm: 0.8227 (0.9328)  closs: 1.1007 (1.1248)  time: 4.1126  data: 0.0002  max mem: 54684
+[02:28:40.359589] Epoch: [0]  [1650/3229]  lr: 0.000049  grad_norm: 0.8227 (0.9321)  closs: 1.0987 (1.1244)  time: 4.1016  data: 0.0002  max mem: 54684
+[02:29:21.499384] Epoch: [0]  [1660/3229]  lr: 0.000049  grad_norm: 0.7906 (0.9314)  closs: 1.0815 (1.1243)  time: 4.1023  data: 0.0002  max mem: 54684
+[02:30:01.961500] Epoch: [0]  [1670/3229]  lr: 0.000049  grad_norm: 0.7906 (0.9306)  closs: 1.1051 (1.1242)  time: 4.0800  data: 0.0002  max mem: 54684
+[02:30:43.093943] Epoch: [0]  [1680/3229]  lr: 0.000049  grad_norm: 0.7620 (0.9296)  closs: 1.0947 (1.1241)  time: 4.0797  data: 0.0002  max mem: 54684
+[02:31:23.378880] Epoch: [0]  [1690/3229]  lr: 0.000049  grad_norm: 0.7389 (0.9285)  closs: 1.0770 (1.1237)  time: 4.0708  data: 0.0002  max mem: 54684
+[02:32:04.494682] Epoch: [0]  [1700/3229]  lr: 0.000049  grad_norm: 0.7654 (0.9278)  closs: 1.0655 (1.1234)  time: 4.0700  data: 0.0002  max mem: 54684
+[02:32:44.682897] Epoch: [0]  [1710/3229]  lr: 0.000048  grad_norm: 0.7788 (0.9267)  closs: 1.0655 (1.1229)  time: 4.0651  data: 0.0002  max mem: 54684
+[02:33:25.176480] Epoch: [0]  [1720/3229]  lr: 0.000048  grad_norm: 0.7793 (0.9260)  closs: 1.0609 (1.1226)  time: 4.0340  data: 0.0002  max mem: 54684
+[02:34:06.615615] Epoch: [0]  [1730/3229]  lr: 0.000048  grad_norm: 0.7983 (0.9254)  closs: 1.0603 (1.1224)  time: 4.0966  data: 0.0002  max mem: 54684
+[02:34:47.629024] Epoch: [0]  [1740/3229]  lr: 0.000048  grad_norm: 0.7874 (0.9245)  closs: 1.0761 (1.1222)  time: 4.1226  data: 0.0002  max mem: 54684
+[02:35:28.439728] Epoch: [0]  [1750/3229]  lr: 0.000048  grad_norm: 0.8113 (0.9241)  closs: 1.0861 (1.1220)  time: 4.0911  data: 0.0002  max mem: 54684
+[02:36:09.569880] Epoch: [0]  [1760/3229]  lr: 0.000048  grad_norm: 0.8314 (0.9236)  closs: 1.1081 (1.1218)  time: 4.0970  data: 0.0002  max mem: 54684
+[02:36:50.672861] Epoch: [0]  [1770/3229]  lr: 0.000048  grad_norm: 0.8306 (0.9229)  closs: 1.0888 (1.1214)  time: 4.1116  data: 0.0002  max mem: 54684
+[02:37:31.667284] Epoch: [0]  [1780/3229]  lr: 0.000048  grad_norm: 0.8092 (0.9221)  closs: 1.0832 (1.1211)  time: 4.1048  data: 0.0002  max mem: 54684
+[02:38:12.473138] Epoch: [0]  [1790/3229]  lr: 0.000048  grad_norm: 0.7678 (0.9212)  closs: 1.0395 (1.1207)  time: 4.0899  data: 0.0002  max mem: 54684
+[02:38:52.942058] Epoch: [0]  [1800/3229]  lr: 0.000048  grad_norm: 0.7929 (0.9204)  closs: 1.0356 (1.1204)  time: 4.0637  data: 0.0002  max mem: 54684
+[02:39:33.666135] Epoch: [0]  [1810/3229]  lr: 0.000048  grad_norm: 0.7905 (0.9195)  closs: 1.0807 (1.1202)  time: 4.0596  data: 0.0002  max mem: 54684
+[02:40:14.381280] Epoch: [0]  [1820/3229]  lr: 0.000048  grad_norm: 0.7724 (0.9187)  closs: 1.1055 (1.1200)  time: 4.0719  data: 0.0002  max mem: 54684
+[02:40:55.197221] Epoch: [0]  [1830/3229]  lr: 0.000048  grad_norm: 0.7724 (0.9180)  closs: 1.0897 (1.1199)  time: 4.0765  data: 0.0002  max mem: 54684
+[02:41:35.043279] Epoch: [0]  [1840/3229]  lr: 0.000048  grad_norm: 0.7413 (0.9168)  closs: 1.0573 (1.1195)  time: 4.0330  data: 0.0002  max mem: 54684
+[02:42:16.132889] Epoch: [0]  [1850/3229]  lr: 0.000048  grad_norm: 0.7712 (0.9161)  closs: 1.0468 (1.1191)  time: 4.0467  data: 0.0002  max mem: 54684
+[02:42:56.518513] Epoch: [0]  [1860/3229]  lr: 0.000048  grad_norm: 0.7990 (0.9154)  closs: 1.0714 (1.1188)  time: 4.0737  data: 0.0002  max mem: 54684
+[02:43:37.655055] Epoch: [0]  [1870/3229]  lr: 0.000048  grad_norm: 0.8236 (0.9150)  closs: 1.1020 (1.1187)  time: 4.0760  data: 0.0002  max mem: 54684
+[02:44:18.501800] Epoch: [0]  [1880/3229]  lr: 0.000048  grad_norm: 0.8055 (0.9143)  closs: 1.0773 (1.1183)  time: 4.0991  data: 0.0002  max mem: 54684
+[02:44:59.327626] Epoch: [0]  [1890/3229]  lr: 0.000048  grad_norm: 0.7469 (0.9135)  closs: 1.0612 (1.1180)  time: 4.0836  data: 0.0002  max mem: 54684
+[02:45:40.957982] Epoch: [0]  [1900/3229]  lr: 0.000048  grad_norm: 0.8118 (0.9132)  closs: 1.0947 (1.1179)  time: 4.1227  data: 0.0002  max mem: 54684
+[02:46:21.456520] Epoch: [0]  [1910/3229]  lr: 0.000048  grad_norm: 0.8009 (0.9124)  closs: 1.0516 (1.1173)  time: 4.1064  data: 0.0002  max mem: 54684
+[02:47:01.609291] Epoch: [0]  [1920/3229]  lr: 0.000048  grad_norm: 0.8009 (0.9121)  closs: 1.0265 (1.1168)  time: 4.0325  data: 0.0002  max mem: 54684
+[02:47:43.618438] Epoch: [0]  [1930/3229]  lr: 0.000048  grad_norm: 0.8074 (0.9115)  closs: 1.0395 (1.1167)  time: 4.1080  data: 0.0002  max mem: 54684
+[02:48:24.870145] Epoch: [0]  [1940/3229]  lr: 0.000048  grad_norm: 0.7995 (0.9110)  closs: 1.1143 (1.1167)  time: 4.1630  data: 0.0002  max mem: 54684
+[02:49:06.019794] Epoch: [0]  [1950/3229]  lr: 0.000048  grad_norm: 0.8393 (0.9109)  closs: 1.1143 (1.1166)  time: 4.1200  data: 0.0001  max mem: 54684
+[02:49:46.491553] Epoch: [0]  [1960/3229]  lr: 0.000048  grad_norm: 0.8373 (0.9102)  closs: 1.0866 (1.1165)  time: 4.0810  data: 0.0002  max mem: 54684
+[02:50:27.704222] Epoch: [0]  [1970/3229]  lr: 0.000048  grad_norm: 0.8284 (0.9099)  closs: 1.0866 (1.1163)  time: 4.0841  data: 0.0002  max mem: 54684
+[02:51:08.271766] Epoch: [0]  [1980/3229]  lr: 0.000048  grad_norm: 0.7955 (0.9092)  closs: 1.0799 (1.1161)  time: 4.0890  data: 0.0002  max mem: 54684
+[02:51:49.381940] Epoch: [0]  [1990/3229]  lr: 0.000048  grad_norm: 0.7955 (0.9090)  closs: 1.0799 (1.1159)  time: 4.0838  data: 0.0002  max mem: 54684
+[02:52:29.511376] Epoch: [0]  [2000/3229]  lr: 0.000048  grad_norm: 0.8154 (0.9083)  closs: 1.0683 (1.1154)  time: 4.0619  data: 0.0002  max mem: 54684
+[02:53:10.897602] Epoch: [0]  [2010/3229]  lr: 0.000048  grad_norm: 0.8103 (0.9078)  closs: 1.0683 (1.1152)  time: 4.0757  data: 0.0002  max mem: 54684
+[02:53:51.562695] Epoch: [0]  [2020/3229]  lr: 0.000047  grad_norm: 0.8103 (0.9072)  closs: 1.0827 (1.1151)  time: 4.1025  data: 0.0002  max mem: 54684
+[02:54:31.690739] Epoch: [0]  [2030/3229]  lr: 0.000047  grad_norm: 0.7805 (0.9065)  closs: 1.0595 (1.1145)  time: 4.0396  data: 0.0002  max mem: 54684
+[02:55:12.459763] Epoch: [0]  [2040/3229]  lr: 0.000047  grad_norm: 0.7728 (0.9058)  closs: 1.0613 (1.1143)  time: 4.0448  data: 0.0002  max mem: 54684
+[02:55:54.276314] Epoch: [0]  [2050/3229]  lr: 0.000047  grad_norm: 0.7637 (0.9051)  closs: 1.1000 (1.1144)  time: 4.1292  data: 0.0002  max mem: 54684
+[02:56:35.163977] Epoch: [0]  [2060/3229]  lr: 0.000047  grad_norm: 0.7644 (0.9045)  closs: 1.1134 (1.1143)  time: 4.1351  data: 0.0002  max mem: 54684
+[02:57:15.269544] Epoch: [0]  [2070/3229]  lr: 0.000047  grad_norm: 0.7892 (0.9039)  closs: 1.0588 (1.1138)  time: 4.0496  data: 0.0002  max mem: 54684
+[02:57:56.040287] Epoch: [0]  [2080/3229]  lr: 0.000047  grad_norm: 0.8042 (0.9033)  closs: 1.0447 (1.1136)  time: 4.0438  data: 0.0002  max mem: 54684
+[02:58:36.828734] Epoch: [0]  [2090/3229]  lr: 0.000047  grad_norm: 0.7867 (0.9027)  closs: 1.0914 (1.1133)  time: 4.0779  data: 0.0002  max mem: 54684
+[02:59:17.056084] Epoch: [0]  [2100/3229]  lr: 0.000047  grad_norm: 0.7766 (0.9021)  closs: 1.0796 (1.1131)  time: 4.0507  data: 0.0002  max mem: 54684
+[02:59:57.850948] Epoch: [0]  [2110/3229]  lr: 0.000047  grad_norm: 0.7773 (0.9016)  closs: 1.0748 (1.1129)  time: 4.0510  data: 0.0002  max mem: 54684
+[03:00:38.445171] Epoch: [0]  [2120/3229]  lr: 0.000047  grad_norm: 0.8051 (0.9012)  closs: 1.0689 (1.1126)  time: 4.0694  data: 0.0002  max mem: 54684
+[03:01:19.387745] Epoch: [0]  [2130/3229]  lr: 0.000047  grad_norm: 0.8057 (0.9006)  closs: 1.0761 (1.1126)  time: 4.0768  data: 0.0002  max mem: 54684
+[03:01:59.964804] Epoch: [0]  [2140/3229]  lr: 0.000047  grad_norm: 0.8057 (0.9000)  closs: 1.1012 (1.1124)  time: 4.0759  data: 0.0002  max mem: 54684
+[03:02:40.770931] Epoch: [0]  [2150/3229]  lr: 0.000047  grad_norm: 0.7907 (0.8995)  closs: 1.0643 (1.1121)  time: 4.0691  data: 0.0002  max mem: 54684
+[03:03:21.686342] Epoch: [0]  [2160/3229]  lr: 0.000047  grad_norm: 0.7718 (0.8990)  closs: 1.0844 (1.1122)  time: 4.0860  data: 0.0002  max mem: 54684
+[03:04:02.991052] Epoch: [0]  [2170/3229]  lr: 0.000047  grad_norm: 0.7893 (0.8986)  closs: 1.1509 (1.1124)  time: 4.1109  data: 0.0002  max mem: 54684
+[03:04:44.216654] Epoch: [0]  [2180/3229]  lr: 0.000047  grad_norm: 0.8023 (0.8981)  closs: 1.0995 (1.1123)  time: 4.1264  data: 0.0002  max mem: 54684
+[03:05:24.677881] Epoch: [0]  [2190/3229]  lr: 0.000047  grad_norm: 0.8062 (0.8976)  closs: 1.0876 (1.1122)  time: 4.0843  data: 0.0002  max mem: 54684
+[03:06:05.666908] Epoch: [0]  [2200/3229]  lr: 0.000047  grad_norm: 0.8079 (0.8971)  closs: 1.0597 (1.1120)  time: 4.0724  data: 0.0002  max mem: 54684
+[03:06:46.537094] Epoch: [0]  [2210/3229]  lr: 0.000047  grad_norm: 0.8028 (0.8967)  closs: 1.0597 (1.1120)  time: 4.0929  data: 0.0002  max mem: 54684
+[03:07:27.457503] Epoch: [0]  [2220/3229]  lr: 0.000047  grad_norm: 0.8028 (0.8962)  closs: 1.0845 (1.1117)  time: 4.0895  data: 0.0002  max mem: 54684
+[03:08:08.278999] Epoch: [0]  [2230/3229]  lr: 0.000047  grad_norm: 0.7934 (0.8957)  closs: 1.0507 (1.1114)  time: 4.0870  data: 0.0002  max mem: 54684
+[03:08:48.891856] Epoch: [0]  [2240/3229]  lr: 0.000047  grad_norm: 0.7621 (0.8951)  closs: 1.0717 (1.1111)  time: 4.0717  data: 0.0002  max mem: 54684
+[03:09:29.597430] Epoch: [0]  [2250/3229]  lr: 0.000047  grad_norm: 0.7756 (0.8946)  closs: 1.0717 (1.1110)  time: 4.0659  data: 0.0002  max mem: 54684
+[03:10:10.515536] Epoch: [0]  [2260/3229]  lr: 0.000047  grad_norm: 0.7727 (0.8940)  closs: 1.0694 (1.1108)  time: 4.0811  data: 0.0002  max mem: 54684
+[03:10:51.637059] Epoch: [0]  [2270/3229]  lr: 0.000047  grad_norm: 0.7727 (0.8936)  closs: 1.0694 (1.1106)  time: 4.1019  data: 0.0002  max mem: 54684
+[03:11:32.595121] Epoch: [0]  [2280/3229]  lr: 0.000046  grad_norm: 0.7681 (0.8929)  closs: 1.0451 (1.1103)  time: 4.1039  data: 0.0002  max mem: 54684
+[03:12:13.624396] Epoch: [0]  [2290/3229]  lr: 0.000046  grad_norm: 0.7482 (0.8924)  closs: 1.0431 (1.1103)  time: 4.0993  data: 0.0002  max mem: 54684
+[03:12:54.747566] Epoch: [0]  [2300/3229]  lr: 0.000046  grad_norm: 0.7703 (0.8920)  closs: 1.1258 (1.1103)  time: 4.1076  data: 0.0002  max mem: 54684
+[03:13:34.899825] Epoch: [0]  [2310/3229]  lr: 0.000046  grad_norm: 0.7731 (0.8914)  closs: 1.0817 (1.1101)  time: 4.0637  data: 0.0002  max mem: 54684
+[03:14:15.886367] Epoch: [0]  [2320/3229]  lr: 0.000046  grad_norm: 0.8009 (0.8911)  closs: 1.0688 (1.1099)  time: 4.0569  data: 0.0002  max mem: 54684
+[03:14:56.926589] Epoch: [0]  [2330/3229]  lr: 0.000046  grad_norm: 0.7560 (0.8905)  closs: 1.0458 (1.1094)  time: 4.1013  data: 0.0002  max mem: 54684
+[03:15:37.678425] Epoch: [0]  [2340/3229]  lr: 0.000046  grad_norm: 0.7591 (0.8902)  closs: 1.0475 (1.1094)  time: 4.0895  data: 0.0002  max mem: 54684
+[03:16:18.128561] Epoch: [0]  [2350/3229]  lr: 0.000046  grad_norm: 0.8028 (0.8896)  closs: 1.0808 (1.1091)  time: 4.0600  data: 0.0002  max mem: 54684
+[03:16:59.115639] Epoch: [0]  [2360/3229]  lr: 0.000046  grad_norm: 0.7890 (0.8892)  closs: 1.0613 (1.1090)  time: 4.0718  data: 0.0002  max mem: 54684
+[03:17:39.826484] Epoch: [0]  [2370/3229]  lr: 0.000046  grad_norm: 0.7886 (0.8888)  closs: 1.0808 (1.1091)  time: 4.0848  data: 0.0002  max mem: 54684
+[03:18:20.921523] Epoch: [0]  [2380/3229]  lr: 0.000046  grad_norm: 0.7769 (0.8883)  closs: 1.1109 (1.1091)  time: 4.0902  data: 0.0002  max mem: 54684
+[03:19:01.354222] Epoch: [0]  [2390/3229]  lr: 0.000046  grad_norm: 0.7578 (0.8877)  closs: 1.0823 (1.1089)  time: 4.0763  data: 0.0002  max mem: 54684
+[03:19:42.341834] Epoch: [0]  [2400/3229]  lr: 0.000046  grad_norm: 0.7938 (0.8873)  closs: 1.0722 (1.1086)  time: 4.0710  data: 0.0002  max mem: 54684
+[03:20:23.981102] Epoch: [0]  [2410/3229]  lr: 0.000046  grad_norm: 0.8303 (0.8871)  closs: 1.0663 (1.1085)  time: 4.1313  data: 0.0002  max mem: 54684
+[03:21:04.723017] Epoch: [0]  [2420/3229]  lr: 0.000046  grad_norm: 0.8142 (0.8867)  closs: 1.0663 (1.1083)  time: 4.1190  data: 0.0002  max mem: 54684
+[03:21:45.161994] Epoch: [0]  [2430/3229]  lr: 0.000046  grad_norm: 0.7785 (0.8863)  closs: 1.0653 (1.1080)  time: 4.0590  data: 0.0002  max mem: 54684
+[03:22:26.409460] Epoch: [0]  [2440/3229]  lr: 0.000046  grad_norm: 0.7811 (0.8859)  closs: 1.0728 (1.1080)  time: 4.0843  data: 0.0002  max mem: 54684
+[03:23:07.737120] Epoch: [0]  [2450/3229]  lr: 0.000046  grad_norm: 0.7773 (0.8855)  closs: 1.1207 (1.1079)  time: 4.1287  data: 0.0002  max mem: 54684
+[03:23:48.818206] Epoch: [0]  [2460/3229]  lr: 0.000046  grad_norm: 0.7819 (0.8853)  closs: 1.0838 (1.1078)  time: 4.1204  data: 0.0002  max mem: 54684
+[03:24:29.929209] Epoch: [0]  [2470/3229]  lr: 0.000046  grad_norm: 0.8072 (0.8851)  closs: 1.0518 (1.1075)  time: 4.1095  data: 0.0002  max mem: 54684
+[03:25:10.886008] Epoch: [0]  [2480/3229]  lr: 0.000046  grad_norm: 0.7804 (0.8846)  closs: 1.0573 (1.1073)  time: 4.1033  data: 0.0002  max mem: 54684
+[03:25:52.557958] Epoch: [0]  [2490/3229]  lr: 0.000046  grad_norm: 0.7737 (0.8845)  closs: 1.0763 (1.1072)  time: 4.1314  data: 0.0002  max mem: 54684
+[03:26:33.671349] Epoch: [0]  [2500/3229]  lr: 0.000045  grad_norm: 0.7789 (0.8840)  closs: 1.1176 (1.1073)  time: 4.1392  data: 0.0002  max mem: 54684
+[03:27:14.135334] Epoch: [0]  [2510/3229]  lr: 0.000045  grad_norm: 0.7701 (0.8835)  closs: 1.0906 (1.1070)  time: 4.0788  data: 0.0002  max mem: 54684
+[03:27:55.432584] Epoch: [0]  [2520/3229]  lr: 0.000045  grad_norm: 0.7708 (0.8832)  closs: 1.0723 (1.1069)  time: 4.0880  data: 0.0002  max mem: 54684
+[03:28:36.466155] Epoch: [0]  [2530/3229]  lr: 0.000045  grad_norm: 0.7946 (0.8827)  closs: 1.0810 (1.1069)  time: 4.1165  data: 0.0002  max mem: 54684
+[03:29:17.239455] Epoch: [0]  [2540/3229]  lr: 0.000045  grad_norm: 0.7615 (0.8823)  closs: 1.0746 (1.1067)  time: 4.0903  data: 0.0002  max mem: 54684
+[03:29:58.019807] Epoch: [0]  [2550/3229]  lr: 0.000045  grad_norm: 0.7483 (0.8819)  closs: 1.0516 (1.1065)  time: 4.0776  data: 0.0002  max mem: 54684
+[03:30:39.329898] Epoch: [0]  [2560/3229]  lr: 0.000045  grad_norm: 0.7708 (0.8816)  closs: 1.0516 (1.1063)  time: 4.1045  data: 0.0002  max mem: 54684
+[03:31:20.317679] Epoch: [0]  [2570/3229]  lr: 0.000045  grad_norm: 0.7788 (0.8812)  closs: 1.0652 (1.1062)  time: 4.1148  data: 0.0002  max mem: 54684
+[03:32:01.406782] Epoch: [0]  [2580/3229]  lr: 0.000045  grad_norm: 0.7574 (0.8808)  closs: 1.0688 (1.1062)  time: 4.1038  data: 0.0002  max mem: 54684
+[03:32:41.538400] Epoch: [0]  [2590/3229]  lr: 0.000045  grad_norm: 0.7515 (0.8802)  closs: 1.0843 (1.1059)  time: 4.0610  data: 0.0002  max mem: 54684
+[03:33:22.786415] Epoch: [0]  [2600/3229]  lr: 0.000045  grad_norm: 0.8014 (0.8799)  closs: 1.0804 (1.1058)  time: 4.0689  data: 0.0002  max mem: 54684
+[03:34:03.508058] Epoch: [0]  [2610/3229]  lr: 0.000045  grad_norm: 0.8014 (0.8794)  closs: 1.0795 (1.1055)  time: 4.0984  data: 0.0002  max mem: 54684
+[03:34:43.958992] Epoch: [0]  [2620/3229]  lr: 0.000045  grad_norm: 0.7646 (0.8789)  closs: 1.0806 (1.1054)  time: 4.0586  data: 0.0002  max mem: 54684
+[03:35:25.187896] Epoch: [0]  [2630/3229]  lr: 0.000045  grad_norm: 0.7696 (0.8785)  closs: 1.1043 (1.1055)  time: 4.0839  data: 0.0002  max mem: 54684
+[03:36:06.087296] Epoch: [0]  [2640/3229]  lr: 0.000045  grad_norm: 0.8024 (0.8782)  closs: 1.1064 (1.1054)  time: 4.1063  data: 0.0002  max mem: 54684
+[03:36:46.644106] Epoch: [0]  [2650/3229]  lr: 0.000045  grad_norm: 0.7676 (0.8777)  closs: 1.0427 (1.1051)  time: 4.0727  data: 0.0002  max mem: 54684
+[03:37:27.758262] Epoch: [0]  [2660/3229]  lr: 0.000045  grad_norm: 0.7810 (0.8776)  closs: 1.0405 (1.1049)  time: 4.0835  data: 0.0002  max mem: 54684
+[03:38:08.680275] Epoch: [0]  [2670/3229]  lr: 0.000045  grad_norm: 0.8024 (0.8774)  closs: 1.0546 (1.1047)  time: 4.1017  data: 0.0002  max mem: 54684
+[03:38:49.498507] Epoch: [0]  [2680/3229]  lr: 0.000045  grad_norm: 0.7959 (0.8770)  closs: 1.0579 (1.1046)  time: 4.0869  data: 0.0002  max mem: 54684
+[03:39:30.230437] Epoch: [0]  [2690/3229]  lr: 0.000045  grad_norm: 0.7603 (0.8766)  closs: 1.0688 (1.1044)  time: 4.0774  data: 0.0002  max mem: 54684
+[03:40:10.699217] Epoch: [0]  [2700/3229]  lr: 0.000045  grad_norm: 0.7214 (0.8762)  closs: 1.0769 (1.1043)  time: 4.0600  data: 0.0002  max mem: 54684
+[03:40:51.627601] Epoch: [0]  [2710/3229]  lr: 0.000044  grad_norm: 0.7282 (0.8757)  closs: 1.0952 (1.1042)  time: 4.0698  data: 0.0002  max mem: 54684
+[03:41:32.962290] Epoch: [0]  [2720/3229]  lr: 0.000044  grad_norm: 0.7610 (0.8755)  closs: 1.0952 (1.1042)  time: 4.1131  data: 0.0002  max mem: 54684
+[03:42:13.881942] Epoch: [0]  [2730/3229]  lr: 0.000044  grad_norm: 0.7852 (0.8752)  closs: 1.1198 (1.1041)  time: 4.1127  data: 0.0002  max mem: 54684
+[03:42:54.354106] Epoch: [0]  [2740/3229]  lr: 0.000044  grad_norm: 0.7901 (0.8749)  closs: 1.0710 (1.1039)  time: 4.0695  data: 0.0002  max mem: 54684
+[03:43:35.270721] Epoch: [0]  [2750/3229]  lr: 0.000044  grad_norm: 0.7921 (0.8746)  closs: 1.0635 (1.1037)  time: 4.0694  data: 0.0002  max mem: 54684
+[03:44:16.560153] Epoch: [0]  [2760/3229]  lr: 0.000044  grad_norm: 0.7653 (0.8742)  closs: 1.0744 (1.1036)  time: 4.1102  data: 0.0002  max mem: 54684
+[03:44:57.121862] Epoch: [0]  [2770/3229]  lr: 0.000044  grad_norm: 0.7653 (0.8738)  closs: 1.0760 (1.1034)  time: 4.0925  data: 0.0002  max mem: 54684
+[03:45:37.257683] Epoch: [0]  [2780/3229]  lr: 0.000044  grad_norm: 0.7473 (0.8732)  closs: 1.0355 (1.1030)  time: 4.0348  data: 0.0002  max mem: 54684
+[03:46:17.901019] Epoch: [0]  [2790/3229]  lr: 0.000044  grad_norm: 0.7473 (0.8729)  closs: 1.0248 (1.1028)  time: 4.0389  data: 0.0002  max mem: 54684
+[03:46:58.454326] Epoch: [0]  [2800/3229]  lr: 0.000044  grad_norm: 0.7700 (0.8725)  closs: 1.0784 (1.1027)  time: 4.0598  data: 0.0002  max mem: 54684
+[03:47:39.337936] Epoch: [0]  [2810/3229]  lr: 0.000044  grad_norm: 0.7546 (0.8720)  closs: 1.0590 (1.1026)  time: 4.0718  data: 0.0002  max mem: 54684
+[03:48:20.119515] Epoch: [0]  [2820/3229]  lr: 0.000044  grad_norm: 0.7763 (0.8717)  closs: 1.0590 (1.1024)  time: 4.0832  data: 0.0002  max mem: 54684
+[03:49:01.136368] Epoch: [0]  [2830/3229]  lr: 0.000044  grad_norm: 0.7699 (0.8713)  closs: 1.0643 (1.1023)  time: 4.0899  data: 0.0002  max mem: 54684
+[03:49:41.768883] Epoch: [0]  [2840/3229]  lr: 0.000044  grad_norm: 0.7428 (0.8708)  closs: 1.0609 (1.1022)  time: 4.0824  data: 0.0002  max mem: 54684
+[03:50:22.978721] Epoch: [0]  [2850/3229]  lr: 0.000044  grad_norm: 0.7529 (0.8704)  closs: 1.0452 (1.1021)  time: 4.0921  data: 0.0002  max mem: 54684
+[03:51:03.103441] Epoch: [0]  [2860/3229]  lr: 0.000044  grad_norm: 0.7534 (0.8699)  closs: 1.0985 (1.1020)  time: 4.0667  data: 0.0002  max mem: 54684
+[03:51:44.422934] Epoch: [0]  [2870/3229]  lr: 0.000044  grad_norm: 0.7563 (0.8695)  closs: 1.0943 (1.1019)  time: 4.0721  data: 0.0002  max mem: 54684
+[03:52:25.402059] Epoch: [0]  [2880/3229]  lr: 0.000044  grad_norm: 0.7777 (0.8691)  closs: 1.1125 (1.1020)  time: 4.1149  data: 0.0002  max mem: 54684
+[03:53:06.008254] Epoch: [0]  [2890/3229]  lr: 0.000043  grad_norm: 0.7777 (0.8688)  closs: 1.1062 (1.1018)  time: 4.0792  data: 0.0002  max mem: 54684
+[03:53:47.142690] Epoch: [0]  [2900/3229]  lr: 0.000043  grad_norm: 0.7781 (0.8685)  closs: 1.0867 (1.1017)  time: 4.0870  data: 0.0002  max mem: 54684
+[03:54:28.114121] Epoch: [0]  [2910/3229]  lr: 0.000043  grad_norm: 0.7930 (0.8682)  closs: 1.0739 (1.1015)  time: 4.1052  data: 0.0002  max mem: 54684
+[03:55:09.721076] Epoch: [0]  [2920/3229]  lr: 0.000043  grad_norm: 0.7677 (0.8679)  closs: 1.0869 (1.1015)  time: 4.1289  data: 0.0002  max mem: 54684
+[03:55:50.967003] Epoch: [0]  [2930/3229]  lr: 0.000043  grad_norm: 0.8109 (0.8678)  closs: 1.0987 (1.1014)  time: 4.1426  data: 0.0002  max mem: 54684
+[03:56:31.795882] Epoch: [0]  [2940/3229]  lr: 0.000043  grad_norm: 0.8109 (0.8675)  closs: 1.0884 (1.1015)  time: 4.1037  data: 0.0002  max mem: 54684
+[03:57:13.073814] Epoch: [0]  [2950/3229]  lr: 0.000043  grad_norm: 0.7720 (0.8672)  closs: 1.0852 (1.1014)  time: 4.1053  data: 0.0002  max mem: 54684
+[03:57:54.032832] Epoch: [0]  [2960/3229]  lr: 0.000043  grad_norm: 0.7005 (0.8666)  closs: 1.0610 (1.1012)  time: 4.1118  data: 0.0002  max mem: 54684
+[03:58:34.931320] Epoch: [0]  [2970/3229]  lr: 0.000043  grad_norm: 0.7079 (0.8663)  closs: 1.0292 (1.1009)  time: 4.0928  data: 0.0002  max mem: 54684
+[03:59:16.050440] Epoch: [0]  [2980/3229]  lr: 0.000043  grad_norm: 0.7676 (0.8661)  closs: 1.0698 (1.1009)  time: 4.1008  data: 0.0002  max mem: 54684
+[03:59:57.343357] Epoch: [0]  [2990/3229]  lr: 0.000043  grad_norm: 0.7941 (0.8659)  closs:
1.0382 (1.1006) time: 4.1205 data: 0.0002 max mem: 54684 +[04:00:38.675134] Epoch: [0] [3000/3229] lr: 0.000043 grad_norm: 0.8105 (0.8657) closs: 1.0089 (1.1003) time: 4.1312 data: 0.0002 max mem: 54684 +[04:01:19.570077] Epoch: [0] [3010/3229] lr: 0.000043 grad_norm: 0.7900 (0.8653) closs: 1.0515 (1.1003) time: 4.1113 data: 0.0002 max mem: 54684 +[04:02:00.671184] Epoch: [0] [3020/3229] lr: 0.000043 grad_norm: 0.7517 (0.8650) closs: 1.0734 (1.1003) time: 4.0997 data: 0.0002 max mem: 54684 +[04:02:40.993797] Epoch: [0] [3030/3229] lr: 0.000043 grad_norm: 0.7689 (0.8646) closs: 1.0596 (1.1000) time: 4.0711 data: 0.0002 max mem: 54684 +[04:03:22.299784] Epoch: [0] [3040/3229] lr: 0.000043 grad_norm: 0.7714 (0.8643) closs: 1.0596 (1.0998) time: 4.0814 data: 0.0002 max mem: 54684 +[04:04:03.204281] Epoch: [0] [3050/3229] lr: 0.000043 grad_norm: 0.7578 (0.8639) closs: 1.0455 (1.0997) time: 4.1105 data: 0.0002 max mem: 54684 +[04:04:44.303035] Epoch: [0] [3060/3229] lr: 0.000043 grad_norm: 0.7642 (0.8637) closs: 1.0497 (1.0996) time: 4.1001 data: 0.0002 max mem: 54684 +[04:05:25.256193] Epoch: [0] [3070/3229] lr: 0.000042 grad_norm: 0.7642 (0.8633) closs: 1.0778 (1.0996) time: 4.1025 data: 0.0002 max mem: 54684 +[04:06:05.981901] Epoch: [0] [3080/3229] lr: 0.000042 grad_norm: 0.7319 (0.8628) closs: 1.0596 (1.0994) time: 4.0839 data: 0.0002 max mem: 54684 +[04:06:46.434233] Epoch: [0] [3090/3229] lr: 0.000042 grad_norm: 0.7442 (0.8624) closs: 1.0718 (1.0993) time: 4.0588 data: 0.0002 max mem: 54684 +[04:07:27.009193] Epoch: [0] [3100/3229] lr: 0.000042 grad_norm: 0.7568 (0.8619) closs: 1.1194 (1.0992) time: 4.0513 data: 0.0002 max mem: 54684 +[04:08:07.508250] Epoch: [0] [3110/3229] lr: 0.000042 grad_norm: 0.7233 (0.8615) closs: 1.0551 (1.0991) time: 4.0536 data: 0.0002 max mem: 54684 +[04:08:48.595599] Epoch: [0] [3120/3229] lr: 0.000042 grad_norm: 0.7674 (0.8612) closs: 1.0551 (1.0989) time: 4.0793 data: 0.0002 max mem: 54684 +[04:09:28.728428] Epoch: [0] [3130/3229] lr: 0.000042 grad_norm: 0.7612 (0.8607) closs: 1.0629 (1.0987) time: 4.0609 data: 0.0002 max mem: 54684 +[04:10:08.523858] Epoch: [0] [3140/3229] lr: 0.000042 grad_norm: 0.6953 (0.8602) closs: 1.0381 (1.0985) time: 3.9964 data: 0.0002 max mem: 54684 +[04:10:49.845722] Epoch: [0] [3150/3229] lr: 0.000042 grad_norm: 0.7319 (0.8599) closs: 1.0381 (1.0984) time: 4.0558 data: 0.0002 max mem: 54684 +[04:11:30.298486] Epoch: [0] [3160/3229] lr: 0.000042 grad_norm: 0.7591 (0.8595) closs: 1.0370 (1.0981) time: 4.0887 data: 0.0002 max mem: 54684 +[04:12:11.068402] Epoch: [0] [3170/3229] lr: 0.000042 grad_norm: 0.7661 (0.8592) closs: 1.0615 (1.0980) time: 4.0611 data: 0.0002 max mem: 54684 +[04:12:51.939751] Epoch: [0] [3180/3229] lr: 0.000042 grad_norm: 0.8030 (0.8590) closs: 1.0806 (1.0979) time: 4.0820 data: 0.0002 max mem: 54684 +[04:13:32.816037] Epoch: [0] [3190/3229] lr: 0.000042 grad_norm: 0.7835 (0.8587) closs: 1.0738 (1.0978) time: 4.0873 data: 0.0002 max mem: 54684 +[04:14:14.251177] Epoch: [0] [3200/3229] lr: 0.000042 grad_norm: 0.7516 (0.8583) closs: 1.0760 (1.0977) time: 4.1155 data: 0.0003 max mem: 54684 +[04:14:55.041953] Epoch: [0] [3210/3229] lr: 0.000042 grad_norm: 0.7438 (0.8580) closs: 1.0863 (1.0976) time: 4.1112 data: 0.0003 max mem: 54684 +[04:15:36.390692] Epoch: [0] [3220/3229] lr: 0.000042 grad_norm: 0.7658 (0.8579) closs: 1.0767 (1.0975) time: 4.1069 data: 0.0001 max mem: 54684 +[04:16:09.561812] Epoch: [0] Total time: 3:40:03 +[04:16:09.562785] Averaged stats: lr: 0.000042 grad_norm: 0.7787 (0.8577) closs: 1.0483 
(1.0961) +/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[04:16:09.927151] model saved +/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2.
+ warnings.warn( +[04:16:11.631099] optimizer saved +[04:16:11.631728] other rank-common saved +[04:16:11.636736] rank-specific saved +[04:16:11.651089] log_dir: ./output_dir +[04:16:23.993611] Epoch: [1] [0/3229] lr: 0.000042 grad_norm: 0.8670 (0.8670) closs: 1.0075 (1.0075) time: 12.3415 data: 7.9810 max mem: 54684 +[04:17:06.440076] Epoch: [1] [10/3229] lr: 0.000041 grad_norm: 0.7389 (0.8121) closs: 1.0826 (1.0946) time: 4.9807 data: 0.7257 max mem: 54684 +[04:17:46.869458] Epoch: [1] [20/3229] lr: 0.000041 grad_norm: 0.7389 (0.7906) closs: 1.0572 (1.0652) time: 4.1437 data: 0.0002 max mem: 54684 +[04:18:28.213204] Epoch: [1] [30/3229] lr: 0.000041 grad_norm: 0.7856 (0.7980) closs: 1.0665 (1.0783) time: 4.0886 data: 0.0002 max mem: 54684 +[04:19:09.261576] Epoch: [1] [40/3229] lr: 0.000041 grad_norm: 0.7665 (0.7884) closs: 1.0941 (1.0814) time: 4.1195 data: 0.0002 max mem: 54684 +[04:19:50.348250] Epoch: [1] [50/3229] lr: 0.000041 grad_norm: 0.7463 (0.7844) closs: 1.0827 (1.0739) time: 4.1067 data: 0.0002 max mem: 54684 +[04:20:30.454474] Epoch: [1] [60/3229] lr: 0.000041 grad_norm: 0.7801 (0.7795) closs: 1.0234 (1.0633) time: 4.0596 data: 0.0002 max mem: 54684 +[04:21:11.357215] Epoch: [1] [70/3229] lr: 0.000041 grad_norm: 0.8025 (0.7812) closs: 1.0314 (1.0602) time: 4.0504 data: 0.0002 max mem: 54684 +[04:21:51.356188] Epoch: [1] [80/3229] lr: 0.000041 grad_norm: 0.7522 (0.7691) closs: 1.0226 (1.0522) time: 4.0450 data: 0.0002 max mem: 54684 +[04:22:33.043953] Epoch: [1] [90/3229] lr: 0.000041 grad_norm: 0.7018 (0.7681) closs: 1.0252 (1.0538) time: 4.0843 data: 0.0002 max mem: 54684 +[04:23:13.831663] Epoch: [1] [100/3229] lr: 0.000041 grad_norm: 0.7546 (0.7674) closs: 1.0535 (1.0529) time: 4.1237 data: 0.0002 max mem: 54684 +[04:23:55.181411] Epoch: [1] [110/3229] lr: 0.000041 grad_norm: 0.7521 (0.7679) closs: 1.0461 (1.0515) time: 4.1068 data: 0.0002 max mem: 54684 +[04:24:35.870155] Epoch: [1] [120/3229] lr: 0.000041 grad_norm: 0.7521 (0.7650) closs: 1.0400 (1.0525) time: 4.1019 data: 0.0002 max mem: 54684 +[04:25:17.325692] Epoch: [1] [130/3229] lr: 0.000041 grad_norm: 0.7635 (0.7660) closs: 1.0768 (1.0555) time: 4.1071 data: 0.0002 max mem: 54684 +[04:25:58.111171] Epoch: [1] [140/3229] lr: 0.000041 grad_norm: 0.7635 (0.7666) closs: 1.0801 (1.0569) time: 4.1120 data: 0.0002 max mem: 54684 +[04:26:38.709088] Epoch: [1] [150/3229] lr: 0.000041 grad_norm: 0.7391 (0.7641) closs: 1.0635 (1.0556) time: 4.0691 data: 0.0002 max mem: 54684 +[04:27:19.437075] Epoch: [1] [160/3229] lr: 0.000041 grad_norm: 0.7284 (0.7647) closs: 1.0512 (1.0563) time: 4.0662 data: 0.0002 max mem: 54684 +[04:28:00.075659] Epoch: [1] [170/3229] lr: 0.000040 grad_norm: 0.7635 (0.7653) closs: 1.0589 (1.0533) time: 4.0683 data: 0.0002 max mem: 54684 +[04:28:40.520775] Epoch: [1] [180/3229] lr: 0.000040 grad_norm: 0.7911 (0.7696) closs: 1.0460 (1.0532) time: 4.0541 data: 0.0002 max mem: 54684 +[04:29:21.220750] Epoch: [1] [190/3229] lr: 0.000040 grad_norm: 0.7859 (0.7694) closs: 1.0297 (1.0513) time: 4.0572 data: 0.0002 max mem: 54684 +[04:30:02.479262] Epoch: [1] [200/3229] lr: 0.000040 grad_norm: 0.7721 (0.7706) closs: 1.0362 (1.0522) time: 4.0979 data: 0.0002 max mem: 54684 +[04:30:43.921995] Epoch: [1] [210/3229] lr: 0.000040 grad_norm: 0.8007 (0.7734) closs: 1.0904 (1.0546) time: 4.1350 data: 0.0002 max mem: 54684 +[04:31:24.392183] Epoch: [1] [220/3229] lr: 0.000040 grad_norm: 0.8126 (0.7731) closs: 1.0870 (1.0548) time: 4.0956 data: 0.0002 max mem: 54684 +[04:32:04.710459] Epoch: [1] [230/3229] lr: 0.000040 
grad_norm: 0.7601 (0.7722) closs: 1.0787 (1.0528) time: 4.0394 data: 0.0002 max mem: 54684 +[04:32:45.662972] Epoch: [1] [240/3229] lr: 0.000040 grad_norm: 0.7548 (0.7715) closs: 1.0161 (1.0511) time: 4.0635 data: 0.0002 max mem: 54684 +[04:33:27.043779] Epoch: [1] [250/3229] lr: 0.000040 grad_norm: 0.7622 (0.7718) closs: 1.0228 (1.0510) time: 4.1166 data: 0.0002 max mem: 54684 +[04:34:08.159916] Epoch: [1] [260/3229] lr: 0.000040 grad_norm: 0.7596 (0.7716) closs: 1.0571 (1.0517) time: 4.1248 data: 0.0002 max mem: 54684 +[04:34:49.118869] Epoch: [1] [270/3229] lr: 0.000040 grad_norm: 0.7411 (0.7711) closs: 1.0854 (1.0523) time: 4.1037 data: 0.0002 max mem: 54684 +[04:35:29.743500] Epoch: [1] [280/3229] lr: 0.000040 grad_norm: 0.7411 (0.7708) closs: 1.0320 (1.0520) time: 4.0791 data: 0.0002 max mem: 54684 +[04:36:10.896285] Epoch: [1] [290/3229] lr: 0.000040 grad_norm: 0.7698 (0.7711) closs: 1.0320 (1.0519) time: 4.0888 data: 0.0002 max mem: 54684 +[04:36:50.693561] Epoch: [1] [300/3229] lr: 0.000040 grad_norm: 0.7667 (0.7693) closs: 1.0176 (1.0507) time: 4.0474 data: 0.0002 max mem: 54684 +[04:37:31.371110] Epoch: [1] [310/3229] lr: 0.000040 grad_norm: 0.7233 (0.7689) closs: 1.0032 (1.0496) time: 4.0237 data: 0.0002 max mem: 54684 +[04:38:12.621107] Epoch: [1] [320/3229] lr: 0.000039 grad_norm: 0.7637 (0.7696) closs: 1.0102 (1.0494) time: 4.0963 data: 0.0002 max mem: 54684 +[04:38:53.280696] Epoch: [1] [330/3229] lr: 0.000039 grad_norm: 0.7538 (0.7692) closs: 1.0404 (1.0488) time: 4.0954 data: 0.0002 max mem: 54684 +[04:39:34.384153] Epoch: [1] [340/3229] lr: 0.000039 grad_norm: 0.7223 (0.7678) closs: 1.0712 (1.0507) time: 4.0881 data: 0.0002 max mem: 54684 +[04:40:14.745485] Epoch: [1] [350/3229] lr: 0.000039 grad_norm: 0.7186 (0.7656) closs: 1.0880 (1.0494) time: 4.0732 data: 0.0002 max mem: 54684 +[04:40:55.725764] Epoch: [1] [360/3229] lr: 0.000039 grad_norm: 0.7281 (0.7657) closs: 1.0522 (1.0491) time: 4.0670 data: 0.0002 max mem: 54684 +[04:41:37.083851] Epoch: [1] [370/3229] lr: 0.000039 grad_norm: 0.7441 (0.7654) closs: 1.0619 (1.0497) time: 4.1169 data: 0.0002 max mem: 54684 +[04:42:18.173848] Epoch: [1] [380/3229] lr: 0.000039 grad_norm: 0.7505 (0.7652) closs: 1.0880 (1.0510) time: 4.1223 data: 0.0002 max mem: 54684 +[04:42:58.469739] Epoch: [1] [390/3229] lr: 0.000039 grad_norm: 0.7717 (0.7642) closs: 1.0462 (1.0496) time: 4.0692 data: 0.0002 max mem: 54684 +[04:43:39.241200] Epoch: [1] [400/3229] lr: 0.000039 grad_norm: 0.7749 (0.7649) closs: 0.9971 (1.0484) time: 4.0533 data: 0.0002 max mem: 54684 +[04:44:20.120315] Epoch: [1] [410/3229] lr: 0.000039 grad_norm: 0.8080 (0.7657) closs: 1.0498 (1.0489) time: 4.0825 data: 0.0002 max mem: 54684 +[04:45:00.557566] Epoch: [1] [420/3229] lr: 0.000039 grad_norm: 0.7665 (0.7651) closs: 1.0759 (1.0488) time: 4.0658 data: 0.0002 max mem: 54684 +[04:45:41.828412] Epoch: [1] [430/3229] lr: 0.000039 grad_norm: 0.7265 (0.7650) closs: 1.0759 (1.0498) time: 4.0853 data: 0.0002 max mem: 54684 +[04:46:22.509399] Epoch: [1] [440/3229] lr: 0.000039 grad_norm: 0.7462 (0.7646) closs: 1.0742 (1.0497) time: 4.0975 data: 0.0002 max mem: 54684 +[04:47:03.187153] Epoch: [1] [450/3229] lr: 0.000039 grad_norm: 0.7618 (0.7645) closs: 1.0617 (1.0497) time: 4.0679 data: 0.0002 max mem: 54684 +[04:47:44.104151] Epoch: [1] [460/3229] lr: 0.000039 grad_norm: 0.7541 (0.7639) closs: 1.0582 (1.0501) time: 4.0797 data: 0.0002 max mem: 54684 +[04:48:24.641461] Epoch: [1] [470/3229] lr: 0.000038 grad_norm: 0.7528 (0.7635) closs: 1.0582 (1.0494) time: 4.0727 data: 
0.0002 max mem: 54684 +[04:49:05.751178] Epoch: [1] [480/3229] lr: 0.000038 grad_norm: 0.7481 (0.7628) closs: 1.0947 (1.0506) time: 4.0823 data: 0.0002 max mem: 54684 +[04:49:46.537424] Epoch: [1] [490/3229] lr: 0.000038 grad_norm: 0.7552 (0.7629) closs: 1.0837 (1.0503) time: 4.0947 data: 0.0002 max mem: 54684 +[04:50:27.314255] Epoch: [1] [500/3229] lr: 0.000038 grad_norm: 0.7712 (0.7634) closs: 1.0294 (1.0500) time: 4.0781 data: 0.0002 max mem: 54684 +[04:51:08.310823] Epoch: [1] [510/3229] lr: 0.000038 grad_norm: 0.8222 (0.7643) closs: 1.0444 (1.0500) time: 4.0886 data: 0.0002 max mem: 54684 +[04:51:49.708115] Epoch: [1] [520/3229] lr: 0.000038 grad_norm: 0.7931 (0.7644) closs: 1.0786 (1.0511) time: 4.1196 data: 0.0002 max mem: 54684 +[04:52:30.929265] Epoch: [1] [530/3229] lr: 0.000038 grad_norm: 0.7668 (0.7648) closs: 1.0823 (1.0516) time: 4.1309 data: 0.0002 max mem: 54684 +[04:53:11.799214] Epoch: [1] [540/3229] lr: 0.000038 grad_norm: 0.7484 (0.7644) closs: 1.0851 (1.0518) time: 4.1045 data: 0.0002 max mem: 54684 +[04:53:52.617648] Epoch: [1] [550/3229] lr: 0.000038 grad_norm: 0.7576 (0.7642) closs: 1.0762 (1.0517) time: 4.0844 data: 0.0002 max mem: 54684 +[04:54:33.511754] Epoch: [1] [560/3229] lr: 0.000038 grad_norm: 0.7606 (0.7637) closs: 1.0481 (1.0516) time: 4.0856 data: 0.0002 max mem: 54684 +[04:55:14.276726] Epoch: [1] [570/3229] lr: 0.000038 grad_norm: 0.7616 (0.7649) closs: 1.0444 (1.0516) time: 4.0829 data: 0.0002 max mem: 54684 +[04:55:54.841137] Epoch: [1] [580/3229] lr: 0.000038 grad_norm: 0.7616 (0.7646) closs: 1.0423 (1.0511) time: 4.0664 data: 0.0002 max mem: 54684 +[04:56:35.646680] Epoch: [1] [590/3229] lr: 0.000038 grad_norm: 0.7542 (0.7649) closs: 1.0440 (1.0512) time: 4.0684 data: 0.0002 max mem: 54684 +[04:57:16.334209] Epoch: [1] [600/3229] lr: 0.000038 grad_norm: 0.7437 (0.7638) closs: 1.0745 (1.0512) time: 4.0746 data: 0.0002 max mem: 54684 +[04:57:57.448181] Epoch: [1] [610/3229] lr: 0.000038 grad_norm: 0.7439 (0.7642) closs: 1.0596 (1.0512) time: 4.0900 data: 0.0002 max mem: 54684 +[04:58:38.714028] Epoch: [1] [620/3229] lr: 0.000037 grad_norm: 0.7602 (0.7641) closs: 1.0594 (1.0513) time: 4.1189 data: 0.0002 max mem: 54684 +[04:59:19.266620] Epoch: [1] [630/3229] lr: 0.000037 grad_norm: 0.7325 (0.7639) closs: 1.0401 (1.0502) time: 4.0909 data: 0.0002 max mem: 54684 +[05:00:00.368157] Epoch: [1] [640/3229] lr: 0.000037 grad_norm: 0.7513 (0.7638) closs: 1.0248 (1.0503) time: 4.0826 data: 0.0002 max mem: 54684 +[05:00:41.160239] Epoch: [1] [650/3229] lr: 0.000037 grad_norm: 0.7513 (0.7635) closs: 1.0358 (1.0502) time: 4.0946 data: 0.0002 max mem: 54684 +[05:01:22.138831] Epoch: [1] [660/3229] lr: 0.000037 grad_norm: 0.7456 (0.7637) closs: 1.0590 (1.0504) time: 4.0885 data: 0.0002 max mem: 54684 +[05:02:02.946232] Epoch: [1] [670/3229] lr: 0.000037 grad_norm: 0.7457 (0.7634) closs: 1.0762 (1.0507) time: 4.0892 data: 0.0002 max mem: 54684 +[05:02:44.502278] Epoch: [1] [680/3229] lr: 0.000037 grad_norm: 0.7440 (0.7634) closs: 1.0678 (1.0509) time: 4.1181 data: 0.0002 max mem: 54684 +[05:03:24.951117] Epoch: [1] [690/3229] lr: 0.000037 grad_norm: 0.7456 (0.7630) closs: 1.0585 (1.0513) time: 4.1002 data: 0.0002 max mem: 54684 +[05:04:05.578068] Epoch: [1] [700/3229] lr: 0.000037 grad_norm: 0.7712 (0.7629) closs: 1.0648 (1.0509) time: 4.0537 data: 0.0002 max mem: 54684 +[05:04:46.382125] Epoch: [1] [710/3229] lr: 0.000037 grad_norm: 0.7536 (0.7623) closs: 1.0831 (1.0508) time: 4.0715 data: 0.0002 max mem: 54684 +[05:05:27.836191] Epoch: [1] [720/3229] lr: 
0.000037 grad_norm: 0.7533 (0.7623) closs: 1.0761 (1.0516) time: 4.1128 data: 0.0002 max mem: 54684 +[05:06:08.635002] Epoch: [1] [730/3229] lr: 0.000037 grad_norm: 0.7805 (0.7629) closs: 1.0763 (1.0519) time: 4.1126 data: 0.0002 max mem: 54684 +[05:06:48.634490] Epoch: [1] [740/3229] lr: 0.000037 grad_norm: 0.7186 (0.7620) closs: 1.0197 (1.0513) time: 4.0398 data: 0.0002 max mem: 54684 +[05:07:29.445391] Epoch: [1] [750/3229] lr: 0.000037 grad_norm: 0.7025 (0.7616) closs: 1.0359 (1.0516) time: 4.0405 data: 0.0002 max mem: 54684 +[05:08:11.035741] Epoch: [1] [760/3229] lr: 0.000036 grad_norm: 0.7647 (0.7623) closs: 1.0791 (1.0519) time: 4.1200 data: 0.0002 max mem: 54684 +[05:08:51.808203] Epoch: [1] [770/3229] lr: 0.000036 grad_norm: 0.8157 (0.7630) closs: 1.0874 (1.0524) time: 4.1181 data: 0.0002 max mem: 54684 +[05:09:32.821381] Epoch: [1] [780/3229] lr: 0.000036 grad_norm: 0.7640 (0.7628) closs: 1.0954 (1.0530) time: 4.0892 data: 0.0002 max mem: 54684 +[05:10:13.634195] Epoch: [1] [790/3229] lr: 0.000036 grad_norm: 0.7289 (0.7624) closs: 1.0748 (1.0527) time: 4.0912 data: 0.0002 max mem: 54684 +[05:10:55.053924] Epoch: [1] [800/3229] lr: 0.000036 grad_norm: 0.7343 (0.7624) closs: 1.0321 (1.0528) time: 4.1116 data: 0.0002 max mem: 54684 +[05:11:35.196793] Epoch: [1] [810/3229] lr: 0.000036 grad_norm: 0.7367 (0.7620) closs: 1.0322 (1.0524) time: 4.0781 data: 0.0002 max mem: 54684 +[05:12:16.522962] Epoch: [1] [820/3229] lr: 0.000036 grad_norm: 0.7401 (0.7621) closs: 1.0246 (1.0522) time: 4.0734 data: 0.0002 max mem: 54684 +[05:12:57.137015] Epoch: [1] [830/3229] lr: 0.000036 grad_norm: 0.7595 (0.7617) closs: 1.0450 (1.0524) time: 4.0969 data: 0.0002 max mem: 54684 +[05:13:38.237630] Epoch: [1] [840/3229] lr: 0.000036 grad_norm: 0.7652 (0.7616) closs: 1.0833 (1.0530) time: 4.0857 data: 0.0002 max mem: 54684 +[05:14:19.021316] Epoch: [1] [850/3229] lr: 0.000036 grad_norm: 0.7652 (0.7614) closs: 1.0680 (1.0532) time: 4.0941 data: 0.0002 max mem: 54684 +[05:14:59.987219] Epoch: [1] [860/3229] lr: 0.000036 grad_norm: 0.7362 (0.7610) closs: 1.0479 (1.0534) time: 4.0874 data: 0.0002 max mem: 54684 +[05:15:40.669534] Epoch: [1] [870/3229] lr: 0.000036 grad_norm: 0.7348 (0.7608) closs: 1.0472 (1.0530) time: 4.0823 data: 0.0002 max mem: 54684 +[05:16:21.345203] Epoch: [1] [880/3229] lr: 0.000036 grad_norm: 0.7769 (0.7607) closs: 1.0219 (1.0529) time: 4.0678 data: 0.0002 max mem: 54684 +[05:17:02.174965] Epoch: [1] [890/3229] lr: 0.000036 grad_norm: 0.7393 (0.7603) closs: 1.0763 (1.0534) time: 4.0752 data: 0.0002 max mem: 54684 +[05:17:43.153839] Epoch: [1] [900/3229] lr: 0.000035 grad_norm: 0.7229 (0.7600) closs: 1.0763 (1.0535) time: 4.0904 data: 0.0002 max mem: 54684 +[05:18:23.532375] Epoch: [1] [910/3229] lr: 0.000035 grad_norm: 0.7346 (0.7597) closs: 1.0722 (1.0536) time: 4.0678 data: 0.0002 max mem: 54684 +[05:19:04.844983] Epoch: [1] [920/3229] lr: 0.000035 grad_norm: 0.7679 (0.7600) closs: 1.0613 (1.0536) time: 4.0845 data: 0.0002 max mem: 54684 +[05:19:45.960496] Epoch: [1] [930/3229] lr: 0.000035 grad_norm: 0.7816 (0.7600) closs: 1.0613 (1.0537) time: 4.1213 data: 0.0002 max mem: 54684 +[05:20:26.646864] Epoch: [1] [940/3229] lr: 0.000035 grad_norm: 0.7816 (0.7601) closs: 1.0160 (1.0532) time: 4.0900 data: 0.0002 max mem: 54684 +[05:21:07.563052] Epoch: [1] [950/3229] lr: 0.000035 grad_norm: 0.7621 (0.7605) closs: 0.9814 (1.0528) time: 4.0801 data: 0.0002 max mem: 54684 +[05:21:48.451416] Epoch: [1] [960/3229] lr: 0.000035 grad_norm: 0.7508 (0.7605) closs: 0.9968 (1.0529) time: 4.0902 
data: 0.0002 max mem: 54684 +[05:22:29.552875] Epoch: [1] [970/3229] lr: 0.000035 grad_norm: 0.7519 (0.7605) closs: 1.0570 (1.0530) time: 4.0994 data: 0.0002 max mem: 54684 +[05:23:10.530952] Epoch: [1] [980/3229] lr: 0.000035 grad_norm: 0.7625 (0.7606) closs: 1.0570 (1.0532) time: 4.1039 data: 0.0002 max mem: 54684 +[05:23:51.878010] Epoch: [1] [990/3229] lr: 0.000035 grad_norm: 0.7625 (0.7608) closs: 1.0764 (1.0534) time: 4.1162 data: 0.0002 max mem: 54684 +[05:24:31.849857] Epoch: [1] [1000/3229] lr: 0.000035 grad_norm: 0.7446 (0.7599) closs: 1.0764 (1.0532) time: 4.0659 data: 0.0002 max mem: 54684 +[05:25:12.639207] Epoch: [1] [1010/3229] lr: 0.000035 grad_norm: 0.7020 (0.7597) closs: 1.0242 (1.0529) time: 4.0380 data: 0.0002 max mem: 54684 +[05:25:52.977496] Epoch: [1] [1020/3229] lr: 0.000035 grad_norm: 0.7052 (0.7594) closs: 1.0555 (1.0527) time: 4.0563 data: 0.0002 max mem: 54684 +[05:26:34.003363] Epoch: [1] [1030/3229] lr: 0.000034 grad_norm: 0.7147 (0.7588) closs: 1.0503 (1.0527) time: 4.0681 data: 0.0002 max mem: 54684 +[05:27:14.573525] Epoch: [1] [1040/3229] lr: 0.000034 grad_norm: 0.7147 (0.7586) closs: 1.0450 (1.0524) time: 4.0797 data: 0.0002 max mem: 54684 +[05:27:55.498440] Epoch: [1] [1050/3229] lr: 0.000034 grad_norm: 0.7522 (0.7589) closs: 1.0486 (1.0523) time: 4.0747 data: 0.0002 max mem: 54684 +[05:28:36.363003] Epoch: [1] [1060/3229] lr: 0.000034 grad_norm: 0.7522 (0.7588) closs: 1.0553 (1.0523) time: 4.0894 data: 0.0002 max mem: 54684 +[05:29:17.566434] Epoch: [1] [1070/3229] lr: 0.000034 grad_norm: 0.7544 (0.7588) closs: 1.0536 (1.0523) time: 4.1033 data: 0.0002 max mem: 54684 +[05:29:57.687127] Epoch: [1] [1080/3229] lr: 0.000034 grad_norm: 0.7544 (0.7586) closs: 1.0454 (1.0519) time: 4.0661 data: 0.0002 max mem: 54684 +[05:30:38.261585] Epoch: [1] [1090/3229] lr: 0.000034 grad_norm: 0.7317 (0.7584) closs: 1.0487 (1.0519) time: 4.0347 data: 0.0002 max mem: 54684 +[05:31:19.146600] Epoch: [1] [1100/3229] lr: 0.000034 grad_norm: 0.7461 (0.7585) closs: 1.0237 (1.0518) time: 4.0729 data: 0.0002 max mem: 54684 +[05:32:00.230890] Epoch: [1] [1110/3229] lr: 0.000034 grad_norm: 0.7654 (0.7586) closs: 1.0237 (1.0518) time: 4.0984 data: 0.0002 max mem: 54684 +[05:32:40.323167] Epoch: [1] [1120/3229] lr: 0.000034 grad_norm: 0.7725 (0.7589) closs: 1.0261 (1.0517) time: 4.0588 data: 0.0002 max mem: 54684 +[05:33:21.572530] Epoch: [1] [1130/3229] lr: 0.000034 grad_norm: 0.7690 (0.7592) closs: 1.0308 (1.0517) time: 4.0670 data: 0.0002 max mem: 54684 +[05:34:02.762302] Epoch: [1] [1140/3229] lr: 0.000034 grad_norm: 0.7708 (0.7595) closs: 1.0746 (1.0522) time: 4.1219 data: 0.0002 max mem: 54684 +[05:34:43.703986] Epoch: [1] [1150/3229] lr: 0.000034 grad_norm: 0.7656 (0.7593) closs: 1.0766 (1.0521) time: 4.1065 data: 0.0003 max mem: 54684 +[05:35:24.484939] Epoch: [1] [1160/3229] lr: 0.000034 grad_norm: 0.7151 (0.7591) closs: 1.0380 (1.0517) time: 4.0861 data: 0.0003 max mem: 54684 +[05:36:05.401846] Epoch: [1] [1170/3229] lr: 0.000033 grad_norm: 0.7522 (0.7591) closs: 1.0499 (1.0519) time: 4.0848 data: 0.0002 max mem: 54684 +[05:36:45.957264] Epoch: [1] [1180/3229] lr: 0.000033 grad_norm: 0.7522 (0.7588) closs: 1.0587 (1.0517) time: 4.0735 data: 0.0002 max mem: 54684 +[05:37:27.507925] Epoch: [1] [1190/3229] lr: 0.000033 grad_norm: 0.7522 (0.7589) closs: 1.0445 (1.0518) time: 4.1052 data: 0.0002 max mem: 54684 +[05:38:08.323255] Epoch: [1] [1200/3229] lr: 0.000033 grad_norm: 0.8299 (0.7594) closs: 1.0666 (1.0517) time: 4.1182 data: 0.0002 max mem: 54684 +[05:38:49.657429] Epoch: 
[1] [1210/3229] lr: 0.000033 grad_norm: 0.8322 (0.7596) closs: 1.0805 (1.0520) time: 4.1074 data: 0.0002 max mem: 54684 +[05:39:30.796365] Epoch: [1] [1220/3229] lr: 0.000033 grad_norm: 0.7634 (0.7597) closs: 1.0942 (1.0524) time: 4.1236 data: 0.0002 max mem: 54684 +[05:40:12.472149] Epoch: [1] [1230/3229] lr: 0.000033 grad_norm: 0.7564 (0.7597) closs: 1.0754 (1.0523) time: 4.1407 data: 0.0002 max mem: 54684 +[05:40:53.596788] Epoch: [1] [1240/3229] lr: 0.000033 grad_norm: 0.7499 (0.7610) closs: 1.0346 (1.0525) time: 4.1400 data: 0.0002 max mem: 54684 +[05:41:34.343746] Epoch: [1] [1250/3229] lr: 0.000033 grad_norm: 0.7309 (0.7605) closs: 1.0346 (1.0524) time: 4.0935 data: 0.0002 max mem: 54684 +[05:42:14.850396] Epoch: [1] [1260/3229] lr: 0.000033 grad_norm: 0.7357 (0.7604) closs: 1.0046 (1.0521) time: 4.0626 data: 0.0002 max mem: 54684 +[05:42:55.902233] Epoch: [1] [1270/3229] lr: 0.000033 grad_norm: 0.7459 (0.7603) closs: 0.9939 (1.0517) time: 4.0779 data: 0.0002 max mem: 54684 +[05:43:36.700285] Epoch: [1] [1280/3229] lr: 0.000033 grad_norm: 0.7424 (0.7602) closs: 1.0316 (1.0518) time: 4.0924 data: 0.0002 max mem: 54684 +[05:44:18.029631] Epoch: [1] [1290/3229] lr: 0.000033 grad_norm: 0.7584 (0.7604) closs: 1.0437 (1.0518) time: 4.1063 data: 0.0002 max mem: 54684 +[05:44:58.460022] Epoch: [1] [1300/3229] lr: 0.000032 grad_norm: 0.7471 (0.7602) closs: 1.0371 (1.0516) time: 4.0879 data: 0.0002 max mem: 54684 +[05:45:39.434680] Epoch: [1] [1310/3229] lr: 0.000032 grad_norm: 0.7390 (0.7601) closs: 1.0832 (1.0518) time: 4.0702 data: 0.0002 max mem: 54684 +[05:46:19.533138] Epoch: [1] [1320/3229] lr: 0.000032 grad_norm: 0.7437 (0.7601) closs: 1.0843 (1.0517) time: 4.0536 data: 0.0002 max mem: 54684 +[05:46:59.861164] Epoch: [1] [1330/3229] lr: 0.000032 grad_norm: 0.7416 (0.7597) closs: 1.0420 (1.0514) time: 4.0213 data: 0.0002 max mem: 54684 +[05:47:39.762335] Epoch: [1] [1340/3229] lr: 0.000032 grad_norm: 0.7236 (0.7592) closs: 1.0353 (1.0512) time: 4.0114 data: 0.0002 max mem: 54684 +[05:48:21.193611] Epoch: [1] [1350/3229] lr: 0.000032 grad_norm: 0.7413 (0.7594) closs: 1.0614 (1.0513) time: 4.0666 data: 0.0002 max mem: 54684 +[05:49:01.979501] Epoch: [1] [1360/3229] lr: 0.000032 grad_norm: 0.7393 (0.7591) closs: 1.0725 (1.0512) time: 4.1108 data: 0.0002 max mem: 54684 +[05:49:42.964934] Epoch: [1] [1370/3229] lr: 0.000032 grad_norm: 0.7161 (0.7588) closs: 1.0531 (1.0512) time: 4.0885 data: 0.0002 max mem: 54684 +[05:50:23.751298] Epoch: [1] [1380/3229] lr: 0.000032 grad_norm: 0.7334 (0.7589) closs: 1.0633 (1.0516) time: 4.0885 data: 0.0002 max mem: 54684 +[05:51:05.208897] Epoch: [1] [1390/3229] lr: 0.000032 grad_norm: 0.7776 (0.7591) closs: 1.0852 (1.0519) time: 4.1121 data: 0.0002 max mem: 54684 +[05:51:46.307285] Epoch: [1] [1400/3229] lr: 0.000032 grad_norm: 0.7727 (0.7591) closs: 1.0727 (1.0521) time: 4.1277 data: 0.0002 max mem: 54684 +[05:52:27.639855] Epoch: [1] [1410/3229] lr: 0.000032 grad_norm: 0.7589 (0.7592) closs: 1.0593 (1.0520) time: 4.1215 data: 0.0002 max mem: 54684 +[05:53:08.510464] Epoch: [1] [1420/3229] lr: 0.000032 grad_norm: 0.7454 (0.7591) closs: 1.0521 (1.0518) time: 4.1101 data: 0.0002 max mem: 54684 +[05:53:49.534800] Epoch: [1] [1430/3229] lr: 0.000031 grad_norm: 0.7239 (0.7589) closs: 1.0510 (1.0515) time: 4.0947 data: 0.0002 max mem: 54684 +[05:54:30.634251] Epoch: [1] [1440/3229] lr: 0.000031 grad_norm: 0.7376 (0.7591) closs: 1.0525 (1.0516) time: 4.1061 data: 0.0002 max mem: 54684 +[05:55:11.319746] Epoch: [1] [1450/3229] lr: 0.000031 grad_norm: 0.7381 
(0.7590) closs: 1.0587 (1.0514) time: 4.0892 data: 0.0002 max mem: 54684 +[05:55:52.498426] Epoch: [1] [1460/3229] lr: 0.000031 grad_norm: 0.7446 (0.7591) closs: 1.0462 (1.0513) time: 4.0931 data: 0.0002 max mem: 54684 +[05:56:33.716084] Epoch: [1] [1470/3229] lr: 0.000031 grad_norm: 0.7441 (0.7589) closs: 1.0490 (1.0514) time: 4.1198 data: 0.0002 max mem: 54684 +[05:57:14.183953] Epoch: [1] [1480/3229] lr: 0.000031 grad_norm: 0.7353 (0.7586) closs: 1.0471 (1.0514) time: 4.0842 data: 0.0002 max mem: 54684 +[05:57:55.151729] Epoch: [1] [1490/3229] lr: 0.000031 grad_norm: 0.7442 (0.7585) closs: 1.0415 (1.0514) time: 4.0717 data: 0.0002 max mem: 54684 +[05:58:36.455544] Epoch: [1] [1500/3229] lr: 0.000031 grad_norm: 0.7452 (0.7585) closs: 1.0665 (1.0515) time: 4.1135 data: 0.0002 max mem: 54684 +[05:59:17.089096] Epoch: [1] [1510/3229] lr: 0.000031 grad_norm: 0.7419 (0.7581) closs: 1.0889 (1.0517) time: 4.0968 data: 0.0002 max mem: 54684 +[05:59:57.877708] Epoch: [1] [1520/3229] lr: 0.000031 grad_norm: 0.7463 (0.7583) closs: 1.0725 (1.0517) time: 4.0710 data: 0.0002 max mem: 54684 +[06:00:39.181341] Epoch: [1] [1530/3229] lr: 0.000031 grad_norm: 0.7860 (0.7584) closs: 1.0767 (1.0519) time: 4.1045 data: 0.0002 max mem: 54684 +[06:01:20.218941] Epoch: [1] [1540/3229] lr: 0.000031 grad_norm: 0.7508 (0.7584) closs: 1.0767 (1.0517) time: 4.1170 data: 0.0002 max mem: 54684 +[06:02:01.186493] Epoch: [1] [1550/3229] lr: 0.000031 grad_norm: 0.7369 (0.7583) closs: 1.0714 (1.0517) time: 4.1002 data: 0.0002 max mem: 54684 +[06:02:42.438164] Epoch: [1] [1560/3229] lr: 0.000030 grad_norm: 0.7650 (0.7584) closs: 1.0850 (1.0520) time: 4.1109 data: 0.0002 max mem: 54684 +[06:03:23.689124] Epoch: [1] [1570/3229] lr: 0.000030 grad_norm: 0.7622 (0.7584) closs: 1.0709 (1.0521) time: 4.1251 data: 0.0002 max mem: 54684 +[06:04:04.719315] Epoch: [1] [1580/3229] lr: 0.000030 grad_norm: 0.7378 (0.7583) closs: 1.0553 (1.0520) time: 4.1140 data: 0.0002 max mem: 54684 +[06:04:45.698136] Epoch: [1] [1590/3229] lr: 0.000030 grad_norm: 0.7576 (0.7584) closs: 1.0282 (1.0517) time: 4.1004 data: 0.0002 max mem: 54684 +[06:05:27.010602] Epoch: [1] [1600/3229] lr: 0.000030 grad_norm: 0.7794 (0.7588) closs: 1.0155 (1.0516) time: 4.1145 data: 0.0002 max mem: 54684 +[06:06:08.217639] Epoch: [1] [1610/3229] lr: 0.000030 grad_norm: 0.7703 (0.7590) closs: 1.0618 (1.0518) time: 4.1259 data: 0.0002 max mem: 54684 +[06:06:49.708388] Epoch: [1] [1620/3229] lr: 0.000030 grad_norm: 0.7704 (0.7592) closs: 1.0795 (1.0519) time: 4.1348 data: 0.0002 max mem: 54684 +[06:07:30.802140] Epoch: [1] [1630/3229] lr: 0.000030 grad_norm: 0.7723 (0.7593) closs: 1.0570 (1.0521) time: 4.1292 data: 0.0002 max mem: 54684 +[06:08:12.042914] Epoch: [1] [1640/3229] lr: 0.000030 grad_norm: 0.7690 (0.7593) closs: 1.0839 (1.0523) time: 4.1167 data: 0.0002 max mem: 54684 +[06:08:52.552185] Epoch: [1] [1650/3229] lr: 0.000030 grad_norm: 0.7620 (0.7592) closs: 1.0666 (1.0522) time: 4.0874 data: 0.0002 max mem: 54684 +[06:09:34.109821] Epoch: [1] [1660/3229] lr: 0.000030 grad_norm: 0.7443 (0.7592) closs: 1.0719 (1.0526) time: 4.1033 data: 0.0002 max mem: 54684 +[06:10:15.226675] Epoch: [1] [1670/3229] lr: 0.000030 grad_norm: 0.7640 (0.7592) closs: 1.0737 (1.0524) time: 4.1337 data: 0.0002 max mem: 54684 +[06:10:55.893166] Epoch: [1] [1680/3229] lr: 0.000030 grad_norm: 0.7557 (0.7591) closs: 1.0457 (1.0524) time: 4.0891 data: 0.0002 max mem: 54684 +[06:11:37.000880] Epoch: [1] [1690/3229] lr: 0.000029 grad_norm: 0.7449 (0.7591) closs: 1.0646 (1.0525) time: 4.0886 data: 
0.0002 max mem: 54684 +[06:12:18.347873] Epoch: [1] [1700/3229] lr: 0.000029 grad_norm: 0.7646 (0.7592) closs: 1.0581 (1.0523) time: 4.1227 data: 0.0002 max mem: 54684 +[06:12:59.455957] Epoch: [1] [1710/3229] lr: 0.000029 grad_norm: 0.7725 (0.7593) closs: 1.0392 (1.0523) time: 4.1227 data: 0.0002 max mem: 54684 +[06:13:40.686492] Epoch: [1] [1720/3229] lr: 0.000029 grad_norm: 0.7731 (0.7595) closs: 1.0769 (1.0526) time: 4.1169 data: 0.0002 max mem: 54684 +[06:14:21.266645] Epoch: [1] [1730/3229] lr: 0.000029 grad_norm: 0.7730 (0.7594) closs: 1.0708 (1.0525) time: 4.0905 data: 0.0002 max mem: 54684 +[06:15:02.058571] Epoch: [1] [1740/3229] lr: 0.000029 grad_norm: 0.7642 (0.7593) closs: 1.0508 (1.0524) time: 4.0685 data: 0.0002 max mem: 54684 +[06:15:43.158278] Epoch: [1] [1750/3229] lr: 0.000029 grad_norm: 0.7606 (0.7594) closs: 1.0358 (1.0525) time: 4.0945 data: 0.0002 max mem: 54684 +[06:16:24.131577] Epoch: [1] [1760/3229] lr: 0.000029 grad_norm: 0.7606 (0.7593) closs: 1.0631 (1.0525) time: 4.1036 data: 0.0002 max mem: 54684 +[06:17:04.589751] Epoch: [1] [1770/3229] lr: 0.000029 grad_norm: 0.7370 (0.7591) closs: 1.0766 (1.0524) time: 4.0715 data: 0.0002 max mem: 54684 +[06:17:46.162753] Epoch: [1] [1780/3229] lr: 0.000029 grad_norm: 0.7053 (0.7589) closs: 1.0610 (1.0525) time: 4.1015 data: 0.0002 max mem: 54684 +[06:18:27.288021] Epoch: [1] [1790/3229] lr: 0.000029 grad_norm: 0.7304 (0.7589) closs: 1.0519 (1.0525) time: 4.1348 data: 0.0002 max mem: 54684 +[06:19:08.597875] Epoch: [1] [1800/3229] lr: 0.000029 grad_norm: 0.7718 (0.7593) closs: 1.0343 (1.0524) time: 4.1217 data: 0.0002 max mem: 54684 +[06:19:48.096823] Epoch: [1] [1810/3229] lr: 0.000028 grad_norm: 0.7355 (0.7589) closs: 0.9675 (1.0521) time: 4.0404 data: 0.0002 max mem: 54684 +[06:20:29.329365] Epoch: [1] [1820/3229] lr: 0.000028 grad_norm: 0.7154 (0.7589) closs: 1.0411 (1.0522) time: 4.0365 data: 0.0002 max mem: 54684 +[06:21:10.086608] Epoch: [1] [1830/3229] lr: 0.000028 grad_norm: 0.7412 (0.7589) closs: 1.0599 (1.0522) time: 4.0994 data: 0.0002 max mem: 54684 +[06:21:51.381549] Epoch: [1] [1840/3229] lr: 0.000028 grad_norm: 0.7621 (0.7589) closs: 1.0664 (1.0523) time: 4.1025 data: 0.0002 max mem: 54684 +[06:22:32.162442] Epoch: [1] [1850/3229] lr: 0.000028 grad_norm: 0.7662 (0.7590) closs: 1.0554 (1.0522) time: 4.1037 data: 0.0002 max mem: 54684 +[06:23:13.433902] Epoch: [1] [1860/3229] lr: 0.000028 grad_norm: 0.7852 (0.7590) closs: 1.0432 (1.0521) time: 4.1025 data: 0.0002 max mem: 54684 +[06:23:54.209307] Epoch: [1] [1870/3229] lr: 0.000028 grad_norm: 0.7674 (0.7591) closs: 1.0409 (1.0521) time: 4.1023 data: 0.0002 max mem: 54684 +[06:24:35.546365] Epoch: [1] [1880/3229] lr: 0.000028 grad_norm: 0.7487 (0.7591) closs: 1.0409 (1.0522) time: 4.1056 data: 0.0002 max mem: 54684 +[06:25:15.982687] Epoch: [1] [1890/3229] lr: 0.000028 grad_norm: 0.7683 (0.7591) closs: 1.0359 (1.0521) time: 4.0886 data: 0.0002 max mem: 54684 +[06:25:57.440510] Epoch: [1] [1900/3229] lr: 0.000028 grad_norm: 0.7670 (0.7592) closs: 1.0557 (1.0522) time: 4.0946 data: 0.0002 max mem: 54684 +[06:26:38.556623] Epoch: [1] [1910/3229] lr: 0.000028 grad_norm: 0.7661 (0.7592) closs: 1.0564 (1.0521) time: 4.1286 data: 0.0002 max mem: 54684 +[06:27:19.228566] Epoch: [1] [1920/3229] lr: 0.000028 grad_norm: 0.7542 (0.7590) closs: 1.0294 (1.0521) time: 4.0893 data: 0.0002 max mem: 54684 +[06:27:59.439243] Epoch: [1] [1930/3229] lr: 0.000028 grad_norm: 0.7197 (0.7587) closs: 1.0121 (1.0517) time: 4.0441 data: 0.0002 max mem: 54684 +[06:28:40.298280] Epoch: 
[1] [1940/3229] lr: 0.000027 grad_norm: 0.7343 (0.7586) closs: 1.0121 (1.0518) time: 4.0534 data: 0.0002 max mem: 54684 +[06:29:21.418176] Epoch: [1] [1950/3229] lr: 0.000027 grad_norm: 0.7666 (0.7588) closs: 1.0867 (1.0519) time: 4.0989 data: 0.0002 max mem: 54684 +[06:30:01.756178] Epoch: [1] [1960/3229] lr: 0.000027 grad_norm: 0.7619 (0.7587) closs: 1.0618 (1.0519) time: 4.0728 data: 0.0002 max mem: 54684 +[06:30:43.062087] Epoch: [1] [1970/3229] lr: 0.000027 grad_norm: 0.7588 (0.7588) closs: 1.0484 (1.0518) time: 4.0821 data: 0.0002 max mem: 54684 +[06:31:23.143763] Epoch: [1] [1980/3229] lr: 0.000027 grad_norm: 0.7496 (0.7584) closs: 1.0332 (1.0516) time: 4.0693 data: 0.0002 max mem: 54684 +[06:32:04.271038] Epoch: [1] [1990/3229] lr: 0.000027 grad_norm: 0.7496 (0.7585) closs: 1.0258 (1.0515) time: 4.0604 data: 0.0002 max mem: 54684 +[06:32:45.254334] Epoch: [1] [2000/3229] lr: 0.000027 grad_norm: 0.7825 (0.7587) closs: 1.0567 (1.0515) time: 4.1055 data: 0.0002 max mem: 54684 +[06:33:26.464223] Epoch: [1] [2010/3229] lr: 0.000027 grad_norm: 0.7723 (0.7588) closs: 1.0622 (1.0516) time: 4.1096 data: 0.0002 max mem: 54684 +[06:34:07.977608] Epoch: [1] [2020/3229] lr: 0.000027 grad_norm: 0.7569 (0.7588) closs: 1.0776 (1.0517) time: 4.1361 data: 0.0002 max mem: 54684 +[06:34:48.895957] Epoch: [1] [2030/3229] lr: 0.000027 grad_norm: 0.7560 (0.7586) closs: 1.0790 (1.0517) time: 4.1215 data: 0.0002 max mem: 54684 +[06:35:29.451828] Epoch: [1] [2040/3229] lr: 0.000027 grad_norm: 0.7256 (0.7584) closs: 1.0240 (1.0514) time: 4.0736 data: 0.0002 max mem: 54684 +[06:36:10.091914] Epoch: [1] [2050/3229] lr: 0.000027 grad_norm: 0.7247 (0.7581) closs: 1.0561 (1.0513) time: 4.0597 data: 0.0002 max mem: 54684 +[06:36:50.691448] Epoch: [1] [2060/3229] lr: 0.000027 grad_norm: 0.7255 (0.7579) closs: 1.0204 (1.0512) time: 4.0619 data: 0.0002 max mem: 54684 +[06:37:31.150888] Epoch: [1] [2070/3229] lr: 0.000026 grad_norm: 0.7327 (0.7578) closs: 1.0344 (1.0512) time: 4.0529 data: 0.0002 max mem: 54684 +[06:38:11.548200] Epoch: [1] [2080/3229] lr: 0.000026 grad_norm: 0.7327 (0.7576) closs: 1.0414 (1.0511) time: 4.0428 data: 0.0002 max mem: 54684 +[06:38:51.240169] Epoch: [1] [2090/3229] lr: 0.000026 grad_norm: 0.7045 (0.7574) closs: 0.9855 (1.0507) time: 4.0044 data: 0.0002 max mem: 54684 +[06:39:32.307282] Epoch: [1] [2100/3229] lr: 0.000026 grad_norm: 0.7700 (0.7575) closs: 0.9816 (1.0506) time: 4.0379 data: 0.0002 max mem: 54684 +[06:40:12.900721] Epoch: [1] [2110/3229] lr: 0.000026 grad_norm: 0.7683 (0.7572) closs: 1.0346 (1.0504) time: 4.0830 data: 0.0002 max mem: 54684 +[06:40:54.073838] Epoch: [1] [2120/3229] lr: 0.000026 grad_norm: 0.7175 (0.7572) closs: 1.0233 (1.0503) time: 4.0883 data: 0.0002 max mem: 54684 +[06:41:34.494204] Epoch: [1] [2130/3229] lr: 0.000026 grad_norm: 0.7567 (0.7575) closs: 1.0420 (1.0504) time: 4.0796 data: 0.0002 max mem: 54684 +[06:42:14.786527] Epoch: [1] [2140/3229] lr: 0.000026 grad_norm: 0.7746 (0.7574) closs: 1.0572 (1.0504) time: 4.0356 data: 0.0002 max mem: 54684 +[06:42:56.042219] Epoch: [1] [2150/3229] lr: 0.000026 grad_norm: 0.7436 (0.7574) closs: 1.0572 (1.0506) time: 4.0773 data: 0.0002 max mem: 54684 +[06:43:36.873461] Epoch: [1] [2160/3229] lr: 0.000026 grad_norm: 0.7565 (0.7574) closs: 1.1052 (1.0509) time: 4.1043 data: 0.0002 max mem: 54684 +[06:44:18.036372] Epoch: [1] [2170/3229] lr: 0.000026 grad_norm: 0.7559 (0.7573) closs: 1.0642 (1.0508) time: 4.0996 data: 0.0002 max mem: 54684 +[06:44:58.954633] Epoch: [1] [2180/3229] lr: 0.000026 grad_norm: 0.7512 
(0.7574) closs: 1.0519 (1.0507) time: 4.1040 data: 0.0002 max mem: 54684 +[06:45:39.550709] Epoch: [1] [2190/3229] lr: 0.000026 grad_norm: 0.7721 (0.7580) closs: 1.0434 (1.0508) time: 4.0757 data: 0.0002 max mem: 54684 +[06:46:20.109102] Epoch: [1] [2200/3229] lr: 0.000025 grad_norm: 0.7721 (0.7579) closs: 1.0396 (1.0507) time: 4.0577 data: 0.0002 max mem: 54684 +[06:47:01.387992] Epoch: [1] [2210/3229] lr: 0.000025 grad_norm: 0.7774 (0.7582) closs: 1.0393 (1.0508) time: 4.0918 data: 0.0002 max mem: 54684 +[06:47:42.292906] Epoch: [1] [2220/3229] lr: 0.000025 grad_norm: 0.7702 (0.7580) closs: 1.0655 (1.0509) time: 4.1091 data: 0.0002 max mem: 54684 +[06:48:22.615690] Epoch: [1] [2230/3229] lr: 0.000025 grad_norm: 0.7125 (0.7578) closs: 1.0602 (1.0509) time: 4.0613 data: 0.0002 max mem: 54684 +[06:49:03.122150] Epoch: [1] [2240/3229] lr: 0.000025 grad_norm: 0.7564 (0.7580) closs: 1.0392 (1.0507) time: 4.0414 data: 0.0002 max mem: 54684 +[06:49:43.913988] Epoch: [1] [2250/3229] lr: 0.000025 grad_norm: 0.7275 (0.7587) closs: 1.0392 (1.0508) time: 4.0649 data: 0.0002 max mem: 54684 +[06:50:24.836181] Epoch: [1] [2260/3229] lr: 0.000025 grad_norm: 0.7275 (0.7588) closs: 1.0988 (1.0510) time: 4.0856 data: 0.0002 max mem: 54684 +[06:51:05.820250] Epoch: [1] [2270/3229] lr: 0.000025 grad_norm: 0.7384 (0.7587) closs: 1.0513 (1.0508) time: 4.0953 data: 0.0002 max mem: 54684 +[06:51:46.269588] Epoch: [1] [2280/3229] lr: 0.000025 grad_norm: 0.7730 (0.7588) closs: 1.0402 (1.0509) time: 4.0716 data: 0.0002 max mem: 54684 +[06:52:27.410955] Epoch: [1] [2290/3229] lr: 0.000025 grad_norm: 0.7238 (0.7586) closs: 1.0385 (1.0507) time: 4.0795 data: 0.0002 max mem: 54684 +[06:53:08.358905] Epoch: [1] [2300/3229] lr: 0.000025 grad_norm: 0.7204 (0.7586) closs: 1.0344 (1.0506) time: 4.1044 data: 0.0002 max mem: 54684 +[06:53:49.351820] Epoch: [1] [2310/3229] lr: 0.000025 grad_norm: 0.7408 (0.7585) closs: 1.0498 (1.0506) time: 4.0970 data: 0.0002 max mem: 54684 +[06:54:30.150638] Epoch: [1] [2320/3229] lr: 0.000025 grad_norm: 0.7644 (0.7586) closs: 1.0665 (1.0507) time: 4.0895 data: 0.0002 max mem: 54684 +[06:55:11.609155] Epoch: [1] [2330/3229] lr: 0.000024 grad_norm: 0.7644 (0.7586) closs: 1.0783 (1.0508) time: 4.1128 data: 0.0002 max mem: 54684 +[06:55:52.526582] Epoch: [1] [2340/3229] lr: 0.000024 grad_norm: 0.7481 (0.7587) closs: 1.0944 (1.0510) time: 4.1187 data: 0.0002 max mem: 54684 +[06:56:33.528322] Epoch: [1] [2350/3229] lr: 0.000024 grad_norm: 0.7847 (0.7587) closs: 1.1101 (1.0511) time: 4.0959 data: 0.0002 max mem: 54684 +[06:57:13.657474] Epoch: [1] [2360/3229] lr: 0.000024 grad_norm: 0.7256 (0.7585) closs: 1.0694 (1.0511) time: 4.0565 data: 0.0002 max mem: 54684 +[06:57:54.348834] Epoch: [1] [2370/3229] lr: 0.000024 grad_norm: 0.7192 (0.7584) closs: 1.0375 (1.0509) time: 4.0410 data: 0.0002 max mem: 54684 +[06:58:35.300759] Epoch: [1] [2380/3229] lr: 0.000024 grad_norm: 0.7204 (0.7584) closs: 1.0375 (1.0509) time: 4.0821 data: 0.0002 max mem: 54684 +[06:59:15.644906] Epoch: [1] [2390/3229] lr: 0.000024 grad_norm: 0.7417 (0.7583) closs: 1.0504 (1.0508) time: 4.0647 data: 0.0002 max mem: 54684 +[06:59:56.423928] Epoch: [1] [2400/3229] lr: 0.000024 grad_norm: 0.7580 (0.7583) closs: 1.0390 (1.0508) time: 4.0561 data: 0.0002 max mem: 54684 +[07:00:36.732262] Epoch: [1] [2410/3229] lr: 0.000024 grad_norm: 0.7551 (0.7580) closs: 1.0390 (1.0508) time: 4.0543 data: 0.0002 max mem: 54684 +[07:01:17.509223] Epoch: [1] [2420/3229] lr: 0.000024 grad_norm: 0.7224 (0.7579) closs: 1.0253 (1.0507) time: 4.0542 data: 
0.0002 max mem: 54684 +[07:01:59.103175] Epoch: [1] [2430/3229] lr: 0.000024 grad_norm: 0.7534 (0.7580) closs: 1.0289 (1.0507) time: 4.1185 data: 0.0002 max mem: 54684 +[07:02:39.633371] Epoch: [1] [2440/3229] lr: 0.000024 grad_norm: 0.7523 (0.7579) closs: 1.0303 (1.0506) time: 4.1061 data: 0.0002 max mem: 54684 +[07:03:21.099615] Epoch: [1] [2450/3229] lr: 0.000024 grad_norm: 0.7399 (0.7580) closs: 1.0666 (1.0508) time: 4.0998 data: 0.0002 max mem: 54684 +[07:04:01.876483] Epoch: [1] [2460/3229] lr: 0.000023 grad_norm: 0.7601 (0.7597) closs: 1.0710 (1.0509) time: 4.1121 data: 0.0002 max mem: 54684 +[07:04:42.177832] Epoch: [1] [2470/3229] lr: 0.000023 grad_norm: 0.7070 (0.7594) closs: 1.0346 (1.0508) time: 4.0538 data: 0.0002 max mem: 54684 +[07:05:22.730871] Epoch: [1] [2480/3229] lr: 0.000023 grad_norm: 0.6798 (0.7592) closs: 1.0346 (1.0508) time: 4.0427 data: 0.0002 max mem: 54684 +[07:06:04.230691] Epoch: [1] [2490/3229] lr: 0.000023 grad_norm: 0.7520 (0.7593) closs: 1.0195 (1.0507) time: 4.1026 data: 0.0002 max mem: 54684 +[07:06:44.671239] Epoch: [1] [2500/3229] lr: 0.000023 grad_norm: 0.7559 (0.7592) closs: 1.0066 (1.0506) time: 4.0970 data: 0.0002 max mem: 54684 +[07:07:25.306096] Epoch: [1] [2510/3229] lr: 0.000023 grad_norm: 0.7394 (0.7590) closs: 1.0125 (1.0504) time: 4.0537 data: 0.0002 max mem: 54684 +[07:08:05.672440] Epoch: [1] [2520/3229] lr: 0.000023 grad_norm: 0.7341 (0.7589) closs: 1.0304 (1.0502) time: 4.0500 data: 0.0002 max mem: 54684 +[07:08:46.342159] Epoch: [1] [2530/3229] lr: 0.000023 grad_norm: 0.7582 (0.7590) closs: 1.0381 (1.0503) time: 4.0517 data: 0.0002 max mem: 54684 +[07:09:27.086959] Epoch: [1] [2540/3229] lr: 0.000023 grad_norm: 0.7684 (0.7590) closs: 1.0815 (1.0503) time: 4.0707 data: 0.0002 max mem: 54684 +[07:10:08.439021] Epoch: [1] [2550/3229] lr: 0.000023 grad_norm: 0.7684 (0.7590) closs: 1.0815 (1.0505) time: 4.1048 data: 0.0002 max mem: 54684 +[07:10:49.035524] Epoch: [1] [2560/3229] lr: 0.000023 grad_norm: 0.7595 (0.7590) closs: 1.0433 (1.0504) time: 4.0974 data: 0.0002 max mem: 54684 +[07:11:30.100693] Epoch: [1] [2570/3229] lr: 0.000023 grad_norm: 0.7435 (0.7588) closs: 1.0562 (1.0506) time: 4.0830 data: 0.0002 max mem: 54684 +[07:12:11.188418] Epoch: [1] [2580/3229] lr: 0.000023 grad_norm: 0.7404 (0.7589) closs: 1.1022 (1.0507) time: 4.1076 data: 0.0002 max mem: 54684 +[07:12:52.475637] Epoch: [1] [2590/3229] lr: 0.000022 grad_norm: 0.7772 (0.7591) closs: 1.0955 (1.0509) time: 4.1187 data: 0.0002 max mem: 54684 +[07:13:33.482943] Epoch: [1] [2600/3229] lr: 0.000022 grad_norm: 0.7876 (0.7590) closs: 1.0745 (1.0508) time: 4.1147 data: 0.0002 max mem: 54684 +[07:14:14.212298] Epoch: [1] [2610/3229] lr: 0.000022 grad_norm: 0.7625 (0.7589) closs: 1.0001 (1.0507) time: 4.0868 data: 0.0002 max mem: 54684 +[07:14:55.108401] Epoch: [1] [2620/3229] lr: 0.000022 grad_norm: 0.7431 (0.7588) closs: 1.0001 (1.0506) time: 4.0812 data: 0.0002 max mem: 54684 +[07:15:36.195488] Epoch: [1] [2630/3229] lr: 0.000022 grad_norm: 0.7435 (0.7590) closs: 1.0573 (1.0508) time: 4.0991 data: 0.0002 max mem: 54684 +[07:16:17.647209] Epoch: [1] [2640/3229] lr: 0.000022 grad_norm: 0.7787 (0.7590) closs: 1.0782 (1.0508) time: 4.1269 data: 0.0002 max mem: 54684 +[07:16:58.892659] Epoch: [1] [2650/3229] lr: 0.000022 grad_norm: 0.7769 (0.7590) closs: 1.0465 (1.0508) time: 4.1348 data: 0.0002 max mem: 54684 +[07:17:39.799115] Epoch: [1] [2660/3229] lr: 0.000022 grad_norm: 0.7668 (0.7590) closs: 1.0389 (1.0509) time: 4.1075 data: 0.0002 max mem: 54684 +[07:18:20.650599] Epoch: 
[1] [2670/3229] lr: 0.000022 grad_norm: 0.7837 (0.7592) closs: 1.0408 (1.0508) time: 4.0878 data: 0.0002 max mem: 54684 +[07:19:01.954445] Epoch: [1] [2680/3229] lr: 0.000022 grad_norm: 0.7896 (0.7591) closs: 1.0526 (1.0508) time: 4.1077 data: 0.0002 max mem: 54684 +[07:19:42.555933] Epoch: [1] [2690/3229] lr: 0.000022 grad_norm: 0.7348 (0.7591) closs: 1.0526 (1.0508) time: 4.0952 data: 0.0002 max mem: 54684 +[07:20:23.489822] Epoch: [1] [2700/3229] lr: 0.000022 grad_norm: 0.7487 (0.7592) closs: 1.0357 (1.0508) time: 4.0767 data: 0.0002 max mem: 54684 +[07:21:03.709034] Epoch: [1] [2710/3229] lr: 0.000022 grad_norm: 0.7536 (0.7593) closs: 1.0253 (1.0506) time: 4.0576 data: 0.0002 max mem: 54684 +[07:21:44.808335] Epoch: [1] [2720/3229] lr: 0.000021 grad_norm: 0.7750 (0.7593) closs: 1.0362 (1.0505) time: 4.0659 data: 0.0002 max mem: 54684 +[07:22:26.050486] Epoch: [1] [2730/3229] lr: 0.000021 grad_norm: 0.7431 (0.7593) closs: 1.0379 (1.0505) time: 4.1170 data: 0.0002 max mem: 54684 +[07:23:06.929188] Epoch: [1] [2740/3229] lr: 0.000021 grad_norm: 0.7314 (0.7592) closs: 1.0382 (1.0505) time: 4.1060 data: 0.0002 max mem: 54684 +[07:23:47.376913] Epoch: [1] [2750/3229] lr: 0.000021 grad_norm: 0.7727 (0.7592) closs: 1.0259 (1.0503) time: 4.0663 data: 0.0002 max mem: 54684 +[07:24:28.781962] Epoch: [1] [2760/3229] lr: 0.000021 grad_norm: 0.7543 (0.7591) closs: 1.0343 (1.0504) time: 4.0926 data: 0.0002 max mem: 54684 +[07:25:09.722107] Epoch: [1] [2770/3229] lr: 0.000021 grad_norm: 0.7543 (0.7592) closs: 1.0747 (1.0505) time: 4.1172 data: 0.0002 max mem: 54684 +[07:25:50.621826] Epoch: [1] [2780/3229] lr: 0.000021 grad_norm: 0.7577 (0.7591) closs: 1.0669 (1.0505) time: 4.0919 data: 0.0002 max mem: 54684 +[07:26:30.847780] Epoch: [1] [2790/3229] lr: 0.000021 grad_norm: 0.7133 (0.7590) closs: 1.0389 (1.0505) time: 4.0562 data: 0.0002 max mem: 54684 +[07:27:12.004919] Epoch: [1] [2800/3229] lr: 0.000021 grad_norm: 0.7479 (0.7588) closs: 1.0389 (1.0505) time: 4.0691 data: 0.0002 max mem: 54684 +[07:27:52.590244] Epoch: [1] [2810/3229] lr: 0.000021 grad_norm: 0.7538 (0.7590) closs: 1.0261 (1.0503) time: 4.0871 data: 0.0002 max mem: 54684 +[07:28:33.829143] Epoch: [1] [2820/3229] lr: 0.000021 grad_norm: 0.7608 (0.7591) closs: 1.0348 (1.0505) time: 4.0911 data: 0.0002 max mem: 54684 +[07:29:14.344969] Epoch: [1] [2830/3229] lr: 0.000021 grad_norm: 0.7575 (0.7589) closs: 1.0923 (1.0506) time: 4.0877 data: 0.0002 max mem: 54684 +[07:29:54.993752] Epoch: [1] [2840/3229] lr: 0.000021 grad_norm: 0.7351 (0.7589) closs: 1.0629 (1.0506) time: 4.0582 data: 0.0002 max mem: 54684 +[07:30:35.251355] Epoch: [1] [2850/3229] lr: 0.000020 grad_norm: 0.7529 (0.7587) closs: 1.0003 (1.0504) time: 4.0453 data: 0.0002 max mem: 54684 +[07:31:15.852435] Epoch: [1] [2860/3229] lr: 0.000020 grad_norm: 0.7370 (0.7586) closs: 1.0153 (1.0504) time: 4.0429 data: 0.0002 max mem: 54684 +[07:31:56.405734] Epoch: [1] [2870/3229] lr: 0.000020 grad_norm: 0.7265 (0.7583) closs: 1.0422 (1.0503) time: 4.0577 data: 0.0002 max mem: 54684 +[07:32:37.536609] Epoch: [1] [2880/3229] lr: 0.000020 grad_norm: 0.7202 (0.7582) closs: 1.0430 (1.0504) time: 4.0841 data: 0.0002 max mem: 54684 +[07:33:18.137999] Epoch: [1] [2890/3229] lr: 0.000020 grad_norm: 0.7069 (0.7580) closs: 1.0299 (1.0504) time: 4.0866 data: 0.0002 max mem: 54684 +[07:33:59.054025] Epoch: [1] [2900/3229] lr: 0.000020 grad_norm: 0.7126 (0.7579) closs: 0.9991 (1.0503) time: 4.0758 data: 0.0002 max mem: 54684 +[07:34:40.302514] Epoch: [1] [2910/3229] lr: 0.000020 grad_norm: 0.7618 
(0.7580) closs: 1.0246 (1.0504) time: 4.1082 data: 0.0002 max mem: 54684
+[07:35:21.613397] Epoch: [1] [2920/3229] lr: 0.000020 grad_norm: 0.7726 (0.7581) closs: 1.0246 (1.0504) time: 4.1279 data: 0.0002 max mem: 54684
+[07:36:01.885012] Epoch: [1] [2930/3229] lr: 0.000020 grad_norm: 0.7795 (0.7580) closs: 1.0148 (1.0502) time: 4.0791 data: 0.0002 max mem: 54684
+[07:36:42.517521] Epoch: [1] [2940/3229] lr: 0.000020 grad_norm: 0.7532 (0.7580) closs: 1.0222 (1.0501) time: 4.0451 data: 0.0002 max mem: 54684
+[07:37:22.983411] Epoch: [1] [2950/3229] lr: 0.000020 grad_norm: 0.7556 (0.7579) closs: 1.0219 (1.0500) time: 4.0549 data: 0.0002 max mem: 54684
+[07:38:03.810435] Epoch: [1] [2960/3229] lr: 0.000020 grad_norm: 0.7329 (0.7577) closs: 1.0213 (1.0499) time: 4.0646 data: 0.0002 max mem: 54684
+[07:38:44.734197] Epoch: [1] [2970/3229] lr: 0.000020 grad_norm: 0.7669 (0.7578) closs: 1.0230 (1.0499) time: 4.0875 data: 0.0002 max mem: 54684
+[07:39:25.067815] Epoch: [1] [2980/3229] lr: 0.000020 grad_norm: 0.7443 (0.7575) closs: 1.0170 (1.0497) time: 4.0628 data: 0.0002 max mem: 54684
+[07:40:06.277140] Epoch: [1] [2990/3229] lr: 0.000019 grad_norm: 0.7237 (0.7575) closs: 1.0334 (1.0498) time: 4.0771 data: 0.0002 max mem: 54684
+[07:40:47.518619] Epoch: [1] [3000/3229] lr: 0.000019 grad_norm: 0.7443 (0.7576) closs: 1.0808 (1.0499) time: 4.1225 data: 0.0002 max mem: 54684
+[07:41:28.756394] Epoch: [1] [3010/3229] lr: 0.000019 grad_norm: 0.7306 (0.7576) closs: 1.0731 (1.0499) time: 4.1239 data: 0.0002 max mem: 54684
+[07:42:09.820428] Epoch: [1] [3020/3229] lr: 0.000019 grad_norm: 0.7376 (0.7576) closs: 1.0600 (1.0498) time: 4.1150 data: 0.0002 max mem: 54684
+[07:42:50.371264] Epoch: [1] [3030/3229] lr: 0.000019 grad_norm: 0.7764 (0.7576) closs: 1.0239 (1.0497) time: 4.0807 data: 0.0002 max mem: 54684
+[07:43:31.735769] Epoch: [1] [3040/3229] lr: 0.000019 grad_norm: 0.8111 (0.7577) closs: 1.0480 (1.0498) time: 4.0957 data: 0.0002 max mem: 54684
+[07:44:12.340935] Epoch: [1] [3050/3229] lr: 0.000019 grad_norm: 0.7614 (0.7576) closs: 1.0513 (1.0498) time: 4.0984 data: 0.0002 max mem: 54684
+[07:44:53.338278] Epoch: [1] [3060/3229] lr: 0.000019 grad_norm: 0.7472 (0.7577) closs: 1.0586 (1.0498) time: 4.0801 data: 0.0002 max mem: 54684
+[07:45:33.983619] Epoch: [1] [3070/3229] lr: 0.000019 grad_norm: 0.7472 (0.7576) closs: 1.0348 (1.0497) time: 4.0821 data: 0.0002 max mem: 54684
+[07:46:14.880787] Epoch: [1] [3080/3229] lr: 0.000019 grad_norm: 0.7548 (0.7576) closs: 1.0463 (1.0498) time: 4.0771 data: 0.0002 max mem: 54684
+[07:46:55.135508] Epoch: [1] [3090/3229] lr: 0.000019 grad_norm: 0.7275 (0.7574) closs: 1.0621 (1.0497) time: 4.0575 data: 0.0002 max mem: 54684
+[07:47:35.489073] Epoch: [1] [3100/3229] lr: 0.000019 grad_norm: 0.7098 (0.7573) closs: 0.9939 (1.0496) time: 4.0303 data: 0.0002 max mem: 54684
+[07:48:16.842485] Epoch: [1] [3110/3229] lr: 0.000019 grad_norm: 0.7606 (0.7574) closs: 1.0400 (1.0497) time: 4.0853 data: 0.0002 max mem: 54684
+[07:48:58.060831] Epoch: [1] [3120/3229] lr: 0.000019 grad_norm: 0.7567 (0.7574) closs: 1.0607 (1.0498) time: 4.1285 data: 0.0002 max mem: 54684
+[07:49:39.119706] Epoch: [1] [3130/3229] lr: 0.000018 grad_norm: 0.7286 (0.7572) closs: 1.0710 (1.0499) time: 4.1138 data: 0.0002 max mem: 54684
+[07:50:19.961668] Epoch: [1] [3140/3229] lr: 0.000018 grad_norm: 0.7239 (0.7572) closs: 1.0326 (1.0498) time: 4.0950 data: 0.0002 max mem: 54684
+[07:51:00.627490] Epoch: [1] [3150/3229] lr: 0.000018 grad_norm: 0.7701 (0.7572) closs: 1.0245 (1.0498) time: 4.0753 data: 0.0002 max mem: 54684
+[07:51:41.096988] Epoch: [1] [3160/3229] lr: 0.000018 grad_norm: 0.7701 (0.7572) closs: 1.0417 (1.0497) time: 4.0567 data: 0.0002 max mem: 54684
+[07:52:21.540824] Epoch: [1] [3170/3229] lr: 0.000018 grad_norm: 0.7199 (0.7571) closs: 1.0092 (1.0496) time: 4.0456 data: 0.0002 max mem: 54684
+[07:53:01.782122] Epoch: [1] [3180/3229] lr: 0.000018 grad_norm: 0.7290 (0.7570) closs: 1.0158 (1.0495) time: 4.0342 data: 0.0002 max mem: 54684
+[07:53:42.520581] Epoch: [1] [3190/3229] lr: 0.000018 grad_norm: 0.7290 (0.7569) closs: 1.0676 (1.0495) time: 4.0489 data: 0.0002 max mem: 54684
+[07:54:22.768996] Epoch: [1] [3200/3229] lr: 0.000018 grad_norm: 0.7260 (0.7568) closs: 1.0775 (1.0495) time: 4.0493 data: 0.0002 max mem: 54684
+[07:55:03.308755] Epoch: [1] [3210/3229] lr: 0.000018 grad_norm: 0.7512 (0.7568) closs: 1.0424 (1.0494) time: 4.0393 data: 0.0002 max mem: 54684
+[07:55:44.129491] Epoch: [1] [3220/3229] lr: 0.000018 grad_norm: 0.7508 (0.7567) closs: 1.0736 (1.0494) time: 4.0680 data: 0.0001 max mem: 54684
+[07:56:16.699747] Epoch: [1] Total time: 3:40:05
+[07:56:16.736123] Averaged stats: lr: 0.000018 grad_norm: 0.7281 (0.7566) closs: 1.0383 (1.0483)
+[07:56:17.102213] model saved
+[07:56:18.915517] optimizer saved
+[07:56:18.916157] other rank-common saved
+[07:56:18.922035] rank-specific saved
+[07:56:18.936513] log_dir: ./output_dir
+[07:56:31.268719] Epoch: [2] [0/3229] lr: 0.000018 grad_norm: 0.8262 (0.8262) closs: 1.0084 (1.0084) time: 12.3313 data: 8.2426 max mem: 54684
+[07:57:12.183294] Epoch: [2] [10/3229] lr: 0.000018 grad_norm: 0.7358 (0.7439) closs: 1.0242 (1.0370) time: 4.8405 data: 0.7495 max mem: 54684
+[07:57:52.839749] Epoch: [2] [20/3229] lr: 0.000018 grad_norm: 0.7358 (0.7614) closs: 1.0264 (1.0467) time: 4.0785 data: 0.0002 max mem: 54684
+[07:58:33.435224] Epoch: [2] [30/3229] lr: 0.000018 grad_norm: 0.7811 (0.7651) closs: 1.0351 (1.0354) time: 4.0625 data: 0.0002 max mem: 54684
+[07:59:15.317025] Epoch: [2] [40/3229] lr: 0.000017 grad_norm: 0.7708 (0.7695) closs: 1.0460 (1.0453) time: 4.1238 data: 0.0002 max mem: 54684
+[07:59:56.906136] Epoch: [2] [50/3229] lr: 0.000017 grad_norm: 0.7742 (0.7745) closs: 1.0633 (1.0491) time: 4.1735 data: 0.0002 max mem: 54684
+[08:00:37.793068] Epoch: [2] [60/3229] lr: 0.000017 grad_norm: 0.7780 (0.7750) closs: 1.0396 (1.0518) time: 4.1237 data: 0.0002 max mem: 54684
+[08:01:17.989880] Epoch: [2] [70/3229] lr: 0.000017 grad_norm: 0.7406 (0.7669) closs: 1.0464 (1.0460) time: 4.0541 data: 0.0002 max mem: 54684
+[08:01:59.026748] Epoch: [2] [80/3229] lr: 0.000017 grad_norm: 0.7321 (0.7684) closs: 1.0689 (1.0462) time: 4.0616 data: 0.0002 max mem: 54684
+[08:02:39.476084] Epoch: [2] [90/3229] lr: 0.000017 grad_norm: 0.7710 (0.7672) closs: 1.0697 (1.0457) time: 4.0742 data: 0.0002 max mem: 54684
+[08:03:20.882001] Epoch: [2] [100/3229] lr: 0.000017 grad_norm: 0.7731 (0.7683) closs: 1.0697 (1.0507) time: 4.0927 data: 0.0002 max mem: 54684
+[08:04:01.732738] Epoch: [2] [110/3229] lr: 0.000017 grad_norm: 0.7636 (0.7693) closs: 1.0633 (1.0496) time: 4.1128 data: 0.0002 max mem: 54684
+[08:04:43.243616] Epoch: [2] [120/3229] lr: 0.000017 grad_norm: 0.7653 (0.7695) closs: 1.0444 (1.0471) time: 4.1180 data: 0.0002 max mem: 54684
+[08:05:24.640326] Epoch: [2] [130/3229] lr: 0.000017 grad_norm: 0.7895 (0.7719) closs: 1.0530 (1.0490) time: 4.1453 data: 0.0002 max mem: 54684
+[08:06:04.919803] Epoch: [2] [140/3229] lr: 0.000017 grad_norm: 0.7767 (0.7695) closs: 1.0625 (1.0491) time: 4.0837 data: 0.0002 max mem: 54684
+[08:06:45.485984] Epoch: [2] [150/3229] lr: 0.000017 grad_norm: 0.7487 (0.7669) closs: 1.0756 (1.0476) time: 4.0422 data: 0.0002 max mem: 54684
+[08:07:26.589317] Epoch: [2] [160/3229] lr: 0.000017 grad_norm: 0.7543 (0.7668) closs: 1.0570 (1.0491) time: 4.0834 data: 0.0002 max mem: 54684
+[08:08:07.809141] Epoch: [2] [170/3229] lr: 0.000017 grad_norm: 0.7705 (0.7656) closs: 1.0286 (1.0475) time: 4.1161 data: 0.0002 max mem: 54684
+[08:08:48.382644] Epoch: [2] [180/3229] lr: 0.000016 grad_norm: 0.7462 (0.7654) closs: 1.0195 (1.0463) time: 4.0896 data: 0.0002 max mem: 54684
+[08:09:29.267697] Epoch: [2] [190/3229] lr: 0.000016 grad_norm: 0.7566 (0.7639) closs: 1.0265 (1.0465) time: 4.0729 data: 0.0002 max mem: 54684
+[08:10:09.377776] Epoch: [2] [200/3229] lr: 0.000016 grad_norm: 0.7441 (0.7628) closs: 1.0265 (1.0435) time: 4.0497 data: 0.0002 max mem: 54684
+[08:10:49.302139] Epoch: [2] [210/3229] lr: 0.000016 grad_norm: 0.6802 (0.7596) closs: 0.9949 (1.0425) time: 4.0017 data: 0.0002 max mem: 54684
+[08:11:30.519323] Epoch: [2] [220/3229] lr: 0.000016 grad_norm: 0.7500 (0.7609) closs: 1.0888 (1.0456) time: 4.0570 data: 0.0002 max mem: 54684
+[08:12:11.704337] Epoch: [2] [230/3229] lr: 0.000016 grad_norm: 0.7773 (0.7609) closs: 1.0791 (1.0452) time: 4.1200 data: 0.0002 max mem: 54684
+[08:12:52.393373] Epoch: [2] [240/3229] lr: 0.000016 grad_norm: 0.7641 (0.7595) closs: 1.0536 (1.0445) time: 4.0936 data: 0.0002 max mem: 54684
+[08:13:33.064691] Epoch: [2] [250/3229] lr: 0.000016 grad_norm: 0.7542 (0.7585) closs: 1.0588 (1.0449) time: 4.0680 data: 0.0002 max mem: 54684
+[08:14:14.045473] Epoch: [2] [260/3229] lr: 0.000016 grad_norm: 0.7569 (0.7583) closs: 1.0562 (1.0460) time: 4.0825 data: 0.0002 max mem: 54684
+[08:14:54.905661] Epoch: [2] [270/3229] lr: 0.000016 grad_norm: 0.7574 (0.7579) closs: 1.0383 (1.0455) time: 4.0920 data: 0.0002 max mem: 54684
+[08:15:35.926717] Epoch: [2] [280/3229] lr: 0.000016 grad_norm: 0.7558 (0.7577) closs: 1.0300 (1.0455) time: 4.0940 data: 0.0002 max mem: 54684
+[08:16:16.814567] Epoch: [2] [290/3229] lr: 0.000016 grad_norm: 0.7416 (0.7564) closs: 1.0680 (1.0467) time: 4.0954 data: 0.0002 max mem: 54684
+[08:16:58.068460] Epoch: [2] [300/3229] lr: 0.000016 grad_norm: 0.7514 (0.7573) closs: 1.0594 (1.0468) time: 4.1070 data: 0.0002 max mem: 54684
+[08:17:38.927633] Epoch: [2] [310/3229] lr: 0.000016 grad_norm: 0.7797 (0.7577) closs: 1.0474 (1.0460) time: 4.1056 data: 0.0002 max mem: 54684
+[08:18:19.652536] Epoch: [2] [320/3229] lr: 0.000016 grad_norm: 0.7461 (0.7563) closs: 1.0455 (1.0458) time: 4.0791 data: 0.0002 max mem: 54684
+[08:19:00.882256] Epoch: [2] [330/3229] lr: 0.000015 grad_norm: 0.7702 (0.7581) closs: 1.0466 (1.0463) time: 4.0977 data: 0.0002 max mem: 54684
+[08:19:40.850985] Epoch: [2] [340/3229] lr: 0.000015 grad_norm: 0.7544 (0.7554) closs: 1.0466 (1.0460) time: 4.0599 data: 0.0002 max mem: 54684
+[08:20:21.704908] Epoch: [2] [350/3229] lr: 0.000015 grad_norm: 0.7372 (0.7563) closs: 1.0080 (1.0453) time: 4.0411 data: 0.0002 max mem: 54684
+[08:21:02.268285] Epoch: [2] [360/3229] lr: 0.000015 grad_norm: 0.8174 (0.7568) closs: 0.9763 (1.0430) time: 4.0708 data: 0.0002 max mem: 54684
+[08:21:42.525189] Epoch: [2] [370/3229] lr: 0.000015 grad_norm: 0.7369 (0.7568) closs: 0.9641 (1.0418) time: 4.0410 data: 0.0002 max mem: 54684
+[08:22:23.507510] Epoch: [2] [380/3229] lr: 0.000015 grad_norm: 0.7944 (0.7578) closs: 1.0548 (1.0421) time: 4.0619 data: 0.0002 max mem: 54684
+[08:23:04.788986] Epoch: [2] [390/3229] lr: 0.000015 grad_norm: 0.7995 (0.7582) closs: 1.0502 (1.0420) time: 4.1131 data: 0.0002 max mem: 54684
+[08:23:45.036107] Epoch: [2] [400/3229] lr: 0.000015 grad_norm: 0.7850 (0.7579) closs: 1.0068 (1.0413) time: 4.0764 data: 0.0002 max mem: 54684
+[08:24:25.660061] Epoch: [2] [410/3229] lr: 0.000015 grad_norm: 0.7850 (0.7576) closs: 1.0540 (1.0410) time: 4.0435 data: 0.0002 max mem: 54684
+[08:25:06.015694] Epoch: [2] [420/3229] lr: 0.000015 grad_norm: 0.7293 (0.7570) closs: 1.0468 (1.0396) time: 4.0489 data: 0.0002 max mem: 54684
+[08:25:46.999427] Epoch: [2] [430/3229] lr: 0.000015 grad_norm: 0.7238 (0.7570) closs: 1.0225 (1.0398) time: 4.0669 data: 0.0002 max mem: 54684
+[08:26:27.949780] Epoch: [2] [440/3229] lr: 0.000015 grad_norm: 0.7509 (0.7575) closs: 1.0472 (1.0399) time: 4.0966 data: 0.0002 max mem: 54684
+[08:27:08.849435] Epoch: [2] [450/3229] lr: 0.000015 grad_norm: 0.7973 (0.7579) closs: 1.0547 (1.0396) time: 4.0924 data: 0.0002 max mem: 54684
+[08:27:49.906838] Epoch: [2] [460/3229] lr: 0.000015 grad_norm: 0.7565 (0.7575) closs: 0.9997 (1.0391) time: 4.0978 data: 0.0002 max mem: 54684
+[08:28:30.907563] Epoch: [2] [470/3229] lr: 0.000015 grad_norm: 0.7846 (0.7587) closs: 1.0305 (1.0388) time: 4.1028 data: 0.0002 max mem: 54684
+[08:29:11.501494] Epoch: [2] [480/3229] lr: 0.000015 grad_norm: 0.7925 (0.7583) closs: 1.0507 (1.0382) time: 4.0797 data: 0.0002 max mem: 54684
+[08:29:52.450393] Epoch: [2] [490/3229] lr: 0.000014 grad_norm: 0.7658 (0.7582) closs: 1.0217 (1.0378) time: 4.0771 data: 0.0002 max mem: 54684
+[08:30:33.131092] Epoch: [2] [500/3229] lr: 0.000014 grad_norm: 0.7658 (0.7582) closs: 1.0195 (1.0377) time: 4.0814 data: 0.0002 max mem: 54684
+[08:31:13.731758] Epoch: [2] [510/3229] lr: 0.000014 grad_norm: 0.7490 (0.7574) closs: 1.0083 (1.0372) time: 4.0640 data: 0.0002 max mem: 54684
+[08:31:54.606631] Epoch: [2] [520/3229] lr: 0.000014 grad_norm: 0.7661 (0.7578) closs: 1.0242 (1.0374) time: 4.0737 data: 0.0002 max mem: 54684
+[08:32:35.652465] Epoch: [2] [530/3229] lr: 0.000014 grad_norm: 0.7661 (0.7574) closs: 1.0650 (1.0373) time: 4.0960 data: 0.0002 max mem: 54684
+[08:33:16.811929] Epoch: [2] [540/3229] lr: 0.000014 grad_norm: 0.7808 (0.7580) closs: 1.0536 (1.0379) time: 4.1102 data: 0.0002 max mem: 54684
+[08:33:58.098430] Epoch: [2] [550/3229] lr: 0.000014 grad_norm: 0.7890 (0.7584) closs: 1.0505 (1.0379) time: 4.1222 data: 0.0002 max mem: 54684
+[08:34:39.012102] Epoch: [2] [560/3229] lr: 0.000014 grad_norm: 0.7536 (0.7579) closs: 1.0541 (1.0380) time: 4.1099 data: 0.0002 max mem: 54684
+[08:35:20.411837] Epoch: [2] [570/3229] lr: 0.000014 grad_norm: 0.7575 (0.7583) closs: 1.0281 (1.0376) time: 4.1156 data: 0.0002 max mem: 54684
+[08:36:01.585232] Epoch: [2] [580/3229] lr: 0.000014 grad_norm: 0.7735 (0.7588) closs: 1.0281 (1.0379) time: 4.1286 data: 0.0002 max mem: 54684
+[08:36:42.866022] Epoch: [2] [590/3229] lr: 0.000014 grad_norm: 0.7978 (0.7597) closs: 1.0419 (1.0385) time: 4.1226 data: 0.0002 max mem: 54684
+[08:37:24.134897] Epoch: [2] [600/3229] lr: 0.000014 grad_norm: 0.7944 (0.7605) closs: 1.0606 (1.0390) time: 4.1274 data: 0.0002 max mem: 54684
+[08:38:05.564805] Epoch: [2] [610/3229] lr: 0.000014 grad_norm: 0.7800 (0.7610) closs: 1.0638 (1.0396) time: 4.1349 data: 0.0002 max mem: 54684
+[08:38:47.052757] Epoch: [2] [620/3229] lr: 0.000014 grad_norm: 0.7830 (0.7618) closs: 1.0701 (1.0400) time: 4.1458 data: 0.0002 max mem: 54684
+[08:39:28.070271] Epoch: [2] [630/3229] lr: 0.000014 grad_norm: 0.7867 (0.7620) closs: 1.0398 (1.0394) time: 4.1252 data: 0.0002 max mem: 54684
+[08:40:09.431657] Epoch: [2] [640/3229] lr: 0.000014 grad_norm: 0.7636 (0.7620) closs: 1.0375 (1.0393) time: 4.1189 data: 0.0002 max mem: 54684
+[08:40:50.029316] Epoch: [2] [650/3229] lr: 0.000013 grad_norm: 0.7721 (0.7620) closs: 1.0376 (1.0392) time: 4.0979 data: 0.0002 max mem: 54684
+[08:41:31.253698] Epoch: [2] [660/3229] lr: 0.000013 grad_norm: 0.8149 (0.7639) closs: 1.0376 (1.0390) time: 4.0910 data: 0.0002 max mem: 54684
+[08:42:11.674957] Epoch: [2] [670/3229] lr: 0.000013 grad_norm: 0.7910 (0.7640) closs: 0.9759 (1.0377) time: 4.0822 data: 0.0002 max mem: 54684
+[08:42:52.587904] Epoch: [2] [680/3229] lr: 0.000013 grad_norm: 0.8076 (0.7644) closs: 1.0142 (1.0376) time: 4.0666 data: 0.0002 max mem: 54684
+[08:43:33.204090] Epoch: [2] [690/3229] lr: 0.000013 grad_norm: 0.7571 (0.7643) closs: 1.0510 (1.0373) time: 4.0764 data: 0.0002 max mem: 54684
+[08:44:13.752622] Epoch: [2] [700/3229] lr: 0.000013 grad_norm: 0.7474 (0.7643) closs: 1.0141 (1.0369) time: 4.0582 data: 0.0002 max mem: 54684
+[08:44:54.180675] Epoch: [2] [710/3229] lr: 0.000013 grad_norm: 0.7605 (0.7641) closs: 1.0175 (1.0365) time: 4.0488 data: 0.0002 max mem: 54684
+[08:45:34.801694] Epoch: [2] [720/3229] lr: 0.000013 grad_norm: 0.7678 (0.7645) closs: 1.0230 (1.0360) time: 4.0524 data: 0.0002 max mem: 54684
+[08:46:15.397922] Epoch: [2] [730/3229] lr: 0.000013 grad_norm: 0.7292 (0.7638) closs: 1.0149 (1.0362) time: 4.0608 data: 0.0002 max mem: 54684
+[08:46:56.241797] Epoch: [2] [740/3229] lr: 0.000013 grad_norm: 0.7162 (0.7634) closs: 1.0468 (1.0362) time: 4.0719 data: 0.0002 max mem: 54684
+[08:47:37.401308] Epoch: [2] [750/3229] lr: 0.000013 grad_norm: 0.7125 (0.7625) closs: 1.0468 (1.0361) time: 4.1001 data: 0.0002 max mem: 54684
+[08:48:17.975003] Epoch: [2] [760/3229] lr: 0.000013 grad_norm: 0.7230 (0.7623) closs: 0.9952 (1.0357) time: 4.0866 data: 0.0002 max mem: 54684
+[08:48:58.599971] Epoch: [2] [770/3229] lr: 0.000013 grad_norm: 0.7391 (0.7618) closs: 0.9999 (1.0354) time: 4.0599 data: 0.0002 max mem: 54684
+[08:49:39.723248] Epoch: [2] [780/3229] lr: 0.000013 grad_norm: 0.7642 (0.7620) closs: 1.0226 (1.0358) time: 4.0873 data: 0.0002 max mem: 54684
+[08:50:20.513740] Epoch: [2] [790/3229] lr: 0.000013 grad_norm: 0.7684 (0.7622) closs: 1.0461 (1.0352) time: 4.0956 data: 0.0002 max mem: 54684
+[08:51:00.788007] Epoch: [2] [800/3229] lr: 0.000013 grad_norm: 0.7667 (0.7621) closs: 1.0416 (1.0349) time: 4.0532 data: 0.0002 max mem: 54684
+[08:51:42.084938] Epoch: [2] [810/3229] lr: 0.000012 grad_norm: 0.7514 (0.7623) closs: 1.0492 (1.0351) time: 4.0785 data: 0.0002 max mem: 54684
+[08:52:22.683852] Epoch: [2] [820/3229] lr: 0.000012 grad_norm: 0.7899 (0.7626) closs: 1.0387 (1.0346) time: 4.0947 data: 0.0002 max mem: 54684
+[08:53:03.136708] Epoch: [2] [830/3229] lr: 0.000012 grad_norm: 0.7319 (0.7623) closs: 1.0123 (1.0341) time: 4.0525 data: 0.0002 max mem: 54684
+[08:53:44.061472] Epoch: [2] [840/3229] lr: 0.000012 grad_norm: 0.7243 (0.7622) closs: 1.0273 (1.0344) time: 4.0688 data: 0.0002 max mem: 54684
+[08:54:24.982996] Epoch: [2] [850/3229] lr: 0.000012 grad_norm: 0.7494 (0.7625) closs: 1.0501 (1.0346) time: 4.0923 data: 0.0002 max mem: 54684
+[08:55:05.938568] Epoch: [2] [860/3229] lr: 0.000012 grad_norm: 0.7728 (0.7625) closs: 1.0500 (1.0344) time: 4.0938 data: 0.0002 max mem: 54684
+[08:55:47.280626] Epoch: [2] [870/3229] lr: 0.000012 grad_norm: 0.7828 (0.7630) closs: 1.0542 (1.0347) time: 4.1148 data: 0.0002 max mem: 54684
+[08:56:27.932374] Epoch: [2] [880/3229] lr: 0.000012 grad_norm: 0.7772 (0.7629) closs: 1.0480 (1.0344) time: 4.0996 data: 0.0002 max mem: 54684
+[08:57:09.157992] Epoch: [2] [890/3229] lr: 0.000012 grad_norm: 0.7710 (0.7629) closs: 1.0352 (1.0346) time: 4.0938 data: 0.0002 max mem: 54684
+[08:57:50.048167] Epoch: [2] [900/3229] lr: 0.000012 grad_norm: 0.7654 (0.7630) closs: 1.0552 (1.0348) time: 4.1057 data: 0.0002 max mem: 54684
+[08:58:31.441896] Epoch: [2] [910/3229] lr: 0.000012 grad_norm: 0.7748 (0.7631) closs: 1.0552 (1.0352) time: 4.1141 data: 0.0002 max mem: 54684
+[08:59:12.358818] Epoch: [2] [920/3229] lr: 0.000012 grad_norm: 0.7834 (0.7633) closs: 1.0587 (1.0353) time: 4.1155 data: 0.0002 max mem: 54684
+[08:59:53.384497] Epoch: [2] [930/3229] lr: 0.000012 grad_norm: 0.7600 (0.7639) closs: 1.0610 (1.0358) time: 4.0971 data: 0.0002 max mem: 54684
+[09:00:34.296386] Epoch: [2] [940/3229] lr: 0.000012 grad_norm: 0.7367 (0.7634) closs: 1.0361 (1.0355) time: 4.0968 data: 0.0002 max mem: 54684
+[09:01:15.300491] Epoch: [2] [950/3229] lr: 0.000012 grad_norm: 0.7532 (0.7637) closs: 1.0273 (1.0357) time: 4.0957 data: 0.0002 max mem: 54684
+[09:01:56.214466] Epoch: [2] [960/3229] lr: 0.000012 grad_norm: 0.8101 (0.7637) closs: 1.0385 (1.0358) time: 4.0958 data: 0.0002 max mem: 54684
+[09:02:37.461375] Epoch: [2] [970/3229] lr: 0.000012 grad_norm: 0.7784 (0.7640) closs: 1.0385 (1.0359) time: 4.1080 data: 0.0002 max mem: 54684
+[09:03:18.133192] Epoch: [2] [980/3229] lr: 0.000012 grad_norm: 0.7784 (0.7641) closs: 1.0226 (1.0356) time: 4.0959 data: 0.0002 max mem: 54684
+[09:03:58.519062] Epoch: [2] [990/3229] lr: 0.000011 grad_norm: 0.7619 (0.7637) closs: 1.0127 (1.0353) time: 4.0528 data: 0.0002 max mem: 54684
+[09:04:39.761660] Epoch: [2] [1000/3229] lr: 0.000011 grad_norm: 0.7440 (0.7638) closs: 1.0497 (1.0358) time: 4.0814 data: 0.0002 max mem: 54684
+[09:05:20.726355] Epoch: [2] [1010/3229] lr: 0.000011 grad_norm: 0.7440 (0.7637) closs: 1.0649 (1.0361) time: 4.1103 data: 0.0002 max mem: 54684
+[09:06:00.952110] Epoch: [2] [1020/3229] lr: 0.000011 grad_norm: 0.7606 (0.7640) closs: 1.0374 (1.0356) time: 4.0595 data: 0.0002 max mem: 54684
+[09:06:41.558460] Epoch: [2] [1030/3229] lr: 0.000011 grad_norm: 0.7384 (0.7636) closs: 1.0224 (1.0354) time: 4.0415 data: 0.0002 max mem: 54684
+[09:07:22.293637] Epoch: [2] [1040/3229] lr: 0.000011 grad_norm: 0.7438 (0.7637) closs: 1.0008 (1.0349) time: 4.0670 data: 0.0002 max mem: 54684
+[09:08:03.484133] Epoch: [2] [1050/3229] lr: 0.000011 grad_norm: 0.7482 (0.7634) closs: 1.0428 (1.0352) time: 4.0962 data: 0.0002 max mem: 54684
+[09:08:44.382701] Epoch: [2] [1060/3229] lr: 0.000011 grad_norm: 0.7485 (0.7636) closs: 1.0560 (1.0352) time: 4.1044 data: 0.0002 max mem: 54684
+[09:09:25.808204] Epoch: [2] [1070/3229] lr: 0.000011 grad_norm: 0.7534 (0.7633) closs: 1.0265 (1.0352) time: 4.1161 data: 0.0002 max mem: 54684
+[09:10:06.913588] Epoch: [2] [1080/3229] lr: 0.000011 grad_norm: 0.7569 (0.7634) closs: 1.0402 (1.0353) time: 4.1265 data: 0.0002 max mem: 54684
+[09:10:47.784050] Epoch: [2] [1090/3229] lr: 0.000011 grad_norm: 0.7794 (0.7635) closs: 1.0821 (1.0357) time: 4.0987 data: 0.0002 max mem: 54684
+[09:11:29.039121] Epoch: [2] [1100/3229] lr: 0.000011 grad_norm: 0.7814 (0.7636) closs: 1.0679 (1.0358) time: 4.1062 data: 0.0002 max mem: 54684
+[09:12:10.373961] Epoch: [2] [1110/3229] lr: 0.000011 grad_norm: 0.7643 (0.7639) closs: 1.0508 (1.0360) time: 4.1294 data: 0.0002 max mem: 54684
+[09:12:51.449473] Epoch: [2] [1120/3229] lr: 0.000011 grad_norm: 0.7562 (0.7638) closs: 1.0466 (1.0358) time: 4.1205 data: 0.0002 max mem: 54684
+[09:13:32.329462] Epoch: [2] [1130/3229] lr: 0.000011 grad_norm: 0.7943 (0.7646) closs: 1.0470 (1.0359) time: 4.0977 data: 0.0002 max mem: 54684
+[09:14:12.321437] Epoch: [2] [1140/3229] lr: 0.000011 grad_norm: 0.8189 (0.7647) closs: 1.0094 (1.0354) time: 4.0435 data: 0.0002 max mem: 54684
+[09:14:53.353944] Epoch: [2] [1150/3229] lr: 0.000011 grad_norm: 0.8029 (0.7651) closs: 0.9823 (1.0353) time: 4.0512 data: 0.0002 max mem: 54684
+[09:15:34.257366] Epoch: [2] [1160/3229] lr: 0.000011 grad_norm: 0.8163 (0.7653) closs: 1.0455 (1.0354) time: 4.0967 data: 0.0002 max mem: 54684
+[09:16:14.798544] Epoch: [2] [1170/3229] lr: 0.000011 grad_norm: 0.8128 (0.7652) closs: 1.0318 (1.0353) time: 4.0722 data: 0.0002 max mem: 54684
+[09:16:55.872961] Epoch: [2] [1180/3229] lr: 0.000010 grad_norm: 0.7566 (0.7651) closs: 1.0318 (1.0353) time: 4.0807 data: 0.0002 max mem: 54684
+[09:17:36.505833] Epoch: [2] [1190/3229] lr: 0.000010 grad_norm: 0.7433 (0.7650) closs: 1.0522 (1.0355) time: 4.0853 data: 0.0002 max mem: 54684
+[09:18:17.759241] Epoch: [2] [1200/3229] lr: 0.000010 grad_norm: 0.7621 (0.7651) closs: 1.0598 (1.0356) time: 4.0942 data: 0.0002 max mem: 54684
+[09:18:59.258398] Epoch: [2] [1210/3229] lr: 0.000010 grad_norm: 0.7694 (0.7652) closs: 1.0676 (1.0360) time: 4.1376 data: 0.0002 max mem: 54684
+[09:19:40.458187] Epoch: [2] [1220/3229] lr: 0.000010 grad_norm: 0.7466 (0.7650) closs: 1.0545 (1.0359) time: 4.1349 data: 0.0002 max mem: 54684
+[09:20:21.687691] Epoch: [2] [1230/3229] lr: 0.000010 grad_norm: 0.7782 (0.7653) closs: 1.0469 (1.0362) time: 4.1214 data: 0.0002 max mem: 54684
+[09:21:02.306360] Epoch: [2] [1240/3229] lr: 0.000010 grad_norm: 0.7785 (0.7651) closs: 1.0469 (1.0361) time: 4.0923 data: 0.0002 max mem: 54684
+[09:21:43.494418] Epoch: [2] [1250/3229] lr: 0.000010 grad_norm: 0.7669 (0.7654) closs: 1.0804 (1.0366) time: 4.0903 data: 0.0002 max mem: 54684
+[09:22:24.223450] Epoch: [2] [1260/3229] lr: 0.000010 grad_norm: 0.7949 (0.7653) closs: 1.0789 (1.0365) time: 4.0958 data: 0.0002 max mem: 54684
+[09:23:05.137178] Epoch: [2] [1270/3229] lr: 0.000010 grad_norm: 0.7839 (0.7655) closs: 1.0398 (1.0366) time: 4.0821 data: 0.0002 max mem: 54684
+[09:23:45.713305] Epoch: [2] [1280/3229] lr: 0.000010 grad_norm: 0.7820 (0.7654) closs: 1.0345 (1.0364) time: 4.0744 data: 0.0002 max mem: 54684
+[09:24:26.883899] Epoch: [2] [1290/3229] lr: 0.000010 grad_norm: 0.7848 (0.7655) closs: 1.0622 (1.0368) time: 4.0873 data: 0.0002 max mem: 54684
+[09:25:07.882033] Epoch: [2] [1300/3229] lr: 0.000010 grad_norm: 0.7626 (0.7653) closs: 1.0601 (1.0370) time: 4.1084 data: 0.0002 max mem: 54684
+[09:25:48.788547] Epoch: [2] [1310/3229] lr: 0.000010 grad_norm: 0.7626 (0.7655) closs: 1.0317 (1.0368) time: 4.0952 data: 0.0002 max mem: 54684
+[09:26:29.719803] Epoch: [2] [1320/3229] lr: 0.000010 grad_norm: 0.7593 (0.7652) closs: 1.0151 (1.0369) time: 4.0918 data: 0.0002 max mem: 54684
+[09:27:10.299624] Epoch: [2] [1330/3229] lr: 0.000010 grad_norm: 0.7602 (0.7652) closs: 1.0215 (1.0365) time: 4.0755 data: 0.0002 max mem: 54684
+[09:27:50.751310] Epoch: [2] [1340/3229] lr: 0.000010 grad_norm: 0.7643 (0.7651) closs: 0.9875 (1.0362) time: 4.0515 data: 0.0002 max mem: 54684
+[09:28:31.398595] Epoch: [2] [1350/3229] lr: 0.000010 grad_norm: 0.7591 (0.7650) closs: 0.9875 (1.0358) time: 4.0549 data: 0.0002 max mem: 54684
+[09:29:12.305782] Epoch: [2] [1360/3229] lr: 0.000010 grad_norm: 0.7741 (0.7651) closs: 1.0085 (1.0361) time: 4.0777 data: 0.0001 max mem: 54684
+[09:29:53.571306] Epoch: [2] [1370/3229] lr: 0.000010 grad_norm: 0.7792 (0.7653) closs: 1.0583 (1.0362) time: 4.1086 data: 0.0002 max mem: 54684
+[09:30:34.513851] Epoch: [2] [1380/3229] lr: 0.000009 grad_norm: 0.8100 (0.7656) closs: 1.0583 (1.0366) time: 4.1103 data: 0.0001 max mem: 54684
+[09:31:15.446793] Epoch: [2] [1390/3229] lr: 0.000009 grad_norm: 0.8007 (0.7656) closs: 1.0600 (1.0367) time: 4.0937 data: 0.0001 max mem: 54684
+[09:31:56.718293] Epoch: [2] [1400/3229] lr: 0.000009 grad_norm: 0.7785 (0.7658) closs: 1.0538 (1.0368) time: 4.1101 data: 0.0001 max mem: 54684
+[09:32:37.066477] Epoch: [2] [1410/3229] lr: 0.000009 grad_norm: 0.7777 (0.7654) closs: 1.0459 (1.0365) time: 4.0809 data: 0.0002 max mem: 54684
+[09:33:18.072485] Epoch: [2] [1420/3229] lr: 0.000009 grad_norm: 0.7777 (0.7656) closs: 1.0269 (1.0366) time: 4.0676 data: 0.0001 max mem: 54684
+[09:33:58.688418] Epoch: [2] [1430/3229] lr: 0.000009 grad_norm: 0.7950 (0.7658) closs: 1.0548 (1.0367) time: 4.0810 data: 0.0002 max mem: 54684
+[09:34:40.028938] Epoch: [2] [1440/3229] lr: 0.000009 grad_norm: 0.7547 (0.7655) closs: 1.0359 (1.0365) time: 4.0977 data: 0.0002 max mem: 54684
+[09:35:20.652994] Epoch: [2] [1450/3229] lr: 0.000009 grad_norm: 0.7292 (0.7654) closs: 1.0334 (1.0363) time: 4.0982 data: 0.0001 max mem: 54684
+[09:36:00.716699] Epoch: [2] [1460/3229] lr: 0.000009 grad_norm: 0.7434 (0.7651) closs: 0.9868 (1.0360) time: 4.0343 data: 0.0002 max mem: 54684
+[09:36:41.682870] Epoch: [2] [1470/3229] lr: 0.000009 grad_norm: 0.7452 (0.7652) closs: 0.9904 (1.0359) time: 4.0514 data: 0.0001 max mem: 54684
+[09:37:22.665178] Epoch: [2] [1480/3229] lr: 0.000009 grad_norm: 0.7568 (0.7653) closs: 1.0227 (1.0358) time: 4.0974 data: 0.0001 max mem: 54684
+[09:38:03.979174] Epoch: [2] [1490/3229] lr: 0.000009 grad_norm: 0.7775 (0.7653) closs: 1.0474 (1.0360) time: 4.1147 data: 0.0002 max mem: 54684
+[09:38:45.030747] Epoch: [2] [1500/3229] lr: 0.000009 grad_norm: 0.7560 (0.7653) closs: 1.0474 (1.0360) time: 4.1182 data: 0.0002 max mem: 54684
+[09:39:25.831863] Epoch: [2] [1510/3229] lr: 0.000009 grad_norm: 0.7644 (0.7652) closs: 1.0295 (1.0358) time: 4.0926 data: 0.0001 max mem: 54684
+[09:40:06.337863] Epoch: [2] [1520/3229] lr: 0.000009 grad_norm: 0.7842 (0.7653) closs: 1.0512 (1.0360) time: 4.0653 data: 0.0001 max mem: 54684
+[09:40:46.718752] Epoch: [2] [1530/3229] lr: 0.000009 grad_norm: 0.7452 (0.7650) closs: 1.0411 (1.0359) time: 4.0443 data: 0.0002 max mem: 54684
+[09:41:27.479284] Epoch: [2] [1540/3229] lr: 0.000009 grad_norm: 0.7451 (0.7647) closs: 1.0411 (1.0359) time: 4.0570 data: 0.0002 max mem: 54684
+[09:42:08.444014] Epoch: [2] [1550/3229] lr: 0.000009 grad_norm: 0.7502 (0.7648) closs: 1.0511 (1.0361) time: 4.0862 data: 0.0002 max mem: 54684
+[09:42:49.492626] Epoch: [2] [1560/3229] lr: 0.000009 grad_norm: 0.7626 (0.7648) closs: 1.0354 (1.0359) time: 4.1006 data: 0.0002 max mem: 54684
+[09:43:30.218498] Epoch: [2] [1570/3229] lr: 0.000009 grad_norm: 0.7822 (0.7649) closs: 1.0210 (1.0362) time: 4.0886 data: 0.0002 max mem: 54684
+[09:44:11.014656] Epoch: [2] [1580/3229] lr: 0.000009 grad_norm: 0.7599 (0.7648) closs: 1.0582 (1.0360) time: 4.0760 data: 0.0001 max mem: 54684
+[09:44:52.430295] Epoch: [2] [1590/3229] lr: 0.000009 grad_norm: 0.7556 (0.7649) closs: 1.0126 (1.0362) time: 4.1105 data: 0.0001 max mem: 54684
+[09:45:32.708025] Epoch: [2] [1600/3229] lr: 0.000009 grad_norm: 0.7689 (0.7650) closs: 1.0061 (1.0360) time: 4.0846 data: 0.0002 max mem: 54684
+[09:46:13.311218] Epoch: [2] [1610/3229] lr: 0.000008 grad_norm: 0.7788 (0.7652) closs: 1.0035 (1.0361) time: 4.0440 data: 0.0002 max mem: 54684
+[09:46:55.090230] Epoch: [2] [1620/3229] lr: 0.000008 grad_norm: 0.7534 (0.7651) closs: 1.0395 (1.0361) time: 4.1190 data: 0.0002 max mem: 54684
+[09:47:36.090209] Epoch: [2] [1630/3229] lr: 0.000008 grad_norm: 0.7519 (0.7651) closs: 1.0562 (1.0361) time: 4.1389 data: 0.0002 max mem: 54684
+[09:48:17.317549] Epoch: [2] [1640/3229] lr: 0.000008 grad_norm: 0.7637 (0.7651) closs: 1.0442 (1.0362) time: 4.1113 data: 0.0002 max mem: 54684
+[09:48:58.147416] Epoch: [2] [1650/3229] lr: 0.000008 grad_norm: 0.7460 (0.7650) closs: 1.0442 (1.0361) time: 4.1028 data: 0.0002 max mem: 54684
+[09:49:38.823915] Epoch: [2] [1660/3229] lr: 0.000008 grad_norm: 0.7707 (0.7653) closs: 0.9922 (1.0360) time: 4.0752 data: 0.0002 max mem: 54684
+[09:50:20.328398] Epoch: [2] [1670/3229] lr: 0.000008 grad_norm: 0.7983 (0.7656) closs: 1.0671 (1.0362) time: 4.1090 data: 0.0002 max mem: 54684
+[09:51:01.223284] Epoch: [2] [1680/3229] lr: 0.000008 grad_norm: 0.8044 (0.7656) closs: 1.0319 (1.0360) time: 4.1199 data: 0.0001 max mem: 54684
+[09:51:41.770661] Epoch: [2] [1690/3229] lr: 0.000008 grad_norm: 0.7965 (0.7656) closs: 1.0218 (1.0362) time: 4.0720 data: 0.0002 max mem: 54684
+[09:52:23.631344] Epoch: [2] [1700/3229] lr: 0.000008 grad_norm: 0.7809 (0.7658) closs: 1.0307 (1.0362) time: 4.1203 data: 0.0002 max mem: 54684
+[09:53:03.971107] Epoch: [2] [1710/3229] lr: 0.000008 grad_norm: 0.7531 (0.7655) closs: 1.0542 (1.0362) time: 4.1099 data: 0.0001 max mem: 54684
+[09:53:44.185439] Epoch: [2] [1720/3229] lr: 0.000008 grad_norm: 0.7503 (0.7655) closs: 1.0655 (1.0362) time: 4.0276 data: 0.0002 max mem: 54684
+[09:54:25.528421] Epoch: [2] [1730/3229] lr: 0.000008 grad_norm: 0.7650 (0.7655) closs: 1.0793 (1.0364) time: 4.0778 data: 0.0002 max mem: 54684
+[09:55:06.203734] Epoch: [2] [1740/3229] lr: 0.000008 grad_norm: 0.7650 (0.7657) closs: 1.0568 (1.0362) time: 4.1008 data: 0.0002 max mem: 54684
+[09:55:46.816613] Epoch: [2] [1750/3229] lr: 0.000008 grad_norm: 0.7542 (0.7655) closs: 1.0362 (1.0364) time: 4.0644 data: 0.0002 max mem: 54684
+[09:56:27.712286] Epoch: [2] [1760/3229] lr: 0.000008 grad_norm: 0.7409 (0.7656) closs: 1.0798 (1.0365) time: 4.0754 data: 0.0002 max mem: 54684
+[09:57:09.011822] Epoch: [2] [1770/3229] lr: 0.000008 grad_norm: 0.7625 (0.7657) closs: 1.0369 (1.0366) time: 4.1097 data: 0.0002 max mem: 54684
+[09:57:50.358048] Epoch: [2] [1780/3229] lr: 0.000008 grad_norm: 0.7780 (0.7660) closs: 1.0474 (1.0366) time: 4.1322 data: 0.0002 max mem: 54684
+[09:58:30.935480] Epoch: [2] [1790/3229] lr: 0.000008 grad_norm: 0.7607 (0.7658) closs: 1.0474 (1.0366) time: 4.0961 data: 0.0002 max mem: 54684
+[09:59:12.191909] Epoch: [2] [1800/3229] lr: 0.000008 grad_norm: 0.7704 (0.7660) closs: 1.0390 (1.0366) time: 4.0916 data: 0.0002 max mem: 54684
+[09:59:53.188925] Epoch: [2] [1810/3229] lr: 0.000008 grad_norm: 0.7880 (0.7661) closs: 1.0390 (1.0365) time: 4.1126 data: 0.0002 max mem: 54684
+[10:00:34.288625] Epoch: [2] [1820/3229] lr: 0.000008 grad_norm: 0.8177 (0.7663) closs: 1.0470 (1.0366) time: 4.1048 data: 0.0002 max mem: 54684
+[10:01:15.216189] Epoch: [2] [1830/3229] lr: 0.000008 grad_norm: 0.7464 (0.7662) closs: 1.0408 (1.0365) time: 4.1013 data: 0.0002 max mem: 54684
+[10:01:56.484523] Epoch: [2] [1840/3229] lr: 0.000008 grad_norm: 0.7634 (0.7664) closs: 1.0408 (1.0367) time: 4.1097 data: 0.0002 max mem: 54684
+[10:02:36.820581] Epoch: [2] [1850/3229] lr: 0.000008 grad_norm: 0.7761 (0.7663) closs: 1.0366 (1.0366) time: 4.0802 data: 0.0002 max mem: 54684
+[10:03:17.820103] Epoch: [2] [1860/3229] lr: 0.000007 grad_norm: 0.7713 (0.7665) closs: 1.0248 (1.0365) time: 4.0667 data: 0.0002 max mem: 54684
+[10:03:58.463854] Epoch: [2] [1870/3229] lr: 0.000007 grad_norm: 0.7713 (0.7666) closs: 1.0134 (1.0362) time: 4.0821 data: 0.0002 max mem: 54684
+[10:04:39.335047] Epoch: [2] [1880/3229] lr: 0.000007 grad_norm: 0.7584 (0.7665) closs: 1.0266 (1.0362) time: 4.0757 data: 0.0002 max mem: 54684
+[10:05:20.773148] Epoch: [2] [1890/3229] lr: 0.000007 grad_norm: 0.7505 (0.7666) closs: 1.0527 (1.0364) time: 4.1154 data: 0.0002 max mem: 54684
+[10:06:01.774211] Epoch: [2] [1900/3229] lr: 0.000007 grad_norm: 0.7830 (0.7667) closs: 1.0807 (1.0365) time: 4.1219 data: 0.0002 max mem: 54684
+[10:06:43.015666] Epoch: [2] [1910/3229] lr: 0.000007 grad_norm: 0.7959 (0.7668) closs: 1.0580 (1.0366) time: 4.1121 data: 0.0002 max mem: 54684
+[10:07:22.898606] Epoch: [2] [1920/3229] lr: 0.000007 grad_norm: 0.7359 (0.7665) closs: 1.0269 (1.0364) time: 4.0562 data: 0.0002 max mem: 54684
+[10:08:03.733126] Epoch: [2] [1930/3229] lr: 0.000007 grad_norm: 0.7359 (0.7667) closs: 1.0022 (1.0362) time: 4.0358 data: 0.0002 max mem: 54684
+[10:08:44.009814] Epoch: [2] [1940/3229] lr: 0.000007 grad_norm: 0.7834 (0.7665) closs: 1.0467 (1.0361) time: 4.0555 data: 0.0002 max mem: 54684
+[10:09:24.934480] Epoch: [2] [1950/3229] lr: 0.000007 grad_norm: 0.7408 (0.7665) closs: 1.0467 (1.0360) time: 4.0600 data: 0.0002 max mem: 54684
+[10:10:05.513930] Epoch: [2] [1960/3229] lr: 0.000007 grad_norm: 0.7564 (0.7664) closs: 1.0360 (1.0360) time: 4.0751 data: 0.0002 max mem: 54684
+[10:10:47.318139] Epoch: [2] [1970/3229] lr: 0.000007 grad_norm: 0.7470 (0.7664) closs: 1.0260 (1.0361) time: 4.1191 data: 0.0002 max mem: 54684
+[10:11:28.590511] Epoch: [2] [1980/3229] lr: 0.000007 grad_norm: 0.7601 (0.7667) closs: 1.0797 (1.0363) time: 4.1538 data: 0.0002 max mem: 54684
+[10:12:09.496980] Epoch: [2] [1990/3229] lr: 0.000007 grad_norm: 0.7790 (0.7667) closs: 1.0679 (1.0363) time: 4.1089 data: 0.0002 max mem: 54684
+[10:12:50.748606] Epoch: [2] [2000/3229] lr: 0.000007 grad_norm: 0.7970 (0.7670) closs: 1.0297 (1.0364) time: 4.1078 data: 0.0002 max mem: 54684
+[10:13:32.182612] Epoch: [2] [2010/3229] lr: 0.000007 grad_norm: 0.8054 (0.7671) closs: 1.0496 (1.0365) time: 4.1342 data: 0.0002 max mem: 54684
+[10:14:13.410630] Epoch: [2] [2020/3229] lr: 0.000007 grad_norm: 0.7780 (0.7672) closs: 1.0542 (1.0367) time: 4.1330 data: 0.0002 max mem: 54684
+[10:14:54.361523] Epoch: [2] [2030/3229] lr: 0.000007 grad_norm: 0.8040 (0.7677) closs: 1.0542 (1.0368) time: 4.1089 data: 0.0002 max mem: 54684
+[10:15:35.931648] Epoch: [2] [2040/3229] lr: 0.000007 grad_norm: 0.8409 (0.7681) closs: 1.0486 (1.0369) time: 4.1260 data: 0.0002 max mem: 54684
+[10:16:16.746598] Epoch: [2] [2050/3229] lr: 0.000007 grad_norm: 0.7987 (0.7682) closs: 1.0486 (1.0368) time: 4.1192 data: 0.0002 max mem: 54684
+[10:16:58.012617] Epoch: [2] [2060/3229] lr: 0.000007 grad_norm: 0.8159 (0.7685) closs: 1.0473 (1.0369) time: 4.1040 data: 0.0002 max mem: 54684
+[10:17:38.676913] Epoch: [2] [2070/3229] lr: 0.000007 grad_norm: 0.7728 (0.7682) closs: 1.0073 (1.0366) time: 4.0965 data: 0.0002 max mem: 54684
+[10:18:20.035619] Epoch: [2] [2080/3229] lr: 0.000007 grad_norm: 0.7435 (0.7684) closs: 1.0386 (1.0368) time: 4.1011 data: 0.0002 max mem: 54684
+[10:19:01.109902] Epoch: [2] [2090/3229] lr: 0.000007 grad_norm: 0.7514 (0.7683) closs: 1.0462 (1.0367) time: 4.1216 data: 0.0002 max mem: 54684
+[10:19:42.145840] Epoch: [2] [2100/3229] lr: 0.000007 grad_norm: 0.7514 (0.7683) closs: 1.0461 (1.0368) time: 4.1054 data: 0.0002 max mem: 54684
+[10:20:21.912487] Epoch: [2] [2110/3229] lr: 0.000007 grad_norm: 0.7724 (0.7682) closs: 1.0394 (1.0367) time: 4.0401 data: 0.0002 max mem: 54684
+[10:21:02.609058] Epoch: [2] [2120/3229] lr: 0.000007 grad_norm: 0.7465 (0.7681) closs: 1.0394 (1.0367) time: 4.0231 data: 0.0002 max mem: 54684
+[10:21:43.356036] Epoch: [2] [2130/3229] lr: 0.000007 grad_norm: 0.7602 (0.7681) closs: 1.0324 (1.0366) time: 4.0721 data: 0.0002 max mem: 54684
+[10:22:24.153129] Epoch: [2] [2140/3229] lr: 0.000007 grad_norm: 0.7624 (0.7679) closs: 1.0118 (1.0365) time: 4.0771 data: 0.0002 max mem: 54684
+[10:23:04.987144] Epoch: [2] [2150/3229] lr: 0.000007 grad_norm: 0.7632 (0.7680) closs: 1.0690 (1.0368) time: 4.0815 data: 0.0002 max mem: 54684
+[10:23:46.301006] Epoch: [2] [2160/3229] lr: 0.000007 grad_norm: 0.7924 (0.7682) closs: 1.0757 (1.0370) time: 4.1073 data: 0.0002 max mem: 54684
+[10:24:27.430902] Epoch: [2] [2170/3229] lr: 0.000007 grad_norm: 0.7768 (0.7682) closs: 1.0597 (1.0370) time: 4.1221 data: 0.0002 max mem: 54684
+[10:25:08.536310] Epoch: [2] [2180/3229] lr: 0.000006 grad_norm: 0.7764 (0.7683) closs: 1.0400 (1.0370) time: 4.1117 data: 0.0002 max mem: 54684
+[10:25:49.054789] Epoch: [2] [2190/3229] lr: 0.000006 grad_norm: 0.7764 (0.7683) closs: 1.0429 (1.0370) time: 4.0811 data: 0.0002 max mem: 54684
+[10:26:28.974982] Epoch: [2] [2200/3229] lr: 0.000006 grad_norm: 0.7571 (0.7680) closs: 0.9973 (1.0367) time: 4.0219 data: 0.0002 max mem: 54684
+[10:27:10.250973] Epoch: [2] [2210/3229] lr: 0.000006 grad_norm: 0.7298 (0.7679) closs: 1.0033 (1.0367) time: 4.0597 data: 0.0002 max mem: 54684
+[10:27:51.496280] Epoch: [2] [2220/3229] lr: 0.000006 grad_norm: 0.7623 (0.7681) closs: 1.0618 (1.0369) time: 4.1260 data: 0.0002 max mem: 54684
+[10:28:32.694107] Epoch: [2] [2230/3229] lr: 0.000006 grad_norm: 0.8018 (0.7682) closs: 1.1010 (1.0371) time: 4.1221 data: 0.0002 max mem: 54684
+[10:29:13.690626] Epoch: [2] [2240/3229] lr: 0.000006 grad_norm: 0.7677 (0.7682) closs: 1.0752 (1.0372) time: 4.1097 data: 0.0002 max mem: 54684
+[10:29:55.103277] Epoch: [2] [2250/3229] lr: 0.000006 grad_norm: 0.7532 (0.7683) closs: 1.1006 (1.0373) time: 4.1204 data: 0.0002 max mem: 54684
+[10:30:36.017463] Epoch: [2] [2260/3229] lr: 0.000006 grad_norm: 0.7805 (0.7683) closs: 1.0913 (1.0374) time: 4.1163 data: 0.0002 max mem: 54684
+[10:31:17.232848] Epoch: [2] [2270/3229] lr: 0.000006 grad_norm: 0.7978 (0.7685) closs: 1.0719 (1.0376) time: 4.1064 data: 0.0002 max mem: 54684
+[10:31:57.554662] Epoch: [2] [2280/3229] lr: 0.000006 grad_norm: 0.7693 (0.7683) closs: 1.0401 (1.0375) time: 4.0768 data: 0.0002 max mem: 54684
+[10:32:38.982581] Epoch: [2] [2290/3229] lr: 0.000006 grad_norm: 0.7707 (0.7684) closs: 1.0389 (1.0375) time: 4.0874 data: 0.0002 max mem: 54684
+[10:33:19.900400] Epoch: [2] [2300/3229] lr: 0.000006 grad_norm: 0.7707 (0.7684) closs: 1.0546 (1.0376) time: 4.1172 data: 0.0002 max mem: 54684
+[10:34:00.422956] Epoch: [2] [2310/3229] lr: 0.000006 grad_norm: 0.7404 (0.7683) closs: 1.0432 (1.0375) time: 4.0720 data: 0.0002 max mem: 54684
+[10:34:41.838857] Epoch: [2] [2320/3229] lr: 0.000006 grad_norm: 0.7949 (0.7686) closs: 1.0168 (1.0374) time: 4.0969 data: 0.0002 max mem: 54684
+[10:35:22.205090] Epoch: [2] [2330/3229] lr: 0.000006 grad_norm: 0.7612 (0.7684) closs: 1.0087 (1.0372) time: 4.0890 data: 0.0002 max mem: 54684
+[10:36:03.158561] Epoch: [2] [2340/3229] lr: 0.000006 grad_norm: 0.7306 (0.7685) closs: 1.0268 (1.0373) time: 4.0659 data: 0.0002 max mem: 54684
+[10:36:44.005255] Epoch: [2] [2350/3229] lr: 0.000006 grad_norm: 0.7974 (0.7686) closs: 1.0339 (1.0373) time: 4.0899 data: 0.0002 max mem: 54684
+[10:37:24.060589] Epoch: [2] [2360/3229] lr: 0.000006 grad_norm: 0.7845 (0.7683) closs: 0.9994 (1.0371) time: 4.0450 data: 0.0002 max mem: 54684
+[10:38:04.953222] Epoch: [2] [2370/3229] lr: 0.000006 grad_norm: 0.7452 (0.7684) closs: 0.9928 (1.0371) time: 4.0473 data: 0.0002 max mem: 54684
+[10:38:45.575555] Epoch: [2] [2380/3229] lr: 0.000006 grad_norm: 0.7639 (0.7682) closs: 1.0037 (1.0371) time: 4.0757 data: 0.0002 max mem: 54684
+[10:39:26.779560] Epoch: [2] [2390/3229] lr: 0.000006 grad_norm: 0.7854 (0.7685) closs: 1.0731 (1.0372) time: 4.0913 data: 0.0002 max mem: 54684
+[10:40:07.277159] Epoch: [2] [2400/3229] lr: 0.000006 grad_norm: 0.7837 (0.7683) closs: 1.0746 (1.0371) time: 4.0850 data: 0.0002 max mem: 54684
+[10:40:47.656246] Epoch: [2] [2410/3229] lr: 0.000006 grad_norm: 0.7215 (0.7681) closs: 1.0545 (1.0370) time: 4.0438 data: 0.0002 max mem: 54684
+[10:41:28.261216] Epoch: [2] [2420/3229] lr: 0.000006 grad_norm: 0.7847 (0.7682) closs: 1.0446 (1.0370) time: 4.0491 data: 0.0002 max mem: 54684
+[10:42:09.186454] Epoch: [2] [2430/3229] lr: 0.000006 grad_norm: 0.7847 (0.7682) closs: 1.0480 (1.0370) time: 4.0764 data: 0.0002 max mem: 54684
+[10:42:50.581684] Epoch: [2] [2440/3229] lr: 0.000006 grad_norm: 0.7911 (0.7684) closs: 1.0551 (1.0370) time: 4.1160 data: 0.0002 max mem: 54684
+[10:43:31.660191] Epoch: [2] [2450/3229] lr: 0.000006 grad_norm: 0.7996 (0.7685) closs: 1.0322 (1.0370) time: 4.1236 data: 0.0002 max mem: 54684
+[10:44:12.906840] Epoch: [2] [2460/3229] lr: 0.000006 grad_norm: 0.7953 (0.7685) closs: 1.0322 (1.0370) time: 4.1162 data: 0.0002 max mem: 54684
+[10:44:54.081525] Epoch: [2] [2470/3229] lr: 0.000006 grad_norm: 0.7730 (0.7686) closs: 1.0670 (1.0371) time: 4.1210 data: 0.0002 max mem: 54684
+[10:45:35.195478] Epoch: [2] [2480/3229] lr: 0.000006 grad_norm: 0.7885 (0.7687) closs: 1.0536 (1.0371) time: 4.1144 data: 0.0002 max mem: 54684
+[10:46:16.528666] Epoch: [2] [2490/3229] lr: 0.000006 grad_norm: 0.7880 (0.7687) closs: 1.0536 (1.0373) time: 4.1223 data: 0.0002 max mem: 54684
+[10:46:56.834636] Epoch: [2] [2500/3229] lr: 0.000006 grad_norm: 0.7751 (0.7687) closs: 1.0654 (1.0372) time: 4.0819 data: 0.0002 max mem: 54684
+[10:47:38.101086] Epoch: [2] [2510/3229] lr: 0.000006 grad_norm: 0.7705 (0.7688) closs: 1.0349 (1.0373) time: 4.0786 data: 0.0002 max mem: 54684
+[10:48:18.776789] Epoch: [2] [2520/3229] lr: 0.000006 grad_norm: 0.7878 (0.7688) closs: 1.0416 (1.0373) time: 4.0970 data: 0.0002 max mem: 54684
+[10:48:59.471432] Epoch: [2] [2530/3229] lr: 0.000006 grad_norm: 0.7899 (0.7688) closs: 1.0064 (1.0373) time: 4.0685 data: 0.0002 max mem: 54684
+[10:49:40.706118] Epoch: [2] [2540/3229] lr: 0.000006 grad_norm: 0.7632 (0.7690) closs: 0.9999 (1.0372) time: 4.0964 data: 0.0002 max mem: 54684
+[10:50:21.879580] Epoch: [2] [2550/3229] lr: 0.000006 grad_norm: 0.8102 (0.7692) closs: 1.0065 (1.0372) time: 4.1203 data: 0.0002 max mem: 54684
+[10:51:03.280777] Epoch: [2] [2560/3229] lr: 0.000006 grad_norm: 0.7923 (0.7692) closs: 1.0337 (1.0373) time: 4.1287 data: 0.0002 max mem: 54684
+[10:51:44.513039] Epoch: [2] [2570/3229] lr: 0.000006 grad_norm: 0.7990 (0.7697) closs: 1.0381 (1.0373) time: 4.1316 data: 0.0002 max mem: 54684
+[10:52:25.106616] Epoch: [2] [2580/3229] lr: 0.000006 grad_norm: 0.8055 (0.7697) closs: 1.0194 (1.0372) time: 4.0912 data: 0.0002 max mem: 54684
+[10:53:05.456512] Epoch: [2] [2590/3229] lr: 0.000006 grad_norm: 0.7706 (0.7696) closs: 1.0125 (1.0371) time: 4.0471 data: 0.0002 max mem: 54684
+[10:53:46.140129] Epoch: [2] [2600/3229] lr: 0.000006 grad_norm: 0.7412 (0.7694) closs: 1.0323 (1.0371) time: 4.0516 data: 0.0002 max mem: 54684
+[10:54:27.649177] Epoch: [2] [2610/3229] lr: 0.000006 grad_norm: 0.7625 (0.7696) closs: 1.0566 (1.0371) time: 4.1096 data: 0.0002 max mem: 54684
+[10:55:08.432178] Epoch: [2] [2620/3229] lr: 0.000006 grad_norm: 0.8024 (0.7697) closs: 1.0176 (1.0371) time: 4.1145 data: 0.0002 max mem: 54684
+[10:55:49.037158] Epoch: [2] [2630/3229] lr: 0.000005 grad_norm: 0.7726 (0.7697) closs: 1.0176 (1.0369) time: 4.0693 data: 0.0002 max mem: 54684
+[10:56:30.399477] Epoch: [2] [2640/3229] lr: 0.000005 grad_norm: 0.7389 (0.7696) closs: 1.0279 (1.0371) time: 4.0983 data: 0.0002 max mem: 54684
+[10:57:11.914088] Epoch: [2] [2650/3229] lr: 0.000005 grad_norm: 0.7500 (0.7697) closs: 1.0760 (1.0372) time: 4.1438 data: 0.0002 max mem: 54684
+[10:57:53.051298] Epoch: [2] [2660/3229] lr: 0.000005 grad_norm: 0.7937 (0.7698) closs: 1.0704 (1.0373) time: 4.1325 data: 0.0002 max mem: 54684
+[10:58:34.322783] Epoch: [2] [2670/3229] lr: 0.000005 grad_norm: 0.8010 (0.7699) closs: 1.0704 (1.0374) time: 4.1204 data: 0.0002 max mem: 54684
+[10:59:15.014004] Epoch: [2] [2680/3229] lr: 0.000005 grad_norm: 0.8010 (0.7699) closs: 1.0494 (1.0374) time: 4.0981 data: 0.0002 max mem: 54684
+[10:59:55.439695] Epoch: [2] [2690/3229] lr: 0.000005 grad_norm: 0.7508 (0.7698) closs: 1.0343 (1.0373) time: 4.0558 data: 0.0002 max mem: 54684
+[11:00:36.264761] Epoch: [2] [2700/3229] lr: 0.000005 grad_norm: 0.7522 (0.7699) closs: 1.0379 (1.0374) time: 4.0625 data: 0.0002 max mem: 54684
+[11:01:17.279401] Epoch: [2] [2710/3229] lr: 0.000005 grad_norm: 0.7584 (0.7698) closs: 1.0813 (1.0375) time: 4.0919 data: 0.0002 max mem: 54684
+[11:01:58.168258] Epoch: [2] [2720/3229] lr: 0.000005 grad_norm: 0.7584 (0.7697) closs: 1.0343 (1.0375) time: 4.0951 data: 0.0002 max mem: 54684
+[11:02:39.456366] Epoch: [2] [2730/3229] lr: 0.000005 grad_norm: 0.7529 (0.7697) closs: 1.0343 (1.0375) time: 4.1088 data: 0.0002 max mem: 54684
+[11:03:20.315865] Epoch: [2] [2740/3229] lr: 0.000005 grad_norm: 0.7555 (0.7697) closs: 1.0462 (1.0375) time: 4.1073 data: 0.0002 max mem: 54684
+[11:04:02.063978] Epoch: [2] [2750/3229] lr: 0.000005 grad_norm: 0.7999 (0.7700) closs: 1.0785 (1.0377) time: 4.1303 data: 0.0002 max mem: 54684
+[11:04:42.801143] Epoch: [2] [2760/3229] lr: 0.000005 grad_norm: 0.7942 (0.7699) closs: 1.0842 (1.0378) time: 4.1242 data: 0.0002 max mem: 54684
+[11:05:23.770788] Epoch: [2] [2770/3229] lr: 0.000005 grad_norm: 0.7815 (0.7700) closs: 1.0259 (1.0377) time: 4.0853 data: 0.0002 max mem: 54684
+[11:06:04.580722] Epoch: [2] [2780/3229] lr: 0.000005 grad_norm: 0.7815 (0.7699) closs: 1.0562 (1.0378) time: 4.0889 data: 0.0002 max mem: 54684
+[11:06:45.682994] Epoch: [2] [2790/3229] lr: 0.000005 grad_norm: 0.7723 (0.7699) closs: 1.0843 (1.0379) time: 4.0955 data: 0.0002 max mem: 54684
+[11:07:26.664509] Epoch: [2] [2800/3229] lr: 0.000005 grad_norm: 0.7851 (0.7700) closs: 1.0863 (1.0381) time: 4.1041 data: 0.0002 max mem: 54684
+[11:08:07.660594] Epoch: [2] [2810/3229] lr: 0.000005 grad_norm: 0.7649 (0.7699) closs: 1.0497 (1.0380) time: 4.0988 data: 0.0002 max mem: 54684
+[11:08:48.441366] Epoch: [2] [2820/3229] lr: 0.000005 grad_norm: 0.7366 (0.7698) closs: 1.0300 (1.0380) time: 4.0888 data: 0.0002 max mem: 54684
+[11:09:29.171313] Epoch: [2] [2830/3229] lr: 0.000005 grad_norm: 0.7514 (0.7698) closs: 1.0300 (1.0378) time: 4.0755 data: 0.0002 max mem: 54684
+[11:10:09.985836] Epoch: [2] [2840/3229] lr: 0.000005 grad_norm: 0.7665 (0.7697) closs: 0.9997 (1.0377) time: 4.0772 data: 0.0002 max mem: 54684
+[11:10:51.230047] Epoch: [2] [2850/3229] lr: 0.000005 grad_norm: 0.7750 (0.7698) closs: 1.0333 (1.0378) time: 4.1029 data: 0.0002 max mem: 54684
+[11:11:31.685949] Epoch: [2] [2860/3229] lr: 0.000005 grad_norm: 0.7660 (0.7697) closs: 1.0579 (1.0378) time: 4.0849 data: 0.0002 max mem: 54684
+[11:12:12.974896] Epoch: [2] [2870/3229] lr: 0.000005 grad_norm: 0.7565 (0.7698) closs: 1.0466 (1.0378) time: 4.0872 data: 0.0002 max mem: 54684
+[11:12:54.373299] Epoch: [2] [2880/3229] lr: 0.000005 grad_norm: 0.7817 (0.7699) closs: 1.0407 (1.0379) time: 4.1343 data: 0.0002 max mem: 54684
+[11:13:35.283572] Epoch: [2] [2890/3229] lr: 0.000005 grad_norm: 0.7849 (0.7698) closs: 1.0329 (1.0377) time: 4.1154 data: 0.0002 max mem: 54684
+[11:14:16.118014] Epoch: [2] [2900/3229] lr: 0.000005 grad_norm: 0.7950 (0.7700) closs: 1.0221 (1.0376) time: 4.0872 data: 0.0002 max mem: 54684
+[11:14:57.161039] Epoch: [2] [2910/3229] lr: 0.000005 grad_norm: 0.8078 (0.7701) closs: 1.0416 (1.0376) time: 4.0938 data: 0.0002 max mem: 54684
+[11:15:38.521781] Epoch: [2] [2920/3229] lr: 0.000005 grad_norm: 0.7894 (0.7701) closs: 1.0502 (1.0377) time: 4.1201 data: 0.0002 max mem: 54684
+[11:16:19.106288] Epoch: [2] [2930/3229] lr: 0.000005 grad_norm: 0.7824 (0.7701) closs: 1.0595 (1.0377) time: 4.0972 data: 0.0002 max mem: 54684
+[11:17:00.252881] Epoch: [2] [2940/3229] lr: 0.000005 grad_norm: 0.7871 (0.7701) closs: 1.0469 (1.0376) time: 4.0865 data: 0.0002 max mem: 54684
+[11:17:40.642013] Epoch: [2] [2950/3229] lr: 0.000005 grad_norm: 0.7919 (0.7701) closs: 1.0446 (1.0375) time: 4.0767 data: 0.0002 max mem: 54684
+[11:18:20.871868] Epoch: [2] [2960/3229] lr: 0.000005 grad_norm: 0.7919 (0.7701) closs: 0.9901 (1.0374) time: 4.0309 data: 0.0002 max mem: 54684
+[11:19:01.539668] Epoch: [2] [2970/3229] lr: 0.000005 grad_norm: 0.8446 (0.7702) closs: 1.0398 (1.0373) time: 4.0448 data: 0.0002 max mem: 54684
+[11:19:42.663674] Epoch: [2] [2980/3229] lr: 0.000005 grad_norm: 0.7707 (0.7702) closs: 1.0450 (1.0374) time: 4.0895 data: 0.0002 max mem: 54684
+[11:20:23.730795] Epoch: [2] [2990/3229] lr: 0.000005 grad_norm: 0.7557 (0.7702) closs: 1.0745 (1.0375) time: 4.1095 data: 0.0002 max mem: 54684
+[11:21:04.740958] Epoch: [2] [3000/3229] lr: 0.000005 grad_norm: 0.7990 (0.7704) closs: 1.0864 (1.0375) time: 4.1038 data: 0.0002 max mem: 54684
+[11:21:45.020285] Epoch: [2] [3010/3229] lr: 0.000005 grad_norm: 0.8011 (0.7705) closs: 0.9966 (1.0374) time: 4.0644 data: 0.0002 max mem: 54684
+[11:22:25.881577] Epoch: [2] [3020/3229] lr: 0.000005 grad_norm: 0.7592 (0.7705) closs: 1.0149 (1.0375) time: 4.0570 data: 0.0002 max mem: 54684
+[11:23:07.305216] Epoch: [2] [3030/3229] lr: 0.000005 grad_norm: 0.7607 (0.7706) closs: 1.0597 (1.0376) time: 4.1142 data: 0.0002 max mem: 54684
+[11:23:48.348449] Epoch: [2] [3040/3229] lr: 0.000005 grad_norm: 0.7679 (0.7705) closs: 1.0597 (1.0375) time: 4.1233 data: 0.0002 max mem: 54684
+[11:24:29.596151] Epoch: [2] [3050/3229] lr: 0.000005 grad_norm: 0.7677 (0.7705) closs: 1.0172 (1.0375) time: 4.1145 data: 0.0002 max mem: 54684
+[11:25:10.442041] Epoch: [2] [3060/3229] lr: 0.000005 grad_norm: 0.7748 (0.7706) closs: 1.0172 (1.0375) time: 4.1046 data: 0.0002 max mem: 54684
+[11:25:51.521912] Epoch: [2] [3070/3229] lr: 0.000005 grad_norm: 0.8210 (0.7707) closs: 1.0409 (1.0374) time: 4.0962 data: 0.0002 max mem: 54684
+[11:26:32.713656] Epoch: [2] [3080/3229] lr: 0.000005 grad_norm: 0.7547 (0.7706) closs: 1.0294 (1.0373) time: 4.1135 data: 0.0002 max mem: 54684
+[11:27:13.511213] Epoch: [2] [3090/3229] lr: 0.000005 grad_norm: 0.7538 (0.7706) closs: 1.0294 (1.0374) time: 4.0994 data: 0.0002 max mem: 54684
+[11:27:54.680369] Epoch: [2] [3100/3229] lr: 0.000005 grad_norm: 0.7631 (0.7707) closs: 1.0731 (1.0376) time: 4.0983 data: 0.0002 max mem: 54684
+[11:28:35.049341] Epoch: [2] [3110/3229] lr: 0.000005 grad_norm: 0.7713 (0.7705) closs: 1.0357 (1.0373) time: 4.0768 data: 0.0002 max mem: 54684
+[11:29:15.418890] Epoch: [2] [3120/3229] lr: 0.000005 grad_norm: 0.7078 (0.7705) closs: 1.0218 (1.0374) time: 4.0369 data: 0.0002 max mem: 54684
+[11:29:56.061537] Epoch: [2] [3130/3229] lr: 0.000005 grad_norm: 0.7244 (0.7703) closs: 1.0440 (1.0374) time: 4.0505 data: 0.0002 max mem: 54684
+[11:30:36.293253] Epoch: [2] [3140/3229] lr: 0.000005 grad_norm: 0.7669 (0.7705) closs: 1.0339 (1.0373) time: 4.0437 data: 0.0002 max mem: 54684
+[11:31:16.325807] Epoch: [2] [3150/3229] lr: 0.000005 grad_norm: 0.7848 (0.7703) closs: 1.0084 (1.0372) time: 4.0131 data: 0.0004 max mem: 54684
+[11:31:57.799087] Epoch: [2] [3160/3229] lr: 0.000005 grad_norm: 0.7618 (0.7704) closs: 1.0084 (1.0372) time: 4.0752 data: 0.0003 max mem: 54684
+[11:32:38.264034] Epoch: [2] [3170/3229] lr: 0.000005 grad_norm: 0.7903 (0.7705) closs: 1.0556 (1.0371) time: 4.0968 data: 0.0002 max mem: 54684
+[11:33:18.885238] Epoch: [2] [3180/3229] lr: 0.000005 grad_norm: 0.7486 (0.7705) closs: 1.0379 (1.0372) time: 4.0542 data: 0.0002 max mem: 54684
+[11:33:59.955564] Epoch: [2] [3190/3229] lr: 0.000005 grad_norm: 0.7458 (0.7706) closs: 1.0379 (1.0372) time: 4.0845 data: 0.0002 max mem: 54684
+[11:34:41.295405] Epoch: [2] [3200/3229] lr: 0.000005 grad_norm: 0.7953 (0.7707) closs: 1.0546 (1.0373) time: 4.1204 data: 0.0002 max mem: 54684
+[11:35:22.419516] Epoch: [2] [3210/3229] lr: 0.000005 grad_norm: 0.7931 (0.7707) closs: 1.0572 (1.0373) time: 4.1231 data: 0.0002 max mem: 54684
+[11:36:02.526362] Epoch: [2] [3220/3229] lr: 0.000005 grad_norm: 0.7683 (0.7707) closs: 1.0178 (1.0372) time: 4.0615 data: 0.0001 max mem: 54684
+[11:36:35.973354] Epoch: [2] Total time: 3:40:17
+[11:36:35.974297] Averaged stats: lr: 0.000005 grad_norm: 0.7420 (0.7708) closs: 1.0624 (1.0381)
+[11:36:36.337101] model saved
+[11:36:38.023619] optimizer saved
+[11:36:38.024212] other rank-common saved
+[11:36:38.029179] rank-specific saved
+[11:36:38.029388] Training time 11:00:31
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..76baca48914c9ea8944b781bd72bdcc53122ffd6
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0e65a2cbc92bd926b57acf3f986bccab80793cafe0e2e8b0f18566fafb58cc9
+size 90930987
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..60fcd5447400048b70bd12781a7ac2ff3f591896
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e474f0071285386afbebdb6c6fb925be5ef3e9f3349a22c36fbffe48d77ea7c9
+size 204320439
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..75738418626b6e3173bda41a86ec5da7dc593ab8
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac78a8280454755c93a56b940b311201a0a8911b1b5f05c35d04486388b998fe
+size 1751
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch0/rank-specific-00007-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..111c6c89cd3eab9b2e2e78b3c93b55e42b7179cc
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8fc318f143f893d3305b5abda9853ef6ba090d582b39122bfe0ef61ef2620d5
+size 90930987
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b4645edcecd610180b6946730ba37462aaf675f3
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2fdf6406551f7380e09fda1858fac263c8d47f335664c8f1f26058d416bb70e
+size 204320439
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..50bd574ffb5cad6148a8e51c9ab05e3d80027b40
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f168ff92d4b19bd4c714a344bfcdc36e1203ddcff5c9504a63acf3bdfb043b3
+size 1751
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch1/rank-specific-00007-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.model.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d9e5d36c34d582a7b6e6b39a2d4a9186a4e338d4
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.model.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c07fd4364e9b806715c985cd4a4905c02bc2c67014227d4322effc78e87d42f4
+size 90930987
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.optimizer.pth
new file mode 100644
index 0000000000000000000000000000000000000000..92472b6308ca7910f824edde08c94c6fc4473912
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.optimizer.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b3ffb824593175223cc1cc760088a45e55bb33fcdd62e37e42249c03c9c9d36
+size 204320439
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.other.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d31ff745058632b1ad1e193fae6bdeb5d0a8a172
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/consolidated.00-of-01.other.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d63f55f7f82591356144cd8ce6e7acba5e908efe4759350a3af15f31d4015567
+size 1751
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00000-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00000-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00001-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00001-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00002-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00002-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00003-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00003-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00004-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00004-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00005-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00005-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00006-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09
--- /dev/null
+++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00006-of-00008.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04
+size 537
diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00007-of-00008.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899
--- /dev/null
+++
b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch2/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9dfdab397433ffafcd9aaaa95d61dea3ffa4e9b --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4092488fa46d10b7694923c74532faf866919d6c39c9a6d52bc4ab1e505bcb45 +size 90930987 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..42bf93bb5ab71a68c015eabd2d345ca309073368 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8abcacc8b53a2d27740c8543bb5adcb88e2ab89e9aa5cd403e30ed2d9563dc1c +size 204320439 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..8c07662de2e6d6147924de4c5427234211f7d032 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27afc0d923f1973f23206d7bbc02b246e988a3fe94eebdedc4e14b31fa3b5801 +size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/epoch3/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/log.txt b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..63c991be766a6e7b09f62eb27253c47ba558cf0e --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/log.txt @@ -0,0 +1,4 @@ +{"train_lr": 2.49692118226601e-05, "train_grad_norm": 1.03953114467595, "train_closs": 0.8988287092961849, "epoch": 0, "val_lr": 2.49692118226601e-05, "val_grad_norm": 1.03953114467595, "val_closs": 0.8988287092961849} 
+{"train_lr": 4.6114274981403966e-05, "train_grad_norm": 0.9151975991837497, "train_closs": 0.854513919164468, "epoch": 1, "val_lr": 4.6114274981403966e-05, "val_grad_norm": 0.9151975991837497, "val_closs": 0.854513919164468} +{"train_lr": 2.751385467980297e-05, "train_grad_norm": 0.8807328767670787, "train_closs": 0.8423879001418064, "epoch": 2, "val_lr": 2.751385467980297e-05, "val_grad_norm": 0.8807328767670787, "val_closs": 0.8423879001418064} +{"train_lr": 8.899579698398978e-06, "train_grad_norm": 0.9092244758394551, "train_closs": 0.8364003172804937, "epoch": 3, "val_lr": 8.899579698398978e-06, "val_grad_norm": 0.9092244758394551, "val_closs": 0.8364003172804937} diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/output.log b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/output.log new file mode 100644 index 0000000000000000000000000000000000000000..3ce76611f6b1af197c27e3fedae6374f65ada327 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/output.log @@ -0,0 +1,648 @@ +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +| distributed init (rank 1): env://, gpu 1 +| distributed init (rank 6): env://, gpu 6 +| distributed init (rank 0): env://, gpu 0 +| distributed init (rank 5): env://, gpu 5 +| distributed init (rank 7): env://, gpu 7 +| distributed init (rank 3): env://, gpu 3 +| distributed init (rank 4): env://, gpu 4 +| distributed init (rank 2): env://, gpu 2 +[05:56:44.962935] > initializing model parallel with size 1 +[05:56:44.963012] > initializing ddp with size 8 +[05:56:44.963019] > initializing pipeline with size 1 +[05:56:45.119503] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory +[05:56:45.119584] Namespace(batch_size=8, +accum_iter=1, +llama_type='llama_peft', +llama_config=['../checkpoints/llama2/Llama-2-13b/params.json', +'configs/model/finetune/sg/llamaPeft_normBiasLora.json'], +no_visual=True, +tokenizer_path='../checkpoints/llama2/Llama-2-13b/tokenizer.model', +pretrained_path='../checkpoints/llama2/Llama-2-13b/', +pretrained_type='meta_ori', +weight_decay=0.02, +lr=5e-05, +min_lr=5e-06, +epochs=4, +warmup_epochs=1.0, +clip_grad=2, +max_words=512, +dialog=False, +data_config='configs/data/finetune/sg/alpaca.yaml', +output_dir='output/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B', +log_dir='./output_dir', +save_interval=1, +only_save_trainable=True, +device='cuda', +seed=0, +resume='', +num_workers=24, +pin_mem=True, +world_size=8, +local_rank=-1, +dist_on_itp=False, +dist_url='env://', +model_parallel_size=1, +data_parallel='sdp', +precision='bf16', +checkpointing=True, +quant=True, +rank=0, +gpu=0, +distributed=True, +dist_backend='nccl') +[05:56:45.120384] Start initialization. +[05:56:45.120416] ## Processing on RANK 0. +[05:56:45.129261] Model Args: + ModelArgs(dim=5120, n_layers=40, n_heads=40, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=16, bias_tuning=True) +[05:58:19.701205] Model is Peft: True +[05:58:19.709591] Trainable parameter count : 65131520 (local rank), 65131520 (all). +[05:58:19.822258] ## Load pretrained from ../checkpoints/llama2/Llama-2-13b/ +[05:58:52.688570] ## Quantizing model to 4bit! 
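
Note that every .pth entry in this diff is a Git LFS pointer rather than the tensor payload itself: the tracked file holds exactly three key-value lines (version, oid sha256, size in bytes), and the real checkpoint is fetched by git lfs pull. Below is a minimal Python sketch for inspecting such a pointer without fetching it; the helper name read_lfs_pointer is illustrative and not part of the repository.

from pathlib import Path

def read_lfs_pointer(path):
    """Parse a Git LFS pointer file into a dict of its three fields."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        if line.strip():
            key, _, value = line.partition(" ")
            fields[key] = value
    fields["size"] = int(fields["size"])  # byte size of the real payload
    return fields

info = read_lfs_pointer(
    "finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/"
    "epoch1/consolidated.00-of-01.model.pth"
)
print(info["oid"], info["size"])  # sha256:d8fc318f..., 90930987
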
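The per-epoch records in log.txt above are JSON-lines: one object per epoch with learning rate, gradient norm, and causal loss under train_*/val_* keys. A short sketch for loading them (field names are taken from the records themselves; the function name is illustrative):

import json

def load_epoch_metrics(path):
    """Read a JSON-lines training log into a list of per-epoch dicts."""
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]

for rec in load_epoch_metrics(
        "finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_13B/log.txt"):
    print(f"epoch {rec['epoch']}: train_closs={rec['train_closs']:.4f} "
          f"lr={rec['train_lr']:.2e}")

# From the four records above, train_closs decreases monotonically:
# 0.8988 -> 0.8545 -> 0.8424 -> 0.8364 across epochs 0-3.
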
+ Qunatization Process: 0%| | 0/1047 [00:00 +[06:27:21.584239] Start training for 4 epochs +[06:27:21.595470] log_dir: ./output_dir +[06:27:27.291530] Epoch: [0] [0/812] lr: 0.000000 grad_norm: 1.9510 (1.9510) closs: 1.0616 (1.0616) time: 5.6952 data: 1.5912 max mem: 18825 +[06:27:48.233874] Epoch: [0] [10/812] lr: 0.000001 grad_norm: 2.1544 (2.1439) closs: 1.0616 (1.0547) time: 2.4215 data: 0.1448 max mem: 28042 +[06:28:09.492825] Epoch: [0] [20/812] lr: 0.000001 grad_norm: 2.0091 (2.0561) closs: 0.9999 (1.0350) time: 2.1100 data: 0.0002 max mem: 28042 +[06:28:30.750007] Epoch: [0] [30/812] lr: 0.000002 grad_norm: 1.9742 (2.0773) closs: 1.0489 (1.0550) time: 2.1257 data: 0.0002 max mem: 28042 +[06:28:51.915590] Epoch: [0] [40/812] lr: 0.000002 grad_norm: 1.9236 (2.0170) closs: 1.0628 (1.0608) time: 2.1210 data: 0.0002 max mem: 28042 +[06:29:13.261311] Epoch: [0] [50/812] lr: 0.000003 grad_norm: 1.6794 (1.9642) closs: 1.0594 (1.0599) time: 2.1255 data: 0.0002 max mem: 28042 +[06:29:34.413120] Epoch: [0] [60/812] lr: 0.000004 grad_norm: 1.5823 (1.8870) closs: 1.0342 (1.0539) time: 2.1248 data: 0.0002 max mem: 28042 +[06:29:55.626442] Epoch: [0] [70/812] lr: 0.000004 grad_norm: 1.4000 (1.8062) closs: 1.0269 (1.0558) time: 2.1181 data: 0.0002 max mem: 28042 +[06:30:16.918661] Epoch: [0] [80/812] lr: 0.000005 grad_norm: 1.2024 (1.7370) closs: 1.0211 (1.0531) time: 2.1252 data: 0.0002 max mem: 28042 +[06:30:38.196705] Epoch: [0] [90/812] lr: 0.000006 grad_norm: 1.1287 (1.6695) closs: 0.9826 (1.0414) time: 2.1284 data: 0.0002 max mem: 28042 +[06:30:59.467468] Epoch: [0] [100/812] lr: 0.000006 grad_norm: 1.0506 (1.6085) closs: 0.9512 (1.0363) time: 2.1274 data: 0.0002 max mem: 28042 +[06:31:20.727971] Epoch: [0] [110/812] lr: 0.000007 grad_norm: 0.9873 (1.5522) closs: 0.9416 (1.0241) time: 2.1265 data: 0.0002 max mem: 28042 +[06:31:41.936817] Epoch: [0] [120/812] lr: 0.000007 grad_norm: 0.9233 (1.5061) closs: 0.9447 (1.0219) time: 2.1234 data: 0.0002 max mem: 28042 +[06:32:03.041717] Epoch: [0] [130/812] lr: 0.000008 grad_norm: 0.9836 (1.4640) closs: 0.9609 (1.0167) time: 2.1156 data: 0.0002 max mem: 28042 +[06:32:24.295834] Epoch: [0] [140/812] lr: 0.000009 grad_norm: 0.9415 (1.4258) closs: 0.8978 (1.0079) time: 2.1179 data: 0.0002 max mem: 28042 +[06:32:45.543342] Epoch: [0] [150/812] lr: 0.000009 grad_norm: 0.8967 (1.3994) closs: 0.8940 (1.0025) time: 2.1250 data: 0.0002 max mem: 28042 +[06:33:06.837739] Epoch: [0] [160/812] lr: 0.000010 grad_norm: 0.9221 (1.3720) closs: 0.9130 (0.9991) time: 2.1270 data: 0.0002 max mem: 28042 +[06:33:28.086917] Epoch: [0] [170/812] lr: 0.000010 grad_norm: 0.9860 (1.3484) closs: 0.9068 (0.9924) time: 2.1271 data: 0.0002 max mem: 28042 +[06:33:49.368902] Epoch: [0] [180/812] lr: 0.000011 grad_norm: 0.9860 (1.3266) closs: 0.8552 (0.9843) time: 2.1265 data: 0.0002 max mem: 28042 +[06:34:10.555964] Epoch: [0] [190/812] lr: 0.000012 grad_norm: 0.9269 (1.3048) closs: 0.8552 (0.9786) time: 2.1234 data: 0.0002 max mem: 28042 +[06:34:31.919181] Epoch: [0] [200/812] lr: 0.000012 grad_norm: 0.9269 (1.2870) closs: 0.9133 (0.9764) time: 2.1274 data: 0.0002 max mem: 28042 +[06:34:53.191153] Epoch: [0] [210/812] lr: 0.000013 grad_norm: 0.9518 (1.2708) closs: 0.9159 (0.9724) time: 2.1317 data: 0.0002 max mem: 28042 +[06:35:14.531503] Epoch: [0] [220/812] lr: 0.000014 grad_norm: 0.9473 (1.2543) closs: 0.9002 (0.9697) time: 2.1305 data: 0.0002 max mem: 28042 +[06:35:35.775538] Epoch: [0] [230/812] lr: 0.000014 grad_norm: 0.8726 (1.2400) closs: 0.8788 (0.9638) time: 2.1291 data: 
0.0002 max mem: 28042 +[06:35:57.064040] Epoch: [0] [240/812] lr: 0.000015 grad_norm: 0.8857 (1.2265) closs: 0.8438 (0.9603) time: 2.1266 data: 0.0002 max mem: 28042 +[06:36:18.204677] Epoch: [0] [250/812] lr: 0.000015 grad_norm: 0.8589 (1.2119) closs: 0.9044 (0.9582) time: 2.1214 data: 0.0002 max mem: 28042 +[06:36:39.553053] Epoch: [0] [260/812] lr: 0.000016 grad_norm: 0.8594 (1.2025) closs: 0.8971 (0.9549) time: 2.1244 data: 0.0002 max mem: 28042 +[06:37:00.855213] Epoch: [0] [270/812] lr: 0.000017 grad_norm: 0.9139 (1.1965) closs: 0.8980 (0.9543) time: 2.1324 data: 0.0002 max mem: 28042 +[06:37:22.140492] Epoch: [0] [280/812] lr: 0.000017 grad_norm: 0.9011 (1.1889) closs: 0.9115 (0.9515) time: 2.1293 data: 0.0002 max mem: 28042 +[06:37:43.447171] Epoch: [0] [290/812] lr: 0.000018 grad_norm: 0.9554 (1.1825) closs: 0.8680 (0.9484) time: 2.1295 data: 0.0002 max mem: 28042 +[06:38:04.736791] Epoch: [0] [300/812] lr: 0.000018 grad_norm: 0.9554 (1.1737) closs: 0.8583 (0.9459) time: 2.1297 data: 0.0002 max mem: 28042 +[06:38:25.924120] Epoch: [0] [310/812] lr: 0.000019 grad_norm: 0.8799 (1.1661) closs: 0.8818 (0.9452) time: 2.1238 data: 0.0002 max mem: 28042 +[06:38:47.257933] Epoch: [0] [320/812] lr: 0.000020 grad_norm: 0.9149 (1.1595) closs: 0.9034 (0.9449) time: 2.1260 data: 0.0002 max mem: 28042 +[06:39:08.559953] Epoch: [0] [330/812] lr: 0.000020 grad_norm: 0.9245 (1.1522) closs: 0.8678 (0.9414) time: 2.1317 data: 0.0002 max mem: 28042 +[06:39:29.852361] Epoch: [0] [340/812] lr: 0.000021 grad_norm: 0.9209 (1.1468) closs: 0.8413 (0.9386) time: 2.1296 data: 0.0002 max mem: 28042 +[06:39:51.145032] Epoch: [0] [350/812] lr: 0.000022 grad_norm: 0.9571 (1.1421) closs: 0.8830 (0.9398) time: 2.1292 data: 0.0002 max mem: 28042 +[06:40:12.364246] Epoch: [0] [360/812] lr: 0.000022 grad_norm: 0.9706 (1.1378) closs: 0.8992 (0.9382) time: 2.1255 data: 0.0002 max mem: 28042 +[06:40:33.511071] Epoch: [0] [370/812] lr: 0.000023 grad_norm: 0.8997 (1.1309) closs: 0.8396 (0.9361) time: 2.1182 data: 0.0002 max mem: 28042 +[06:40:54.788872] Epoch: [0] [380/812] lr: 0.000023 grad_norm: 0.8940 (1.1266) closs: 0.8361 (0.9348) time: 2.1212 data: 0.0002 max mem: 28042 +[06:41:16.033192] Epoch: [0] [390/812] lr: 0.000024 grad_norm: 0.9331 (1.1217) closs: 0.8695 (0.9335) time: 2.1260 data: 0.0002 max mem: 28042 +[06:41:37.222343] Epoch: [0] [400/812] lr: 0.000025 grad_norm: 0.9583 (1.1184) closs: 0.8604 (0.9323) time: 2.1216 data: 0.0002 max mem: 28042 +[06:41:58.490518] Epoch: [0] [410/812] lr: 0.000025 grad_norm: 0.8561 (1.1128) closs: 0.8604 (0.9309) time: 2.1228 data: 0.0002 max mem: 28042 +[06:42:19.739979] Epoch: [0] [420/812] lr: 0.000026 grad_norm: 0.8646 (1.1088) closs: 0.8581 (0.9291) time: 2.1258 data: 0.0002 max mem: 28042 +[06:42:40.820073] Epoch: [0] [430/812] lr: 0.000026 grad_norm: 0.8915 (1.1065) closs: 0.8364 (0.9265) time: 2.1164 data: 0.0002 max mem: 28042 +[06:43:02.062800] Epoch: [0] [440/812] lr: 0.000027 grad_norm: 0.8808 (1.1014) closs: 0.7983 (0.9241) time: 2.1161 data: 0.0002 max mem: 28042 +[06:43:23.326377] Epoch: [0] [450/812] lr: 0.000028 grad_norm: 0.8808 (1.0998) closs: 0.8111 (0.9218) time: 2.1252 data: 0.0002 max mem: 28042 +[06:43:44.565401] Epoch: [0] [460/812] lr: 0.000028 grad_norm: 0.8849 (1.0957) closs: 0.8498 (0.9205) time: 2.1250 data: 0.0002 max mem: 28042 +[06:44:05.812308] Epoch: [0] [470/812] lr: 0.000029 grad_norm: 0.8793 (1.0957) closs: 0.8498 (0.9200) time: 2.1242 data: 0.0002 max mem: 28042 +[06:44:27.069455] Epoch: [0] [480/812] lr: 0.000030 grad_norm: 0.9377 
(1.0929) closs: 0.9008 (0.9193) time: 2.1251 data: 0.0002 max mem: 28042 +[06:44:48.177755] Epoch: [0] [490/812] lr: 0.000030 grad_norm: 0.9086 (1.0893) closs: 0.8924 (0.9187) time: 2.1182 data: 0.0002 max mem: 28042 +[06:45:09.416351] Epoch: [0] [500/812] lr: 0.000031 grad_norm: 0.8987 (1.0874) closs: 0.8698 (0.9174) time: 2.1173 data: 0.0002 max mem: 28042 +[06:45:30.630868] Epoch: [0] [510/812] lr: 0.000031 grad_norm: 0.8835 (1.0869) closs: 0.8536 (0.9153) time: 2.1226 data: 0.0002 max mem: 28042 +[06:45:51.904027] Epoch: [0] [520/812] lr: 0.000032 grad_norm: 0.9220 (1.0837) closs: 0.7934 (0.9142) time: 2.1243 data: 0.0002 max mem: 28042 +[06:46:13.161036] Epoch: [0] [530/812] lr: 0.000033 grad_norm: 0.9652 (1.0826) closs: 0.8786 (0.9135) time: 2.1264 data: 0.0002 max mem: 28042 +[06:46:34.432979] Epoch: [0] [540/812] lr: 0.000033 grad_norm: 0.9935 (1.0810) closs: 0.9075 (0.9130) time: 2.1264 data: 0.0002 max mem: 28042 +[06:46:55.524047] Epoch: [0] [550/812] lr: 0.000034 grad_norm: 0.9487 (1.0826) closs: 0.8710 (0.9118) time: 2.1181 data: 0.0002 max mem: 28042 +[06:47:16.807565] Epoch: [0] [560/812] lr: 0.000034 grad_norm: 0.9651 (1.0816) closs: 0.8440 (0.9113) time: 2.1186 data: 0.0002 max mem: 28042 +[06:47:38.094895] Epoch: [0] [570/812] lr: 0.000035 grad_norm: 0.9088 (1.0788) closs: 0.8497 (0.9105) time: 2.1285 data: 0.0002 max mem: 28042 +[06:47:59.329657] Epoch: [0] [580/812] lr: 0.000036 grad_norm: 0.9026 (1.0786) closs: 0.8642 (0.9103) time: 2.1260 data: 0.0002 max mem: 28042 +[06:48:20.589047] Epoch: [0] [590/812] lr: 0.000036 grad_norm: 0.9026 (1.0764) closs: 0.8642 (0.9095) time: 2.1246 data: 0.0002 max mem: 28042 +[06:48:41.877780] Epoch: [0] [600/812] lr: 0.000037 grad_norm: 0.8705 (1.0731) closs: 0.8507 (0.9088) time: 2.1273 data: 0.0002 max mem: 28042 +[06:49:03.085078] Epoch: [0] [610/812] lr: 0.000038 grad_norm: 0.8442 (1.0703) closs: 0.8255 (0.9068) time: 2.1247 data: 0.0002 max mem: 28042 +[06:49:24.290564] Epoch: [0] [620/812] lr: 0.000038 grad_norm: 0.8969 (1.0716) closs: 0.7882 (0.9062) time: 2.1206 data: 0.0002 max mem: 28042 +[06:49:45.503873] Epoch: [0] [630/812] lr: 0.000039 grad_norm: 0.9548 (1.0711) closs: 0.8524 (0.9050) time: 2.1209 data: 0.0002 max mem: 28042 +[06:50:06.759717] Epoch: [0] [640/812] lr: 0.000039 grad_norm: 0.9548 (1.0689) closs: 0.8578 (0.9048) time: 2.1234 data: 0.0002 max mem: 28042 +[06:50:28.061959] Epoch: [0] [650/812] lr: 0.000040 grad_norm: 0.8640 (1.0660) closs: 0.8476 (0.9035) time: 2.1278 data: 0.0002 max mem: 28042 +[06:50:49.325746] Epoch: [0] [660/812] lr: 0.000041 grad_norm: 0.8640 (1.0636) closs: 0.8116 (0.9030) time: 2.1282 data: 0.0002 max mem: 28042 +[06:51:10.485211] Epoch: [0] [670/812] lr: 0.000041 grad_norm: 0.8712 (1.0605) closs: 0.8405 (0.9025) time: 2.1211 data: 0.0002 max mem: 28042 +[06:51:31.762440] Epoch: [0] [680/812] lr: 0.000042 grad_norm: 0.8712 (1.0583) closs: 0.8536 (0.9021) time: 2.1218 data: 0.0002 max mem: 28042 +[06:51:53.108436] Epoch: [0] [690/812] lr: 0.000042 grad_norm: 0.8425 (1.0562) closs: 0.8619 (0.9016) time: 2.1311 data: 0.0002 max mem: 28042 +[06:52:14.345086] Epoch: [0] [700/812] lr: 0.000043 grad_norm: 0.8661 (1.0545) closs: 0.8556 (0.9012) time: 2.1291 data: 0.0002 max mem: 28042 +[06:52:35.562580] Epoch: [0] [710/812] lr: 0.000044 grad_norm: 0.9346 (1.0539) closs: 0.8271 (0.9002) time: 2.1226 data: 0.0002 max mem: 28042 +[06:52:56.788771] Epoch: [0] [720/812] lr: 0.000044 grad_norm: 0.9284 (1.0529) closs: 0.8355 (0.8998) time: 2.1221 data: 0.0002 max mem: 28042 +[06:53:17.889187] Epoch: 
[0] [730/812] lr: 0.000045 grad_norm: 0.9407 (1.0518) closs: 0.8588 (0.8990) time: 2.1163 data: 0.0002 max mem: 28042 +[06:53:39.137183] Epoch: [0] [740/812] lr: 0.000046 grad_norm: 0.9572 (1.0510) closs: 0.8287 (0.8978) time: 2.1173 data: 0.0002 max mem: 28042 +[06:54:00.392567] Epoch: [0] [750/812] lr: 0.000046 grad_norm: 0.8869 (1.0493) closs: 0.8349 (0.8985) time: 2.1251 data: 0.0002 max mem: 28042 +[06:54:21.698073] Epoch: [0] [760/812] lr: 0.000047 grad_norm: 0.8591 (1.0473) closs: 0.8724 (0.8979) time: 2.1280 data: 0.0002 max mem: 28042 +[06:54:42.998694] Epoch: [0] [770/812] lr: 0.000047 grad_norm: 0.8692 (1.0454) closs: 0.8389 (0.8970) time: 2.1302 data: 0.0002 max mem: 28042 +[06:55:04.236494] Epoch: [0] [780/812] lr: 0.000048 grad_norm: 0.9315 (1.0442) closs: 0.8393 (0.8966) time: 2.1268 data: 0.0002 max mem: 28042 +[06:55:25.352416] Epoch: [0] [790/812] lr: 0.000049 grad_norm: 0.8830 (1.0420) closs: 0.8393 (0.8957) time: 2.1176 data: 0.0001 max mem: 28042 +[06:55:46.583325] Epoch: [0] [800/812] lr: 0.000049 grad_norm: 0.8569 (1.0408) closs: 0.8439 (0.8953) time: 2.1173 data: 0.0001 max mem: 28042 +[06:56:07.855749] Epoch: [0] [810/812] lr: 0.000050 grad_norm: 0.8814 (1.0395) closs: 0.8229 (0.8941) time: 2.1251 data: 0.0001 max mem: 28042 +[06:56:10.209403] Epoch: [0] Total time: 0:28:48 +[06:56:10.212512] Averaged stats: lr: 0.000050 grad_norm: 0.8814 (1.0395) closs: 0.8177 (0.8988) +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. 
``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[06:56:10.615675] model saved +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[06:56:12.374769] optimizer saved +[06:56:12.375482] other rank-common saved +[06:56:12.381202] rank-specific saved +[06:56:12.391881] log_dir: ./output_dir +[06:56:15.877787] Epoch: [1] [0/812] lr: 0.000050 grad_norm: 0.7418 (0.7418) closs: 0.6434 (0.6434) time: 3.4849 data: 1.3481 max mem: 28042 +[06:56:37.172127] Epoch: [1] [10/812] lr: 0.000050 grad_norm: 0.9125 (0.9429) closs: 0.8020 (0.7882) time: 2.2526 data: 0.1227 max mem: 28042 +[06:56:58.553564] Epoch: [1] [20/812] lr: 0.000050 grad_norm: 0.9125 (0.9737) closs: 0.8134 (0.8498) time: 2.1337 data: 0.0002 max mem: 28042 +[06:57:19.841517] Epoch: [1] [30/812] lr: 0.000050 grad_norm: 0.8671 (0.9519) closs: 0.8293 (0.8457) time: 2.1334 data: 0.0002 max mem: 28042 +[06:57:41.138608] Epoch: [1] [40/812] lr: 0.000050 grad_norm: 0.8634 (0.9318) closs: 0.8733 (0.8656) time: 2.1292 data: 0.0002 max mem: 28042 +[06:58:02.227816] Epoch: [1] [50/812] lr: 0.000050 grad_norm: 0.8460 (0.9328) closs: 0.8800 (0.8613) time: 2.1192 data: 0.0002 max mem: 28042 +[06:58:23.466948] Epoch: [1] [60/812] lr: 0.000050 grad_norm: 0.8895 (0.9279) closs: 0.8352 (0.8591) time: 2.1163 data: 0.0002 max mem: 28042 +[06:58:44.680097] Epoch: [1] [70/812] lr: 0.000050 grad_norm: 0.9388 (0.9401) closs: 0.8626 (0.8619) time: 2.1225 data: 0.0002 max mem: 28042 +[06:59:05.983269] Epoch: [1] [80/812] lr: 0.000050 grad_norm: 0.9289 (0.9347) closs: 0.8646 (0.8577) time: 2.1257 data: 0.0002 max mem: 28042 +[06:59:27.380557] Epoch: [1] [90/812] lr: 0.000050 grad_norm: 0.8478 (0.9265) closs: 0.8646 (0.8618) time: 2.1349 data: 0.0002 max mem: 28042 +[06:59:48.640207] Epoch: [1] [100/812] lr: 0.000050 grad_norm: 0.8774 (0.9255) closs: 0.8923 (0.8640) time: 2.1328 data: 0.0002 max mem: 28042 +[07:00:09.758341] Epoch: [1] [110/812] lr: 0.000050 grad_norm: 0.8900 (0.9251) closs: 0.8698 (0.8612) time: 2.1188 data: 0.0002 max mem: 28042 +[07:00:30.981964] Epoch: [1] [120/812] lr: 0.000050 grad_norm: 0.9182 (0.9269) closs: 0.8698 (0.8617) time: 2.1170 data: 0.0002 max mem: 28042 +[07:00:52.245385] Epoch: [1] [130/812] lr: 0.000050 grad_norm: 0.9182 (0.9259) closs: 0.8665 (0.8617) time: 2.1243 data: 0.0002 max mem: 28042 +[07:01:13.558548] Epoch: [1] [140/812] lr: 0.000050 grad_norm: 0.8672 (0.9211) closs: 0.8531 (0.8614) time: 2.1287 data: 0.0002 max mem: 28042 +[07:01:34.846520] Epoch: [1] [150/812] lr: 0.000050 grad_norm: 0.8460 (0.9249) closs: 0.8726 (0.8611) time: 2.1300 data: 0.0002 max mem: 28042 +[07:01:56.152223] Epoch: [1] [160/812] lr: 0.000050 grad_norm: 0.9007 (0.9226) closs: 0.8726 (0.8635) time: 2.1296 data: 0.0002 max mem: 28042 +[07:02:17.282750] Epoch: [1] [170/812] lr: 0.000049 grad_norm: 0.8338 (0.9195) 
closs: 0.9167 (0.8660) time: 2.1217 data: 0.0002 max mem: 28042 +[07:02:38.524204] Epoch: [1] [180/812] lr: 0.000049 grad_norm: 0.8527 (0.9223) closs: 0.8779 (0.8662) time: 2.1185 data: 0.0002 max mem: 28042 +[07:02:59.827288] Epoch: [1] [190/812] lr: 0.000049 grad_norm: 0.8880 (0.9228) closs: 0.8079 (0.8646) time: 2.1271 data: 0.0002 max mem: 28042 +[07:03:21.168075] Epoch: [1] [200/812] lr: 0.000049 grad_norm: 0.8588 (0.9203) closs: 0.8388 (0.8627) time: 2.1321 data: 0.0002 max mem: 28042 +[07:03:42.467849] Epoch: [1] [210/812] lr: 0.000049 grad_norm: 0.8416 (0.9177) closs: 0.8623 (0.8648) time: 2.1319 data: 0.0002 max mem: 28042 +[07:04:03.731453] Epoch: [1] [220/812] lr: 0.000049 grad_norm: 0.8283 (0.9154) closs: 0.8735 (0.8637) time: 2.1281 data: 0.0002 max mem: 28042 +[07:04:24.911070] Epoch: [1] [230/812] lr: 0.000049 grad_norm: 0.8495 (0.9127) closs: 0.8099 (0.8620) time: 2.1221 data: 0.0002 max mem: 28042 +[07:04:46.205620] Epoch: [1] [240/812] lr: 0.000049 grad_norm: 0.8495 (0.9115) closs: 0.8409 (0.8631) time: 2.1236 data: 0.0002 max mem: 28042 +[07:05:07.516767] Epoch: [1] [250/812] lr: 0.000049 grad_norm: 0.8600 (0.9131) closs: 0.8848 (0.8622) time: 2.1302 data: 0.0002 max mem: 28042 +[07:05:28.819912] Epoch: [1] [260/812] lr: 0.000049 grad_norm: 0.8600 (0.9108) closs: 0.8859 (0.8627) time: 2.1306 data: 0.0002 max mem: 28042 +[07:05:50.052732] Epoch: [1] [270/812] lr: 0.000049 grad_norm: 0.8403 (0.9102) closs: 0.8677 (0.8615) time: 2.1267 data: 0.0002 max mem: 28042 +[07:06:11.298621] Epoch: [1] [280/812] lr: 0.000049 grad_norm: 0.9042 (0.9123) closs: 0.8677 (0.8617) time: 2.1239 data: 0.0002 max mem: 28042 +[07:06:32.419845] Epoch: [1] [290/812] lr: 0.000048 grad_norm: 0.9813 (0.9163) closs: 0.8537 (0.8620) time: 2.1183 data: 0.0002 max mem: 28042 +[07:06:53.663078] Epoch: [1] [300/812] lr: 0.000048 grad_norm: 0.8997 (0.9148) closs: 0.8405 (0.8609) time: 2.1181 data: 0.0002 max mem: 28042 +[07:07:14.922464] Epoch: [1] [310/812] lr: 0.000048 grad_norm: 0.8997 (0.9155) closs: 0.8405 (0.8602) time: 2.1251 data: 0.0002 max mem: 28042 +[07:07:36.151134] Epoch: [1] [320/812] lr: 0.000048 grad_norm: 0.8916 (0.9157) closs: 0.8480 (0.8609) time: 2.1243 data: 0.0002 max mem: 28042 +[07:07:57.365292] Epoch: [1] [330/812] lr: 0.000048 grad_norm: 0.8389 (0.9149) closs: 0.8297 (0.8603) time: 2.1221 data: 0.0002 max mem: 28042 +[07:08:18.608023] Epoch: [1] [340/812] lr: 0.000048 grad_norm: 0.8646 (0.9134) closs: 0.7981 (0.8584) time: 2.1228 data: 0.0002 max mem: 28042 +[07:08:39.766957] Epoch: [1] [350/812] lr: 0.000048 grad_norm: 0.8646 (0.9123) closs: 0.7981 (0.8579) time: 2.1200 data: 0.0002 max mem: 28042 +[07:09:01.054096] Epoch: [1] [360/812] lr: 0.000048 grad_norm: 0.8671 (0.9124) closs: 0.7807 (0.8556) time: 2.1222 data: 0.0002 max mem: 28042 +[07:09:22.343988] Epoch: [1] [370/812] lr: 0.000047 grad_norm: 0.9250 (0.9165) closs: 0.7688 (0.8546) time: 2.1288 data: 0.0002 max mem: 28042 +[07:09:43.668019] Epoch: [1] [380/812] lr: 0.000047 grad_norm: 0.9275 (0.9156) closs: 0.8322 (0.8548) time: 2.1306 data: 0.0003 max mem: 28042 +[07:10:04.936534] Epoch: [1] [390/812] lr: 0.000047 grad_norm: 0.8581 (0.9324) closs: 0.8617 (0.8550) time: 2.1296 data: 0.0003 max mem: 28042 +[07:10:26.269641] Epoch: [1] [400/812] lr: 0.000047 grad_norm: 0.8561 (0.9311) closs: 0.8443 (0.8548) time: 2.1300 data: 0.0002 max mem: 28042 +[07:10:47.398303] Epoch: [1] [410/812] lr: 0.000047 grad_norm: 0.8857 (0.9329) closs: 0.8655 (0.8563) time: 2.1230 data: 0.0002 max mem: 28042 +[07:11:08.680671] Epoch: [1] 
[420/812] lr: 0.000047 grad_norm: 0.9295 (0.9312) closs: 0.9093 (0.8567) time: 2.1205 data: 0.0002 max mem: 28042 +[07:11:29.894053] Epoch: [1] [430/812] lr: 0.000047 grad_norm: 0.8829 (0.9323) closs: 0.8460 (0.8558) time: 2.1247 data: 0.0002 max mem: 28042 +[07:11:51.120078] Epoch: [1] [440/812] lr: 0.000046 grad_norm: 0.8876 (0.9315) closs: 0.8460 (0.8558) time: 2.1219 data: 0.0002 max mem: 28042 +[07:12:12.348027] Epoch: [1] [450/812] lr: 0.000046 grad_norm: 0.8868 (0.9304) closs: 0.8481 (0.8554) time: 2.1226 data: 0.0002 max mem: 28042 +[07:12:33.569320] Epoch: [1] [460/812] lr: 0.000046 grad_norm: 0.9059 (0.9318) closs: 0.8101 (0.8548) time: 2.1224 data: 0.0002 max mem: 28042 +[07:12:54.684647] Epoch: [1] [470/812] lr: 0.000046 grad_norm: 0.9218 (0.9311) closs: 0.8178 (0.8549) time: 2.1168 data: 0.0002 max mem: 28042 +[07:13:15.972916] Epoch: [1] [480/812] lr: 0.000046 grad_norm: 0.8772 (0.9306) closs: 0.8478 (0.8555) time: 2.1201 data: 0.0002 max mem: 28042 +[07:13:37.223460] Epoch: [1] [490/812] lr: 0.000046 grad_norm: 0.8772 (0.9295) closs: 0.8584 (0.8561) time: 2.1269 data: 0.0002 max mem: 28042 +[07:13:58.457753] Epoch: [1] [500/812] lr: 0.000045 grad_norm: 0.8557 (0.9276) closs: 0.8375 (0.8556) time: 2.1242 data: 0.0002 max mem: 28042 +[07:14:19.699206] Epoch: [1] [510/812] lr: 0.000045 grad_norm: 0.7918 (0.9274) closs: 0.8128 (0.8553) time: 2.1237 data: 0.0002 max mem: 28042 +[07:14:41.023843] Epoch: [1] [520/812] lr: 0.000045 grad_norm: 0.8657 (0.9260) closs: 0.8301 (0.8552) time: 2.1282 data: 0.0002 max mem: 28042 +[07:15:02.146542] Epoch: [1] [530/812] lr: 0.000045 grad_norm: 0.8365 (0.9253) closs: 0.8398 (0.8553) time: 2.1223 data: 0.0003 max mem: 28042 +[07:15:23.393129] Epoch: [1] [540/812] lr: 0.000045 grad_norm: 0.8390 (0.9242) closs: 0.8310 (0.8553) time: 2.1184 data: 0.0003 max mem: 28042 +[07:15:44.662588] Epoch: [1] [550/812] lr: 0.000045 grad_norm: 0.8286 (0.9227) closs: 0.8088 (0.8546) time: 2.1257 data: 0.0002 max mem: 28042 +[07:16:05.961040] Epoch: [1] [560/812] lr: 0.000044 grad_norm: 0.8162 (0.9208) closs: 0.8267 (0.8545) time: 2.1283 data: 0.0002 max mem: 28042 +[07:16:27.247815] Epoch: [1] [570/812] lr: 0.000044 grad_norm: 0.8102 (0.9196) closs: 0.8561 (0.8549) time: 2.1292 data: 0.0002 max mem: 28042 +[07:16:48.505828] Epoch: [1] [580/812] lr: 0.000044 grad_norm: 0.8414 (0.9193) closs: 0.8574 (0.8542) time: 2.1272 data: 0.0002 max mem: 28042 +[07:17:09.646613] Epoch: [1] [590/812] lr: 0.000044 grad_norm: 0.9224 (0.9192) closs: 0.8602 (0.8545) time: 2.1199 data: 0.0002 max mem: 28042 +[07:17:30.846001] Epoch: [1] [600/812] lr: 0.000044 grad_norm: 0.9177 (0.9188) closs: 0.8599 (0.8548) time: 2.1169 data: 0.0002 max mem: 28042 +[07:17:52.084387] Epoch: [1] [610/812] lr: 0.000043 grad_norm: 0.8469 (0.9183) closs: 0.8411 (0.8548) time: 2.1218 data: 0.0002 max mem: 28042 +[07:18:13.385659] Epoch: [1] [620/812] lr: 0.000043 grad_norm: 0.8469 (0.9170) closs: 0.8457 (0.8548) time: 2.1269 data: 0.0002 max mem: 28042 +[07:18:34.632870] Epoch: [1] [630/812] lr: 0.000043 grad_norm: 0.8292 (0.9164) closs: 0.8625 (0.8553) time: 2.1273 data: 0.0002 max mem: 28042 +[07:18:55.965669] Epoch: [1] [640/812] lr: 0.000043 grad_norm: 0.8478 (0.9155) closs: 0.8567 (0.8553) time: 2.1289 data: 0.0002 max mem: 28042 +[07:19:17.086640] Epoch: [1] [650/812] lr: 0.000043 grad_norm: 0.8298 (0.9147) closs: 0.8326 (0.8550) time: 2.1226 data: 0.0004 max mem: 28042 +[07:19:38.331115] Epoch: [1] [660/812] lr: 0.000042 grad_norm: 0.8200 (0.9143) closs: 0.8794 (0.8560) time: 2.1182 data: 0.0004 
max mem: 28042 +[07:19:59.582006] Epoch: [1] [670/812] lr: 0.000042 grad_norm: 0.8277 (0.9136) closs: 0.9154 (0.8567) time: 2.1247 data: 0.0002 max mem: 28042 +[07:20:20.825085] Epoch: [1] [680/812] lr: 0.000042 grad_norm: 0.8543 (0.9137) closs: 0.9115 (0.8573) time: 2.1246 data: 0.0002 max mem: 28042 +[07:20:42.055735] Epoch: [1] [690/812] lr: 0.000042 grad_norm: 0.9597 (0.9188) closs: 0.9179 (0.8578) time: 2.1236 data: 0.0003 max mem: 28042 +[07:21:03.290761] Epoch: [1] [700/812] lr: 0.000041 grad_norm: 0.8645 (0.9179) closs: 0.8496 (0.8577) time: 2.1232 data: 0.0002 max mem: 28042 +[07:21:24.434877] Epoch: [1] [710/812] lr: 0.000041 grad_norm: 0.8440 (0.9172) closs: 0.8131 (0.8574) time: 2.1189 data: 0.0002 max mem: 28042 +[07:21:45.655921] Epoch: [1] [720/812] lr: 0.000041 grad_norm: 0.8724 (0.9173) closs: 0.7997 (0.8569) time: 2.1182 data: 0.0002 max mem: 28042 +[07:22:06.935228] Epoch: [1] [730/812] lr: 0.000041 grad_norm: 0.8932 (0.9170) closs: 0.8529 (0.8571) time: 2.1249 data: 0.0002 max mem: 28042 +[07:22:28.172880] Epoch: [1] [740/812] lr: 0.000041 grad_norm: 0.8932 (0.9174) closs: 0.8546 (0.8569) time: 2.1258 data: 0.0002 max mem: 28042 +[07:22:49.386611] Epoch: [1] [750/812] lr: 0.000040 grad_norm: 0.8588 (0.9167) closs: 0.8256 (0.8568) time: 2.1225 data: 0.0002 max mem: 28042 +[07:23:10.577933] Epoch: [1] [760/812] lr: 0.000040 grad_norm: 0.8566 (0.9159) closs: 0.8375 (0.8571) time: 2.1202 data: 0.0002 max mem: 28042 +[07:23:31.706111] Epoch: [1] [770/812] lr: 0.000040 grad_norm: 0.8276 (0.9146) closs: 0.8624 (0.8566) time: 2.1159 data: 0.0002 max mem: 28042 +[07:23:53.046796] Epoch: [1] [780/812] lr: 0.000040 grad_norm: 0.8238 (0.9141) closs: 0.8051 (0.8564) time: 2.1234 data: 0.0002 max mem: 28042 +[07:24:14.279679] Epoch: [1] [790/812] lr: 0.000039 grad_norm: 0.8885 (0.9144) closs: 0.8512 (0.8566) time: 2.1286 data: 0.0002 max mem: 28042 +[07:24:35.475030] Epoch: [1] [800/812] lr: 0.000039 grad_norm: 0.8738 (0.9139) closs: 0.8451 (0.8562) time: 2.1213 data: 0.0002 max mem: 28042 +[07:24:56.734344] Epoch: [1] [810/812] lr: 0.000039 grad_norm: 0.8738 (0.9154) closs: 0.8451 (0.8562) time: 2.1227 data: 0.0002 max mem: 28042 +[07:24:59.165687] Epoch: [1] Total time: 0:28:46 +[07:24:59.168439] Averaged stats: lr: 0.000039 grad_norm: 0.8738 (0.9152) closs: 0.8673 (0.8545) +[07:24:59.501598] model saved +[07:25:01.228520] optimizer saved +[07:25:01.229115] other rank-common saved +[07:25:01.232745] rank-specific saved +[07:25:01.242649] log_dir: ./output_dir +[07:25:04.605755] Epoch: [2] [0/812] lr: 0.000039 grad_norm: 0.8173 (0.8173) closs: 0.8037 (0.8037) time: 3.3621 data: 1.2087 max mem: 28042 +[07:25:25.938470] Epoch: [2] [10/812] lr: 0.000038 grad_norm: 0.7978 (0.8008) closs: 0.8347 (0.8409) time: 2.2449 data: 0.1101 max mem: 28042 +[07:25:47.222423] Epoch: [2] [20/812] lr: 0.000038 grad_norm: 0.8004 (0.8113) closs: 0.8347 (0.8317) time: 2.1308 data: 0.0002 max mem: 28042 +[07:26:08.393469] Epoch: [2] [30/812] lr: 0.000038 grad_norm: 0.8724 (0.8343) closs: 0.8318 (0.8414) time: 2.1227 data: 0.0002 max mem: 28042 +[07:26:29.541814] Epoch: [2] [40/812] lr: 0.000038 grad_norm: 0.8632 (0.8308) closs: 0.8318 (0.8407) time: 2.1159 data: 0.0002 max mem: 28042 +[07:26:50.705595] Epoch: [2] [50/812] lr: 0.000037 grad_norm: 0.8126 (0.8463) closs: 0.8506 (0.8366) time: 2.1155 data: 0.0002 max mem: 28042 +[07:27:11.930514] Epoch: [2] [60/812] lr: 0.000037 grad_norm: 0.8214 (0.8468) closs: 0.8533 (0.8395) time: 2.1194 data: 0.0002 max mem: 28042 +[07:27:33.224038] Epoch: [2] [70/812] lr: 
0.000037 grad_norm: 0.8380 (0.8558) closs: 0.8522 (0.8441) time: 2.1258 data: 0.0002 max mem: 28042 +[07:27:54.451669] Epoch: [2] [80/812] lr: 0.000037 grad_norm: 0.8503 (0.8651) closs: 0.8207 (0.8425) time: 2.1260 data: 0.0002 max mem: 28042 +[07:28:15.680781] Epoch: [2] [90/812] lr: 0.000036 grad_norm: 0.8135 (0.8623) closs: 0.8026 (0.8396) time: 2.1228 data: 0.0002 max mem: 28042 +[07:28:37.066597] Epoch: [2] [100/812] lr: 0.000036 grad_norm: 0.8527 (0.8702) closs: 0.8127 (0.8409) time: 2.1307 data: 0.0002 max mem: 28042 +[07:28:58.274244] Epoch: [2] [110/812] lr: 0.000036 grad_norm: 0.8377 (0.8658) closs: 0.8242 (0.8359) time: 2.1296 data: 0.0002 max mem: 28042 +[07:29:19.632451] Epoch: [2] [120/812] lr: 0.000036 grad_norm: 0.8040 (0.8644) closs: 0.8393 (0.8376) time: 2.1282 data: 0.0002 max mem: 28042 +[07:29:41.020148] Epoch: [2] [130/812] lr: 0.000035 grad_norm: 0.8352 (0.8648) closs: 0.8316 (0.8367) time: 2.1372 data: 0.0002 max mem: 28042 +[07:30:02.395455] Epoch: [2] [140/812] lr: 0.000035 grad_norm: 0.8786 (0.8695) closs: 0.8135 (0.8362) time: 2.1381 data: 0.0002 max mem: 28042 +[07:30:23.808228] Epoch: [2] [150/812] lr: 0.000035 grad_norm: 0.8822 (0.8688) closs: 0.8453 (0.8374) time: 2.1393 data: 0.0002 max mem: 28042 +[07:30:45.164764] Epoch: [2] [160/812] lr: 0.000035 grad_norm: 0.8822 (0.8746) closs: 0.8842 (0.8382) time: 2.1384 data: 0.0002 max mem: 28042 +[07:31:06.315458] Epoch: [2] [170/812] lr: 0.000034 grad_norm: 0.8529 (0.8701) closs: 0.8118 (0.8371) time: 2.1253 data: 0.0002 max mem: 28042 +[07:31:27.656439] Epoch: [2] [180/812] lr: 0.000034 grad_norm: 0.8679 (0.8743) closs: 0.8118 (0.8369) time: 2.1245 data: 0.0002 max mem: 28042 +[07:31:49.023759] Epoch: [2] [190/812] lr: 0.000034 grad_norm: 0.8803 (0.8730) closs: 0.8327 (0.8381) time: 2.1353 data: 0.0002 max mem: 28042 +[07:32:10.391465] Epoch: [2] [200/812] lr: 0.000033 grad_norm: 0.8540 (0.8725) closs: 0.8327 (0.8392) time: 2.1367 data: 0.0002 max mem: 28042 +[07:32:31.801438] Epoch: [2] [210/812] lr: 0.000033 grad_norm: 0.8493 (0.8724) closs: 0.8227 (0.8396) time: 2.1388 data: 0.0002 max mem: 28042 +[07:32:53.101914] Epoch: [2] [220/812] lr: 0.000033 grad_norm: 0.8557 (0.8723) closs: 0.8429 (0.8411) time: 2.1354 data: 0.0003 max mem: 28042 +[07:33:14.125610] Epoch: [2] [230/812] lr: 0.000033 grad_norm: 0.8792 (0.8748) closs: 0.8429 (0.8417) time: 2.1161 data: 0.0003 max mem: 28042 +[07:33:35.442382] Epoch: [2] [240/812] lr: 0.000032 grad_norm: 0.8701 (0.8736) closs: 0.8309 (0.8416) time: 2.1169 data: 0.0002 max mem: 28042 +[07:33:56.828157] Epoch: [2] [250/812] lr: 0.000032 grad_norm: 0.8521 (0.8742) closs: 0.8317 (0.8422) time: 2.1351 data: 0.0002 max mem: 28042 +[07:34:18.204872] Epoch: [2] [260/812] lr: 0.000032 grad_norm: 0.8521 (0.8736) closs: 0.7921 (0.8415) time: 2.1380 data: 0.0002 max mem: 28042 +[07:34:39.611093] Epoch: [2] [270/812] lr: 0.000031 grad_norm: 0.7877 (0.8718) closs: 0.7921 (0.8428) time: 2.1391 data: 0.0002 max mem: 28042 +[07:35:00.999360] Epoch: [2] [280/812] lr: 0.000031 grad_norm: 0.8320 (0.8718) closs: 0.8277 (0.8442) time: 2.1396 data: 0.0002 max mem: 28042 +[07:35:22.112658] Epoch: [2] [290/812] lr: 0.000031 grad_norm: 0.8661 (0.8723) closs: 0.8074 (0.8410) time: 2.1250 data: 0.0002 max mem: 28042 +[07:35:43.467764] Epoch: [2] [300/812] lr: 0.000031 grad_norm: 0.8647 (0.8730) closs: 0.8054 (0.8405) time: 2.1233 data: 0.0002 max mem: 28042 +[07:36:04.824780] Epoch: [2] [310/812] lr: 0.000030 grad_norm: 0.8826 (0.8761) closs: 0.8440 (0.8406) time: 2.1355 data: 0.0002 max mem: 28042 
+[07:36:26.170930] Epoch: [2] [320/812] lr: 0.000030 grad_norm: 0.8778 (0.8759) closs: 0.8422 (0.8418) time: 2.1351 data: 0.0002 max mem: 28042 +[07:36:47.504093] Epoch: [2] [330/812] lr: 0.000030 grad_norm: 0.8626 (0.8788) closs: 0.8727 (0.8434) time: 2.1339 data: 0.0002 max mem: 28042 +[07:37:08.880216] Epoch: [2] [340/812] lr: 0.000029 grad_norm: 0.8626 (0.8780) closs: 0.8866 (0.8442) time: 2.1354 data: 0.0002 max mem: 28042 +[07:37:30.026320] Epoch: [2] [350/812] lr: 0.000029 grad_norm: 0.8242 (0.8770) closs: 0.8389 (0.8438) time: 2.1260 data: 0.0002 max mem: 28042 +[07:37:51.342100] Epoch: [2] [360/812] lr: 0.000029 grad_norm: 0.8242 (0.8763) closs: 0.8495 (0.8446) time: 2.1230 data: 0.0002 max mem: 28042 +[07:38:12.672288] Epoch: [2] [370/812] lr: 0.000029 grad_norm: 0.8642 (0.8774) closs: 0.8439 (0.8445) time: 2.1322 data: 0.0002 max mem: 28042 +[07:38:34.038903] Epoch: [2] [380/812] lr: 0.000028 grad_norm: 0.8655 (0.8792) closs: 0.8376 (0.8437) time: 2.1348 data: 0.0002 max mem: 28042 +[07:38:55.416450] Epoch: [2] [390/812] lr: 0.000028 grad_norm: 0.8655 (0.8799) closs: 0.8200 (0.8424) time: 2.1371 data: 0.0002 max mem: 28042 +[07:39:16.805648] Epoch: [2] [400/812] lr: 0.000028 grad_norm: 0.9245 (0.8820) closs: 0.8217 (0.8425) time: 2.1383 data: 0.0002 max mem: 28042 +[07:39:37.947526] Epoch: [2] [410/812] lr: 0.000027 grad_norm: 0.8268 (0.8809) closs: 0.8217 (0.8423) time: 2.1265 data: 0.0002 max mem: 28042 +[07:39:59.300724] Epoch: [2] [420/812] lr: 0.000027 grad_norm: 0.7987 (0.8815) closs: 0.8043 (0.8423) time: 2.1247 data: 0.0004 max mem: 28042 +[07:40:20.670550] Epoch: [2] [430/812] lr: 0.000027 grad_norm: 0.8388 (0.8824) closs: 0.8112 (0.8429) time: 2.1361 data: 0.0004 max mem: 28042 +[07:40:42.075112] Epoch: [2] [440/812] lr: 0.000027 grad_norm: 0.8653 (0.8825) closs: 0.8614 (0.8427) time: 2.1386 data: 0.0002 max mem: 28042 +[07:41:03.398003] Epoch: [2] [450/812] lr: 0.000026 grad_norm: 0.8892 (0.8833) closs: 0.8563 (0.8438) time: 2.1363 data: 0.0002 max mem: 28042 +[07:41:24.767838] Epoch: [2] [460/812] lr: 0.000026 grad_norm: 0.8508 (0.8832) closs: 0.9083 (0.8458) time: 2.1346 data: 0.0002 max mem: 28042 +[07:41:45.899249] Epoch: [2] [470/812] lr: 0.000026 grad_norm: 0.8296 (0.8824) closs: 0.8864 (0.8453) time: 2.1250 data: 0.0002 max mem: 28042 +[07:42:07.234898] Epoch: [2] [480/812] lr: 0.000025 grad_norm: 0.8476 (0.8834) closs: 0.8653 (0.8465) time: 2.1233 data: 0.0002 max mem: 28042 +[07:42:28.526171] Epoch: [2] [490/812] lr: 0.000025 grad_norm: 0.8476 (0.8827) closs: 0.8379 (0.8457) time: 2.1313 data: 0.0002 max mem: 28042 +[07:42:49.834652] Epoch: [2] [500/812] lr: 0.000025 grad_norm: 0.8437 (0.8834) closs: 0.8202 (0.8453) time: 2.1299 data: 0.0002 max mem: 28042 +[07:43:11.253722] Epoch: [2] [510/812] lr: 0.000024 grad_norm: 0.8629 (0.8832) closs: 0.8411 (0.8454) time: 2.1363 data: 0.0002 max mem: 28042 +[07:43:32.569350] Epoch: [2] [520/812] lr: 0.000024 grad_norm: 0.8694 (0.8837) closs: 0.8411 (0.8454) time: 2.1366 data: 0.0002 max mem: 28042 +[07:43:53.723009] Epoch: [2] [530/812] lr: 0.000024 grad_norm: 0.8694 (0.8828) closs: 0.8377 (0.8456) time: 2.1234 data: 0.0002 max mem: 28042 +[07:44:15.068725] Epoch: [2] [540/812] lr: 0.000024 grad_norm: 0.8512 (0.8832) closs: 0.8377 (0.8454) time: 2.1249 data: 0.0002 max mem: 28042 +[07:44:36.418823] Epoch: [2] [550/812] lr: 0.000023 grad_norm: 0.8541 (0.8821) closs: 0.8485 (0.8453) time: 2.1347 data: 0.0002 max mem: 28042 +[07:44:57.735533] Epoch: [2] [560/812] lr: 0.000023 grad_norm: 0.7920 (0.8821) closs: 0.8278 
(0.8453) time: 2.1333 data: 0.0002 max mem: 28042 +[07:45:19.044480] Epoch: [2] [570/812] lr: 0.000023 grad_norm: 0.8350 (0.8831) closs: 0.8197 (0.8447) time: 2.1312 data: 0.0002 max mem: 28042 +[07:45:40.336507] Epoch: [2] [580/812] lr: 0.000022 grad_norm: 0.9544 (0.8832) closs: 0.8045 (0.8436) time: 2.1300 data: 0.0002 max mem: 28042 +[07:46:01.445076] Epoch: [2] [590/812] lr: 0.000022 grad_norm: 0.8539 (0.8832) closs: 0.8222 (0.8426) time: 2.1200 data: 0.0002 max mem: 28042 +[07:46:22.758448] Epoch: [2] [600/812] lr: 0.000022 grad_norm: 0.8545 (0.8826) closs: 0.8357 (0.8426) time: 2.1210 data: 0.0002 max mem: 28042 +[07:46:43.975732] Epoch: [2] [610/812] lr: 0.000022 grad_norm: 0.8283 (0.8824) closs: 0.8244 (0.8424) time: 2.1265 data: 0.0002 max mem: 28042 +[07:47:05.349197] Epoch: [2] [620/812] lr: 0.000021 grad_norm: 0.8456 (0.8825) closs: 0.8244 (0.8423) time: 2.1295 data: 0.0002 max mem: 28042 +[07:47:26.735767] Epoch: [2] [630/812] lr: 0.000021 grad_norm: 0.8456 (0.8814) closs: 0.8121 (0.8423) time: 2.1379 data: 0.0002 max mem: 28042 +[07:47:48.056890] Epoch: [2] [640/812] lr: 0.000021 grad_norm: 0.8785 (0.8820) closs: 0.8153 (0.8423) time: 2.1353 data: 0.0002 max mem: 28042 +[07:48:09.177512] Epoch: [2] [650/812] lr: 0.000021 grad_norm: 0.8796 (0.8835) closs: 0.7759 (0.8408) time: 2.1220 data: 0.0002 max mem: 28042 +[07:48:30.537949] Epoch: [2] [660/812] lr: 0.000020 grad_norm: 0.8254 (0.8828) closs: 0.7714 (0.8401) time: 2.1240 data: 0.0002 max mem: 28042 +[07:48:51.921824] Epoch: [2] [670/812] lr: 0.000020 grad_norm: 0.8632 (0.8835) closs: 0.7948 (0.8396) time: 2.1371 data: 0.0002 max mem: 28042 +[07:49:13.251435] Epoch: [2] [680/812] lr: 0.000020 grad_norm: 0.8754 (0.8834) closs: 0.8484 (0.8402) time: 2.1356 data: 0.0002 max mem: 28042 +[07:49:34.579325] Epoch: [2] [690/812] lr: 0.000019 grad_norm: 0.8018 (0.8829) closs: 0.8826 (0.8405) time: 2.1328 data: 0.0002 max mem: 28042 +[07:49:55.977559] Epoch: [2] [700/812] lr: 0.000019 grad_norm: 0.8168 (0.8827) closs: 0.8405 (0.8403) time: 2.1362 data: 0.0002 max mem: 28042 +[07:50:17.140172] Epoch: [2] [710/812] lr: 0.000019 grad_norm: 0.8241 (0.8818) closs: 0.7813 (0.8396) time: 2.1280 data: 0.0002 max mem: 28042 +[07:50:38.489617] Epoch: [2] [720/812] lr: 0.000019 grad_norm: 0.7747 (0.8815) closs: 0.7818 (0.8397) time: 2.1255 data: 0.0002 max mem: 28042 +[07:50:59.854958] Epoch: [2] [730/812] lr: 0.000018 grad_norm: 0.8660 (0.8817) closs: 0.8288 (0.8405) time: 2.1357 data: 0.0002 max mem: 28042 +[07:51:21.193447] Epoch: [2] [740/812] lr: 0.000018 grad_norm: 0.8745 (0.8813) closs: 0.8010 (0.8404) time: 2.1351 data: 0.0002 max mem: 28042 +[07:51:42.504050] Epoch: [2] [750/812] lr: 0.000018 grad_norm: 0.8117 (0.8810) closs: 0.8010 (0.8404) time: 2.1324 data: 0.0002 max mem: 28042 +[07:52:03.789461] Epoch: [2] [760/812] lr: 0.000018 grad_norm: 0.8396 (0.8808) closs: 0.8350 (0.8399) time: 2.1297 data: 0.0002 max mem: 28042 +[07:52:24.867624] Epoch: [2] [770/812] lr: 0.000017 grad_norm: 0.8502 (0.8811) closs: 0.8475 (0.8405) time: 2.1181 data: 0.0002 max mem: 28042 +[07:52:46.212501] Epoch: [2] [780/812] lr: 0.000017 grad_norm: 0.8794 (0.8809) closs: 0.8875 (0.8410) time: 2.1211 data: 0.0002 max mem: 28042 +[07:53:07.550117] Epoch: [2] [790/812] lr: 0.000017 grad_norm: 0.7999 (0.8805) closs: 0.8674 (0.8414) time: 2.1341 data: 0.0001 max mem: 28042 +[07:53:28.932999] Epoch: [2] [800/812] lr: 0.000017 grad_norm: 0.8289 (0.8808) closs: 0.8283 (0.8416) time: 2.1359 data: 0.0001 max mem: 28042 +[07:53:50.278871] Epoch: [2] [810/812] lr: 
0.000016 grad_norm: 0.8302 (0.8809) closs: 0.8504 (0.8421) time: 2.1364 data: 0.0001 max mem: 28042 +[07:53:52.714588] Epoch: [2] Total time: 0:28:51 +[07:53:52.730137] Averaged stats: lr: 0.000016 grad_norm: 0.8289 (0.8807) closs: 0.8552 (0.8424) +[07:53:53.203661] model saved +[07:53:54.901871] optimizer saved +[07:53:54.902645] other rank-common saved +[07:53:54.906903] rank-specific saved +[07:53:54.917520] log_dir: ./output_dir +[07:53:58.191492] Epoch: [3] [0/812] lr: 0.000016 grad_norm: 1.0490 (1.0490) closs: 0.6441 (0.6441) time: 3.2729 data: 1.1646 max mem: 28042 +[07:54:19.479554] Epoch: [3] [10/812] lr: 0.000016 grad_norm: 0.8902 (0.9635) closs: 0.9038 (0.8853) time: 2.2327 data: 0.1061 max mem: 28042 +[07:54:40.626378] Epoch: [3] [20/812] lr: 0.000016 grad_norm: 0.8645 (0.9616) closs: 0.8700 (0.8582) time: 2.1217 data: 0.0002 max mem: 28042 +[07:55:01.731665] Epoch: [3] [30/812] lr: 0.000016 grad_norm: 0.8400 (0.9256) closs: 0.8400 (0.8562) time: 2.1125 data: 0.0002 max mem: 28042 +[07:55:22.885001] Epoch: [3] [40/812] lr: 0.000015 grad_norm: 0.8370 (0.9113) closs: 0.8383 (0.8448) time: 2.1129 data: 0.0002 max mem: 28042 +[07:55:44.040122] Epoch: [3] [50/812] lr: 0.000015 grad_norm: 0.8304 (0.8985) closs: 0.8214 (0.8448) time: 2.1153 data: 0.0002 max mem: 28042 +[07:56:05.256778] Epoch: [3] [60/812] lr: 0.000015 grad_norm: 0.8304 (0.9031) closs: 0.8439 (0.8496) time: 2.1185 data: 0.0002 max mem: 28042 +[07:56:26.408434] Epoch: [3] [70/812] lr: 0.000015 grad_norm: 0.8670 (0.9090) closs: 0.8136 (0.8462) time: 2.1183 data: 0.0002 max mem: 28042 +[07:56:47.651858] Epoch: [3] [80/812] lr: 0.000014 grad_norm: 0.8294 (0.9039) closs: 0.8295 (0.8458) time: 2.1197 data: 0.0002 max mem: 28042 +[07:57:08.789407] Epoch: [3] [90/812] lr: 0.000014 grad_norm: 0.8278 (0.9153) closs: 0.8492 (0.8447) time: 2.1190 data: 0.0002 max mem: 28042 +[07:57:29.972419] Epoch: [3] [100/812] lr: 0.000014 grad_norm: 0.8774 (0.9115) closs: 0.8209 (0.8421) time: 2.1159 data: 0.0002 max mem: 28042 +[07:57:51.294111] Epoch: [3] [110/812] lr: 0.000014 grad_norm: 0.8553 (0.9095) closs: 0.7868 (0.8363) time: 2.1252 data: 0.0002 max mem: 28042 +[07:58:12.671601] Epoch: [3] [120/812] lr: 0.000013 grad_norm: 0.8556 (0.9050) closs: 0.7868 (0.8408) time: 2.1349 data: 0.0003 max mem: 28042 +[07:58:33.964903] Epoch: [3] [130/812] lr: 0.000013 grad_norm: 0.8585 (0.9045) closs: 0.8151 (0.8379) time: 2.1335 data: 0.0003 max mem: 28042 +[07:58:55.294678] Epoch: [3] [140/812] lr: 0.000013 grad_norm: 0.8542 (0.9024) closs: 0.8471 (0.8414) time: 2.1311 data: 0.0002 max mem: 28042 +[07:59:16.596797] Epoch: [3] [150/812] lr: 0.000013 grad_norm: 0.8433 (0.9080) closs: 0.8486 (0.8407) time: 2.1315 data: 0.0002 max mem: 28042 +[07:59:37.813975] Epoch: [3] [160/812] lr: 0.000012 grad_norm: 0.8266 (0.9031) closs: 0.7934 (0.8408) time: 2.1259 data: 0.0002 max mem: 28042 +[07:59:59.101368] Epoch: [3] [170/812] lr: 0.000012 grad_norm: 0.8266 (0.9000) closs: 0.8247 (0.8395) time: 2.1252 data: 0.0002 max mem: 28042 +[08:00:20.346641] Epoch: [3] [180/812] lr: 0.000012 grad_norm: 0.8709 (0.9001) closs: 0.8101 (0.8379) time: 2.1266 data: 0.0002 max mem: 28042 +[08:00:41.591960] Epoch: [3] [190/812] lr: 0.000012 grad_norm: 0.9020 (0.9010) closs: 0.8010 (0.8377) time: 2.1245 data: 0.0002 max mem: 28042 +[08:01:02.880467] Epoch: [3] [200/812] lr: 0.000012 grad_norm: 0.8726 (0.8965) closs: 0.8493 (0.8391) time: 2.1266 data: 0.0002 max mem: 28042 +[08:01:24.071233] Epoch: [3] [210/812] lr: 0.000011 grad_norm: 0.7843 (0.8941) closs: 0.8166 (0.8381) 
time: 2.1239 data: 0.0002 max mem: 28042 +[08:01:45.236262] Epoch: [3] [220/812] lr: 0.000011 grad_norm: 0.8370 (0.8924) closs: 0.8164 (0.8396) time: 2.1177 data: 0.0002 max mem: 28042 +[08:02:06.489504] Epoch: [3] [230/812] lr: 0.000011 grad_norm: 0.8645 (0.8946) closs: 0.8308 (0.8402) time: 2.1208 data: 0.0002 max mem: 28042 +[08:02:27.784203] Epoch: [3] [240/812] lr: 0.000011 grad_norm: 0.8777 (0.8973) closs: 0.8308 (0.8405) time: 2.1273 data: 0.0002 max mem: 28042 +[08:02:49.078010] Epoch: [3] [250/812] lr: 0.000011 grad_norm: 0.8627 (0.8957) closs: 0.8112 (0.8399) time: 2.1293 data: 0.0002 max mem: 28042 +[08:03:10.423309] Epoch: [3] [260/812] lr: 0.000010 grad_norm: 0.7976 (0.8924) closs: 0.7705 (0.8383) time: 2.1319 data: 0.0002 max mem: 28042 +[08:03:31.704948] Epoch: [3] [270/812] lr: 0.000010 grad_norm: 0.8373 (0.8915) closs: 0.7730 (0.8388) time: 2.1313 data: 0.0002 max mem: 28042 +[08:03:52.854770] Epoch: [3] [280/812] lr: 0.000010 grad_norm: 0.8283 (0.8906) closs: 0.7952 (0.8401) time: 2.1215 data: 0.0002 max mem: 28042 +[08:04:14.113366] Epoch: [3] [290/812] lr: 0.000010 grad_norm: 0.8470 (0.8935) closs: 0.7852 (0.8392) time: 2.1203 data: 0.0002 max mem: 28042 +[08:04:35.342838] Epoch: [3] [300/812] lr: 0.000010 grad_norm: 0.9499 (0.8946) closs: 0.7852 (0.8366) time: 2.1243 data: 0.0003 max mem: 28042 +[08:04:56.618987] Epoch: [3] [310/812] lr: 0.000010 grad_norm: 0.9211 (0.8958) closs: 0.8185 (0.8384) time: 2.1252 data: 0.0003 max mem: 28042 +[08:05:17.910768] Epoch: [3] [320/812] lr: 0.000009 grad_norm: 0.8922 (0.8954) closs: 0.8641 (0.8389) time: 2.1283 data: 0.0003 max mem: 28042 +[08:05:39.221330] Epoch: [3] [330/812] lr: 0.000009 grad_norm: 0.8283 (0.8942) closs: 0.8408 (0.8387) time: 2.1300 data: 0.0002 max mem: 28042 +[08:06:00.377654] Epoch: [3] [340/812] lr: 0.000009 grad_norm: 0.8666 (0.8945) closs: 0.8742 (0.8393) time: 2.1233 data: 0.0002 max mem: 28042 +[08:06:21.664630] Epoch: [3] [350/812] lr: 0.000009 grad_norm: 0.8586 (0.8937) closs: 0.8742 (0.8399) time: 2.1221 data: 0.0002 max mem: 28042 +[08:06:42.936458] Epoch: [3] [360/812] lr: 0.000009 grad_norm: 0.8542 (0.8931) closs: 0.8490 (0.8404) time: 2.1279 data: 0.0002 max mem: 28042 +[08:07:04.181192] Epoch: [3] [370/812] lr: 0.000009 grad_norm: 0.8234 (0.8933) closs: 0.8577 (0.8406) time: 2.1257 data: 0.0002 max mem: 28042 +[08:07:25.478197] Epoch: [3] [380/812] lr: 0.000008 grad_norm: 0.8222 (0.8926) closs: 0.8238 (0.8396) time: 2.1270 data: 0.0002 max mem: 28042 +[08:07:46.790628] Epoch: [3] [390/812] lr: 0.000008 grad_norm: 0.8357 (0.8927) closs: 0.7723 (0.8386) time: 2.1304 data: 0.0002 max mem: 28042 +[08:08:07.856380] Epoch: [3] [400/812] lr: 0.000008 grad_norm: 0.8908 (0.8945) closs: 0.8394 (0.8390) time: 2.1188 data: 0.0002 max mem: 28042 +[08:08:29.129808] Epoch: [3] [410/812] lr: 0.000008 grad_norm: 0.8908 (0.8952) closs: 0.8187 (0.8376) time: 2.1169 data: 0.0002 max mem: 28042 +[08:08:50.389711] Epoch: [3] [420/812] lr: 0.000008 grad_norm: 0.8459 (0.8954) closs: 0.7789 (0.8370) time: 2.1266 data: 0.0002 max mem: 28042 +[08:09:11.665127] Epoch: [3] [430/812] lr: 0.000008 grad_norm: 0.8785 (0.8963) closs: 0.7809 (0.8356) time: 2.1267 data: 0.0002 max mem: 28042 +[08:09:32.923390] Epoch: [3] [440/812] lr: 0.000008 grad_norm: 0.8872 (0.8970) closs: 0.7809 (0.8346) time: 2.1266 data: 0.0002 max mem: 28042 +[08:09:54.151497] Epoch: [3] [450/812] lr: 0.000007 grad_norm: 0.8938 (0.8975) closs: 0.7968 (0.8343) time: 2.1242 data: 0.0002 max mem: 28042 +[08:10:15.305060] Epoch: [3] [460/812] lr: 0.000007 
grad_norm: 0.8952 (0.9105) closs: 0.8061 (0.8337) time: 2.1190 data: 0.0002 max mem: 28042 +[08:10:36.576963] Epoch: [3] [470/812] lr: 0.000007 grad_norm: 0.8742 (0.9107) closs: 0.7998 (0.8334) time: 2.1212 data: 0.0002 max mem: 28042 +[08:10:57.844902] Epoch: [3] [480/812] lr: 0.000007 grad_norm: 0.8667 (0.9107) closs: 0.7924 (0.8329) time: 2.1269 data: 0.0002 max mem: 28042 +[08:11:19.115588] Epoch: [3] [490/812] lr: 0.000007 grad_norm: 0.8655 (0.9100) closs: 0.7802 (0.8330) time: 2.1269 data: 0.0002 max mem: 28042 +[08:11:40.448298] Epoch: [3] [500/812] lr: 0.000007 grad_norm: 0.8168 (0.9101) closs: 0.8021 (0.8326) time: 2.1301 data: 0.0005 max mem: 28042 +[08:12:01.736491] Epoch: [3] [510/812] lr: 0.000007 grad_norm: 0.8157 (0.9084) closs: 0.8268 (0.8339) time: 2.1310 data: 0.0004 max mem: 28042 +[08:12:22.852744] Epoch: [3] [520/812] lr: 0.000007 grad_norm: 0.8698 (0.9092) closs: 0.9299 (0.8346) time: 2.1201 data: 0.0002 max mem: 28042 +[08:12:44.116451] Epoch: [3] [530/812] lr: 0.000006 grad_norm: 0.8853 (0.9092) closs: 0.7947 (0.8340) time: 2.1189 data: 0.0002 max mem: 28042 +[08:13:05.415617] Epoch: [3] [540/812] lr: 0.000006 grad_norm: 0.8769 (0.9088) closs: 0.7747 (0.8337) time: 2.1281 data: 0.0002 max mem: 28042 +[08:13:26.702689] Epoch: [3] [550/812] lr: 0.000006 grad_norm: 0.8467 (0.9087) closs: 0.8074 (0.8329) time: 2.1292 data: 0.0002 max mem: 28042 +[08:13:47.965371] Epoch: [3] [560/812] lr: 0.000006 grad_norm: 0.8314 (0.9073) closs: 0.8062 (0.8323) time: 2.1274 data: 0.0002 max mem: 28042 +[08:14:09.281838] Epoch: [3] [570/812] lr: 0.000006 grad_norm: 0.8763 (0.9081) closs: 0.8191 (0.8329) time: 2.1289 data: 0.0002 max mem: 28042 +[08:14:30.423561] Epoch: [3] [580/812] lr: 0.000006 grad_norm: 0.9098 (0.9078) closs: 0.8414 (0.8333) time: 2.1228 data: 0.0002 max mem: 28042 +[08:14:51.732903] Epoch: [3] [590/812] lr: 0.000006 grad_norm: 0.8236 (0.9058) closs: 0.8080 (0.8328) time: 2.1225 data: 0.0002 max mem: 28042 +[08:15:13.032032] Epoch: [3] [600/812] lr: 0.000006 grad_norm: 0.8071 (0.9055) closs: 0.7610 (0.8326) time: 2.1303 data: 0.0002 max mem: 28042 +[08:15:34.338684] Epoch: [3] [610/812] lr: 0.000006 grad_norm: 0.8387 (0.9068) closs: 0.8489 (0.8333) time: 2.1302 data: 0.0002 max mem: 28042 +[08:15:55.610535] Epoch: [3] [620/812] lr: 0.000006 grad_norm: 0.8382 (0.9067) closs: 0.8438 (0.8335) time: 2.1289 data: 0.0002 max mem: 28042 +[08:16:16.889308] Epoch: [3] [630/812] lr: 0.000006 grad_norm: 0.8850 (0.9083) closs: 0.7955 (0.8327) time: 2.1275 data: 0.0002 max mem: 28042 +[08:16:38.059748] Epoch: [3] [640/812] lr: 0.000006 grad_norm: 0.8985 (0.9078) closs: 0.7955 (0.8326) time: 2.1224 data: 0.0002 max mem: 28042 +[08:16:59.340361] Epoch: [3] [650/812] lr: 0.000005 grad_norm: 0.8872 (0.9074) closs: 0.8339 (0.8337) time: 2.1225 data: 0.0002 max mem: 28042 +[08:17:20.632265] Epoch: [3] [660/812] lr: 0.000005 grad_norm: 0.8939 (0.9068) closs: 0.8938 (0.8344) time: 2.1286 data: 0.0002 max mem: 28042 +[08:17:41.842573] Epoch: [3] [670/812] lr: 0.000005 grad_norm: 0.8939 (0.9067) closs: 0.8269 (0.8338) time: 2.1250 data: 0.0002 max mem: 28042 +[08:18:03.079831] Epoch: [3] [680/812] lr: 0.000005 grad_norm: 0.8363 (0.9068) closs: 0.8251 (0.8344) time: 2.1223 data: 0.0002 max mem: 28042 +[08:18:24.338829] Epoch: [3] [690/812] lr: 0.000005 grad_norm: 0.8847 (0.9081) closs: 0.8305 (0.8341) time: 2.1247 data: 0.0002 max mem: 28042 +[08:18:45.483996] Epoch: [3] [700/812] lr: 0.000005 grad_norm: 0.8847 (0.9070) closs: 0.8262 (0.8343) time: 2.1201 data: 0.0002 max mem: 28042 
+[08:19:06.675603] Epoch: [3] [710/812] lr: 0.000005 grad_norm: 0.8213 (0.9068) closs: 0.8601 (0.8347) time: 2.1168 data: 0.0002 max mem: 28042 +[08:19:27.902064] Epoch: [3] [720/812] lr: 0.000005 grad_norm: 0.8174 (0.9057) closs: 0.8440 (0.8350) time: 2.1208 data: 0.0002 max mem: 28042 +[08:19:49.131969] Epoch: [3] [730/812] lr: 0.000005 grad_norm: 0.8260 (0.9056) closs: 0.8061 (0.8343) time: 2.1227 data: 0.0002 max mem: 28042 +[08:20:10.427471] Epoch: [3] [740/812] lr: 0.000005 grad_norm: 0.8716 (0.9058) closs: 0.8154 (0.8347) time: 2.1262 data: 0.0002 max mem: 28042 +[08:20:31.811740] Epoch: [3] [750/812] lr: 0.000005 grad_norm: 0.8902 (0.9072) closs: 0.8183 (0.8343) time: 2.1339 data: 0.0002 max mem: 28042 +[08:20:52.966245] Epoch: [3] [760/812] lr: 0.000005 grad_norm: 0.8902 (0.9094) closs: 0.8189 (0.8348) time: 2.1269 data: 0.0002 max mem: 28042 +[08:21:14.257344] Epoch: [3] [770/812] lr: 0.000005 grad_norm: 0.8943 (0.9086) closs: 0.8266 (0.8348) time: 2.1222 data: 0.0002 max mem: 28042 +[08:21:35.557339] Epoch: [3] [780/812] lr: 0.000005 grad_norm: 0.8404 (0.9086) closs: 0.8143 (0.8347) time: 2.1295 data: 0.0002 max mem: 28042 +[08:21:56.831315] Epoch: [3] [790/812] lr: 0.000005 grad_norm: 0.8876 (0.9092) closs: 0.8193 (0.8347) time: 2.1286 data: 0.0002 max mem: 28042 +[08:22:18.140914] Epoch: [3] [800/812] lr: 0.000005 grad_norm: 0.8628 (0.9097) closs: 0.8552 (0.8350) time: 2.1291 data: 0.0002 max mem: 28042 +[08:22:39.368185] Epoch: [3] [810/812] lr: 0.000005 grad_norm: 0.8627 (0.9094) closs: 0.9012 (0.8357) time: 2.1268 data: 0.0002 max mem: 28042 +[08:22:41.821347] Epoch: [3] Total time: 0:28:46 +[08:22:41.825202] Averaged stats: lr: 0.000005 grad_norm: 0.8627 (0.9092) closs: 0.9007 (0.8364) +[08:22:42.226562] model saved +[08:22:43.924087] optimizer saved +[08:22:43.925003] other rank-common saved +[08:22:43.931834] rank-specific saved +[08:22:43.932142] Training time 1:55:22 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..f050e55b07facfaec1e5a03de296bde911239050 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d5c15a2808412d27835f14c0fc05ef1fc310923302af1fb9c02c445a59b7304 +size 58162939 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..6aa599035681e620d1e8181b9ffd84a7bf151194 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f565b9183e251c048fae68754e0b1a015c651fd43b0cd71a3982f9b4d1077dd +size 130819127 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..e317414ddacd9d596ca17754b1fb347e8c620137 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:11ad37201918bfc6a749b9c14caf28e560ac54b89a2d0fa3be30e50c49fab9cd +size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git 
a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch0/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..331afc6d009c4c4bdce415e1e05fe0a7c6fd04f8 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c997936cd82af0a6012259bafc3d2e9e2a6c0acd90f8762ec953c18c052c8c4 +size 58162939 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..9f5c4bf62d29d14a7652cf8b73c31183e5b6b245 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:586da76697344d1409f6d0331176114e72fd47b98091fd6c258a4792eea01a82 +size 130819127 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..2c36cacd3ff477892660b4bdf6361cc4ab5bf40a --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6d32e2b269680e43657c7c735c0be6f8fd7672e839232b8381fccdf09d36792 +size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00001-of-00008.pth 
b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00007-of-00008.pth new file mode 100644 index 
0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch1/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e4a9509a579916c44cb1ef97e02d070092a0fe9a --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9b7acb10eef238d30c123f47b15af4c9c443ef414ab5f8737674883bf3d5189 +size 58162939 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e77163e87c30cafa46def5bdb669dc533a6005 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57e071734f60c90e438181c8ae0d2dd00a0cf7081ac674445bbeec548315a98a +size 130819127 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..e415f2ef15b0b4d08b397865c99738db07bf0a07 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3da9395be70ad1f7631e92aec6d4efc745e19ca78afde1004e4193976aea056d +size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ 
b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch2/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e71af1b7c315b37a4aea851461f8dcefbd6e27d7 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:71aece4919e4237bb2920f0dfd74e23ac51a04fd74a56ab8efcab06962a6d3cd +size 58162939 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..2da15d2c95c60871c58625ee883ee9fff7857cc6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8c23cd50d34ab5d423af68a8e4a2fa3ba11191f6cbd5146638cc478d9f3bb6 +size 130819127 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..b9125500778ab300ca76c74d5da10a548c4985af --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f931fea34d05be0494c0ab9718bce37ad922d058b64b820c329cd71d918f3a70 +size 1751 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff 
--git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/epoch3/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/log.txt b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..641deafc203ae506ccbc5ac592b83c4532ba0041 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/log.txt @@ -0,0 +1,4 @@ +{"train_lr": 2.49692118226601e-05, "train_grad_norm": 0.6733022602687916, "train_closs": 0.9640464621499695, "epoch": 0, "val_lr": 2.49692118226601e-05, "val_grad_norm": 0.6733022602687916, "val_closs": 0.9640464621499695} +{"train_lr": 4.6114274981403966e-05, "train_grad_norm": 0.5561584743299508, "train_closs": 0.9136834735638035, "epoch": 1, "val_lr": 4.6114274981403966e-05, "val_grad_norm": 0.5561584743299508, "val_closs": 0.9136834735638035} +{"train_lr": 2.751385467980297e-05, "train_grad_norm": 0.5519019591764276, "train_closs": 0.9010383364437102, "epoch": 2, "val_lr": 2.751385467980297e-05, "val_grad_norm": 0.5519019591764276, "val_closs": 0.9010383364437102} +{"train_lr": 8.899579698398978e-06, "train_grad_norm": 0.5554111024796082, "train_closs": 0.8950173630897561, "epoch": 3, "val_lr": 8.899579698398978e-06, "val_grad_norm": 0.5554111024796082, "val_closs": 0.8950173630897561} diff --git 
a/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/output.log b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/output.log new file mode 100644 index 0000000000000000000000000000000000000000..9d6d14604302702cadfc48b3ace8125e28a27c43 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B/output.log @@ -0,0 +1,648 @@ +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +| distributed init (rank 1): env://, gpu 1 +| distributed init (rank 6): env://, gpu 6 +| distributed init (rank 4): env://, gpu 4 +| distributed init (rank 7): env://, gpu 7 +| distributed init (rank 3): env://, gpu 3 +| distributed init (rank 0): env://, gpu 0 +| distributed init (rank 2): env://, gpu 2 +| distributed init (rank 5): env://, gpu 5 +[04:34:34.123451] > initializing model parallel with size 1 +[04:34:34.123679] > initializing ddp with size 8 +[04:34:34.123687] > initializing pipeline with size 1 +[04:34:34.273785] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory +[04:34:34.273908] Namespace(batch_size=8, +accum_iter=1, +llama_type='llama_peft', +llama_config=['../checkpoints/llama2/Llama-2-7b/params.json', +'configs/model/finetune/sg/llamaPeft_normBiasLora.json'], +no_visual=True, +tokenizer_path='../checkpoints/llama2/Llama-2-7b/tokenizer.model', +pretrained_path='../checkpoints/llama2/Llama-2-7b/', +pretrained_type='meta_ori', +weight_decay=0.02, +lr=5e-05, +min_lr=5e-06, +epochs=4, +warmup_epochs=1.0, +clip_grad=2, +max_words=512, +dialog=False, +data_config='configs/data/finetune/sg/alpaca.yaml', +output_dir='output/finetune/sg/alpaca_llamaPeft_normBiasLora_QF_512_7B', +log_dir='./output_dir', +save_interval=1, +only_save_trainable=True, +device='cuda', +seed=0, +resume='', +num_workers=24, +pin_mem=True, +world_size=8, +local_rank=-1, +dist_on_itp=False, +dist_url='env://', +model_parallel_size=1, +data_parallel='sdp', +precision='bf16', +checkpointing=True, +quant=True, +rank=0, +gpu=0, +distributed=True, +dist_backend='nccl') +[04:34:34.276112] Start initialization. +[04:34:34.276172] ## Processing on RANK 0. +[04:34:34.285967] Model Args: + ModelArgs(dim=4096, n_layers=32, n_heads=32, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=16, bias_tuning=True) +[04:35:23.173165] Model is Peft: True +[04:35:23.179946] Trainable parameter count : 41603072 (local rank), 41603072 (all). +[04:35:23.288029] ## Load pretrained from ../checkpoints/llama2/Llama-2-7b/ +[04:35:40.542782] ## Quantizing model to 4bit! 
+ Qunatization Process: 0%| | 0/839 [00:00 +[04:49:56.134049] Start training for 4 epochs +[04:49:56.141961] log_dir: ./output_dir +[04:50:00.993524] Epoch: [0] [0/812] lr: 0.000000 grad_norm: 1.6694 (1.6694) closs: 1.1100 (1.1100) time: 4.8504 data: 1.6775 max mem: 11767 +[04:50:13.467172] Epoch: [0] [10/812] lr: 0.000001 grad_norm: 1.7142 (1.6843) closs: 1.1138 (1.1233) time: 1.5749 data: 0.1527 max mem: 17666 +[04:50:25.645879] Epoch: [0] [20/812] lr: 0.000001 grad_norm: 1.6067 (1.6129) closs: 1.0717 (1.1035) time: 1.2325 data: 0.0002 max mem: 17666 +[04:50:37.815063] Epoch: [0] [30/812] lr: 0.000002 grad_norm: 1.6067 (1.6224) closs: 1.1138 (1.1220) time: 1.2173 data: 0.0002 max mem: 17666 +[04:50:50.097974] Epoch: [0] [40/812] lr: 0.000002 grad_norm: 1.5736 (1.5833) closs: 1.1291 (1.1294) time: 1.2225 data: 0.0001 max mem: 17666 +[04:51:02.303652] Epoch: [0] [50/812] lr: 0.000003 grad_norm: 1.3909 (1.5353) closs: 1.1291 (1.1312) time: 1.2244 data: 0.0002 max mem: 17666 +[04:51:14.652585] Epoch: [0] [60/812] lr: 0.000004 grad_norm: 1.2569 (1.4798) closs: 1.1374 (1.1263) time: 1.2277 data: 0.0002 max mem: 17666 +[04:51:26.911672] Epoch: [0] [70/812] lr: 0.000004 grad_norm: 1.1196 (1.4223) closs: 1.0760 (1.1289) time: 1.2303 data: 0.0002 max mem: 17666 +[04:51:39.110388] Epoch: [0] [80/812] lr: 0.000005 grad_norm: 0.9717 (1.3646) closs: 1.0850 (1.1260) time: 1.2228 data: 0.0002 max mem: 17666 +[04:51:51.365652] Epoch: [0] [90/812] lr: 0.000006 grad_norm: 0.8888 (1.3056) closs: 1.0433 (1.1139) time: 1.2226 data: 0.0002 max mem: 17666 +[04:52:03.587350] Epoch: [0] [100/812] lr: 0.000006 grad_norm: 0.7781 (1.2548) closs: 1.0227 (1.1095) time: 1.2238 data: 0.0002 max mem: 17666 +[04:52:15.847329] Epoch: [0] [110/812] lr: 0.000007 grad_norm: 0.7166 (1.2090) closs: 1.0220 (1.0972) time: 1.2240 data: 0.0002 max mem: 17666 +[04:52:28.053228] Epoch: [0] [120/812] lr: 0.000007 grad_norm: 0.7216 (1.1685) closs: 1.0220 (1.0963) time: 1.2232 data: 0.0002 max mem: 17666 +[04:52:40.306358] Epoch: [0] [130/812] lr: 0.000008 grad_norm: 0.7216 (1.1348) closs: 1.0442 (1.0918) time: 1.2229 data: 0.0002 max mem: 17666 +[04:52:52.564879] Epoch: [0] [140/812] lr: 0.000009 grad_norm: 0.6922 (1.1040) closs: 0.9522 (1.0824) time: 1.2255 data: 0.0002 max mem: 17666 +[04:53:04.777107] Epoch: [0] [150/812] lr: 0.000009 grad_norm: 0.6689 (1.0747) closs: 0.9576 (1.0768) time: 1.2235 data: 0.0002 max mem: 17666 +[04:53:17.031527] Epoch: [0] [160/812] lr: 0.000010 grad_norm: 0.6162 (1.0461) closs: 0.9987 (1.0734) time: 1.2233 data: 0.0002 max mem: 17666 +[04:53:29.214924] Epoch: [0] [170/812] lr: 0.000010 grad_norm: 0.6162 (1.0222) closs: 0.9767 (1.0659) time: 1.2218 data: 0.0002 max mem: 17666 +[04:53:41.435558] Epoch: [0] [180/812] lr: 0.000011 grad_norm: 0.6274 (0.9998) closs: 0.9133 (1.0574) time: 1.2201 data: 0.0002 max mem: 17666 +[04:53:53.641153] Epoch: [0] [190/812] lr: 0.000012 grad_norm: 0.5874 (0.9789) closs: 0.9174 (1.0514) time: 1.2212 data: 0.0002 max mem: 17666 +[04:54:05.868672] Epoch: [0] [200/812] lr: 0.000012 grad_norm: 0.5874 (0.9605) closs: 0.9700 (1.0485) time: 1.2216 data: 0.0002 max mem: 17666 +[04:54:18.107142] Epoch: [0] [210/812] lr: 0.000013 grad_norm: 0.5872 (0.9425) closs: 0.9847 (1.0443) time: 1.2232 data: 0.0002 max mem: 17666 +[04:54:30.330679] Epoch: [0] [220/812] lr: 0.000014 grad_norm: 0.5699 (0.9259) closs: 0.9661 (1.0414) time: 1.2230 data: 0.0001 max mem: 17666 +[04:54:42.605734] Epoch: [0] [230/812] lr: 0.000014 grad_norm: 0.5536 (0.9112) closs: 0.9191 (1.0351) time: 1.2249 data: 
0.0001 max mem: 17666 +[04:54:54.840846] Epoch: [0] [240/812] lr: 0.000015 grad_norm: 0.5524 (0.8976) closs: 0.9078 (1.0315) time: 1.2254 data: 0.0002 max mem: 17666 +[04:55:07.114906] Epoch: [0] [250/812] lr: 0.000015 grad_norm: 0.5982 (0.8887) closs: 0.9491 (1.0288) time: 1.2254 data: 0.0002 max mem: 17666 +[04:55:19.357896] Epoch: [0] [260/812] lr: 0.000016 grad_norm: 0.5592 (0.8752) closs: 0.9431 (1.0253) time: 1.2258 data: 0.0002 max mem: 17666 +[04:55:31.788387] Epoch: [0] [270/812] lr: 0.000017 grad_norm: 0.5482 (0.8643) closs: 0.9608 (1.0244) time: 1.2336 data: 0.0002 max mem: 17666 +[04:55:44.053664] Epoch: [0] [280/812] lr: 0.000017 grad_norm: 0.5581 (0.8548) closs: 0.9608 (1.0214) time: 1.2347 data: 0.0001 max mem: 17666 +[04:55:56.289359] Epoch: [0] [290/812] lr: 0.000018 grad_norm: 0.5606 (0.8459) closs: 0.9531 (1.0183) time: 1.2250 data: 0.0001 max mem: 17666 +[04:56:08.574032] Epoch: [0] [300/812] lr: 0.000018 grad_norm: 0.5730 (0.8372) closs: 0.9457 (1.0158) time: 1.2260 data: 0.0001 max mem: 17666 +[04:56:20.819959] Epoch: [0] [310/812] lr: 0.000019 grad_norm: 0.5730 (0.8287) closs: 0.9613 (1.0152) time: 1.2265 data: 0.0002 max mem: 17666 +[04:56:33.077045] Epoch: [0] [320/812] lr: 0.000020 grad_norm: 0.5720 (0.8213) closs: 0.9613 (1.0149) time: 1.2251 data: 0.0002 max mem: 17666 +[04:56:45.304449] Epoch: [0] [330/812] lr: 0.000020 grad_norm: 0.5427 (0.8132) closs: 0.9252 (1.0109) time: 1.2241 data: 0.0002 max mem: 17666 +[04:56:57.563745] Epoch: [0] [340/812] lr: 0.000021 grad_norm: 0.5427 (0.8059) closs: 0.9062 (1.0078) time: 1.2242 data: 0.0002 max mem: 17666 +[04:57:09.823556] Epoch: [0] [350/812] lr: 0.000022 grad_norm: 0.5474 (0.7994) closs: 0.9406 (1.0088) time: 1.2259 data: 0.0002 max mem: 17666 +[04:57:22.044133] Epoch: [0] [360/812] lr: 0.000022 grad_norm: 0.5634 (0.7934) closs: 0.9744 (1.0071) time: 1.2240 data: 0.0002 max mem: 17666 +[04:57:34.303848] Epoch: [0] [370/812] lr: 0.000023 grad_norm: 0.5715 (0.7872) closs: 0.9075 (1.0046) time: 1.2239 data: 0.0002 max mem: 17666 +[04:57:46.512890] Epoch: [0] [380/812] lr: 0.000023 grad_norm: 0.5607 (0.7815) closs: 0.9075 (1.0032) time: 1.2234 data: 0.0002 max mem: 17666 +[04:57:58.775654] Epoch: [0] [390/812] lr: 0.000024 grad_norm: 0.5731 (0.7765) closs: 0.9216 (1.0017) time: 1.2235 data: 0.0002 max mem: 17666 +[04:58:11.005032] Epoch: [0] [400/812] lr: 0.000025 grad_norm: 0.5846 (0.7721) closs: 0.9061 (1.0003) time: 1.2245 data: 0.0002 max mem: 17666 +[04:58:23.252387] Epoch: [0] [410/812] lr: 0.000025 grad_norm: 0.5909 (0.7688) closs: 0.9044 (0.9990) time: 1.2238 data: 0.0002 max mem: 17666 +[04:58:35.540811] Epoch: [0] [420/812] lr: 0.000026 grad_norm: 0.5704 (0.7638) closs: 0.8909 (0.9968) time: 1.2267 data: 0.0002 max mem: 17666 +[04:58:47.754683] Epoch: [0] [430/812] lr: 0.000026 grad_norm: 0.5641 (0.7602) closs: 0.8689 (0.9940) time: 1.2250 data: 0.0002 max mem: 17666 +[04:59:00.034244] Epoch: [0] [440/812] lr: 0.000027 grad_norm: 0.5588 (0.7557) closs: 0.8496 (0.9914) time: 1.2246 data: 0.0002 max mem: 17666 +[04:59:12.243723] Epoch: [0] [450/812] lr: 0.000028 grad_norm: 0.5588 (0.7518) closs: 0.8680 (0.9887) time: 1.2244 data: 0.0002 max mem: 17666 +[04:59:24.510184] Epoch: [0] [460/812] lr: 0.000028 grad_norm: 0.5679 (0.7476) closs: 0.8964 (0.9874) time: 1.2237 data: 0.0002 max mem: 17666 +[04:59:36.736038] Epoch: [0] [470/812] lr: 0.000029 grad_norm: 0.5676 (0.7444) closs: 0.9041 (0.9869) time: 1.2245 data: 0.0002 max mem: 17666 +[04:59:48.991058] Epoch: [0] [480/812] lr: 0.000030 grad_norm: 0.5570 
(0.7411) closs: 0.9610 (0.9865) time: 1.2240 data: 0.0002 max mem: 17666 +[05:00:01.241427] Epoch: [0] [490/812] lr: 0.000030 grad_norm: 0.5518 (0.7368) closs: 0.9518 (0.9857) time: 1.2252 data: 0.0002 max mem: 17666 +[05:00:13.456707] Epoch: [0] [500/812] lr: 0.000031 grad_norm: 0.5348 (0.7332) closs: 0.9462 (0.9844) time: 1.2232 data: 0.0002 max mem: 17666 +[05:00:25.704700] Epoch: [0] [510/812] lr: 0.000031 grad_norm: 0.5677 (0.7322) closs: 0.9125 (0.9823) time: 1.2231 data: 0.0002 max mem: 17666 +[05:00:37.923520] Epoch: [0] [520/812] lr: 0.000032 grad_norm: 0.5883 (0.7326) closs: 0.8610 (0.9811) time: 1.2233 data: 0.0002 max mem: 17666 +[05:00:50.181172] Epoch: [0] [530/812] lr: 0.000033 grad_norm: 0.5724 (0.7299) closs: 0.9259 (0.9802) time: 1.2238 data: 0.0002 max mem: 17666 +[05:01:02.403659] Epoch: [0] [540/812] lr: 0.000033 grad_norm: 0.5683 (0.7269) closs: 0.9560 (0.9795) time: 1.2239 data: 0.0002 max mem: 17666 +[05:01:14.663952] Epoch: [0] [550/812] lr: 0.000034 grad_norm: 0.5575 (0.7241) closs: 0.9308 (0.9781) time: 1.2241 data: 0.0002 max mem: 17666 +[05:01:26.950895] Epoch: [0] [560/812] lr: 0.000034 grad_norm: 0.5575 (0.7212) closs: 0.8756 (0.9775) time: 1.2273 data: 0.0003 max mem: 17666 +[05:01:39.165892] Epoch: [0] [570/812] lr: 0.000035 grad_norm: 0.5441 (0.7187) closs: 0.9025 (0.9766) time: 1.2250 data: 0.0003 max mem: 17666 +[05:01:51.405903] Epoch: [0] [580/812] lr: 0.000036 grad_norm: 0.5507 (0.7162) closs: 0.9051 (0.9764) time: 1.2227 data: 0.0002 max mem: 17666 +[05:02:03.610466] Epoch: [0] [590/812] lr: 0.000036 grad_norm: 0.5490 (0.7134) closs: 0.9051 (0.9755) time: 1.2222 data: 0.0002 max mem: 17666 +[05:02:15.849292] Epoch: [0] [600/812] lr: 0.000037 grad_norm: 0.5563 (0.7107) closs: 0.9230 (0.9747) time: 1.2221 data: 0.0002 max mem: 17666 +[05:02:28.073108] Epoch: [0] [610/812] lr: 0.000038 grad_norm: 0.5737 (0.7087) closs: 0.8677 (0.9726) time: 1.2231 data: 0.0003 max mem: 17666 +[05:02:40.301764] Epoch: [0] [620/812] lr: 0.000038 grad_norm: 0.5858 (0.7066) closs: 0.8455 (0.9718) time: 1.2226 data: 0.0003 max mem: 17666 +[05:02:52.540493] Epoch: [0] [630/812] lr: 0.000039 grad_norm: 0.5850 (0.7049) closs: 0.8993 (0.9707) time: 1.2233 data: 0.0002 max mem: 17666 +[05:03:04.773003] Epoch: [0] [640/812] lr: 0.000039 grad_norm: 0.5764 (0.7030) closs: 0.9169 (0.9703) time: 1.2235 data: 0.0002 max mem: 17666 +[05:03:17.077033] Epoch: [0] [650/812] lr: 0.000040 grad_norm: 0.5656 (0.7012) closs: 0.9087 (0.9689) time: 1.2268 data: 0.0002 max mem: 17666 +[05:03:29.323502] Epoch: [0] [660/812] lr: 0.000041 grad_norm: 0.5623 (0.6993) closs: 0.8749 (0.9684) time: 1.2274 data: 0.0002 max mem: 17666 +[05:03:41.591619] Epoch: [0] [670/812] lr: 0.000041 grad_norm: 0.5259 (0.6967) closs: 0.8953 (0.9678) time: 1.2257 data: 0.0002 max mem: 17666 +[05:03:53.817646] Epoch: [0] [680/812] lr: 0.000042 grad_norm: 0.5327 (0.6945) closs: 0.9142 (0.9673) time: 1.2246 data: 0.0002 max mem: 17666 +[05:04:06.075349] Epoch: [0] [690/812] lr: 0.000042 grad_norm: 0.5405 (0.6927) closs: 0.9243 (0.9669) time: 1.2241 data: 0.0002 max mem: 17666 +[05:04:18.348610] Epoch: [0] [700/812] lr: 0.000043 grad_norm: 0.5495 (0.6909) closs: 0.9243 (0.9666) time: 1.2265 data: 0.0002 max mem: 17666 +[05:04:30.558726] Epoch: [0] [710/812] lr: 0.000044 grad_norm: 0.5495 (0.6890) closs: 0.8956 (0.9654) time: 1.2241 data: 0.0002 max mem: 17666 +[05:04:42.827196] Epoch: [0] [720/812] lr: 0.000044 grad_norm: 0.5697 (0.6876) closs: 0.8870 (0.9649) time: 1.2239 data: 0.0002 max mem: 17666 +[05:04:55.071487] Epoch: 
[0] [730/812] lr: 0.000045 grad_norm: 0.5697 (0.6858) closs: 0.8934 (0.9640) time: 1.2256 data: 0.0002 max mem: 17666 +[05:05:07.340719] Epoch: [0] [740/812] lr: 0.000046 grad_norm: 0.5596 (0.6840) closs: 0.8871 (0.9627) time: 1.2256 data: 0.0002 max mem: 17666 +[05:05:19.598714] Epoch: [0] [750/812] lr: 0.000046 grad_norm: 0.5451 (0.6823) closs: 0.9205 (0.9636) time: 1.2263 data: 0.0002 max mem: 17666 +[05:05:31.859047] Epoch: [0] [760/812] lr: 0.000047 grad_norm: 0.5443 (0.6806) closs: 0.9429 (0.9627) time: 1.2258 data: 0.0002 max mem: 17666 +[05:05:44.113014] Epoch: [0] [770/812] lr: 0.000047 grad_norm: 0.5571 (0.6793) closs: 0.9180 (0.9618) time: 1.2256 data: 0.0002 max mem: 17666 +[05:05:56.310853] Epoch: [0] [780/812] lr: 0.000048 grad_norm: 0.5778 (0.6778) closs: 0.9180 (0.9614) time: 1.2225 data: 0.0001 max mem: 17666 +[05:06:08.553058] Epoch: [0] [790/812] lr: 0.000049 grad_norm: 0.5687 (0.6765) closs: 0.8819 (0.9604) time: 1.2219 data: 0.0001 max mem: 17666 +[05:06:20.746249] Epoch: [0] [800/812] lr: 0.000049 grad_norm: 0.5436 (0.6748) closs: 0.8819 (0.9599) time: 1.2217 data: 0.0001 max mem: 17666 +[05:06:33.006020] Epoch: [0] [810/812] lr: 0.000050 grad_norm: 0.5421 (0.6735) closs: 0.8738 (0.9586) time: 1.2226 data: 0.0001 max mem: 17666 +[05:06:34.434061] Epoch: [0] Total time: 0:16:38 +[05:06:34.435967] Averaged stats: lr: 0.000050 grad_norm: 0.5358 (0.6733) closs: 0.8738 (0.9640) +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. 
``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[05:06:34.685369] model saved +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[05:06:36.004693] optimizer saved +[05:06:36.005298] other rank-common saved +[05:06:36.008307] rank-specific saved +[05:06:36.016492] log_dir: ./output_dir +[05:06:38.409445] Epoch: [1] [0/812] lr: 0.000050 grad_norm: 0.5716 (0.5716) closs: 0.7068 (0.7068) time: 2.3920 data: 1.1581 max mem: 17666 +[05:06:50.657381] Epoch: [1] [10/812] lr: 0.000050 grad_norm: 0.5716 (0.5743) closs: 0.8450 (0.8412) time: 1.3308 data: 0.1054 max mem: 17666 +[05:07:02.826034] Epoch: [1] [20/812] lr: 0.000050 grad_norm: 0.5748 (0.5730) closs: 0.8714 (0.9120) time: 1.2208 data: 0.0001 max mem: 17666 +[05:07:14.934821] Epoch: [1] [30/812] lr: 0.000050 grad_norm: 0.5622 (0.5677) closs: 0.9029 (0.9080) time: 1.2138 data: 0.0001 max mem: 17666 +[05:07:27.078058] Epoch: [1] [40/812] lr: 0.000050 grad_norm: 0.5479 (0.5654) closs: 0.9393 (0.9264) time: 1.2125 data: 0.0001 max mem: 17666 +[05:07:39.214844] Epoch: [1] [50/812] lr: 0.000050 grad_norm: 0.5380 (0.5637) closs: 0.9393 (0.9220) time: 1.2139 data: 0.0001 max mem: 17666 +[05:07:51.352136] Epoch: [1] [60/812] lr: 0.000050 grad_norm: 0.5607 (0.5649) closs: 0.9001 (0.9198) time: 1.2136 data: 0.0001 max mem: 17666 +[05:08:03.517114] Epoch: [1] [70/812] lr: 0.000050 grad_norm: 0.5700 (0.5683) closs: 0.9085 (0.9224) time: 1.2150 data: 0.0001 max mem: 17666 +[05:08:15.729009] Epoch: [1] [80/812] lr: 0.000050 grad_norm: 0.5498 (0.5672) closs: 0.9337 (0.9186) time: 1.2188 data: 0.0001 max mem: 17666 +[05:08:27.942413] Epoch: [1] [90/812] lr: 0.000050 grad_norm: 0.5372 (0.5618) closs: 0.9337 (0.9233) time: 1.2212 data: 0.0002 max mem: 17666 +[05:08:40.177210] Epoch: [1] [100/812] lr: 0.000050 grad_norm: 0.5399 (0.5635) closs: 0.9525 (0.9257) time: 1.2223 data: 0.0002 max mem: 17666 +[05:08:52.395851] Epoch: [1] [110/812] lr: 0.000050 grad_norm: 0.5470 (0.5630) closs: 0.9445 (0.9224) time: 1.2226 data: 0.0002 max mem: 17666 +[05:09:04.620910] Epoch: [1] [120/812] lr: 0.000050 grad_norm: 0.5660 (0.5649) closs: 0.9445 (0.9235) time: 1.2221 data: 0.0002 max mem: 17666 +[05:09:16.823910] Epoch: [1] [130/812] lr: 0.000050 grad_norm: 0.5666 (0.5642) closs: 0.9266 (0.9232) time: 1.2213 data: 0.0002 max mem: 17666 +[05:09:29.044814] Epoch: [1] [140/812] lr: 0.000050 grad_norm: 0.5469 (0.5636) closs: 0.9191 (0.9222) time: 1.2211 data: 0.0002 max mem: 17666 +[05:09:41.250901] Epoch: [1] [150/812] lr: 0.000050 grad_norm: 0.5427 (0.5631) closs: 0.9318 (0.9220) time: 1.2213 data: 0.0002 max mem: 17666 +[05:09:53.461830] Epoch: [1] [160/812] lr: 0.000050 grad_norm: 0.5499 (0.5632) closs: 0.9413 (0.9241) time: 1.2208 data: 0.0002 max mem: 17666 +[05:10:05.704359] Epoch: [1] [170/812] lr: 0.000049 grad_norm: 0.5499 (0.5628) 
closs: 0.9694 (0.9263) time: 1.2226 data: 0.0002 max mem: 17666 +[05:10:17.913859] Epoch: [1] [180/812] lr: 0.000049 grad_norm: 0.5187 (0.5618) closs: 0.9341 (0.9267) time: 1.2225 data: 0.0002 max mem: 17666 +[05:10:30.152053] Epoch: [1] [190/812] lr: 0.000049 grad_norm: 0.5289 (0.5600) closs: 0.8866 (0.9248) time: 1.2223 data: 0.0002 max mem: 17666 +[05:10:42.370346] Epoch: [1] [200/812] lr: 0.000049 grad_norm: 0.5296 (0.5589) closs: 0.8866 (0.9232) time: 1.2228 data: 0.0002 max mem: 17666 +[05:10:54.621739] Epoch: [1] [210/812] lr: 0.000049 grad_norm: 0.5406 (0.5576) closs: 0.9429 (0.9253) time: 1.2234 data: 0.0002 max mem: 17666 +[05:11:06.854883] Epoch: [1] [220/812] lr: 0.000049 grad_norm: 0.5362 (0.5573) closs: 0.9438 (0.9241) time: 1.2242 data: 0.0002 max mem: 17666 +[05:11:19.064849] Epoch: [1] [230/812] lr: 0.000049 grad_norm: 0.5260 (0.5562) closs: 0.8665 (0.9228) time: 1.2221 data: 0.0002 max mem: 17666 +[05:11:31.300169] Epoch: [1] [240/812] lr: 0.000049 grad_norm: 0.5371 (0.5563) closs: 0.8964 (0.9237) time: 1.2222 data: 0.0001 max mem: 17666 +[05:11:43.496321] Epoch: [1] [250/812] lr: 0.000049 grad_norm: 0.5503 (0.5565) closs: 0.9451 (0.9228) time: 1.2215 data: 0.0001 max mem: 17666 +[05:11:55.764297] Epoch: [1] [260/812] lr: 0.000049 grad_norm: 0.5451 (0.5569) closs: 0.9342 (0.9233) time: 1.2231 data: 0.0002 max mem: 17666 +[05:12:07.971728] Epoch: [1] [270/812] lr: 0.000049 grad_norm: 0.5451 (0.5568) closs: 0.9317 (0.9219) time: 1.2237 data: 0.0002 max mem: 17666 +[05:12:20.187157] Epoch: [1] [280/812] lr: 0.000049 grad_norm: 0.5521 (0.5574) closs: 0.9182 (0.9220) time: 1.2211 data: 0.0002 max mem: 17666 +[05:12:32.426547] Epoch: [1] [290/812] lr: 0.000048 grad_norm: 0.5328 (0.5572) closs: 0.9268 (0.9225) time: 1.2227 data: 0.0002 max mem: 17666 +[05:12:44.641825] Epoch: [1] [300/812] lr: 0.000048 grad_norm: 0.5328 (0.5566) closs: 0.8957 (0.9212) time: 1.2227 data: 0.0002 max mem: 17666 +[05:12:56.872549] Epoch: [1] [310/812] lr: 0.000048 grad_norm: 0.5379 (0.5574) closs: 0.8875 (0.9203) time: 1.2222 data: 0.0002 max mem: 17666 +[05:13:09.087618] Epoch: [1] [320/812] lr: 0.000048 grad_norm: 0.5465 (0.5576) closs: 0.8875 (0.9207) time: 1.2222 data: 0.0002 max mem: 17666 +[05:13:21.321873] Epoch: [1] [330/812] lr: 0.000048 grad_norm: 0.5207 (0.5569) closs: 0.8802 (0.9200) time: 1.2224 data: 0.0002 max mem: 17666 +[05:13:33.555776] Epoch: [1] [340/812] lr: 0.000048 grad_norm: 0.5206 (0.5569) closs: 0.8556 (0.9179) time: 1.2233 data: 0.0002 max mem: 17666 +[05:13:45.822566] Epoch: [1] [350/812] lr: 0.000048 grad_norm: 0.5441 (0.5565) closs: 0.8780 (0.9175) time: 1.2250 data: 0.0001 max mem: 17666 +[05:13:58.058935] Epoch: [1] [360/812] lr: 0.000048 grad_norm: 0.5441 (0.5563) closs: 0.8524 (0.9153) time: 1.2251 data: 0.0001 max mem: 17666 +[05:14:10.306817] Epoch: [1] [370/812] lr: 0.000047 grad_norm: 0.5542 (0.5571) closs: 0.8262 (0.9143) time: 1.2241 data: 0.0002 max mem: 17666 +[05:14:22.557391] Epoch: [1] [380/812] lr: 0.000047 grad_norm: 0.5502 (0.5566) closs: 0.8911 (0.9145) time: 1.2249 data: 0.0001 max mem: 17666 +[05:14:34.775480] Epoch: [1] [390/812] lr: 0.000047 grad_norm: 0.5379 (0.5564) closs: 0.9057 (0.9150) time: 1.2234 data: 0.0001 max mem: 17666 +[05:14:47.019726] Epoch: [1] [400/812] lr: 0.000047 grad_norm: 0.5379 (0.5560) closs: 0.9057 (0.9147) time: 1.2231 data: 0.0001 max mem: 17666 +[05:14:59.254412] Epoch: [1] [410/812] lr: 0.000047 grad_norm: 0.5308 (0.5556) closs: 0.9153 (0.9160) time: 1.2239 data: 0.0001 max mem: 17666 +[05:15:11.483844] Epoch: [1] 
[420/812] lr: 0.000047 grad_norm: 0.5308 (0.5551) closs: 0.9592 (0.9165) time: 1.2231 data: 0.0001 max mem: 17666 +[05:15:23.696028] Epoch: [1] [430/812] lr: 0.000047 grad_norm: 0.5458 (0.5561) closs: 0.8909 (0.9157) time: 1.2220 data: 0.0002 max mem: 17666 +[05:15:35.910601] Epoch: [1] [440/812] lr: 0.000046 grad_norm: 0.5484 (0.5561) closs: 0.9059 (0.9157) time: 1.2213 data: 0.0002 max mem: 17666 +[05:15:48.141787] Epoch: [1] [450/812] lr: 0.000046 grad_norm: 0.5484 (0.5563) closs: 0.9138 (0.9153) time: 1.2222 data: 0.0002 max mem: 17666 +[05:16:00.351582] Epoch: [1] [460/812] lr: 0.000046 grad_norm: 0.5362 (0.5557) closs: 0.8589 (0.9147) time: 1.2220 data: 0.0002 max mem: 17666 +[05:16:12.585800] Epoch: [1] [470/812] lr: 0.000046 grad_norm: 0.5292 (0.5555) closs: 0.8808 (0.9147) time: 1.2221 data: 0.0002 max mem: 17666 +[05:16:24.813078] Epoch: [1] [480/812] lr: 0.000046 grad_norm: 0.5494 (0.5564) closs: 0.8981 (0.9153) time: 1.2230 data: 0.0002 max mem: 17666 +[05:16:37.044404] Epoch: [1] [490/812] lr: 0.000046 grad_norm: 0.5662 (0.5563) closs: 0.9437 (0.9160) time: 1.2229 data: 0.0002 max mem: 17666 +[05:16:49.306306] Epoch: [1] [500/812] lr: 0.000045 grad_norm: 0.5418 (0.5562) closs: 0.9002 (0.9156) time: 1.2246 data: 0.0002 max mem: 17666 +[05:17:01.504233] Epoch: [1] [510/812] lr: 0.000045 grad_norm: 0.5569 (0.5596) closs: 0.8913 (0.9152) time: 1.2229 data: 0.0001 max mem: 17666 +[05:17:13.732894] Epoch: [1] [520/812] lr: 0.000045 grad_norm: 0.5569 (0.5594) closs: 0.9053 (0.9152) time: 1.2213 data: 0.0002 max mem: 17666 +[05:17:25.950468] Epoch: [1] [530/812] lr: 0.000045 grad_norm: 0.5472 (0.5591) closs: 0.8929 (0.9153) time: 1.2222 data: 0.0001 max mem: 17666 +[05:17:38.187123] Epoch: [1] [540/812] lr: 0.000045 grad_norm: 0.5325 (0.5590) closs: 0.8836 (0.9154) time: 1.2226 data: 0.0001 max mem: 17666 +[05:17:50.416363] Epoch: [1] [550/812] lr: 0.000045 grad_norm: 0.5393 (0.5585) closs: 0.8734 (0.9148) time: 1.2232 data: 0.0002 max mem: 17666 +[05:18:02.631787] Epoch: [1] [560/812] lr: 0.000044 grad_norm: 0.5416 (0.5586) closs: 0.9068 (0.9150) time: 1.2222 data: 0.0002 max mem: 17666 +[05:18:14.860124] Epoch: [1] [570/812] lr: 0.000044 grad_norm: 0.5634 (0.5589) closs: 0.9202 (0.9154) time: 1.2221 data: 0.0002 max mem: 17666 +[05:18:27.083101] Epoch: [1] [580/812] lr: 0.000044 grad_norm: 0.5520 (0.5588) closs: 0.9180 (0.9145) time: 1.2225 data: 0.0002 max mem: 17666 +[05:18:39.321277] Epoch: [1] [590/812] lr: 0.000044 grad_norm: 0.5561 (0.5592) closs: 0.9180 (0.9147) time: 1.2230 data: 0.0002 max mem: 17666 +[05:18:51.538167] Epoch: [1] [600/812] lr: 0.000044 grad_norm: 0.5392 (0.5588) closs: 0.9240 (0.9150) time: 1.2227 data: 0.0002 max mem: 17666 +[05:19:03.786880] Epoch: [1] [610/812] lr: 0.000043 grad_norm: 0.5296 (0.5585) closs: 0.8944 (0.9151) time: 1.2232 data: 0.0001 max mem: 17666 +[05:19:16.029554] Epoch: [1] [620/812] lr: 0.000043 grad_norm: 0.5438 (0.5580) closs: 0.9104 (0.9152) time: 1.2245 data: 0.0002 max mem: 17666 +[05:19:28.287030] Epoch: [1] [630/812] lr: 0.000043 grad_norm: 0.5312 (0.5574) closs: 0.9339 (0.9155) time: 1.2249 data: 0.0002 max mem: 17666 +[05:19:40.534557] Epoch: [1] [640/812] lr: 0.000043 grad_norm: 0.5312 (0.5572) closs: 0.9009 (0.9155) time: 1.2252 data: 0.0002 max mem: 17666 +[05:19:52.747120] Epoch: [1] [650/812] lr: 0.000043 grad_norm: 0.5282 (0.5566) closs: 0.8875 (0.9153) time: 1.2229 data: 0.0001 max mem: 17666 +[05:20:04.986132] Epoch: [1] [660/812] lr: 0.000042 grad_norm: 0.5202 (0.5564) closs: 0.9420 (0.9163) time: 1.2225 data: 0.0001 
max mem: 17666 +[05:20:17.207197] Epoch: [1] [670/812] lr: 0.000042 grad_norm: 0.5224 (0.5563) closs: 0.9829 (0.9170) time: 1.2229 data: 0.0002 max mem: 17666 +[05:20:29.408267] Epoch: [1] [680/812] lr: 0.000042 grad_norm: 0.5301 (0.5566) closs: 0.9759 (0.9177) time: 1.2210 data: 0.0002 max mem: 17666 +[05:20:41.615333] Epoch: [1] [690/812] lr: 0.000042 grad_norm: 0.5389 (0.5570) closs: 0.9759 (0.9181) time: 1.2203 data: 0.0002 max mem: 17666 +[05:20:53.843553] Epoch: [1] [700/812] lr: 0.000041 grad_norm: 0.5477 (0.5568) closs: 0.9354 (0.9181) time: 1.2217 data: 0.0002 max mem: 17666 +[05:21:06.076892] Epoch: [1] [710/812] lr: 0.000041 grad_norm: 0.5477 (0.5567) closs: 0.8810 (0.9178) time: 1.2230 data: 0.0002 max mem: 17666 +[05:21:18.276846] Epoch: [1] [720/812] lr: 0.000041 grad_norm: 0.5319 (0.5565) closs: 0.8641 (0.9173) time: 1.2216 data: 0.0001 max mem: 17666 +[05:21:30.499197] Epoch: [1] [730/812] lr: 0.000041 grad_norm: 0.5319 (0.5562) closs: 0.9143 (0.9173) time: 1.2211 data: 0.0001 max mem: 17666 +[05:21:42.697532] Epoch: [1] [740/812] lr: 0.000041 grad_norm: 0.5363 (0.5563) closs: 0.9228 (0.9173) time: 1.2210 data: 0.0001 max mem: 17666 +[05:21:54.912903] Epoch: [1] [750/812] lr: 0.000040 grad_norm: 0.5423 (0.5559) closs: 0.8785 (0.9170) time: 1.2206 data: 0.0001 max mem: 17666 +[05:22:07.135892] Epoch: [1] [760/812] lr: 0.000040 grad_norm: 0.5280 (0.5560) closs: 0.8854 (0.9174) time: 1.2219 data: 0.0002 max mem: 17666 +[05:22:19.333692] Epoch: [1] [770/812] lr: 0.000040 grad_norm: 0.5221 (0.5555) closs: 0.9338 (0.9169) time: 1.2210 data: 0.0001 max mem: 17666 +[05:22:31.551464] Epoch: [1] [780/812] lr: 0.000040 grad_norm: 0.5360 (0.5559) closs: 0.8506 (0.9166) time: 1.2207 data: 0.0001 max mem: 17666 +[05:22:43.738476] Epoch: [1] [790/812] lr: 0.000039 grad_norm: 0.5617 (0.5560) closs: 0.8934 (0.9169) time: 1.2202 data: 0.0001 max mem: 17666 +[05:22:55.936330] Epoch: [1] [800/812] lr: 0.000039 grad_norm: 0.5488 (0.5560) closs: 0.9056 (0.9165) time: 1.2192 data: 0.0001 max mem: 17666 +[05:23:08.115449] Epoch: [1] [810/812] lr: 0.000039 grad_norm: 0.5498 (0.5563) closs: 0.9065 (0.9165) time: 1.2188 data: 0.0001 max mem: 17666 +[05:23:09.610496] Epoch: [1] Total time: 0:16:33 +[05:23:09.633077] Averaged stats: lr: 0.000039 grad_norm: 0.5498 (0.5562) closs: 0.9187 (0.9137) +[05:23:09.936052] model saved +[05:23:11.238391] optimizer saved +[05:23:11.238903] other rank-common saved +[05:23:11.241868] rank-specific saved +[05:23:11.249988] log_dir: ./output_dir +[05:23:13.663887] Epoch: [2] [0/812] lr: 0.000039 grad_norm: 0.7125 (0.7125) closs: 0.8745 (0.8745) time: 2.4131 data: 1.1672 max mem: 17666 +[05:23:25.915941] Epoch: [2] [10/812] lr: 0.000038 grad_norm: 0.5159 (0.5531) closs: 0.8802 (0.8976) time: 1.3331 data: 0.1062 max mem: 17666 +[05:23:38.174870] Epoch: [2] [20/812] lr: 0.000038 grad_norm: 0.5206 (0.5430) closs: 0.8802 (0.8902) time: 1.2255 data: 0.0002 max mem: 17666 +[05:23:50.412310] Epoch: [2] [30/812] lr: 0.000038 grad_norm: 0.5260 (0.5434) closs: 0.8835 (0.8971) time: 1.2247 data: 0.0002 max mem: 17666 +[05:24:02.710838] Epoch: [2] [40/812] lr: 0.000038 grad_norm: 0.5309 (0.5425) closs: 0.8968 (0.8971) time: 1.2267 data: 0.0002 max mem: 17666 +[05:24:15.054535] Epoch: [2] [50/812] lr: 0.000037 grad_norm: 0.5321 (0.5424) closs: 0.9001 (0.8951) time: 1.2320 data: 0.0004 max mem: 17666 +[05:24:27.318243] Epoch: [2] [60/812] lr: 0.000037 grad_norm: 0.5432 (0.5419) closs: 0.9220 (0.8985) time: 1.2303 data: 0.0004 max mem: 17666 +[05:24:39.554150] Epoch: [2] [70/812] lr: 
0.000037 grad_norm: 0.5432 (0.5457) closs: 0.9125 (0.9035) time: 1.2249 data: 0.0002 max mem: 17666 +[05:24:51.781126] Epoch: [2] [80/812] lr: 0.000037 grad_norm: 0.5337 (0.5448) closs: 0.8991 (0.9013) time: 1.2231 data: 0.0002 max mem: 17666 +[05:25:03.998186] Epoch: [2] [90/812] lr: 0.000036 grad_norm: 0.5328 (0.5431) closs: 0.8633 (0.8985) time: 1.2221 data: 0.0002 max mem: 17666 +[05:25:16.195929] Epoch: [2] [100/812] lr: 0.000036 grad_norm: 0.5328 (0.5437) closs: 0.8633 (0.8997) time: 1.2207 data: 0.0002 max mem: 17666 +[05:25:28.433549] Epoch: [2] [110/812] lr: 0.000036 grad_norm: 0.5378 (0.5440) closs: 0.8652 (0.8944) time: 1.2217 data: 0.0002 max mem: 17666 +[05:25:40.630768] Epoch: [2] [120/812] lr: 0.000036 grad_norm: 0.5295 (0.5427) closs: 0.8916 (0.8959) time: 1.2217 data: 0.0002 max mem: 17666 +[05:25:52.846608] Epoch: [2] [130/812] lr: 0.000035 grad_norm: 0.5252 (0.5422) closs: 0.8785 (0.8946) time: 1.2206 data: 0.0002 max mem: 17666 +[05:26:05.118006] Epoch: [2] [140/812] lr: 0.000035 grad_norm: 0.5381 (0.5433) closs: 0.8633 (0.8939) time: 1.2243 data: 0.0002 max mem: 17666 +[05:26:17.382923] Epoch: [2] [150/812] lr: 0.000035 grad_norm: 0.5409 (0.5437) closs: 0.8702 (0.8949) time: 1.2267 data: 0.0002 max mem: 17666 +[05:26:29.624515] Epoch: [2] [160/812] lr: 0.000035 grad_norm: 0.5378 (0.5439) closs: 0.9324 (0.8958) time: 1.2253 data: 0.0002 max mem: 17666 +[05:26:41.866362] Epoch: [2] [170/812] lr: 0.000034 grad_norm: 0.5373 (0.5433) closs: 0.8786 (0.8948) time: 1.2241 data: 0.0002 max mem: 17666 +[05:26:54.115542] Epoch: [2] [180/812] lr: 0.000034 grad_norm: 0.5291 (0.5426) closs: 0.8532 (0.8941) time: 1.2245 data: 0.0002 max mem: 17666 +[05:27:06.354062] Epoch: [2] [190/812] lr: 0.000034 grad_norm: 0.5375 (0.5448) closs: 0.8871 (0.8957) time: 1.2243 data: 0.0002 max mem: 17666 +[05:27:18.583612] Epoch: [2] [200/812] lr: 0.000033 grad_norm: 0.5450 (0.5459) closs: 0.9092 (0.8974) time: 1.2233 data: 0.0002 max mem: 17666 +[05:27:30.827297] Epoch: [2] [210/812] lr: 0.000033 grad_norm: 0.5450 (0.5465) closs: 0.8947 (0.8979) time: 1.2236 data: 0.0002 max mem: 17666 +[05:27:43.099277] Epoch: [2] [220/812] lr: 0.000033 grad_norm: 0.5412 (0.5470) closs: 0.9164 (0.8990) time: 1.2257 data: 0.0002 max mem: 17666 +[05:27:55.311466] Epoch: [2] [230/812] lr: 0.000033 grad_norm: 0.5342 (0.5467) closs: 0.9164 (0.8996) time: 1.2241 data: 0.0002 max mem: 17666 +[05:28:07.527387] Epoch: [2] [240/812] lr: 0.000032 grad_norm: 0.5269 (0.5467) closs: 0.8845 (0.8995) time: 1.2213 data: 0.0002 max mem: 17666 +[05:28:19.757509] Epoch: [2] [250/812] lr: 0.000032 grad_norm: 0.5346 (0.5467) closs: 0.8747 (0.9001) time: 1.2222 data: 0.0002 max mem: 17666 +[05:28:31.953868] Epoch: [2] [260/812] lr: 0.000032 grad_norm: 0.5462 (0.5475) closs: 0.8479 (0.8997) time: 1.2213 data: 0.0002 max mem: 17666 +[05:28:44.170627] Epoch: [2] [270/812] lr: 0.000031 grad_norm: 0.5350 (0.5466) closs: 0.8574 (0.9012) time: 1.2206 data: 0.0002 max mem: 17666 +[05:28:56.370480] Epoch: [2] [280/812] lr: 0.000031 grad_norm: 0.5331 (0.5468) closs: 0.8961 (0.9028) time: 1.2208 data: 0.0002 max mem: 17666 +[05:29:08.686787] Epoch: [2] [290/812] lr: 0.000031 grad_norm: 0.5416 (0.5475) closs: 0.8706 (0.8995) time: 1.2257 data: 0.0002 max mem: 17666 +[05:29:20.929953] Epoch: [2] [300/812] lr: 0.000031 grad_norm: 0.5308 (0.5467) closs: 0.8766 (0.8992) time: 1.2279 data: 0.0002 max mem: 17666 +[05:29:33.133327] Epoch: [2] [310/812] lr: 0.000030 grad_norm: 0.5419 (0.5486) closs: 0.9146 (0.8993) time: 1.2223 data: 0.0002 max mem: 17666 
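
The per-iteration lines above all share one fixed layout: a timestamp, `Epoch: [e] [i/n]`, then `key: value (value)` pairs, where the first number is a recent reading and the parenthesized one a running average (the usual MetricLogger convention). A minimal sketch for pulling the `closs` curve out of such a log follows; the regex, the assumption that the layout never varies, and the file name `output.log` are illustrative assumptions, not part of the training code itself.

```python
import re

# Matches per-iteration lines shaped like the log above; the
# "closs: inst (avg)" layout is assumed to be stable across lines.
LINE = re.compile(
    r"Epoch: \[(?P<epoch>\d+)\] \[(?P<step>\d+)/(?P<total>\d+)\].*?"
    r"closs: (?P<inst>[0-9.]+) \((?P<avg>[0-9.]+)\)"
)

def iter_closs(path="output.log"):
    """Yield (epoch, step, instantaneous closs, running-average closs)."""
    with open(path) as fh:
        for line in fh:
            m = LINE.search(line)
            if m:
                yield (int(m["epoch"]), int(m["step"]),
                       float(m["inst"]), float(m["avg"]))

# Usage: final running average per epoch,
# e.g. {1: 0.9165, 2: 0.9010, 3: 0.8947} for the run logged above.
final = {}
for epoch, step, inst, avg in iter_closs():
    final[epoch] = avg
print(final)
```
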
+[05:29:45.380974] Epoch: [2] [320/812] lr: 0.000030 grad_norm: 0.5416 (0.5480) closs: 0.9215 (0.9009) time: 1.2225 data: 0.0002 max mem: 17666 +[05:29:57.586882] Epoch: [2] [330/812] lr: 0.000030 grad_norm: 0.5708 (0.5498) closs: 0.9306 (0.9025) time: 1.2226 data: 0.0002 max mem: 17666 +[05:30:09.877899] Epoch: [2] [340/812] lr: 0.000029 grad_norm: 0.5762 (0.5500) closs: 0.9558 (0.9035) time: 1.2248 data: 0.0002 max mem: 17666 +[05:30:22.088804] Epoch: [2] [350/812] lr: 0.000029 grad_norm: 0.5395 (0.5498) closs: 0.9168 (0.9032) time: 1.2250 data: 0.0002 max mem: 17666 +[05:30:34.326928] Epoch: [2] [360/812] lr: 0.000029 grad_norm: 0.5362 (0.5499) closs: 0.8943 (0.9040) time: 1.2224 data: 0.0002 max mem: 17666 +[05:30:46.554197] Epoch: [2] [370/812] lr: 0.000029 grad_norm: 0.5362 (0.5499) closs: 0.8968 (0.9041) time: 1.2232 data: 0.0002 max mem: 17666 +[05:30:58.787894] Epoch: [2] [380/812] lr: 0.000028 grad_norm: 0.5605 (0.5500) closs: 0.8968 (0.9033) time: 1.2230 data: 0.0002 max mem: 17666 +[05:31:11.026268] Epoch: [2] [390/812] lr: 0.000028 grad_norm: 0.5654 (0.5508) closs: 0.8928 (0.9019) time: 1.2235 data: 0.0002 max mem: 17666 +[05:31:23.250055] Epoch: [2] [400/812] lr: 0.000028 grad_norm: 0.5627 (0.5509) closs: 0.8756 (0.9017) time: 1.2230 data: 0.0002 max mem: 17666 +[05:31:35.499135] Epoch: [2] [410/812] lr: 0.000027 grad_norm: 0.5489 (0.5520) closs: 0.8756 (0.9014) time: 1.2236 data: 0.0002 max mem: 17666 +[05:31:47.722078] Epoch: [2] [420/812] lr: 0.000027 grad_norm: 0.5539 (0.5521) closs: 0.8572 (0.9015) time: 1.2235 data: 0.0002 max mem: 17666 +[05:32:00.005353] Epoch: [2] [430/812] lr: 0.000027 grad_norm: 0.5353 (0.5520) closs: 0.8631 (0.9021) time: 1.2252 data: 0.0002 max mem: 17666 +[05:32:12.261554] Epoch: [2] [440/812] lr: 0.000027 grad_norm: 0.5384 (0.5529) closs: 0.9113 (0.9020) time: 1.2269 data: 0.0002 max mem: 17666 +[05:32:24.476152] Epoch: [2] [450/812] lr: 0.000026 grad_norm: 0.5707 (0.5529) closs: 0.9142 (0.9032) time: 1.2235 data: 0.0002 max mem: 17666 +[05:32:36.730553] Epoch: [2] [460/812] lr: 0.000026 grad_norm: 0.5230 (0.5534) closs: 0.9659 (0.9053) time: 1.2234 data: 0.0002 max mem: 17666 +[05:32:48.963624] Epoch: [2] [470/812] lr: 0.000026 grad_norm: 0.5223 (0.5531) closs: 0.9463 (0.9048) time: 1.2243 data: 0.0002 max mem: 17666 +[05:33:01.221874] Epoch: [2] [480/812] lr: 0.000025 grad_norm: 0.5196 (0.5530) closs: 0.9217 (0.9058) time: 1.2245 data: 0.0002 max mem: 17666 +[05:33:13.454427] Epoch: [2] [490/812] lr: 0.000025 grad_norm: 0.5165 (0.5525) closs: 0.9051 (0.9051) time: 1.2245 data: 0.0002 max mem: 17666 +[05:33:25.723568] Epoch: [2] [500/812] lr: 0.000025 grad_norm: 0.5228 (0.5525) closs: 0.8909 (0.9047) time: 1.2250 data: 0.0002 max mem: 17666 +[05:33:38.061671] Epoch: [2] [510/812] lr: 0.000024 grad_norm: 0.5380 (0.5523) closs: 0.8909 (0.9047) time: 1.2303 data: 0.0002 max mem: 17666 +[05:33:50.262136] Epoch: [2] [520/812] lr: 0.000024 grad_norm: 0.5317 (0.5519) closs: 0.8782 (0.9047) time: 1.2268 data: 0.0002 max mem: 17666 +[05:34:02.535828] Epoch: [2] [530/812] lr: 0.000024 grad_norm: 0.5358 (0.5519) closs: 0.8886 (0.9047) time: 1.2236 data: 0.0002 max mem: 17666 +[05:34:14.922873] Epoch: [2] [540/812] lr: 0.000024 grad_norm: 0.5384 (0.5516) closs: 0.8886 (0.9045) time: 1.2330 data: 0.0002 max mem: 17666 +[05:34:27.171951] Epoch: [2] [550/812] lr: 0.000023 grad_norm: 0.5198 (0.5508) closs: 0.8887 (0.9045) time: 1.2317 data: 0.0002 max mem: 17666 +[05:34:39.396660] Epoch: [2] [560/812] lr: 0.000023 grad_norm: 0.5260 (0.5509) closs: 0.8877 
(0.9046) time: 1.2236 data: 0.0002 max mem: 17666 +[05:34:51.640988] Epoch: [2] [570/812] lr: 0.000023 grad_norm: 0.5627 (0.5513) closs: 0.8820 (0.9039) time: 1.2234 data: 0.0001 max mem: 17666 +[05:35:03.912231] Epoch: [2] [580/812] lr: 0.000022 grad_norm: 0.5364 (0.5511) closs: 0.8524 (0.9029) time: 1.2257 data: 0.0002 max mem: 17666 +[05:35:16.149707] Epoch: [2] [590/812] lr: 0.000022 grad_norm: 0.5502 (0.5518) closs: 0.8858 (0.9019) time: 1.2254 data: 0.0002 max mem: 17666 +[05:35:28.402157] Epoch: [2] [600/812] lr: 0.000022 grad_norm: 0.5616 (0.5521) closs: 0.8858 (0.9020) time: 1.2244 data: 0.0001 max mem: 17666 +[05:35:40.600967] Epoch: [2] [610/812] lr: 0.000022 grad_norm: 0.5462 (0.5522) closs: 0.8935 (0.9018) time: 1.2225 data: 0.0001 max mem: 17666 +[05:35:52.829356] Epoch: [2] [620/812] lr: 0.000021 grad_norm: 0.5462 (0.5521) closs: 0.8935 (0.9016) time: 1.2213 data: 0.0002 max mem: 17666 +[05:36:05.050971] Epoch: [2] [630/812] lr: 0.000021 grad_norm: 0.5440 (0.5521) closs: 0.8583 (0.9015) time: 1.2224 data: 0.0002 max mem: 17666 +[05:36:17.274985] Epoch: [2] [640/812] lr: 0.000021 grad_norm: 0.5375 (0.5518) closs: 0.8728 (0.9015) time: 1.2222 data: 0.0002 max mem: 17666 +[05:36:29.516934] Epoch: [2] [650/812] lr: 0.000021 grad_norm: 0.5418 (0.5523) closs: 0.8445 (0.8999) time: 1.2232 data: 0.0002 max mem: 17666 +[05:36:41.764958] Epoch: [2] [660/812] lr: 0.000020 grad_norm: 0.5287 (0.5519) closs: 0.8310 (0.8993) time: 1.2244 data: 0.0002 max mem: 17666 +[05:36:54.012876] Epoch: [2] [670/812] lr: 0.000020 grad_norm: 0.5246 (0.5520) closs: 0.8673 (0.8988) time: 1.2247 data: 0.0002 max mem: 17666 +[05:37:06.248293] Epoch: [2] [680/812] lr: 0.000020 grad_norm: 0.5323 (0.5519) closs: 0.9063 (0.8993) time: 1.2241 data: 0.0002 max mem: 17666 +[05:37:18.499422] Epoch: [2] [690/812] lr: 0.000019 grad_norm: 0.5320 (0.5520) closs: 0.9313 (0.8995) time: 1.2243 data: 0.0002 max mem: 17666 +[05:37:30.723896] Epoch: [2] [700/812] lr: 0.000019 grad_norm: 0.5400 (0.5520) closs: 0.8904 (0.8992) time: 1.2237 data: 0.0002 max mem: 17666 +[05:37:42.959287] Epoch: [2] [710/812] lr: 0.000019 grad_norm: 0.5535 (0.5522) closs: 0.8538 (0.8986) time: 1.2229 data: 0.0002 max mem: 17666 +[05:37:55.217857] Epoch: [2] [720/812] lr: 0.000019 grad_norm: 0.5448 (0.5520) closs: 0.8520 (0.8987) time: 1.2246 data: 0.0002 max mem: 17666 +[05:38:07.427583] Epoch: [2] [730/812] lr: 0.000018 grad_norm: 0.5490 (0.5522) closs: 0.8863 (0.8994) time: 1.2233 data: 0.0002 max mem: 17666 +[05:38:19.686100] Epoch: [2] [740/812] lr: 0.000018 grad_norm: 0.5623 (0.5524) closs: 0.8536 (0.8994) time: 1.2233 data: 0.0002 max mem: 17666 +[05:38:31.903113] Epoch: [2] [750/812] lr: 0.000018 grad_norm: 0.5355 (0.5521) closs: 0.8654 (0.8993) time: 1.2237 data: 0.0002 max mem: 17666 +[05:38:44.115597] Epoch: [2] [760/812] lr: 0.000018 grad_norm: 0.5332 (0.5522) closs: 0.8976 (0.8987) time: 1.2214 data: 0.0002 max mem: 17666 +[05:38:56.321914] Epoch: [2] [770/812] lr: 0.000017 grad_norm: 0.5436 (0.5522) closs: 0.9065 (0.8994) time: 1.2209 data: 0.0002 max mem: 17666 +[05:39:08.539215] Epoch: [2] [780/812] lr: 0.000017 grad_norm: 0.5429 (0.5522) closs: 0.9551 (0.8999) time: 1.2211 data: 0.0002 max mem: 17666 +[05:39:20.776202] Epoch: [2] [790/812] lr: 0.000017 grad_norm: 0.5429 (0.5520) closs: 0.9152 (0.9003) time: 1.2226 data: 0.0001 max mem: 17666 +[05:39:33.004979] Epoch: [2] [800/812] lr: 0.000017 grad_norm: 0.5364 (0.5520) closs: 0.8958 (0.9006) time: 1.2232 data: 0.0001 max mem: 17666 +[05:39:45.263298] Epoch: [2] [810/812] lr: 
0.000016 grad_norm: 0.5421 (0.5519) closs: 0.9132 (0.9010) time: 1.2243 data: 0.0001 max mem: 17666 +[05:39:46.761821] Epoch: [2] Total time: 0:16:35 +[05:39:46.769483] Averaged stats: lr: 0.000016 grad_norm: 0.5364 (0.5519) closs: 0.9132 (0.9010) +[05:39:47.092441] model saved +[05:39:48.487045] optimizer saved +[05:39:48.487749] other rank-common saved +[05:39:48.491225] rank-specific saved +[05:39:48.499785] log_dir: ./output_dir +[05:39:50.944123] Epoch: [3] [0/812] lr: 0.000016 grad_norm: 0.5562 (0.5562) closs: 0.6677 (0.6677) time: 2.4434 data: 1.2020 max mem: 17666 +[05:40:03.157106] Epoch: [3] [10/812] lr: 0.000016 grad_norm: 0.5337 (0.5507) closs: 0.9688 (0.9449) time: 1.3323 data: 0.1094 max mem: 17666 +[05:40:15.422338] Epoch: [3] [20/812] lr: 0.000016 grad_norm: 0.5337 (0.5547) closs: 0.9130 (0.9159) time: 1.2238 data: 0.0002 max mem: 17666 +[05:40:27.617120] Epoch: [3] [30/812] lr: 0.000016 grad_norm: 0.5153 (0.5406) closs: 0.9044 (0.9138) time: 1.2229 data: 0.0002 max mem: 17666 +[05:40:39.859947] Epoch: [3] [40/812] lr: 0.000015 grad_norm: 0.5153 (0.5433) closs: 0.8974 (0.9028) time: 1.2218 data: 0.0002 max mem: 17666 +[05:40:52.133298] Epoch: [3] [50/812] lr: 0.000015 grad_norm: 0.5365 (0.5437) closs: 0.9004 (0.9038) time: 1.2257 data: 0.0002 max mem: 17666 +[05:41:04.316686] Epoch: [3] [60/812] lr: 0.000015 grad_norm: 0.5300 (0.5426) closs: 0.8967 (0.9080) time: 1.2228 data: 0.0002 max mem: 17666 +[05:41:16.709159] Epoch: [3] [70/812] lr: 0.000015 grad_norm: 0.5389 (0.5437) closs: 0.8772 (0.9053) time: 1.2287 data: 0.0002 max mem: 17666 +[05:41:28.981080] Epoch: [3] [80/812] lr: 0.000014 grad_norm: 0.5358 (0.5439) closs: 0.8995 (0.9040) time: 1.2331 data: 0.0002 max mem: 17666 +[05:41:41.218043] Epoch: [3] [90/812] lr: 0.000014 grad_norm: 0.5358 (0.5482) closs: 0.9016 (0.9026) time: 1.2254 data: 0.0002 max mem: 17666 +[05:41:53.459622] Epoch: [3] [100/812] lr: 0.000014 grad_norm: 0.5407 (0.5488) closs: 0.8973 (0.9010) time: 1.2238 data: 0.0002 max mem: 17666 +[05:42:05.684572] Epoch: [3] [110/812] lr: 0.000014 grad_norm: 0.5407 (0.5518) closs: 0.8370 (0.8944) time: 1.2233 data: 0.0002 max mem: 17666 +[05:42:17.955761] Epoch: [3] [120/812] lr: 0.000013 grad_norm: 0.5543 (0.5515) closs: 0.8370 (0.8983) time: 1.2247 data: 0.0002 max mem: 17666 +[05:42:30.181846] Epoch: [3] [130/812] lr: 0.000013 grad_norm: 0.5379 (0.5507) closs: 0.8520 (0.8947) time: 1.2248 data: 0.0002 max mem: 17666 +[05:42:42.456814] Epoch: [3] [140/812] lr: 0.000013 grad_norm: 0.5309 (0.5496) closs: 0.8633 (0.8987) time: 1.2250 data: 0.0002 max mem: 17666 +[05:42:54.678890] Epoch: [3] [150/812] lr: 0.000013 grad_norm: 0.5271 (0.5505) closs: 0.9299 (0.8988) time: 1.2248 data: 0.0002 max mem: 17666 +[05:43:06.935982] Epoch: [3] [160/812] lr: 0.000012 grad_norm: 0.5488 (0.5496) closs: 0.8740 (0.8989) time: 1.2239 data: 0.0002 max mem: 17666 +[05:43:19.181914] Epoch: [3] [170/812] lr: 0.000012 grad_norm: 0.5464 (0.5495) closs: 0.8753 (0.8977) time: 1.2251 data: 0.0002 max mem: 17666 +[05:43:31.400721] Epoch: [3] [180/812] lr: 0.000012 grad_norm: 0.5487 (0.5500) closs: 0.8691 (0.8962) time: 1.2232 data: 0.0002 max mem: 17666 +[05:43:43.639535] Epoch: [3] [190/812] lr: 0.000012 grad_norm: 0.5439 (0.5500) closs: 0.8474 (0.8957) time: 1.2228 data: 0.0002 max mem: 17666 +[05:43:55.888343] Epoch: [3] [200/812] lr: 0.000012 grad_norm: 0.5147 (0.5471) closs: 0.9034 (0.8973) time: 1.2243 data: 0.0002 max mem: 17666 +[05:44:08.136204] Epoch: [3] [210/812] lr: 0.000011 grad_norm: 0.5083 (0.5460) closs: 0.8917 (0.8965) 
time: 1.2248 data: 0.0002 max mem: 17666 +[05:44:20.360724] Epoch: [3] [220/812] lr: 0.000011 grad_norm: 0.5294 (0.5456) closs: 0.8673 (0.8980) time: 1.2235 data: 0.0002 max mem: 17666 +[05:44:32.597597] Epoch: [3] [230/812] lr: 0.000011 grad_norm: 0.5309 (0.5452) closs: 0.8934 (0.8987) time: 1.2230 data: 0.0002 max mem: 17666 +[05:44:44.822067] Epoch: [3] [240/812] lr: 0.000011 grad_norm: 0.5445 (0.5462) closs: 0.8918 (0.8988) time: 1.2230 data: 0.0002 max mem: 17666 +[05:44:57.071616] Epoch: [3] [250/812] lr: 0.000011 grad_norm: 0.5577 (0.5471) closs: 0.8841 (0.8983) time: 1.2236 data: 0.0002 max mem: 17666 +[05:45:09.334382] Epoch: [3] [260/812] lr: 0.000010 grad_norm: 0.5525 (0.5469) closs: 0.8411 (0.8965) time: 1.2255 data: 0.0002 max mem: 17666 +[05:45:21.576233] Epoch: [3] [270/812] lr: 0.000010 grad_norm: 0.5297 (0.5460) closs: 0.8316 (0.8970) time: 1.2252 data: 0.0002 max mem: 17666 +[05:45:33.792847] Epoch: [3] [280/812] lr: 0.000010 grad_norm: 0.5256 (0.5458) closs: 0.8528 (0.8986) time: 1.2228 data: 0.0002 max mem: 17666 +[05:45:45.985161] Epoch: [3] [290/812] lr: 0.000010 grad_norm: 0.5459 (0.5469) closs: 0.8481 (0.8978) time: 1.2204 data: 0.0002 max mem: 17666 +[05:45:58.246023] Epoch: [3] [300/812] lr: 0.000010 grad_norm: 0.5559 (0.5478) closs: 0.8454 (0.8950) time: 1.2226 data: 0.0002 max mem: 17666 +[05:46:10.538520] Epoch: [3] [310/812] lr: 0.000010 grad_norm: 0.5453 (0.5486) closs: 0.8714 (0.8966) time: 1.2276 data: 0.0002 max mem: 17666 +[05:46:22.758193] Epoch: [3] [320/812] lr: 0.000009 grad_norm: 0.5333 (0.5487) closs: 0.9090 (0.8970) time: 1.2255 data: 0.0002 max mem: 17666 +[05:46:35.009523] Epoch: [3] [330/812] lr: 0.000009 grad_norm: 0.5240 (0.5482) closs: 0.9090 (0.8967) time: 1.2235 data: 0.0002 max mem: 17666 +[05:46:47.248170] Epoch: [3] [340/812] lr: 0.000009 grad_norm: 0.5495 (0.5481) closs: 0.9205 (0.8972) time: 1.2244 data: 0.0002 max mem: 17666 +[05:46:59.513079] Epoch: [3] [350/812] lr: 0.000009 grad_norm: 0.5428 (0.5478) closs: 0.9117 (0.8979) time: 1.2251 data: 0.0002 max mem: 17666 +[05:47:11.756475] Epoch: [3] [360/812] lr: 0.000009 grad_norm: 0.5528 (0.5483) closs: 0.9040 (0.8985) time: 1.2253 data: 0.0002 max mem: 17666 +[05:47:24.001759] Epoch: [3] [370/812] lr: 0.000009 grad_norm: 0.5583 (0.5497) closs: 0.9428 (0.8988) time: 1.2244 data: 0.0002 max mem: 17666 +[05:47:36.258369] Epoch: [3] [380/812] lr: 0.000008 grad_norm: 0.5532 (0.5498) closs: 0.8826 (0.8977) time: 1.2250 data: 0.0002 max mem: 17666 +[05:47:48.534397] Epoch: [3] [390/812] lr: 0.000008 grad_norm: 0.5464 (0.5498) closs: 0.8169 (0.8967) time: 1.2266 data: 0.0002 max mem: 17666 +[05:48:00.757119] Epoch: [3] [400/812] lr: 0.000008 grad_norm: 0.5397 (0.5499) closs: 0.8881 (0.8972) time: 1.2249 data: 0.0002 max mem: 17666 +[05:48:12.965384] Epoch: [3] [410/812] lr: 0.000008 grad_norm: 0.5424 (0.5500) closs: 0.8712 (0.8959) time: 1.2215 data: 0.0002 max mem: 17666 +[05:48:25.205411] Epoch: [3] [420/812] lr: 0.000008 grad_norm: 0.5482 (0.5500) closs: 0.8673 (0.8952) time: 1.2223 data: 0.0002 max mem: 17666 +[05:48:37.398291] Epoch: [3] [430/812] lr: 0.000008 grad_norm: 0.5379 (0.5502) closs: 0.8604 (0.8939) time: 1.2216 data: 0.0002 max mem: 17666 +[05:48:49.634277] Epoch: [3] [440/812] lr: 0.000008 grad_norm: 0.5367 (0.5501) closs: 0.8604 (0.8932) time: 1.2214 data: 0.0002 max mem: 17666 +[05:49:01.842468] Epoch: [3] [450/812] lr: 0.000007 grad_norm: 0.5338 (0.5509) closs: 0.8707 (0.8928) time: 1.2221 data: 0.0002 max mem: 17666 +[05:49:14.048276] Epoch: [3] [460/812] lr: 0.000007 
grad_norm: 0.5654 (0.5517) closs: 0.8607 (0.8921) time: 1.2206 data: 0.0002 max mem: 17666 +[05:49:26.270248] Epoch: [3] [470/812] lr: 0.000007 grad_norm: 0.5506 (0.5519) closs: 0.8607 (0.8917) time: 1.2213 data: 0.0002 max mem: 17666 +[05:49:38.467867] Epoch: [3] [480/812] lr: 0.000007 grad_norm: 0.5433 (0.5522) closs: 0.8556 (0.8913) time: 1.2209 data: 0.0002 max mem: 17666 +[05:49:50.683011] Epoch: [3] [490/812] lr: 0.000007 grad_norm: 0.5388 (0.5518) closs: 0.8579 (0.8916) time: 1.2206 data: 0.0002 max mem: 17666 +[05:50:02.890194] Epoch: [3] [500/812] lr: 0.000007 grad_norm: 0.5285 (0.5519) closs: 0.8663 (0.8911) time: 1.2210 data: 0.0002 max mem: 17666 +[05:50:15.194581] Epoch: [3] [510/812] lr: 0.000007 grad_norm: 0.5267 (0.5519) closs: 0.8912 (0.8925) time: 1.2255 data: 0.0002 max mem: 17666 +[05:50:27.434194] Epoch: [3] [520/812] lr: 0.000007 grad_norm: 0.5485 (0.5521) closs: 0.9753 (0.8933) time: 1.2271 data: 0.0002 max mem: 17666 +[05:50:39.651081] Epoch: [3] [530/812] lr: 0.000006 grad_norm: 0.5609 (0.5526) closs: 0.8542 (0.8926) time: 1.2228 data: 0.0002 max mem: 17666 +[05:50:51.885523] Epoch: [3] [540/812] lr: 0.000006 grad_norm: 0.5575 (0.5528) closs: 0.8232 (0.8925) time: 1.2225 data: 0.0002 max mem: 17666 +[05:51:04.114756] Epoch: [3] [550/812] lr: 0.000006 grad_norm: 0.5378 (0.5530) closs: 0.8856 (0.8916) time: 1.2231 data: 0.0002 max mem: 17666 +[05:51:16.348968] Epoch: [3] [560/812] lr: 0.000006 grad_norm: 0.5246 (0.5528) closs: 0.8477 (0.8911) time: 1.2231 data: 0.0002 max mem: 17666 +[05:51:28.542827] Epoch: [3] [570/812] lr: 0.000006 grad_norm: 0.5556 (0.5531) closs: 0.8762 (0.8917) time: 1.2213 data: 0.0002 max mem: 17666 +[05:51:40.755871] Epoch: [3] [580/812] lr: 0.000006 grad_norm: 0.5559 (0.5532) closs: 0.9160 (0.8922) time: 1.2203 data: 0.0002 max mem: 17666 +[05:51:52.996942] Epoch: [3] [590/812] lr: 0.000006 grad_norm: 0.5523 (0.5531) closs: 0.8803 (0.8920) time: 1.2226 data: 0.0002 max mem: 17666 +[05:52:05.218398] Epoch: [3] [600/812] lr: 0.000006 grad_norm: 0.5592 (0.5532) closs: 0.8175 (0.8915) time: 1.2231 data: 0.0002 max mem: 17666 +[05:52:17.508068] Epoch: [3] [610/812] lr: 0.000006 grad_norm: 0.5655 (0.5533) closs: 0.9026 (0.8923) time: 1.2255 data: 0.0002 max mem: 17666 +[05:52:29.722507] Epoch: [3] [620/812] lr: 0.000006 grad_norm: 0.5540 (0.5537) closs: 0.8860 (0.8926) time: 1.2251 data: 0.0002 max mem: 17666 +[05:52:41.951732] Epoch: [3] [630/812] lr: 0.000006 grad_norm: 0.5512 (0.5543) closs: 0.8709 (0.8918) time: 1.2221 data: 0.0002 max mem: 17666 +[05:52:54.184550] Epoch: [3] [640/812] lr: 0.000006 grad_norm: 0.5541 (0.5544) closs: 0.8467 (0.8915) time: 1.2230 data: 0.0002 max mem: 17666 +[05:53:06.428821] Epoch: [3] [650/812] lr: 0.000005 grad_norm: 0.5534 (0.5547) closs: 0.8995 (0.8926) time: 1.2238 data: 0.0002 max mem: 17666 +[05:53:18.690090] Epoch: [3] [660/812] lr: 0.000005 grad_norm: 0.5558 (0.5549) closs: 0.9336 (0.8932) time: 1.2252 data: 0.0002 max mem: 17666 +[05:53:30.911141] Epoch: [3] [670/812] lr: 0.000005 grad_norm: 0.5558 (0.5548) closs: 0.8835 (0.8925) time: 1.2240 data: 0.0002 max mem: 17666 +[05:53:43.161458] Epoch: [3] [680/812] lr: 0.000005 grad_norm: 0.5321 (0.5547) closs: 0.8819 (0.8931) time: 1.2235 data: 0.0002 max mem: 17666 +[05:53:55.371169] Epoch: [3] [690/812] lr: 0.000005 grad_norm: 0.5240 (0.5545) closs: 0.8819 (0.8929) time: 1.2229 data: 0.0002 max mem: 17666 +[05:54:07.586578] Epoch: [3] [700/812] lr: 0.000005 grad_norm: 0.5243 (0.5545) closs: 0.8890 (0.8933) time: 1.2212 data: 0.0002 max mem: 17666 
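
Each `optimizer saved` line in this log is preceded by the `UserWarning` shown near the top: `FullyShardedDataParallel.full_optim_state_dict` is deprecated in favor of `FullyShardedDataParallel.optim_state_dict` and may be removed after PyTorch 2.2. A minimal sketch of the migration the warning points to is below; `model`, `optimizer`, and `rank` are stand-ins for the training script's own objects, and which ranks materialize the consolidated dict depends on the configured `StateDictType` (with the default `FULL_STATE_DICT`, typically every rank holds it).

```python
import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def save_optim_state(model, optimizer, path, rank):
    # Deprecated call that triggers the UserWarning seen in this log:
    #   osd = FSDP.full_optim_state_dict(model, optimizer)
    # Replacement suggested by the warning (PyTorch >= 2.0):
    osd = FSDP.optim_state_dict(model, optimizer)
    if rank == 0:  # save once; other ranks discard their copy
        torch.save(osd, path)
```
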
+[05:54:19.795256] Epoch: [3] [710/812] lr: 0.000005 grad_norm: 0.5534 (0.5550) closs: 0.9166 (0.8937) time: 1.2211 data: 0.0002 max mem: 17666 +[05:54:32.015414] Epoch: [3] [720/812] lr: 0.000005 grad_norm: 0.5337 (0.5548) closs: 0.9047 (0.8940) time: 1.2214 data: 0.0002 max mem: 17666 +[05:54:44.241974] Epoch: [3] [730/812] lr: 0.000005 grad_norm: 0.5269 (0.5549) closs: 0.8744 (0.8932) time: 1.2223 data: 0.0002 max mem: 17666 +[05:54:56.452329] Epoch: [3] [740/812] lr: 0.000005 grad_norm: 0.5342 (0.5546) closs: 0.8736 (0.8936) time: 1.2218 data: 0.0002 max mem: 17666 +[05:55:08.685255] Epoch: [3] [750/812] lr: 0.000005 grad_norm: 0.5347 (0.5546) closs: 0.8736 (0.8932) time: 1.2221 data: 0.0002 max mem: 17666 +[05:55:20.895362] Epoch: [3] [760/812] lr: 0.000005 grad_norm: 0.5396 (0.5548) closs: 0.8711 (0.8937) time: 1.2221 data: 0.0002 max mem: 17666 +[05:55:33.133468] Epoch: [3] [770/812] lr: 0.000005 grad_norm: 0.5376 (0.5548) closs: 0.8961 (0.8938) time: 1.2223 data: 0.0002 max mem: 17666 +[05:55:45.329927] Epoch: [3] [780/812] lr: 0.000005 grad_norm: 0.5376 (0.5551) closs: 0.8806 (0.8935) time: 1.2217 data: 0.0002 max mem: 17666 +[05:55:57.537681] Epoch: [3] [790/812] lr: 0.000005 grad_norm: 0.5535 (0.5551) closs: 0.8785 (0.8936) time: 1.2201 data: 0.0001 max mem: 17666 +[05:56:09.778726] Epoch: [3] [800/812] lr: 0.000005 grad_norm: 0.5479 (0.5551) closs: 0.9185 (0.8940) time: 1.2224 data: 0.0001 max mem: 17666 +[05:56:21.990260] Epoch: [3] [810/812] lr: 0.000005 grad_norm: 0.5479 (0.5552) closs: 0.9699 (0.8947) time: 1.2226 data: 0.0001 max mem: 17666 +[05:56:23.509702] Epoch: [3] Total time: 0:16:35 +[05:56:23.512943] Averaged stats: lr: 0.000005 grad_norm: 0.5555 (0.5554) closs: 0.9666 (0.8950) +[05:56:23.780574] model saved +[05:56:25.200026] optimizer saved +[05:56:25.200659] other rank-common saved +[05:56:25.203616] rank-specific saved +[05:56:25.203811] Training time 1:06:29 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..42b3545022ed0c9ba4047a2d43746d8e0678966d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43f79632e6a7ff0d46489b43cff701139e8645f43bec7e32cb5d687cb503cec0 +size 5206987 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..b3a347fbf41eea2d382e98d812fb29debb9954e4 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2494a26669b4d50d54988024ed1ee0056139168b77a4b96f33edf4537324bff7 +size 20612285 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..6580ed46844c8196025c2b4408c0a94b11a3286d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:97d31e98bf3b50a9d9871169f77286c5546169adca7c3a2a188449c713f14f8c +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00006-of-00008.pth 
b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch0/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..f8c3a35cca5a559e355024cb252d318095ae46c9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa15182e7ab119a25276d512b31db08d8b47ea700b4ac7f2e3e2820f453621c2 +size 5206987 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..4ff6e848e440516d8813521768ce52cc4c0e7ab9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c755bf0c774ab128c975ad3111d38c03d5fc7a8083be54a8bd6130936e9c189d +size 20612285 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..019815c54f38ca0117b2de142a833f36cf716f12 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bd92ff1afa34a36c0974183d0b5c5014a24b768cafce4c0456053efdb3be65f +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00001-of-00008.pth new file mode 100644 index 
0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch1/rank-specific-00007-of-00008.pth @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..092ee2ad6f9d75acb8f5d6c4189c9c2312533f4d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84a7de8305a6b1286201270e9f6452693b86ab8a28eac06237986264f15f491c +size 5206987 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..cd87dc791566e0b6234c1981ceb12bcb1b9dd161 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d439fa2c56094ad77e821131dda743becb65d0c43f025ccea1a7b1e031730cd +size 20612285 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..f69f0d31ce8e9530fb0924334a376441d7622460 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3767b207334656ecfef01d2de48627986334fc7f316b6a0764dbd5d18e40093c +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git 
a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch2/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..e07dffae975107d6528e85cde208d0a2a3795dbd --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18f024463d38fab32865ace8ed97a55e10a820080f9a8d9377e5c94615ea9e8f +size 5206987 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.optimizer.pth new file 
mode 100644 index 0000000000000000000000000000000000000000..2cb084d7f45451eee4c753268d244361a7a8dcda --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e148b43413f2c3e62b92ae2c9995effef04ecc773c694b65ba52897c00b9497b +size 20612285 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..eb934285a6ff5bf3f441fd2d3de536082f91905d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9002aa74e4a1bfbe09add178be306f8d719a30c9a5ec892a01f20ffc4116ff8c +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ 
b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/epoch3/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/log.txt b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..d21cd68fdc1fa6ba68b8123cb153f0df70ff1937 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/log.txt @@ -0,0 +1,4 @@ +{"train_lr": 2.49692118226601e-05, "train_grad_norm": 0.42762760063770955, "train_closs": 0.9074021448872139, "epoch": 0, "val_lr": 2.49692118226601e-05, "val_grad_norm": 0.42762760063770955, "val_closs": 0.9074021448872139} +{"train_lr": 4.6114274981403966e-05, "train_grad_norm": 0.3180465380583197, "train_closs": 0.8586826063417302, "epoch": 1, "val_lr": 4.6114274981403966e-05, "val_grad_norm": 0.3180465380583197, "val_closs": 0.8586826063417302} +{"train_lr": 2.751385467980297e-05, "train_grad_norm": 0.3192772727845044, "train_closs": 0.8497503121323847, "epoch": 2, "val_lr": 2.751385467980297e-05, "val_grad_norm": 0.3192772727845044, "val_closs": 0.8497503121323847} +{"train_lr": 8.899579698398978e-06, "train_grad_norm": 0.3224347219274843, "train_closs": 0.846084712903598, "epoch": 3, "val_lr": 8.899579698398978e-06, "val_grad_norm": 0.3224347219274843, "val_closs": 0.846084712903598} diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/output.log b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/output.log new file mode 100644 index 0000000000000000000000000000000000000000..233da8dd518b474697fae9e4b91af66a3b7cc39f --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B/output.log @@ -0,0 +1,591 @@ +WARNING:torch.distributed.run: 
+***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +| distributed init (rank 0): env://, gpu 0 +| distributed init (rank 6): env://, gpu 6 +| distributed init (rank 2): env://, gpu 2 +| distributed init (rank 5): env://, gpu 5 +| distributed init (rank 4): env://, gpu 4 +| distributed init (rank 1): env://, gpu 1 +| distributed init (rank 7): env://, gpu 7 +| distributed init (rank 3): env://, gpu 3 +[02:21:24.645037] > initializing model parallel with size 1 +[02:21:24.645129] > initializing ddp with size 8 +[02:21:24.645136] > initializing pipeline with size 1 +[02:21:24.802227] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory +[02:21:24.802312] Namespace(batch_size=8, +accum_iter=1, +llama_type='llama_peft', +llama_config=['../checkpoints/llama2/Llama-2-13b/params.json'], +no_visual=True, +tokenizer_path='../checkpoints/llama2/Llama-2-13b/tokenizer.model', +pretrained_path='../checkpoints/llama2/Llama-2-13b/', +pretrained_type='meta_ori', +weight_decay=0.02, +lr=5e-05, +min_lr=5e-06, +epochs=4, +warmup_epochs=1.0, +clip_grad=2, +max_words=512, +dialog=False, +data_config='configs/data/finetune/sg/alpaca.yaml', +output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_13B', +log_dir='./output_dir', +save_interval=1, +only_save_trainable=True, +device='cuda', +seed=0, +resume='', +num_workers=24, +pin_mem=True, +world_size=8, +local_rank=-1, +dist_on_itp=False, +dist_url='env://', +model_parallel_size=1, +data_parallel='sdp', +precision='bf16', +checkpointing=True, +quant=True, +rank=0, +gpu=0, +distributed=True, +dist_backend='nccl') +[02:21:24.803162] Start initialization. +[02:21:24.803195] ## Processing on RANK 0. +[02:21:24.813174] Model Args: + ModelArgs(dim=5120, n_layers=40, n_heads=40, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True) +[02:22:55.142663] Model is Peft: True +[02:22:55.146562] Trainable parameter count : 2544640 (local rank), 2544640 (all). +[02:22:55.156925] ## Load pretrained from ../checkpoints/llama2/Llama-2-13b/ +[02:23:23.489563] ## Quantizing model to 4bit! 
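
The `Trainable parameter count : 2544640` line reflects the PEFT configuration dumped above (`llama_type='llama_peft'` with `bias_tuning=True`; the `normBias` run name suggests norm-and-bias tuning): only a tiny fraction of the 13B parameters requires gradients, and together with `only_save_trainable=True` this is why the consolidated `*.model.pth` checkpoints in this diff are megabytes rather than full-model-sized. A generic sketch of how such a count and a trainable-only state dict can be produced for any PyTorch module (illustrative, not the repository's own implementation):

```python
import torch.nn as nn

def count_trainable(model: nn.Module) -> int:
    """Count parameters with requires_grad=True, as reported in the log
    (e.g. 2544640 for this run)."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def trainable_state_dict(model: nn.Module) -> dict:
    """Keep only trainable tensors, mirroring only_save_trainable=True."""
    keep = {n for n, p in model.named_parameters() if p.requires_grad}
    return {k: v for k, v in model.state_dict().items() if k in keep}
```
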
+ Qunatization Process: 0%| | 0/487 [00:00<?, ?it/s] +[02:51:32.419486] Start training for 4 epochs +[02:51:32.424561] log_dir: ./output_dir +[02:51:37.180990] Epoch: [0] [0/812] lr: 0.000000 grad_norm: 1.2368 (1.2368) closs: 1.0616 (1.0616) time: 4.7553 data: 1.6005 max mem: 18726 +[02:51:56.653801] Epoch: [0] [10/812] lr: 0.000001 grad_norm: 1.3463 (1.3196) closs: 1.0616 (1.0548) time: 2.2025 data: 0.1457 max mem: 27896 +[02:52:15.510287] Epoch: [0] [20/812] lr: 0.000001 grad_norm: 1.3090 (1.2957) closs: 1.0002 (1.0350) time: 1.9164 data: 0.0002 max mem: 27896 +[02:52:34.499639] Epoch: [0] [30/812] lr: 0.000002 grad_norm: 1.2693 (1.2965) closs: 1.0493 (1.0553) time: 1.8922 data: 0.0002 max mem: 27896 +[02:52:53.410651] Epoch: [0] [40/812] lr: 0.000002 grad_norm: 1.2444 (1.2760) closs: 1.0673 (1.0619) time: 1.8949 data: 0.0002 max mem: 27896 +[02:53:12.478456] Epoch: [0] [50/812] lr: 0.000003 grad_norm: 1.1888 (1.2653) closs: 1.0609 (1.0622) time: 1.8988 data: 0.0002 max mem: 27896 +[02:53:31.393636] Epoch: [0] [60/812] lr: 0.000004 grad_norm: 1.1431 (1.2403) closs: 1.0390 (1.0578) time: 1.8991 data: 0.0002 max mem: 27896 +[02:53:50.403916] Epoch: [0] [70/812] lr: 0.000004 grad_norm: 1.0537 (1.2016) closs: 1.0404 (1.0618) time: 1.8962 data: 0.0002 max mem: 27896 +[02:54:09.283519] Epoch: [0] [80/812] lr: 0.000005 grad_norm: 0.9215 (1.1737) closs: 1.0404 (1.0612) time: 1.8944 data: 0.0002 max mem: 27896 +[02:54:28.281118] Epoch: [0] [90/812] lr: 0.000006 grad_norm: 0.8322 (1.1354) closs: 1.0057 (1.0516) time: 1.8937 data: 0.0002 max mem: 27896 +[02:54:47.146602] Epoch: [0] [100/812] lr: 0.000006 grad_norm: 0.7272 (1.0881) closs: 0.9742 (1.0478) time: 1.8931 data: 0.0002 max mem: 27896 +[02:55:06.025584] Epoch: [0] [110/812] lr: 0.000007 grad_norm: 0.6155 (1.0440) closs: 0.9687 (1.0375) time: 1.8871 data: 0.0002 max mem: 27896 +[02:55:24.976544] Epoch: [0] [120/812] lr: 0.000007 grad_norm: 0.5469 (0.9996) closs: 0.9673 (1.0377) time: 1.8914 data: 0.0002 max mem: 27896 +[02:55:43.830404] Epoch: [0] [130/812] lr: 0.000008 grad_norm: 0.4990 (0.9606) closs: 0.9881 (1.0344) time: 1.8902 data: 0.0002 max mem: 27896 +[02:56:02.840447] Epoch: [0] [140/812] lr: 0.000009 grad_norm: 0.4640 (0.9261) closs: 0.9172 (1.0263) time: 1.8931 data: 0.0002 max mem: 27896 +[02:56:21.715218] Epoch: [0] [150/812] lr: 0.000009 grad_norm: 0.4640 (0.8962) closs: 0.9242 (1.0218) time: 1.8942 data: 0.0002 max mem: 27896 +[02:56:40.929946] Epoch: [0] [160/812] lr: 0.000010 grad_norm: 0.4432 (0.8666) closs: 0.9550 (1.0186) time: 1.9044 data: 0.0002 max mem: 27896 +[02:56:59.814846] Epoch: [0] [170/812] lr: 0.000010 grad_norm: 0.4047 (0.8400) closs: 0.9260 (1.0121) time: 1.9049 data: 0.0002 max mem: 27896 +[02:57:18.798007] Epoch: [0] [180/812] lr: 0.000011 grad_norm: 0.3673 (0.8123) closs: 0.8709 (1.0037) time: 1.8933 data: 0.0002 max mem: 27896 +[02:57:37.654408] Epoch: [0] [190/812] lr: 0.000012 grad_norm: 0.3308 (0.7864) closs: 0.8709 (0.9977) time: 1.8919 data: 0.0002 max mem: 27896 +[02:57:56.600103] Epoch: [0] [200/812] lr: 0.000012 grad_norm: 0.3035 (0.7639) closs: 0.9270 (0.9951) time: 1.8900 data: 0.0002 max mem: 27896 +[02:58:15.461031] Epoch: [0] [210/812] lr: 0.000013 grad_norm: 0.3035 (0.7424) closs: 0.9337 (0.9907) time: 1.8903 data: 0.0002 max mem: 27896 +[02:58:34.481503] Epoch: [0] [220/812] lr: 0.000014 grad_norm: 0.3010 (0.7223) closs: 0.9109 (0.9876) time: 1.8940 data: 0.0002 max mem: 27896 +[02:58:53.318663] Epoch: [0] [230/812] lr: 0.000014 grad_norm: 0.2931 (0.7045) closs: 0.8898 (0.9813) time: 1.8928 data:
0.0002 max mem: 27896 +[02:59:12.273142] Epoch: [0] [240/812] lr: 0.000015 grad_norm: 0.2934 (0.6879) closs: 0.8515 (0.9774) time: 1.8895 data: 0.0002 max mem: 27896 +[02:59:31.148548] Epoch: [0] [250/812] lr: 0.000015 grad_norm: 0.3028 (0.6734) closs: 0.9104 (0.9749) time: 1.8914 data: 0.0002 max mem: 27896 +[02:59:50.126862] Epoch: [0] [260/812] lr: 0.000016 grad_norm: 0.2997 (0.6589) closs: 0.9027 (0.9712) time: 1.8926 data: 0.0002 max mem: 27896 +[03:00:08.984543] Epoch: [0] [270/812] lr: 0.000017 grad_norm: 0.2848 (0.6456) closs: 0.9058 (0.9701) time: 1.8917 data: 0.0002 max mem: 27896 +[03:00:27.916360] Epoch: [0] [280/812] lr: 0.000017 grad_norm: 0.2850 (0.6339) closs: 0.9158 (0.9670) time: 1.8894 data: 0.0002 max mem: 27896 +[03:00:46.795221] Epoch: [0] [290/812] lr: 0.000018 grad_norm: 0.2990 (0.6227) closs: 0.8726 (0.9636) time: 1.8905 data: 0.0002 max mem: 27896 +[03:01:05.829556] Epoch: [0] [300/812] lr: 0.000018 grad_norm: 0.3051 (0.6121) closs: 0.8665 (0.9607) time: 1.8956 data: 0.0002 max mem: 27896 +[03:01:24.744180] Epoch: [0] [310/812] lr: 0.000019 grad_norm: 0.3069 (0.6027) closs: 0.8881 (0.9598) time: 1.8974 data: 0.0002 max mem: 27896 +[03:01:43.610809] Epoch: [0] [320/812] lr: 0.000020 grad_norm: 0.3030 (0.5933) closs: 0.9071 (0.9591) time: 1.8890 data: 0.0003 max mem: 27896 +[03:02:02.590087] Epoch: [0] [330/812] lr: 0.000020 grad_norm: 0.2972 (0.5858) closs: 0.8722 (0.9554) time: 1.8922 data: 0.0003 max mem: 27896 +[03:02:21.416511] Epoch: [0] [340/812] lr: 0.000021 grad_norm: 0.3050 (0.5778) closs: 0.8452 (0.9523) time: 1.8902 data: 0.0002 max mem: 27896 +[03:02:40.411438] Epoch: [0] [350/812] lr: 0.000022 grad_norm: 0.2947 (0.5697) closs: 0.8839 (0.9532) time: 1.8910 data: 0.0002 max mem: 27896 +[03:02:59.253600] Epoch: [0] [360/812] lr: 0.000022 grad_norm: 0.2946 (0.5626) closs: 0.9033 (0.9514) time: 1.8918 data: 0.0002 max mem: 27896 +[03:03:18.214034] Epoch: [0] [370/812] lr: 0.000023 grad_norm: 0.2946 (0.5564) closs: 0.8445 (0.9490) time: 1.8901 data: 0.0002 max mem: 27896 +[03:03:37.033540] Epoch: [0] [380/812] lr: 0.000023 grad_norm: 0.2868 (0.5501) closs: 0.8445 (0.9476) time: 1.8889 data: 0.0002 max mem: 27896 +[03:03:55.989931] Epoch: [0] [390/812] lr: 0.000024 grad_norm: 0.2806 (0.5436) closs: 0.8717 (0.9460) time: 1.8887 data: 0.0002 max mem: 27896 +[03:04:14.812323] Epoch: [0] [400/812] lr: 0.000025 grad_norm: 0.2848 (0.5388) closs: 0.8652 (0.9447) time: 1.8889 data: 0.0002 max mem: 27896 +[03:04:33.756960] Epoch: [0] [410/812] lr: 0.000025 grad_norm: 0.2892 (0.5332) closs: 0.8652 (0.9431) time: 1.8883 data: 0.0002 max mem: 27896 +[03:04:52.642341] Epoch: [0] [420/812] lr: 0.000026 grad_norm: 0.2892 (0.5293) closs: 0.8611 (0.9411) time: 1.8914 data: 0.0002 max mem: 27896 +[03:05:11.582135] Epoch: [0] [430/812] lr: 0.000026 grad_norm: 0.3156 (0.5253) closs: 0.8376 (0.9384) time: 1.8912 data: 0.0002 max mem: 27896 +[03:05:30.444304] Epoch: [0] [440/812] lr: 0.000027 grad_norm: 0.3005 (0.5203) closs: 0.8013 (0.9357) time: 1.8900 data: 0.0002 max mem: 27896 +[03:05:49.417047] Epoch: [0] [450/812] lr: 0.000028 grad_norm: 0.2965 (0.5153) closs: 0.8168 (0.9333) time: 1.8917 data: 0.0002 max mem: 27896 +[03:06:08.266807] Epoch: [0] [460/812] lr: 0.000028 grad_norm: 0.2900 (0.5106) closs: 0.8519 (0.9319) time: 1.8910 data: 0.0002 max mem: 27896 +[03:06:27.237157] Epoch: [0] [470/812] lr: 0.000029 grad_norm: 0.2993 (0.5069) closs: 0.8519 (0.9312) time: 1.8909 data: 0.0002 max mem: 27896 +[03:06:46.077686] Epoch: [0] [480/812] lr: 0.000030 grad_norm: 0.3083 
(0.5031) closs: 0.9032 (0.9304) time: 1.8905 data: 0.0002 max mem: 27896 +[03:07:05.047292] Epoch: [0] [490/812] lr: 0.000030 grad_norm: 0.2765 (0.4983) closs: 0.8994 (0.9296) time: 1.8904 data: 0.0002 max mem: 27896 +[03:07:23.915717] Epoch: [0] [500/812] lr: 0.000031 grad_norm: 0.2689 (0.4940) closs: 0.8750 (0.9282) time: 1.8918 data: 0.0002 max mem: 27896 +[03:07:42.875114] Epoch: [0] [510/812] lr: 0.000031 grad_norm: 0.2883 (0.4920) closs: 0.8579 (0.9260) time: 1.8913 data: 0.0002 max mem: 27896 +[03:08:01.733927] Epoch: [0] [520/812] lr: 0.000032 grad_norm: 0.3075 (0.4881) closs: 0.7963 (0.9248) time: 1.8908 data: 0.0002 max mem: 27896 +[03:08:20.563694] Epoch: [0] [530/812] lr: 0.000033 grad_norm: 0.3265 (0.4861) closs: 0.8782 (0.9239) time: 1.8844 data: 0.0002 max mem: 27896 +[03:08:39.514327] Epoch: [0] [540/812] lr: 0.000033 grad_norm: 0.3301 (0.4828) closs: 0.9101 (0.9233) time: 1.8890 data: 0.0002 max mem: 27896 +[03:08:58.358994] Epoch: [0] [550/812] lr: 0.000034 grad_norm: 0.3194 (0.4800) closs: 0.8802 (0.9221) time: 1.8897 data: 0.0002 max mem: 27896 +[03:09:17.294100] Epoch: [0] [560/812] lr: 0.000034 grad_norm: 0.3228 (0.4772) closs: 0.8471 (0.9214) time: 1.8889 data: 0.0002 max mem: 27896 +[03:09:36.134535] Epoch: [0] [570/812] lr: 0.000035 grad_norm: 0.2998 (0.4749) closs: 0.8602 (0.9206) time: 1.8887 data: 0.0002 max mem: 27896 +[03:09:55.071136] Epoch: [0] [580/812] lr: 0.000036 grad_norm: 0.2998 (0.4723) closs: 0.8705 (0.9203) time: 1.8888 data: 0.0002 max mem: 27896 +[03:10:13.960266] Epoch: [0] [590/812] lr: 0.000036 grad_norm: 0.3104 (0.4695) closs: 0.8705 (0.9193) time: 1.8912 data: 0.0002 max mem: 27896 +[03:10:32.941351] Epoch: [0] [600/812] lr: 0.000037 grad_norm: 0.2986 (0.4666) closs: 0.8547 (0.9186) time: 1.8934 data: 0.0002 max mem: 27896 +[03:10:51.868015] Epoch: [0] [610/812] lr: 0.000038 grad_norm: 0.3084 (0.4641) closs: 0.8324 (0.9164) time: 1.8953 data: 0.0002 max mem: 27896 +[03:11:10.823823] Epoch: [0] [620/812] lr: 0.000038 grad_norm: 0.3157 (0.4616) closs: 0.7938 (0.9157) time: 1.8941 data: 0.0002 max mem: 27896 +[03:11:29.685695] Epoch: [0] [630/812] lr: 0.000039 grad_norm: 0.3046 (0.4592) closs: 0.8578 (0.9145) time: 1.8908 data: 0.0002 max mem: 27896 +[03:11:48.663630] Epoch: [0] [640/812] lr: 0.000039 grad_norm: 0.3046 (0.4568) closs: 0.8596 (0.9142) time: 1.8919 data: 0.0002 max mem: 27896 +[03:12:07.520528] Epoch: [0] [650/812] lr: 0.000040 grad_norm: 0.3001 (0.4547) closs: 0.8522 (0.9129) time: 1.8917 data: 0.0002 max mem: 27896 +[03:12:26.490698] Epoch: [0] [660/812] lr: 0.000041 grad_norm: 0.2877 (0.4523) closs: 0.8184 (0.9122) time: 1.8913 data: 0.0002 max mem: 27896 +[03:12:45.398835] Epoch: [0] [670/812] lr: 0.000041 grad_norm: 0.2794 (0.4497) closs: 0.8443 (0.9117) time: 1.8938 data: 0.0002 max mem: 27896 +[03:13:04.374147] Epoch: [0] [680/812] lr: 0.000042 grad_norm: 0.2880 (0.4478) closs: 0.8587 (0.9112) time: 1.8941 data: 0.0002 max mem: 27896 +[03:13:23.286590] Epoch: [0] [690/812] lr: 0.000042 grad_norm: 0.3029 (0.4457) closs: 0.8656 (0.9107) time: 1.8943 data: 0.0002 max mem: 27896 +[03:13:42.230027] Epoch: [0] [700/812] lr: 0.000043 grad_norm: 0.3407 (0.4445) closs: 0.8605 (0.9102) time: 1.8927 data: 0.0002 max mem: 27896 +[03:14:01.080842] Epoch: [0] [710/812] lr: 0.000044 grad_norm: 0.3416 (0.4431) closs: 0.8290 (0.9091) time: 1.8896 data: 0.0002 max mem: 27896 +[03:14:20.026951] Epoch: [0] [720/812] lr: 0.000044 grad_norm: 0.3232 (0.4414) closs: 0.8416 (0.9086) time: 1.8898 data: 0.0002 max mem: 27896 +[03:14:38.859533] Epoch: 
[0] [730/812] lr: 0.000045 grad_norm: 0.3002 (0.4395) closs: 0.8626 (0.9077) time: 1.8889 data: 0.0002 max mem: 27896 +[03:14:57.748386] Epoch: [0] [740/812] lr: 0.000046 grad_norm: 0.2982 (0.4376) closs: 0.8342 (0.9064) time: 1.8860 data: 0.0002 max mem: 27896 +[03:15:16.711446] Epoch: [0] [750/812] lr: 0.000046 grad_norm: 0.3052 (0.4363) closs: 0.8395 (0.9072) time: 1.8925 data: 0.0002 max mem: 27896 +[03:15:35.570759] Epoch: [0] [760/812] lr: 0.000047 grad_norm: 0.3141 (0.4346) closs: 0.8774 (0.9065) time: 1.8910 data: 0.0002 max mem: 27896 +[03:15:54.542427] Epoch: [0] [770/812] lr: 0.000047 grad_norm: 0.3141 (0.4332) closs: 0.8427 (0.9055) time: 1.8915 data: 0.0002 max mem: 27896 +[03:16:13.384792] Epoch: [0] [780/812] lr: 0.000048 grad_norm: 0.3178 (0.4316) closs: 0.8427 (0.9050) time: 1.8906 data: 0.0002 max mem: 27896 +[03:16:32.365361] Epoch: [0] [790/812] lr: 0.000049 grad_norm: 0.2918 (0.4301) closs: 0.8414 (0.9041) time: 1.8911 data: 0.0002 max mem: 27896 +[03:16:51.471742] Epoch: [0] [800/812] lr: 0.000049 grad_norm: 0.2894 (0.4286) closs: 0.8447 (0.9036) time: 1.9043 data: 0.0002 max mem: 27896 +[03:17:10.470921] Epoch: [0] [810/812] lr: 0.000050 grad_norm: 0.2894 (0.4278) closs: 0.8293 (0.9023) time: 1.9052 data: 0.0001 max mem: 27896 +[03:17:12.550725] Epoch: [0] Total time: 0:25:40 +[03:17:12.557705] Averaged stats: lr: 0.000050 grad_norm: 0.2894 (0.4276) closs: 0.8217 (0.9074) +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. 
``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[03:17:12.633835] model saved +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[03:17:13.416849] optimizer saved +[03:17:13.417272] other rank-common saved +[03:17:13.419172] rank-specific saved +[03:17:13.424182] log_dir: ./output_dir +[03:17:16.591962] Epoch: [1] [0/812] lr: 0.000050 grad_norm: 0.3125 (0.3125) closs: 0.6480 (0.6480) time: 3.1669 data: 1.2401 max mem: 27896 +[03:17:35.447743] Epoch: [1] [10/812] lr: 0.000050 grad_norm: 0.3209 (0.3396) closs: 0.8064 (0.7937) time: 2.0020 data: 0.1129 max mem: 27896 +[03:17:54.386301] Epoch: [1] [20/812] lr: 0.000050 grad_norm: 0.3280 (0.3354) closs: 0.8198 (0.8550) time: 1.8896 data: 0.0002 max mem: 27896 +[03:18:13.203946] Epoch: [1] [30/812] lr: 0.000050 grad_norm: 0.3237 (0.3326) closs: 0.8367 (0.8505) time: 1.8877 data: 0.0002 max mem: 27896 +[03:18:32.059763] Epoch: [1] [40/812] lr: 0.000050 grad_norm: 0.3117 (0.3285) closs: 0.8744 (0.8705) time: 1.8836 data: 0.0002 max mem: 27896 +[03:18:50.851265] Epoch: [1] [50/812] lr: 0.000050 grad_norm: 0.3022 (0.3242) closs: 0.8867 (0.8662) time: 1.8823 data: 0.0002 max mem: 27896 +[03:19:09.770410] Epoch: [1] [60/812] lr: 0.000050 grad_norm: 0.3107 (0.3216) closs: 0.8414 (0.8637) time: 1.8855 data: 0.0002 max mem: 27896 +[03:19:28.603865] Epoch: [1] [70/812] lr: 0.000050 grad_norm: 0.3107 (0.3218) closs: 0.8715 (0.8666) time: 1.8876 data: 0.0002 max mem: 27896 +[03:19:47.568062] Epoch: [1] [80/812] lr: 0.000050 grad_norm: 0.3189 (0.3245) closs: 0.8715 (0.8625) time: 1.8898 data: 0.0002 max mem: 27896 +[03:20:06.507893] Epoch: [1] [90/812] lr: 0.000050 grad_norm: 0.2924 (0.3215) closs: 0.8712 (0.8666) time: 1.8951 data: 0.0001 max mem: 27896 +[03:20:25.438363] Epoch: [1] [100/812] lr: 0.000050 grad_norm: 0.2860 (0.3200) closs: 0.8985 (0.8687) time: 1.8934 data: 0.0002 max mem: 27896 +[03:20:44.375178] Epoch: [1] [110/812] lr: 0.000050 grad_norm: 0.3010 (0.3197) closs: 0.8714 (0.8658) time: 1.8933 data: 0.0002 max mem: 27896 +[03:21:03.380453] Epoch: [1] [120/812] lr: 0.000050 grad_norm: 0.3176 (0.3210) closs: 0.8714 (0.8665) time: 1.8970 data: 0.0001 max mem: 27896 +[03:21:22.330716] Epoch: [1] [130/812] lr: 0.000050 grad_norm: 0.3235 (0.3209) closs: 0.8674 (0.8666) time: 1.8977 data: 0.0001 max mem: 27896 +[03:21:41.288988] Epoch: [1] [140/812] lr: 0.000050 grad_norm: 0.3232 (0.3216) closs: 0.8588 (0.8663) time: 1.8954 data: 0.0002 max mem: 27896 +[03:22:00.301536] Epoch: [1] [150/812] lr: 0.000050 grad_norm: 0.2895 (0.3203) closs: 0.8738 (0.8659) time: 1.8985 data: 0.0002 max mem: 27896 +[03:22:19.237615] Epoch: [1] [160/812] lr: 0.000050 grad_norm: 0.2951 (0.3205) closs: 0.8738 (0.8684) time: 1.8974 data: 0.0001 max mem: 27896 +[03:22:38.171105] Epoch: [1] [170/812] lr: 0.000049 grad_norm: 0.3091 (0.3205) 
closs: 0.9185 (0.8708) time: 1.8934 data: 0.0001 max mem: 27896 +[03:22:57.187937] Epoch: [1] [180/812] lr: 0.000049 grad_norm: 0.3005 (0.3198) closs: 0.8777 (0.8709) time: 1.8975 data: 0.0001 max mem: 27896 +[03:23:16.138555] Epoch: [1] [190/812] lr: 0.000049 grad_norm: 0.2864 (0.3175) closs: 0.8148 (0.8694) time: 1.8983 data: 0.0002 max mem: 27896 +[03:23:35.201639] Epoch: [1] [200/812] lr: 0.000049 grad_norm: 0.2781 (0.3173) closs: 0.8484 (0.8675) time: 1.9006 data: 0.0002 max mem: 27896 +[03:23:54.136076] Epoch: [1] [210/812] lr: 0.000049 grad_norm: 0.2996 (0.3176) closs: 0.8737 (0.8697) time: 1.8998 data: 0.0002 max mem: 27896 +[03:24:13.166403] Epoch: [1] [220/812] lr: 0.000049 grad_norm: 0.3093 (0.3172) closs: 0.8767 (0.8683) time: 1.8982 data: 0.0001 max mem: 27896 +[03:24:32.133003] Epoch: [1] [230/812] lr: 0.000049 grad_norm: 0.2879 (0.3159) closs: 0.8170 (0.8667) time: 1.8998 data: 0.0001 max mem: 27896 +[03:24:51.228672] Epoch: [1] [240/812] lr: 0.000049 grad_norm: 0.2825 (0.3146) closs: 0.8517 (0.8678) time: 1.9030 data: 0.0002 max mem: 27896 +[03:25:10.186686] Epoch: [1] [250/812] lr: 0.000049 grad_norm: 0.3003 (0.3158) closs: 0.8893 (0.8669) time: 1.9026 data: 0.0002 max mem: 27896 +[03:25:29.165652] Epoch: [1] [260/812] lr: 0.000049 grad_norm: 0.3473 (0.3173) closs: 0.8900 (0.8674) time: 1.8968 data: 0.0002 max mem: 27896 +[03:25:48.150931] Epoch: [1] [270/812] lr: 0.000049 grad_norm: 0.2909 (0.3169) closs: 0.8733 (0.8661) time: 1.8981 data: 0.0002 max mem: 27896 +[03:26:07.074547] Epoch: [1] [280/812] lr: 0.000049 grad_norm: 0.2983 (0.3172) closs: 0.8733 (0.8664) time: 1.8954 data: 0.0001 max mem: 27896 +[03:26:26.101288] Epoch: [1] [290/812] lr: 0.000048 grad_norm: 0.2983 (0.3173) closs: 0.8555 (0.8667) time: 1.8974 data: 0.0002 max mem: 27896 +[03:26:45.048249] Epoch: [1] [300/812] lr: 0.000048 grad_norm: 0.3011 (0.3168) closs: 0.8480 (0.8656) time: 1.8986 data: 0.0002 max mem: 27896 +[03:27:04.100352] Epoch: [1] [310/812] lr: 0.000048 grad_norm: 0.3140 (0.3176) closs: 0.8480 (0.8648) time: 1.8999 data: 0.0002 max mem: 27896 +[03:27:23.049115] Epoch: [1] [320/812] lr: 0.000048 grad_norm: 0.3152 (0.3181) closs: 0.8513 (0.8656) time: 1.9000 data: 0.0002 max mem: 27896 +[03:27:42.072420] Epoch: [1] [330/812] lr: 0.000048 grad_norm: 0.2992 (0.3181) closs: 0.8314 (0.8651) time: 1.8985 data: 0.0002 max mem: 27896 +[03:28:01.013759] Epoch: [1] [340/812] lr: 0.000048 grad_norm: 0.3095 (0.3191) closs: 0.8002 (0.8631) time: 1.8982 data: 0.0002 max mem: 27896 +[03:28:19.984896] Epoch: [1] [350/812] lr: 0.000048 grad_norm: 0.3020 (0.3184) closs: 0.8002 (0.8626) time: 1.8955 data: 0.0002 max mem: 27896 +[03:28:39.032148] Epoch: [1] [360/812] lr: 0.000048 grad_norm: 0.2855 (0.3181) closs: 0.7816 (0.8603) time: 1.9009 data: 0.0002 max mem: 27896 +[03:28:57.981034] Epoch: [1] [370/812] lr: 0.000047 grad_norm: 0.3070 (0.3185) closs: 0.7695 (0.8594) time: 1.8997 data: 0.0001 max mem: 27896 +[03:29:16.955211] Epoch: [1] [380/812] lr: 0.000047 grad_norm: 0.2980 (0.3177) closs: 0.8399 (0.8595) time: 1.8961 data: 0.0001 max mem: 27896 +[03:29:36.033549] Epoch: [1] [390/812] lr: 0.000047 grad_norm: 0.2980 (0.3188) closs: 0.8620 (0.8597) time: 1.9026 data: 0.0002 max mem: 27896 +[03:29:54.991316] Epoch: [1] [400/812] lr: 0.000047 grad_norm: 0.2939 (0.3181) closs: 0.8539 (0.8596) time: 1.9017 data: 0.0002 max mem: 27896 +[03:30:14.020931] Epoch: [1] [410/812] lr: 0.000047 grad_norm: 0.2939 (0.3179) closs: 0.8723 (0.8610) time: 1.8993 data: 0.0002 max mem: 27896 +[03:30:32.994084] Epoch: [1] 
[420/812] lr: 0.000047 grad_norm: 0.2965 (0.3175) closs: 0.9123 (0.8614) time: 1.9001 data: 0.0002 max mem: 27896 +[03:30:52.010141] Epoch: [1] [430/812] lr: 0.000047 grad_norm: 0.3213 (0.3190) closs: 0.8477 (0.8606) time: 1.8994 data: 0.0002 max mem: 27896 +[03:31:10.982488] Epoch: [1] [440/812] lr: 0.000046 grad_norm: 0.3319 (0.3193) closs: 0.8510 (0.8606) time: 1.8994 data: 0.0002 max mem: 27896 +[03:31:30.070904] Epoch: [1] [450/812] lr: 0.000046 grad_norm: 0.3319 (0.3200) closs: 0.8574 (0.8601) time: 1.9030 data: 0.0002 max mem: 27896 +[03:31:48.984566] Epoch: [1] [460/812] lr: 0.000046 grad_norm: 0.3272 (0.3199) closs: 0.8117 (0.8596) time: 1.9000 data: 0.0002 max mem: 27896 +[03:32:07.916033] Epoch: [1] [470/812] lr: 0.000046 grad_norm: 0.3272 (0.3203) closs: 0.8229 (0.8597) time: 1.8922 data: 0.0002 max mem: 27896 +[03:32:26.962961] Epoch: [1] [480/812] lr: 0.000046 grad_norm: 0.3430 (0.3215) closs: 0.8518 (0.8603) time: 1.8988 data: 0.0002 max mem: 27896 +[03:32:45.959510] Epoch: [1] [490/812] lr: 0.000046 grad_norm: 0.3272 (0.3215) closs: 0.8580 (0.8609) time: 1.9021 data: 0.0002 max mem: 27896 +[03:33:05.027682] Epoch: [1] [500/812] lr: 0.000045 grad_norm: 0.3090 (0.3216) closs: 0.8381 (0.8604) time: 1.9032 data: 0.0002 max mem: 27896 +[03:33:23.971362] Epoch: [1] [510/812] lr: 0.000045 grad_norm: 0.2950 (0.3209) closs: 0.8181 (0.8601) time: 1.9005 data: 0.0002 max mem: 27896 +[03:33:43.025814] Epoch: [1] [520/812] lr: 0.000045 grad_norm: 0.2972 (0.3213) closs: 0.8379 (0.8600) time: 1.8998 data: 0.0002 max mem: 27896 +[03:34:02.005637] Epoch: [1] [530/812] lr: 0.000045 grad_norm: 0.2975 (0.3208) closs: 0.8449 (0.8601) time: 1.9016 data: 0.0002 max mem: 27896 +[03:34:21.081675] Epoch: [1] [540/812] lr: 0.000045 grad_norm: 0.2917 (0.3201) closs: 0.8353 (0.8602) time: 1.9027 data: 0.0002 max mem: 27896 +[03:34:40.078513] Epoch: [1] [550/812] lr: 0.000045 grad_norm: 0.2818 (0.3200) closs: 0.8204 (0.8595) time: 1.9036 data: 0.0002 max mem: 27896 +[03:34:59.061182] Epoch: [1] [560/812] lr: 0.000044 grad_norm: 0.2818 (0.3203) closs: 0.8368 (0.8594) time: 1.8989 data: 0.0002 max mem: 27896 +[03:35:18.076387] Epoch: [1] [570/812] lr: 0.000044 grad_norm: 0.2940 (0.3200) closs: 0.8546 (0.8598) time: 1.8998 data: 0.0002 max mem: 27896 +[03:35:37.024489] Epoch: [1] [580/812] lr: 0.000044 grad_norm: 0.2995 (0.3199) closs: 0.8630 (0.8592) time: 1.8981 data: 0.0002 max mem: 27896 +[03:35:55.936245] Epoch: [1] [590/812] lr: 0.000044 grad_norm: 0.3004 (0.3198) closs: 0.8705 (0.8594) time: 1.8929 data: 0.0001 max mem: 27896 +[03:36:14.971334] Epoch: [1] [600/812] lr: 0.000044 grad_norm: 0.2997 (0.3196) closs: 0.8623 (0.8597) time: 1.8973 data: 0.0001 max mem: 27896 +[03:36:33.918017] Epoch: [1] [610/812] lr: 0.000043 grad_norm: 0.2937 (0.3191) closs: 0.8426 (0.8597) time: 1.8990 data: 0.0002 max mem: 27896 +[03:36:52.988382] Epoch: [1] [620/812] lr: 0.000043 grad_norm: 0.2937 (0.3188) closs: 0.8482 (0.8598) time: 1.9008 data: 0.0002 max mem: 27896 +[03:37:11.918731] Epoch: [1] [630/812] lr: 0.000043 grad_norm: 0.2877 (0.3183) closs: 0.8632 (0.8602) time: 1.9000 data: 0.0002 max mem: 27896 +[03:37:30.972461] Epoch: [1] [640/812] lr: 0.000043 grad_norm: 0.2992 (0.3184) closs: 0.8567 (0.8603) time: 1.8991 data: 0.0002 max mem: 27896 +[03:37:49.926451] Epoch: [1] [650/812] lr: 0.000043 grad_norm: 0.3033 (0.3182) closs: 0.8303 (0.8600) time: 1.9003 data: 0.0002 max mem: 27896 +[03:38:09.012642] Epoch: [1] [660/812] lr: 0.000042 grad_norm: 0.2824 (0.3180) closs: 0.8863 (0.8610) time: 1.9019 data: 0.0002 
max mem: 27896 +[03:38:27.968324] Epoch: [1] [670/812] lr: 0.000042 grad_norm: 0.3086 (0.3179) closs: 0.9184 (0.8616) time: 1.9020 data: 0.0002 max mem: 27896 +[03:38:46.902521] Epoch: [1] [680/812] lr: 0.000042 grad_norm: 0.3148 (0.3180) closs: 0.9153 (0.8622) time: 1.8944 data: 0.0002 max mem: 27896 +[03:39:05.936878] Epoch: [1] [690/812] lr: 0.000042 grad_norm: 0.3135 (0.3184) closs: 0.9215 (0.8627) time: 1.8984 data: 0.0002 max mem: 27896 +[03:39:24.940302] Epoch: [1] [700/812] lr: 0.000041 grad_norm: 0.3135 (0.3184) closs: 0.8515 (0.8626) time: 1.9018 data: 0.0002 max mem: 27896 +[03:39:44.005071] Epoch: [1] [710/812] lr: 0.000041 grad_norm: 0.2938 (0.3180) closs: 0.8168 (0.8624) time: 1.9033 data: 0.0002 max mem: 27896 +[03:40:02.939301] Epoch: [1] [720/812] lr: 0.000041 grad_norm: 0.2826 (0.3177) closs: 0.8161 (0.8619) time: 1.8999 data: 0.0002 max mem: 27896 +[03:40:21.995546] Epoch: [1] [730/812] lr: 0.000041 grad_norm: 0.2945 (0.3175) closs: 0.8535 (0.8620) time: 1.8994 data: 0.0003 max mem: 27896 +[03:40:40.932897] Epoch: [1] [740/812] lr: 0.000041 grad_norm: 0.2991 (0.3176) closs: 0.8593 (0.8619) time: 1.8996 data: 0.0003 max mem: 27896 +[03:40:59.951242] Epoch: [1] [750/812] lr: 0.000040 grad_norm: 0.3080 (0.3176) closs: 0.8315 (0.8617) time: 1.8977 data: 0.0002 max mem: 27896 +[03:41:18.894508] Epoch: [1] [760/812] lr: 0.000040 grad_norm: 0.3038 (0.3174) closs: 0.8399 (0.8620) time: 1.8980 data: 0.0002 max mem: 27896 +[03:41:37.827976] Epoch: [1] [770/812] lr: 0.000040 grad_norm: 0.2959 (0.3172) closs: 0.8679 (0.8615) time: 1.8938 data: 0.0002 max mem: 27896 +[03:41:56.837040] Epoch: [1] [780/812] lr: 0.000040 grad_norm: 0.3130 (0.3175) closs: 0.8068 (0.8613) time: 1.8971 data: 0.0002 max mem: 27896 +[03:42:15.740854] Epoch: [1] [790/812] lr: 0.000039 grad_norm: 0.3148 (0.3176) closs: 0.8560 (0.8615) time: 1.8956 data: 0.0001 max mem: 27896 +[03:42:34.640288] Epoch: [1] [800/812] lr: 0.000039 grad_norm: 0.2908 (0.3177) closs: 0.8465 (0.8611) time: 1.8901 data: 0.0001 max mem: 27896 +[03:42:53.636957] Epoch: [1] [810/812] lr: 0.000039 grad_norm: 0.3240 (0.3181) closs: 0.8465 (0.8611) time: 1.8947 data: 0.0001 max mem: 27896 +[03:42:55.771594] Epoch: [1] Total time: 0:25:42 +[03:42:55.780429] Averaged stats: lr: 0.000039 grad_norm: 0.3240 (0.3180) closs: 0.8698 (0.8587) +[03:42:55.860690] model saved +[03:42:56.725243] optimizer saved +[03:42:56.725711] other rank-common saved +[03:42:56.727577] rank-specific saved +[03:42:56.732685] log_dir: ./output_dir +[03:42:59.943195] Epoch: [2] [0/812] lr: 0.000039 grad_norm: 0.2991 (0.2991) closs: 0.8053 (0.8053) time: 3.2096 data: 1.2947 max mem: 27896 +[03:43:18.767681] Epoch: [2] [10/812] lr: 0.000038 grad_norm: 0.2981 (0.3024) closs: 0.8402 (0.8487) time: 2.0030 data: 0.1178 max mem: 27896 +[03:43:37.559302] Epoch: [2] [20/812] lr: 0.000038 grad_norm: 0.2907 (0.2977) closs: 0.8531 (0.8402) time: 1.8807 data: 0.0002 max mem: 27896 +[03:43:56.328664] Epoch: [2] [30/812] lr: 0.000038 grad_norm: 0.2892 (0.2990) closs: 0.8531 (0.8497) time: 1.8780 data: 0.0002 max mem: 27896 +[03:44:15.360720] Epoch: [2] [40/812] lr: 0.000038 grad_norm: 0.3005 (0.3036) closs: 0.8428 (0.8486) time: 1.8900 data: 0.0002 max mem: 27896 +[03:44:34.223213] Epoch: [2] [50/812] lr: 0.000037 grad_norm: 0.3022 (0.3076) closs: 0.8627 (0.8444) time: 1.8946 data: 0.0002 max mem: 27896 +[03:44:53.324545] Epoch: [2] [60/812] lr: 0.000037 grad_norm: 0.3037 (0.3084) closs: 0.8664 (0.8472) time: 1.8981 data: 0.0002 max mem: 27896 +[03:45:12.223238] Epoch: [2] [70/812] lr: 
0.000037 grad_norm: 0.2974 (0.3133) closs: 0.8676 (0.8517) time: 1.8999 data: 0.0002 max mem: 27896 +[03:45:31.177084] Epoch: [2] [80/812] lr: 0.000037 grad_norm: 0.2974 (0.3155) closs: 0.8256 (0.8500) time: 1.8926 data: 0.0002 max mem: 27896 +[03:45:50.121054] Epoch: [2] [90/812] lr: 0.000036 grad_norm: 0.2943 (0.3140) closs: 0.8076 (0.8475) time: 1.8948 data: 0.0002 max mem: 27896 +[03:46:09.082639] Epoch: [2] [100/812] lr: 0.000036 grad_norm: 0.2897 (0.3147) closs: 0.8119 (0.8487) time: 1.8952 data: 0.0002 max mem: 27896 +[03:46:28.081000] Epoch: [2] [110/812] lr: 0.000036 grad_norm: 0.3119 (0.3176) closs: 0.8299 (0.8439) time: 1.8979 data: 0.0002 max mem: 27896 +[03:46:47.036342] Epoch: [2] [120/812] lr: 0.000036 grad_norm: 0.3081 (0.3179) closs: 0.8515 (0.8454) time: 1.8976 data: 0.0002 max mem: 27896 +[03:47:05.993187] Epoch: [2] [130/812] lr: 0.000035 grad_norm: 0.2996 (0.3170) closs: 0.8354 (0.8443) time: 1.8955 data: 0.0002 max mem: 27896 +[03:47:24.929514] Epoch: [2] [140/812] lr: 0.000035 grad_norm: 0.3093 (0.3183) closs: 0.8190 (0.8438) time: 1.8946 data: 0.0002 max mem: 27896 +[03:47:43.876677] Epoch: [2] [150/812] lr: 0.000035 grad_norm: 0.3143 (0.3207) closs: 0.8561 (0.8450) time: 1.8941 data: 0.0002 max mem: 27896 +[03:48:02.795947] Epoch: [2] [160/812] lr: 0.000035 grad_norm: 0.3143 (0.3201) closs: 0.8929 (0.8460) time: 1.8933 data: 0.0002 max mem: 27896 +[03:48:21.796307] Epoch: [2] [170/812] lr: 0.000034 grad_norm: 0.3005 (0.3200) closs: 0.8243 (0.8450) time: 1.8959 data: 0.0002 max mem: 27896 +[03:48:40.746031] Epoch: [2] [180/812] lr: 0.000034 grad_norm: 0.3005 (0.3187) closs: 0.8145 (0.8446) time: 1.8974 data: 0.0002 max mem: 27896 +[03:48:59.708301] Epoch: [2] [190/812] lr: 0.000034 grad_norm: 0.2997 (0.3187) closs: 0.8326 (0.8457) time: 1.8955 data: 0.0002 max mem: 27896 +[03:49:18.559350] Epoch: [2] [200/812] lr: 0.000033 grad_norm: 0.2997 (0.3192) closs: 0.8350 (0.8467) time: 1.8906 data: 0.0002 max mem: 27896 +[03:49:37.630429] Epoch: [2] [210/812] lr: 0.000033 grad_norm: 0.3154 (0.3202) closs: 0.8350 (0.8471) time: 1.8960 data: 0.0002 max mem: 27896 +[03:49:56.492380] Epoch: [2] [220/812] lr: 0.000033 grad_norm: 0.3290 (0.3208) closs: 0.8725 (0.8488) time: 1.8966 data: 0.0002 max mem: 27896 +[03:50:15.471718] Epoch: [2] [230/812] lr: 0.000033 grad_norm: 0.3217 (0.3205) closs: 0.8615 (0.8494) time: 1.8920 data: 0.0002 max mem: 27896 +[03:50:34.336505] Epoch: [2] [240/812] lr: 0.000032 grad_norm: 0.2963 (0.3207) closs: 0.8458 (0.8492) time: 1.8921 data: 0.0002 max mem: 27896 +[03:50:53.385960] Epoch: [2] [250/812] lr: 0.000032 grad_norm: 0.2963 (0.3202) closs: 0.8483 (0.8498) time: 1.8956 data: 0.0002 max mem: 27896 +[03:51:12.244932] Epoch: [2] [260/812] lr: 0.000032 grad_norm: 0.3020 (0.3205) closs: 0.8005 (0.8491) time: 1.8953 data: 0.0002 max mem: 27896 +[03:51:31.317945] Epoch: [2] [270/812] lr: 0.000031 grad_norm: 0.2977 (0.3204) closs: 0.8133 (0.8502) time: 1.8965 data: 0.0002 max mem: 27896 +[03:51:50.166003] Epoch: [2] [280/812] lr: 0.000031 grad_norm: 0.2890 (0.3196) closs: 0.8392 (0.8516) time: 1.8960 data: 0.0002 max mem: 27896 +[03:52:09.099328] Epoch: [2] [290/812] lr: 0.000031 grad_norm: 0.3024 (0.3196) closs: 0.8183 (0.8484) time: 1.8890 data: 0.0002 max mem: 27896 +[03:52:28.076001] Epoch: [2] [300/812] lr: 0.000031 grad_norm: 0.2902 (0.3187) closs: 0.8146 (0.8481) time: 1.8954 data: 0.0002 max mem: 27896 +[03:52:47.002133] Epoch: [2] [310/812] lr: 0.000030 grad_norm: 0.3055 (0.3195) closs: 0.8441 (0.8482) time: 1.8951 data: 0.0002 max mem: 27896 
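A note on reading these records: each metric prints as recent-value (epoch running average), so in "closs: 0.8441 (0.8482)" the parenthesized number is the running average, which is what ends up as train_closs in log.txt at epoch end. To turn the wall of text into curves, a throwaway parser over output.log suffices; the helper name below is hypothetical, and the pattern is copied from the records above:

import re

# Matches records like:
# [03:52:47.002133] Epoch: [2] [310/812] lr: 0.000030 grad_norm: 0.3055 (0.3195) closs: 0.8441 (0.8482) ...
PAT = re.compile(
    r"Epoch: \[(\d+)\] \[(\d+)/\d+\] lr: ([\d.]+) "
    r"grad_norm: ([\d.]+) \(([\d.]+)\) closs: ([\d.]+) \(([\d.]+)\)"
)

def parse_output_log(path):
    rows = []
    with open(path) as f:
        for line in f:
            m = PAT.search(line)
            if m:
                epoch, it, lr, gn, gn_avg, closs, closs_avg = m.groups()
                rows.append((int(epoch), int(it), float(lr), float(gn),
                             float(gn_avg), float(closs), float(closs_avg)))
    return rows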
+[03:53:05.972836] Epoch: [2] [320/812] lr: 0.000030 grad_norm: 0.3280 (0.3197) closs: 0.8456 (0.8494) time: 1.8948 data: 0.0002 max mem: 27896 +[03:53:24.927527] Epoch: [2] [330/812] lr: 0.000030 grad_norm: 0.3214 (0.3210) closs: 0.8868 (0.8509) time: 1.8962 data: 0.0002 max mem: 27896 +[03:53:43.909637] Epoch: [2] [340/812] lr: 0.000029 grad_norm: 0.3140 (0.3209) closs: 0.9016 (0.8518) time: 1.8968 data: 0.0002 max mem: 27896 +[03:54:02.886469] Epoch: [2] [350/812] lr: 0.000029 grad_norm: 0.3002 (0.3203) closs: 0.8522 (0.8515) time: 1.8979 data: 0.0002 max mem: 27896 +[03:54:21.844980] Epoch: [2] [360/812] lr: 0.000029 grad_norm: 0.2893 (0.3196) closs: 0.8485 (0.8523) time: 1.8967 data: 0.0002 max mem: 27896 +[03:54:40.781893] Epoch: [2] [370/812] lr: 0.000029 grad_norm: 0.3001 (0.3192) closs: 0.8485 (0.8522) time: 1.8947 data: 0.0002 max mem: 27896 +[03:54:59.740019] Epoch: [2] [380/812] lr: 0.000028 grad_norm: 0.3056 (0.3227) closs: 0.8477 (0.8514) time: 1.8947 data: 0.0002 max mem: 27896 +[03:55:18.678795] Epoch: [2] [390/812] lr: 0.000028 grad_norm: 0.3007 (0.3224) closs: 0.8237 (0.8500) time: 1.8948 data: 0.0002 max mem: 27896 +[03:55:37.600494] Epoch: [2] [400/812] lr: 0.000028 grad_norm: 0.3007 (0.3219) closs: 0.8250 (0.8501) time: 1.8930 data: 0.0002 max mem: 27896 +[03:55:56.472663] Epoch: [2] [410/812] lr: 0.000027 grad_norm: 0.3009 (0.3216) closs: 0.8250 (0.8499) time: 1.8896 data: 0.0002 max mem: 27896 +[03:56:15.500175] Epoch: [2] [420/812] lr: 0.000027 grad_norm: 0.3128 (0.3219) closs: 0.8157 (0.8500) time: 1.8949 data: 0.0003 max mem: 27896 +[03:56:34.380680] Epoch: [2] [430/812] lr: 0.000027 grad_norm: 0.3113 (0.3214) closs: 0.8179 (0.8505) time: 1.8953 data: 0.0004 max mem: 27896 +[03:56:53.398340] Epoch: [2] [440/812] lr: 0.000027 grad_norm: 0.3011 (0.3245) closs: 0.8574 (0.8503) time: 1.8948 data: 0.0002 max mem: 27896 +[03:57:12.203346] Epoch: [2] [450/812] lr: 0.000026 grad_norm: 0.3066 (0.3242) closs: 0.8575 (0.8514) time: 1.8911 data: 0.0002 max mem: 27896 +[03:57:31.281200] Epoch: [2] [460/812] lr: 0.000026 grad_norm: 0.3004 (0.3246) closs: 0.9145 (0.8534) time: 1.8941 data: 0.0001 max mem: 27896 +[03:57:50.139462] Epoch: [2] [470/812] lr: 0.000026 grad_norm: 0.3004 (0.3239) closs: 0.8963 (0.8530) time: 1.8967 data: 0.0001 max mem: 27896 +[03:58:09.179436] Epoch: [2] [480/812] lr: 0.000025 grad_norm: 0.2824 (0.3238) closs: 0.8761 (0.8542) time: 1.8948 data: 0.0002 max mem: 27896 +[03:58:28.025942] Epoch: [2] [490/812] lr: 0.000025 grad_norm: 0.2953 (0.3236) closs: 0.8482 (0.8534) time: 1.8943 data: 0.0002 max mem: 27896 +[03:58:46.950908] Epoch: [2] [500/812] lr: 0.000025 grad_norm: 0.2953 (0.3233) closs: 0.8324 (0.8530) time: 1.8885 data: 0.0002 max mem: 27896 +[03:59:05.900493] Epoch: [2] [510/812] lr: 0.000024 grad_norm: 0.2942 (0.3227) closs: 0.8472 (0.8531) time: 1.8937 data: 0.0002 max mem: 27896 +[03:59:24.850132] Epoch: [2] [520/812] lr: 0.000024 grad_norm: 0.2936 (0.3224) closs: 0.8472 (0.8531) time: 1.8949 data: 0.0002 max mem: 27896 +[03:59:43.822183] Epoch: [2] [530/812] lr: 0.000024 grad_norm: 0.3021 (0.3224) closs: 0.8391 (0.8533) time: 1.8960 data: 0.0002 max mem: 27896 +[04:00:02.715570] Epoch: [2] [540/812] lr: 0.000024 grad_norm: 0.3103 (0.3226) closs: 0.8391 (0.8531) time: 1.8932 data: 0.0002 max mem: 27896 +[04:00:21.684010] Epoch: [2] [550/812] lr: 0.000023 grad_norm: 0.3008 (0.3223) closs: 0.8504 (0.8529) time: 1.8930 data: 0.0002 max mem: 27896 +[04:00:40.626087] Epoch: [2] [560/812] lr: 0.000023 grad_norm: 0.2991 (0.3220) closs: 0.8343 
(0.8530) time: 1.8955 data: 0.0002 max mem: 27896 +[04:00:59.536283] Epoch: [2] [570/812] lr: 0.000023 grad_norm: 0.3049 (0.3218) closs: 0.8319 (0.8523) time: 1.8925 data: 0.0002 max mem: 27896 +[04:01:18.455498] Epoch: [2] [580/812] lr: 0.000022 grad_norm: 0.2932 (0.3219) closs: 0.8129 (0.8513) time: 1.8914 data: 0.0002 max mem: 27896 +[04:01:37.396965] Epoch: [2] [590/812] lr: 0.000022 grad_norm: 0.2951 (0.3221) closs: 0.8361 (0.8503) time: 1.8930 data: 0.0002 max mem: 27896 +[04:01:56.333263] Epoch: [2] [600/812] lr: 0.000022 grad_norm: 0.2951 (0.3216) closs: 0.8458 (0.8503) time: 1.8938 data: 0.0002 max mem: 27896 +[04:02:15.228857] Epoch: [2] [610/812] lr: 0.000022 grad_norm: 0.2950 (0.3215) closs: 0.8371 (0.8502) time: 1.8915 data: 0.0002 max mem: 27896 +[04:02:34.062409] Epoch: [2] [620/812] lr: 0.000021 grad_norm: 0.3077 (0.3212) closs: 0.8371 (0.8501) time: 1.8864 data: 0.0002 max mem: 27896 +[04:02:53.092759] Epoch: [2] [630/812] lr: 0.000021 grad_norm: 0.2967 (0.3209) closs: 0.8287 (0.8500) time: 1.8931 data: 0.0002 max mem: 27896 +[04:03:11.911605] Epoch: [2] [640/812] lr: 0.000021 grad_norm: 0.2954 (0.3207) closs: 0.8199 (0.8500) time: 1.8924 data: 0.0002 max mem: 27896 +[04:03:30.948186] Epoch: [2] [650/812] lr: 0.000021 grad_norm: 0.3093 (0.3212) closs: 0.7838 (0.8485) time: 1.8927 data: 0.0002 max mem: 27896 +[04:03:49.838472] Epoch: [2] [660/812] lr: 0.000020 grad_norm: 0.3031 (0.3209) closs: 0.7751 (0.8479) time: 1.8963 data: 0.0002 max mem: 27896 +[04:04:08.869671] Epoch: [2] [670/812] lr: 0.000020 grad_norm: 0.2833 (0.3208) closs: 0.7997 (0.8473) time: 1.8960 data: 0.0002 max mem: 27896 +[04:04:27.733943] Epoch: [2] [680/812] lr: 0.000020 grad_norm: 0.2833 (0.3206) closs: 0.8524 (0.8480) time: 1.8947 data: 0.0002 max mem: 27896 +[04:04:46.759941] Epoch: [2] [690/812] lr: 0.000019 grad_norm: 0.2948 (0.3202) closs: 0.8962 (0.8483) time: 1.8944 data: 0.0002 max mem: 27896 +[04:05:05.600006] Epoch: [2] [700/812] lr: 0.000019 grad_norm: 0.3046 (0.3202) closs: 0.8425 (0.8481) time: 1.8932 data: 0.0002 max mem: 27896 +[04:05:24.547147] Epoch: [2] [710/812] lr: 0.000019 grad_norm: 0.3177 (0.3202) closs: 0.8007 (0.8475) time: 1.8893 data: 0.0002 max mem: 27896 +[04:05:43.510683] Epoch: [2] [720/812] lr: 0.000019 grad_norm: 0.3048 (0.3202) closs: 0.8007 (0.8476) time: 1.8955 data: 0.0002 max mem: 27896 +[04:06:02.433316] Epoch: [2] [730/812] lr: 0.000018 grad_norm: 0.3199 (0.3203) closs: 0.8372 (0.8483) time: 1.8942 data: 0.0002 max mem: 27896 +[04:06:21.364546] Epoch: [2] [740/812] lr: 0.000018 grad_norm: 0.3199 (0.3204) closs: 0.8130 (0.8483) time: 1.8926 data: 0.0002 max mem: 27896 +[04:06:40.294476] Epoch: [2] [750/812] lr: 0.000018 grad_norm: 0.3025 (0.3201) closs: 0.8130 (0.8483) time: 1.8930 data: 0.0002 max mem: 27896 +[04:06:59.218302] Epoch: [2] [760/812] lr: 0.000018 grad_norm: 0.2889 (0.3199) closs: 0.8377 (0.8477) time: 1.8926 data: 0.0002 max mem: 27896 +[04:07:18.167960] Epoch: [2] [770/812] lr: 0.000017 grad_norm: 0.2964 (0.3197) closs: 0.8568 (0.8483) time: 1.8936 data: 0.0002 max mem: 27896 +[04:07:37.105525] Epoch: [2] [780/812] lr: 0.000017 grad_norm: 0.3041 (0.3196) closs: 0.8957 (0.8489) time: 1.8943 data: 0.0001 max mem: 27896 +[04:07:56.039500] Epoch: [2] [790/812] lr: 0.000017 grad_norm: 0.3081 (0.3194) closs: 0.8753 (0.8492) time: 1.8935 data: 0.0001 max mem: 27896 +[04:08:14.960851] Epoch: [2] [800/812] lr: 0.000017 grad_norm: 0.3034 (0.3192) closs: 0.8286 (0.8495) time: 1.8927 data: 0.0001 max mem: 27896 +[04:08:33.883409] Epoch: [2] [810/812] lr: 
0.000016 grad_norm: 0.3034 (0.3193) closs: 0.8577 (0.8499) time: 1.8921 data: 0.0001 max mem: 27896 +[04:08:36.022537] Epoch: [2] Total time: 0:25:39 +[04:08:36.035493] Averaged stats: lr: 0.000016 grad_norm: 0.3008 (0.3193) closs: 0.8684 (0.8498) +[04:08:36.115200] model saved +[04:08:36.872967] optimizer saved +[04:08:36.873428] other rank-common saved +[04:08:36.875290] rank-specific saved +[04:08:36.880381] log_dir: ./output_dir +[04:08:40.086697] Epoch: [3] [0/812] lr: 0.000016 grad_norm: 0.3132 (0.3132) closs: 0.6514 (0.6514) time: 3.2054 data: 1.2846 max mem: 27896 +[04:08:58.944133] Epoch: [3] [10/812] lr: 0.000016 grad_norm: 0.3039 (0.3061) closs: 0.9142 (0.8930) time: 2.0056 data: 0.1169 max mem: 27896 +[04:09:17.782690] Epoch: [3] [20/812] lr: 0.000016 grad_norm: 0.3034 (0.3083) closs: 0.8751 (0.8666) time: 1.8847 data: 0.0001 max mem: 27896 +[04:09:36.650555] Epoch: [3] [30/812] lr: 0.000016 grad_norm: 0.3027 (0.3036) closs: 0.8511 (0.8649) time: 1.8852 data: 0.0001 max mem: 27896 +[04:09:55.557013] Epoch: [3] [40/812] lr: 0.000015 grad_norm: 0.3062 (0.3063) closs: 0.8427 (0.8541) time: 1.8886 data: 0.0002 max mem: 27896 +[04:10:14.538779] Epoch: [3] [50/812] lr: 0.000015 grad_norm: 0.3062 (0.3108) closs: 0.8332 (0.8547) time: 1.8943 data: 0.0002 max mem: 27896 +[04:10:33.501826] Epoch: [3] [60/812] lr: 0.000015 grad_norm: 0.3091 (0.3128) closs: 0.8533 (0.8592) time: 1.8972 data: 0.0002 max mem: 27896 +[04:10:52.456478] Epoch: [3] [70/812] lr: 0.000015 grad_norm: 0.3068 (0.3104) closs: 0.8281 (0.8562) time: 1.8958 data: 0.0002 max mem: 27896 +[04:11:11.449969] Epoch: [3] [80/812] lr: 0.000014 grad_norm: 0.3006 (0.3107) closs: 0.8402 (0.8559) time: 1.8973 data: 0.0001 max mem: 27896 +[04:11:30.433114] Epoch: [3] [90/812] lr: 0.000014 grad_norm: 0.3006 (0.3130) closs: 0.8645 (0.8548) time: 1.8988 data: 0.0002 max mem: 27896 +[04:11:49.390336] Epoch: [3] [100/812] lr: 0.000014 grad_norm: 0.3083 (0.3140) closs: 0.8268 (0.8523) time: 1.8969 data: 0.0002 max mem: 27896 +[04:12:08.297454] Epoch: [3] [110/812] lr: 0.000014 grad_norm: 0.3087 (0.3158) closs: 0.7961 (0.8464) time: 1.8931 data: 0.0002 max mem: 27896 +[04:12:27.263638] Epoch: [3] [120/812] lr: 0.000013 grad_norm: 0.3062 (0.3156) closs: 0.7961 (0.8509) time: 1.8936 data: 0.0001 max mem: 27896 +[04:12:46.235939] Epoch: [3] [130/812] lr: 0.000013 grad_norm: 0.2982 (0.3144) closs: 0.8222 (0.8480) time: 1.8969 data: 0.0001 max mem: 27896 +[04:13:05.196913] Epoch: [3] [140/812] lr: 0.000013 grad_norm: 0.2880 (0.3124) closs: 0.8459 (0.8516) time: 1.8966 data: 0.0002 max mem: 27896 +[04:13:24.152162] Epoch: [3] [150/812] lr: 0.000013 grad_norm: 0.2880 (0.3148) closs: 0.8459 (0.8508) time: 1.8957 data: 0.0002 max mem: 27896 +[04:13:43.111343] Epoch: [3] [160/812] lr: 0.000012 grad_norm: 0.3095 (0.3133) closs: 0.8125 (0.8509) time: 1.8957 data: 0.0002 max mem: 27896 +[04:14:02.059325] Epoch: [3] [170/812] lr: 0.000012 grad_norm: 0.3053 (0.3137) closs: 0.8366 (0.8496) time: 1.8953 data: 0.0002 max mem: 27896 +[04:14:20.984571] Epoch: [3] [180/812] lr: 0.000012 grad_norm: 0.3173 (0.3141) closs: 0.8201 (0.8481) time: 1.8936 data: 0.0001 max mem: 27896 +[04:14:39.914781] Epoch: [3] [190/812] lr: 0.000012 grad_norm: 0.3322 (0.3155) closs: 0.8201 (0.8478) time: 1.8927 data: 0.0002 max mem: 27896 +[04:14:58.900447] Epoch: [3] [200/812] lr: 0.000012 grad_norm: 0.2994 (0.3135) closs: 0.8613 (0.8494) time: 1.8957 data: 0.0002 max mem: 27896 +[04:15:17.859832] Epoch: [3] [210/812] lr: 0.000011 grad_norm: 0.2909 (0.3131) closs: 0.8319 (0.8483) 
time: 1.8972 data: 0.0002 max mem: 27896 +[04:15:36.794509] Epoch: [3] [220/812] lr: 0.000011 grad_norm: 0.2960 (0.3124) closs: 0.8319 (0.8499) time: 1.8946 data: 0.0001 max mem: 27896 +[04:15:55.658566] Epoch: [3] [230/812] lr: 0.000011 grad_norm: 0.2969 (0.3122) closs: 0.8500 (0.8504) time: 1.8899 data: 0.0001 max mem: 27896 +[04:16:14.645590] Epoch: [3] [240/812] lr: 0.000011 grad_norm: 0.3129 (0.3137) closs: 0.8391 (0.8506) time: 1.8925 data: 0.0002 max mem: 27896 +[04:16:33.513571] Epoch: [3] [250/812] lr: 0.000011 grad_norm: 0.3129 (0.3137) closs: 0.8141 (0.8500) time: 1.8927 data: 0.0002 max mem: 27896 +[04:16:52.475377] Epoch: [3] [260/812] lr: 0.000010 grad_norm: 0.2952 (0.3132) closs: 0.7776 (0.8484) time: 1.8914 data: 0.0002 max mem: 27896 +[04:17:11.412915] Epoch: [3] [270/812] lr: 0.000010 grad_norm: 0.2944 (0.3123) closs: 0.7811 (0.8488) time: 1.8949 data: 0.0002 max mem: 27896 +[04:17:30.330446] Epoch: [3] [280/812] lr: 0.000010 grad_norm: 0.2960 (0.3125) closs: 0.8017 (0.8501) time: 1.8927 data: 0.0002 max mem: 27896 +[04:17:49.226468] Epoch: [3] [290/812] lr: 0.000010 grad_norm: 0.3128 (0.3135) closs: 0.8010 (0.8492) time: 1.8906 data: 0.0001 max mem: 27896 +[04:18:08.157379] Epoch: [3] [300/812] lr: 0.000010 grad_norm: 0.3259 (0.3140) closs: 0.7914 (0.8465) time: 1.8913 data: 0.0002 max mem: 27896 +[04:18:27.089429] Epoch: [3] [310/812] lr: 0.000010 grad_norm: 0.3107 (0.3136) closs: 0.8240 (0.8483) time: 1.8931 data: 0.0002 max mem: 27896 +[04:18:46.029402] Epoch: [3] [320/812] lr: 0.000009 grad_norm: 0.2942 (0.3132) closs: 0.8801 (0.8490) time: 1.8935 data: 0.0002 max mem: 27896 +[04:19:04.985159] Epoch: [3] [330/812] lr: 0.000009 grad_norm: 0.2938 (0.3127) closs: 0.8596 (0.8487) time: 1.8947 data: 0.0002 max mem: 27896 +[04:19:23.926985] Epoch: [3] [340/812] lr: 0.000009 grad_norm: 0.2990 (0.3139) closs: 0.8847 (0.8493) time: 1.8948 data: 0.0001 max mem: 27896 +[04:19:42.889202] Epoch: [3] [350/812] lr: 0.000009 grad_norm: 0.2984 (0.3136) closs: 0.8847 (0.8498) time: 1.8951 data: 0.0002 max mem: 27896 +[04:20:01.819354] Epoch: [3] [360/812] lr: 0.000009 grad_norm: 0.2945 (0.3141) closs: 0.8506 (0.8503) time: 1.8946 data: 0.0002 max mem: 27896 +[04:20:20.681250] Epoch: [3] [370/812] lr: 0.000009 grad_norm: 0.3086 (0.3139) closs: 0.8640 (0.8506) time: 1.8895 data: 0.0002 max mem: 27896 +[04:20:39.561245] Epoch: [3] [380/812] lr: 0.000008 grad_norm: 0.3031 (0.3136) closs: 0.8254 (0.8496) time: 1.8870 data: 0.0002 max mem: 27896 +[04:20:58.511789] Epoch: [3] [390/812] lr: 0.000008 grad_norm: 0.3048 (0.3138) closs: 0.7809 (0.8486) time: 1.8915 data: 0.0002 max mem: 27896 +[04:21:17.407643] Epoch: [3] [400/812] lr: 0.000008 grad_norm: 0.3048 (0.3140) closs: 0.8496 (0.8490) time: 1.8923 data: 0.0002 max mem: 27896 +[04:21:36.346825] Epoch: [3] [410/812] lr: 0.000008 grad_norm: 0.3205 (0.3144) closs: 0.8241 (0.8476) time: 1.8917 data: 0.0002 max mem: 27896 +[04:21:55.284941] Epoch: [3] [420/812] lr: 0.000008 grad_norm: 0.3215 (0.3144) closs: 0.7919 (0.8469) time: 1.8938 data: 0.0002 max mem: 27896 +[04:22:14.183727] Epoch: [3] [430/812] lr: 0.000008 grad_norm: 0.2908 (0.3151) closs: 0.7896 (0.8455) time: 1.8918 data: 0.0002 max mem: 27896 +[04:22:33.033058] Epoch: [3] [440/812] lr: 0.000008 grad_norm: 0.2986 (0.3149) closs: 0.7896 (0.8446) time: 1.8873 data: 0.0002 max mem: 27896 +[04:22:52.033141] Epoch: [3] [450/812] lr: 0.000007 grad_norm: 0.3073 (0.3153) closs: 0.8106 (0.8442) time: 1.8924 data: 0.0002 max mem: 27896 +[04:23:10.877552] Epoch: [3] [460/812] lr: 0.000007 
grad_norm: 0.3192 (0.3159) closs: 0.8106 (0.8436) time: 1.8922 data: 0.0002 max mem: 27896 +[04:23:29.818614] Epoch: [3] [470/812] lr: 0.000007 grad_norm: 0.3165 (0.3162) closs: 0.8062 (0.8433) time: 1.8892 data: 0.0002 max mem: 27896 +[04:23:48.729837] Epoch: [3] [480/812] lr: 0.000007 grad_norm: 0.3106 (0.3164) closs: 0.7980 (0.8427) time: 1.8925 data: 0.0002 max mem: 27896 +[04:24:07.666036] Epoch: [3] [490/812] lr: 0.000007 grad_norm: 0.2986 (0.3160) closs: 0.7908 (0.8428) time: 1.8923 data: 0.0002 max mem: 27896 +[04:24:26.621467] Epoch: [3] [500/812] lr: 0.000007 grad_norm: 0.2896 (0.3159) closs: 0.8088 (0.8424) time: 1.8945 data: 0.0002 max mem: 27896 +[04:24:45.599584] Epoch: [3] [510/812] lr: 0.000007 grad_norm: 0.2821 (0.3161) closs: 0.8423 (0.8438) time: 1.8966 data: 0.0002 max mem: 27896 +[04:25:04.476389] Epoch: [3] [520/812] lr: 0.000007 grad_norm: 0.3012 (0.3183) closs: 0.9362 (0.8444) time: 1.8927 data: 0.0002 max mem: 27896 +[04:25:23.459088] Epoch: [3] [530/812] lr: 0.000006 grad_norm: 0.3306 (0.3186) closs: 0.7977 (0.8438) time: 1.8929 data: 0.0002 max mem: 27896 +[04:25:42.384155] Epoch: [3] [540/812] lr: 0.000006 grad_norm: 0.3257 (0.3189) closs: 0.7825 (0.8435) time: 1.8953 data: 0.0002 max mem: 27896 +[04:26:01.358224] Epoch: [3] [550/812] lr: 0.000006 grad_norm: 0.3128 (0.3201) closs: 0.8171 (0.8426) time: 1.8949 data: 0.0002 max mem: 27896 +[04:26:20.291928] Epoch: [3] [560/812] lr: 0.000006 grad_norm: 0.3103 (0.3200) closs: 0.8064 (0.8420) time: 1.8953 data: 0.0002 max mem: 27896 +[04:26:39.237442] Epoch: [3] [570/812] lr: 0.000006 grad_norm: 0.3198 (0.3202) closs: 0.8320 (0.8427) time: 1.8939 data: 0.0002 max mem: 27896 +[04:26:58.136155] Epoch: [3] [580/812] lr: 0.000006 grad_norm: 0.3220 (0.3202) closs: 0.8468 (0.8431) time: 1.8921 data: 0.0002 max mem: 27896 +[04:27:17.086286] Epoch: [3] [590/812] lr: 0.000006 grad_norm: 0.3003 (0.3201) closs: 0.8231 (0.8426) time: 1.8924 data: 0.0002 max mem: 27896 +[04:27:36.048124] Epoch: [3] [600/812] lr: 0.000006 grad_norm: 0.3098 (0.3202) closs: 0.7716 (0.8424) time: 1.8955 data: 0.0002 max mem: 27896 +[04:27:54.982696] Epoch: [3] [610/812] lr: 0.000006 grad_norm: 0.3098 (0.3201) closs: 0.8542 (0.8431) time: 1.8948 data: 0.0002 max mem: 27896 +[04:28:13.909296] Epoch: [3] [620/812] lr: 0.000006 grad_norm: 0.3018 (0.3205) closs: 0.8435 (0.8433) time: 1.8930 data: 0.0002 max mem: 27896 +[04:28:32.846395] Epoch: [3] [630/812] lr: 0.000006 grad_norm: 0.3174 (0.3209) closs: 0.8081 (0.8426) time: 1.8931 data: 0.0002 max mem: 27896 +[04:28:51.781440] Epoch: [3] [640/812] lr: 0.000006 grad_norm: 0.3213 (0.3209) closs: 0.8105 (0.8424) time: 1.8935 data: 0.0002 max mem: 27896 +[04:29:10.626535] Epoch: [3] [650/812] lr: 0.000005 grad_norm: 0.3271 (0.3213) closs: 0.8478 (0.8436) time: 1.8889 data: 0.0002 max mem: 27896 +[04:29:29.652082] Epoch: [3] [660/812] lr: 0.000005 grad_norm: 0.3271 (0.3219) closs: 0.8954 (0.8442) time: 1.8935 data: 0.0003 max mem: 27896 +[04:29:48.482813] Epoch: [3] [670/812] lr: 0.000005 grad_norm: 0.3248 (0.3219) closs: 0.8301 (0.8436) time: 1.8927 data: 0.0003 max mem: 27896 +[04:30:07.417393] Epoch: [3] [680/812] lr: 0.000005 grad_norm: 0.3113 (0.3224) closs: 0.8336 (0.8442) time: 1.8882 data: 0.0002 max mem: 27896 +[04:30:26.314560] Epoch: [3] [690/812] lr: 0.000005 grad_norm: 0.2994 (0.3220) closs: 0.8543 (0.8439) time: 1.8915 data: 0.0002 max mem: 27896 +[04:30:45.279799] Epoch: [3] [700/812] lr: 0.000005 grad_norm: 0.2994 (0.3222) closs: 0.8287 (0.8441) time: 1.8931 data: 0.0002 max mem: 27896 
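The timing fields allow a quick consistency check. Steps run at a steady ~1.89 s with negligible data-loading time, and with batch_size=8 on 8 GPUs and accum_iter=1, each of the 812 steps consumes 64 samples, i.e. one pass over the ~52K-example Alpaca set per epoch. Back-of-envelope, with all inputs copied from the log above:

# Rough consistency check for this 13B run; inputs taken from the log.
gpus, per_gpu_batch, accum = 8, 8, 1      # world_size, batch_size, accum_iter
steps, sec_per_step = 812, 1.893          # steps/epoch and a typical "time:" value

print(gpus * per_gpu_batch * accum * steps)  # 51968 samples/epoch (~52K Alpaca examples)
print(steps * sec_per_step / 60)             # ~25.6 min, cf. "Epoch: [...] Total time: 0:25:38"
print(4 * steps * sec_per_step / 3600)       # ~1.71 h, cf. "Training time 1:42:44"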
+[04:31:04.223755] Epoch: [3] [710/812] lr: 0.000005 grad_norm: 0.3136 (0.3222) closs: 0.8644 (0.8445) time: 1.8954 data: 0.0002 max mem: 27896 +[04:31:23.141454] Epoch: [3] [720/812] lr: 0.000005 grad_norm: 0.3024 (0.3221) closs: 0.8557 (0.8448) time: 1.8930 data: 0.0002 max mem: 27896 +[04:31:42.071129] Epoch: [3] [730/812] lr: 0.000005 grad_norm: 0.3024 (0.3218) closs: 0.8299 (0.8441) time: 1.8923 data: 0.0002 max mem: 27896 +[04:32:01.036945] Epoch: [3] [740/812] lr: 0.000005 grad_norm: 0.3053 (0.3215) closs: 0.8202 (0.8445) time: 1.8947 data: 0.0002 max mem: 27896 +[04:32:19.981071] Epoch: [3] [750/812] lr: 0.000005 grad_norm: 0.3053 (0.3215) closs: 0.8202 (0.8441) time: 1.8954 data: 0.0002 max mem: 27896 +[04:32:38.888418] Epoch: [3] [760/812] lr: 0.000005 grad_norm: 0.2892 (0.3214) closs: 0.8244 (0.8446) time: 1.8925 data: 0.0002 max mem: 27896 +[04:32:57.804891] Epoch: [3] [770/812] lr: 0.000005 grad_norm: 0.2988 (0.3216) closs: 0.8321 (0.8446) time: 1.8911 data: 0.0002 max mem: 27896 +[04:33:16.740653] Epoch: [3] [780/812] lr: 0.000005 grad_norm: 0.2995 (0.3220) closs: 0.8274 (0.8445) time: 1.8925 data: 0.0001 max mem: 27896 +[04:33:35.601422] Epoch: [3] [790/812] lr: 0.000005 grad_norm: 0.3101 (0.3220) closs: 0.8274 (0.8445) time: 1.8898 data: 0.0001 max mem: 27896 +[04:33:54.572764] Epoch: [3] [800/812] lr: 0.000005 grad_norm: 0.3027 (0.3219) closs: 0.8614 (0.8449) time: 1.8915 data: 0.0001 max mem: 27896 +[04:34:13.455842] Epoch: [3] [810/812] lr: 0.000005 grad_norm: 0.3061 (0.3223) closs: 0.9174 (0.8456) time: 1.8927 data: 0.0001 max mem: 27896 +[04:34:15.578241] Epoch: [3] Total time: 0:25:38 +[04:34:15.579225] Averaged stats: lr: 0.000005 grad_norm: 0.3140 (0.3224) closs: 0.9157 (0.8461) +[04:34:15.661264] model saved +[04:34:16.446718] optimizer saved +[04:34:16.447181] other rank-common saved +[04:34:16.449046] rank-specific saved +[04:34:16.449254] Training time 1:42:44 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..8f3ba8a74edeb56d30f33b93229d72ff0893234d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57840f6a94b317e5216383955e671d5c485aa459e9a77c46399004b14aab3449 +size 16308187 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..e0aa9f434f0cc0a821265ec66357c81590a16153 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e48515afe7b737d914e6a945fddc635b1f3bd53d716da73568279b19d4e9fc +size 64801559 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..6c3e73a9d547910fb183caa16be9b2f9ed5a7866 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:15ad86707b40e58d9f1e5b304e7f138c74be797e45cdcd6c9c3e67d1ddea2a8b +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00000-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00000-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5700f48e3c878c8ea005c25793d536c052e44be --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00000-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:773a3b4cf6877fcfb087e3efb3b631fc40b16a6fdef5b9afb3cd6854bc59509f +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00001-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00001-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..695871058b022437de6682dd3f0b8db0055fc6fa --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch0/rank-specific-00001-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9baaa2fa1f453e7261aad9ed636db8f4395edcee0ada8daad1f8078b40d5f61c +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf8b0b7d0020d9c161451e9628f04abe5fbbfd7d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95c28c92604c98733f7d11de93aabb7b91bf51cf6d5d1b4a7648f88735df9be8 +size 16308187 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..4ae586e8704515a765d6489bf6c23b882d96dd76 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:726e9afa5e22acbf9e238aaa616537ff8bc766c7a407d50f49fd184a85596d3f +size 64801559 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..db1a4f53cdd71b119a332dfc0460aacb6dcba83d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3ffbc6452c328bba47a43a12dbd4bc293c231de8f57a6bd819aa611ed703d60 +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00000-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00000-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5700f48e3c878c8ea005c25793d536c052e44be --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00000-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:773a3b4cf6877fcfb087e3efb3b631fc40b16a6fdef5b9afb3cd6854bc59509f +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00001-of-00002.pth 
b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00001-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..695871058b022437de6682dd3f0b8db0055fc6fa --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch1/rank-specific-00001-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9baaa2fa1f453e7261aad9ed636db8f4395edcee0ada8daad1f8078b40d5f61c +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..0b6b9e7e744aa00e7901cd325ae5b2ca70edd02f --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71c909bd4009747dcecd359334b72083ce7d70ba611d0835b3f9e805633df345 +size 16308187 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..8209a0373966755123d2c2a67c9f7f3716dad120 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8af183f8bb45d2e6aecf19b37dd23b380900961ab1df49a422c47a88086dd99b +size 64801559 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..654e95a72619e275667b7e2858b830b4eb336d60 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e1407e17b55be720204de47db0e9d89c18253e4bd99ce9beecf96812ad9220b +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00000-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00000-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5700f48e3c878c8ea005c25793d536c052e44be --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00000-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:773a3b4cf6877fcfb087e3efb3b631fc40b16a6fdef5b9afb3cd6854bc59509f +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00001-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00001-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..695871058b022437de6682dd3f0b8db0055fc6fa --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch2/rank-specific-00001-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9baaa2fa1f453e7261aad9ed636db8f4395edcee0ada8daad1f8078b40d5f61c +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.model.pth new file mode 100644 index 
0000000000000000000000000000000000000000..fdb8a6396561e7587d5763107287361c3b7d2a65 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ca963a2250f78056df0749f6d91daf572f7e87a00398a2ea04fb8e0d4fb2981 +size 16308187 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..28bd47df51e7ce248d00ef3d9ee2ca5cd4821c25 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8f6388325fba6658d8f5f383cb6affc164930411672f0944d7f249e10a03b78 +size 64801559 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5680517ef1395e38ea1a66b086e2213b58700f0 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99c45bf04236026e6ce60807e96c85d649fb32f7065883be1052037d816478dc +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00000-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00000-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5700f48e3c878c8ea005c25793d536c052e44be --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00000-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:773a3b4cf6877fcfb087e3efb3b631fc40b16a6fdef5b9afb3cd6854bc59509f +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00001-of-00002.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00001-of-00002.pth new file mode 100644 index 0000000000000000000000000000000000000000..695871058b022437de6682dd3f0b8db0055fc6fa --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/epoch3/rank-specific-00001-of-00002.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9baaa2fa1f453e7261aad9ed636db8f4395edcee0ada8daad1f8078b40d5f61c +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/log.txt b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2ec26280e83c14eb2d09d4d2d0ccd17ebb3ede7 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/log.txt @@ -0,0 +1,4 @@ +{"train_lr": 2.49923076923077e-05, "train_closs": 0.7980487976441016, "train_grad_norm": 0.5980251384056532, "epoch": 0, "val_lr": 2.49923076923077e-05, "val_closs": 0.7980487976441016, "val_grad_norm": 0.5980251384056532} +{"train_lr": 4.6109080828728024e-05, "train_closs": 0.7623572307492678, "train_grad_norm": 0.45453824085914174, "epoch": 1, "val_lr": 4.6109080828728024e-05, "val_closs": 0.7623572307492678, "val_grad_norm": 0.45453824085914174} +{"train_lr": 2.750346153846151e-05, "train_closs": 0.750338752788993, "train_grad_norm": 0.46191218195511746, "epoch": 2, "val_lr": 2.750346153846151e-05, "val_closs": 
0.750338752788993, "val_grad_norm": 0.46191218195511746} +{"train_lr": 8.894380709733404e-06, "train_closs": 0.742047518081963, "train_grad_norm": 0.47685301401064945, "epoch": 3, "val_lr": 8.894380709733404e-06, "val_closs": 0.742047518081963, "val_grad_norm": 0.47685301401064945} diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/output.log b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/output.log new file mode 100644 index 0000000000000000000000000000000000000000..7237f8e9f2027c7ccb41d96c4cfe451ad95ec056 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B/output.log @@ -0,0 +1,7130 @@ +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +| distributed init (rank 1): env://, gpu 1 +| distributed init (rank 2): env://, gpu 2 +| distributed init (rank 3): env://, gpu 3 +| distributed init (rank 0): env://, gpu 0 +[16:45:57.921447] > initializing model parallel with size 1 +[16:45:57.921566] > initializing ddp with size 4 +[16:45:57.921573] > initializing pipeline with size 1 +[16:45:58.007928] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory +[16:45:58.008031] Namespace(batch_size=4, +accum_iter=2, +llama_type='llama_peft', +llama_config=['../checkpoints/llama2/Llama-2-70b/params.json'], +no_visual=True, +tokenizer_path='../checkpoints/llama2/Llama-2-70b/tokenizer.model', +pretrained_path='../checkpoints/llama2/Llama-2-70b/', +pretrained_type='meta_ori', +weight_decay=0.02, +lr=5e-05, +min_lr=5e-06, +epochs=4, +warmup_epochs=1.0, +clip_grad=2, +max_words=512, +dialog=False, +data_config='configs/data/finetune/sg/alpaca.yaml', +output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B', +log_dir='./output_dir', +save_interval=1, +device='cuda', +seed=0, +resume='', +num_workers=4, +pin_mem=True, +world_size=4, +local_rank=-1, +dist_on_itp=False, +dist_url='env://', +model_parallel_size=1, +data_parallel='sdp', +precision='bf16', +checkpointing=True, +quant=True, +rank=0, +gpu=0, +distributed=True, +dist_backend='nccl') +[16:45:58.019987] Model Args: + ModelArgs(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, vocab_size=32000, multiple_of=4096, ffn_dim_multiplier=1.3, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True) +[16:54:32.881369] Model is Peft: True +[16:54:32.888802] Trainable parameter count : 8036352 (local rank), 8036352 (all). 
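Editor's note: the trainable-parameter count reported just above is consistent with normBias tuning, i.e. only the per-layer fp32 biases and RMSNorm weights enter the count; the bf16 weight matrices enumerated below sum to far more than 8,036,352, so they are presumably listed but frozen. A minimal sanity-check sketch of the arithmetic, using the tensor shapes printed in this log; the trailing +8192 is assumed to be the final model norm (llma.norm.weight), which falls outside this excerpt:

# Verify the reported count of 8,036,352 trainable parameters for
# bias + norm tuning on Llama-2-70B, using shapes from the log above.
dim, kv_dim, ffn_dim, n_layers = 8192, 1024, 28672, 80
per_layer = (
    dim        # attention.wq.bias
    + kv_dim   # attention.wk.bias
    + kv_dim   # attention.wv.bias
    + dim      # attention.wo.bias
    + ffn_dim  # feed_forward.w1.bias
    + dim      # feed_forward.w2.bias
    + ffn_dim  # feed_forward.w3.bias
    + dim      # attention_norm.weight
    + dim      # ffn_norm.weight
)
total = n_layers * per_layer + dim  # + final llma.norm.weight (assumed)
assert per_layer == 100_352
assert total == 8_036_352  # matches "Trainable parameter count" above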
+[16:54:32.917071] Trainable param: llma.tok_embeddings.weight, local_size: torch.Size([32000, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917111] Trainable param: llma.layers.0.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917124] Trainable param: llma.layers.0.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.917137] Trainable param: llma.layers.0.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917147] Trainable param: llma.layers.0.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917159] Trainable param: llma.layers.0.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917169] Trainable param: llma.layers.0.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917180] Trainable param: llma.layers.0.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917190] Trainable param: llma.layers.0.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917204] Trainable param: llma.layers.0.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917214] Trainable param: llma.layers.0.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917226] Trainable param: llma.layers.0.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917236] Trainable param: llma.layers.0.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917247] Trainable param: llma.layers.0.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917257] Trainable param: llma.layers.0.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917269] Trainable param: llma.layers.0.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917279] Trainable param: llma.layers.0.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917293] Trainable param: llma.layers.1.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917304] Trainable param: llma.layers.1.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.917315] Trainable param: llma.layers.1.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917325] Trainable param: llma.layers.1.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917336] Trainable param: llma.layers.1.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917346] Trainable param: llma.layers.1.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917357] Trainable param: llma.layers.1.attention.wo.weight, local_size: torch.Size([8192, 8192]), 
model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917367] Trainable param: llma.layers.1.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917379] Trainable param: llma.layers.1.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917389] Trainable param: llma.layers.1.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917400] Trainable param: llma.layers.1.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917410] Trainable param: llma.layers.1.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917421] Trainable param: llma.layers.1.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917431] Trainable param: llma.layers.1.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917442] Trainable param: llma.layers.1.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917452] Trainable param: llma.layers.1.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917466] Trainable param: llma.layers.2.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917475] Trainable param: llma.layers.2.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.917487] Trainable param: llma.layers.2.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917497] Trainable param: llma.layers.2.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917508] Trainable param: llma.layers.2.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917522] Trainable param: llma.layers.2.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917533] Trainable param: llma.layers.2.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917543] Trainable param: llma.layers.2.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917555] Trainable param: llma.layers.2.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917565] Trainable param: llma.layers.2.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917576] Trainable param: llma.layers.2.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917585] Trainable param: llma.layers.2.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917596] Trainable param: llma.layers.2.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917606] Trainable param: llma.layers.2.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917617] Trainable param: 
llma.layers.2.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917627] Trainable param: llma.layers.2.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917641] Trainable param: llma.layers.3.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917650] Trainable param: llma.layers.3.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.917662] Trainable param: llma.layers.3.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917671] Trainable param: llma.layers.3.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917682] Trainable param: llma.layers.3.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917692] Trainable param: llma.layers.3.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917703] Trainable param: llma.layers.3.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917713] Trainable param: llma.layers.3.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917725] Trainable param: llma.layers.3.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917735] Trainable param: llma.layers.3.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917746] Trainable param: llma.layers.3.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917755] Trainable param: llma.layers.3.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917767] Trainable param: llma.layers.3.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917777] Trainable param: llma.layers.3.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917787] Trainable param: llma.layers.3.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917799] Trainable param: llma.layers.3.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917812] Trainable param: llma.layers.4.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917821] Trainable param: llma.layers.4.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.917832] Trainable param: llma.layers.4.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917842] Trainable param: llma.layers.4.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.917853] Trainable param: llma.layers.4.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917863] Trainable param: llma.layers.4.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 
+[16:54:32.917874] Trainable param: llma.layers.4.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917884] Trainable param: llma.layers.4.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917896] Trainable param: llma.layers.4.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917905] Trainable param: llma.layers.4.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917917] Trainable param: llma.layers.4.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917928] Trainable param: llma.layers.4.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917939] Trainable param: llma.layers.4.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917949] Trainable param: llma.layers.4.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.917959] Trainable param: llma.layers.4.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917970] Trainable param: llma.layers.4.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.917984] Trainable param: llma.layers.5.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.917993] Trainable param: llma.layers.5.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.918004] Trainable param: llma.layers.5.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918014] Trainable param: llma.layers.5.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918025] Trainable param: llma.layers.5.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918035] Trainable param: llma.layers.5.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918046] Trainable param: llma.layers.5.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918055] Trainable param: llma.layers.5.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918067] Trainable param: llma.layers.5.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918077] Trainable param: llma.layers.5.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918088] Trainable param: llma.layers.5.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918098] Trainable param: llma.layers.5.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918109] Trainable param: llma.layers.5.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918119] Trainable param: llma.layers.5.feed_forward.w3.bias, local_size: 
torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918130] Trainable param: llma.layers.5.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918142] Trainable param: llma.layers.5.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918155] Trainable param: llma.layers.6.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918165] Trainable param: llma.layers.6.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.918176] Trainable param: llma.layers.6.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918186] Trainable param: llma.layers.6.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918197] Trainable param: llma.layers.6.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918207] Trainable param: llma.layers.6.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918217] Trainable param: llma.layers.6.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918227] Trainable param: llma.layers.6.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918239] Trainable param: llma.layers.6.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918249] Trainable param: llma.layers.6.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918260] Trainable param: llma.layers.6.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918270] Trainable param: llma.layers.6.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918281] Trainable param: llma.layers.6.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918291] Trainable param: llma.layers.6.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918302] Trainable param: llma.layers.6.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918313] Trainable param: llma.layers.6.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918326] Trainable param: llma.layers.7.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918335] Trainable param: llma.layers.7.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.918346] Trainable param: llma.layers.7.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918356] Trainable param: llma.layers.7.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918367] Trainable param: llma.layers.7.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918377] Trainable param: 
llma.layers.7.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918388] Trainable param: llma.layers.7.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918398] Trainable param: llma.layers.7.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918410] Trainable param: llma.layers.7.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918420] Trainable param: llma.layers.7.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918431] Trainable param: llma.layers.7.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918440] Trainable param: llma.layers.7.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918451] Trainable param: llma.layers.7.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918461] Trainable param: llma.layers.7.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918472] Trainable param: llma.layers.7.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918482] Trainable param: llma.layers.7.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918496] Trainable param: llma.layers.8.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918505] Trainable param: llma.layers.8.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.918516] Trainable param: llma.layers.8.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918526] Trainable param: llma.layers.8.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918537] Trainable param: llma.layers.8.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918547] Trainable param: llma.layers.8.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918558] Trainable param: llma.layers.8.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918567] Trainable param: llma.layers.8.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918579] Trainable param: llma.layers.8.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918589] Trainable param: llma.layers.8.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918600] Trainable param: llma.layers.8.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918610] Trainable param: llma.layers.8.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918621] Trainable param: llma.layers.8.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: 
torch.bfloat16 +[16:54:32.918631] Trainable param: llma.layers.8.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918642] Trainable param: llma.layers.8.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918653] Trainable param: llma.layers.8.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918666] Trainable param: llma.layers.9.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918676] Trainable param: llma.layers.9.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.918687] Trainable param: llma.layers.9.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918696] Trainable param: llma.layers.9.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918708] Trainable param: llma.layers.9.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918717] Trainable param: llma.layers.9.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918728] Trainable param: llma.layers.9.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918738] Trainable param: llma.layers.9.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918750] Trainable param: llma.layers.9.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918759] Trainable param: llma.layers.9.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918771] Trainable param: llma.layers.9.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918780] Trainable param: llma.layers.9.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918792] Trainable param: llma.layers.9.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918801] Trainable param: llma.layers.9.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918812] Trainable param: llma.layers.9.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918823] Trainable param: llma.layers.9.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918836] Trainable param: llma.layers.10.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918846] Trainable param: llma.layers.10.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.918857] Trainable param: llma.layers.10.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918867] Trainable param: llma.layers.10.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918878] Trainable param: llma.layers.10.attention.wv.weight, local_size: torch.Size([1024, 
8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918887] Trainable param: llma.layers.10.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.918898] Trainable param: llma.layers.10.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918908] Trainable param: llma.layers.10.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918920] Trainable param: llma.layers.10.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918930] Trainable param: llma.layers.10.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918941] Trainable param: llma.layers.10.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918951] Trainable param: llma.layers.10.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918962] Trainable param: llma.layers.10.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.918971] Trainable param: llma.layers.10.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.918982] Trainable param: llma.layers.10.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.918993] Trainable param: llma.layers.10.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919006] Trainable param: llma.layers.11.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919016] Trainable param: llma.layers.11.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.919026] Trainable param: llma.layers.11.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919036] Trainable param: llma.layers.11.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919047] Trainable param: llma.layers.11.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919057] Trainable param: llma.layers.11.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919068] Trainable param: llma.layers.11.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919078] Trainable param: llma.layers.11.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919090] Trainable param: llma.layers.11.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919100] Trainable param: llma.layers.11.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919111] Trainable param: llma.layers.11.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919121] Trainable param: llma.layers.11.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919132] Trainable 
param: llma.layers.11.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919142] Trainable param: llma.layers.11.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919152] Trainable param: llma.layers.11.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919163] Trainable param: llma.layers.11.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919176] Trainable param: llma.layers.12.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919186] Trainable param: llma.layers.12.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.919197] Trainable param: llma.layers.12.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919207] Trainable param: llma.layers.12.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919218] Trainable param: llma.layers.12.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919228] Trainable param: llma.layers.12.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919239] Trainable param: llma.layers.12.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919248] Trainable param: llma.layers.12.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919260] Trainable param: llma.layers.12.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919270] Trainable param: llma.layers.12.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919281] Trainable param: llma.layers.12.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919291] Trainable param: llma.layers.12.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919301] Trainable param: llma.layers.12.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919311] Trainable param: llma.layers.12.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919322] Trainable param: llma.layers.12.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919333] Trainable param: llma.layers.12.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919346] Trainable param: llma.layers.13.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919355] Trainable param: llma.layers.13.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.919367] Trainable param: llma.layers.13.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919376] Trainable param: llma.layers.13.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: 
True, dtype: torch.float32 +[16:54:32.919387] Trainable param: llma.layers.13.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919397] Trainable param: llma.layers.13.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919408] Trainable param: llma.layers.13.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919418] Trainable param: llma.layers.13.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919430] Trainable param: llma.layers.13.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919440] Trainable param: llma.layers.13.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919451] Trainable param: llma.layers.13.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919460] Trainable param: llma.layers.13.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919472] Trainable param: llma.layers.13.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919481] Trainable param: llma.layers.13.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919492] Trainable param: llma.layers.13.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919503] Trainable param: llma.layers.13.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919516] Trainable param: llma.layers.14.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919526] Trainable param: llma.layers.14.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.919537] Trainable param: llma.layers.14.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919547] Trainable param: llma.layers.14.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919558] Trainable param: llma.layers.14.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919567] Trainable param: llma.layers.14.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919578] Trainable param: llma.layers.14.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919588] Trainable param: llma.layers.14.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919600] Trainable param: llma.layers.14.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919610] Trainable param: llma.layers.14.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919621] Trainable param: llma.layers.14.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919631] Trainable param: 
llma.layers.14.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919642] Trainable param: llma.layers.14.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919652] Trainable param: llma.layers.14.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919662] Trainable param: llma.layers.14.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919673] Trainable param: llma.layers.14.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919686] Trainable param: llma.layers.15.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919696] Trainable param: llma.layers.15.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.919707] Trainable param: llma.layers.15.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919717] Trainable param: llma.layers.15.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919728] Trainable param: llma.layers.15.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919738] Trainable param: llma.layers.15.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919749] Trainable param: llma.layers.15.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919758] Trainable param: llma.layers.15.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919770] Trainable param: llma.layers.15.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919780] Trainable param: llma.layers.15.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919791] Trainable param: llma.layers.15.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919801] Trainable param: llma.layers.15.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919812] Trainable param: llma.layers.15.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919822] Trainable param: llma.layers.15.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919832] Trainable param: llma.layers.15.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919843] Trainable param: llma.layers.15.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919856] Trainable param: llma.layers.16.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919866] Trainable param: llma.layers.16.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.919877] Trainable param: llma.layers.16.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: 
True, dtype: torch.bfloat16 +[16:54:32.919887] Trainable param: llma.layers.16.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919898] Trainable param: llma.layers.16.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919907] Trainable param: llma.layers.16.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.919918] Trainable param: llma.layers.16.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919928] Trainable param: llma.layers.16.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919940] Trainable param: llma.layers.16.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919950] Trainable param: llma.layers.16.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.919961] Trainable param: llma.layers.16.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919970] Trainable param: llma.layers.16.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.919981] Trainable param: llma.layers.16.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.919991] Trainable param: llma.layers.16.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.920003] Trainable param: llma.layers.16.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920013] Trainable param: llma.layers.16.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920027] Trainable param: llma.layers.17.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920036] Trainable param: llma.layers.17.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.920047] Trainable param: llma.layers.17.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920057] Trainable param: llma.layers.17.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.920068] Trainable param: llma.layers.17.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920078] Trainable param: llma.layers.17.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.920089] Trainable param: llma.layers.17.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920098] Trainable param: llma.layers.17.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920110] Trainable param: llma.layers.17.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920120] Trainable param: llma.layers.17.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.920131] Trainable param: 
llma.layers.17.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920141] Trainable param: llma.layers.17.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920152] Trainable param: llma.layers.17.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920162] Trainable param: llma.layers.17.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.920172] Trainable param: llma.layers.17.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920183] Trainable param: llma.layers.17.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920196] Trainable param: llma.layers.18.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920206] Trainable param: llma.layers.18.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.920216] Trainable param: llma.layers.18.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920226] Trainable param: llma.layers.18.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.920237] Trainable param: llma.layers.18.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920247] Trainable param: llma.layers.18.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.920258] Trainable param: llma.layers.18.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920267] Trainable param: llma.layers.18.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920279] Trainable param: llma.layers.18.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920289] Trainable param: llma.layers.18.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.920300] Trainable param: llma.layers.18.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920310] Trainable param: llma.layers.18.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920321] Trainable param: llma.layers.18.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920330] Trainable param: llma.layers.18.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.920341] Trainable param: llma.layers.18.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920352] Trainable param: llma.layers.18.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.920365] Trainable param: llma.layers.19.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.920379] Trainable param: llma.layers.19.attention.wq.bias, local_size: torch.Size([8192]), 
model_parallel: True, dtype: torch.float32
+[16:54:32.920390] Trainable param: llma.layers.19.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920400] Trainable param: llma.layers.19.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[16:54:32.920411] Trainable param: llma.layers.19.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920420] Trainable param: llma.layers.19.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[16:54:32.920432] Trainable param: llma.layers.19.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920441] Trainable param: llma.layers.19.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[16:54:32.920453] Trainable param: llma.layers.19.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920463] Trainable param: llma.layers.19.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[16:54:32.920474] Trainable param: llma.layers.19.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920484] Trainable param: llma.layers.19.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[16:54:32.920495] Trainable param: llma.layers.19.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920505] Trainable param: llma.layers.19.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[16:54:32.920516] Trainable param: llma.layers.19.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[16:54:32.920526] Trainable param: llma.layers.19.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[16:54:32.920539] Trainable param: llma.layers.20.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920549] Trainable param: llma.layers.20.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[16:54:32.920560] Trainable param: llma.layers.20.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920570] Trainable param: llma.layers.20.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[16:54:32.920581] Trainable param: llma.layers.20.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920590] Trainable param: llma.layers.20.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[16:54:32.920601] Trainable param: llma.layers.20.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920611] Trainable param: llma.layers.20.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[16:54:32.920623] Trainable param: llma.layers.20.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920632] Trainable param: llma.layers.20.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[16:54:32.920644] Trainable param: llma.layers.20.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920653] Trainable param: llma.layers.20.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[16:54:32.920664] Trainable param: llma.layers.20.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[16:54:32.920674] Trainable param: llma.layers.20.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[16:54:32.920685] Trainable param: llma.layers.20.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[16:54:32.920696] Trainable param: llma.layers.20.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[... 16:54:32.920709 .. 16:54:32.925470: entries for llma.layers.21 through llma.layers.49 continue in the same 16-entries-per-layer pattern: the attention wq/wk/wv/wo and feed_forward w1/w2/w3 weights in torch.bfloat16 with model_parallel: True; all biases and the attention_norm/ffn_norm weights in torch.float32, with the wq/wk/wv/w1/w3 biases model_parallel: True and the wo/w2 biases and both norm weights model_parallel: False ...]
+[16:54:32.925479] Trainable param: 
llma.layers.49.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925491] Trainable param: llma.layers.50.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925500] Trainable param: llma.layers.50.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.925510] Trainable param: llma.layers.50.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925523] Trainable param: llma.layers.50.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.925533] Trainable param: llma.layers.50.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925542] Trainable param: llma.layers.50.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.925552] Trainable param: llma.layers.50.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925561] Trainable param: llma.layers.50.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925572] Trainable param: llma.layers.50.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925581] Trainable param: llma.layers.50.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.925591] Trainable param: llma.layers.50.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925599] Trainable param: llma.layers.50.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925609] Trainable param: llma.layers.50.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925618] Trainable param: llma.layers.50.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.925628] Trainable param: llma.layers.50.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925638] Trainable param: llma.layers.50.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925650] Trainable param: llma.layers.51.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925658] Trainable param: llma.layers.51.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.925669] Trainable param: llma.layers.51.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925677] Trainable param: llma.layers.51.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.925687] Trainable param: llma.layers.51.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925696] Trainable param: llma.layers.51.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.925706] Trainable param: llma.layers.51.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, 
dtype: torch.bfloat16 +[16:54:32.925715] Trainable param: llma.layers.51.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925726] Trainable param: llma.layers.51.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925734] Trainable param: llma.layers.51.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.925744] Trainable param: llma.layers.51.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925753] Trainable param: llma.layers.51.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925763] Trainable param: llma.layers.51.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925772] Trainable param: llma.layers.51.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.925782] Trainable param: llma.layers.51.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925792] Trainable param: llma.layers.51.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925805] Trainable param: llma.layers.52.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925814] Trainable param: llma.layers.52.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.925825] Trainable param: llma.layers.52.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925833] Trainable param: llma.layers.52.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.925843] Trainable param: llma.layers.52.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925852] Trainable param: llma.layers.52.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.925862] Trainable param: llma.layers.52.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925871] Trainable param: llma.layers.52.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925882] Trainable param: llma.layers.52.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925891] Trainable param: llma.layers.52.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.925901] Trainable param: llma.layers.52.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925909] Trainable param: llma.layers.52.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925919] Trainable param: llma.layers.52.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925928] Trainable param: llma.layers.52.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.925938] Trainable param: 
llma.layers.52.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925947] Trainable param: llma.layers.52.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.925959] Trainable param: llma.layers.53.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925968] Trainable param: llma.layers.53.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.925978] Trainable param: llma.layers.53.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.925987] Trainable param: llma.layers.53.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.925997] Trainable param: llma.layers.53.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926006] Trainable param: llma.layers.53.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926016] Trainable param: llma.layers.53.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926025] Trainable param: llma.layers.53.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926036] Trainable param: llma.layers.53.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926044] Trainable param: llma.layers.53.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926054] Trainable param: llma.layers.53.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926064] Trainable param: llma.layers.53.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926074] Trainable param: llma.layers.53.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926083] Trainable param: llma.layers.53.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926093] Trainable param: llma.layers.53.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926103] Trainable param: llma.layers.53.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926114] Trainable param: llma.layers.54.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926123] Trainable param: llma.layers.54.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.926134] Trainable param: llma.layers.54.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926142] Trainable param: llma.layers.54.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926152] Trainable param: llma.layers.54.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926161] Trainable param: llma.layers.54.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: 
torch.float32 +[16:54:32.926171] Trainable param: llma.layers.54.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926180] Trainable param: llma.layers.54.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926191] Trainable param: llma.layers.54.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926200] Trainable param: llma.layers.54.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926210] Trainable param: llma.layers.54.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926219] Trainable param: llma.layers.54.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926229] Trainable param: llma.layers.54.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926238] Trainable param: llma.layers.54.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926248] Trainable param: llma.layers.54.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926258] Trainable param: llma.layers.54.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926270] Trainable param: llma.layers.55.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926279] Trainable param: llma.layers.55.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.926289] Trainable param: llma.layers.55.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926298] Trainable param: llma.layers.55.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926308] Trainable param: llma.layers.55.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926317] Trainable param: llma.layers.55.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926328] Trainable param: llma.layers.55.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926338] Trainable param: llma.layers.55.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926349] Trainable param: llma.layers.55.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926358] Trainable param: llma.layers.55.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926368] Trainable param: llma.layers.55.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926377] Trainable param: llma.layers.55.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926387] Trainable param: llma.layers.55.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926396] Trainable param: 
llma.layers.55.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926406] Trainable param: llma.layers.55.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926417] Trainable param: llma.layers.55.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926430] Trainable param: llma.layers.56.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926438] Trainable param: llma.layers.56.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.926448] Trainable param: llma.layers.56.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926457] Trainable param: llma.layers.56.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926467] Trainable param: llma.layers.56.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926476] Trainable param: llma.layers.56.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926486] Trainable param: llma.layers.56.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926494] Trainable param: llma.layers.56.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926505] Trainable param: llma.layers.56.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926514] Trainable param: llma.layers.56.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926524] Trainable param: llma.layers.56.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926533] Trainable param: llma.layers.56.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926543] Trainable param: llma.layers.56.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926552] Trainable param: llma.layers.56.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926561] Trainable param: llma.layers.56.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926571] Trainable param: llma.layers.56.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926583] Trainable param: llma.layers.57.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926592] Trainable param: llma.layers.57.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.926602] Trainable param: llma.layers.57.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926611] Trainable param: llma.layers.57.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926621] Trainable param: llma.layers.57.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, 
dtype: torch.bfloat16 +[16:54:32.926630] Trainable param: llma.layers.57.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926640] Trainable param: llma.layers.57.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926649] Trainable param: llma.layers.57.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926660] Trainable param: llma.layers.57.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926669] Trainable param: llma.layers.57.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926679] Trainable param: llma.layers.57.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926687] Trainable param: llma.layers.57.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926697] Trainable param: llma.layers.57.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926706] Trainable param: llma.layers.57.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926716] Trainable param: llma.layers.57.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926726] Trainable param: llma.layers.57.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926738] Trainable param: llma.layers.58.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926746] Trainable param: llma.layers.58.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.926756] Trainable param: llma.layers.58.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926765] Trainable param: llma.layers.58.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926775] Trainable param: llma.layers.58.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926783] Trainable param: llma.layers.58.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926793] Trainable param: llma.layers.58.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926802] Trainable param: llma.layers.58.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926813] Trainable param: llma.layers.58.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926822] Trainable param: llma.layers.58.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926832] Trainable param: llma.layers.58.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926841] Trainable param: llma.layers.58.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926852] Trainable param: 
llma.layers.58.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926861] Trainable param: llma.layers.58.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926871] Trainable param: llma.layers.58.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926880] Trainable param: llma.layers.58.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926892] Trainable param: llma.layers.59.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926901] Trainable param: llma.layers.59.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.926911] Trainable param: llma.layers.59.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926920] Trainable param: llma.layers.59.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926930] Trainable param: llma.layers.59.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926939] Trainable param: llma.layers.59.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.926949] Trainable param: llma.layers.59.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926958] Trainable param: llma.layers.59.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.926969] Trainable param: llma.layers.59.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926978] Trainable param: llma.layers.59.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.926988] Trainable param: llma.layers.59.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.926997] Trainable param: llma.layers.59.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927007] Trainable param: llma.layers.59.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927016] Trainable param: llma.layers.59.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927025] Trainable param: llma.layers.59.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927035] Trainable param: llma.layers.59.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927048] Trainable param: llma.layers.60.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927056] Trainable param: llma.layers.60.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.927066] Trainable param: llma.layers.60.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927075] Trainable param: llma.layers.60.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, 
dtype: torch.float32 +[16:54:32.927085] Trainable param: llma.layers.60.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927094] Trainable param: llma.layers.60.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927104] Trainable param: llma.layers.60.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927114] Trainable param: llma.layers.60.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927125] Trainable param: llma.layers.60.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927133] Trainable param: llma.layers.60.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927143] Trainable param: llma.layers.60.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927152] Trainable param: llma.layers.60.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927162] Trainable param: llma.layers.60.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927171] Trainable param: llma.layers.60.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927181] Trainable param: llma.layers.60.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927190] Trainable param: llma.layers.60.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927202] Trainable param: llma.layers.61.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927211] Trainable param: llma.layers.61.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.927221] Trainable param: llma.layers.61.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927229] Trainable param: llma.layers.61.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927240] Trainable param: llma.layers.61.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927249] Trainable param: llma.layers.61.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927259] Trainable param: llma.layers.61.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927268] Trainable param: llma.layers.61.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927279] Trainable param: llma.layers.61.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927288] Trainable param: llma.layers.61.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927298] Trainable param: llma.layers.61.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927306] Trainable param: 
llma.layers.61.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927317] Trainable param: llma.layers.61.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927325] Trainable param: llma.layers.61.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927335] Trainable param: llma.layers.61.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927345] Trainable param: llma.layers.61.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927357] Trainable param: llma.layers.62.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927365] Trainable param: llma.layers.62.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.927376] Trainable param: llma.layers.62.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927385] Trainable param: llma.layers.62.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927396] Trainable param: llma.layers.62.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927405] Trainable param: llma.layers.62.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927415] Trainable param: llma.layers.62.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927424] Trainable param: llma.layers.62.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927435] Trainable param: llma.layers.62.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927444] Trainable param: llma.layers.62.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927454] Trainable param: llma.layers.62.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927463] Trainable param: llma.layers.62.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927473] Trainable param: llma.layers.62.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927482] Trainable param: llma.layers.62.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927492] Trainable param: llma.layers.62.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927501] Trainable param: llma.layers.62.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927513] Trainable param: llma.layers.63.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927522] Trainable param: llma.layers.63.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.927532] Trainable param: llma.layers.63.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: 
True, dtype: torch.bfloat16 +[16:54:32.927541] Trainable param: llma.layers.63.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927551] Trainable param: llma.layers.63.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927560] Trainable param: llma.layers.63.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927571] Trainable param: llma.layers.63.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927580] Trainable param: llma.layers.63.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927592] Trainable param: llma.layers.63.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927601] Trainable param: llma.layers.63.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927611] Trainable param: llma.layers.63.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927620] Trainable param: llma.layers.63.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927630] Trainable param: llma.layers.63.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927639] Trainable param: llma.layers.63.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927649] Trainable param: llma.layers.63.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927659] Trainable param: llma.layers.63.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927671] Trainable param: llma.layers.64.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927680] Trainable param: llma.layers.64.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.927690] Trainable param: llma.layers.64.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927699] Trainable param: llma.layers.64.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927709] Trainable param: llma.layers.64.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927717] Trainable param: llma.layers.64.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927727] Trainable param: llma.layers.64.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927736] Trainable param: llma.layers.64.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927747] Trainable param: llma.layers.64.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927756] Trainable param: llma.layers.64.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927766] Trainable param: 
llma.layers.64.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927775] Trainable param: llma.layers.64.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927785] Trainable param: llma.layers.64.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927793] Trainable param: llma.layers.64.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927803] Trainable param: llma.layers.64.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927813] Trainable param: llma.layers.64.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927825] Trainable param: llma.layers.65.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927834] Trainable param: llma.layers.65.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.927845] Trainable param: llma.layers.65.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927853] Trainable param: llma.layers.65.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927864] Trainable param: llma.layers.65.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927873] Trainable param: llma.layers.65.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.927883] Trainable param: llma.layers.65.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927891] Trainable param: llma.layers.65.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927903] Trainable param: llma.layers.65.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927912] Trainable param: llma.layers.65.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927922] Trainable param: llma.layers.65.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927931] Trainable param: llma.layers.65.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927941] Trainable param: llma.layers.65.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927950] Trainable param: llma.layers.65.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.927960] Trainable param: llma.layers.65.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927969] Trainable param: llma.layers.65.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.927982] Trainable param: llma.layers.66.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.927991] Trainable param: llma.layers.66.attention.wq.bias, local_size: torch.Size([8192]), 
model_parallel: True, dtype: torch.float32 +[16:54:32.928001] Trainable param: llma.layers.66.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928010] Trainable param: llma.layers.66.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928020] Trainable param: llma.layers.66.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928028] Trainable param: llma.layers.66.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928039] Trainable param: llma.layers.66.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928047] Trainable param: llma.layers.66.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928058] Trainable param: llma.layers.66.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928067] Trainable param: llma.layers.66.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928077] Trainable param: llma.layers.66.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928086] Trainable param: llma.layers.66.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928096] Trainable param: llma.layers.66.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928105] Trainable param: llma.layers.66.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928115] Trainable param: llma.layers.66.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928124] Trainable param: llma.layers.66.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928136] Trainable param: llma.layers.67.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928145] Trainable param: llma.layers.67.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.928155] Trainable param: llma.layers.67.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928165] Trainable param: llma.layers.67.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928175] Trainable param: llma.layers.67.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928184] Trainable param: llma.layers.67.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928194] Trainable param: llma.layers.67.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928203] Trainable param: llma.layers.67.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928214] Trainable param: llma.layers.67.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928223] Trainable param: 
llma.layers.67.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928233] Trainable param: llma.layers.67.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928242] Trainable param: llma.layers.67.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928252] Trainable param: llma.layers.67.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928260] Trainable param: llma.layers.67.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928270] Trainable param: llma.layers.67.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928280] Trainable param: llma.layers.67.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928292] Trainable param: llma.layers.68.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928301] Trainable param: llma.layers.68.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.928312] Trainable param: llma.layers.68.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928320] Trainable param: llma.layers.68.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928331] Trainable param: llma.layers.68.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928340] Trainable param: llma.layers.68.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928349] Trainable param: llma.layers.68.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928358] Trainable param: llma.layers.68.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928369] Trainable param: llma.layers.68.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928378] Trainable param: llma.layers.68.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928388] Trainable param: llma.layers.68.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928397] Trainable param: llma.layers.68.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928407] Trainable param: llma.layers.68.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928415] Trainable param: llma.layers.68.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928426] Trainable param: llma.layers.68.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928436] Trainable param: llma.layers.68.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928448] Trainable param: llma.layers.69.attention.wq.weight, local_size: torch.Size([8192, 8192]), 
model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928457] Trainable param: llma.layers.69.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.928467] Trainable param: llma.layers.69.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928476] Trainable param: llma.layers.69.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928486] Trainable param: llma.layers.69.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928495] Trainable param: llma.layers.69.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928505] Trainable param: llma.layers.69.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928514] Trainable param: llma.layers.69.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928525] Trainable param: llma.layers.69.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928534] Trainable param: llma.layers.69.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928543] Trainable param: llma.layers.69.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928552] Trainable param: llma.layers.69.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928562] Trainable param: llma.layers.69.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928571] Trainable param: llma.layers.69.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928581] Trainable param: llma.layers.69.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928591] Trainable param: llma.layers.69.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928603] Trainable param: llma.layers.70.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928612] Trainable param: llma.layers.70.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.928622] Trainable param: llma.layers.70.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928631] Trainable param: llma.layers.70.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928641] Trainable param: llma.layers.70.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928650] Trainable param: llma.layers.70.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928660] Trainable param: llma.layers.70.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928669] Trainable param: llma.layers.70.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928680] Trainable param: 
llma.layers.70.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928690] Trainable param: llma.layers.70.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928700] Trainable param: llma.layers.70.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928709] Trainable param: llma.layers.70.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928719] Trainable param: llma.layers.70.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928728] Trainable param: llma.layers.70.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928738] Trainable param: llma.layers.70.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928747] Trainable param: llma.layers.70.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928759] Trainable param: llma.layers.71.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928768] Trainable param: llma.layers.71.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.928778] Trainable param: llma.layers.71.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928787] Trainable param: llma.layers.71.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928797] Trainable param: llma.layers.71.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928806] Trainable param: llma.layers.71.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928816] Trainable param: llma.layers.71.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928825] Trainable param: llma.layers.71.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928835] Trainable param: llma.layers.71.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928844] Trainable param: llma.layers.71.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928854] Trainable param: llma.layers.71.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928863] Trainable param: llma.layers.71.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928873] Trainable param: llma.layers.71.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928882] Trainable param: llma.layers.71.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.928892] Trainable param: llma.layers.71.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928902] Trainable param: llma.layers.71.ffn_norm.weight, local_size: torch.Size([8192]), 
model_parallel: False, dtype: torch.float32 +[16:54:32.928913] Trainable param: llma.layers.72.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928922] Trainable param: llma.layers.72.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.928932] Trainable param: llma.layers.72.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928941] Trainable param: llma.layers.72.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928953] Trainable param: llma.layers.72.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928962] Trainable param: llma.layers.72.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.928972] Trainable param: llma.layers.72.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.928981] Trainable param: llma.layers.72.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.928992] Trainable param: llma.layers.72.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929001] Trainable param: llma.layers.72.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929012] Trainable param: llma.layers.72.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929020] Trainable param: llma.layers.72.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929031] Trainable param: llma.layers.72.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929040] Trainable param: llma.layers.72.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929050] Trainable param: llma.layers.72.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929060] Trainable param: llma.layers.72.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929072] Trainable param: llma.layers.73.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929081] Trainable param: llma.layers.73.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.929091] Trainable param: llma.layers.73.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929100] Trainable param: llma.layers.73.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929110] Trainable param: llma.layers.73.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929119] Trainable param: llma.layers.73.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929130] Trainable param: llma.layers.73.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929139] Trainable param: 
llma.layers.73.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929150] Trainable param: llma.layers.73.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929159] Trainable param: llma.layers.73.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929169] Trainable param: llma.layers.73.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929178] Trainable param: llma.layers.73.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929188] Trainable param: llma.layers.73.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929196] Trainable param: llma.layers.73.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929206] Trainable param: llma.layers.73.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929216] Trainable param: llma.layers.73.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929228] Trainable param: llma.layers.74.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929237] Trainable param: llma.layers.74.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.929247] Trainable param: llma.layers.74.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929256] Trainable param: llma.layers.74.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929266] Trainable param: llma.layers.74.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929275] Trainable param: llma.layers.74.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929285] Trainable param: llma.layers.74.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929293] Trainable param: llma.layers.74.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929304] Trainable param: llma.layers.74.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929313] Trainable param: llma.layers.74.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929323] Trainable param: llma.layers.74.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929332] Trainable param: llma.layers.74.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929342] Trainable param: llma.layers.74.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929351] Trainable param: llma.layers.74.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929361] Trainable param: llma.layers.74.attention_norm.weight, local_size: torch.Size([8192]), 
model_parallel: False, dtype: torch.float32 +[16:54:32.929371] Trainable param: llma.layers.74.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929383] Trainable param: llma.layers.75.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929392] Trainable param: llma.layers.75.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.929402] Trainable param: llma.layers.75.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929411] Trainable param: llma.layers.75.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929422] Trainable param: llma.layers.75.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929430] Trainable param: llma.layers.75.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929440] Trainable param: llma.layers.75.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929449] Trainable param: llma.layers.75.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929460] Trainable param: llma.layers.75.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929469] Trainable param: llma.layers.75.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929480] Trainable param: llma.layers.75.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929489] Trainable param: llma.layers.75.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929499] Trainable param: llma.layers.75.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929508] Trainable param: llma.layers.75.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929523] Trainable param: llma.layers.75.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929533] Trainable param: llma.layers.75.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929545] Trainable param: llma.layers.76.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929554] Trainable param: llma.layers.76.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.929565] Trainable param: llma.layers.76.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929573] Trainable param: llma.layers.76.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929584] Trainable param: llma.layers.76.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929592] Trainable param: llma.layers.76.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929603] Trainable param: 
llma.layers.76.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929611] Trainable param: llma.layers.76.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929623] Trainable param: llma.layers.76.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929632] Trainable param: llma.layers.76.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929642] Trainable param: llma.layers.76.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929670] Trainable param: llma.layers.76.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929680] Trainable param: llma.layers.76.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929689] Trainable param: llma.layers.76.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929699] Trainable param: llma.layers.76.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929709] Trainable param: llma.layers.76.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929721] Trainable param: llma.layers.77.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929730] Trainable param: llma.layers.77.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.929740] Trainable param: llma.layers.77.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929749] Trainable param: llma.layers.77.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929760] Trainable param: llma.layers.77.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929770] Trainable param: llma.layers.77.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929780] Trainable param: llma.layers.77.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929788] Trainable param: llma.layers.77.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929799] Trainable param: llma.layers.77.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929808] Trainable param: llma.layers.77.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929818] Trainable param: llma.layers.77.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929827] Trainable param: llma.layers.77.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929837] Trainable param: llma.layers.77.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929846] Trainable param: llma.layers.77.feed_forward.w3.bias, local_size: torch.Size([28672]), 
model_parallel: True, dtype: torch.float32 +[16:54:32.929856] Trainable param: llma.layers.77.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929866] Trainable param: llma.layers.77.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929878] Trainable param: llma.layers.78.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929887] Trainable param: llma.layers.78.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.929897] Trainable param: llma.layers.78.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929906] Trainable param: llma.layers.78.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929916] Trainable param: llma.layers.78.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929925] Trainable param: llma.layers.78.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.929934] Trainable param: llma.layers.78.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929943] Trainable param: llma.layers.78.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929954] Trainable param: llma.layers.78.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929962] Trainable param: llma.layers.78.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.929972] Trainable param: llma.layers.78.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.929981] Trainable param: llma.layers.78.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.929991] Trainable param: llma.layers.78.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930000] Trainable param: llma.layers.78.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.930010] Trainable param: llma.layers.78.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.930019] Trainable param: llma.layers.78.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.930032] Trainable param: llma.layers.79.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930041] Trainable param: llma.layers.79.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[16:54:32.930051] Trainable param: llma.layers.79.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930060] Trainable param: llma.layers.79.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.930070] Trainable param: llma.layers.79.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930079] Trainable param: 
llma.layers.79.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[16:54:32.930089] Trainable param: llma.layers.79.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930098] Trainable param: llma.layers.79.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.930109] Trainable param: llma.layers.79.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930118] Trainable param: llma.layers.79.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.930128] Trainable param: llma.layers.79.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930136] Trainable param: llma.layers.79.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.930146] Trainable param: llma.layers.79.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930155] Trainable param: llma.layers.79.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[16:54:32.930165] Trainable param: llma.layers.79.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.930175] Trainable param: llma.layers.79.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.930186] Trainable param: llma.norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[16:54:32.930199] Trainable param: llma.output.weight, local_size: torch.Size([32000, 8192]), model_parallel: True, dtype: torch.bfloat16 +[16:54:32.930222] load pretrained from ../checkpoints/llama2/Llama-2-70b/ +[16:54:32.930227] Quantizing model to 4bit! 
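The dtype pattern in the listing above is the norm/bias PEFT scheme that the run's output_dir name (alpaca_llamaPeft_normBias_QF_512_70B) and bias_tuning=True refer to: the large weight matrices stay in bfloat16 (most of them tensor-parallel sharded), while the bias vectors and RMSNorm weights are kept in float32, and only the latter account for the trainable-parameter count reported further below. A minimal sketch of that selection and of the per-parameter report format, assuming a plain nn.Module; the function names are illustrative rather than the repository's actual code, and is_model_parallel is an assumed attribute standing in for however the tensor-parallel layers tag their sharded parameters:

    import torch

    def mark_norm_bias_trainable(model: torch.nn.Module) -> None:
        # Sketch of a typical norm/bias PEFT selection (not the repo's exact
        # code): freeze everything, then re-enable gradients only for bias
        # vectors and norm weights.
        for name, param in model.named_parameters():
            param.requires_grad = name.endswith(".bias") or "norm" in name

    def report_params(model: torch.nn.Module) -> None:
        # Reproduce the per-parameter lines seen in the log above.
        # `is_model_parallel` is an assumed attribute name.
        for name, param in model.named_parameters():
            mp = getattr(param, "is_model_parallel", False)
            print(f"Trainable param: {name}, local_size: {param.size()}, "
                  f"model_parallel: {mp}, dtype: {param.dtype}")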
+Traceback (most recent call last):
+  File "/data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/main_finetune.py", line 318, in <module>
+    main(args)
+  File "/data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/main_finetune.py", line 172, in main
+    load_tensor_parallel_model(model, args.pretrained_path, args.pretrained_type)
+  File "/data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/util/tensor_parallel.py", line 177, in load_tensor_parallel_model
+    local_state_dict = _load_checkpoint_and_merge_ranks(
+  File "/data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/util/tensor_parallel.py", line 71, in _load_checkpoint_and_merge_ranks
+    shard = torch.load(ckpt_files[shard_id], map_location="cpu")
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/serialization.py", line 809, in load
+    return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/serialization.py", line 1172, in _load
+    result = unpickler.load()
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/serialization.py", line 1142, in persistent_load
+    typed_storage = load_tensor(dtype, nbytes, key, _maybe_decode_ascii(location))
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/serialization.py", line 1112, in load_tensor
+    storage = zip_file.get_storage_from_record(name, numel, torch.UntypedStorage)._typed_storage()._untyped_storage
+RuntimeError: [enforce fail at alloc_cpu.cpp:75] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 58720256 bytes. Error code 12 (Cannot allocate memory)
+ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 106890) of binary: /data/anaconda3/envs/accessory/bin/python3.10
+Traceback (most recent call last):
+  File "/data/anaconda3/envs/accessory/bin/torchrun", line 8, in <module>
+    sys.exit(main())
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
+    return f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
+    run(args)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
+    elastic_launch(
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
+    raise ChildFailedError(
+torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
+============================================================
+main_finetune.py FAILED
+------------------------------------------------------------
+Failures:
+[1]:
+  time      : 2023-08-10_16:55:46
+  host      : iZ2ze8qpzapxkhyc9k2qojZ
+  rank      : 1 (local_rank: 1)
+  exitcode  : 1 (pid: 106891)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+[2]:
+  time      : 2023-08-10_16:55:46
+  host      : iZ2ze8qpzapxkhyc9k2qojZ
+  rank      : 2 (local_rank: 2)
+  exitcode  : 1 (pid: 106892)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+[3]:
+  time      : 2023-08-10_16:55:46
+  host      : iZ2ze8qpzapxkhyc9k2qojZ
+  rank      : 3 (local_rank: 3)
+  exitcode  : 1 (pid: 106893)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+------------------------------------------------------------
+Root Cause (first observed failure):
+[0]:
+  time      : 2023-08-10_16:55:46
+  host      : iZ2ze8qpzapxkhyc9k2qojZ
+  rank      : 0 (local_rank: 0)
+  exitcode  : 1 (pid: 106890)
+  error_file: <N/A>
+  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
+============================================================
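The failure is host-side rather than GPU-side: all four workers deserialized the same consolidated checkpoint shard into CPU RAM at once inside _load_checkpoint_and_merge_ranks, and the allocator finally refused a 58,720,256-byte request (56 MiB, consistent with one 28672 x 1024 slice of a feed-forward weight in a 16-bit format) once system memory ran out. The relaunch below drops to two processes, halving the simultaneous load; another common mitigation is to let local ranks take turns deserializing, sketched here under the assumption that torchrun's LOCAL_RANK/LOCAL_WORLD_SIZE variables are set and the default process group is already initialized (on PyTorch 2.1 or newer, torch.load(..., mmap=True) can avoid materializing the whole file instead):

    import os
    import torch
    import torch.distributed as dist

    def load_cpu_staggered(path: str) -> dict:
        # Staggered checkpoint loading: local ranks deserialize one at a
        # time, so the transient allocation peak is one copy per node
        # rather than one per process.
        local_rank = int(os.environ["LOCAL_RANK"])
        local_world_size = int(os.environ["LOCAL_WORLD_SIZE"])
        state_dict = None
        for turn in range(local_world_size):
            if turn == local_rank:
                state_dict = torch.load(path, map_location="cpu")
            dist.barrier()  # wait until the current rank finishes loading
        # Note: each rank still holds its full copy afterwards; in practice
        # a rank would keep only the tensors it needs before the next turn.
        return state_dict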
+WARNING:torch.distributed.run:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+| distributed init (rank 1): env://, gpu 1
+| distributed init (rank 0): env://, gpu 0
+[17:01:32.060306] > initializing model parallel with size 1
+[17:01:32.060361] > initializing ddp with size 2
+[17:01:32.060366] > initializing pipeline with size 1
+[17:01:32.134298] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory
+[17:01:32.134363] Namespace(batch_size=4,
+accum_iter=2,
+llama_type='llama_peft',
+llama_config=['../checkpoints/llama2/Llama-2-70b/params.json'],
+no_visual=True,
+tokenizer_path='../checkpoints/llama2/Llama-2-70b/tokenizer.model',
+pretrained_path='../checkpoints/llama2/Llama-2-70b/',
+pretrained_type='meta_ori',
+weight_decay=0.02,
+lr=5e-05,
+min_lr=5e-06,
+epochs=4,
+warmup_epochs=1.0,
+clip_grad=2,
+max_words=512,
+dialog=False,
+data_config='configs/data/finetune/sg/alpaca.yaml',
+output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B',
+log_dir='./output_dir',
+save_interval=1,
+device='cuda',
+seed=0,
+resume='',
+num_workers=4,
+pin_mem=True,
+world_size=2,
+local_rank=-1,
+dist_on_itp=False,
+dist_url='env://',
+model_parallel_size=1,
+data_parallel='sdp',
+precision='bf16',
+checkpointing=True,
+quant=True,
+rank=0,
+gpu=0,
+distributed=True,
+dist_backend='nccl')
+[17:01:32.145104] Model Args:
+ ModelArgs(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, vocab_size=32000, multiple_of=4096, ffn_dim_multiplier=1.3, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True)
+WARNING:torch.distributed.run:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+| distributed init (rank 0): env://, gpu 0
+| distributed init (rank 1): env://, gpu 1
+[17:01:52.288410] > initializing model parallel with size 1
+[17:01:52.288524] > initializing ddp with size 2
+[17:01:52.288530] > initializing pipeline with size 1
+[17:01:52.344237] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory
+[17:01:52.344328] Namespace(batch_size=4,
+accum_iter=2,
+llama_type='llama_peft',
+llama_config=['../checkpoints/llama2/Llama-2-70b/params.json'],
+no_visual=True,
+tokenizer_path='../checkpoints/llama2/Llama-2-70b/tokenizer.model',
+pretrained_path='../checkpoints/llama2/Llama-2-70b/',
+pretrained_type='meta_ori',
+weight_decay=0.02,
+lr=5e-05,
+min_lr=5e-06,
+epochs=4,
+warmup_epochs=1.0,
+clip_grad=2,
+max_words=512,
+dialog=False,
+data_config='configs/data/finetune/sg/alpaca.yaml',
+output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B',
+log_dir='./output_dir',
+save_interval=1,
+device='cuda',
+seed=0,
+resume='',
+num_workers=8,
+pin_mem=True,
+world_size=2,
+local_rank=-1,
+dist_on_itp=False,
+dist_url='env://',
+model_parallel_size=1,
+data_parallel='sdp',
+precision='bf16',
+checkpointing=True,
+quant=True,
+rank=0,
+gpu=0,
+distributed=True,
+dist_backend='nccl')
+[17:01:52.354679] Model Args:
+ ModelArgs(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, vocab_size=32000, multiple_of=4096, ffn_dim_multiplier=1.3, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True)
+[17:09:51.764818] Model is Peft: True
+[17:09:51.772293] Trainable parameter count : 8036352 (local rank), 8036352 (all).
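The ModelArgs dump explains the shapes that repeat through the parameter listing, and the trainable-parameter count just printed is exactly the biases plus norm weights of such a model. The arithmetic below is illustrative and assumes the reference LLaMA sizing rule for the SwiGLU feed-forward width (two thirds of 4*dim, scaled by ffn_dim_multiplier, rounded up to a multiple of multiple_of):

    # Shape and count arithmetic implied by the ModelArgs above (illustrative).
    dim, n_layers, n_heads, n_kv_heads = 8192, 80, 64, 8
    multiple_of, ffn_dim_multiplier = 4096, 1.3

    head_dim = dim // n_heads        # 128
    kv_dim = n_kv_heads * head_dim   # 1024 -> the wk/wv local_size [1024, 8192]

    # Assumed reference LLaMA feed-forward sizing.
    hidden = int(2 * (4 * dim) / 3)                                     # 21845
    hidden = int(ffn_dim_multiplier * hidden)                           # 28398
    hidden = multiple_of * ((hidden + multiple_of - 1) // multiple_of)  # 28672

    # Per layer: seven biases (wq, wk, wv, wo, w1, w2, w3) plus two RMSNorm
    # weights; plus the final llma.norm. This reproduces the reported count.
    per_layer = (dim + kv_dim + kv_dim + dim + hidden + dim + hidden) + 2 * dim
    assert n_layers * per_layer + dim == 8_036_352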
+[17:09:51.796307] Trainable param: llma.tok_embeddings.weight, local_size: torch.Size([32000, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796346] Trainable param: llma.layers.0.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796359] Trainable param: llma.layers.0.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.796372] Trainable param: llma.layers.0.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796383] Trainable param: llma.layers.0.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796395] Trainable param: llma.layers.0.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796405] Trainable param: llma.layers.0.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796416] Trainable param: llma.layers.0.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796427] Trainable param: llma.layers.0.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796440] Trainable param: llma.layers.0.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796450] Trainable param: llma.layers.0.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.796461] Trainable param: llma.layers.0.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796471] Trainable param: llma.layers.0.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796482] Trainable param: llma.layers.0.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796492] Trainable param: llma.layers.0.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.796503] Trainable param: llma.layers.0.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796514] Trainable param: llma.layers.0.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796528] Trainable param: llma.layers.1.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796539] Trainable param: llma.layers.1.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.796550] Trainable param: llma.layers.1.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796560] Trainable param: llma.layers.1.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796572] Trainable param: llma.layers.1.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796581] Trainable param: llma.layers.1.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796592] Trainable param: llma.layers.1.attention.wo.weight, local_size: torch.Size([8192, 8192]), 
model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796602] Trainable param: llma.layers.1.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796614] Trainable param: llma.layers.1.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796624] Trainable param: llma.layers.1.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.796635] Trainable param: llma.layers.1.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796645] Trainable param: llma.layers.1.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796656] Trainable param: llma.layers.1.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796666] Trainable param: llma.layers.1.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.796677] Trainable param: llma.layers.1.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796688] Trainable param: llma.layers.1.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796701] Trainable param: llma.layers.2.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796711] Trainable param: llma.layers.2.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.796722] Trainable param: llma.layers.2.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796732] Trainable param: llma.layers.2.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796743] Trainable param: llma.layers.2.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796753] Trainable param: llma.layers.2.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796764] Trainable param: llma.layers.2.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796774] Trainable param: llma.layers.2.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796785] Trainable param: llma.layers.2.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796795] Trainable param: llma.layers.2.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.796806] Trainable param: llma.layers.2.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796816] Trainable param: llma.layers.2.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796827] Trainable param: llma.layers.2.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796837] Trainable param: llma.layers.2.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.796848] Trainable param: 
llma.layers.2.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796858] Trainable param: llma.layers.2.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796871] Trainable param: llma.layers.3.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796881] Trainable param: llma.layers.3.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.796892] Trainable param: llma.layers.3.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796902] Trainable param: llma.layers.3.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796913] Trainable param: llma.layers.3.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796923] Trainable param: llma.layers.3.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.796934] Trainable param: llma.layers.3.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796944] Trainable param: llma.layers.3.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796956] Trainable param: llma.layers.3.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796966] Trainable param: llma.layers.3.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.796977] Trainable param: llma.layers.3.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.796986] Trainable param: llma.layers.3.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.796997] Trainable param: llma.layers.3.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797008] Trainable param: llma.layers.3.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.797019] Trainable param: llma.layers.3.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797029] Trainable param: llma.layers.3.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797042] Trainable param: llma.layers.4.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797052] Trainable param: llma.layers.4.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.797064] Trainable param: llma.layers.4.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797073] Trainable param: llma.layers.4.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.797085] Trainable param: llma.layers.4.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797094] Trainable param: llma.layers.4.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 
+[17:09:51.797105] Trainable param: llma.layers.4.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797115] Trainable param: llma.layers.4.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797127] Trainable param: llma.layers.4.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797137] Trainable param: llma.layers.4.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.797148] Trainable param: llma.layers.4.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797159] Trainable param: llma.layers.4.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797170] Trainable param: llma.layers.4.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797180] Trainable param: llma.layers.4.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.797191] Trainable param: llma.layers.4.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797202] Trainable param: llma.layers.4.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797215] Trainable param: llma.layers.5.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797224] Trainable param: llma.layers.5.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.797235] Trainable param: llma.layers.5.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797245] Trainable param: llma.layers.5.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.797256] Trainable param: llma.layers.5.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797266] Trainable param: llma.layers.5.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.797277] Trainable param: llma.layers.5.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797287] Trainable param: llma.layers.5.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797299] Trainable param: llma.layers.5.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797308] Trainable param: llma.layers.5.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.797320] Trainable param: llma.layers.5.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797329] Trainable param: llma.layers.5.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797340] Trainable param: llma.layers.5.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797350] Trainable param: llma.layers.5.feed_forward.w3.bias, local_size: 
torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.797361] Trainable param: llma.layers.5.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797373] Trainable param: llma.layers.5.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797386] Trainable param: llma.layers.6.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797396] Trainable param: llma.layers.6.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.797408] Trainable param: llma.layers.6.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797417] Trainable param: llma.layers.6.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.797428] Trainable param: llma.layers.6.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797438] Trainable param: llma.layers.6.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.797449] Trainable param: llma.layers.6.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797459] Trainable param: llma.layers.6.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797471] Trainable param: llma.layers.6.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797481] Trainable param: llma.layers.6.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.797492] Trainable param: llma.layers.6.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797502] Trainable param: llma.layers.6.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797517] Trainable param: llma.layers.6.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797527] Trainable param: llma.layers.6.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.797538] Trainable param: llma.layers.6.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797551] Trainable param: llma.layers.6.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.797564] Trainable param: llma.layers.7.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797575] Trainable param: llma.layers.7.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.797586] Trainable param: llma.layers.7.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797596] Trainable param: llma.layers.7.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.797607] Trainable param: llma.layers.7.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.797617] Trainable param: 
llma.layers.7.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.797629] Trainable param: llma.layers.7.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797639] Trainable param: llma.layers.7.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797651] Trainable param: llma.layers.7.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797661] Trainable param: llma.layers.7.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.797672] Trainable param: llma.layers.7.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797682] Trainable param: llma.layers.7.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797694] Trainable param: llma.layers.7.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797703] Trainable param: llma.layers.7.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.797714] Trainable param: llma.layers.7.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797725] Trainable param: llma.layers.7.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797738] Trainable param: llma.layers.8.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797748] Trainable param: llma.layers.8.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.797759] Trainable param: llma.layers.8.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797769] Trainable param: llma.layers.8.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.797780] Trainable param: llma.layers.8.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797790] Trainable param: llma.layers.8.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.797801] Trainable param: llma.layers.8.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797810] Trainable param: llma.layers.8.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797822] Trainable param: llma.layers.8.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797832] Trainable param: llma.layers.8.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.797843] Trainable param: llma.layers.8.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797853] Trainable param: llma.layers.8.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797864] Trainable param: llma.layers.8.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797874] Trainable param: llma.layers.8.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.797885] Trainable param: llma.layers.8.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797895] Trainable param: llma.layers.8.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797908] Trainable param: llma.layers.9.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797918] Trainable param: llma.layers.9.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.797929] Trainable param: llma.layers.9.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797939] Trainable param: llma.layers.9.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.797950] Trainable param: llma.layers.9.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797960] Trainable param: llma.layers.9.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.797971] Trainable param: llma.layers.9.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.797981] Trainable param: llma.layers.9.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.797993] Trainable param: llma.layers.9.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798002] Trainable param: llma.layers.9.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798013] Trainable param: llma.layers.9.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798023] Trainable param: llma.layers.9.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798034] Trainable param: llma.layers.9.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798044] Trainable param: llma.layers.9.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798055] Trainable param: llma.layers.9.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798066] Trainable param: llma.layers.9.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798079] Trainable param: llma.layers.10.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798089] Trainable param: llma.layers.10.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.798100] Trainable param: llma.layers.10.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798109] Trainable param: llma.layers.10.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798120] Trainable param: llma.layers.10.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798130] Trainable param: llma.layers.10.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798141] Trainable param: llma.layers.10.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798151] Trainable param: llma.layers.10.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798163] Trainable param: llma.layers.10.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798173] Trainable param: llma.layers.10.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798184] Trainable param: llma.layers.10.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798193] Trainable param: llma.layers.10.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798204] Trainable param: llma.layers.10.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798214] Trainable param: llma.layers.10.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798225] Trainable param: llma.layers.10.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798235] Trainable param: llma.layers.10.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798248] Trainable param: llma.layers.11.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798258] Trainable param: llma.layers.11.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.798269] Trainable param: llma.layers.11.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798278] Trainable param: llma.layers.11.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798290] Trainable param: llma.layers.11.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798299] Trainable param: llma.layers.11.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798310] Trainable param: llma.layers.11.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798320] Trainable param: llma.layers.11.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798331] Trainable param: llma.layers.11.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798341] Trainable param: llma.layers.11.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798352] Trainable param: llma.layers.11.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798362] Trainable param: llma.layers.11.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798373] Trainable param: llma.layers.11.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798383] Trainable param: llma.layers.11.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798393] Trainable param: llma.layers.11.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798404] Trainable param: llma.layers.11.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798417] Trainable param: llma.layers.12.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798427] Trainable param: llma.layers.12.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.798438] Trainable param: llma.layers.12.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798448] Trainable param: llma.layers.12.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798459] Trainable param: llma.layers.12.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798469] Trainable param: llma.layers.12.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798480] Trainable param: llma.layers.12.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798490] Trainable param: llma.layers.12.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798502] Trainable param: llma.layers.12.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798511] Trainable param: llma.layers.12.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798522] Trainable param: llma.layers.12.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798532] Trainable param: llma.layers.12.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798543] Trainable param: llma.layers.12.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798553] Trainable param: llma.layers.12.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798564] Trainable param: llma.layers.12.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798574] Trainable param: llma.layers.12.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798587] Trainable param: llma.layers.13.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798597] Trainable param: llma.layers.13.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.798608] Trainable param: llma.layers.13.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798617] Trainable param: llma.layers.13.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798629] Trainable param: llma.layers.13.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798638] Trainable param: llma.layers.13.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798649] Trainable param: llma.layers.13.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798659] Trainable param: llma.layers.13.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798671] Trainable param: llma.layers.13.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798681] Trainable param: llma.layers.13.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798692] Trainable param: llma.layers.13.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798701] Trainable param: llma.layers.13.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798712] Trainable param: llma.layers.13.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798722] Trainable param: llma.layers.13.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798733] Trainable param: llma.layers.13.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798743] Trainable param: llma.layers.13.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798756] Trainable param: llma.layers.14.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798766] Trainable param: llma.layers.14.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.798777] Trainable param: llma.layers.14.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798787] Trainable param: llma.layers.14.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798798] Trainable param: llma.layers.14.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798808] Trainable param: llma.layers.14.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798819] Trainable param: llma.layers.14.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798829] Trainable param: llma.layers.14.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798841] Trainable param: llma.layers.14.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798851] Trainable param: llma.layers.14.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798862] Trainable param: llma.layers.14.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798871] Trainable param: llma.layers.14.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798882] Trainable param: llma.layers.14.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798892] Trainable param: llma.layers.14.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.798903] Trainable param: llma.layers.14.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798914] Trainable param: llma.layers.14.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.798926] Trainable param: llma.layers.15.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798936] Trainable param: llma.layers.15.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.798948] Trainable param: llma.layers.15.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798957] Trainable param: llma.layers.15.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798968] Trainable param: llma.layers.15.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798978] Trainable param: llma.layers.15.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.798989] Trainable param: llma.layers.15.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.798999] Trainable param: llma.layers.15.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799011] Trainable param: llma.layers.15.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799020] Trainable param: llma.layers.15.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799031] Trainable param: llma.layers.15.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799041] Trainable param: llma.layers.15.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799052] Trainable param: llma.layers.15.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799062] Trainable param: llma.layers.15.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799073] Trainable param: llma.layers.15.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799083] Trainable param: llma.layers.15.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799096] Trainable param: llma.layers.16.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799106] Trainable param: llma.layers.16.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.799117] Trainable param: llma.layers.16.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799127] Trainable param: llma.layers.16.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799138] Trainable param: llma.layers.16.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799148] Trainable param: llma.layers.16.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799158] Trainable param: llma.layers.16.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799168] Trainable param: llma.layers.16.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799180] Trainable param: llma.layers.16.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799190] Trainable param: llma.layers.16.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799201] Trainable param: llma.layers.16.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799211] Trainable param: llma.layers.16.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799222] Trainable param: llma.layers.16.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799231] Trainable param: llma.layers.16.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799242] Trainable param: llma.layers.16.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799253] Trainable param: llma.layers.16.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799266] Trainable param: llma.layers.17.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799276] Trainable param: llma.layers.17.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.799287] Trainable param: llma.layers.17.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799296] Trainable param: llma.layers.17.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799308] Trainable param: llma.layers.17.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799318] Trainable param: llma.layers.17.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799329] Trainable param: llma.layers.17.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799338] Trainable param: llma.layers.17.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799350] Trainable param: llma.layers.17.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799360] Trainable param: llma.layers.17.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799371] Trainable param: llma.layers.17.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799380] Trainable param: llma.layers.17.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799391] Trainable param: llma.layers.17.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799401] Trainable param: llma.layers.17.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799412] Trainable param: llma.layers.17.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799422] Trainable param: llma.layers.17.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799436] Trainable param: llma.layers.18.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799446] Trainable param: llma.layers.18.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.799457] Trainable param: llma.layers.18.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799466] Trainable param: llma.layers.18.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799478] Trainable param: llma.layers.18.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799487] Trainable param: llma.layers.18.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799498] Trainable param: llma.layers.18.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799508] Trainable param: llma.layers.18.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799520] Trainable param: llma.layers.18.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799530] Trainable param: llma.layers.18.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799541] Trainable param: llma.layers.18.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799550] Trainable param: llma.layers.18.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799561] Trainable param: llma.layers.18.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799571] Trainable param: llma.layers.18.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799582] Trainable param: llma.layers.18.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799593] Trainable param: llma.layers.18.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799605] Trainable param: llma.layers.19.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799619] Trainable param: llma.layers.19.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.799631] Trainable param: llma.layers.19.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799640] Trainable param: llma.layers.19.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799652] Trainable param: llma.layers.19.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799661] Trainable param: llma.layers.19.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799672] Trainable param: llma.layers.19.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799682] Trainable param: llma.layers.19.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799694] Trainable param: llma.layers.19.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799703] Trainable param: llma.layers.19.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799715] Trainable param: llma.layers.19.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799724] Trainable param: llma.layers.19.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799735] Trainable param: llma.layers.19.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799745] Trainable param: llma.layers.19.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799756] Trainable param: llma.layers.19.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799766] Trainable param: llma.layers.19.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799779] Trainable param: llma.layers.20.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799789] Trainable param: llma.layers.20.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.799800] Trainable param: llma.layers.20.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799810] Trainable param: llma.layers.20.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799821] Trainable param: llma.layers.20.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799831] Trainable param: llma.layers.20.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799842] Trainable param: llma.layers.20.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799851] Trainable param: llma.layers.20.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799864] Trainable param: llma.layers.20.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799873] Trainable param: llma.layers.20.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799884] Trainable param: llma.layers.20.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799894] Trainable param: llma.layers.20.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799905] Trainable param: llma.layers.20.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799914] Trainable param: llma.layers.20.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.799925] Trainable param: llma.layers.20.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799936] Trainable param: llma.layers.20.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.799949] Trainable param: llma.layers.21.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799959] Trainable param: llma.layers.21.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.799970] Trainable param: llma.layers.21.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.799979] Trainable param: llma.layers.21.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.799990] Trainable param: llma.layers.21.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800000] Trainable param: llma.layers.21.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800011] Trainable param: llma.layers.21.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800021] Trainable param: llma.layers.21.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800033] Trainable param: llma.layers.21.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800043] Trainable param: llma.layers.21.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800054] Trainable param: llma.layers.21.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800063] Trainable param: llma.layers.21.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800074] Trainable param: llma.layers.21.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800084] Trainable param: llma.layers.21.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800095] Trainable param: llma.layers.21.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800106] Trainable param: llma.layers.21.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800119] Trainable param: llma.layers.22.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800128] Trainable param: llma.layers.22.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.800139] Trainable param: llma.layers.22.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800149] Trainable param: llma.layers.22.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800160] Trainable param: llma.layers.22.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800170] Trainable param: llma.layers.22.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800181] Trainable param: llma.layers.22.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800191] Trainable param: llma.layers.22.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800203] Trainable param: llma.layers.22.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800213] Trainable param: llma.layers.22.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800223] Trainable param: llma.layers.22.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800233] Trainable param: llma.layers.22.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800244] Trainable param: llma.layers.22.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800254] Trainable param: llma.layers.22.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800265] Trainable param: llma.layers.22.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800276] Trainable param: llma.layers.22.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800289] Trainable param: llma.layers.23.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800299] Trainable param: llma.layers.23.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.800310] Trainable param: llma.layers.23.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800320] Trainable param: llma.layers.23.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800330] Trainable param: llma.layers.23.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800340] Trainable param: llma.layers.23.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800351] Trainable param: llma.layers.23.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800361] Trainable param: llma.layers.23.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800373] Trainable param: llma.layers.23.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800382] Trainable param: llma.layers.23.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800393] Trainable param: llma.layers.23.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800403] Trainable param: llma.layers.23.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800414] Trainable param: llma.layers.23.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800424] Trainable param: llma.layers.23.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800435] Trainable param: llma.layers.23.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800446] Trainable param: llma.layers.23.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800459] Trainable param: llma.layers.24.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800469] Trainable param: llma.layers.24.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.800480] Trainable param: llma.layers.24.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800489] Trainable param: llma.layers.24.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800500] Trainable param: llma.layers.24.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800510] Trainable param: llma.layers.24.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800521] Trainable param: llma.layers.24.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800531] Trainable param: llma.layers.24.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800543] Trainable param: llma.layers.24.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800553] Trainable param: llma.layers.24.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800564] Trainable param: llma.layers.24.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800574] Trainable param: llma.layers.24.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800585] Trainable param: llma.layers.24.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800594] Trainable param: llma.layers.24.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800605] Trainable param: llma.layers.24.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800616] Trainable param: llma.layers.24.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800631] Trainable param: llma.layers.25.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800641] Trainable param: llma.layers.25.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.800652] Trainable param: llma.layers.25.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800662] Trainable param: llma.layers.25.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800673] Trainable param: llma.layers.25.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800683] Trainable param: llma.layers.25.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800694] Trainable param: llma.layers.25.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800704] Trainable param: llma.layers.25.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800715] Trainable param: llma.layers.25.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800725] Trainable param: llma.layers.25.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800736] Trainable param: llma.layers.25.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800746] Trainable param: llma.layers.25.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800757] Trainable param: llma.layers.25.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800766] Trainable param: llma.layers.25.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800777] Trainable param: llma.layers.25.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800788] Trainable param: llma.layers.25.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800801] Trainable param: llma.layers.26.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800810] Trainable param: llma.layers.26.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.800822] Trainable param: llma.layers.26.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800831] Trainable param: llma.layers.26.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800842] Trainable param: llma.layers.26.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800852] Trainable param: llma.layers.26.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.800863] Trainable param: llma.layers.26.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800872] Trainable param: llma.layers.26.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800885] Trainable param: llma.layers.26.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800894] Trainable param: llma.layers.26.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800905] Trainable param: llma.layers.26.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800915] Trainable param: llma.layers.26.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800926] Trainable param: llma.layers.26.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800936] Trainable param: llma.layers.26.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.800946] Trainable param: llma.layers.26.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800957] Trainable param: llma.layers.26.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.800970] Trainable param: llma.layers.27.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.800980] Trainable param: llma.layers.27.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.800991] Trainable param: llma.layers.27.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801001] Trainable param: llma.layers.27.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801012] Trainable param: llma.layers.27.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801021] Trainable param: llma.layers.27.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801032] Trainable param: llma.layers.27.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801042] Trainable param: llma.layers.27.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801054] Trainable param: llma.layers.27.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801063] Trainable param: llma.layers.27.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801074] Trainable param: llma.layers.27.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801084] Trainable param: llma.layers.27.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801095] Trainable param: llma.layers.27.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801105] Trainable param: llma.layers.27.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801115] Trainable param: llma.layers.27.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801126] Trainable param: llma.layers.27.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801139] Trainable param: llma.layers.28.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801149] Trainable param: llma.layers.28.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.801160] Trainable param: llma.layers.28.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801170] Trainable param: llma.layers.28.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801181] Trainable param: llma.layers.28.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801190] Trainable param: llma.layers.28.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801202] Trainable param: llma.layers.28.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801211] Trainable param: llma.layers.28.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801223] Trainable param: llma.layers.28.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801233] Trainable param: llma.layers.28.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801244] Trainable param: llma.layers.28.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801253] Trainable param: llma.layers.28.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801264] Trainable param: llma.layers.28.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801274] Trainable param: llma.layers.28.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801285] Trainable param: llma.layers.28.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801296] Trainable param: llma.layers.28.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801308] Trainable param: llma.layers.29.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801318] Trainable param: llma.layers.29.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.801329] Trainable param: llma.layers.29.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801339] Trainable param: llma.layers.29.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801349] Trainable param: llma.layers.29.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801359] Trainable param: llma.layers.29.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801370] Trainable param: llma.layers.29.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801380] Trainable param: llma.layers.29.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801391] Trainable param: llma.layers.29.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801401] Trainable param: llma.layers.29.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801412] Trainable param: llma.layers.29.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801422] Trainable param: llma.layers.29.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801433] Trainable param: llma.layers.29.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801442] Trainable param: llma.layers.29.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801453] Trainable param: llma.layers.29.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801464] Trainable param: llma.layers.29.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801476] Trainable param: llma.layers.30.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801486] Trainable param: llma.layers.30.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.801497] Trainable param: llma.layers.30.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801507] Trainable param: llma.layers.30.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801521] Trainable param: llma.layers.30.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801531] Trainable param: llma.layers.30.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801542] Trainable param: llma.layers.30.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801552] Trainable param: llma.layers.30.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801565] Trainable param: llma.layers.30.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801575] Trainable param: llma.layers.30.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801586] Trainable param: llma.layers.30.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801596] Trainable param: llma.layers.30.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801607] Trainable param: llma.layers.30.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801617] Trainable param: llma.layers.30.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801628] Trainable param: llma.layers.30.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801639] Trainable param: llma.layers.30.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801652] Trainable param: llma.layers.31.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801662] Trainable param: llma.layers.31.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.801673] Trainable param: llma.layers.31.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801683] Trainable param: llma.layers.31.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801694] Trainable param: llma.layers.31.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801704] Trainable param: llma.layers.31.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801715] Trainable param: llma.layers.31.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801725] Trainable param: llma.layers.31.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801737] Trainable param: llma.layers.31.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801747] Trainable param: llma.layers.31.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801758] Trainable param: llma.layers.31.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801767] Trainable param: llma.layers.31.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801778] Trainable param: llma.layers.31.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801788] Trainable param: llma.layers.31.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801799] Trainable param: llma.layers.31.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801809] Trainable param: llma.layers.31.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801822] Trainable param: llma.layers.32.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801832] Trainable param: llma.layers.32.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.801843] Trainable param: llma.layers.32.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801853] Trainable param: llma.layers.32.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801865] Trainable param: llma.layers.32.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801874] Trainable param: llma.layers.32.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.801885] Trainable param: llma.layers.32.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801895] Trainable param: llma.layers.32.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801907] Trainable param: llma.layers.32.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801917] Trainable param: llma.layers.32.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801928] Trainable param: llma.layers.32.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801938] Trainable param: llma.layers.32.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801949] Trainable param: llma.layers.32.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.801959] Trainable param: llma.layers.32.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.801969] Trainable param: llma.layers.32.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801980] Trainable param: llma.layers.32.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.801993] Trainable param: llma.layers.33.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802003] Trainable param: llma.layers.33.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.802014] Trainable param: llma.layers.33.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802024] Trainable param: llma.layers.33.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802035] Trainable param: llma.layers.33.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802044] Trainable param: llma.layers.33.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802055] Trainable param: llma.layers.33.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802065] Trainable param: llma.layers.33.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802077] Trainable param: llma.layers.33.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802087] Trainable param: llma.layers.33.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802098] Trainable param: llma.layers.33.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802108] Trainable param: llma.layers.33.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802119] Trainable param: llma.layers.33.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802128] Trainable param: llma.layers.33.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802139] Trainable param: llma.layers.33.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802150] Trainable param: llma.layers.33.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802163] Trainable param: llma.layers.34.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802173] Trainable param: llma.layers.34.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.802184] Trainable param: llma.layers.34.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802193] Trainable param: llma.layers.34.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802204] Trainable param: llma.layers.34.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802214] Trainable param: llma.layers.34.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802225] Trainable param: llma.layers.34.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802235] Trainable param: llma.layers.34.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802248] Trainable param: llma.layers.34.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802258] Trainable param: llma.layers.34.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802269] Trainable param: llma.layers.34.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802278] Trainable param: llma.layers.34.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802290] Trainable param: llma.layers.34.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802299] Trainable param: llma.layers.34.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802310] Trainable param: llma.layers.34.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802321] Trainable param: llma.layers.34.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802334] Trainable param: llma.layers.35.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802344] Trainable param: llma.layers.35.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.802355] Trainable param: llma.layers.35.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802365] Trainable param: llma.layers.35.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802376] Trainable param: llma.layers.35.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802386] Trainable param: llma.layers.35.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802397] Trainable param: llma.layers.35.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802407] Trainable param: llma.layers.35.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802418] Trainable param: llma.layers.35.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802428] Trainable param: llma.layers.35.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802439] Trainable param: llma.layers.35.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802449] Trainable param: llma.layers.35.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802460] Trainable param: llma.layers.35.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802469] Trainable param: llma.layers.35.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802480] Trainable param: llma.layers.35.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802491] Trainable param: llma.layers.35.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802504] Trainable param: llma.layers.36.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802514] Trainable param: llma.layers.36.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.802525] Trainable param: llma.layers.36.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802534] Trainable param: llma.layers.36.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802546] Trainable param: llma.layers.36.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802555] Trainable param: llma.layers.36.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802566] Trainable param: llma.layers.36.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802576] Trainable param: llma.layers.36.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802588] Trainable param: llma.layers.36.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802597] Trainable param: llma.layers.36.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802609] Trainable param: llma.layers.36.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802618] Trainable param: llma.layers.36.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802629] Trainable param: llma.layers.36.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802639] Trainable param: llma.layers.36.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802650] Trainable param: llma.layers.36.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802661] Trainable param: llma.layers.36.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802674] Trainable param: llma.layers.37.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802684] Trainable param: llma.layers.37.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.802695] Trainable param: llma.layers.37.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802705] Trainable param: llma.layers.37.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802716] Trainable param: llma.layers.37.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802726] Trainable param: llma.layers.37.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[17:09:51.802737] Trainable param: llma.layers.37.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802747] Trainable param: llma.layers.37.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802759] Trainable param: llma.layers.37.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802768] Trainable param: llma.layers.37.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802779] Trainable param: llma.layers.37.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802789] Trainable param: llma.layers.37.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802800] Trainable param: llma.layers.37.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802810] Trainable param: llma.layers.37.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[17:09:51.802821] Trainable param: llma.layers.37.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802831] Trainable param: llma.layers.37.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[17:09:51.802844] Trainable param: llma.layers.38.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802854] Trainable param: llma.layers.38.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[17:09:51.802865] Trainable param: llma.layers.38.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[17:09:51.802875] Trainable param: 
llma.layers.38.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.802886] Trainable param: llma.layers.38.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.802896] Trainable param: llma.layers.38.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.802907] Trainable param: llma.layers.38.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.802917] Trainable param: llma.layers.38.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.802929] Trainable param: llma.layers.38.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.802938] Trainable param: llma.layers.38.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.802949] Trainable param: llma.layers.38.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.802959] Trainable param: llma.layers.38.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.802970] Trainable param: llma.layers.38.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.802980] Trainable param: llma.layers.38.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.802991] Trainable param: llma.layers.38.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803001] Trainable param: llma.layers.38.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803014] Trainable param: llma.layers.39.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803024] Trainable param: llma.layers.39.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.803035] Trainable param: llma.layers.39.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803045] Trainable param: llma.layers.39.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803056] Trainable param: llma.layers.39.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803065] Trainable param: llma.layers.39.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803076] Trainable param: llma.layers.39.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803086] Trainable param: llma.layers.39.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803098] Trainable param: llma.layers.39.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803108] Trainable param: llma.layers.39.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803119] Trainable param: llma.layers.39.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), 
model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803128] Trainable param: llma.layers.39.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803140] Trainable param: llma.layers.39.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803149] Trainable param: llma.layers.39.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803160] Trainable param: llma.layers.39.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803171] Trainable param: llma.layers.39.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803183] Trainable param: llma.layers.40.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803193] Trainable param: llma.layers.40.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.803204] Trainable param: llma.layers.40.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803214] Trainable param: llma.layers.40.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803225] Trainable param: llma.layers.40.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803235] Trainable param: llma.layers.40.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803246] Trainable param: llma.layers.40.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803255] Trainable param: llma.layers.40.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803267] Trainable param: llma.layers.40.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803277] Trainable param: llma.layers.40.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803288] Trainable param: llma.layers.40.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803298] Trainable param: llma.layers.40.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803309] Trainable param: llma.layers.40.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803319] Trainable param: llma.layers.40.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803330] Trainable param: llma.layers.40.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803340] Trainable param: llma.layers.40.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803353] Trainable param: llma.layers.41.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803363] Trainable param: llma.layers.41.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.803374] Trainable param: 
llma.layers.41.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803384] Trainable param: llma.layers.41.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803395] Trainable param: llma.layers.41.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803404] Trainable param: llma.layers.41.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803415] Trainable param: llma.layers.41.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803425] Trainable param: llma.layers.41.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803437] Trainable param: llma.layers.41.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803446] Trainable param: llma.layers.41.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803457] Trainable param: llma.layers.41.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803467] Trainable param: llma.layers.41.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803478] Trainable param: llma.layers.41.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803488] Trainable param: llma.layers.41.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803499] Trainable param: llma.layers.41.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803510] Trainable param: llma.layers.41.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803523] Trainable param: llma.layers.42.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803533] Trainable param: llma.layers.42.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.803544] Trainable param: llma.layers.42.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803553] Trainable param: llma.layers.42.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803565] Trainable param: llma.layers.42.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803574] Trainable param: llma.layers.42.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803585] Trainable param: llma.layers.42.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803595] Trainable param: llma.layers.42.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803607] Trainable param: llma.layers.42.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803617] Trainable param: llma.layers.42.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: 
True, dtype: torch.float32 +[17:09:51.803628] Trainable param: llma.layers.42.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803638] Trainable param: llma.layers.42.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803649] Trainable param: llma.layers.42.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803659] Trainable param: llma.layers.42.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803669] Trainable param: llma.layers.42.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803680] Trainable param: llma.layers.42.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803693] Trainable param: llma.layers.43.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803703] Trainable param: llma.layers.43.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.803714] Trainable param: llma.layers.43.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803723] Trainable param: llma.layers.43.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803735] Trainable param: llma.layers.43.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803744] Trainable param: llma.layers.43.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803755] Trainable param: llma.layers.43.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803765] Trainable param: llma.layers.43.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803777] Trainable param: llma.layers.43.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803786] Trainable param: llma.layers.43.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803797] Trainable param: llma.layers.43.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803807] Trainable param: llma.layers.43.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803818] Trainable param: llma.layers.43.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803828] Trainable param: llma.layers.43.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803839] Trainable param: llma.layers.43.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803850] Trainable param: llma.layers.43.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803862] Trainable param: llma.layers.44.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803872] Trainable param: 
llma.layers.44.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.803883] Trainable param: llma.layers.44.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803893] Trainable param: llma.layers.44.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803904] Trainable param: llma.layers.44.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803914] Trainable param: llma.layers.44.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.803925] Trainable param: llma.layers.44.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803934] Trainable param: llma.layers.44.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803946] Trainable param: llma.layers.44.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803956] Trainable param: llma.layers.44.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.803967] Trainable param: llma.layers.44.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803977] Trainable param: llma.layers.44.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.803988] Trainable param: llma.layers.44.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.803998] Trainable param: llma.layers.44.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804009] Trainable param: llma.layers.44.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804019] Trainable param: llma.layers.44.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804032] Trainable param: llma.layers.45.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804042] Trainable param: llma.layers.45.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.804053] Trainable param: llma.layers.45.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804063] Trainable param: llma.layers.45.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804074] Trainable param: llma.layers.45.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804083] Trainable param: llma.layers.45.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804094] Trainable param: llma.layers.45.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804104] Trainable param: llma.layers.45.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804116] Trainable param: llma.layers.45.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, 
dtype: torch.bfloat16 +[17:09:51.804126] Trainable param: llma.layers.45.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804137] Trainable param: llma.layers.45.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804146] Trainable param: llma.layers.45.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804157] Trainable param: llma.layers.45.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804167] Trainable param: llma.layers.45.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804178] Trainable param: llma.layers.45.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804189] Trainable param: llma.layers.45.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804202] Trainable param: llma.layers.46.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804211] Trainable param: llma.layers.46.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.804222] Trainable param: llma.layers.46.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804232] Trainable param: llma.layers.46.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804243] Trainable param: llma.layers.46.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804253] Trainable param: llma.layers.46.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804264] Trainable param: llma.layers.46.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804273] Trainable param: llma.layers.46.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804285] Trainable param: llma.layers.46.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804295] Trainable param: llma.layers.46.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804306] Trainable param: llma.layers.46.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804316] Trainable param: llma.layers.46.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804327] Trainable param: llma.layers.46.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804337] Trainable param: llma.layers.46.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804348] Trainable param: llma.layers.46.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804358] Trainable param: llma.layers.46.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804371] Trainable param: 
llma.layers.47.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804381] Trainable param: llma.layers.47.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.804392] Trainable param: llma.layers.47.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804402] Trainable param: llma.layers.47.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804413] Trainable param: llma.layers.47.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804423] Trainable param: llma.layers.47.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804434] Trainable param: llma.layers.47.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804443] Trainable param: llma.layers.47.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804455] Trainable param: llma.layers.47.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804465] Trainable param: llma.layers.47.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804476] Trainable param: llma.layers.47.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804485] Trainable param: llma.layers.47.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804496] Trainable param: llma.layers.47.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804506] Trainable param: llma.layers.47.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804517] Trainable param: llma.layers.47.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804528] Trainable param: llma.layers.47.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804541] Trainable param: llma.layers.48.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804551] Trainable param: llma.layers.48.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.804562] Trainable param: llma.layers.48.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804571] Trainable param: llma.layers.48.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804583] Trainable param: llma.layers.48.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804592] Trainable param: llma.layers.48.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804603] Trainable param: llma.layers.48.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804613] Trainable param: llma.layers.48.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, 
dtype: torch.float32 +[17:09:51.804625] Trainable param: llma.layers.48.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804634] Trainable param: llma.layers.48.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804645] Trainable param: llma.layers.48.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804655] Trainable param: llma.layers.48.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804666] Trainable param: llma.layers.48.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804676] Trainable param: llma.layers.48.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804687] Trainable param: llma.layers.48.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804698] Trainable param: llma.layers.48.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804710] Trainable param: llma.layers.49.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804720] Trainable param: llma.layers.49.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.804731] Trainable param: llma.layers.49.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804741] Trainable param: llma.layers.49.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804752] Trainable param: llma.layers.49.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804762] Trainable param: llma.layers.49.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804773] Trainable param: llma.layers.49.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804783] Trainable param: llma.layers.49.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804795] Trainable param: llma.layers.49.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804804] Trainable param: llma.layers.49.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804815] Trainable param: llma.layers.49.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804825] Trainable param: llma.layers.49.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804836] Trainable param: llma.layers.49.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804846] Trainable param: llma.layers.49.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804857] Trainable param: llma.layers.49.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804868] Trainable param: 
llma.layers.49.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804881] Trainable param: llma.layers.50.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804891] Trainable param: llma.layers.50.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.804902] Trainable param: llma.layers.50.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804912] Trainable param: llma.layers.50.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804923] Trainable param: llma.layers.50.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804933] Trainable param: llma.layers.50.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.804944] Trainable param: llma.layers.50.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804953] Trainable param: llma.layers.50.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.804965] Trainable param: llma.layers.50.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804974] Trainable param: llma.layers.50.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.804985] Trainable param: llma.layers.50.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.804995] Trainable param: llma.layers.50.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805006] Trainable param: llma.layers.50.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805016] Trainable param: llma.layers.50.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805027] Trainable param: llma.layers.50.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805037] Trainable param: llma.layers.50.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805050] Trainable param: llma.layers.51.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805060] Trainable param: llma.layers.51.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.805071] Trainable param: llma.layers.51.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805081] Trainable param: llma.layers.51.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805092] Trainable param: llma.layers.51.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805102] Trainable param: llma.layers.51.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805113] Trainable param: llma.layers.51.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, 
dtype: torch.bfloat16 +[17:09:51.805122] Trainable param: llma.layers.51.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805134] Trainable param: llma.layers.51.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805144] Trainable param: llma.layers.51.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805155] Trainable param: llma.layers.51.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805165] Trainable param: llma.layers.51.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805176] Trainable param: llma.layers.51.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805185] Trainable param: llma.layers.51.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805196] Trainable param: llma.layers.51.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805207] Trainable param: llma.layers.51.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805220] Trainable param: llma.layers.52.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805230] Trainable param: llma.layers.52.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.805241] Trainable param: llma.layers.52.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805251] Trainable param: llma.layers.52.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805262] Trainable param: llma.layers.52.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805272] Trainable param: llma.layers.52.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805283] Trainable param: llma.layers.52.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805293] Trainable param: llma.layers.52.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805305] Trainable param: llma.layers.52.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805315] Trainable param: llma.layers.52.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805325] Trainable param: llma.layers.52.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805335] Trainable param: llma.layers.52.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805346] Trainable param: llma.layers.52.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805356] Trainable param: llma.layers.52.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805367] Trainable param: 
llma.layers.52.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805377] Trainable param: llma.layers.52.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805390] Trainable param: llma.layers.53.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805400] Trainable param: llma.layers.53.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.805411] Trainable param: llma.layers.53.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805421] Trainable param: llma.layers.53.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805432] Trainable param: llma.layers.53.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805442] Trainable param: llma.layers.53.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805453] Trainable param: llma.layers.53.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805463] Trainable param: llma.layers.53.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805475] Trainable param: llma.layers.53.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805485] Trainable param: llma.layers.53.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805496] Trainable param: llma.layers.53.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805505] Trainable param: llma.layers.53.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805519] Trainable param: llma.layers.53.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805529] Trainable param: llma.layers.53.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805540] Trainable param: llma.layers.53.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805551] Trainable param: llma.layers.53.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805564] Trainable param: llma.layers.54.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805575] Trainable param: llma.layers.54.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.805587] Trainable param: llma.layers.54.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805596] Trainable param: llma.layers.54.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805607] Trainable param: llma.layers.54.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805617] Trainable param: llma.layers.54.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: 
torch.float32 +[17:09:51.805628] Trainable param: llma.layers.54.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805638] Trainable param: llma.layers.54.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805650] Trainable param: llma.layers.54.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805659] Trainable param: llma.layers.54.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805670] Trainable param: llma.layers.54.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805680] Trainable param: llma.layers.54.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805691] Trainable param: llma.layers.54.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805701] Trainable param: llma.layers.54.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805712] Trainable param: llma.layers.54.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805722] Trainable param: llma.layers.54.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805735] Trainable param: llma.layers.55.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805745] Trainable param: llma.layers.55.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.805756] Trainable param: llma.layers.55.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805766] Trainable param: llma.layers.55.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805777] Trainable param: llma.layers.55.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805787] Trainable param: llma.layers.55.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805798] Trainable param: llma.layers.55.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805807] Trainable param: llma.layers.55.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805819] Trainable param: llma.layers.55.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805829] Trainable param: llma.layers.55.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805840] Trainable param: llma.layers.55.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805849] Trainable param: llma.layers.55.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805860] Trainable param: llma.layers.55.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805870] Trainable param: 
llma.layers.55.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.805881] Trainable param: llma.layers.55.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805892] Trainable param: llma.layers.55.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805905] Trainable param: llma.layers.56.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805915] Trainable param: llma.layers.56.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.805926] Trainable param: llma.layers.56.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805936] Trainable param: llma.layers.56.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805947] Trainable param: llma.layers.56.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805956] Trainable param: llma.layers.56.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.805967] Trainable param: llma.layers.56.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805977] Trainable param: llma.layers.56.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.805989] Trainable param: llma.layers.56.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.805999] Trainable param: llma.layers.56.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806010] Trainable param: llma.layers.56.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806020] Trainable param: llma.layers.56.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806031] Trainable param: llma.layers.56.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806040] Trainable param: llma.layers.56.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806051] Trainable param: llma.layers.56.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806062] Trainable param: llma.layers.56.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806075] Trainable param: llma.layers.57.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806085] Trainable param: llma.layers.57.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.806096] Trainable param: llma.layers.57.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806106] Trainable param: llma.layers.57.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806117] Trainable param: llma.layers.57.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, 
dtype: torch.bfloat16 +[17:09:51.806127] Trainable param: llma.layers.57.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806138] Trainable param: llma.layers.57.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806147] Trainable param: llma.layers.57.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806159] Trainable param: llma.layers.57.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806169] Trainable param: llma.layers.57.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806180] Trainable param: llma.layers.57.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806190] Trainable param: llma.layers.57.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806201] Trainable param: llma.layers.57.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806210] Trainable param: llma.layers.57.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806221] Trainable param: llma.layers.57.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806232] Trainable param: llma.layers.57.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806245] Trainable param: llma.layers.58.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806254] Trainable param: llma.layers.58.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.806266] Trainable param: llma.layers.58.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806276] Trainable param: llma.layers.58.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806287] Trainable param: llma.layers.58.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806297] Trainable param: llma.layers.58.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806308] Trainable param: llma.layers.58.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806317] Trainable param: llma.layers.58.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806329] Trainable param: llma.layers.58.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806339] Trainable param: llma.layers.58.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806350] Trainable param: llma.layers.58.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806360] Trainable param: llma.layers.58.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806371] Trainable param: 
llma.layers.58.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806380] Trainable param: llma.layers.58.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806391] Trainable param: llma.layers.58.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806402] Trainable param: llma.layers.58.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806415] Trainable param: llma.layers.59.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806425] Trainable param: llma.layers.59.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.806436] Trainable param: llma.layers.59.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806445] Trainable param: llma.layers.59.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806457] Trainable param: llma.layers.59.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806466] Trainable param: llma.layers.59.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806477] Trainable param: llma.layers.59.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806487] Trainable param: llma.layers.59.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806499] Trainable param: llma.layers.59.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806509] Trainable param: llma.layers.59.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806520] Trainable param: llma.layers.59.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806529] Trainable param: llma.layers.59.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806540] Trainable param: llma.layers.59.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806550] Trainable param: llma.layers.59.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806561] Trainable param: llma.layers.59.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806572] Trainable param: llma.layers.59.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806585] Trainable param: llma.layers.60.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806594] Trainable param: llma.layers.60.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.806605] Trainable param: llma.layers.60.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806615] Trainable param: llma.layers.60.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, 
dtype: torch.float32 +[17:09:51.806626] Trainable param: llma.layers.60.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806636] Trainable param: llma.layers.60.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806647] Trainable param: llma.layers.60.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806657] Trainable param: llma.layers.60.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806669] Trainable param: llma.layers.60.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806678] Trainable param: llma.layers.60.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806689] Trainable param: llma.layers.60.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806699] Trainable param: llma.layers.60.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806710] Trainable param: llma.layers.60.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806720] Trainable param: llma.layers.60.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806731] Trainable param: llma.layers.60.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806741] Trainable param: llma.layers.60.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806754] Trainable param: llma.layers.61.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806764] Trainable param: llma.layers.61.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.806775] Trainable param: llma.layers.61.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806785] Trainable param: llma.layers.61.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806796] Trainable param: llma.layers.61.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806806] Trainable param: llma.layers.61.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806817] Trainable param: llma.layers.61.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806827] Trainable param: llma.layers.61.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806839] Trainable param: llma.layers.61.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806848] Trainable param: llma.layers.61.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806860] Trainable param: llma.layers.61.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806869] Trainable param: 
llma.layers.61.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806880] Trainable param: llma.layers.61.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806890] Trainable param: llma.layers.61.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.806901] Trainable param: llma.layers.61.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806912] Trainable param: llma.layers.61.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.806925] Trainable param: llma.layers.62.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806934] Trainable param: llma.layers.62.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.806946] Trainable param: llma.layers.62.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806955] Trainable param: llma.layers.62.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806966] Trainable param: llma.layers.62.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806976] Trainable param: llma.layers.62.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.806987] Trainable param: llma.layers.62.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.806997] Trainable param: llma.layers.62.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807009] Trainable param: llma.layers.62.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807019] Trainable param: llma.layers.62.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807030] Trainable param: llma.layers.62.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807040] Trainable param: llma.layers.62.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807051] Trainable param: llma.layers.62.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807060] Trainable param: llma.layers.62.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807071] Trainable param: llma.layers.62.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807082] Trainable param: llma.layers.62.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807095] Trainable param: llma.layers.63.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807105] Trainable param: llma.layers.63.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.807116] Trainable param: llma.layers.63.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: 
True, dtype: torch.bfloat16 +[17:09:51.807126] Trainable param: llma.layers.63.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807137] Trainable param: llma.layers.63.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807146] Trainable param: llma.layers.63.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807158] Trainable param: llma.layers.63.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807168] Trainable param: llma.layers.63.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807180] Trainable param: llma.layers.63.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807190] Trainable param: llma.layers.63.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807201] Trainable param: llma.layers.63.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807211] Trainable param: llma.layers.63.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807222] Trainable param: llma.layers.63.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807232] Trainable param: llma.layers.63.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807242] Trainable param: llma.layers.63.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807253] Trainable param: llma.layers.63.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807266] Trainable param: llma.layers.64.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807276] Trainable param: llma.layers.64.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.807287] Trainable param: llma.layers.64.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807297] Trainable param: llma.layers.64.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807308] Trainable param: llma.layers.64.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807317] Trainable param: llma.layers.64.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807329] Trainable param: llma.layers.64.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807338] Trainable param: llma.layers.64.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807350] Trainable param: llma.layers.64.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807360] Trainable param: llma.layers.64.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807371] Trainable param: 
llma.layers.64.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807380] Trainable param: llma.layers.64.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807391] Trainable param: llma.layers.64.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807401] Trainable param: llma.layers.64.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807412] Trainable param: llma.layers.64.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807423] Trainable param: llma.layers.64.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807435] Trainable param: llma.layers.65.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807445] Trainable param: llma.layers.65.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.807456] Trainable param: llma.layers.65.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807466] Trainable param: llma.layers.65.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807477] Trainable param: llma.layers.65.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807486] Trainable param: llma.layers.65.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807497] Trainable param: llma.layers.65.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807507] Trainable param: llma.layers.65.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807519] Trainable param: llma.layers.65.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807528] Trainable param: llma.layers.65.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807539] Trainable param: llma.layers.65.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807549] Trainable param: llma.layers.65.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807560] Trainable param: llma.layers.65.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807570] Trainable param: llma.layers.65.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807580] Trainable param: llma.layers.65.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807591] Trainable param: llma.layers.65.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807604] Trainable param: llma.layers.66.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807614] Trainable param: llma.layers.66.attention.wq.bias, local_size: torch.Size([8192]), 
model_parallel: True, dtype: torch.float32 +[17:09:51.807625] Trainable param: llma.layers.66.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807634] Trainable param: llma.layers.66.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807645] Trainable param: llma.layers.66.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807655] Trainable param: llma.layers.66.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807666] Trainable param: llma.layers.66.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807675] Trainable param: llma.layers.66.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807687] Trainable param: llma.layers.66.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807697] Trainable param: llma.layers.66.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807708] Trainable param: llma.layers.66.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807717] Trainable param: llma.layers.66.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807728] Trainable param: llma.layers.66.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807738] Trainable param: llma.layers.66.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807749] Trainable param: llma.layers.66.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807760] Trainable param: llma.layers.66.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807773] Trainable param: llma.layers.67.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807782] Trainable param: llma.layers.67.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.807793] Trainable param: llma.layers.67.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807803] Trainable param: llma.layers.67.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807814] Trainable param: llma.layers.67.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807824] Trainable param: llma.layers.67.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807835] Trainable param: llma.layers.67.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807844] Trainable param: llma.layers.67.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807856] Trainable param: llma.layers.67.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807866] Trainable param: 
llma.layers.67.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807877] Trainable param: llma.layers.67.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807886] Trainable param: llma.layers.67.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807897] Trainable param: llma.layers.67.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807907] Trainable param: llma.layers.67.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.807918] Trainable param: llma.layers.67.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807929] Trainable param: llma.layers.67.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.807942] Trainable param: llma.layers.68.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807951] Trainable param: llma.layers.68.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.807962] Trainable param: llma.layers.68.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807972] Trainable param: llma.layers.68.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.807983] Trainable param: llma.layers.68.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.807993] Trainable param: llma.layers.68.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808004] Trainable param: llma.layers.68.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808013] Trainable param: llma.layers.68.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808025] Trainable param: llma.layers.68.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808035] Trainable param: llma.layers.68.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808046] Trainable param: llma.layers.68.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808056] Trainable param: llma.layers.68.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808067] Trainable param: llma.layers.68.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808077] Trainable param: llma.layers.68.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808088] Trainable param: llma.layers.68.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808098] Trainable param: llma.layers.68.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808111] Trainable param: llma.layers.69.attention.wq.weight, local_size: torch.Size([8192, 8192]), 
model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808121] Trainable param: llma.layers.69.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.808132] Trainable param: llma.layers.69.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808141] Trainable param: llma.layers.69.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808152] Trainable param: llma.layers.69.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808162] Trainable param: llma.layers.69.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808173] Trainable param: llma.layers.69.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808183] Trainable param: llma.layers.69.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808195] Trainable param: llma.layers.69.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808204] Trainable param: llma.layers.69.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808215] Trainable param: llma.layers.69.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808225] Trainable param: llma.layers.69.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808236] Trainable param: llma.layers.69.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808246] Trainable param: llma.layers.69.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808257] Trainable param: llma.layers.69.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808267] Trainable param: llma.layers.69.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808280] Trainable param: llma.layers.70.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808290] Trainable param: llma.layers.70.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.808301] Trainable param: llma.layers.70.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808310] Trainable param: llma.layers.70.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808321] Trainable param: llma.layers.70.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808331] Trainable param: llma.layers.70.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808342] Trainable param: llma.layers.70.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808351] Trainable param: llma.layers.70.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808363] Trainable param: 
llma.layers.70.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808373] Trainable param: llma.layers.70.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808384] Trainable param: llma.layers.70.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808394] Trainable param: llma.layers.70.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808405] Trainable param: llma.layers.70.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808414] Trainable param: llma.layers.70.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808425] Trainable param: llma.layers.70.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808436] Trainable param: llma.layers.70.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808449] Trainable param: llma.layers.71.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808459] Trainable param: llma.layers.71.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.808470] Trainable param: llma.layers.71.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808479] Trainable param: llma.layers.71.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808491] Trainable param: llma.layers.71.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808500] Trainable param: llma.layers.71.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808511] Trainable param: llma.layers.71.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808521] Trainable param: llma.layers.71.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808532] Trainable param: llma.layers.71.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808542] Trainable param: llma.layers.71.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808553] Trainable param: llma.layers.71.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808563] Trainable param: llma.layers.71.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808574] Trainable param: llma.layers.71.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808584] Trainable param: llma.layers.71.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808595] Trainable param: llma.layers.71.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808605] Trainable param: llma.layers.71.ffn_norm.weight, local_size: torch.Size([8192]), 
model_parallel: False, dtype: torch.float32 +[17:09:51.808618] Trainable param: llma.layers.72.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808628] Trainable param: llma.layers.72.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.808639] Trainable param: llma.layers.72.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808648] Trainable param: llma.layers.72.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808660] Trainable param: llma.layers.72.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808669] Trainable param: llma.layers.72.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808680] Trainable param: llma.layers.72.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808690] Trainable param: llma.layers.72.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808702] Trainable param: llma.layers.72.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808711] Trainable param: llma.layers.72.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808722] Trainable param: llma.layers.72.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808732] Trainable param: llma.layers.72.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808743] Trainable param: llma.layers.72.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808753] Trainable param: llma.layers.72.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808764] Trainable param: llma.layers.72.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808775] Trainable param: llma.layers.72.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808787] Trainable param: llma.layers.73.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808797] Trainable param: llma.layers.73.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.808808] Trainable param: llma.layers.73.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808817] Trainable param: llma.layers.73.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808829] Trainable param: llma.layers.73.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808838] Trainable param: llma.layers.73.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808849] Trainable param: llma.layers.73.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808859] Trainable param: 
llma.layers.73.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808871] Trainable param: llma.layers.73.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808880] Trainable param: llma.layers.73.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808891] Trainable param: llma.layers.73.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808901] Trainable param: llma.layers.73.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808912] Trainable param: llma.layers.73.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808922] Trainable param: llma.layers.73.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.808932] Trainable param: llma.layers.73.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808943] Trainable param: llma.layers.73.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.808956] Trainable param: llma.layers.74.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808966] Trainable param: llma.layers.74.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.808977] Trainable param: llma.layers.74.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.808986] Trainable param: llma.layers.74.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.808997] Trainable param: llma.layers.74.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809007] Trainable param: llma.layers.74.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.809018] Trainable param: llma.layers.74.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809028] Trainable param: llma.layers.74.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809039] Trainable param: llma.layers.74.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809049] Trainable param: llma.layers.74.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.809060] Trainable param: llma.layers.74.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809070] Trainable param: llma.layers.74.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809081] Trainable param: llma.layers.74.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809091] Trainable param: llma.layers.74.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.809102] Trainable param: llma.layers.74.attention_norm.weight, local_size: torch.Size([8192]), 
model_parallel: False, dtype: torch.float32 +[17:09:51.809112] Trainable param: llma.layers.74.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809125] Trainable param: llma.layers.75.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809134] Trainable param: llma.layers.75.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.809145] Trainable param: llma.layers.75.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809155] Trainable param: llma.layers.75.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.809166] Trainable param: llma.layers.75.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809176] Trainable param: llma.layers.75.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.809187] Trainable param: llma.layers.75.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809196] Trainable param: llma.layers.75.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809208] Trainable param: llma.layers.75.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809217] Trainable param: llma.layers.75.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.809229] Trainable param: llma.layers.75.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809238] Trainable param: llma.layers.75.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809249] Trainable param: llma.layers.75.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809259] Trainable param: llma.layers.75.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.809270] Trainable param: llma.layers.75.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809281] Trainable param: llma.layers.75.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809294] Trainable param: llma.layers.76.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809303] Trainable param: llma.layers.76.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.809314] Trainable param: llma.layers.76.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809324] Trainable param: llma.layers.76.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.809335] Trainable param: llma.layers.76.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809344] Trainable param: llma.layers.76.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.809355] Trainable param: 
llma.layers.76.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809365] Trainable param: llma.layers.76.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809377] Trainable param: llma.layers.76.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809387] Trainable param: llma.layers.76.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.809398] Trainable param: llma.layers.76.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809423] Trainable param: llma.layers.76.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809435] Trainable param: llma.layers.76.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809444] Trainable param: llma.layers.76.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.809455] Trainable param: llma.layers.76.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809466] Trainable param: llma.layers.76.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.809479] Trainable param: llma.layers.77.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809488] Trainable param: llma.layers.77.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.809500] Trainable param: llma.layers.77.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.809509] Trainable param: llma.layers.77.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.818748] Trainable param: llma.layers.77.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818762] Trainable param: llma.layers.77.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.818775] Trainable param: llma.layers.77.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818785] Trainable param: llma.layers.77.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.818798] Trainable param: llma.layers.77.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818808] Trainable param: llma.layers.77.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.818819] Trainable param: llma.layers.77.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818829] Trainable param: llma.layers.77.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.818840] Trainable param: llma.layers.77.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818850] Trainable param: llma.layers.77.feed_forward.w3.bias, local_size: torch.Size([28672]), 
model_parallel: True, dtype: torch.float32 +[17:09:51.818861] Trainable param: llma.layers.77.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.818871] Trainable param: llma.layers.77.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.818885] Trainable param: llma.layers.78.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818894] Trainable param: llma.layers.78.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.818906] Trainable param: llma.layers.78.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818916] Trainable param: llma.layers.78.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.818927] Trainable param: llma.layers.78.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818936] Trainable param: llma.layers.78.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.818947] Trainable param: llma.layers.78.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818957] Trainable param: llma.layers.78.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.818969] Trainable param: llma.layers.78.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818978] Trainable param: llma.layers.78.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.818989] Trainable param: llma.layers.78.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.818999] Trainable param: llma.layers.78.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819010] Trainable param: llma.layers.78.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819019] Trainable param: llma.layers.78.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.819030] Trainable param: llma.layers.78.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819041] Trainable param: llma.layers.78.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819054] Trainable param: llma.layers.79.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819064] Trainable param: llma.layers.79.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[17:09:51.819075] Trainable param: llma.layers.79.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819084] Trainable param: llma.layers.79.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.819095] Trainable param: llma.layers.79.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819105] Trainable param: 
llma.layers.79.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[17:09:51.819116] Trainable param: llma.layers.79.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819126] Trainable param: llma.layers.79.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819138] Trainable param: llma.layers.79.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819148] Trainable param: llma.layers.79.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.819159] Trainable param: llma.layers.79.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819168] Trainable param: llma.layers.79.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819179] Trainable param: llma.layers.79.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819189] Trainable param: llma.layers.79.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[17:09:51.819200] Trainable param: llma.layers.79.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819210] Trainable param: llma.layers.79.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819222] Trainable param: llma.norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[17:09:51.819235] Trainable param: llma.output.weight, local_size: torch.Size([32000, 8192]), model_parallel: True, dtype: torch.bfloat16 +[17:09:51.819266] load pretrained from ../checkpoints/llama2/Llama-2-70b/ +[17:09:51.819272] Quantizing model to 4bit! 
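The listing above enumerates every trainable tensor together with its per-rank shard shape (local_size), whether it is split across tensor-parallel ranks (model_parallel), and its dtype; the consistent pattern is that the large attention and feed-forward weight matrices stay in torch.bfloat16 while the added biases and the RMSNorm weights train in torch.float32. A report in this format can be produced by walking named_parameters(); the sketch below is a hypothetical reconstruction, and the model_parallel attribute it reads is an assumption rather than a confirmed API of this codebase.

```python
import torch

def report_trainable(model: torch.nn.Module) -> None:
    # Print one line per trainable parameter, mimicking the log format above.
    # Assumption: tensor-parallel layers tag their parameters with a boolean
    # `model_parallel` attribute; adapt the lookup to the actual codebase.
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        print(
            f"Trainable param: {name}, local_size: {param.shape}, "
            f"model_parallel: {getattr(param, 'model_parallel', False)}, "
            f"dtype: {param.dtype}"
        )
```

After this report, the log shows the Llama-2-70b base weights being loaded and the model being quantized to 4 bit; the 0/967 progress bar that follows presumably tracks that quantization pass over the model's modules.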
+ 0%| | 0/967 [00:00<?, ?it/s] +[17:23:31.365017] Start training for 4 epochs +[17:23:31.374088] log_dir: ./output_dir +[17:23:41.365961] Epoch: [0] [0/6500] lr: 0.000000 closs: 0.6425 (0.6425) time: 9.9894 data: 2.4921 max mem: 55263 +[17:24:36.776017] Epoch: [0] [10/6500] lr: 0.000000 closs: 1.1259 (1.1314) grad_norm: 1.2729 (1.2429) time: 5.9453 data: 0.2268 max mem: 71357 +[17:25:32.743504] Epoch: [0] [20/6500] lr: 0.000000 closs: 0.9694 (1.0205) grad_norm: 1.2619 (1.2698) time: 5.5688 data: 0.0002 max mem: 71357 +[17:26:28.403906] Epoch: [0] [30/6500] lr: 0.000000 closs: 0.9322 (1.0122) grad_norm: 1.2729 (1.3065) time: 5.5813 data: 0.0002 max mem: 71357 +[17:27:24.071878] Epoch: [0] [40/6500] lr: 0.000000 closs: 0.9322 (1.0139) grad_norm: 1.1836 (1.2646) time: 5.5663 data: 0.0002 max mem: 71357 +[17:28:19.797340] Epoch: [0] [50/6500] lr: 0.000000 closs: 0.9739 (1.0026) grad_norm: 1.1697 (1.2362) time: 5.5696 data: 0.0002 max mem: 71357 +[17:29:15.536579] Epoch: [0] [60/6500] lr: 0.000000 closs: 0.9523 (0.9969) grad_norm: 1.0358 (1.2359) time: 5.5732 data: 0.0002 max mem: 71357 +[17:30:11.306733] Epoch: [0] [70/6500] lr: 0.000001 closs: 0.9465 (0.9975) grad_norm: 1.0358 (1.2176) time: 5.5754 data: 0.0002 max mem: 71357 +[17:31:06.992357] Epoch: [0] [80/6500] lr: 0.000001 closs: 0.9465 (0.9946) grad_norm: 1.0641 (1.2193) time: 5.5727 data: 0.0002 max mem: 71357 +[17:32:02.793097] Epoch: [0] [90/6500] lr: 0.000001 closs: 0.9669 (0.9922) grad_norm: 1.0981 (1.2193) time: 5.5742 data: 0.0001 max mem: 71357 +[17:32:58.479760] Epoch: [0] [100/6500] lr: 0.000001 closs: 0.9693 (0.9890) grad_norm: 1.1596 (1.2120) time: 5.5743 data: 0.0001 max mem: 71357 +[17:33:54.306944] Epoch: [0] [110/6500] lr: 0.000001 closs: 0.9520 (0.9836) grad_norm: 1.1596 (1.2033) time: 5.5756 data: 0.0002 max mem: 71357 +[17:34:50.074430] Epoch: [0] [120/6500] lr: 0.000001 closs: 0.9365 (0.9833) grad_norm: 1.0410 (1.1872) time: 5.5797 data: 0.0002 max mem: 71357 +[17:35:45.809543] Epoch: [0] [130/6500] lr: 0.000001 closs: 0.9427 (0.9864) grad_norm: 1.0198 (1.1784) time: 5.5750 data: 0.0002 max mem: 71357 +[17:36:41.515666] Epoch: [0] [140/6500] lr: 0.000001 closs: 0.9960 (0.9895) grad_norm: 1.0198 (1.1752) time: 5.5719 data: 0.0002 max mem: 71357 +[17:37:37.376263] Epoch: [0] [150/6500] lr: 0.000001 closs: 0.9367 (0.9828) grad_norm: 1.0198 (1.1735) time: 5.5782 data: 0.0002 max mem: 71357 +[17:38:33.178090] Epoch: [0] [160/6500] lr: 0.000001 closs: 0.8710 (0.9798) grad_norm: 1.0702 (1.1935) time: 5.5830 data: 0.0002 max mem: 71357 +[17:39:28.866051] Epoch: [0] [170/6500] lr: 0.000001 closs: 0.8168 (0.9763) grad_norm: 1.1173 (1.1967) time: 5.5744 data: 0.0002 max mem: 71357 +[17:40:24.644969] Epoch: [0] [180/6500] lr: 0.000001 closs: 0.9151 (0.9716) grad_norm: 1.0284 (1.1866) time: 5.5733 data: 0.0002 max mem: 71357 +[17:41:20.449725] Epoch: [0] [190/6500] lr: 0.000001 closs: 0.9151 (0.9690) grad_norm: 1.0219 (1.1783) time: 5.5791 data: 0.0001 max mem: 71357 +[17:42:16.173660] Epoch: [0] [200/6500] lr: 0.000002 closs: 0.9091 (0.9631) grad_norm: 1.0265 (1.1808) time: 5.5763 data: 0.0001 max mem: 71357 +[17:43:11.896862] Epoch: [0] [210/6500] lr: 0.000002 closs: 0.9030 (0.9615) grad_norm: 1.0265 (1.1799) time: 5.5722 data: 0.0001 max mem: 71357 +[17:44:07.617751] Epoch: [0] [220/6500] lr: 0.000002 closs: 0.9008 (0.9594) grad_norm: 1.0652 (1.1861) time: 5.5721 data: 0.0001 max mem: 71357 +[17:45:03.381846] Epoch: [0] [230/6500] lr: 0.000002 closs: 0.8917 (0.9581) grad_norm: 1.1156 (1.1990) time: 5.5742 data: 0.0001 max mem: 71357
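In the training entries above, each line reports the step, the warming-up learning rate, and then closs, grad_norm, time, and data as a recent value followed by a running statistic in parentheses; MetricLogger-style loggers typically print a windowed median or latest value first and a global average second, though this log alone does not confirm the exact definition. A minimal sketch of such a smoothed meter, under that assumption:

```python
from collections import deque

class SmoothedValue:
    # Track a metric; report the latest value and the global average,
    # matching the "0.8917 (0.9581)" style seen in the log above.
    def __init__(self, window: int = 20):
        self.window = deque(maxlen=window)  # kept for windowed variants
        self.total = 0.0
        self.count = 0

    def update(self, value: float) -> None:
        self.window.append(value)
        self.total += value
        self.count += 1

    def __str__(self) -> str:
        return f"{self.window[-1]:.4f} ({self.total / self.count:.4f})"

closs = SmoothedValue()
for v in (0.6425, 1.1259, 0.9694):
    closs.update(v)
print(f"closs: {closs}")  # -> closs: 0.9694 (0.9126)
```

Note also that lr climbs from 0.000000 toward 0.000002 over the first 230 steps, the expected signature of a linear warmup phase at the start of epoch 0.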
+[17:45:59.158047] Epoch: [0] [240/6500] lr: 0.000002 closs: 0.9274 (0.9582) grad_norm: 1.1156 (1.2056) time: 5.5769 data: 0.0001 max mem: 71357 +[17:46:54.838196] Epoch: [0] [250/6500] lr: 0.000002 closs: 0.9450 (0.9598) grad_norm: 1.1180 (1.2000) time: 5.5727 data: 0.0001 max mem: 71357 +[17:47:50.633838] Epoch: [0] [260/6500] lr: 0.000002 closs: 0.9151 (0.9571) grad_norm: 1.0244 (1.1863) time: 5.5737 data: 0.0002 max mem: 71357 +[17:48:46.374345] Epoch: [0] [270/6500] lr: 0.000002 closs: 0.8736 (0.9565) grad_norm: 0.9105 (1.1753) time: 5.5767 data: 0.0001 max mem: 71357 +[17:49:42.139732] Epoch: [0] [280/6500] lr: 0.000002 closs: 0.8896 (0.9573) grad_norm: 1.0178 (1.1818) time: 5.5752 data: 0.0002 max mem: 71357 +[17:50:37.920786] Epoch: [0] [290/6500] lr: 0.000002 closs: 0.9003 (0.9554) grad_norm: 0.8901 (1.1759) time: 5.5772 data: 0.0002 max mem: 71357 +[17:51:33.665702] Epoch: [0] [300/6500] lr: 0.000002 closs: 0.8788 (0.9538) grad_norm: 1.0452 (1.1730) time: 5.5762 data: 0.0001 max mem: 71357 +[17:52:29.335217] Epoch: [0] [310/6500] lr: 0.000002 closs: 0.8900 (0.9538) grad_norm: 1.0911 (1.1693) time: 5.5706 data: 0.0001 max mem: 71357 +[17:53:25.050732] Epoch: [0] [320/6500] lr: 0.000002 closs: 0.8907 (0.9528) grad_norm: 0.9340 (1.1656) time: 5.5692 data: 0.0001 max mem: 71357 +[17:54:20.840942] Epoch: [0] [330/6500] lr: 0.000003 closs: 0.9088 (0.9559) grad_norm: 0.9319 (1.1552) time: 5.5752 data: 0.0002 max mem: 71357 +[17:55:16.512912] Epoch: [0] [340/6500] lr: 0.000003 closs: 0.8765 (0.9528) grad_norm: 0.8839 (1.1505) time: 5.5730 data: 0.0002 max mem: 71357 +[17:56:12.171744] Epoch: [0] [350/6500] lr: 0.000003 closs: 0.8691 (0.9530) grad_norm: 0.8736 (1.1448) time: 5.5664 data: 0.0001 max mem: 71357 +[17:57:07.866853] Epoch: [0] [360/6500] lr: 0.000003 closs: 0.8886 (0.9515) grad_norm: 0.8445 (1.1351) time: 5.5676 data: 0.0001 max mem: 71357 +[17:58:03.685808] Epoch: [0] [370/6500] lr: 0.000003 closs: 0.9019 (0.9531) grad_norm: 0.8445 (1.1251) time: 5.5756 data: 0.0001 max mem: 71357 +[17:58:59.457163] Epoch: [0] [380/6500] lr: 0.000003 closs: 0.9910 (0.9514) grad_norm: 0.8201 (1.1156) time: 5.5794 data: 0.0001 max mem: 71357 +[17:59:55.171125] Epoch: [0] [390/6500] lr: 0.000003 closs: 0.8344 (0.9485) grad_norm: 0.7503 (1.1046) time: 5.5742 data: 0.0001 max mem: 71357 +[18:00:50.857523] Epoch: [0] [400/6500] lr: 0.000003 closs: 0.8829 (0.9491) grad_norm: 0.7532 (1.0999) time: 5.5699 data: 0.0001 max mem: 71357 +[18:01:46.766039] Epoch: [0] [410/6500] lr: 0.000003 closs: 0.8405 (0.9436) grad_norm: 0.7532 (1.0935) time: 5.5797 data: 0.0001 max mem: 71357 +[18:02:42.491407] Epoch: [0] [420/6500] lr: 0.000003 closs: 0.7425 (0.9431) grad_norm: 0.7532 (1.0857) time: 5.5816 data: 0.0001 max mem: 71357 +[18:03:38.223961] Epoch: [0] [430/6500] lr: 0.000003 closs: 0.8892 (0.9426) grad_norm: 0.7592 (1.0802) time: 5.5728 data: 0.0001 max mem: 71357 +[18:04:33.923170] Epoch: [0] [440/6500] lr: 0.000003 closs: 0.8794 (0.9410) grad_norm: 0.6973 (1.0741) time: 5.5715 data: 0.0001 max mem: 71357 +[18:05:29.628857] Epoch: [0] [450/6500] lr: 0.000003 closs: 0.8692 (0.9410) grad_norm: 0.6973 (1.0658) time: 5.5702 data: 0.0001 max mem: 71357 +[18:06:25.467666] Epoch: [0] [460/6500] lr: 0.000004 closs: 0.8951 (0.9400) grad_norm: 0.6579 (1.0562) time: 5.5772 data: 0.0001 max mem: 71357 +[18:07:21.090474] Epoch: [0] [470/6500] lr: 0.000004 closs: 0.9276 (0.9407) grad_norm: 0.6579 (1.0511) time: 5.5730 data: 0.0001 max mem: 71357 +[18:08:16.811633] Epoch: [0] [480/6500] lr: 0.000004 closs: 0.9387 
(0.9406) grad_norm: 0.6530 (1.0441) time: 5.5671 data: 0.0001 max mem: 71357 +[18:09:12.584026] Epoch: [0] [490/6500] lr: 0.000004 closs: 0.8099 (0.9383) grad_norm: 0.6177 (1.0377) time: 5.5746 data: 0.0001 max mem: 71357 +[18:10:08.482777] Epoch: [0] [500/6500] lr: 0.000004 closs: 0.7688 (0.9347) grad_norm: 0.6177 (1.0309) time: 5.5835 data: 0.0001 max mem: 71357 +[18:11:04.244325] Epoch: [0] [510/6500] lr: 0.000004 closs: 0.7851 (0.9333) grad_norm: 0.6626 (1.0246) time: 5.5829 data: 0.0001 max mem: 71357 +[18:11:59.905130] Epoch: [0] [520/6500] lr: 0.000004 closs: 0.8155 (0.9318) grad_norm: 0.6998 (1.0221) time: 5.5710 data: 0.0001 max mem: 71357 +[18:12:55.576114] Epoch: [0] [530/6500] lr: 0.000004 closs: 0.7851 (0.9303) grad_norm: 0.6998 (1.0148) time: 5.5665 data: 0.0001 max mem: 71357 +[18:13:51.221974] Epoch: [0] [540/6500] lr: 0.000004 closs: 0.8393 (0.9295) grad_norm: 0.6998 (1.0106) time: 5.5657 data: 0.0001 max mem: 71357 +[18:14:47.001446] Epoch: [0] [550/6500] lr: 0.000004 closs: 0.9264 (0.9299) grad_norm: 0.7249 (1.0064) time: 5.5711 data: 0.0001 max mem: 71357 +[18:15:42.665872] Epoch: [0] [560/6500] lr: 0.000004 closs: 0.8648 (0.9292) grad_norm: 0.7504 (1.0082) time: 5.5721 data: 0.0001 max mem: 71357 +[18:16:38.396948] Epoch: [0] [570/6500] lr: 0.000004 closs: 0.8370 (0.9284) grad_norm: 0.8616 (1.0058) time: 5.5697 data: 0.0001 max mem: 71357 +[18:17:34.143183] Epoch: [0] [580/6500] lr: 0.000004 closs: 0.8808 (0.9264) grad_norm: 0.8343 (1.0013) time: 5.5738 data: 0.0001 max mem: 71357 +[18:18:29.970942] Epoch: [0] [590/6500] lr: 0.000005 closs: 0.8689 (0.9260) grad_norm: 0.8343 (0.9999) time: 5.5786 data: 0.0001 max mem: 71357 +[18:19:25.719357] Epoch: [0] [600/6500] lr: 0.000005 closs: 0.8692 (0.9257) grad_norm: 0.8343 (1.0018) time: 5.5787 data: 0.0002 max mem: 71357 +[18:20:21.373952] Epoch: [0] [610/6500] lr: 0.000005 closs: 0.8144 (0.9231) grad_norm: 0.6895 (1.0004) time: 5.5701 data: 0.0002 max mem: 71357 +[18:21:17.129310] Epoch: [0] [620/6500] lr: 0.000005 closs: 0.8144 (0.9223) grad_norm: 0.6895 (0.9955) time: 5.5704 data: 0.0001 max mem: 71357 +[18:22:12.912488] Epoch: [0] [630/6500] lr: 0.000005 closs: 0.8523 (0.9212) grad_norm: 0.7028 (0.9910) time: 5.5768 data: 0.0001 max mem: 71357 +[18:23:08.618409] Epoch: [0] [640/6500] lr: 0.000005 closs: 0.8383 (0.9202) grad_norm: 0.7028 (0.9864) time: 5.5743 data: 0.0001 max mem: 71357 +[18:24:04.348829] Epoch: [0] [650/6500] lr: 0.000005 closs: 0.8570 (0.9197) grad_norm: 0.7166 (0.9852) time: 5.5717 data: 0.0002 max mem: 71357 +[18:25:00.006201] Epoch: [0] [660/6500] lr: 0.000005 closs: 0.8232 (0.9176) grad_norm: 0.7285 (0.9834) time: 5.5693 data: 0.0002 max mem: 71357 +[18:25:55.728679] Epoch: [0] [670/6500] lr: 0.000005 closs: 0.7907 (0.9164) grad_norm: 0.7166 (0.9780) time: 5.5689 data: 0.0002 max mem: 71357 +[18:26:51.493602] Epoch: [0] [680/6500] lr: 0.000005 closs: 0.8000 (0.9157) grad_norm: 0.7775 (0.9767) time: 5.5743 data: 0.0001 max mem: 71357 +[18:27:47.259175] Epoch: [0] [690/6500] lr: 0.000005 closs: 0.8076 (0.9146) grad_norm: 0.6919 (0.9753) time: 5.5764 data: 0.0001 max mem: 71357 +[18:28:42.942644] Epoch: [0] [700/6500] lr: 0.000005 closs: 0.7424 (0.9123) grad_norm: 0.6303 (0.9723) time: 5.5723 data: 0.0001 max mem: 71357 +[18:29:38.581234] Epoch: [0] [710/6500] lr: 0.000005 closs: 0.7424 (0.9105) grad_norm: 0.7645 (0.9741) time: 5.5660 data: 0.0002 max mem: 71357 +[18:30:34.409782] Epoch: [0] [720/6500] lr: 0.000006 closs: 0.7110 (0.9079) grad_norm: 0.7526 (0.9709) time: 5.5732 data: 0.0002 max mem: 
71357 +[18:31:30.096730] Epoch: [0] [730/6500] lr: 0.000006 closs: 0.7914 (0.9071) grad_norm: 0.6638 (0.9656) time: 5.5757 data: 0.0001 max mem: 71357 +[18:32:25.758320] Epoch: [0] [740/6500] lr: 0.000006 closs: 0.8183 (0.9059) grad_norm: 0.6975 (0.9641) time: 5.5673 data: 0.0001 max mem: 71357 +[18:33:21.499555] Epoch: [0] [750/6500] lr: 0.000006 closs: 0.7790 (0.9051) grad_norm: 0.6621 (0.9590) time: 5.5700 data: 0.0001 max mem: 71357 +[18:34:17.084158] Epoch: [0] [760/6500] lr: 0.000006 closs: 0.8239 (0.9048) grad_norm: 0.6638 (0.9582) time: 5.5662 data: 0.0001 max mem: 71357 +[18:35:12.977650] Epoch: [0] [770/6500] lr: 0.000006 closs: 0.8609 (0.9039) grad_norm: 0.6498 (0.9539) time: 5.5738 data: 0.0001 max mem: 71357 +[18:36:08.664952] Epoch: [0] [780/6500] lr: 0.000006 closs: 0.7882 (0.9016) grad_norm: 0.6142 (0.9481) time: 5.5789 data: 0.0002 max mem: 71357 +[18:37:04.413888] Epoch: [0] [790/6500] lr: 0.000006 closs: 0.7919 (0.9005) grad_norm: 0.6271 (0.9440) time: 5.5717 data: 0.0001 max mem: 71357 +[18:38:00.067141] Epoch: [0] [800/6500] lr: 0.000006 closs: 0.8323 (0.9000) grad_norm: 0.5438 (0.9401) time: 5.5700 data: 0.0001 max mem: 71357 +[18:38:55.953874] Epoch: [0] [810/6500] lr: 0.000006 closs: 0.8323 (0.8978) grad_norm: 0.5438 (0.9353) time: 5.5769 data: 0.0001 max mem: 71357 +[18:39:51.627904] Epoch: [0] [820/6500] lr: 0.000006 closs: 0.7819 (0.8968) grad_norm: 0.5438 (0.9310) time: 5.5779 data: 0.0002 max mem: 71357 +[18:40:47.301617] Epoch: [0] [830/6500] lr: 0.000006 closs: 0.7707 (0.8952) grad_norm: 0.5119 (0.9265) time: 5.5673 data: 0.0002 max mem: 71357 +[18:41:43.054599] Epoch: [0] [840/6500] lr: 0.000006 closs: 0.7601 (0.8950) grad_norm: 0.5119 (0.9249) time: 5.5712 data: 0.0001 max mem: 71357 +[18:42:38.803150] Epoch: [0] [850/6500] lr: 0.000007 closs: 0.8690 (0.8952) grad_norm: 0.5323 (0.9204) time: 5.5749 data: 0.0001 max mem: 71357 +[18:43:34.505421] Epoch: [0] [860/6500] lr: 0.000007 closs: 0.8132 (0.8943) grad_norm: 0.5332 (0.9166) time: 5.5724 data: 0.0002 max mem: 71357 +[18:44:30.116370] Epoch: [0] [870/6500] lr: 0.000007 closs: 0.8058 (0.8946) grad_norm: 0.5786 (0.9147) time: 5.5655 data: 0.0002 max mem: 71357 +[18:45:25.754743] Epoch: [0] [880/6500] lr: 0.000007 closs: 0.8048 (0.8936) grad_norm: 0.5437 (0.9109) time: 5.5623 data: 0.0002 max mem: 71357 +[18:46:21.437868] Epoch: [0] [890/6500] lr: 0.000007 closs: 0.7226 (0.8921) grad_norm: 0.5817 (0.9073) time: 5.5659 data: 0.0001 max mem: 71357 +[18:47:17.526399] Epoch: [0] [900/6500] lr: 0.000007 closs: 0.7589 (0.8912) grad_norm: 0.6070 (0.9076) time: 5.5884 data: 0.0001 max mem: 71357 +[18:48:13.144234] Epoch: [0] [910/6500] lr: 0.000007 closs: 0.8236 (0.8904) grad_norm: 0.5817 (0.9036) time: 5.5852 data: 0.0002 max mem: 71357 +[18:49:08.786046] Epoch: [0] [920/6500] lr: 0.000007 closs: 0.8949 (0.8904) grad_norm: 0.5817 (0.9000) time: 5.5629 data: 0.0002 max mem: 71357 +[18:50:04.500343] Epoch: [0] [930/6500] lr: 0.000007 closs: 0.8826 (0.8899) grad_norm: 0.5138 (0.8955) time: 5.5677 data: 0.0002 max mem: 71357 +[18:51:00.386039] Epoch: [0] [940/6500] lr: 0.000007 closs: 0.8056 (0.8898) grad_norm: 0.5107 (0.8927) time: 5.5799 data: 0.0002 max mem: 71357 +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+*****************************************
+[W socket.cpp:426] [c10d] The server socket has failed to bind to [::]:1113 (errno: 98 - Address already in use).
+[W socket.cpp:426] [c10d] The server socket has failed to bind to 0.0.0.0:1113 (errno: 98 - Address already in use).
+[E socket.cpp:462] [c10d] The server socket has failed to listen on any local network address.
+Traceback (most recent call last):
+  File "/data/anaconda3/envs/accessory/bin/torchrun", line 8, in <module>
+    sys.exit(main())
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
+    return f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
+    run(args)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
+    elastic_launch(
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 241, in launch_agent
+    result = agent.run()
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
+    result = f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 723, in run
+    result = self._invoke_run(role)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 858, in _invoke_run
+    self._initialize_workers(self._worker_group)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
+    result = f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 692, in _initialize_workers
+    self._rendezvous(worker_group)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
+    result = f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 546, in _rendezvous
+    store, group_rank, group_world_size = spec.rdzv_handler.next_rendezvous()
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 55, in next_rendezvous
+    self._store = TCPStore( # type: ignore[call-arg]
+RuntimeError: The server socket has failed to listen on any local network address. The server socket has failed to bind to [::]:1113 (errno: 98 - Address already in use). The server socket has failed to bind to 0.0.0.0:1113 (errno: 98 - Address already in use).
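The errno 98 failures above mean a second torchrun launch tried to bind its static rendezvous to port 1113 while the earlier run still held it. A minimal sketch of one way around the collision, letting the OS hand out an unused port that can then be passed to torchrun via its --master_port flag (find_free_port is a hypothetical helper, not part of this repo):

import socket

def find_free_port() -> int:
    # Bind to port 0 so the OS assigns an unused ephemeral port,
    # then release the socket and reuse the number for torchrun.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]

if __name__ == "__main__":
    print(find_free_port())

Launching with, e.g., torchrun --master_port=$(python find_free_port.py) ... then avoids reusing a port that a still-running job occupies; note there is a small race between printing the port and torchrun binding it.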
+WARNING:torch.distributed.run:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+> /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/main_finetune.py(134)main()
+    133  import ipdb;ipdb.set_trace()
+--> 134  misc.init_distributed_mode(args)
+    135  fs_init.initialize_model_parallel(args.model_parallel_size)
+
+ipdb> *** SyntaxError: invalid syntax
+ipdb> > /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/main_finetune.py(134)main()
+    133  import ipdb;ipdb.set_trace()
+--> 134  misc.init_distributed_mode(args)
+    135  fs_init.initialize_model_parallel(args.model_parallel_size)
+
+ipdb> [18:51:56.159949] Epoch: [0] [950/6500] lr: 0.000007 closs: 0.8056 (0.8895) grad_norm: 0.5041 (0.8887) time: 5.5829 data: 0.0002 max mem: 71357
+[18:52:51.944635] Epoch: [0] [960/6500] lr: 0.000007 closs: 0.8003 (0.8885) grad_norm: 0.4752 (0.8862) time: 5.5779 data: 0.0002 max mem: 71357
+True
+ipdb> [18:53:47.670278] Epoch: [0] [970/6500] lr: 0.000007 closs: 0.8003 (0.8873) grad_norm: 0.5352 (0.8829) time: 5.5754 data: 0.0001 max mem: 71357
+[18:54:43.331543] Epoch: [0] [980/6500] lr: 0.000008 closs: 0.7697 (0.8869) grad_norm: 0.5414 (0.8806) time: 5.5692 data: 0.0002 max mem: 71357
+[18:55:39.144294] Epoch: [0] [990/6500] lr: 0.000008 closs: 0.7697 (0.8861) grad_norm: 0.5501 (0.8777) time: 5.5736 data: 0.0002 max mem: 71357
+[18:56:34.832623] Epoch: [0] [1000/6500] lr: 0.000008 closs: 0.7968 (0.8853) grad_norm: 0.5508 (0.8757) time: 5.5750 data: 0.0001 max mem: 71357
+[18:57:30.584157] Epoch: [0] [1010/6500] lr: 0.000008 closs: 0.8029 (0.8847) grad_norm: 0.6143 (0.8732) time: 5.5719 data: 0.0001 max mem: 71357
+[18:58:26.208202] Epoch: [0] [1020/6500] lr: 0.000008 closs: 0.8208 (0.8845) grad_norm: 0.6465 (0.8734) time: 5.5687 data: 0.0001 max mem: 71357
+[18:59:22.004799] Epoch: [0] [1030/6500] lr: 0.000008 closs: 0.7216 (0.8831) grad_norm: 0.6336 (0.8707) time: 5.5709 data: 0.0002 max mem: 71357
+[19:00:17.779495] Epoch: [0] [1040/6500] lr: 0.000008 closs: 0.7816 (0.8825) grad_norm: 0.5328 (0.8678) time: 5.5785 data: 0.0002 max mem: 71357
+[19:01:13.491558] Epoch: [0] [1050/6500] lr: 0.000008 closs: 0.8551 (0.8816) grad_norm: 0.5170 (0.8638) time: 5.5742 data: 0.0001 max mem: 71357
+[19:02:09.272274] Epoch: [0] [1060/6500] lr: 0.000008 closs: 0.7913 (0.8810) grad_norm: 0.5079 (0.8599) time: 5.5745 data: 0.0001 max mem: 71357
+[19:03:05.080375] Epoch: [0] [1070/6500] lr: 0.000008 closs: 0.8080 (0.8807) grad_norm: 0.4961 (0.8577) time: 5.5794 data: 0.0001 max mem: 71357
+[19:04:00.844337] Epoch: [0] [1080/6500] lr: 0.000008 closs: 0.8225 (0.8806) grad_norm: 0.4745 (0.8547) time: 5.5785 data: 0.0001 max mem: 71357
+
+Traceback (most recent call last):
+  File "/data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/main_finetune.py", line 319, in <module>
+    main(args)
+  File "/data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/main_finetune.py", line 134, in main
+    misc.init_distributed_mode(args)
+  File "/data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory/main_finetune.py", line 134, in main
+    misc.init_distributed_mode(args)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/bdb.py", line 90, in trace_dispatch
+    return self.dispatch_line(frame)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/bdb.py", line 115, in dispatch_line
+    if self.quitting: raise BdbQuit
+bdb.BdbQuit
+
+If you suspect this is an IPython 8.14.0 bug, please report it at:
+    https://github.com/ipython/ipython/issues
+or send an email to the mailing list at ipython-dev@python.org
+
+You can print a more detailed traceback right now with "%tb", or use "%debug"
+to interactively debug it.
+
+Extra-detailed tracebacks for bug-reporting purposes can be enabled via:
+    %config Application.verbose_crash=True
+
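The bdb.BdbQuit crash above is the leftover "import ipdb;ipdb.set_trace()" at main_finetune.py line 133 firing inside a multi-process torchrun job: once the debugger is quit on one worker, BdbQuit propagates out of main() and takes that rank down. A minimal guard, assuming the RANK environment variable that torchrun sets for each worker, that would confine the breakpoint to a single process:

import os

# Only rank 0 drops into the debugger; all other ranks keep running.
# A bare ipdb.set_trace() in main() is what produced the BdbQuit above.
if int(os.environ.get("RANK", "0")) == 0:
    import ipdb
    ipdb.set_trace()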
use "%debug" +to interactively debug it. + +Extra-detailed tracebacks for bug-reporting purposes can be enabled via: + %config Application.verbose_crash=True + +WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 130230 closing signal SIGTERM +ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 130231) of binary: /data/anaconda3/envs/accessory/bin/python3.10 +Traceback (most recent call last): + File "/data/anaconda3/envs/accessory/bin/torchrun", line 8, in + sys.exit(main()) + File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper + return f(*args, **kwargs) + File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main + run(args) + File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run + elastic_launch( + File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +main_finetune.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2023-08-10_19:04:32 + host : iZ2ze8qpzapxkhyc9k2qojZ + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 130231) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +[19:04:56.595547] Epoch: [0] [1090/6500] lr: 0.000008 closs: 0.8706 (0.8802) grad_norm: 0.4761 (0.8526) time: 5.5757 data: 0.0001 max mem: 71357 +[19:05:52.420471] Epoch: [0] [1100/6500] lr: 0.000008 closs: 0.8706 (0.8799) grad_norm: 0.4761 (0.8506) time: 5.5787 data: 0.0001 max mem: 71357 +[19:06:48.151002] Epoch: [0] [1110/6500] lr: 0.000009 closs: 0.8431 (0.8795) grad_norm: 0.4751 (0.8474) time: 5.5777 data: 0.0001 max mem: 71357 +[19:07:44.022831] Epoch: [0] [1120/6500] lr: 0.000009 closs: 0.7763 (0.8782) grad_norm: 0.4751 (0.8446) time: 5.5800 data: 0.0001 max mem: 71357 +[19:08:39.741796] Epoch: [0] [1130/6500] lr: 0.000009 closs: 0.7718 (0.8775) grad_norm: 0.5106 (0.8439) time: 5.5794 data: 0.0001 max mem: 71357 +[19:09:35.445169] Epoch: [0] [1140/6500] lr: 0.000009 closs: 0.7377 (0.8759) grad_norm: 0.5852 (0.8474) time: 5.5710 data: 0.0001 max mem: 71357 +[19:10:31.229899] Epoch: [0] [1150/6500] lr: 0.000009 closs: 0.7763 (0.8757) grad_norm: 0.5936 (0.8482) time: 5.5743 data: 0.0002 max mem: 71357 +[19:11:27.112847] Epoch: [0] [1160/6500] lr: 0.000009 closs: 0.8189 (0.8754) grad_norm: 0.5936 (0.8451) time: 5.5833 data: 0.0002 max mem: 71357 +[19:12:22.770834] Epoch: [0] [1170/6500] lr: 0.000009 closs: 0.7905 (0.8742) grad_norm: 0.5854 (0.8432) time: 5.5769 data: 0.0002 max mem: 71357 +[19:13:18.473532] Epoch: [0] [1180/6500] lr: 0.000009 closs: 0.7743 (0.8739) grad_norm: 0.4815 (0.8403) time: 5.5679 data: 0.0001 max mem: 71357 +[19:14:14.090142] Epoch: [0] [1190/6500] lr: 0.000009 closs: 0.8370 (0.8738) grad_norm: 0.4815 (0.8390) time: 5.5659 data: 0.0001 max mem: 71357 
+[19:04:56.595547] Epoch: [0] [1090/6500] lr: 0.000008 closs: 0.8706 (0.8802) grad_norm: 0.4761 (0.8526) time: 5.5757 data: 0.0001 max mem: 71357
+[19:05:52.420471] Epoch: [0] [1100/6500] lr: 0.000008 closs: 0.8706 (0.8799) grad_norm: 0.4761 (0.8506) time: 5.5787 data: 0.0001 max mem: 71357
+[19:06:48.151002] Epoch: [0] [1110/6500] lr: 0.000009 closs: 0.8431 (0.8795) grad_norm: 0.4751 (0.8474) time: 5.5777 data: 0.0001 max mem: 71357
+[19:07:44.022831] Epoch: [0] [1120/6500] lr: 0.000009 closs: 0.7763 (0.8782) grad_norm: 0.4751 (0.8446) time: 5.5800 data: 0.0001 max mem: 71357
+[19:08:39.741796] Epoch: [0] [1130/6500] lr: 0.000009 closs: 0.7718 (0.8775) grad_norm: 0.5106 (0.8439) time: 5.5794 data: 0.0001 max mem: 71357
+[19:09:35.445169] Epoch: [0] [1140/6500] lr: 0.000009 closs: 0.7377 (0.8759) grad_norm: 0.5852 (0.8474) time: 5.5710 data: 0.0001 max mem: 71357
+[19:10:31.229899] Epoch: [0] [1150/6500] lr: 0.000009 closs: 0.7763 (0.8757) grad_norm: 0.5936 (0.8482) time: 5.5743 data: 0.0002 max mem: 71357
+[19:11:27.112847] Epoch: [0] [1160/6500] lr: 0.000009 closs: 0.8189 (0.8754) grad_norm: 0.5936 (0.8451) time: 5.5833 data: 0.0002 max mem: 71357
+[19:12:22.770834] Epoch: [0] [1170/6500] lr: 0.000009 closs: 0.7905 (0.8742) grad_norm: 0.5854 (0.8432) time: 5.5769 data: 0.0002 max mem: 71357
+[19:13:18.473532] Epoch: [0] [1180/6500] lr: 0.000009 closs: 0.7743 (0.8739) grad_norm: 0.4815 (0.8403) time: 5.5679 data: 0.0001 max mem: 71357
+[19:14:14.090142] Epoch: [0] [1190/6500] lr: 0.000009 closs: 0.8370 (0.8738) grad_norm: 0.4815 (0.8390) time: 5.5659 data: 0.0001 max mem: 71357
+[19:15:09.821953] Epoch: [0] [1200/6500] lr: 0.000009 closs: 0.7973 (0.8732) grad_norm: 0.5191 (0.8370) time: 5.5673 data: 0.0001 max mem: 71357
+[19:16:05.632464] Epoch: [0] [1210/6500] lr: 0.000009 closs: 0.7952 (0.8725) grad_norm: 0.5191 (0.8346) time: 5.5770 data: 0.0001 max mem: 71357
+[19:17:01.237737] Epoch: [0] [1220/6500] lr: 0.000009 closs: 0.7952 (0.8720) grad_norm: 0.5838 (0.8334) time: 5.5707 data: 0.0001 max mem: 71357
+[19:17:56.918626] Epoch: [0] [1230/6500] lr: 0.000009 closs: 0.8244 (0.8716) grad_norm: 0.5838 (0.8319) time: 5.5642 data: 0.0002 max mem: 71357
+[19:18:52.636548] Epoch: [0] [1240/6500] lr: 0.000010 closs: 0.8378 (0.8712) grad_norm: 0.6067 (0.8326) time: 5.5698 data: 0.0001 max mem: 71357
+WARNING:torch.distributed.run:
+*****************************************
+Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+*****************************************
+[W socket.cpp:426] [c10d] The server socket has failed to bind to [::]:1113 (errno: 98 - Address already in use).
+[W socket.cpp:426] [c10d] The server socket has failed to bind to 0.0.0.0:1113 (errno: 98 - Address already in use).
+[E socket.cpp:462] [c10d] The server socket has failed to listen on any local network address.
+Traceback (most recent call last):
+  File "/data/anaconda3/envs/accessory/bin/torchrun", line 8, in <module>
+    sys.exit(main())
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
+    return f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
+    run(args)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
+    elastic_launch(
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
+    return launch_agent(self._config, self._entrypoint, list(args))
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 241, in launch_agent
+    result = agent.run()
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
+    result = f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 723, in run
+    result = self._invoke_run(role)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 858, in _invoke_run
+    self._initialize_workers(self._worker_group)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
+    result = f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 692, in _initialize_workers
+    self._rendezvous(worker_group)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
+    result = f(*args, **kwargs)
+  File "/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 546, in _rendezvous
+    store, group_rank, group_world_size = spec.rdzv_handler.next_rendezvous()
"/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 55, in next_rendezvous + self._store = TCPStore( # type: ignore[call-arg] +RuntimeError: The server socket has failed to listen on any local network address. The server socket has failed to bind to [::]:1113 (errno: 98 - Address already in use). The server socket has failed to bind to 0.0.0.0:1113 (errno: 98 - Address already in use). +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +| distributed init (rank 1): env://, gpu 1 +| distributed init (rank 0): env://, gpu 0 +[19:19:28.076271] > initializing model parallel with size 1 +[19:19:28.076336] > initializing ddp with size 2 +[19:19:28.076342] > initializing pipeline with size 1 +[19:19:28.121707] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory +[19:19:28.121800] Namespace(batch_size=4, +accum_iter=2, +llama_type='llama_peft', +llama_config=['../checkpoints/llama2/Llama-2-70b/params.json'], +no_visual=True, +tokenizer_path='../checkpoints/llama2/Llama-2-70b/tokenizer.model', +pretrained_path='../checkpoints/llama2/Llama-2-70b/', +pretrained_type='meta_ori', +weight_decay=0.02, +lr=5e-05, +min_lr=5e-06, +epochs=4, +warmup_epochs=1.0, +clip_grad=2, +max_words=512, +dialog=False, +data_config='configs/data/finetune/sg/alpaca.yaml', +output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B', +log_dir='./output_dir', +save_interval=1, +device='cuda', +seed=0, +resume='', +num_workers=8, +pin_mem=True, +world_size=2, +local_rank=-1, +dist_on_itp=False, +dist_url='env://', +model_parallel_size=1, +data_parallel='sdp', +precision='bf16', +checkpointing=True, +quant=True, +rank=0, +gpu=0, +distributed=True, +dist_backend='nccl') +[19:19:28.133114] Model Args: + ModelArgs(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, vocab_size=32000, multiple_of=4096, ffn_dim_multiplier=1.3, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True) +[19:19:48.469330] Epoch: [0] [1250/6500] lr: 0.000010 closs: 0.7255 (0.8696) grad_norm: 0.6067 (0.8306) time: 5.5775 data: 0.0002 max mem: 71357 +[19:20:44.159763] Epoch: [0] [1260/6500] lr: 0.000010 closs: 0.7255 (0.8691) grad_norm: 0.6007 (0.8286) time: 5.5761 data: 0.0002 max mem: 71357 +[19:21:39.944680] Epoch: [0] [1270/6500] lr: 0.000010 closs: 0.8232 (0.8687) grad_norm: 0.5559 (0.8273) time: 5.5737 data: 0.0002 max mem: 71357 +[19:22:35.727072] Epoch: [0] [1280/6500] lr: 0.000010 closs: 0.7722 (0.8681) grad_norm: 0.6128 (0.8262) time: 5.5782 data: 0.0002 max mem: 71357 +[19:23:31.545412] Epoch: [0] [1290/6500] lr: 0.000010 closs: 0.7420 (0.8668) grad_norm: 0.5559 (0.8239) time: 5.5799 data: 0.0002 max mem: 71357 +[19:24:27.267608] Epoch: [0] [1300/6500] lr: 0.000010 closs: 0.7926 (0.8669) grad_norm: 0.6245 (0.8228) time: 5.5769 data: 0.0002 max mem: 71357 +[19:25:23.128280] Epoch: [0] [1310/6500] lr: 0.000010 closs: 0.7852 (0.8662) grad_norm: 0.6224 (0.8206) time: 5.5791 data: 0.0002 max mem: 71357 +[19:26:18.848671] Epoch: [0] [1320/6500] lr: 0.000010 closs: 0.7704 (0.8660) grad_norm: 0.5277 (0.8185) time: 5.5789 data: 0.0002 max mem: 71357 +[19:27:14.498952] Epoch: [0] [1330/6500] lr: 0.000010 closs: 0.8118 (0.8657) grad_norm: 0.5535 (0.8175) time: 
5.5684 data: 0.0002 max mem: 71357 +[19:27:29.243249] Model is Peft: True +[19:27:29.250590] Trainable parameter count : 8036352 (local rank), 8036352 (all). +[19:27:29.274209] Trainable param: llma.tok_embeddings.weight, local_size: torch.Size([32000, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274240] Trainable param: llma.layers.0.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274253] Trainable param: llma.layers.0.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.274266] Trainable param: llma.layers.0.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274276] Trainable param: llma.layers.0.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274288] Trainable param: llma.layers.0.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274299] Trainable param: llma.layers.0.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274310] Trainable param: llma.layers.0.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274320] Trainable param: llma.layers.0.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274333] Trainable param: llma.layers.0.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274343] Trainable param: llma.layers.0.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274355] Trainable param: llma.layers.0.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274365] Trainable param: llma.layers.0.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274376] Trainable param: llma.layers.0.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274386] Trainable param: llma.layers.0.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274397] Trainable param: llma.layers.0.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274408] Trainable param: llma.layers.0.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274423] Trainable param: llma.layers.1.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274435] Trainable param: llma.layers.1.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.274446] Trainable param: llma.layers.1.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274456] Trainable param: llma.layers.1.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274468] Trainable param: llma.layers.1.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274478] Trainable param: llma.layers.1.attention.wv.bias, local_size: torch.Size([1024]), 
model_parallel: True, dtype: torch.float32 +[19:27:29.274489] Trainable param: llma.layers.1.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274499] Trainable param: llma.layers.1.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274511] Trainable param: llma.layers.1.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274521] Trainable param: llma.layers.1.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274532] Trainable param: llma.layers.1.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274542] Trainable param: llma.layers.1.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274553] Trainable param: llma.layers.1.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274563] Trainable param: llma.layers.1.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274574] Trainable param: llma.layers.1.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274585] Trainable param: llma.layers.1.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274599] Trainable param: llma.layers.2.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274609] Trainable param: llma.layers.2.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.274620] Trainable param: llma.layers.2.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274630] Trainable param: llma.layers.2.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274641] Trainable param: llma.layers.2.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274651] Trainable param: llma.layers.2.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274662] Trainable param: llma.layers.2.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274672] Trainable param: llma.layers.2.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274684] Trainable param: llma.layers.2.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274694] Trainable param: llma.layers.2.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274705] Trainable param: llma.layers.2.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274715] Trainable param: llma.layers.2.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274726] Trainable param: llma.layers.2.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274736] Trainable param: 
llma.layers.2.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274748] Trainable param: llma.layers.2.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274759] Trainable param: llma.layers.2.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274772] Trainable param: llma.layers.3.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274782] Trainable param: llma.layers.3.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.274793] Trainable param: llma.layers.3.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274803] Trainable param: llma.layers.3.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274814] Trainable param: llma.layers.3.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274824] Trainable param: llma.layers.3.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274835] Trainable param: llma.layers.3.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274845] Trainable param: llma.layers.3.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274857] Trainable param: llma.layers.3.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274867] Trainable param: llma.layers.3.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274878] Trainable param: llma.layers.3.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274888] Trainable param: llma.layers.3.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274900] Trainable param: llma.layers.3.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274909] Trainable param: llma.layers.3.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.274921] Trainable param: llma.layers.3.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274931] Trainable param: llma.layers.3.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.274945] Trainable param: llma.layers.4.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274954] Trainable param: llma.layers.4.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.274966] Trainable param: llma.layers.4.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.274976] Trainable param: llma.layers.4.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.274987] Trainable param: llma.layers.4.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 
+[19:27:29.274997] Trainable param: llma.layers.4.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275008] Trainable param: llma.layers.4.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275018] Trainable param: llma.layers.4.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275030] Trainable param: llma.layers.4.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275040] Trainable param: llma.layers.4.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275052] Trainable param: llma.layers.4.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275063] Trainable param: llma.layers.4.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275075] Trainable param: llma.layers.4.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275085] Trainable param: llma.layers.4.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275096] Trainable param: llma.layers.4.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275107] Trainable param: llma.layers.4.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275120] Trainable param: llma.layers.5.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275130] Trainable param: llma.layers.5.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.275142] Trainable param: llma.layers.5.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275151] Trainable param: llma.layers.5.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275163] Trainable param: llma.layers.5.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275172] Trainable param: llma.layers.5.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275184] Trainable param: llma.layers.5.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275194] Trainable param: llma.layers.5.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275206] Trainable param: llma.layers.5.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275215] Trainable param: llma.layers.5.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275227] Trainable param: llma.layers.5.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275237] Trainable param: llma.layers.5.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275248] Trainable param: llma.layers.5.feed_forward.w3.weight, local_size: torch.Size([28672, 
8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275258] Trainable param: llma.layers.5.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275269] Trainable param: llma.layers.5.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275282] Trainable param: llma.layers.5.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275295] Trainable param: llma.layers.6.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275305] Trainable param: llma.layers.6.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.275316] Trainable param: llma.layers.6.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275326] Trainable param: llma.layers.6.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275338] Trainable param: llma.layers.6.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275347] Trainable param: llma.layers.6.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275359] Trainable param: llma.layers.6.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275368] Trainable param: llma.layers.6.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275380] Trainable param: llma.layers.6.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275390] Trainable param: llma.layers.6.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275401] Trainable param: llma.layers.6.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275411] Trainable param: llma.layers.6.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275423] Trainable param: llma.layers.6.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275432] Trainable param: llma.layers.6.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275443] Trainable param: llma.layers.6.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275454] Trainable param: llma.layers.6.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275467] Trainable param: llma.layers.7.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275477] Trainable param: llma.layers.7.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.275488] Trainable param: llma.layers.7.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275498] Trainable param: llma.layers.7.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275509] Trainable param: llma.layers.7.attention.wv.weight, 
local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275519] Trainable param: llma.layers.7.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275531] Trainable param: llma.layers.7.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275540] Trainable param: llma.layers.7.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275553] Trainable param: llma.layers.7.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275563] Trainable param: llma.layers.7.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275574] Trainable param: llma.layers.7.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275584] Trainable param: llma.layers.7.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275595] Trainable param: llma.layers.7.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275605] Trainable param: llma.layers.7.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275616] Trainable param: llma.layers.7.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275627] Trainable param: llma.layers.7.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275640] Trainable param: llma.layers.8.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275650] Trainable param: llma.layers.8.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.275661] Trainable param: llma.layers.8.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275675] Trainable param: llma.layers.8.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275692] Trainable param: llma.layers.8.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275703] Trainable param: llma.layers.8.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275714] Trainable param: llma.layers.8.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275724] Trainable param: llma.layers.8.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275736] Trainable param: llma.layers.8.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275746] Trainable param: llma.layers.8.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275758] Trainable param: llma.layers.8.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275768] Trainable param: llma.layers.8.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275779] 
Trainable param: llma.layers.8.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275789] Trainable param: llma.layers.8.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275800] Trainable param: llma.layers.8.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275812] Trainable param: llma.layers.8.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275825] Trainable param: llma.layers.9.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275835] Trainable param: llma.layers.9.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.275846] Trainable param: llma.layers.9.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275856] Trainable param: llma.layers.9.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275868] Trainable param: llma.layers.9.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275878] Trainable param: llma.layers.9.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.275889] Trainable param: llma.layers.9.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275899] Trainable param: llma.layers.9.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275911] Trainable param: llma.layers.9.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275921] Trainable param: llma.layers.9.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275932] Trainable param: llma.layers.9.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275942] Trainable param: llma.layers.9.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275954] Trainable param: llma.layers.9.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.275963] Trainable param: llma.layers.9.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.275975] Trainable param: llma.layers.9.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275986] Trainable param: llma.layers.9.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.275999] Trainable param: llma.layers.10.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276009] Trainable param: llma.layers.10.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.276021] Trainable param: llma.layers.10.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276030] Trainable param: llma.layers.10.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, 
dtype: torch.float32 +[19:27:29.276042] Trainable param: llma.layers.10.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276051] Trainable param: llma.layers.10.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.276063] Trainable param: llma.layers.10.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276073] Trainable param: llma.layers.10.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276085] Trainable param: llma.layers.10.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276095] Trainable param: llma.layers.10.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.276106] Trainable param: llma.layers.10.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276116] Trainable param: llma.layers.10.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276127] Trainable param: llma.layers.10.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276137] Trainable param: llma.layers.10.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.276148] Trainable param: llma.layers.10.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276159] Trainable param: llma.layers.10.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276172] Trainable param: llma.layers.11.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276182] Trainable param: llma.layers.11.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.276194] Trainable param: llma.layers.11.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276203] Trainable param: llma.layers.11.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.276215] Trainable param: llma.layers.11.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276224] Trainable param: llma.layers.11.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.276236] Trainable param: llma.layers.11.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276245] Trainable param: llma.layers.11.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276257] Trainable param: llma.layers.11.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276267] Trainable param: llma.layers.11.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.276278] Trainable param: llma.layers.11.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276288] Trainable param: 
llma.layers.11.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276299] Trainable param: llma.layers.11.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276309] Trainable param: llma.layers.11.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.276320] Trainable param: llma.layers.11.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276331] Trainable param: llma.layers.11.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276344] Trainable param: llma.layers.12.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276354] Trainable param: llma.layers.12.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.276366] Trainable param: llma.layers.12.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276375] Trainable param: llma.layers.12.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.276387] Trainable param: llma.layers.12.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276396] Trainable param: llma.layers.12.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.276408] Trainable param: llma.layers.12.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276417] Trainable param: llma.layers.12.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276430] Trainable param: llma.layers.12.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276439] Trainable param: llma.layers.12.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.276451] Trainable param: llma.layers.12.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276460] Trainable param: llma.layers.12.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276472] Trainable param: llma.layers.12.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276481] Trainable param: llma.layers.12.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.276493] Trainable param: llma.layers.12.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276504] Trainable param: llma.layers.12.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.276516] Trainable param: llma.layers.13.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.276526] Trainable param: llma.layers.13.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.276538] Trainable param: llma.layers.13.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: 
True, dtype: torch.bfloat16
+[19:27:29.276547] Trainable param: llma.layers.13.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.276559] Trainable param: llma.layers.13.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276568] Trainable param: llma.layers.13.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.276580] Trainable param: llma.layers.13.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276590] Trainable param: llma.layers.13.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276602] Trainable param: llma.layers.13.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276611] Trainable param: llma.layers.13.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.276623] Trainable param: llma.layers.13.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276633] Trainable param: llma.layers.13.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276644] Trainable param: llma.layers.13.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276654] Trainable param: llma.layers.13.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.276665] Trainable param: llma.layers.13.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276676] Trainable param: llma.layers.13.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276689] Trainable param: llma.layers.14.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276699] Trainable param: llma.layers.14.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.276710] Trainable param: llma.layers.14.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276720] Trainable param: llma.layers.14.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.276731] Trainable param: llma.layers.14.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276741] Trainable param: llma.layers.14.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.276752] Trainable param: llma.layers.14.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276762] Trainable param: llma.layers.14.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276774] Trainable param: llma.layers.14.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276784] Trainable param: llma.layers.14.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.276795] Trainable param: llma.layers.14.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276805] Trainable param: llma.layers.14.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276816] Trainable param: llma.layers.14.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276826] Trainable param: llma.layers.14.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.276837] Trainable param: llma.layers.14.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276848] Trainable param: llma.layers.14.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276861] Trainable param: llma.layers.15.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276871] Trainable param: llma.layers.15.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.276882] Trainable param: llma.layers.15.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276892] Trainable param: llma.layers.15.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.276903] Trainable param: llma.layers.15.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276913] Trainable param: llma.layers.15.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.276924] Trainable param: llma.layers.15.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276934] Trainable param: llma.layers.15.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276947] Trainable param: llma.layers.15.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276956] Trainable param: llma.layers.15.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.276968] Trainable param: llma.layers.15.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276977] Trainable param: llma.layers.15.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.276989] Trainable param: llma.layers.15.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.276998] Trainable param: llma.layers.15.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277009] Trainable param: llma.layers.15.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277020] Trainable param: llma.layers.15.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277033] Trainable param: llma.layers.16.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277043] Trainable param: llma.layers.16.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.277055] Trainable param: llma.layers.16.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277065] Trainable param: llma.layers.16.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277076] Trainable param: llma.layers.16.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277086] Trainable param: llma.layers.16.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277097] Trainable param: llma.layers.16.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277107] Trainable param: llma.layers.16.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277119] Trainable param: llma.layers.16.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277129] Trainable param: llma.layers.16.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277140] Trainable param: llma.layers.16.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277150] Trainable param: llma.layers.16.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277161] Trainable param: llma.layers.16.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277171] Trainable param: llma.layers.16.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277182] Trainable param: llma.layers.16.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277193] Trainable param: llma.layers.16.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277206] Trainable param: llma.layers.17.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277216] Trainable param: llma.layers.17.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.277227] Trainable param: llma.layers.17.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277237] Trainable param: llma.layers.17.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277248] Trainable param: llma.layers.17.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277258] Trainable param: llma.layers.17.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277269] Trainable param: llma.layers.17.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277279] Trainable param: llma.layers.17.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277291] Trainable param: llma.layers.17.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277301] Trainable param: llma.layers.17.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277312] Trainable param: llma.layers.17.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277322] Trainable param: llma.layers.17.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277333] Trainable param: llma.layers.17.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277343] Trainable param: llma.layers.17.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277355] Trainable param: llma.layers.17.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277365] Trainable param: llma.layers.17.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277378] Trainable param: llma.layers.18.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277388] Trainable param: llma.layers.18.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.277400] Trainable param: llma.layers.18.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277409] Trainable param: llma.layers.18.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277421] Trainable param: llma.layers.18.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277431] Trainable param: llma.layers.18.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277442] Trainable param: llma.layers.18.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277452] Trainable param: llma.layers.18.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277464] Trainable param: llma.layers.18.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277474] Trainable param: llma.layers.18.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277485] Trainable param: llma.layers.18.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277495] Trainable param: llma.layers.18.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277506] Trainable param: llma.layers.18.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277521] Trainable param: llma.layers.18.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277532] Trainable param: llma.layers.18.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277543] Trainable param: llma.layers.18.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277556] Trainable param: llma.layers.19.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277570] Trainable param: llma.layers.19.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.277581] Trainable param: llma.layers.19.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277591] Trainable param: llma.layers.19.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277603] Trainable param: llma.layers.19.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277613] Trainable param: llma.layers.19.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277624] Trainable param: llma.layers.19.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277634] Trainable param: llma.layers.19.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277646] Trainable param: llma.layers.19.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277656] Trainable param: llma.layers.19.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277667] Trainable param: llma.layers.19.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277677] Trainable param: llma.layers.19.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277688] Trainable param: llma.layers.19.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277698] Trainable param: llma.layers.19.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277709] Trainable param: llma.layers.19.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277720] Trainable param: llma.layers.19.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277733] Trainable param: llma.layers.20.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277743] Trainable param: llma.layers.20.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.277754] Trainable param: llma.layers.20.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277764] Trainable param: llma.layers.20.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277775] Trainable param: llma.layers.20.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277785] Trainable param: llma.layers.20.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277796] Trainable param: llma.layers.20.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277806] Trainable param: llma.layers.20.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277818] Trainable param: llma.layers.20.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277828] Trainable param: llma.layers.20.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277839] Trainable param: llma.layers.20.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277849] Trainable param: llma.layers.20.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277861] Trainable param: llma.layers.20.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277870] Trainable param: llma.layers.20.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.277881] Trainable param: llma.layers.20.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277892] Trainable param: llma.layers.20.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277905] Trainable param: llma.layers.21.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277915] Trainable param: llma.layers.21.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.277926] Trainable param: llma.layers.21.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277936] Trainable param: llma.layers.21.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277947] Trainable param: llma.layers.21.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277957] Trainable param: llma.layers.21.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.277968] Trainable param: llma.layers.21.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.277978] Trainable param: llma.layers.21.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.277990] Trainable param: llma.layers.21.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278000] Trainable param: llma.layers.21.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278011] Trainable param: llma.layers.21.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278021] Trainable param: llma.layers.21.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278032] Trainable param: llma.layers.21.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278042] Trainable param: llma.layers.21.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278053] Trainable param: llma.layers.21.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278064] Trainable param: llma.layers.21.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278077] Trainable param: llma.layers.22.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278086] Trainable param: llma.layers.22.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.278098] Trainable param: llma.layers.22.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278107] Trainable param: llma.layers.22.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278118] Trainable param: llma.layers.22.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278128] Trainable param: llma.layers.22.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278139] Trainable param: llma.layers.22.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278149] Trainable param: llma.layers.22.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278161] Trainable param: llma.layers.22.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278171] Trainable param: llma.layers.22.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278182] Trainable param: llma.layers.22.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278192] Trainable param: llma.layers.22.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278203] Trainable param: llma.layers.22.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278213] Trainable param: llma.layers.22.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278224] Trainable param: llma.layers.22.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278235] Trainable param: llma.layers.22.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278248] Trainable param: llma.layers.23.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278258] Trainable param: llma.layers.23.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.278269] Trainable param: llma.layers.23.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278279] Trainable param: llma.layers.23.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278290] Trainable param: llma.layers.23.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278300] Trainable param: llma.layers.23.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278311] Trainable param: llma.layers.23.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278321] Trainable param: llma.layers.23.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278333] Trainable param: llma.layers.23.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278343] Trainable param: llma.layers.23.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278354] Trainable param: llma.layers.23.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278364] Trainable param: llma.layers.23.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278375] Trainable param: llma.layers.23.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278385] Trainable param: llma.layers.23.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278396] Trainable param: llma.layers.23.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278407] Trainable param: llma.layers.23.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278420] Trainable param: llma.layers.24.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278429] Trainable param: llma.layers.24.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.278441] Trainable param: llma.layers.24.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278451] Trainable param: llma.layers.24.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278462] Trainable param: llma.layers.24.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278472] Trainable param: llma.layers.24.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278483] Trainable param: llma.layers.24.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278493] Trainable param: llma.layers.24.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278505] Trainable param: llma.layers.24.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278515] Trainable param: llma.layers.24.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278526] Trainable param: llma.layers.24.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278536] Trainable param: llma.layers.24.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278547] Trainable param: llma.layers.24.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278557] Trainable param: llma.layers.24.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278568] Trainable param: llma.layers.24.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278579] Trainable param: llma.layers.24.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278595] Trainable param: llma.layers.25.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278605] Trainable param: llma.layers.25.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.278616] Trainable param: llma.layers.25.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278626] Trainable param: llma.layers.25.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278637] Trainable param: llma.layers.25.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278647] Trainable param: llma.layers.25.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278658] Trainable param: llma.layers.25.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278668] Trainable param: llma.layers.25.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278680] Trainable param: llma.layers.25.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278690] Trainable param: llma.layers.25.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278701] Trainable param: llma.layers.25.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278711] Trainable param: llma.layers.25.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278722] Trainable param: llma.layers.25.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278732] Trainable param: llma.layers.25.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278743] Trainable param: llma.layers.25.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278754] Trainable param: llma.layers.25.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278767] Trainable param: llma.layers.26.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278776] Trainable param: llma.layers.26.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.278788] Trainable param: llma.layers.26.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278797] Trainable param: llma.layers.26.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278809] Trainable param: llma.layers.26.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278819] Trainable param: llma.layers.26.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278830] Trainable param: llma.layers.26.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278840] Trainable param: llma.layers.26.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278851] Trainable param: llma.layers.26.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278861] Trainable param: llma.layers.26.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278872] Trainable param: llma.layers.26.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278882] Trainable param: llma.layers.26.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278893] Trainable param: llma.layers.26.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278903] Trainable param: llma.layers.26.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.278915] Trainable param: llma.layers.26.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278925] Trainable param: llma.layers.26.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.278938] Trainable param: llma.layers.27.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278948] Trainable param: llma.layers.27.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.278959] Trainable param: llma.layers.27.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278969] Trainable param: llma.layers.27.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.278981] Trainable param: llma.layers.27.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.278990] Trainable param: llma.layers.27.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279002] Trainable param: llma.layers.27.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279012] Trainable param: llma.layers.27.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279024] Trainable param: llma.layers.27.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279034] Trainable param: llma.layers.27.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279045] Trainable param: llma.layers.27.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279055] Trainable param: llma.layers.27.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279066] Trainable param: llma.layers.27.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279076] Trainable param: llma.layers.27.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279087] Trainable param: llma.layers.27.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279098] Trainable param: llma.layers.27.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279110] Trainable param: llma.layers.28.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279120] Trainable param: llma.layers.28.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.279131] Trainable param: llma.layers.28.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279141] Trainable param: llma.layers.28.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279153] Trainable param: llma.layers.28.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279163] Trainable param: llma.layers.28.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279174] Trainable param: llma.layers.28.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279184] Trainable param: llma.layers.28.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279196] Trainable param: llma.layers.28.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279206] Trainable param: llma.layers.28.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279217] Trainable param: llma.layers.28.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279227] Trainable param: llma.layers.28.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279238] Trainable param: llma.layers.28.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279248] Trainable param: llma.layers.28.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279259] Trainable param: llma.layers.28.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279270] Trainable param: llma.layers.28.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279283] Trainable param: llma.layers.29.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279293] Trainable param: llma.layers.29.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.279304] Trainable param: llma.layers.29.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279314] Trainable param: llma.layers.29.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279325] Trainable param: llma.layers.29.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279335] Trainable param: llma.layers.29.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279346] Trainable param: llma.layers.29.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279356] Trainable param: llma.layers.29.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279368] Trainable param: llma.layers.29.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279378] Trainable param: llma.layers.29.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279389] Trainable param: llma.layers.29.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279399] Trainable param: llma.layers.29.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279410] Trainable param: llma.layers.29.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279420] Trainable param: llma.layers.29.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279431] Trainable param: llma.layers.29.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279442] Trainable param: llma.layers.29.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279455] Trainable param: llma.layers.30.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279465] Trainable param: llma.layers.30.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.279476] Trainable param: llma.layers.30.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279486] Trainable param: llma.layers.30.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279497] Trainable param: llma.layers.30.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279507] Trainable param: llma.layers.30.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279518] Trainable param: llma.layers.30.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279527] Trainable param: llma.layers.30.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279539] Trainable param: llma.layers.30.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279549] Trainable param: llma.layers.30.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279561] Trainable param: llma.layers.30.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279570] Trainable param: llma.layers.30.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279582] Trainable param: llma.layers.30.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279591] Trainable param: llma.layers.30.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279603] Trainable param: llma.layers.30.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279613] Trainable param: llma.layers.30.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279626] Trainable param: llma.layers.31.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279636] Trainable param: llma.layers.31.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.279648] Trainable param: llma.layers.31.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279657] Trainable param: llma.layers.31.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279669] Trainable param: llma.layers.31.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279678] Trainable param: llma.layers.31.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279690] Trainable param: llma.layers.31.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279700] Trainable param: llma.layers.31.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279712] Trainable param: llma.layers.31.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279722] Trainable param: llma.layers.31.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279733] Trainable param: llma.layers.31.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279743] Trainable param: llma.layers.31.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279754] Trainable param: llma.layers.31.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279764] Trainable param: llma.layers.31.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279775] Trainable param: llma.layers.31.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279786] Trainable param: llma.layers.31.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279799] Trainable param: llma.layers.32.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279809] Trainable param: llma.layers.32.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.279821] Trainable param: llma.layers.32.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279830] Trainable param: llma.layers.32.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279842] Trainable param: llma.layers.32.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279852] Trainable param: llma.layers.32.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.279863] Trainable param: llma.layers.32.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279873] Trainable param: llma.layers.32.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279885] Trainable param: llma.layers.32.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279895] Trainable param: llma.layers.32.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279906] Trainable param: llma.layers.32.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279916] Trainable param: llma.layers.32.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279927] Trainable param: llma.layers.32.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279937] Trainable param: llma.layers.32.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.279948] Trainable param: llma.layers.32.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279959] Trainable param: llma.layers.32.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.279972] Trainable param: llma.layers.33.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.279982] Trainable param: llma.layers.33.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.279993] Trainable param: llma.layers.33.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280003] Trainable param: llma.layers.33.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280014] Trainable param: llma.layers.33.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280024] Trainable param: llma.layers.33.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280035] Trainable param: llma.layers.33.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280045] Trainable param: llma.layers.33.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280057] Trainable param: llma.layers.33.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280067] Trainable param: llma.layers.33.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280079] Trainable param: llma.layers.33.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280088] Trainable param: llma.layers.33.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280099] Trainable param: llma.layers.33.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280109] Trainable param: llma.layers.33.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280120] Trainable param: llma.layers.33.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280131] Trainable param: llma.layers.33.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280144] Trainable param: llma.layers.34.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280154] Trainable param: llma.layers.34.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.280165] Trainable param: llma.layers.34.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280175] Trainable param: llma.layers.34.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280186] Trainable param: llma.layers.34.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280196] Trainable param: llma.layers.34.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280207] Trainable param: llma.layers.34.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280217] Trainable param: llma.layers.34.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280229] Trainable param: llma.layers.34.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280239] Trainable param: llma.layers.34.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280250] Trainable param: llma.layers.34.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280260] Trainable param: llma.layers.34.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280271] Trainable param: llma.layers.34.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280281] Trainable param: llma.layers.34.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280293] Trainable param: llma.layers.34.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280304] Trainable param: llma.layers.34.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280316] Trainable param: llma.layers.35.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280326] Trainable param: llma.layers.35.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.280338] Trainable param: llma.layers.35.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280348] Trainable param: llma.layers.35.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280359] Trainable param: llma.layers.35.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280369] Trainable param: llma.layers.35.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280380] Trainable param: llma.layers.35.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280389] Trainable param: llma.layers.35.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280401] Trainable param: llma.layers.35.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280411] Trainable param: llma.layers.35.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280423] Trainable param: llma.layers.35.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280432] Trainable param: llma.layers.35.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280443] Trainable param: llma.layers.35.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280453] Trainable param: llma.layers.35.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280464] Trainable param: llma.layers.35.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280475] Trainable param: llma.layers.35.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280488] Trainable param: llma.layers.36.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280498] Trainable param: llma.layers.36.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.280509] Trainable param: llma.layers.36.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280519] Trainable param: llma.layers.36.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280530] Trainable param: llma.layers.36.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280540] Trainable param: llma.layers.36.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280551] Trainable param: llma.layers.36.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280561] Trainable param: llma.layers.36.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280573] Trainable param: llma.layers.36.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280583] Trainable param: llma.layers.36.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280594] Trainable param: llma.layers.36.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280604] Trainable param: llma.layers.36.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280615] Trainable param: llma.layers.36.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280625] Trainable param: llma.layers.36.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280636] Trainable param: llma.layers.36.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280647] Trainable param: llma.layers.36.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280660] Trainable param: llma.layers.37.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280669] Trainable param: llma.layers.37.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.280681] Trainable param: llma.layers.37.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280691] Trainable param: llma.layers.37.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280702] Trainable param: llma.layers.37.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280712] Trainable param: llma.layers.37.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280723] Trainable param: llma.layers.37.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280733] Trainable param: llma.layers.37.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280745] Trainable param: llma.layers.37.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280755] Trainable param: llma.layers.37.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280766] Trainable param: llma.layers.37.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280776] Trainable param: llma.layers.37.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280787] Trainable param: llma.layers.37.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280797] Trainable param: llma.layers.37.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280808] Trainable param: llma.layers.37.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280818] Trainable param: llma.layers.37.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280830] Trainable param: llma.layers.38.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280839] Trainable param: llma.layers.38.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.280849] Trainable param: llma.layers.38.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280858] Trainable param: llma.layers.38.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280868] Trainable param: llma.layers.38.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280877] Trainable param: llma.layers.38.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.280888] Trainable param: llma.layers.38.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280897] Trainable param: llma.layers.38.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280908] Trainable param: llma.layers.38.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280917] Trainable param: llma.layers.38.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280932] Trainable param: llma.layers.38.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280942] Trainable param: llma.layers.38.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280952] Trainable param: llma.layers.38.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.280961] Trainable param: llma.layers.38.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.280971] Trainable param: llma.layers.38.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280981] Trainable param: llma.layers.38.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.280993] Trainable param: llma.layers.39.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281002] Trainable param: llma.layers.39.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.281013] Trainable param: llma.layers.39.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281022] Trainable param: llma.layers.39.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281032] Trainable param: llma.layers.39.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281041] Trainable param: llma.layers.39.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281051] Trainable param: llma.layers.39.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281060] Trainable param: llma.layers.39.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281072] Trainable param: llma.layers.39.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281081] Trainable param: llma.layers.39.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.281093] Trainable param: llma.layers.39.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281102] Trainable param: llma.layers.39.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281112] Trainable param: llma.layers.39.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281121] Trainable param: llma.layers.39.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.281131] Trainable param: llma.layers.39.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281141] Trainable param: llma.layers.39.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281153] Trainable param: llma.layers.40.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281162] Trainable param: llma.layers.40.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.281172] Trainable param: llma.layers.40.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281181] Trainable param: llma.layers.40.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281192] Trainable param: llma.layers.40.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281201] Trainable param: llma.layers.40.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281211] Trainable param: llma.layers.40.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281220] Trainable param: llma.layers.40.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281231] Trainable param: llma.layers.40.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281240] Trainable param: llma.layers.40.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.281250] Trainable param: llma.layers.40.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281259] Trainable param: llma.layers.40.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281270] Trainable param: llma.layers.40.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281279] Trainable param: llma.layers.40.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.281289] Trainable param: llma.layers.40.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281299] Trainable param: llma.layers.40.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281311] Trainable param: llma.layers.41.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281320] Trainable param: llma.layers.41.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.281330] Trainable param: llma.layers.41.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281339] Trainable param: llma.layers.41.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281349] Trainable param: llma.layers.41.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281359] Trainable param: llma.layers.41.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281370] Trainable param: llma.layers.41.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281379] Trainable param: llma.layers.41.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281390] Trainable param: llma.layers.41.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281399] Trainable param: llma.layers.41.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.281409] Trainable param: llma.layers.41.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281418] Trainable param: llma.layers.41.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281428] Trainable param: llma.layers.41.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281437] Trainable param: llma.layers.41.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32
+[19:27:29.281448] Trainable param: llma.layers.41.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281458] Trainable param: llma.layers.41.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281470] Trainable param: llma.layers.42.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281479] Trainable param: llma.layers.42.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32
+[19:27:29.281489] Trainable param: llma.layers.42.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281498] Trainable param: llma.layers.42.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281508] Trainable param: llma.layers.42.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281521] Trainable param: llma.layers.42.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32
+[19:27:29.281531] Trainable param: llma.layers.42.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16
+[19:27:29.281540] Trainable param: llma.layers.42.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32
+[19:27:29.281551] Trainable param: llma.layers.42.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True,
dtype: torch.bfloat16 +[19:27:29.281560] Trainable param: llma.layers.42.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.281571] Trainable param: llma.layers.42.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281580] Trainable param: llma.layers.42.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281590] Trainable param: llma.layers.42.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281599] Trainable param: llma.layers.42.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.281609] Trainable param: llma.layers.42.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281620] Trainable param: llma.layers.42.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281632] Trainable param: llma.layers.43.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281641] Trainable param: llma.layers.43.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.281652] Trainable param: llma.layers.43.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281661] Trainable param: llma.layers.43.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.281671] Trainable param: llma.layers.43.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281680] Trainable param: llma.layers.43.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.281690] Trainable param: llma.layers.43.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281699] Trainable param: llma.layers.43.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281710] Trainable param: llma.layers.43.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281719] Trainable param: llma.layers.43.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.281730] Trainable param: llma.layers.43.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281739] Trainable param: llma.layers.43.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281749] Trainable param: llma.layers.43.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281758] Trainable param: llma.layers.43.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.281769] Trainable param: llma.layers.43.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281779] Trainable param: llma.layers.43.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281791] Trainable param: 
llma.layers.44.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281800] Trainable param: llma.layers.44.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.281810] Trainable param: llma.layers.44.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281819] Trainable param: llma.layers.44.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.281829] Trainable param: llma.layers.44.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281838] Trainable param: llma.layers.44.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.281848] Trainable param: llma.layers.44.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281857] Trainable param: llma.layers.44.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281869] Trainable param: llma.layers.44.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281878] Trainable param: llma.layers.44.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.281888] Trainable param: llma.layers.44.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281897] Trainable param: llma.layers.44.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281908] Trainable param: llma.layers.44.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281916] Trainable param: llma.layers.44.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.281927] Trainable param: llma.layers.44.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281936] Trainable param: llma.layers.44.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.281948] Trainable param: llma.layers.45.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281957] Trainable param: llma.layers.45.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.281967] Trainable param: llma.layers.45.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281976] Trainable param: llma.layers.45.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.281987] Trainable param: llma.layers.45.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.281996] Trainable param: llma.layers.45.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282006] Trainable param: llma.layers.45.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282015] Trainable param: llma.layers.45.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, 
dtype: torch.float32 +[19:27:29.282026] Trainable param: llma.layers.45.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282035] Trainable param: llma.layers.45.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282045] Trainable param: llma.layers.45.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282054] Trainable param: llma.layers.45.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282065] Trainable param: llma.layers.45.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282073] Trainable param: llma.layers.45.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282084] Trainable param: llma.layers.45.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282094] Trainable param: llma.layers.45.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282106] Trainable param: llma.layers.46.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282115] Trainable param: llma.layers.46.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.282125] Trainable param: llma.layers.46.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282134] Trainable param: llma.layers.46.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282144] Trainable param: llma.layers.46.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282153] Trainable param: llma.layers.46.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282164] Trainable param: llma.layers.46.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282173] Trainable param: llma.layers.46.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282184] Trainable param: llma.layers.46.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282193] Trainable param: llma.layers.46.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282203] Trainable param: llma.layers.46.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282212] Trainable param: llma.layers.46.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282223] Trainable param: llma.layers.46.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282232] Trainable param: llma.layers.46.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282242] Trainable param: llma.layers.46.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282253] Trainable param: 
llma.layers.46.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282272] Trainable param: llma.layers.47.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282286] Trainable param: llma.layers.47.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.282302] Trainable param: llma.layers.47.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282316] Trainable param: llma.layers.47.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282333] Trainable param: llma.layers.47.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282347] Trainable param: llma.layers.47.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282363] Trainable param: llma.layers.47.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282378] Trainable param: llma.layers.47.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282396] Trainable param: llma.layers.47.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282411] Trainable param: llma.layers.47.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282428] Trainable param: llma.layers.47.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282443] Trainable param: llma.layers.47.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282460] Trainable param: llma.layers.47.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282475] Trainable param: llma.layers.47.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282492] Trainable param: llma.layers.47.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282509] Trainable param: llma.layers.47.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282530] Trainable param: llma.layers.48.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282546] Trainable param: llma.layers.48.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.282564] Trainable param: llma.layers.48.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282579] Trainable param: llma.layers.48.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282597] Trainable param: llma.layers.48.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282613] Trainable param: llma.layers.48.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282630] Trainable param: llma.layers.48.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, 
dtype: torch.bfloat16 +[19:27:29.282645] Trainable param: llma.layers.48.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282665] Trainable param: llma.layers.48.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282681] Trainable param: llma.layers.48.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282699] Trainable param: llma.layers.48.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282714] Trainable param: llma.layers.48.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282732] Trainable param: llma.layers.48.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282747] Trainable param: llma.layers.48.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282766] Trainable param: llma.layers.48.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282783] Trainable param: llma.layers.48.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282805] Trainable param: llma.layers.49.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282821] Trainable param: llma.layers.49.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.282839] Trainable param: llma.layers.49.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282854] Trainable param: llma.layers.49.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282871] Trainable param: llma.layers.49.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282885] Trainable param: llma.layers.49.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.282901] Trainable param: llma.layers.49.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282916] Trainable param: llma.layers.49.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282934] Trainable param: llma.layers.49.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282949] Trainable param: llma.layers.49.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.282966] Trainable param: llma.layers.49.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.282980] Trainable param: llma.layers.49.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.282997] Trainable param: llma.layers.49.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283012] Trainable param: llma.layers.49.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.283028] Trainable param: 
llma.layers.49.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283043] Trainable param: llma.layers.49.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283063] Trainable param: llma.layers.50.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283078] Trainable param: llma.layers.50.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.283093] Trainable param: llma.layers.50.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283107] Trainable param: llma.layers.50.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.283123] Trainable param: llma.layers.50.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283138] Trainable param: llma.layers.50.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.283155] Trainable param: llma.layers.50.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283170] Trainable param: llma.layers.50.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283188] Trainable param: llma.layers.50.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283203] Trainable param: llma.layers.50.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.283221] Trainable param: llma.layers.50.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283237] Trainable param: llma.layers.50.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283254] Trainable param: llma.layers.50.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283270] Trainable param: llma.layers.50.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.283288] Trainable param: llma.layers.50.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283306] Trainable param: llma.layers.50.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283327] Trainable param: llma.layers.51.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283343] Trainable param: llma.layers.51.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.283362] Trainable param: llma.layers.51.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283377] Trainable param: llma.layers.51.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.283395] Trainable param: llma.layers.51.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283410] Trainable param: llma.layers.51.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: 
torch.float32 +[19:27:29.283428] Trainable param: llma.layers.51.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283444] Trainable param: llma.layers.51.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283464] Trainable param: llma.layers.51.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283479] Trainable param: llma.layers.51.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.283498] Trainable param: llma.layers.51.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283514] Trainable param: llma.layers.51.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283532] Trainable param: llma.layers.51.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283548] Trainable param: llma.layers.51.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.283566] Trainable param: llma.layers.51.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283583] Trainable param: llma.layers.51.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283605] Trainable param: llma.layers.52.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283621] Trainable param: llma.layers.52.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.283639] Trainable param: llma.layers.52.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283654] Trainable param: llma.layers.52.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.283673] Trainable param: llma.layers.52.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283688] Trainable param: llma.layers.52.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.283704] Trainable param: llma.layers.52.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283720] Trainable param: llma.layers.52.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283740] Trainable param: llma.layers.52.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283756] Trainable param: llma.layers.52.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.283773] Trainable param: llma.layers.52.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283789] Trainable param: llma.layers.52.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283807] Trainable param: llma.layers.52.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283823] Trainable param: 
llma.layers.52.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.283841] Trainable param: llma.layers.52.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283859] Trainable param: llma.layers.52.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.283881] Trainable param: llma.layers.53.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283897] Trainable param: llma.layers.53.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.283915] Trainable param: llma.layers.53.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283931] Trainable param: llma.layers.53.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.283949] Trainable param: llma.layers.53.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283964] Trainable param: llma.layers.53.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.283981] Trainable param: llma.layers.53.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.283996] Trainable param: llma.layers.53.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284014] Trainable param: llma.layers.53.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284029] Trainable param: llma.layers.53.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284047] Trainable param: llma.layers.53.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284062] Trainable param: llma.layers.53.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284079] Trainable param: llma.layers.53.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284094] Trainable param: llma.layers.53.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284111] Trainable param: llma.layers.53.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284128] Trainable param: llma.layers.53.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284153] Trainable param: llma.layers.54.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284169] Trainable param: llma.layers.54.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.284188] Trainable param: llma.layers.54.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284204] Trainable param: llma.layers.54.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.284223] Trainable param: llma.layers.54.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, 
dtype: torch.bfloat16 +[19:27:29.284240] Trainable param: llma.layers.54.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.284258] Trainable param: llma.layers.54.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284274] Trainable param: llma.layers.54.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284294] Trainable param: llma.layers.54.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284310] Trainable param: llma.layers.54.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284328] Trainable param: llma.layers.54.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284344] Trainable param: llma.layers.54.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284362] Trainable param: llma.layers.54.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284378] Trainable param: llma.layers.54.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284396] Trainable param: llma.layers.54.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284413] Trainable param: llma.layers.54.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284434] Trainable param: llma.layers.55.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284449] Trainable param: llma.layers.55.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.284467] Trainable param: llma.layers.55.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284482] Trainable param: llma.layers.55.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.284501] Trainable param: llma.layers.55.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284518] Trainable param: llma.layers.55.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.284537] Trainable param: llma.layers.55.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284555] Trainable param: llma.layers.55.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284575] Trainable param: llma.layers.55.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284592] Trainable param: llma.layers.55.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284611] Trainable param: llma.layers.55.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284628] Trainable param: llma.layers.55.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284647] Trainable param: 
llma.layers.55.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284663] Trainable param: llma.layers.55.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284681] Trainable param: llma.layers.55.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284701] Trainable param: llma.layers.55.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284725] Trainable param: llma.layers.56.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284742] Trainable param: llma.layers.56.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.284762] Trainable param: llma.layers.56.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284779] Trainable param: llma.layers.56.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.284797] Trainable param: llma.layers.56.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284813] Trainable param: llma.layers.56.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.284832] Trainable param: llma.layers.56.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284847] Trainable param: llma.layers.56.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284868] Trainable param: llma.layers.56.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284884] Trainable param: llma.layers.56.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284903] Trainable param: llma.layers.56.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284920] Trainable param: llma.layers.56.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284939] Trainable param: llma.layers.56.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.284955] Trainable param: llma.layers.56.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.284974] Trainable param: llma.layers.56.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.284992] Trainable param: llma.layers.56.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285015] Trainable param: llma.layers.57.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285032] Trainable param: llma.layers.57.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.285050] Trainable param: llma.layers.57.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285067] Trainable param: llma.layers.57.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, 
dtype: torch.float32 +[19:27:29.285084] Trainable param: llma.layers.57.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285100] Trainable param: llma.layers.57.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.285117] Trainable param: llma.layers.57.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285132] Trainable param: llma.layers.57.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285151] Trainable param: llma.layers.57.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285166] Trainable param: llma.layers.57.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.285184] Trainable param: llma.layers.57.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285199] Trainable param: llma.layers.57.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285216] Trainable param: llma.layers.57.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285231] Trainable param: llma.layers.57.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.285247] Trainable param: llma.layers.57.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285263] Trainable param: llma.layers.57.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285282] Trainable param: llma.layers.58.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285297] Trainable param: llma.layers.58.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.285314] Trainable param: llma.layers.58.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285328] Trainable param: llma.layers.58.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.285345] Trainable param: llma.layers.58.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285359] Trainable param: llma.layers.58.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.285375] Trainable param: llma.layers.58.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285389] Trainable param: llma.layers.58.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285407] Trainable param: llma.layers.58.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285421] Trainable param: llma.layers.58.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.285438] Trainable param: llma.layers.58.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285452] Trainable param: 
llma.layers.58.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285469] Trainable param: llma.layers.58.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285483] Trainable param: llma.layers.58.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.285499] Trainable param: llma.layers.58.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285523] Trainable param: llma.layers.58.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285544] Trainable param: llma.layers.59.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285558] Trainable param: llma.layers.59.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.285574] Trainable param: llma.layers.59.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285589] Trainable param: llma.layers.59.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.285607] Trainable param: llma.layers.59.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285623] Trainable param: llma.layers.59.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.285641] Trainable param: llma.layers.59.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285657] Trainable param: llma.layers.59.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285677] Trainable param: llma.layers.59.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285693] Trainable param: llma.layers.59.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.285711] Trainable param: llma.layers.59.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285727] Trainable param: llma.layers.59.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285745] Trainable param: llma.layers.59.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285761] Trainable param: llma.layers.59.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.285777] Trainable param: llma.layers.59.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285794] Trainable param: llma.layers.59.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285815] Trainable param: llma.layers.60.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285830] Trainable param: llma.layers.60.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.285847] Trainable param: llma.layers.60.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: 
True, dtype: torch.bfloat16 +[19:27:29.285862] Trainable param: llma.layers.60.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.285878] Trainable param: llma.layers.60.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285893] Trainable param: llma.layers.60.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.285910] Trainable param: llma.layers.60.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285925] Trainable param: llma.layers.60.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.285943] Trainable param: llma.layers.60.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285957] Trainable param: llma.layers.60.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.285974] Trainable param: llma.layers.60.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.285989] Trainable param: llma.layers.60.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286005] Trainable param: llma.layers.60.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286019] Trainable param: llma.layers.60.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.286036] Trainable param: llma.layers.60.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286053] Trainable param: llma.layers.60.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286073] Trainable param: llma.layers.61.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286087] Trainable param: llma.layers.61.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.286104] Trainable param: llma.layers.61.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286119] Trainable param: llma.layers.61.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286135] Trainable param: llma.layers.61.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286149] Trainable param: llma.layers.61.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286166] Trainable param: llma.layers.61.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286181] Trainable param: llma.layers.61.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286200] Trainable param: llma.layers.61.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286215] Trainable param: llma.layers.61.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.286232] Trainable param: 
llma.layers.61.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286247] Trainable param: llma.layers.61.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286264] Trainable param: llma.layers.61.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286280] Trainable param: llma.layers.61.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.286297] Trainable param: llma.layers.61.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286314] Trainable param: llma.layers.61.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286335] Trainable param: llma.layers.62.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286350] Trainable param: llma.layers.62.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.286368] Trainable param: llma.layers.62.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286383] Trainable param: llma.layers.62.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286401] Trainable param: llma.layers.62.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286417] Trainable param: llma.layers.62.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286436] Trainable param: llma.layers.62.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286451] Trainable param: llma.layers.62.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286470] Trainable param: llma.layers.62.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286486] Trainable param: llma.layers.62.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.286503] Trainable param: llma.layers.62.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286519] Trainable param: llma.layers.62.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286536] Trainable param: llma.layers.62.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286550] Trainable param: llma.layers.62.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.286567] Trainable param: llma.layers.62.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286584] Trainable param: llma.layers.62.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286605] Trainable param: llma.layers.63.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286620] Trainable param: llma.layers.63.attention.wq.bias, local_size: torch.Size([8192]), 
model_parallel: True, dtype: torch.float32 +[19:27:29.286638] Trainable param: llma.layers.63.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286653] Trainable param: llma.layers.63.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286671] Trainable param: llma.layers.63.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286686] Trainable param: llma.layers.63.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286704] Trainable param: llma.layers.63.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286720] Trainable param: llma.layers.63.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286739] Trainable param: llma.layers.63.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286753] Trainable param: llma.layers.63.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.286769] Trainable param: llma.layers.63.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286783] Trainable param: llma.layers.63.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286801] Trainable param: llma.layers.63.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286815] Trainable param: llma.layers.63.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.286833] Trainable param: llma.layers.63.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286851] Trainable param: llma.layers.63.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.286874] Trainable param: llma.layers.64.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286889] Trainable param: llma.layers.64.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.286907] Trainable param: llma.layers.64.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286922] Trainable param: llma.layers.64.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286939] Trainable param: llma.layers.64.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286954] Trainable param: llma.layers.64.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.286971] Trainable param: llma.layers.64.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.286985] Trainable param: llma.layers.64.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287004] Trainable param: llma.layers.64.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287019] Trainable param: 
llma.layers.64.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287036] Trainable param: llma.layers.64.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287051] Trainable param: llma.layers.64.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287067] Trainable param: llma.layers.64.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287082] Trainable param: llma.layers.64.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287099] Trainable param: llma.layers.64.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287115] Trainable param: llma.layers.64.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287136] Trainable param: llma.layers.65.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287151] Trainable param: llma.layers.65.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.287168] Trainable param: llma.layers.65.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287183] Trainable param: llma.layers.65.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.287199] Trainable param: llma.layers.65.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287214] Trainable param: llma.layers.65.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.287230] Trainable param: llma.layers.65.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287245] Trainable param: llma.layers.65.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287263] Trainable param: llma.layers.65.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287277] Trainable param: llma.layers.65.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287293] Trainable param: llma.layers.65.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287308] Trainable param: llma.layers.65.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287324] Trainable param: llma.layers.65.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287338] Trainable param: llma.layers.65.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287354] Trainable param: llma.layers.65.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287369] Trainable param: llma.layers.65.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287389] Trainable param: llma.layers.66.attention.wq.weight, local_size: torch.Size([8192, 8192]), 
model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287404] Trainable param: llma.layers.66.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.287421] Trainable param: llma.layers.66.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287435] Trainable param: llma.layers.66.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.287452] Trainable param: llma.layers.66.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287467] Trainable param: llma.layers.66.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.287484] Trainable param: llma.layers.66.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287499] Trainable param: llma.layers.66.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287518] Trainable param: llma.layers.66.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287532] Trainable param: llma.layers.66.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287549] Trainable param: llma.layers.66.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287564] Trainable param: llma.layers.66.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287581] Trainable param: llma.layers.66.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287596] Trainable param: llma.layers.66.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287613] Trainable param: llma.layers.66.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287629] Trainable param: llma.layers.66.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287649] Trainable param: llma.layers.67.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287663] Trainable param: llma.layers.67.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.287680] Trainable param: llma.layers.67.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287695] Trainable param: llma.layers.67.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.287711] Trainable param: llma.layers.67.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287725] Trainable param: llma.layers.67.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.287742] Trainable param: llma.layers.67.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287757] Trainable param: llma.layers.67.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287775] Trainable param: 
llma.layers.67.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287788] Trainable param: llma.layers.67.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287804] Trainable param: llma.layers.67.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287819] Trainable param: llma.layers.67.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287838] Trainable param: llma.layers.67.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287854] Trainable param: llma.layers.67.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.287873] Trainable param: llma.layers.67.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287892] Trainable param: llma.layers.67.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.287914] Trainable param: llma.layers.68.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287931] Trainable param: llma.layers.68.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.287950] Trainable param: llma.layers.68.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.287965] Trainable param: llma.layers.68.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.287984] Trainable param: llma.layers.68.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288000] Trainable param: llma.layers.68.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.288018] Trainable param: llma.layers.68.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288033] Trainable param: llma.layers.68.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288053] Trainable param: llma.layers.68.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288069] Trainable param: llma.layers.68.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288088] Trainable param: llma.layers.68.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288104] Trainable param: llma.layers.68.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288122] Trainable param: llma.layers.68.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288138] Trainable param: llma.layers.68.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288156] Trainable param: llma.layers.68.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288173] Trainable param: llma.layers.68.ffn_norm.weight, local_size: torch.Size([8192]), 
model_parallel: False, dtype: torch.float32 +[19:27:29.288194] Trainable param: llma.layers.69.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288210] Trainable param: llma.layers.69.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.288226] Trainable param: llma.layers.69.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288241] Trainable param: llma.layers.69.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.288258] Trainable param: llma.layers.69.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288272] Trainable param: llma.layers.69.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.288289] Trainable param: llma.layers.69.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288303] Trainable param: llma.layers.69.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288322] Trainable param: llma.layers.69.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288337] Trainable param: llma.layers.69.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288355] Trainable param: llma.layers.69.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288369] Trainable param: llma.layers.69.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288387] Trainable param: llma.layers.69.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288402] Trainable param: llma.layers.69.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288419] Trainable param: llma.layers.69.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288436] Trainable param: llma.layers.69.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288457] Trainable param: llma.layers.70.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288472] Trainable param: llma.layers.70.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.288489] Trainable param: llma.layers.70.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288504] Trainable param: llma.layers.70.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.288521] Trainable param: llma.layers.70.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288535] Trainable param: llma.layers.70.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.288551] Trainable param: llma.layers.70.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288565] Trainable param: 
llma.layers.70.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288583] Trainable param: llma.layers.70.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288598] Trainable param: llma.layers.70.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288615] Trainable param: llma.layers.70.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288630] Trainable param: llma.layers.70.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288649] Trainable param: llma.layers.70.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288664] Trainable param: llma.layers.70.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288681] Trainable param: llma.layers.70.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288699] Trainable param: llma.layers.70.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288719] Trainable param: llma.layers.71.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288735] Trainable param: llma.layers.71.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.288752] Trainable param: llma.layers.71.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288767] Trainable param: llma.layers.71.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.288785] Trainable param: llma.layers.71.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288800] Trainable param: llma.layers.71.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.288818] Trainable param: llma.layers.71.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288834] Trainable param: llma.layers.71.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288853] Trainable param: llma.layers.71.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288868] Trainable param: llma.layers.71.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288886] Trainable param: llma.layers.71.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288901] Trainable param: llma.layers.71.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288919] Trainable param: llma.layers.71.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.288934] Trainable param: llma.layers.71.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.288951] Trainable param: llma.layers.71.attention_norm.weight, local_size: torch.Size([8192]), 
model_parallel: False, dtype: torch.float32 +[19:27:29.288967] Trainable param: llma.layers.71.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.288989] Trainable param: llma.layers.72.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289005] Trainable param: llma.layers.72.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.289023] Trainable param: llma.layers.72.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289038] Trainable param: llma.layers.72.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289056] Trainable param: llma.layers.72.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289071] Trainable param: llma.layers.72.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289088] Trainable param: llma.layers.72.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289104] Trainable param: llma.layers.72.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289124] Trainable param: llma.layers.72.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289139] Trainable param: llma.layers.72.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.289157] Trainable param: llma.layers.72.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289172] Trainable param: llma.layers.72.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289190] Trainable param: llma.layers.72.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289205] Trainable param: llma.layers.72.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.289222] Trainable param: llma.layers.72.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289238] Trainable param: llma.layers.72.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289259] Trainable param: llma.layers.73.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289274] Trainable param: llma.layers.73.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.289292] Trainable param: llma.layers.73.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289307] Trainable param: llma.layers.73.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289324] Trainable param: llma.layers.73.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289339] Trainable param: llma.layers.73.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289356] Trainable param: 
llma.layers.73.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289370] Trainable param: llma.layers.73.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289389] Trainable param: llma.layers.73.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289404] Trainable param: llma.layers.73.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.289421] Trainable param: llma.layers.73.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289436] Trainable param: llma.layers.73.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289452] Trainable param: llma.layers.73.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289467] Trainable param: llma.layers.73.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.289483] Trainable param: llma.layers.73.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289499] Trainable param: llma.layers.73.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289525] Trainable param: llma.layers.74.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289541] Trainable param: llma.layers.74.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.289559] Trainable param: llma.layers.74.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289573] Trainable param: llma.layers.74.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289590] Trainable param: llma.layers.74.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289604] Trainable param: llma.layers.74.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289621] Trainable param: llma.layers.74.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289635] Trainable param: llma.layers.74.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289653] Trainable param: llma.layers.74.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289667] Trainable param: llma.layers.74.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.289684] Trainable param: llma.layers.74.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289699] Trainable param: llma.layers.74.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289715] Trainable param: llma.layers.74.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289729] Trainable param: llma.layers.74.feed_forward.w3.bias, local_size: torch.Size([28672]), 
model_parallel: True, dtype: torch.float32 +[19:27:29.289745] Trainable param: llma.layers.74.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289761] Trainable param: llma.layers.74.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289782] Trainable param: llma.layers.75.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289797] Trainable param: llma.layers.75.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.289815] Trainable param: llma.layers.75.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289830] Trainable param: llma.layers.75.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289847] Trainable param: llma.layers.75.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289862] Trainable param: llma.layers.75.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.289878] Trainable param: llma.layers.75.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289892] Trainable param: llma.layers.75.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289909] Trainable param: llma.layers.75.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289924] Trainable param: llma.layers.75.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.289942] Trainable param: llma.layers.75.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289957] Trainable param: llma.layers.75.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.289975] Trainable param: llma.layers.75.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.289991] Trainable param: llma.layers.75.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290009] Trainable param: llma.layers.75.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290026] Trainable param: llma.layers.75.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290048] Trainable param: llma.layers.76.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290063] Trainable param: llma.layers.76.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.290081] Trainable param: llma.layers.76.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290097] Trainable param: llma.layers.76.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290115] Trainable param: llma.layers.76.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290130] Trainable param: 
llma.layers.76.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290148] Trainable param: llma.layers.76.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290163] Trainable param: llma.layers.76.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290183] Trainable param: llma.layers.76.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290199] Trainable param: llma.layers.76.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290217] Trainable param: llma.layers.76.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290255] Trainable param: llma.layers.76.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290273] Trainable param: llma.layers.76.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290287] Trainable param: llma.layers.76.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290301] Trainable param: llma.layers.76.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290311] Trainable param: llma.layers.76.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290324] Trainable param: llma.layers.77.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290334] Trainable param: llma.layers.77.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.290345] Trainable param: llma.layers.77.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290354] Trainable param: llma.layers.77.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290364] Trainable param: llma.layers.77.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290373] Trainable param: llma.layers.77.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290384] Trainable param: llma.layers.77.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290393] Trainable param: llma.layers.77.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290405] Trainable param: llma.layers.77.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290414] Trainable param: llma.layers.77.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290425] Trainable param: llma.layers.77.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290434] Trainable param: llma.layers.77.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290444] Trainable param: llma.layers.77.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), 
model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290453] Trainable param: llma.layers.77.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290463] Trainable param: llma.layers.77.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290473] Trainable param: llma.layers.77.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290485] Trainable param: llma.layers.78.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290495] Trainable param: llma.layers.78.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.290506] Trainable param: llma.layers.78.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290515] Trainable param: llma.layers.78.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290525] Trainable param: llma.layers.78.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290534] Trainable param: llma.layers.78.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290545] Trainable param: llma.layers.78.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290554] Trainable param: llma.layers.78.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290565] Trainable param: llma.layers.78.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290574] Trainable param: llma.layers.78.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290584] Trainable param: llma.layers.78.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290593] Trainable param: llma.layers.78.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290603] Trainable param: llma.layers.78.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290612] Trainable param: llma.layers.78.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290623] Trainable param: llma.layers.78.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290633] Trainable param: llma.layers.78.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290645] Trainable param: llma.layers.79.attention.wq.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290654] Trainable param: llma.layers.79.attention.wq.bias, local_size: torch.Size([8192]), model_parallel: True, dtype: torch.float32 +[19:27:29.290665] Trainable param: llma.layers.79.attention.wk.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290675] Trainable param: llma.layers.79.attention.wk.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290686] Trainable param: 
llma.layers.79.attention.wv.weight, local_size: torch.Size([1024, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290695] Trainable param: llma.layers.79.attention.wv.bias, local_size: torch.Size([1024]), model_parallel: True, dtype: torch.float32 +[19:27:29.290705] Trainable param: llma.layers.79.attention.wo.weight, local_size: torch.Size([8192, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290714] Trainable param: llma.layers.79.attention.wo.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290725] Trainable param: llma.layers.79.feed_forward.w1.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290734] Trainable param: llma.layers.79.feed_forward.w1.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290745] Trainable param: llma.layers.79.feed_forward.w2.weight, local_size: torch.Size([8192, 28672]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290754] Trainable param: llma.layers.79.feed_forward.w2.bias, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290764] Trainable param: llma.layers.79.feed_forward.w3.weight, local_size: torch.Size([28672, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290773] Trainable param: llma.layers.79.feed_forward.w3.bias, local_size: torch.Size([28672]), model_parallel: True, dtype: torch.float32 +[19:27:29.290783] Trainable param: llma.layers.79.attention_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290793] Trainable param: llma.layers.79.ffn_norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290804] Trainable param: llma.norm.weight, local_size: torch.Size([8192]), model_parallel: False, dtype: torch.float32 +[19:27:29.290815] Trainable param: llma.output.weight, local_size: torch.Size([32000, 8192]), model_parallel: True, dtype: torch.bfloat16 +[19:27:29.290840] load pretrained from ../checkpoints/llama2/Llama-2-70b/ +[19:27:29.290845] Quantizing model to 4bit! 
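A consistent pattern runs through the parameter inventory above: every weight matrix is bfloat16 and flagged `model_parallel: True`, while biases and the RMSNorm weights are float32, with the attention output and `w2` biases plus the norm weights replicated (`model_parallel: False`). The sizes match Llama-2-70B (hidden width 8192, FFN width 28672, and 1024-wide K/V projections from grouped-query attention). As a rough sketch only (not the repository's actual logging code), a listing like this can be produced by walking `named_parameters()`; the `is_model_parallel` attribute below is a hypothetical stand-in for whatever sharding metadata the tensor-parallel library attaches to its parameters:

```python
import torch

def log_trainable_params(model: torch.nn.Module, log=print) -> None:
    """Print one line per trainable parameter, mimicking the log format above."""
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # frozen parameters are skipped; only trainables are listed
        log(f"Trainable param: {name}, local_size: {param.shape}, "
            f"model_parallel: {getattr(param, 'is_model_parallel', False)}, "
            f"dtype: {param.dtype}")
```

The final two messages then load the Llama-2-70b base weights and announce "Quantizing model to 4bit!", i.e. the bfloat16 weight matrices are converted to 4-bit storage while the float32 biases and norms remain in full precision; that compression is what makes the peak memory reported in the step logs below (max mem: 71357, presumably megabytes, so roughly 71 GB per GPU) plausible for a 70B-parameter model.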
+[19:28:10.394107] Epoch: [0] [1340/6500] lr: 0.000010 closs: 0.8034 (0.8657) grad_norm: 0.5445 (0.8165) time: 5.5771 data: 0.0002 max mem: 71357 +[19:29:06.239565] Epoch: [0] [1350/6500] lr: 0.000010 closs: 0.8034 (0.8652) grad_norm: 0.5277 (0.8142) time: 5.5869 data: 0.0002 max mem: 71357 +[19:30:01.961975] Epoch: [0] [1360/6500] lr: 0.000010 closs: 0.7740 (0.8646) grad_norm: 0.5703 (0.8128) time: 5.5783 data: 0.0002 max mem: 71357 +[19:30:57.763309] Epoch: [0] [1370/6500] lr: 0.000011 closs: 0.7126 (0.8635) grad_norm: 0.5703 (0.8116) time: 5.5761 data: 0.0002 max mem: 71357 +[19:31:53.564694] Epoch: [0] [1380/6500] lr: 0.000011 closs: 0.7537 (0.8634) grad_norm: 0.5703 (0.8103) time: 5.5800 data: 0.0001 max mem: 71357 +[19:32:49.286247] Epoch: [0] [1390/6500] lr: 0.000011 closs: 0.7631 (0.8624) grad_norm: 0.6007 (0.8092) time: 5.5761 data: 0.0001 max mem: 71357 +[19:33:45.056994] Epoch: [0] [1400/6500] lr: 0.000011 closs: 0.8355 (0.8627) grad_norm: 0.5855 (0.8086) time: 5.5745 data: 0.0001 max mem: 71357 +[19:34:40.796559] Epoch: [0] [1410/6500] lr: 0.000011 closs: 0.8545 (0.8626) grad_norm: 0.5738 (0.8066) time: 5.5754 data: 0.0002 max mem: 71357 +[19:35:36.566183] Epoch: [0] [1420/6500] lr: 0.000011 closs: 0.7831 (0.8623) grad_norm: 0.4588 (0.8040) time: 5.5753 data: 0.0002 max mem: 71357 +[19:36:32.355594] Epoch: [0] [1430/6500] lr: 0.000011 closs: 0.8227 (0.8622) grad_norm: 0.5588 (0.8042) time: 5.5778 data: 0.0001 max mem: 71357 +[19:37:28.095893] Epoch: [0] [1440/6500] lr: 0.000011 closs: 0.8106 (0.8617) grad_norm: 0.5302 (0.8031) time: 5.5764 data: 0.0001 max mem: 71357 +[19:38:23.857545] Epoch: [0] [1450/6500] lr: 0.000011 closs: 0.7735 (0.8613) grad_norm: 0.5302 (0.8011) time: 5.5750 data: 0.0001 max mem: 71357 +[19:39:19.647799] Epoch: [0] [1460/6500] lr: 0.000011 closs: 0.8252 (0.8612) grad_norm: 0.5502 (0.8047) time: 5.5775 data: 0.0001 max mem: 71357 +[19:40:15.482957] Epoch: [0] [1470/6500] lr: 0.000011 closs: 0.8374 (0.8608) grad_norm: 0.5301 (0.8027) time: 5.5811 data: 0.0002 max mem: 71357 +[19:41:11.141116] Epoch: [0] [1480/6500] lr: 0.000011 closs: 0.8001 (0.8606) grad_norm: 0.5333 (0.8027) time: 5.5745 data: 0.0002 max mem: 71357 +[19:42:06.984066] Epoch: [0] [1490/6500] lr: 0.000011 closs: 0.7398 (0.8594) grad_norm: 0.5333 (0.8013) time: 5.5749 data: 0.0001 max mem: 71357 +[19:43:02.746693] Epoch: [0] [1500/6500] lr: 0.000012 closs: 0.7381 (0.8590) grad_norm: 0.5309 (0.8012) time: 5.5802 data: 0.0002 max mem: 71357 +[19:43:58.570975] Epoch: [0] [1510/6500] lr: 0.000012 closs: 0.8013 (0.8588) grad_norm: 0.4857 (0.7988) time: 5.5793 data: 0.0001 max mem: 71357 +[19:44:54.319260] Epoch: [0] [1520/6500] lr: 0.000012 closs: 0.8824 (0.8588) grad_norm: 0.5054 (0.7980) time: 5.5786 data: 0.0002 max mem: 71357 +[19:45:50.069233] Epoch: [0] [1530/6500] lr: 0.000012 closs: 0.8063 (0.8585) grad_norm: 0.4937 (0.7963) time: 5.5748 data: 0.0001 max mem: 71357 +[19:46:45.742778] Epoch: [0] [1540/6500] lr: 0.000012 closs: 0.7936 (0.8579) grad_norm: 0.5091 (0.7950) time: 5.5711 data: 0.0001 max mem: 71357 +[19:47:41.512558] Epoch: [0] [1550/6500] lr: 0.000012 closs: 0.7926 (0.8576) grad_norm: 0.5252 (0.7939) time: 5.5720 data: 0.0001 max mem: 71357 +[19:48:37.387361] Epoch: [0] [1560/6500] lr: 0.000012 closs: 0.7592 (0.8569) grad_norm: 0.4907 (0.7924) time: 5.5821 data: 0.0001 max mem: 71357 +[19:49:33.103590] Epoch: [0] [1570/6500] lr: 0.000012 closs: 0.7427 (0.8565) grad_norm: 0.5252 (0.7923) time: 5.5794 data: 0.0001 max mem: 71357 +[19:50:28.953920] Epoch: [0] [1580/6500] lr: 
0.000012 closs: 0.7489 (0.8559) grad_norm: 0.4674 (0.7904) time: 5.5783 data: 0.0001 max mem: 71357 +[19:51:24.796572] Epoch: [0] [1590/6500] lr: 0.000012 closs: 0.7778 (0.8552) grad_norm: 0.4821 (0.7886) time: 5.5845 data: 0.0002 max mem: 71357 +[19:52:20.573169] Epoch: [0] [1600/6500] lr: 0.000012 closs: 0.8162 (0.8553) grad_norm: 0.5151 (0.7882) time: 5.5808 data: 0.0002 max mem: 71357 +[19:53:16.418950] Epoch: [0] [1610/6500] lr: 0.000012 closs: 0.8162 (0.8549) grad_norm: 0.5045 (0.7865) time: 5.5810 data: 0.0001 max mem: 71357 +[19:54:12.129422] Epoch: [0] [1620/6500] lr: 0.000012 closs: 0.7935 (0.8548) grad_norm: 0.5045 (0.7844) time: 5.5777 data: 0.0001 max mem: 71357 +[19:55:07.940492] Epoch: [0] [1630/6500] lr: 0.000013 closs: 0.8191 (0.8547) grad_norm: 0.5045 (0.7841) time: 5.5760 data: 0.0001 max mem: 71357 +[19:56:03.688378] Epoch: [0] [1640/6500] lr: 0.000013 closs: 0.8281 (0.8546) grad_norm: 0.4828 (0.7837) time: 5.5779 data: 0.0001 max mem: 71357 +[19:56:59.568279] Epoch: [0] [1650/6500] lr: 0.000013 closs: 0.7958 (0.8542) grad_norm: 0.4776 (0.7821) time: 5.5813 data: 0.0001 max mem: 71357 +[19:57:55.339672] Epoch: [0] [1660/6500] lr: 0.000013 closs: 0.7472 (0.8538) grad_norm: 0.6027 (0.7811) time: 5.5825 data: 0.0002 max mem: 71357 +[19:58:50.972681] Epoch: [0] [1670/6500] lr: 0.000013 closs: 0.7411 (0.8532) grad_norm: 0.5621 (0.7802) time: 5.5701 data: 0.0002 max mem: 71357 +[19:59:46.690981] Epoch: [0] [1680/6500] lr: 0.000013 closs: 0.7657 (0.8525) grad_norm: 0.5570 (0.7789) time: 5.5674 data: 0.0002 max mem: 71357 +[20:00:42.465398] Epoch: [0] [1690/6500] lr: 0.000013 closs: 0.6785 (0.8513) grad_norm: 0.5621 (0.7781) time: 5.5745 data: 0.0002 max mem: 71357 +[20:01:38.307864] Epoch: [0] [1700/6500] lr: 0.000013 closs: 0.6685 (0.8507) grad_norm: 0.5087 (0.7765) time: 5.5808 data: 0.0001 max mem: 71357 +[20:02:34.069065] Epoch: [0] [1710/6500] lr: 0.000013 closs: 0.7515 (0.8502) grad_norm: 0.5087 (0.7756) time: 5.5801 data: 0.0001 max mem: 71357 +[20:03:29.815804] Epoch: [0] [1720/6500] lr: 0.000013 closs: 0.7740 (0.8501) grad_norm: 0.5406 (0.7768) time: 5.5753 data: 0.0001 max mem: 71357 +[20:04:25.658015] Epoch: [0] [1730/6500] lr: 0.000013 closs: 0.7883 (0.8499) grad_norm: 0.5054 (0.7752) time: 5.5793 data: 0.0001 max mem: 71357 +[20:05:21.448684] Epoch: [0] [1740/6500] lr: 0.000013 closs: 0.7883 (0.8497) grad_norm: 0.5406 (0.7747) time: 5.5815 data: 0.0002 max mem: 71357 +[20:06:17.132635] Epoch: [0] [1750/6500] lr: 0.000013 closs: 0.7562 (0.8491) grad_norm: 0.5160 (0.7736) time: 5.5737 data: 0.0002 max mem: 71357 +[20:07:12.915410] Epoch: [0] [1760/6500] lr: 0.000014 closs: 0.8120 (0.8491) grad_norm: 0.5160 (0.7725) time: 5.5733 data: 0.0001 max mem: 71357 +[20:08:08.564010] Epoch: [0] [1770/6500] lr: 0.000014 closs: 0.9002 (0.8495) grad_norm: 0.6313 (0.7726) time: 5.5715 data: 0.0001 max mem: 71357 +[20:09:04.351531] Epoch: [0] [1780/6500] lr: 0.000014 closs: 0.9002 (0.8491) grad_norm: 0.5656 (0.7711) time: 5.5717 data: 0.0001 max mem: 71357 +[20:10:00.035219] Epoch: [0] [1790/6500] lr: 0.000014 closs: 0.7180 (0.8488) grad_norm: 0.6461 (0.7707) time: 5.5734 data: 0.0002 max mem: 71357 +[20:10:55.870427] Epoch: [0] [1800/6500] lr: 0.000014 closs: 0.7484 (0.8481) grad_norm: 0.5875 (0.7696) time: 5.5758 data: 0.0002 max mem: 71357 +[20:11:51.589061] Epoch: [0] [1810/6500] lr: 0.000014 closs: 0.7187 (0.8474) grad_norm: 0.5284 (0.7691) time: 5.5776 data: 0.0001 max mem: 71357 +[20:12:47.419446] Epoch: [0] [1820/6500] lr: 0.000014 closs: 0.7622 (0.8475) grad_norm: 0.5546 
(0.7679) time: 5.5774 data: 0.0002 max mem: 71357 +[20:13:43.152888] Epoch: [0] [1830/6500] lr: 0.000014 closs: 0.8333 (0.8472) grad_norm: 0.5013 (0.7662) time: 5.5781 data: 0.0002 max mem: 71357 +[20:14:38.893836] Epoch: [0] [1840/6500] lr: 0.000014 closs: 0.8029 (0.8472) grad_norm: 0.5110 (0.7648) time: 5.5736 data: 0.0002 max mem: 71357 +[20:15:34.630747] Epoch: [0] [1850/6500] lr: 0.000014 closs: 0.8166 (0.8470) grad_norm: 0.4765 (0.7632) time: 5.5738 data: 0.0001 max mem: 71357 +[20:16:30.314557] Epoch: [0] [1860/6500] lr: 0.000014 closs: 0.8166 (0.8469) grad_norm: 0.4707 (0.7628) time: 5.5709 data: 0.0001 max mem: 71357 +[20:17:26.194852] Epoch: [0] [1870/6500] lr: 0.000014 closs: 0.8289 (0.8467) grad_norm: 0.4765 (0.7619) time: 5.5781 data: 0.0001 max mem: 71357 +[20:18:21.871034] Epoch: [0] [1880/6500] lr: 0.000014 closs: 0.7967 (0.8464) grad_norm: 0.5001 (0.7617) time: 5.5778 data: 0.0001 max mem: 71357 +[20:19:17.591186] Epoch: [0] [1890/6500] lr: 0.000015 closs: 0.7326 (0.8458) grad_norm: 0.6081 (0.7608) time: 5.5697 data: 0.0001 max mem: 71357 +[20:20:13.289294] Epoch: [0] [1900/6500] lr: 0.000015 closs: 0.7281 (0.8450) grad_norm: 0.5038 (0.7597) time: 5.5708 data: 0.0002 max mem: 71357 +[20:21:09.177977] Epoch: [0] [1910/6500] lr: 0.000015 closs: 0.7328 (0.8453) grad_norm: 0.5284 (0.7604) time: 5.5792 data: 0.0002 max mem: 71357 +[20:22:04.997084] Epoch: [0] [1920/6500] lr: 0.000015 closs: 0.8228 (0.8450) grad_norm: 0.5284 (0.7592) time: 5.5853 data: 0.0001 max mem: 71357 +[20:23:00.755724] Epoch: [0] [1930/6500] lr: 0.000015 closs: 0.8124 (0.8447) grad_norm: 0.5513 (0.7585) time: 5.5788 data: 0.0001 max mem: 71357 +[20:23:56.446255] Epoch: [0] [1940/6500] lr: 0.000015 closs: 0.8124 (0.8448) grad_norm: 0.5656 (0.7573) time: 5.5723 data: 0.0001 max mem: 71357 +[20:24:52.260103] Epoch: [0] [1950/6500] lr: 0.000015 closs: 0.7912 (0.8445) grad_norm: 0.5052 (0.7558) time: 5.5751 data: 0.0001 max mem: 71357 +[20:25:47.913210] Epoch: [0] [1960/6500] lr: 0.000015 closs: 0.7160 (0.8440) grad_norm: 0.5052 (0.7555) time: 5.5733 data: 0.0002 max mem: 71357 +[20:26:43.623002] Epoch: [0] [1970/6500] lr: 0.000015 closs: 0.7569 (0.8438) grad_norm: 0.5321 (0.7545) time: 5.5681 data: 0.0002 max mem: 71357 +[20:27:39.403420] Epoch: [0] [1980/6500] lr: 0.000015 closs: 0.8105 (0.8436) grad_norm: 0.5093 (0.7539) time: 5.5744 data: 0.0002 max mem: 71357 +[20:28:35.232297] Epoch: [0] [1990/6500] lr: 0.000015 closs: 0.7956 (0.8437) grad_norm: 0.5200 (0.7528) time: 5.5804 data: 0.0001 max mem: 71357 +[20:29:30.942620] Epoch: [0] [2000/6500] lr: 0.000015 closs: 0.7954 (0.8438) grad_norm: 0.5200 (0.7519) time: 5.5769 data: 0.0001 max mem: 71357 +[20:30:26.631297] Epoch: [0] [2010/6500] lr: 0.000015 closs: 0.7865 (0.8435) grad_norm: 0.4947 (0.7508) time: 5.5698 data: 0.0001 max mem: 71357 +[20:31:22.375631] Epoch: [0] [2020/6500] lr: 0.000016 closs: 0.7190 (0.8429) grad_norm: 0.4947 (0.7500) time: 5.5715 data: 0.0002 max mem: 71357 +[20:32:18.160803] Epoch: [0] [2030/6500] lr: 0.000016 closs: 0.6794 (0.8426) grad_norm: 0.5117 (0.7508) time: 5.5763 data: 0.0002 max mem: 71357 +[20:33:14.032422] Epoch: [0] [2040/6500] lr: 0.000016 closs: 0.6984 (0.8418) grad_norm: 0.5095 (0.7495) time: 5.5827 data: 0.0001 max mem: 71357 +[20:34:09.674229] Epoch: [0] [2050/6500] lr: 0.000016 closs: 0.7023 (0.8413) grad_norm: 0.5421 (0.7498) time: 5.5756 data: 0.0001 max mem: 71357 +[20:35:05.378186] Epoch: [0] [2060/6500] lr: 0.000016 closs: 0.7354 (0.8410) grad_norm: 0.6405 (0.7500) time: 5.5672 data: 0.0002 max mem: 71357 
+[20:36:01.171645] Epoch: [0] [2070/6500] lr: 0.000016 closs: 0.7757 (0.8410) grad_norm: 0.5421 (0.7489) time: 5.5747 data: 0.0002 max mem: 71357 +[20:36:57.047870] Epoch: [0] [2080/6500] lr: 0.000016 closs: 0.8422 (0.8410) grad_norm: 0.5457 (0.7477) time: 5.5834 data: 0.0002 max mem: 71357 +[20:37:52.852566] Epoch: [0] [2090/6500] lr: 0.000016 closs: 0.7849 (0.8408) grad_norm: 0.5457 (0.7474) time: 5.5840 data: 0.0001 max mem: 71357 +[20:38:48.604842] Epoch: [0] [2100/6500] lr: 0.000016 closs: 0.7323 (0.8402) grad_norm: 0.4892 (0.7468) time: 5.5778 data: 0.0001 max mem: 71357 +[20:39:44.382964] Epoch: [0] [2110/6500] lr: 0.000016 closs: 0.7806 (0.8405) grad_norm: 0.4828 (0.7459) time: 5.5764 data: 0.0002 max mem: 71357 +[20:40:40.183395] Epoch: [0] [2120/6500] lr: 0.000016 closs: 0.8687 (0.8406) grad_norm: 0.4828 (0.7446) time: 5.5788 data: 0.0001 max mem: 71357 +[20:41:35.950265] Epoch: [0] [2130/6500] lr: 0.000016 closs: 0.7655 (0.8401) grad_norm: 0.4828 (0.7435) time: 5.5782 data: 0.0001 max mem: 71357 +[20:42:31.699836] Epoch: [0] [2140/6500] lr: 0.000016 closs: 0.7670 (0.8399) grad_norm: 0.4762 (0.7425) time: 5.5757 data: 0.0001 max mem: 71357 +[20:43:27.317712] Epoch: [0] [2150/6500] lr: 0.000017 closs: 0.7810 (0.8397) grad_norm: 0.5027 (0.7424) time: 5.5683 data: 0.0001 max mem: 71357 +[20:44:23.007754] Epoch: [0] [2160/6500] lr: 0.000017 closs: 0.7664 (0.8397) grad_norm: 0.5038 (0.7418) time: 5.5653 data: 0.0001 max mem: 71357 +[20:45:18.802104] Epoch: [0] [2170/6500] lr: 0.000017 closs: 0.7484 (0.8394) grad_norm: 0.5949 (0.7415) time: 5.5741 data: 0.0002 max mem: 71357 +[20:46:14.517863] Epoch: [0] [2180/6500] lr: 0.000017 closs: 0.7521 (0.8392) grad_norm: 0.5192 (0.7403) time: 5.5754 data: 0.0002 max mem: 71357 +[20:47:10.200594] Epoch: [0] [2190/6500] lr: 0.000017 closs: 0.7133 (0.8389) grad_norm: 0.5528 (0.7399) time: 5.5698 data: 0.0001 max mem: 71357 +[20:48:05.878498] Epoch: [0] [2200/6500] lr: 0.000017 closs: 0.7301 (0.8386) grad_norm: 0.5528 (0.7390) time: 5.5679 data: 0.0002 max mem: 71357 +[20:49:01.610047] Epoch: [0] [2210/6500] lr: 0.000017 closs: 0.7366 (0.8381) grad_norm: 0.5171 (0.7379) time: 5.5704 data: 0.0001 max mem: 71357 +[20:49:57.481127] Epoch: [0] [2220/6500] lr: 0.000017 closs: 0.7101 (0.8375) grad_norm: 0.5008 (0.7367) time: 5.5800 data: 0.0002 max mem: 71357 +[20:50:53.173124] Epoch: [0] [2230/6500] lr: 0.000017 closs: 0.7101 (0.8372) grad_norm: 0.4889 (0.7370) time: 5.5780 data: 0.0002 max mem: 71357 +[20:51:48.892801] Epoch: [0] [2240/6500] lr: 0.000017 closs: 0.7136 (0.8366) grad_norm: 0.4889 (0.7363) time: 5.5704 data: 0.0002 max mem: 71357 +[20:52:44.665693] Epoch: [0] [2250/6500] lr: 0.000017 closs: 0.7372 (0.8365) grad_norm: 0.5008 (0.7355) time: 5.5745 data: 0.0002 max mem: 71357 +[20:53:40.495845] Epoch: [0] [2260/6500] lr: 0.000017 closs: 0.7952 (0.8366) grad_norm: 0.5428 (0.7346) time: 5.5800 data: 0.0002 max mem: 71357 +[20:54:36.203059] Epoch: [0] [2270/6500] lr: 0.000017 closs: 0.7883 (0.8365) grad_norm: 0.5428 (0.7345) time: 5.5768 data: 0.0002 max mem: 71357 +[20:55:31.918928] Epoch: [0] [2280/6500] lr: 0.000018 closs: 0.7588 (0.8359) grad_norm: 0.4861 (0.7333) time: 5.5711 data: 0.0002 max mem: 71357 +[20:56:27.730546] Epoch: [0] [2290/6500] lr: 0.000018 closs: 0.7458 (0.8357) grad_norm: 0.4716 (0.7331) time: 5.5763 data: 0.0002 max mem: 71357 +[20:57:23.453980] Epoch: [0] [2300/6500] lr: 0.000018 closs: 0.7519 (0.8351) grad_norm: 0.4951 (0.7329) time: 5.5767 data: 0.0002 max mem: 71357 +[20:58:19.330645] Epoch: [0] [2310/6500] lr: 
0.000018 closs: 0.7519 (0.8351) grad_norm: 0.4951 (0.7318) time: 5.5799 data: 0.0001 max mem: 71357 +[20:59:15.049870] Epoch: [0] [2320/6500] lr: 0.000018 closs: 0.7585 (0.8348) grad_norm: 0.5165 (0.7310) time: 5.5797 data: 0.0001 max mem: 71357 +[21:00:10.840941] Epoch: [0] [2330/6500] lr: 0.000018 closs: 0.7507 (0.8345) grad_norm: 0.5165 (0.7304) time: 5.5754 data: 0.0001 max mem: 71357 +[21:01:06.575722] Epoch: [0] [2340/6500] lr: 0.000018 closs: 0.7689 (0.8342) grad_norm: 0.4833 (0.7296) time: 5.5762 data: 0.0002 max mem: 71357 +[21:02:02.422716] Epoch: [0] [2350/6500] lr: 0.000018 closs: 0.7870 (0.8342) grad_norm: 0.4735 (0.7284) time: 5.5790 data: 0.0002 max mem: 71357 +[21:02:58.275189] Epoch: [0] [2360/6500] lr: 0.000018 closs: 0.8006 (0.8342) grad_norm: 0.4750 (0.7274) time: 5.5848 data: 0.0001 max mem: 71357 +[21:03:54.007925] Epoch: [0] [2370/6500] lr: 0.000018 closs: 0.7329 (0.8334) grad_norm: 0.4833 (0.7271) time: 5.5791 data: 0.0001 max mem: 71357 +[21:04:49.752785] Epoch: [0] [2380/6500] lr: 0.000018 closs: 0.7316 (0.8335) grad_norm: 0.4943 (0.7266) time: 5.5738 data: 0.0002 max mem: 71357 +[21:05:45.680885] Epoch: [0] [2390/6500] lr: 0.000018 closs: 0.8211 (0.8335) grad_norm: 0.5098 (0.7261) time: 5.5836 data: 0.0002 max mem: 71357 +[21:06:41.469027] Epoch: [0] [2400/6500] lr: 0.000018 closs: 0.8356 (0.8335) grad_norm: 0.5608 (0.7252) time: 5.5857 data: 0.0001 max mem: 71357 +[21:07:37.229547] Epoch: [0] [2410/6500] lr: 0.000019 closs: 0.7179 (0.8329) grad_norm: 0.5037 (0.7249) time: 5.5773 data: 0.0002 max mem: 71357 +[21:08:32.958561] Epoch: [0] [2420/6500] lr: 0.000019 closs: 0.7157 (0.8327) grad_norm: 0.5037 (0.7241) time: 5.5744 data: 0.0002 max mem: 71357 +[21:09:28.673873] Epoch: [0] [2430/6500] lr: 0.000019 closs: 0.7412 (0.8324) grad_norm: 0.4534 (0.7229) time: 5.5721 data: 0.0002 max mem: 71357 +[21:10:24.504516] Epoch: [0] [2440/6500] lr: 0.000019 closs: 0.8024 (0.8324) grad_norm: 0.4475 (0.7218) time: 5.5772 data: 0.0002 max mem: 71357 +[21:11:20.170214] Epoch: [0] [2450/6500] lr: 0.000019 closs: 0.7852 (0.8321) grad_norm: 0.4525 (0.7210) time: 5.5747 data: 0.0002 max mem: 71357 +[21:12:15.876408] Epoch: [0] [2460/6500] lr: 0.000019 closs: 0.7852 (0.8318) grad_norm: 0.4266 (0.7201) time: 5.5685 data: 0.0002 max mem: 71357 +[21:13:11.617462] Epoch: [0] [2470/6500] lr: 0.000019 closs: 0.8011 (0.8319) grad_norm: 0.4346 (0.7192) time: 5.5723 data: 0.0002 max mem: 71357 +[21:14:07.477465] Epoch: [0] [2480/6500] lr: 0.000019 closs: 0.7495 (0.8318) grad_norm: 0.4759 (0.7184) time: 5.5799 data: 0.0001 max mem: 71357 +[21:15:03.160431] Epoch: [0] [2490/6500] lr: 0.000019 closs: 0.7495 (0.8316) grad_norm: 0.4835 (0.7178) time: 5.5770 data: 0.0002 max mem: 71357 +[21:15:58.872968] Epoch: [0] [2500/6500] lr: 0.000019 closs: 0.7675 (0.8312) grad_norm: 0.4726 (0.7166) time: 5.5697 data: 0.0002 max mem: 71357 +[21:16:54.619091] Epoch: [0] [2510/6500] lr: 0.000019 closs: 0.7675 (0.8312) grad_norm: 0.4883 (0.7158) time: 5.5728 data: 0.0001 max mem: 71357 +[21:17:50.323182] Epoch: [0] [2520/6500] lr: 0.000019 closs: 0.8567 (0.8313) grad_norm: 0.5139 (0.7170) time: 5.5724 data: 0.0001 max mem: 71357 +[21:18:46.181308] Epoch: [0] [2530/6500] lr: 0.000019 closs: 0.8298 (0.8314) grad_norm: 0.5134 (0.7165) time: 5.5780 data: 0.0001 max mem: 71357 +[21:19:41.968783] Epoch: [0] [2540/6500] lr: 0.000020 closs: 0.8169 (0.8311) grad_norm: 0.5156 (0.7158) time: 5.5821 data: 0.0001 max mem: 71357 +[21:20:37.843995] Epoch: [0] [2550/6500] lr: 0.000020 closs: 0.8056 (0.8310) grad_norm: 0.5134 
(0.7148) time: 5.5830 data: 0.0002 max mem: 71357 +[21:21:33.609151] Epoch: [0] [2560/6500] lr: 0.000020 closs: 0.7994 (0.8310) grad_norm: 0.5011 (0.7142) time: 5.5819 data: 0.0002 max mem: 71357 +[21:22:29.557411] Epoch: [0] [2570/6500] lr: 0.000020 closs: 0.7904 (0.8309) grad_norm: 0.5156 (0.7135) time: 5.5855 data: 0.0002 max mem: 71357 +[21:23:25.337398] Epoch: [0] [2580/6500] lr: 0.000020 closs: 0.7925 (0.8307) grad_norm: 0.5411 (0.7129) time: 5.5863 data: 0.0002 max mem: 71357 +[21:24:21.118584] Epoch: [0] [2590/6500] lr: 0.000020 closs: 0.7940 (0.8308) grad_norm: 0.5367 (0.7119) time: 5.5779 data: 0.0002 max mem: 71357 +[21:25:16.916564] Epoch: [0] [2600/6500] lr: 0.000020 closs: 0.7737 (0.8306) grad_norm: 0.5209 (0.7115) time: 5.5789 data: 0.0002 max mem: 71357 +[21:26:12.737112] Epoch: [0] [2610/6500] lr: 0.000020 closs: 0.7394 (0.8305) grad_norm: 0.5209 (0.7118) time: 5.5808 data: 0.0002 max mem: 71357 +[21:27:08.510919] Epoch: [0] [2620/6500] lr: 0.000020 closs: 0.7944 (0.8304) grad_norm: 0.4956 (0.7111) time: 5.5796 data: 0.0002 max mem: 71357 +[21:28:04.306907] Epoch: [0] [2630/6500] lr: 0.000020 closs: 0.7944 (0.8303) grad_norm: 0.4956 (0.7102) time: 5.5784 data: 0.0002 max mem: 71357 +[21:29:00.079713] Epoch: [0] [2640/6500] lr: 0.000020 closs: 0.7446 (0.8298) grad_norm: 0.4956 (0.7094) time: 5.5784 data: 0.0002 max mem: 71357 +[21:29:55.866717] Epoch: [0] [2650/6500] lr: 0.000020 closs: 0.7546 (0.8296) grad_norm: 0.4603 (0.7084) time: 5.5779 data: 0.0001 max mem: 71357 +[21:30:51.780482] Epoch: [0] [2660/6500] lr: 0.000020 closs: 0.8542 (0.8297) grad_norm: 0.4237 (0.7076) time: 5.5850 data: 0.0001 max mem: 71357 +[21:31:47.549029] Epoch: [0] [2670/6500] lr: 0.000021 closs: 0.7719 (0.8291) grad_norm: 0.4430 (0.7072) time: 5.5840 data: 0.0001 max mem: 71357 +[21:32:43.367174] Epoch: [0] [2680/6500] lr: 0.000021 closs: 0.7024 (0.8288) grad_norm: 0.4734 (0.7067) time: 5.5792 data: 0.0002 max mem: 71357 +[21:33:39.102598] Epoch: [0] [2690/6500] lr: 0.000021 closs: 0.7855 (0.8287) grad_norm: 0.5127 (0.7059) time: 5.5776 data: 0.0002 max mem: 71357 +[21:34:35.007085] Epoch: [0] [2700/6500] lr: 0.000021 closs: 0.7978 (0.8288) grad_norm: 0.5239 (0.7054) time: 5.5819 data: 0.0002 max mem: 71357 +[21:35:30.772658] Epoch: [0] [2710/6500] lr: 0.000021 closs: 0.7809 (0.8284) grad_norm: 0.5049 (0.7043) time: 5.5834 data: 0.0002 max mem: 71357 +[21:36:26.600799] Epoch: [0] [2720/6500] lr: 0.000021 closs: 0.7460 (0.8281) grad_norm: 0.5109 (0.7037) time: 5.5796 data: 0.0002 max mem: 71357 +[21:37:22.396532] Epoch: [0] [2730/6500] lr: 0.000021 closs: 0.6897 (0.8277) grad_norm: 0.4972 (0.7030) time: 5.5811 data: 0.0001 max mem: 71357 +[21:38:18.216414] Epoch: [0] [2740/6500] lr: 0.000021 closs: 0.7477 (0.8275) grad_norm: 0.4605 (0.7029) time: 5.5806 data: 0.0002 max mem: 71357 +[21:39:14.267187] Epoch: [0] [2750/6500] lr: 0.000021 closs: 0.7866 (0.8273) grad_norm: 0.4752 (0.7020) time: 5.5934 data: 0.0002 max mem: 71357 +[21:40:10.212618] Epoch: [0] [2760/6500] lr: 0.000021 closs: 0.7746 (0.8272) grad_norm: 0.4605 (0.7010) time: 5.5996 data: 0.0002 max mem: 71357 +[21:41:05.958460] Epoch: [0] [2770/6500] lr: 0.000021 closs: 0.7746 (0.8272) grad_norm: 0.4752 (0.7008) time: 5.5844 data: 0.0002 max mem: 71357 +[21:42:01.770293] Epoch: [0] [2780/6500] lr: 0.000021 closs: 0.8009 (0.8272) grad_norm: 0.4752 (0.7002) time: 5.5778 data: 0.0002 max mem: 71357 +[21:42:57.678502] Epoch: [0] [2790/6500] lr: 0.000021 closs: 0.8009 (0.8271) grad_norm: 0.4831 (0.7007) time: 5.5859 data: 0.0002 max mem: 71357 
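A note on reading the step lines around here: each entry reports `closs` and `grad_norm` as a recent smoothed value followed by a running average in parentheses (the usual MetricLogger convention), `time` and `data` as seconds per iteration and per data-load, and `max mem` as peak GPU memory. The learning rate climbs roughly linearly from 1.0e-5 near iteration 1340 to about 2.5e-5 by iteration 3200, so this whole range is still inside warmup. Below is a small parser sketch for these lines, assuming the exact format shown (field names and spacing are taken verbatim from the log; a real run would apply it to the reassembled log file, one entry per line):

```python
import re

# One metric entry, e.g.:
# [21:20:37.843995] Epoch: [0] [2550/6500] lr: 0.000020 closs: 0.8056 (0.8310)
#   grad_norm: 0.5134 (0.7148) time: 5.5830 data: 0.0002 max mem: 71357
ENTRY = re.compile(
    r"\[(?P<ts>\d{2}:\d{2}:\d{2}\.\d+)\]\s+Epoch:\s+\[(?P<epoch>\d+)\]\s+"
    r"\[(?P<step>\d+)/(?P<total>\d+)\]\s+lr:\s+(?P<lr>[\d.]+)\s+"
    r"closs:\s+(?P<closs>[\d.]+)\s+\((?P<closs_avg>[\d.]+)\)\s+"
    r"grad_norm:\s+(?P<gnorm>[\d.]+)\s+\((?P<gnorm_avg>[\d.]+)\)"
)

def parse_entry(line: str) -> dict | None:
    """Extract step index, lr, loss, and grad-norm stats from one log line."""
    m = ENTRY.search(line)
    if m is None:
        return None
    out = m.groupdict()
    for k in ("epoch", "step", "total"):
        out[k] = int(out[k])
    for k in ("lr", "closs", "closs_avg", "gnorm", "gnorm_avg"):
        out[k] = float(out[k])
    return out

# Example on an entry copied from the log above:
print(parse_entry(
    "[21:20:37.843995] Epoch: [0] [2550/6500] lr: 0.000020 "
    "closs: 0.8056 (0.8310) grad_norm: 0.5134 (0.7148) "
    "time: 5.5830 data: 0.0002 max mem: 71357"
))
```

Applied over the full log, this yields per-step series suitable for plotting the loss and gradient-norm trends visible in the raw entries.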
+[21:43:53.448984] Epoch: [0] [2800/6500] lr: 0.000022 closs: 0.8074 (0.8271) grad_norm: 0.5026 (0.7000) time: 5.5838 data: 0.0002 max mem: 71357 +[21:44:49.284849] Epoch: [0] [2810/6500] lr: 0.000022 closs: 0.8074 (0.8270) grad_norm: 0.4758 (0.6991) time: 5.5802 data: 0.0002 max mem: 71357 +[21:45:45.008739] Epoch: [0] [2820/6500] lr: 0.000022 closs: 0.7502 (0.8266) grad_norm: 0.4573 (0.6983) time: 5.5778 data: 0.0002 max mem: 71357 +[21:46:40.941467] Epoch: [0] [2830/6500] lr: 0.000022 closs: 0.7663 (0.8264) grad_norm: 0.4320 (0.6973) time: 5.5827 data: 0.0002 max mem: 71357 +[21:47:36.741840] Epoch: [0] [2840/6500] lr: 0.000022 closs: 0.8014 (0.8264) grad_norm: 0.4285 (0.6973) time: 5.5866 data: 0.0002 max mem: 71357 +[21:48:32.454452] Epoch: [0] [2850/6500] lr: 0.000022 closs: 0.8047 (0.8261) grad_norm: 0.4406 (0.6966) time: 5.5756 data: 0.0001 max mem: 71357 +[21:49:28.166471] Epoch: [0] [2860/6500] lr: 0.000022 closs: 0.6620 (0.8256) grad_norm: 0.4406 (0.6963) time: 5.5711 data: 0.0001 max mem: 71357 +[21:50:23.965017] Epoch: [0] [2870/6500] lr: 0.000022 closs: 0.6841 (0.8253) grad_norm: 0.5207 (0.6958) time: 5.5754 data: 0.0002 max mem: 71357 +[21:51:19.762370] Epoch: [0] [2880/6500] lr: 0.000022 closs: 0.7397 (0.8252) grad_norm: 0.5657 (0.6979) time: 5.5797 data: 0.0002 max mem: 71357 +[21:52:15.502191] Epoch: [0] [2890/6500] lr: 0.000022 closs: 0.8410 (0.8254) grad_norm: 0.5332 (0.6971) time: 5.5767 data: 0.0001 max mem: 71357 +[21:53:11.116475] Epoch: [0] [2900/6500] lr: 0.000022 closs: 0.7739 (0.8251) grad_norm: 0.5285 (0.6964) time: 5.5676 data: 0.0001 max mem: 71357 +[21:54:06.797736] Epoch: [0] [2910/6500] lr: 0.000022 closs: 0.7956 (0.8252) grad_norm: 0.5048 (0.6961) time: 5.5647 data: 0.0001 max mem: 71357 +[21:55:02.572561] Epoch: [0] [2920/6500] lr: 0.000022 closs: 0.8117 (0.8251) grad_norm: 0.4933 (0.6954) time: 5.5727 data: 0.0001 max mem: 71357 +[21:55:58.265640] Epoch: [0] [2930/6500] lr: 0.000023 closs: 0.7544 (0.8247) grad_norm: 0.4933 (0.6957) time: 5.5732 data: 0.0002 max mem: 71357 +[21:56:53.952100] Epoch: [0] [2940/6500] lr: 0.000023 closs: 0.8150 (0.8249) grad_norm: 0.5311 (0.6952) time: 5.5689 data: 0.0002 max mem: 71357 +[21:57:49.722026] Epoch: [0] [2950/6500] lr: 0.000023 closs: 0.8308 (0.8248) grad_norm: 0.5311 (0.6948) time: 5.5727 data: 0.0001 max mem: 71357 +[21:58:45.409923] Epoch: [0] [2960/6500] lr: 0.000023 closs: 0.8063 (0.8248) grad_norm: 0.4842 (0.6945) time: 5.5728 data: 0.0002 max mem: 71357 +[21:59:41.250401] Epoch: [0] [2970/6500] lr: 0.000023 closs: 0.8054 (0.8246) grad_norm: 0.4842 (0.6939) time: 5.5763 data: 0.0002 max mem: 71357 +[22:00:36.996921] Epoch: [0] [2980/6500] lr: 0.000023 closs: 0.7675 (0.8245) grad_norm: 0.4995 (0.6935) time: 5.5793 data: 0.0002 max mem: 71357 +[22:01:32.739821] Epoch: [0] [2990/6500] lr: 0.000023 closs: 0.7084 (0.8242) grad_norm: 0.5038 (0.6933) time: 5.5744 data: 0.0002 max mem: 71357 +[22:02:28.417654] Epoch: [0] [3000/6500] lr: 0.000023 closs: 0.7755 (0.8242) grad_norm: 0.5038 (0.6928) time: 5.5709 data: 0.0002 max mem: 71357 +[22:03:24.195424] Epoch: [0] [3010/6500] lr: 0.000023 closs: 0.7861 (0.8242) grad_norm: 0.5038 (0.6922) time: 5.5727 data: 0.0001 max mem: 71357 +[22:04:19.925026] Epoch: [0] [3020/6500] lr: 0.000023 closs: 0.7833 (0.8241) grad_norm: 0.4425 (0.6914) time: 5.5753 data: 0.0001 max mem: 71357 +[22:05:15.625126] Epoch: [0] [3030/6500] lr: 0.000023 closs: 0.8012 (0.8242) grad_norm: 0.4276 (0.6903) time: 5.5713 data: 0.0002 max mem: 71357 +[22:06:11.301289] Epoch: [0] [3040/6500] lr: 
0.000023 closs: 0.8578 (0.8245) grad_norm: 0.4276 (0.6900) time: 5.5687 data: 0.0002 max mem: 71357 +[22:07:07.112997] Epoch: [0] [3050/6500] lr: 0.000023 closs: 0.7844 (0.8243) grad_norm: 0.4425 (0.6896) time: 5.5743 data: 0.0001 max mem: 71357 +[22:08:02.885455] Epoch: [0] [3060/6500] lr: 0.000024 closs: 0.7363 (0.8239) grad_norm: 0.4420 (0.6890) time: 5.5791 data: 0.0001 max mem: 71357 +[22:08:58.574922] Epoch: [0] [3070/6500] lr: 0.000024 closs: 0.7478 (0.8239) grad_norm: 0.4767 (0.6882) time: 5.5730 data: 0.0001 max mem: 71357 +[22:09:54.218364] Epoch: [0] [3080/6500] lr: 0.000024 closs: 0.7478 (0.8238) grad_norm: 0.4767 (0.6877) time: 5.5666 data: 0.0001 max mem: 71357 +[22:10:49.953148] Epoch: [0] [3090/6500] lr: 0.000024 closs: 0.8290 (0.8239) grad_norm: 0.4707 (0.6872) time: 5.5688 data: 0.0002 max mem: 71357 +[22:11:45.777210] Epoch: [0] [3100/6500] lr: 0.000024 closs: 0.8290 (0.8241) grad_norm: 0.5149 (0.6872) time: 5.5778 data: 0.0002 max mem: 71357 +[22:12:41.446476] Epoch: [0] [3110/6500] lr: 0.000024 closs: 0.7752 (0.8239) grad_norm: 0.5202 (0.6871) time: 5.5746 data: 0.0002 max mem: 71357 +[22:13:37.138604] Epoch: [0] [3120/6500] lr: 0.000024 closs: 0.7729 (0.8238) grad_norm: 0.5083 (0.6864) time: 5.5680 data: 0.0001 max mem: 71357 +[22:14:32.885980] Epoch: [0] [3130/6500] lr: 0.000024 closs: 0.7892 (0.8237) grad_norm: 0.5083 (0.6860) time: 5.5719 data: 0.0001 max mem: 71357 +[22:15:28.734038] Epoch: [0] [3140/6500] lr: 0.000024 closs: 0.7201 (0.8233) grad_norm: 0.4951 (0.6855) time: 5.5797 data: 0.0001 max mem: 71357 +[22:16:24.401364] Epoch: [0] [3150/6500] lr: 0.000024 closs: 0.7225 (0.8231) grad_norm: 0.4951 (0.6855) time: 5.5757 data: 0.0001 max mem: 71357 +[22:17:20.190194] Epoch: [0] [3160/6500] lr: 0.000024 closs: 0.7225 (0.8227) grad_norm: 0.5478 (0.6851) time: 5.5727 data: 0.0002 max mem: 71357 +[22:18:15.930741] Epoch: [0] [3170/6500] lr: 0.000024 closs: 0.6877 (0.8226) grad_norm: 0.5478 (0.6849) time: 5.5764 data: 0.0002 max mem: 71357 +[22:19:11.777748] Epoch: [0] [3180/6500] lr: 0.000024 closs: 0.7892 (0.8225) grad_norm: 0.5051 (0.6841) time: 5.5793 data: 0.0002 max mem: 71357 +[22:20:07.668122] Epoch: [0] [3190/6500] lr: 0.000025 closs: 0.7907 (0.8225) grad_norm: 0.4834 (0.6836) time: 5.5868 data: 0.0002 max mem: 71357 +[22:21:03.530895] Epoch: [0] [3200/6500] lr: 0.000025 closs: 0.7862 (0.8223) grad_norm: 0.4458 (0.6829) time: 5.5876 data: 0.0002 max mem: 71357 +[22:21:59.265653] Epoch: [0] [3210/6500] lr: 0.000025 closs: 0.8013 (0.8224) grad_norm: 0.4373 (0.6825) time: 5.5798 data: 0.0001 max mem: 71357 +[22:22:54.998366] Epoch: [0] [3220/6500] lr: 0.000025 closs: 0.8076 (0.8222) grad_norm: 0.4758 (0.6823) time: 5.5733 data: 0.0002 max mem: 71357 +[22:23:50.905774] Epoch: [0] [3230/6500] lr: 0.000025 closs: 0.7747 (0.8219) grad_norm: 0.4758 (0.6816) time: 5.5819 data: 0.0002 max mem: 71357 +[22:24:46.627784] Epoch: [0] [3240/6500] lr: 0.000025 closs: 0.7195 (0.8217) grad_norm: 0.4976 (0.6818) time: 5.5813 data: 0.0002 max mem: 71357 +[22:25:42.340275] Epoch: [0] [3250/6500] lr: 0.000025 closs: 0.8083 (0.8220) grad_norm: 0.4758 (0.6814) time: 5.5716 data: 0.0002 max mem: 71357 +[22:26:38.235003] Epoch: [0] [3260/6500] lr: 0.000025 closs: 0.8261 (0.8221) grad_norm: 0.4669 (0.6806) time: 5.5802 data: 0.0002 max mem: 71357 +[22:27:34.143033] Epoch: [0] [3270/6500] lr: 0.000025 closs: 0.7940 (0.8220) grad_norm: 0.4716 (0.6802) time: 5.5900 data: 0.0002 max mem: 71357 +[22:28:29.848989] Epoch: [0] [3280/6500] lr: 0.000025 closs: 0.7949 (0.8218) grad_norm: 0.4669 
(0.6798) time: 5.5806 data: 0.0002 max mem: 71357 +[22:29:25.534312] Epoch: [0] [3290/6500] lr: 0.000025 closs: 0.7710 (0.8220) grad_norm: 0.4918 (0.6792) time: 5.5694 data: 0.0002 max mem: 71357 +[22:30:21.210353] Epoch: [0] [3300/6500] lr: 0.000025 closs: 0.7941 (0.8220) grad_norm: 0.4949 (0.6792) time: 5.5679 data: 0.0002 max mem: 71357 +[22:31:16.927815] Epoch: [0] [3310/6500] lr: 0.000025 closs: 0.7941 (0.8216) grad_norm: 0.4980 (0.6788) time: 5.5695 data: 0.0002 max mem: 71357 +[22:32:12.719534] Epoch: [0] [3320/6500] lr: 0.000026 closs: 0.6892 (0.8212) grad_norm: 0.4980 (0.6783) time: 5.5753 data: 0.0001 max mem: 71357 +[22:33:08.513920] Epoch: [0] [3330/6500] lr: 0.000026 closs: 0.7701 (0.8211) grad_norm: 0.5174 (0.6812) time: 5.5792 data: 0.0001 max mem: 71357 +[22:34:04.304134] Epoch: [0] [3340/6500] lr: 0.000026 closs: 0.7871 (0.8210) grad_norm: 0.4950 (0.6806) time: 5.5791 data: 0.0001 max mem: 71357 +[22:35:00.047708] Epoch: [0] [3350/6500] lr: 0.000026 closs: 0.7350 (0.8207) grad_norm: 0.4521 (0.6801) time: 5.5766 data: 0.0001 max mem: 71357 +[22:35:55.874424] Epoch: [0] [3360/6500] lr: 0.000026 closs: 0.7149 (0.8205) grad_norm: 0.4950 (0.6811) time: 5.5784 data: 0.0002 max mem: 71357 +[22:36:51.608865] Epoch: [0] [3370/6500] lr: 0.000026 closs: 0.6943 (0.8199) grad_norm: 0.4521 (0.6807) time: 5.5779 data: 0.0002 max mem: 71357 +[22:37:47.359747] Epoch: [0] [3380/6500] lr: 0.000026 closs: 0.7499 (0.8199) grad_norm: 0.5135 (0.6802) time: 5.5741 data: 0.0002 max mem: 71357 +[22:38:43.055778] Epoch: [0] [3390/6500] lr: 0.000026 closs: 0.7804 (0.8198) grad_norm: 0.5695 (0.6800) time: 5.5722 data: 0.0002 max mem: 71357 +[22:39:38.791189] Epoch: [0] [3400/6500] lr: 0.000026 closs: 0.7784 (0.8198) grad_norm: 0.5695 (0.6806) time: 5.5714 data: 0.0002 max mem: 71357 +[22:40:34.701703] Epoch: [0] [3410/6500] lr: 0.000026 closs: 0.8139 (0.8198) grad_norm: 0.5514 (0.6797) time: 5.5822 data: 0.0002 max mem: 71357 +[22:41:30.512183] Epoch: [0] [3420/6500] lr: 0.000026 closs: 0.7679 (0.8196) grad_norm: 0.5098 (0.6790) time: 5.5860 data: 0.0002 max mem: 71357 +[22:42:26.256064] Epoch: [0] [3430/6500] lr: 0.000026 closs: 0.7416 (0.8196) grad_norm: 0.4407 (0.6784) time: 5.5776 data: 0.0002 max mem: 71357 +[22:43:21.943934] Epoch: [0] [3440/6500] lr: 0.000026 closs: 0.6919 (0.8193) grad_norm: 0.3898 (0.6776) time: 5.5714 data: 0.0002 max mem: 71357 +[22:44:17.761189] Epoch: [0] [3450/6500] lr: 0.000027 closs: 0.7952 (0.8193) grad_norm: 0.3898 (0.6772) time: 5.5751 data: 0.0002 max mem: 71357 +[22:45:13.506509] Epoch: [0] [3460/6500] lr: 0.000027 closs: 0.8043 (0.8192) grad_norm: 0.3898 (0.6770) time: 5.5780 data: 0.0001 max mem: 71357 +[22:46:09.269380] Epoch: [0] [3470/6500] lr: 0.000027 closs: 0.7872 (0.8192) grad_norm: 0.4201 (0.6766) time: 5.5753 data: 0.0002 max mem: 71357 +[22:47:04.958374] Epoch: [0] [3480/6500] lr: 0.000027 closs: 0.7538 (0.8190) grad_norm: 0.4742 (0.6763) time: 5.5725 data: 0.0002 max mem: 71357 +[22:48:00.838506] Epoch: [0] [3490/6500] lr: 0.000027 closs: 0.7062 (0.8188) grad_norm: 0.4572 (0.6762) time: 5.5784 data: 0.0001 max mem: 71357 +[22:48:56.554925] Epoch: [0] [3500/6500] lr: 0.000027 closs: 0.7649 (0.8187) grad_norm: 0.4557 (0.6755) time: 5.5797 data: 0.0001 max mem: 71357 +[22:49:52.356055] Epoch: [0] [3510/6500] lr: 0.000027 closs: 0.7740 (0.8186) grad_norm: 0.4557 (0.6749) time: 5.5757 data: 0.0001 max mem: 71357 +[22:50:48.051775] Epoch: [0] [3520/6500] lr: 0.000027 closs: 0.6907 (0.8184) grad_norm: 0.4465 (0.6745) time: 5.5747 data: 0.0002 max mem: 71357 
+[22:51:43.736248] Epoch: [0] [3530/6500] lr: 0.000027 closs: 0.7185 (0.8184) grad_norm: 0.4883 (0.6741) time: 5.5689 data: 0.0001 max mem: 71357 +[22:52:39.582328] Epoch: [0] [3540/6500] lr: 0.000027 closs: 0.7792 (0.8182) grad_norm: 0.4887 (0.6737) time: 5.5765 data: 0.0001 max mem: 71357 +[22:53:35.217988] Epoch: [0] [3550/6500] lr: 0.000027 closs: 0.7264 (0.8181) grad_norm: 0.4923 (0.6732) time: 5.5740 data: 0.0001 max mem: 71357 +[22:54:31.016009] Epoch: [0] [3560/6500] lr: 0.000027 closs: 0.7264 (0.8179) grad_norm: 0.4923 (0.6727) time: 5.5716 data: 0.0002 max mem: 71357 +[22:55:26.802300] Epoch: [0] [3570/6500] lr: 0.000027 closs: 0.7620 (0.8178) grad_norm: 0.4646 (0.6722) time: 5.5791 data: 0.0002 max mem: 71357 +[22:56:22.687278] Epoch: [0] [3580/6500] lr: 0.000028 closs: 0.7688 (0.8178) grad_norm: 0.4572 (0.6717) time: 5.5835 data: 0.0001 max mem: 71357 +[22:57:18.492948] Epoch: [0] [3590/6500] lr: 0.000028 closs: 0.8586 (0.8179) grad_norm: 0.4335 (0.6714) time: 5.5844 data: 0.0002 max mem: 71357 +[22:58:14.266317] Epoch: [0] [3600/6500] lr: 0.000028 closs: 0.8586 (0.8181) grad_norm: 0.4301 (0.6708) time: 5.5788 data: 0.0002 max mem: 71357 +[22:59:10.030708] Epoch: [0] [3610/6500] lr: 0.000028 closs: 0.7420 (0.8178) grad_norm: 0.4168 (0.6702) time: 5.5767 data: 0.0001 max mem: 71357 +[23:00:05.740740] Epoch: [0] [3620/6500] lr: 0.000028 closs: 0.6845 (0.8177) grad_norm: 0.4147 (0.6697) time: 5.5736 data: 0.0002 max mem: 71357 +[23:01:01.628084] Epoch: [0] [3630/6500] lr: 0.000028 closs: 0.7360 (0.8176) grad_norm: 0.4531 (0.6693) time: 5.5798 data: 0.0002 max mem: 71357 +[23:01:57.312973] Epoch: [0] [3640/6500] lr: 0.000028 closs: 0.7370 (0.8177) grad_norm: 0.4726 (0.6689) time: 5.5785 data: 0.0001 max mem: 71357 +[23:02:53.018097] Epoch: [0] [3650/6500] lr: 0.000028 closs: 0.7370 (0.8174) grad_norm: 0.4726 (0.6684) time: 5.5694 data: 0.0001 max mem: 71357 +[23:03:48.696651] Epoch: [0] [3660/6500] lr: 0.000028 closs: 0.7332 (0.8172) grad_norm: 0.4864 (0.6678) time: 5.5691 data: 0.0001 max mem: 71357 +[23:04:44.522063] Epoch: [0] [3670/6500] lr: 0.000028 closs: 0.7933 (0.8171) grad_norm: 0.4716 (0.6672) time: 5.5751 data: 0.0001 max mem: 71357 +[23:05:40.229307] Epoch: [0] [3680/6500] lr: 0.000028 closs: 0.7823 (0.8170) grad_norm: 0.4653 (0.6668) time: 5.5765 data: 0.0002 max mem: 71357 +[23:06:36.012740] Epoch: [0] [3690/6500] lr: 0.000028 closs: 0.7470 (0.8169) grad_norm: 0.4864 (0.6664) time: 5.5744 data: 0.0002 max mem: 71357 +[23:07:31.710488] Epoch: [0] [3700/6500] lr: 0.000028 closs: 0.7934 (0.8167) grad_norm: 0.4874 (0.6661) time: 5.5740 data: 0.0002 max mem: 71357 +[23:08:27.509042] Epoch: [0] [3710/6500] lr: 0.000029 closs: 0.8132 (0.8167) grad_norm: 0.4970 (0.6656) time: 5.5747 data: 0.0001 max mem: 71357 +[23:09:23.247672] Epoch: [0] [3720/6500] lr: 0.000029 closs: 0.7672 (0.8166) grad_norm: 0.4874 (0.6649) time: 5.5767 data: 0.0001 max mem: 71357 +[23:10:19.028843] Epoch: [0] [3730/6500] lr: 0.000029 closs: 0.8333 (0.8167) grad_norm: 0.4414 (0.6645) time: 5.5758 data: 0.0001 max mem: 71357 +[23:11:14.783761] Epoch: [0] [3740/6500] lr: 0.000029 closs: 0.7984 (0.8165) grad_norm: 0.4400 (0.6643) time: 5.5767 data: 0.0001 max mem: 71357 +[23:12:10.561004] Epoch: [0] [3750/6500] lr: 0.000029 closs: 0.7420 (0.8166) grad_norm: 0.4073 (0.6638) time: 5.5765 data: 0.0001 max mem: 71357 +[23:13:06.422660] Epoch: [0] [3760/6500] lr: 0.000029 closs: 0.7733 (0.8165) grad_norm: 0.3984 (0.6631) time: 5.5818 data: 0.0002 max mem: 71357 +[23:14:02.251523] Epoch: [0] [3770/6500] lr: 
0.000029 closs: 0.7733 (0.8165) grad_norm: 0.4193 (0.6628) time: 5.5844 data: 0.0002 max mem: 71357 +[23:14:57.959067] Epoch: [0] [3780/6500] lr: 0.000029 closs: 0.7491 (0.8164) grad_norm: 0.4418 (0.6623) time: 5.5767 data: 0.0002 max mem: 71357 +[23:15:53.761394] Epoch: [0] [3790/6500] lr: 0.000029 closs: 0.7587 (0.8163) grad_norm: 0.4418 (0.6619) time: 5.5753 data: 0.0003 max mem: 71357 +[23:16:49.649809] Epoch: [0] [3800/6500] lr: 0.000029 closs: 0.7967 (0.8163) grad_norm: 0.4321 (0.6611) time: 5.5844 data: 0.0002 max mem: 71357 +[23:17:45.356972] Epoch: [0] [3810/6500] lr: 0.000029 closs: 0.7972 (0.8161) grad_norm: 0.4144 (0.6607) time: 5.5797 data: 0.0001 max mem: 71357 +[23:18:41.088973] Epoch: [0] [3820/6500] lr: 0.000029 closs: 0.7616 (0.8161) grad_norm: 0.4144 (0.6602) time: 5.5719 data: 0.0001 max mem: 71357 +[23:19:36.728934] Epoch: [0] [3830/6500] lr: 0.000029 closs: 0.7773 (0.8161) grad_norm: 0.4110 (0.6597) time: 5.5685 data: 0.0001 max mem: 71357 +[23:20:32.508264] Epoch: [0] [3840/6500] lr: 0.000030 closs: 0.7510 (0.8158) grad_norm: 0.4507 (0.6591) time: 5.5708 data: 0.0002 max mem: 71357 +[23:21:28.314526] Epoch: [0] [3850/6500] lr: 0.000030 closs: 0.7856 (0.8159) grad_norm: 0.4618 (0.6590) time: 5.5792 data: 0.0002 max mem: 71357 +[23:22:24.127339] Epoch: [0] [3860/6500] lr: 0.000030 closs: 0.7705 (0.8157) grad_norm: 0.4486 (0.6583) time: 5.5809 data: 0.0001 max mem: 71357 +[23:23:19.774326] Epoch: [0] [3870/6500] lr: 0.000030 closs: 0.7342 (0.8156) grad_norm: 0.4285 (0.6577) time: 5.5729 data: 0.0001 max mem: 71357 +[23:24:15.449400] Epoch: [0] [3880/6500] lr: 0.000030 closs: 0.8044 (0.8156) grad_norm: 0.4773 (0.6583) time: 5.5660 data: 0.0001 max mem: 71357 +[23:25:11.308804] Epoch: [0] [3890/6500] lr: 0.000030 closs: 0.8051 (0.8154) grad_norm: 0.4486 (0.6581) time: 5.5766 data: 0.0001 max mem: 71357 +[23:26:07.026136] Epoch: [0] [3900/6500] lr: 0.000030 closs: 0.8188 (0.8154) grad_norm: 0.4390 (0.6577) time: 5.5787 data: 0.0002 max mem: 71357 +[23:27:02.883831] Epoch: [0] [3910/6500] lr: 0.000030 closs: 0.8057 (0.8154) grad_norm: 0.4913 (0.6572) time: 5.5786 data: 0.0002 max mem: 71357 +[23:27:58.627975] Epoch: [0] [3920/6500] lr: 0.000030 closs: 0.7998 (0.8153) grad_norm: 0.4599 (0.6568) time: 5.5800 data: 0.0001 max mem: 71357 +[23:28:54.466339] Epoch: [0] [3930/6500] lr: 0.000030 closs: 0.8312 (0.8152) grad_norm: 0.4599 (0.6564) time: 5.5790 data: 0.0001 max mem: 71357 +[23:29:50.253141] Epoch: [0] [3940/6500] lr: 0.000030 closs: 0.7910 (0.8150) grad_norm: 0.4448 (0.6559) time: 5.5811 data: 0.0002 max mem: 71357 +[23:30:46.049333] Epoch: [0] [3950/6500] lr: 0.000030 closs: 0.7271 (0.8148) grad_norm: 0.4448 (0.6554) time: 5.5790 data: 0.0002 max mem: 71357 +[23:31:41.791787] Epoch: [0] [3960/6500] lr: 0.000030 closs: 0.7542 (0.8148) grad_norm: 0.4852 (0.6553) time: 5.5768 data: 0.0002 max mem: 71357 +[23:32:37.496971] Epoch: [0] [3970/6500] lr: 0.000031 closs: 0.7941 (0.8147) grad_norm: 0.4852 (0.6549) time: 5.5723 data: 0.0002 max mem: 71357 +[23:33:33.374044] Epoch: [0] [3980/6500] lr: 0.000031 closs: 0.7972 (0.8147) grad_norm: 0.4313 (0.6542) time: 5.5790 data: 0.0002 max mem: 71357 +[23:34:29.051290] Epoch: [0] [3990/6500] lr: 0.000031 closs: 0.7962 (0.8146) grad_norm: 0.4339 (0.6539) time: 5.5776 data: 0.0002 max mem: 71357 +[23:35:24.744469] Epoch: [0] [4000/6500] lr: 0.000031 closs: 0.7493 (0.8144) grad_norm: 0.4339 (0.6536) time: 5.5684 data: 0.0002 max mem: 71357 +[23:36:20.520889] Epoch: [0] [4010/6500] lr: 0.000031 closs: 0.7516 (0.8142) grad_norm: 0.4339 
(0.6533) time: 5.5733 data: 0.0002 max mem: 71357 +[23:37:16.369144] Epoch: [0] [4020/6500] lr: 0.000031 closs: 0.8050 (0.8143) grad_norm: 0.4571 (0.6530) time: 5.5811 data: 0.0002 max mem: 71357 +[23:38:12.149618] Epoch: [0] [4030/6500] lr: 0.000031 closs: 0.8104 (0.8143) grad_norm: 0.4843 (0.6527) time: 5.5813 data: 0.0002 max mem: 71357 +[23:39:07.897266] Epoch: [0] [4040/6500] lr: 0.000031 closs: 0.7765 (0.8142) grad_norm: 0.4843 (0.6523) time: 5.5763 data: 0.0002 max mem: 71357 +[23:40:03.635775] Epoch: [0] [4050/6500] lr: 0.000031 closs: 0.7657 (0.8142) grad_norm: 0.4843 (0.6518) time: 5.5742 data: 0.0002 max mem: 71357 +[23:40:59.373975] Epoch: [0] [4060/6500] lr: 0.000031 closs: 0.8069 (0.8142) grad_norm: 0.4719 (0.6513) time: 5.5737 data: 0.0002 max mem: 71357 +[23:41:55.240081] Epoch: [0] [4070/6500] lr: 0.000031 closs: 0.8069 (0.8142) grad_norm: 0.4648 (0.6509) time: 5.5801 data: 0.0002 max mem: 71357 +[23:42:51.032317] Epoch: [0] [4080/6500] lr: 0.000031 closs: 0.8569 (0.8142) grad_norm: 0.4648 (0.6508) time: 5.5828 data: 0.0002 max mem: 71357 +[23:43:46.755137] Epoch: [0] [4090/6500] lr: 0.000031 closs: 0.8239 (0.8142) grad_norm: 0.4661 (0.6506) time: 5.5757 data: 0.0002 max mem: 71357 +[23:44:42.512116] Epoch: [0] [4100/6500] lr: 0.000032 closs: 0.7687 (0.8141) grad_norm: 0.4823 (0.6501) time: 5.5739 data: 0.0002 max mem: 71357 +[23:45:38.454009] Epoch: [0] [4110/6500] lr: 0.000032 closs: 0.7388 (0.8139) grad_norm: 0.4725 (0.6495) time: 5.5848 data: 0.0002 max mem: 71357 +[23:46:34.148019] Epoch: [0] [4120/6500] lr: 0.000032 closs: 0.7714 (0.8141) grad_norm: 0.4684 (0.6490) time: 5.5817 data: 0.0002 max mem: 71357 +[23:47:29.964993] Epoch: [0] [4130/6500] lr: 0.000032 closs: 0.7955 (0.8139) grad_norm: 0.4303 (0.6485) time: 5.5754 data: 0.0001 max mem: 71357 +[23:48:25.655582] Epoch: [0] [4140/6500] lr: 0.000032 closs: 0.6973 (0.8136) grad_norm: 0.4611 (0.6488) time: 5.5753 data: 0.0001 max mem: 71357 +[23:49:21.369932] Epoch: [0] [4150/6500] lr: 0.000032 closs: 0.7174 (0.8135) grad_norm: 0.4709 (0.6485) time: 5.5702 data: 0.0001 max mem: 71357 +[23:50:17.071274] Epoch: [0] [4160/6500] lr: 0.000032 closs: 0.7410 (0.8134) grad_norm: 0.4709 (0.6480) time: 5.5707 data: 0.0001 max mem: 71357 +[23:51:12.793801] Epoch: [0] [4170/6500] lr: 0.000032 closs: 0.6819 (0.8131) grad_norm: 0.4439 (0.6479) time: 5.5711 data: 0.0001 max mem: 71357 +[23:52:08.637169] Epoch: [0] [4180/6500] lr: 0.000032 closs: 0.6791 (0.8130) grad_norm: 0.4330 (0.6475) time: 5.5782 data: 0.0001 max mem: 71357 +[23:53:04.424045] Epoch: [0] [4190/6500] lr: 0.000032 closs: 0.6955 (0.8127) grad_norm: 0.4330 (0.6469) time: 5.5814 data: 0.0002 max mem: 71357 +[23:54:00.361724] Epoch: [0] [4200/6500] lr: 0.000032 closs: 0.7177 (0.8125) grad_norm: 0.3919 (0.6463) time: 5.5861 data: 0.0002 max mem: 71357 +[23:54:56.077357] Epoch: [0] [4210/6500] lr: 0.000032 closs: 0.7655 (0.8125) grad_norm: 0.3950 (0.6458) time: 5.5826 data: 0.0002 max mem: 71357 +[23:55:51.890551] Epoch: [0] [4220/6500] lr: 0.000032 closs: 0.7983 (0.8125) grad_norm: 0.3997 (0.6454) time: 5.5764 data: 0.0002 max mem: 71357 +[23:56:47.608587] Epoch: [0] [4230/6500] lr: 0.000033 closs: 0.8221 (0.8125) grad_norm: 0.4214 (0.6449) time: 5.5765 data: 0.0002 max mem: 71357 +[23:57:43.392132] Epoch: [0] [4240/6500] lr: 0.000033 closs: 0.7227 (0.8123) grad_norm: 0.4214 (0.6444) time: 5.5750 data: 0.0001 max mem: 71357 +[23:58:39.182546] Epoch: [0] [4250/6500] lr: 0.000033 closs: 0.7145 (0.8122) grad_norm: 0.4214 (0.6440) time: 5.5786 data: 0.0001 max mem: 71357 
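The lr column climbs steadily (0.000033 around step 4240 here, up from 0.000020 near step 2560) and plateaus at 0.000050 just as epoch 0 ends, which is consistent with a linear warm-up to a 5e-5 peak over the first ~6500 steps. That schedule is inferred from the logged values, not stated anywhere in the log; a sketch under that assumption:

```python
# Sketch of a linear warm-up consistent with the lr values in this log
# (inferred, not confirmed by the repo): ramp to 5e-5 over ~6500 steps, then hold.
def warmup_lr(step: int, warmup_steps: int = 6500, peak_lr: float = 5e-5) -> float:
    """Linearly ramp the learning rate, then hold it at the peak."""
    return peak_lr * min(step, warmup_steps) / warmup_steps

for s in (2560, 4240, 6440):
    print(s, f"{warmup_lr(s):.6f}")  # ~0.000020, 0.000033, 0.000050 — matches the log
```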
+[23:59:34.962503] Epoch: [0] [4260/6500] lr: 0.000033 closs: 0.7968 (0.8123) grad_norm: 0.4069 (0.6434) time: 5.5784 data: 0.0001 max mem: 71357 +[00:00:30.653249] Epoch: [0] [4270/6500] lr: 0.000033 closs: 0.8109 (0.8125) grad_norm: 0.3782 (0.6429) time: 5.5734 data: 0.0001 max mem: 71357 +[00:01:26.507609] Epoch: [0] [4280/6500] lr: 0.000033 closs: 0.8543 (0.8124) grad_norm: 0.3807 (0.6423) time: 5.5772 data: 0.0001 max mem: 71357 +[00:02:22.266367] Epoch: [0] [4290/6500] lr: 0.000033 closs: 0.7564 (0.8124) grad_norm: 0.3910 (0.6420) time: 5.5806 data: 0.0001 max mem: 71357 +[00:03:18.041885] Epoch: [0] [4300/6500] lr: 0.000033 closs: 0.7627 (0.8123) grad_norm: 0.4018 (0.6417) time: 5.5766 data: 0.0001 max mem: 71357 +[00:04:13.802843] Epoch: [0] [4310/6500] lr: 0.000033 closs: 0.7162 (0.8120) grad_norm: 0.4247 (0.6412) time: 5.5767 data: 0.0001 max mem: 71357 +[00:05:09.467895] Epoch: [0] [4320/6500] lr: 0.000033 closs: 0.6992 (0.8118) grad_norm: 0.4258 (0.6409) time: 5.5712 data: 0.0001 max mem: 71357 +[00:06:05.337634] Epoch: [0] [4330/6500] lr: 0.000033 closs: 0.7296 (0.8117) grad_norm: 0.4258 (0.6405) time: 5.5766 data: 0.0002 max mem: 71357 +[00:07:01.137398] Epoch: [0] [4340/6500] lr: 0.000033 closs: 0.7877 (0.8118) grad_norm: 0.4247 (0.6401) time: 5.5834 data: 0.0002 max mem: 71357 +[00:07:56.821784] Epoch: [0] [4350/6500] lr: 0.000033 closs: 0.8040 (0.8117) grad_norm: 0.4431 (0.6400) time: 5.5741 data: 0.0002 max mem: 71357 +[00:08:52.489372] Epoch: [0] [4360/6500] lr: 0.000034 closs: 0.7670 (0.8116) grad_norm: 0.4431 (0.6398) time: 5.5675 data: 0.0001 max mem: 71357 +[00:09:48.335184] Epoch: [0] [4370/6500] lr: 0.000034 closs: 0.7801 (0.8116) grad_norm: 0.4316 (0.6393) time: 5.5755 data: 0.0002 max mem: 71357 +[00:10:44.089817] Epoch: [0] [4380/6500] lr: 0.000034 closs: 0.7409 (0.8114) grad_norm: 0.4314 (0.6389) time: 5.5799 data: 0.0002 max mem: 71357 +[00:11:39.833736] Epoch: [0] [4390/6500] lr: 0.000034 closs: 0.7122 (0.8114) grad_norm: 0.4360 (0.6385) time: 5.5748 data: 0.0002 max mem: 71357 +[00:12:35.503403] Epoch: [0] [4400/6500] lr: 0.000034 closs: 0.7840 (0.8113) grad_norm: 0.4360 (0.6384) time: 5.5705 data: 0.0001 max mem: 71357 +[00:13:31.186127] Epoch: [0] [4410/6500] lr: 0.000034 closs: 0.7011 (0.8111) grad_norm: 0.4577 (0.6383) time: 5.5675 data: 0.0001 max mem: 71357 +[00:14:26.967773] Epoch: [0] [4420/6500] lr: 0.000034 closs: 0.7771 (0.8112) grad_norm: 0.4687 (0.6380) time: 5.5731 data: 0.0001 max mem: 71357 +[00:15:22.663366] Epoch: [0] [4430/6500] lr: 0.000034 closs: 0.7935 (0.8111) grad_norm: 0.4562 (0.6382) time: 5.5738 data: 0.0001 max mem: 71357 +[00:16:18.415629] Epoch: [0] [4440/6500] lr: 0.000034 closs: 0.7374 (0.8107) grad_norm: 0.4562 (0.6377) time: 5.5723 data: 0.0002 max mem: 71357 +[00:17:14.098438] Epoch: [0] [4450/6500] lr: 0.000034 closs: 0.7220 (0.8107) grad_norm: 0.4404 (0.6374) time: 5.5716 data: 0.0002 max mem: 71357 +[00:18:09.928472] Epoch: [0] [4460/6500] lr: 0.000034 closs: 0.7684 (0.8105) grad_norm: 0.4403 (0.6373) time: 5.5755 data: 0.0001 max mem: 71357 +[00:19:05.687721] Epoch: [0] [4470/6500] lr: 0.000034 closs: 0.7288 (0.8105) grad_norm: 0.4834 (0.6370) time: 5.5793 data: 0.0001 max mem: 71357 +[00:20:01.374623] Epoch: [0] [4480/6500] lr: 0.000034 closs: 0.8508 (0.8105) grad_norm: 0.4678 (0.6365) time: 5.5722 data: 0.0001 max mem: 71357 +[00:20:57.038781] Epoch: [0] [4490/6500] lr: 0.000035 closs: 0.8271 (0.8105) grad_norm: 0.4790 (0.6363) time: 5.5674 data: 0.0001 max mem: 71357 +[00:21:52.724399] Epoch: [0] [4500/6500] lr: 
0.000035 closs: 0.8136 (0.8104) grad_norm: 0.4834 (0.6360) time: 5.5674 data: 0.0001 max mem: 71357 +[00:22:48.517686] Epoch: [0] [4510/6500] lr: 0.000035 closs: 0.7913 (0.8104) grad_norm: 0.4602 (0.6357) time: 5.5738 data: 0.0001 max mem: 71357 +[00:23:44.328287] Epoch: [0] [4520/6500] lr: 0.000035 closs: 0.7664 (0.8102) grad_norm: 0.4774 (0.6353) time: 5.5801 data: 0.0001 max mem: 71357 +[00:24:40.071329] Epoch: [0] [4530/6500] lr: 0.000035 closs: 0.7084 (0.8101) grad_norm: 0.4480 (0.6351) time: 5.5776 data: 0.0001 max mem: 71357 +[00:25:35.770539] Epoch: [0] [4540/6500] lr: 0.000035 closs: 0.6961 (0.8099) grad_norm: 0.4464 (0.6349) time: 5.5720 data: 0.0001 max mem: 71357 +[00:26:31.605185] Epoch: [0] [4550/6500] lr: 0.000035 closs: 0.7117 (0.8096) grad_norm: 0.4464 (0.6353) time: 5.5766 data: 0.0001 max mem: 71357 +[00:27:27.408895] Epoch: [0] [4560/6500] lr: 0.000035 closs: 0.7221 (0.8096) grad_norm: 0.4406 (0.6349) time: 5.5818 data: 0.0001 max mem: 71357 +[00:28:23.168811] Epoch: [0] [4570/6500] lr: 0.000035 closs: 0.7814 (0.8096) grad_norm: 0.4406 (0.6345) time: 5.5781 data: 0.0001 max mem: 71357 +[00:29:18.810132] Epoch: [0] [4580/6500] lr: 0.000035 closs: 0.7730 (0.8094) grad_norm: 0.4818 (0.6345) time: 5.5700 data: 0.0001 max mem: 71357 +[00:30:14.742733] Epoch: [0] [4590/6500] lr: 0.000035 closs: 0.7002 (0.8092) grad_norm: 0.4623 (0.6342) time: 5.5786 data: 0.0001 max mem: 71357 +[00:31:10.481301] Epoch: [0] [4600/6500] lr: 0.000035 closs: 0.7432 (0.8091) grad_norm: 0.4623 (0.6339) time: 5.5835 data: 0.0001 max mem: 71357 +[00:32:06.158824] Epoch: [0] [4610/6500] lr: 0.000035 closs: 0.7692 (0.8091) grad_norm: 0.5206 (0.6339) time: 5.5707 data: 0.0001 max mem: 71357 +[00:33:01.810286] Epoch: [0] [4620/6500] lr: 0.000036 closs: 0.7397 (0.8088) grad_norm: 0.5155 (0.6335) time: 5.5663 data: 0.0001 max mem: 71357 +[00:33:57.482433] Epoch: [0] [4630/6500] lr: 0.000036 closs: 0.6831 (0.8086) grad_norm: 0.5155 (0.6331) time: 5.5661 data: 0.0001 max mem: 71357 +[00:34:53.309586] Epoch: [0] [4640/6500] lr: 0.000036 closs: 0.6817 (0.8084) grad_norm: 0.5155 (0.6328) time: 5.5748 data: 0.0001 max mem: 71357 +[00:35:49.041555] Epoch: [0] [4650/6500] lr: 0.000036 closs: 0.7070 (0.8081) grad_norm: 0.4219 (0.6324) time: 5.5779 data: 0.0001 max mem: 71357 +[00:36:44.777926] Epoch: [0] [4660/6500] lr: 0.000036 closs: 0.7158 (0.8080) grad_norm: 0.4482 (0.6320) time: 5.5733 data: 0.0001 max mem: 71357 +[00:37:40.419372] Epoch: [0] [4670/6500] lr: 0.000036 closs: 0.7453 (0.8080) grad_norm: 0.4652 (0.6324) time: 5.5688 data: 0.0001 max mem: 71357 +[00:38:36.219244] Epoch: [0] [4680/6500] lr: 0.000036 closs: 0.7802 (0.8080) grad_norm: 0.4763 (0.6324) time: 5.5720 data: 0.0001 max mem: 71357 +[00:39:31.970352] Epoch: [0] [4690/6500] lr: 0.000036 closs: 0.7953 (0.8080) grad_norm: 0.5270 (0.6321) time: 5.5774 data: 0.0001 max mem: 71357 +[00:40:27.645439] Epoch: [0] [4700/6500] lr: 0.000036 closs: 0.7213 (0.8078) grad_norm: 0.5270 (0.6317) time: 5.5712 data: 0.0001 max mem: 71357 +[00:41:23.345485] Epoch: [0] [4710/6500] lr: 0.000036 closs: 0.7213 (0.8077) grad_norm: 0.4877 (0.6315) time: 5.5686 data: 0.0002 max mem: 71357 +[00:42:19.100016] Epoch: [0] [4720/6500] lr: 0.000036 closs: 0.7877 (0.8077) grad_norm: 0.4571 (0.6311) time: 5.5726 data: 0.0002 max mem: 71357 +[00:43:14.868291] Epoch: [0] [4730/6500] lr: 0.000036 closs: 0.8066 (0.8077) grad_norm: 0.4571 (0.6311) time: 5.5761 data: 0.0001 max mem: 71357 +[00:44:10.617607] Epoch: [0] [4740/6500] lr: 0.000036 closs: 0.8107 (0.8077) grad_norm: 0.4426 
(0.6306) time: 5.5758 data: 0.0001 max mem: 71357 +[00:45:06.457161] Epoch: [0] [4750/6500] lr: 0.000037 closs: 0.8107 (0.8076) grad_norm: 0.4339 (0.6306) time: 5.5793 data: 0.0001 max mem: 71357 +[00:46:02.224391] Epoch: [0] [4760/6500] lr: 0.000037 closs: 0.7232 (0.8074) grad_norm: 0.4325 (0.6302) time: 5.5802 data: 0.0002 max mem: 71357 +[00:46:58.073901] Epoch: [0] [4770/6500] lr: 0.000037 closs: 0.6645 (0.8072) grad_norm: 0.3619 (0.6297) time: 5.5807 data: 0.0002 max mem: 71357 +[00:47:53.806339] Epoch: [0] [4780/6500] lr: 0.000037 closs: 0.7087 (0.8071) grad_norm: 0.3475 (0.6292) time: 5.5790 data: 0.0001 max mem: 71357 +[00:48:49.478935] Epoch: [0] [4790/6500] lr: 0.000037 closs: 0.8572 (0.8073) grad_norm: 0.4056 (0.6289) time: 5.5701 data: 0.0001 max mem: 71357 +[00:49:45.256418] Epoch: [0] [4800/6500] lr: 0.000037 closs: 0.7900 (0.8071) grad_norm: 0.4126 (0.6285) time: 5.5724 data: 0.0001 max mem: 71357 +[00:50:41.081781] Epoch: [0] [4810/6500] lr: 0.000037 closs: 0.7765 (0.8071) grad_norm: 0.4338 (0.6283) time: 5.5800 data: 0.0001 max mem: 71357 +[00:51:36.744745] Epoch: [0] [4820/6500] lr: 0.000037 closs: 0.8162 (0.8073) grad_norm: 0.4591 (0.6280) time: 5.5743 data: 0.0001 max mem: 71357 +[00:52:32.438026] Epoch: [0] [4830/6500] lr: 0.000037 closs: 0.8205 (0.8073) grad_norm: 0.4549 (0.6276) time: 5.5677 data: 0.0001 max mem: 71357 +[00:53:28.073336] Epoch: [0] [4840/6500] lr: 0.000037 closs: 0.8205 (0.8074) grad_norm: 0.4776 (0.6276) time: 5.5664 data: 0.0001 max mem: 71357 +[00:54:23.863171] Epoch: [0] [4850/6500] lr: 0.000037 closs: 0.8377 (0.8073) grad_norm: 0.4549 (0.6272) time: 5.5712 data: 0.0001 max mem: 71357 +[00:55:19.616326] Epoch: [0] [4860/6500] lr: 0.000037 closs: 0.7501 (0.8072) grad_norm: 0.4700 (0.6269) time: 5.5771 data: 0.0001 max mem: 71357 +[00:56:15.311381] Epoch: [0] [4870/6500] lr: 0.000037 closs: 0.7358 (0.8071) grad_norm: 0.4839 (0.6266) time: 5.5723 data: 0.0002 max mem: 71357 +[00:57:10.987223] Epoch: [0] [4880/6500] lr: 0.000038 closs: 0.7905 (0.8072) grad_norm: 0.4700 (0.6265) time: 5.5684 data: 0.0002 max mem: 71357 +[00:58:06.697477] Epoch: [0] [4890/6500] lr: 0.000038 closs: 0.8178 (0.8072) grad_norm: 0.4622 (0.6262) time: 5.5692 data: 0.0001 max mem: 71357 +[00:59:02.438896] Epoch: [0] [4900/6500] lr: 0.000038 closs: 0.8119 (0.8072) grad_norm: 0.4416 (0.6260) time: 5.5725 data: 0.0001 max mem: 71357 +[00:59:58.097865] Epoch: [0] [4910/6500] lr: 0.000038 closs: 0.7644 (0.8072) grad_norm: 0.4335 (0.6256) time: 5.5699 data: 0.0001 max mem: 71357 +[01:00:53.757132] Epoch: [0] [4920/6500] lr: 0.000038 closs: 0.7307 (0.8071) grad_norm: 0.4416 (0.6254) time: 5.5658 data: 0.0002 max mem: 71357 +[01:01:49.419719] Epoch: [0] [4930/6500] lr: 0.000038 closs: 0.7684 (0.8071) grad_norm: 0.4416 (0.6251) time: 5.5660 data: 0.0002 max mem: 71357 +[01:02:45.018759] Epoch: [0] [4940/6500] lr: 0.000038 closs: 0.8007 (0.8071) grad_norm: 0.4659 (0.6250) time: 5.5630 data: 0.0001 max mem: 71357 +[01:03:40.831339] Epoch: [0] [4950/6500] lr: 0.000038 closs: 0.7903 (0.8070) grad_norm: 0.4948 (0.6247) time: 5.5705 data: 0.0001 max mem: 71357 +[01:04:36.520963] Epoch: [0] [4960/6500] lr: 0.000038 closs: 0.7722 (0.8068) grad_norm: 0.4659 (0.6243) time: 5.5750 data: 0.0001 max mem: 71357 +[01:05:32.153893] Epoch: [0] [4970/6500] lr: 0.000038 closs: 0.7275 (0.8067) grad_norm: 0.4508 (0.6239) time: 5.5660 data: 0.0001 max mem: 71357 +[01:06:27.840249] Epoch: [0] [4980/6500] lr: 0.000038 closs: 0.7662 (0.8067) grad_norm: 0.4232 (0.6235) time: 5.5658 data: 0.0002 max mem: 71357 
+[01:07:23.696178] Epoch: [0] [4990/6500] lr: 0.000038 closs: 0.7900 (0.8066) grad_norm: 0.4029 (0.6231) time: 5.5770 data: 0.0002 max mem: 71357 +[01:08:19.291258] Epoch: [0] [5000/6500] lr: 0.000038 closs: 0.6995 (0.8064) grad_norm: 0.4029 (0.6229) time: 5.5725 data: 0.0001 max mem: 71357 +[01:09:14.973960] Epoch: [0] [5010/6500] lr: 0.000039 closs: 0.6936 (0.8064) grad_norm: 0.4029 (0.6225) time: 5.5638 data: 0.0001 max mem: 71357 +[01:10:10.617873] Epoch: [0] [5020/6500] lr: 0.000039 closs: 0.7353 (0.8063) grad_norm: 0.4315 (0.6223) time: 5.5663 data: 0.0001 max mem: 71357 +[01:11:06.449934] Epoch: [0] [5030/6500] lr: 0.000039 closs: 0.8164 (0.8063) grad_norm: 0.4530 (0.6219) time: 5.5737 data: 0.0001 max mem: 71357 +[01:12:02.146675] Epoch: [0] [5040/6500] lr: 0.000039 closs: 0.7967 (0.8062) grad_norm: 0.4530 (0.6215) time: 5.5763 data: 0.0001 max mem: 71357 +[01:12:57.852880] Epoch: [0] [5050/6500] lr: 0.000039 closs: 0.7883 (0.8062) grad_norm: 0.4232 (0.6211) time: 5.5700 data: 0.0001 max mem: 71357 +[01:13:53.575353] Epoch: [0] [5060/6500] lr: 0.000039 closs: 0.7221 (0.8059) grad_norm: 0.4012 (0.6207) time: 5.5713 data: 0.0001 max mem: 71357 +[01:14:49.232927] Epoch: [0] [5070/6500] lr: 0.000039 closs: 0.6839 (0.8059) grad_norm: 0.3975 (0.6202) time: 5.5689 data: 0.0001 max mem: 71357 +[01:15:45.028965] Epoch: [0] [5080/6500] lr: 0.000039 closs: 0.7101 (0.8057) grad_norm: 0.4012 (0.6198) time: 5.5726 data: 0.0001 max mem: 71357 +[01:16:40.801994] Epoch: [0] [5090/6500] lr: 0.000039 closs: 0.7101 (0.8056) grad_norm: 0.3859 (0.6193) time: 5.5783 data: 0.0001 max mem: 71357 +[01:17:36.606893] Epoch: [0] [5100/6500] lr: 0.000039 closs: 0.7536 (0.8056) grad_norm: 0.3690 (0.6189) time: 5.5788 data: 0.0001 max mem: 71357 +[01:18:32.252167] Epoch: [0] [5110/6500] lr: 0.000039 closs: 0.7552 (0.8055) grad_norm: 0.3921 (0.6186) time: 5.5724 data: 0.0001 max mem: 71357 +[01:19:28.118493] Epoch: [0] [5120/6500] lr: 0.000039 closs: 0.7621 (0.8055) grad_norm: 0.3921 (0.6181) time: 5.5755 data: 0.0001 max mem: 71357 +[01:20:23.806918] Epoch: [0] [5130/6500] lr: 0.000039 closs: 0.7900 (0.8054) grad_norm: 0.3971 (0.6178) time: 5.5776 data: 0.0001 max mem: 71357 +[01:21:19.560680] Epoch: [0] [5140/6500] lr: 0.000040 closs: 0.7998 (0.8055) grad_norm: 0.4023 (0.6177) time: 5.5720 data: 0.0002 max mem: 71357 +[01:22:15.265374] Epoch: [0] [5150/6500] lr: 0.000040 closs: 0.8014 (0.8056) grad_norm: 0.4023 (0.6174) time: 5.5728 data: 0.0002 max mem: 71357 +[01:23:10.920939] Epoch: [0] [5160/6500] lr: 0.000040 closs: 0.8151 (0.8056) grad_norm: 0.4297 (0.6172) time: 5.5679 data: 0.0001 max mem: 71357 +[01:24:06.835511] Epoch: [0] [5170/6500] lr: 0.000040 closs: 0.7360 (0.8054) grad_norm: 0.4297 (0.6169) time: 5.5784 data: 0.0001 max mem: 71357 +[01:25:02.500269] Epoch: [0] [5180/6500] lr: 0.000040 closs: 0.7251 (0.8053) grad_norm: 0.4110 (0.6165) time: 5.5789 data: 0.0001 max mem: 71357 +[01:25:58.200527] Epoch: [0] [5190/6500] lr: 0.000040 closs: 0.7061 (0.8051) grad_norm: 0.4110 (0.6162) time: 5.5681 data: 0.0001 max mem: 71357 +[01:26:53.995989] Epoch: [0] [5200/6500] lr: 0.000040 closs: 0.7137 (0.8051) grad_norm: 0.3776 (0.6158) time: 5.5747 data: 0.0001 max mem: 71357 +[01:27:49.727844] Epoch: [0] [5210/6500] lr: 0.000040 closs: 0.7441 (0.8049) grad_norm: 0.3776 (0.6156) time: 5.5762 data: 0.0001 max mem: 71357 +[01:28:45.507070] Epoch: [0] [5220/6500] lr: 0.000040 closs: 0.7577 (0.8049) grad_norm: 0.3804 (0.6153) time: 5.5754 data: 0.0001 max mem: 71357 +[01:29:41.159681] Epoch: [0] [5230/6500] lr: 
0.000040 closs: 0.7887 (0.8048) grad_norm: 0.3804 (0.6169) time: 5.5715 data: 0.0001 max mem: 71357 +[01:30:36.804523] Epoch: [0] [5240/6500] lr: 0.000040 closs: 0.7531 (0.8048) grad_norm: 0.4002 (0.6164) time: 5.5648 data: 0.0001 max mem: 71357 +[01:31:32.606137] Epoch: [0] [5250/6500] lr: 0.000040 closs: 0.7744 (0.8048) grad_norm: 0.4071 (0.6161) time: 5.5722 data: 0.0001 max mem: 71357 +[01:32:28.301729] Epoch: [0] [5260/6500] lr: 0.000040 closs: 0.8277 (0.8048) grad_norm: 0.4002 (0.6158) time: 5.5747 data: 0.0001 max mem: 71357 +[01:33:24.112652] Epoch: [0] [5270/6500] lr: 0.000041 closs: 0.7664 (0.8048) grad_norm: 0.3901 (0.6154) time: 5.5752 data: 0.0001 max mem: 71357 +[01:34:19.868194] Epoch: [0] [5280/6500] lr: 0.000041 closs: 0.7340 (0.8046) grad_norm: 0.3901 (0.6151) time: 5.5782 data: 0.0001 max mem: 71357 +[01:35:15.606432] Epoch: [0] [5290/6500] lr: 0.000041 closs: 0.6905 (0.8045) grad_norm: 0.3896 (0.6146) time: 5.5746 data: 0.0001 max mem: 71357 +[01:36:11.383142] Epoch: [0] [5300/6500] lr: 0.000041 closs: 0.7767 (0.8045) grad_norm: 0.3896 (0.6143) time: 5.5757 data: 0.0001 max mem: 71357 +[01:37:07.009457] Epoch: [0] [5310/6500] lr: 0.000041 closs: 0.7287 (0.8044) grad_norm: 0.3921 (0.6145) time: 5.5701 data: 0.0001 max mem: 71357 +[01:38:02.669739] Epoch: [0] [5320/6500] lr: 0.000041 closs: 0.7413 (0.8044) grad_norm: 0.4500 (0.6148) time: 5.5642 data: 0.0001 max mem: 71357 +[01:38:58.422769] Epoch: [0] [5330/6500] lr: 0.000041 closs: 0.7950 (0.8044) grad_norm: 0.4939 (0.6144) time: 5.5706 data: 0.0001 max mem: 71357 +[01:39:54.248251] Epoch: [0] [5340/6500] lr: 0.000041 closs: 0.7414 (0.8043) grad_norm: 0.5255 (0.6143) time: 5.5788 data: 0.0001 max mem: 71357 +[01:40:50.000851] Epoch: [0] [5350/6500] lr: 0.000041 closs: 0.7228 (0.8042) grad_norm: 0.4853 (0.6139) time: 5.5788 data: 0.0001 max mem: 71357 +[01:41:45.707984] Epoch: [0] [5360/6500] lr: 0.000041 closs: 0.7594 (0.8041) grad_norm: 0.4699 (0.6136) time: 5.5729 data: 0.0001 max mem: 71357 +[01:42:41.349859] Epoch: [0] [5370/6500] lr: 0.000041 closs: 0.7594 (0.8040) grad_norm: 0.4699 (0.6133) time: 5.5673 data: 0.0001 max mem: 71357 +[01:43:37.059969] Epoch: [0] [5380/6500] lr: 0.000041 closs: 0.7105 (0.8039) grad_norm: 0.4386 (0.6130) time: 5.5675 data: 0.0001 max mem: 71357 +[01:44:32.852298] Epoch: [0] [5390/6500] lr: 0.000041 closs: 0.7781 (0.8039) grad_norm: 0.4505 (0.6128) time: 5.5750 data: 0.0001 max mem: 71357 +[01:45:28.525634] Epoch: [0] [5400/6500] lr: 0.000042 closs: 0.7665 (0.8038) grad_norm: 0.4386 (0.6126) time: 5.5732 data: 0.0001 max mem: 71357 +[01:46:24.224107] Epoch: [0] [5410/6500] lr: 0.000042 closs: 0.7346 (0.8037) grad_norm: 0.4365 (0.6124) time: 5.5685 data: 0.0002 max mem: 71357 +[01:47:19.853525] Epoch: [0] [5420/6500] lr: 0.000042 closs: 0.7093 (0.8034) grad_norm: 0.5518 (0.6123) time: 5.5663 data: 0.0002 max mem: 71357 +[01:48:15.651981] Epoch: [0] [5430/6500] lr: 0.000042 closs: 0.6989 (0.8033) grad_norm: 0.4783 (0.6121) time: 5.5713 data: 0.0001 max mem: 71357 +[01:49:11.451075] Epoch: [0] [5440/6500] lr: 0.000042 closs: 0.7255 (0.8033) grad_norm: 0.4431 (0.6119) time: 5.5797 data: 0.0001 max mem: 71357 +[01:50:07.107339] Epoch: [0] [5450/6500] lr: 0.000042 closs: 0.7646 (0.8033) grad_norm: 0.4431 (0.6116) time: 5.5726 data: 0.0001 max mem: 71357 +[01:51:02.882550] Epoch: [0] [5460/6500] lr: 0.000042 closs: 0.7432 (0.8032) grad_norm: 0.4143 (0.6113) time: 5.5715 data: 0.0002 max mem: 71357 +[01:51:58.596127] Epoch: [0] [5470/6500] lr: 0.000042 closs: 0.7326 (0.8032) grad_norm: 0.4256 
(0.6112) time: 5.5744 data: 0.0002 max mem: 71357 +[01:52:54.291718] Epoch: [0] [5480/6500] lr: 0.000042 closs: 0.7713 (0.8033) grad_norm: 0.4343 (0.6207) time: 5.5704 data: 0.0001 max mem: 71357 +[01:53:50.039357] Epoch: [0] [5490/6500] lr: 0.000042 closs: 0.7713 (0.8033) grad_norm: 0.4256 (0.6203) time: 5.5720 data: 0.0001 max mem: 71357 +[01:54:45.630943] Epoch: [0] [5500/6500] lr: 0.000042 closs: 0.7862 (0.8032) grad_norm: 0.4718 (0.6201) time: 5.5668 data: 0.0001 max mem: 71357 +[01:55:41.348185] Epoch: [0] [5510/6500] lr: 0.000042 closs: 0.7569 (0.8031) grad_norm: 0.4718 (0.6200) time: 5.5653 data: 0.0001 max mem: 71357 +[01:56:37.065250] Epoch: [0] [5520/6500] lr: 0.000042 closs: 0.7050 (0.8030) grad_norm: 0.4453 (0.6197) time: 5.5716 data: 0.0002 max mem: 71357 +[01:57:32.797146] Epoch: [0] [5530/6500] lr: 0.000043 closs: 0.7246 (0.8030) grad_norm: 0.4408 (0.6194) time: 5.5723 data: 0.0002 max mem: 71357 +[01:58:28.499852] Epoch: [0] [5540/6500] lr: 0.000043 closs: 0.7876 (0.8030) grad_norm: 0.4408 (0.6191) time: 5.5716 data: 0.0001 max mem: 71357 +[01:59:24.128893] Epoch: [0] [5550/6500] lr: 0.000043 closs: 0.7692 (0.8030) grad_norm: 0.4469 (0.6190) time: 5.5665 data: 0.0001 max mem: 71357 +[02:00:19.963717] Epoch: [0] [5560/6500] lr: 0.000043 closs: 0.6882 (0.8026) grad_norm: 0.4469 (0.6187) time: 5.5731 data: 0.0001 max mem: 71357 +[02:01:15.615241] Epoch: [0] [5570/6500] lr: 0.000043 closs: 0.7543 (0.8027) grad_norm: 0.4919 (0.6188) time: 5.5742 data: 0.0002 max mem: 71357 +[02:02:11.280741] Epoch: [0] [5580/6500] lr: 0.000043 closs: 0.7857 (0.8028) grad_norm: 0.5595 (0.6187) time: 5.5657 data: 0.0002 max mem: 71357 +[02:03:06.899155] Epoch: [0] [5590/6500] lr: 0.000043 closs: 0.7333 (0.8028) grad_norm: 0.5595 (0.6185) time: 5.5641 data: 0.0001 max mem: 71357 +[02:04:02.557866] Epoch: [0] [5600/6500] lr: 0.000043 closs: 0.7701 (0.8029) grad_norm: 0.4936 (0.6182) time: 5.5637 data: 0.0001 max mem: 71357 +[02:04:58.346233] Epoch: [0] [5610/6500] lr: 0.000043 closs: 0.7756 (0.8028) grad_norm: 0.4655 (0.6179) time: 5.5722 data: 0.0001 max mem: 71357 +[02:05:54.107394] Epoch: [0] [5620/6500] lr: 0.000043 closs: 0.7756 (0.8027) grad_norm: 0.4211 (0.6174) time: 5.5774 data: 0.0001 max mem: 71357 +[02:06:49.764956] Epoch: [0] [5630/6500] lr: 0.000043 closs: 0.7647 (0.8026) grad_norm: 0.4478 (0.6173) time: 5.5708 data: 0.0001 max mem: 71357 +[02:07:45.456020] Epoch: [0] [5640/6500] lr: 0.000043 closs: 0.7647 (0.8025) grad_norm: 0.4073 (0.6169) time: 5.5673 data: 0.0001 max mem: 71357 +[02:08:41.296090] Epoch: [0] [5650/6500] lr: 0.000043 closs: 0.7760 (0.8025) grad_norm: 0.3969 (0.6166) time: 5.5764 data: 0.0001 max mem: 71357 +[02:09:37.090246] Epoch: [0] [5660/6500] lr: 0.000044 closs: 0.7567 (0.8025) grad_norm: 0.4379 (0.6163) time: 5.5816 data: 0.0001 max mem: 71357 +[02:10:32.741323] Epoch: [0] [5670/6500] lr: 0.000044 closs: 0.7353 (0.8024) grad_norm: 0.4004 (0.6161) time: 5.5721 data: 0.0001 max mem: 71357 +[02:11:28.424749] Epoch: [0] [5680/6500] lr: 0.000044 closs: 0.8031 (0.8024) grad_norm: 0.4379 (0.6159) time: 5.5666 data: 0.0002 max mem: 71357 +[02:12:24.285486] Epoch: [0] [5690/6500] lr: 0.000044 closs: 0.7536 (0.8023) grad_norm: 0.4379 (0.6155) time: 5.5771 data: 0.0002 max mem: 71357 +[02:13:19.918884] Epoch: [0] [5700/6500] lr: 0.000044 closs: 0.6936 (0.8021) grad_norm: 0.4161 (0.6153) time: 5.5746 data: 0.0001 max mem: 71357 +[02:14:15.599778] Epoch: [0] [5710/6500] lr: 0.000044 closs: 0.6915 (0.8019) grad_norm: 0.4161 (0.6149) time: 5.5656 data: 0.0001 max mem: 71357 
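A quick consistency check on the timing column: iterations run at roughly 5.57–5.58 s each, so a 6500-step epoch should take about ten hours, matching the `Total time: 10:04:03` reported when epoch 0 finishes below.

```python
# Back-of-the-envelope check: 6500 iterations at ~5.576 s each
# is close to the epoch's reported "Total time: 10:04:03" (36243 s).
seconds = 6500 * 5.576
print(f"{seconds:.0f} s ≈ {seconds/3600:.2f} h")  # 36244 s ≈ 10.07 h
```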
+[02:15:11.331649] Epoch: [0] [5720/6500] lr: 0.000044 closs: 0.7293 (0.8019) grad_norm: 0.4047 (0.6147) time: 5.5705 data: 0.0001 max mem: 71357 +[02:16:07.152160] Epoch: [0] [5730/6500] lr: 0.000044 closs: 0.8008 (0.8020) grad_norm: 0.3836 (0.6144) time: 5.5775 data: 0.0002 max mem: 71357 +[02:17:03.066211] Epoch: [0] [5740/6500] lr: 0.000044 closs: 0.8080 (0.8020) grad_norm: 0.3720 (0.6140) time: 5.5866 data: 0.0002 max mem: 71357 +[02:17:58.785745] Epoch: [0] [5750/6500] lr: 0.000044 closs: 0.7575 (0.8018) grad_norm: 0.3820 (0.6138) time: 5.5816 data: 0.0001 max mem: 71357 +[02:18:54.476351] Epoch: [0] [5760/6500] lr: 0.000044 closs: 0.7069 (0.8017) grad_norm: 0.3731 (0.6134) time: 5.5704 data: 0.0001 max mem: 71357 +[02:19:50.250044] Epoch: [0] [5770/6500] lr: 0.000044 closs: 0.7394 (0.8016) grad_norm: 0.3789 (0.6132) time: 5.5731 data: 0.0001 max mem: 71357 +[02:20:46.124223] Epoch: [0] [5780/6500] lr: 0.000044 closs: 0.7709 (0.8015) grad_norm: 0.3789 (0.6128) time: 5.5823 data: 0.0001 max mem: 71357 +[02:21:41.877082] Epoch: [0] [5790/6500] lr: 0.000045 closs: 0.7787 (0.8014) grad_norm: 0.3731 (0.6127) time: 5.5813 data: 0.0002 max mem: 71357 +[02:22:37.579383] Epoch: [0] [5800/6500] lr: 0.000045 closs: 0.7921 (0.8014) grad_norm: 0.3789 (0.6125) time: 5.5727 data: 0.0002 max mem: 71357 +[02:23:33.334032] Epoch: [0] [5810/6500] lr: 0.000045 closs: 0.7929 (0.8014) grad_norm: 0.3504 (0.6121) time: 5.5727 data: 0.0001 max mem: 71357 +[02:24:29.023544] Epoch: [0] [5820/6500] lr: 0.000045 closs: 0.7977 (0.8014) grad_norm: 0.3907 (0.6119) time: 5.5721 data: 0.0001 max mem: 71357 +[02:25:24.861645] Epoch: [0] [5830/6500] lr: 0.000045 closs: 0.7955 (0.8014) grad_norm: 0.4042 (0.6119) time: 5.5763 data: 0.0001 max mem: 71357 +[02:26:20.652863] Epoch: [0] [5840/6500] lr: 0.000045 closs: 0.7487 (0.8014) grad_norm: 0.3819 (0.6115) time: 5.5813 data: 0.0001 max mem: 71357 +[02:27:16.301506] Epoch: [0] [5850/6500] lr: 0.000045 closs: 0.7196 (0.8012) grad_norm: 0.3847 (0.6113) time: 5.5719 data: 0.0001 max mem: 71357 +[02:28:12.058501] Epoch: [0] [5860/6500] lr: 0.000045 closs: 0.7021 (0.8011) grad_norm: 0.3812 (0.6109) time: 5.5702 data: 0.0001 max mem: 71357 +[02:29:07.841507] Epoch: [0] [5870/6500] lr: 0.000045 closs: 0.7515 (0.8010) grad_norm: 0.3859 (0.6106) time: 5.5769 data: 0.0001 max mem: 71357 +[02:30:03.483436] Epoch: [0] [5880/6500] lr: 0.000045 closs: 0.7809 (0.8010) grad_norm: 0.4351 (0.6106) time: 5.5711 data: 0.0001 max mem: 71357 +[02:30:59.287191] Epoch: [0] [5890/6500] lr: 0.000045 closs: 0.7648 (0.8009) grad_norm: 0.4266 (0.6104) time: 5.5722 data: 0.0001 max mem: 71357 +[02:31:55.068642] Epoch: [0] [5900/6500] lr: 0.000045 closs: 0.7531 (0.8008) grad_norm: 0.4351 (0.6100) time: 5.5791 data: 0.0001 max mem: 71357 +[02:32:50.950525] Epoch: [0] [5910/6500] lr: 0.000045 closs: 0.7531 (0.8007) grad_norm: 0.4113 (0.6096) time: 5.5831 data: 0.0001 max mem: 71357 +[02:33:46.661662] Epoch: [0] [5920/6500] lr: 0.000046 closs: 0.7102 (0.8005) grad_norm: 0.3950 (0.6096) time: 5.5796 data: 0.0001 max mem: 71357 +[02:34:42.406758] Epoch: [0] [5930/6500] lr: 0.000046 closs: 0.7361 (0.8005) grad_norm: 0.3965 (0.6096) time: 5.5727 data: 0.0001 max mem: 71357 +[02:35:38.231017] Epoch: [0] [5940/6500] lr: 0.000046 closs: 0.8309 (0.8006) grad_norm: 0.3901 (0.6092) time: 5.5783 data: 0.0001 max mem: 71357 +[02:36:34.027442] Epoch: [0] [5950/6500] lr: 0.000046 closs: 0.8309 (0.8005) grad_norm: 0.3965 (0.6088) time: 5.5809 data: 0.0002 max mem: 71357 +[02:37:30.000045] Epoch: [0] [5960/6500] lr: 
0.000046 closs: 0.7825 (0.8006) grad_norm: 0.3745 (0.6084) time: 5.5883 data: 0.0002 max mem: 71357 +[02:38:25.764491] Epoch: [0] [5970/6500] lr: 0.000046 closs: 0.8335 (0.8005) grad_norm: 0.3616 (0.6081) time: 5.5868 data: 0.0001 max mem: 71357 +[02:39:21.355752] Epoch: [0] [5980/6500] lr: 0.000046 closs: 0.8265 (0.8005) grad_norm: 0.3835 (0.6082) time: 5.5677 data: 0.0001 max mem: 71357 +[02:40:17.146222] Epoch: [0] [5990/6500] lr: 0.000046 closs: 0.7573 (0.8004) grad_norm: 0.3745 (0.6079) time: 5.5690 data: 0.0001 max mem: 71357 +[02:41:12.953638] Epoch: [0] [6000/6500] lr: 0.000046 closs: 0.7502 (0.8004) grad_norm: 0.3969 (0.6075) time: 5.5798 data: 0.0002 max mem: 71357 +[02:42:08.673781] Epoch: [0] [6010/6500] lr: 0.000046 closs: 0.7498 (0.8004) grad_norm: 0.3891 (0.6072) time: 5.5762 data: 0.0002 max mem: 71357 +[02:43:04.453165] Epoch: [0] [6020/6500] lr: 0.000046 closs: 0.7474 (0.8002) grad_norm: 0.3891 (0.6068) time: 5.5749 data: 0.0001 max mem: 71357 +[02:44:00.201410] Epoch: [0] [6030/6500] lr: 0.000046 closs: 0.7241 (0.8002) grad_norm: 0.3969 (0.6066) time: 5.5763 data: 0.0001 max mem: 71357 +[02:44:55.881083] Epoch: [0] [6040/6500] lr: 0.000046 closs: 0.7288 (0.8002) grad_norm: 0.4122 (0.6063) time: 5.5713 data: 0.0001 max mem: 71357 +[02:45:51.715473] Epoch: [0] [6050/6500] lr: 0.000047 closs: 0.7671 (0.8000) grad_norm: 0.4315 (0.6062) time: 5.5756 data: 0.0001 max mem: 71357 +[02:46:47.325180] Epoch: [0] [6060/6500] lr: 0.000047 closs: 0.7667 (0.7999) grad_norm: 0.4321 (0.6060) time: 5.5721 data: 0.0002 max mem: 71357 +[02:47:42.956637] Epoch: [0] [6070/6500] lr: 0.000047 closs: 0.7482 (0.7999) grad_norm: 0.4387 (0.6060) time: 5.5620 data: 0.0002 max mem: 71357 +[02:48:38.729897] Epoch: [0] [6080/6500] lr: 0.000047 closs: 0.7457 (0.7999) grad_norm: 0.4835 (0.6058) time: 5.5701 data: 0.0001 max mem: 71357 +[02:49:34.468997] Epoch: [0] [6090/6500] lr: 0.000047 closs: 0.7455 (0.7998) grad_norm: 0.5015 (0.6058) time: 5.5755 data: 0.0001 max mem: 71357 +[02:50:30.184089] Epoch: [0] [6100/6500] lr: 0.000047 closs: 0.7606 (0.7998) grad_norm: 0.5218 (0.6058) time: 5.5726 data: 0.0001 max mem: 71357 +[02:51:25.929815] Epoch: [0] [6110/6500] lr: 0.000047 closs: 0.8044 (0.7998) grad_norm: 0.4483 (0.6054) time: 5.5729 data: 0.0002 max mem: 71357 +[02:52:21.562615] Epoch: [0] [6120/6500] lr: 0.000047 closs: 0.7751 (0.7998) grad_norm: 0.4406 (0.6052) time: 5.5688 data: 0.0002 max mem: 71357 +[02:53:17.515875] Epoch: [0] [6130/6500] lr: 0.000047 closs: 0.8261 (0.7999) grad_norm: 0.3885 (0.6049) time: 5.5792 data: 0.0001 max mem: 71357 +[02:54:13.223511] Epoch: [0] [6140/6500] lr: 0.000047 closs: 0.8261 (0.7999) grad_norm: 0.3792 (0.6049) time: 5.5830 data: 0.0001 max mem: 71357 +[02:55:08.927662] Epoch: [0] [6150/6500] lr: 0.000047 closs: 0.7420 (0.7997) grad_norm: 0.3861 (0.6051) time: 5.5705 data: 0.0001 max mem: 71357 +[02:56:04.619859] Epoch: [0] [6160/6500] lr: 0.000047 closs: 0.7420 (0.7997) grad_norm: 0.3739 (0.6047) time: 5.5697 data: 0.0001 max mem: 71357 +[02:57:00.344520] Epoch: [0] [6170/6500] lr: 0.000047 closs: 0.7830 (0.7997) grad_norm: 0.3740 (0.6044) time: 5.5707 data: 0.0001 max mem: 71357 +[02:57:56.275405] Epoch: [0] [6180/6500] lr: 0.000048 closs: 0.8380 (0.7997) grad_norm: 0.3740 (0.6042) time: 5.5827 data: 0.0001 max mem: 71357 +[02:58:52.075593] Epoch: [0] [6190/6500] lr: 0.000048 closs: 0.7414 (0.7996) grad_norm: 0.3430 (0.6037) time: 5.5865 data: 0.0001 max mem: 71357 +[02:59:47.687443] Epoch: [0] [6200/6500] lr: 0.000048 closs: 0.7333 (0.7995) grad_norm: 0.3656 
(0.6035) time: 5.5705 data: 0.0001 max mem: 71357 +[03:00:43.414828] Epoch: [0] [6210/6500] lr: 0.000048 closs: 0.7467 (0.7997) grad_norm: 0.3530 (0.6032) time: 5.5669 data: 0.0001 max mem: 71357 +[03:01:39.227260] Epoch: [0] [6220/6500] lr: 0.000048 closs: 0.7170 (0.7994) grad_norm: 0.3656 (0.6029) time: 5.5769 data: 0.0001 max mem: 71357 +[03:02:35.020515] Epoch: [0] [6230/6500] lr: 0.000048 closs: 0.6600 (0.7994) grad_norm: 0.4024 (0.6027) time: 5.5802 data: 0.0001 max mem: 71357 +[03:03:30.712438] Epoch: [0] [6240/6500] lr: 0.000048 closs: 0.6600 (0.7993) grad_norm: 0.4024 (0.6024) time: 5.5742 data: 0.0001 max mem: 71357 +[03:04:26.392385] Epoch: [0] [6250/6500] lr: 0.000048 closs: 0.7017 (0.7992) grad_norm: 0.4194 (0.6022) time: 5.5685 data: 0.0001 max mem: 71357 +[03:05:22.220010] Epoch: [0] [6260/6500] lr: 0.000048 closs: 0.7189 (0.7991) grad_norm: 0.4248 (0.6019) time: 5.5753 data: 0.0001 max mem: 71357 +[03:06:18.144391] Epoch: [0] [6270/6500] lr: 0.000048 closs: 0.7910 (0.7992) grad_norm: 0.4248 (0.6016) time: 5.5875 data: 0.0002 max mem: 71357 +[03:07:13.838284] Epoch: [0] [6280/6500] lr: 0.000048 closs: 0.7742 (0.7991) grad_norm: 0.4084 (0.6013) time: 5.5808 data: 0.0002 max mem: 71357 +[03:08:09.622986] Epoch: [0] [6290/6500] lr: 0.000048 closs: 0.7474 (0.7991) grad_norm: 0.3958 (0.6010) time: 5.5738 data: 0.0001 max mem: 71357 +[03:09:05.363224] Epoch: [0] [6300/6500] lr: 0.000048 closs: 0.7197 (0.7990) grad_norm: 0.3744 (0.6008) time: 5.5761 data: 0.0001 max mem: 71357 +[03:10:01.116113] Epoch: [0] [6310/6500] lr: 0.000049 closs: 0.8256 (0.7990) grad_norm: 0.3958 (0.6005) time: 5.5745 data: 0.0001 max mem: 71357 +[03:10:56.828047] Epoch: [0] [6320/6500] lr: 0.000049 closs: 0.8374 (0.7991) grad_norm: 0.3957 (0.6003) time: 5.5731 data: 0.0001 max mem: 71357 +[03:11:52.510798] Epoch: [0] [6330/6500] lr: 0.000049 closs: 0.7012 (0.7988) grad_norm: 0.4162 (0.6001) time: 5.5696 data: 0.0002 max mem: 71357 +[03:12:48.232518] Epoch: [0] [6340/6500] lr: 0.000049 closs: 0.6872 (0.7987) grad_norm: 0.4371 (0.5999) time: 5.5701 data: 0.0002 max mem: 71357 +[03:13:43.960654] Epoch: [0] [6350/6500] lr: 0.000049 closs: 0.7740 (0.7987) grad_norm: 0.4367 (0.5996) time: 5.5724 data: 0.0001 max mem: 71357 +[03:14:39.727757] Epoch: [0] [6360/6500] lr: 0.000049 closs: 0.7274 (0.7985) grad_norm: 0.4371 (0.5994) time: 5.5747 data: 0.0001 max mem: 71357 +[03:15:35.404672] Epoch: [0] [6370/6500] lr: 0.000049 closs: 0.7655 (0.7986) grad_norm: 0.4357 (0.5992) time: 5.5721 data: 0.0001 max mem: 71357 +[03:16:31.087842] Epoch: [0] [6380/6500] lr: 0.000049 closs: 0.7822 (0.7986) grad_norm: 0.4313 (0.5989) time: 5.5679 data: 0.0002 max mem: 71357 +[03:17:26.822736] Epoch: [0] [6390/6500] lr: 0.000049 closs: 0.7659 (0.7985) grad_norm: 0.4313 (0.5989) time: 5.5708 data: 0.0002 max mem: 71357 +[03:18:22.577493] Epoch: [0] [6400/6500] lr: 0.000049 closs: 0.6885 (0.7984) grad_norm: 0.4265 (0.5989) time: 5.5744 data: 0.0001 max mem: 71357 +[03:19:18.330428] Epoch: [0] [6410/6500] lr: 0.000049 closs: 0.7096 (0.7982) grad_norm: 0.4265 (0.5987) time: 5.5753 data: 0.0001 max mem: 71357 +[03:20:14.076963] Epoch: [0] [6420/6500] lr: 0.000049 closs: 0.7532 (0.7982) grad_norm: 0.4265 (0.5984) time: 5.5749 data: 0.0001 max mem: 71357 +[03:21:09.837780] Epoch: [0] [6430/6500] lr: 0.000049 closs: 0.7365 (0.7980) grad_norm: 0.4245 (0.5982) time: 5.5753 data: 0.0001 max mem: 71357 +[03:22:05.632480] Epoch: [0] [6440/6500] lr: 0.000050 closs: 0.7185 (0.7980) grad_norm: 0.4245 (0.5980) time: 5.5777 data: 0.0001 max mem: 71357 
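Epoch 0 wraps up just below: the averaged stats are printed, the model/optimizer/rank-common/rank-specific checkpoint files are written, and PyTorch warns that `FullyShardedDataParallel.full_optim_state_dict` is deprecated in favor of `FullyShardedDataParallel.optim_state_dict`. A minimal sketch of the replacement call the warning recommends — not the repo's actual saving code; it assumes `model` is FSDP-wrapped, `optimizer` is its optimizer, and the output path is illustrative:

```python
# Minimal sketch (assumed names, not the repo's code): consolidate the
# optimizer state with the API the UserWarning below recommends.
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def save_optimizer(model, optimizer, path="consolidated.00-of-01.optimizer.pth"):
    # Deprecated form, which triggers the warning seen in the log:
    #   osd = FSDP.full_optim_state_dict(model, optimizer)
    osd = FSDP.optim_state_dict(model, optimizer)  # recommended replacement
    if dist.get_rank() == 0:  # all ranks participate; rank 0 writes the file
        torch.save(osd, path)
```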
+[03:23:01.288584] Epoch: [0] [6450/6500] lr: 0.000050 closs: 0.7666 (0.7979) grad_norm: 0.4007 (0.5978) time: 5.5724 data: 0.0001 max mem: 71357 +[03:23:56.940427] Epoch: [0] [6460/6500] lr: 0.000050 closs: 0.7144 (0.7978) grad_norm: 0.3927 (0.5975) time: 5.5653 data: 0.0001 max mem: 71357 +[03:24:52.591248] Epoch: [0] [6470/6500] lr: 0.000050 closs: 0.7340 (0.7977) grad_norm: 0.4434 (0.5979) time: 5.5650 data: 0.0001 max mem: 71357 +[03:25:48.307940] Epoch: [0] [6480/6500] lr: 0.000050 closs: 0.7340 (0.7977) grad_norm: 0.4148 (0.5983) time: 5.5683 data: 0.0001 max mem: 71357 +[03:26:44.126298] Epoch: [0] [6490/6500] lr: 0.000050 closs: 0.7379 (0.7977) grad_norm: 0.4968 (0.5982) time: 5.5767 data: 0.0002 max mem: 71357
+[03:27:34.661033] Epoch: [0] Total time: 10:04:03
+[03:27:34.691953] Averaged stats: lr: 0.000050 closs: 0.7927 (0.7980) grad_norm: 0.4559 (0.5980)
+/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2.
+ warnings.warn(
+[03:27:34.851776] model saved
+/data/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2.
+ warnings.warn(
+[03:27:35.757911] optimizer saved
+[03:27:35.758397] other rank-common saved
+[03:27:35.761691] rank-specific saved
+[03:27:35.770694] log_dir: ./output_dir
+[03:27:43.916967] Epoch: [1] [0/6500] lr: 0.000050 closs: 0.6486 (0.6486) time: 8.1455 data: 2.5052 max mem: 71357 +[03:28:39.555610] Epoch: [1] [10/6500] lr: 0.000050 closs: 0.7239 (0.7205) grad_norm: 0.3962 (0.4080) time: 5.7985 data: 0.2279 max mem: 71357 +[03:29:35.233876] Epoch: [1] [20/6500] lr: 0.000050 closs: 0.8120 (0.7878) grad_norm: 0.4027 (0.4338) time: 5.5657 data: 0.0001 max mem: 71357 +[03:30:31.042993] Epoch: [1] [30/6500] lr: 0.000050 closs: 0.8128 (0.7810) grad_norm: 0.4141 (0.4367) time: 5.5743 data: 0.0001 max mem: 71357 +[03:31:26.778182] Epoch: [1] [40/6500] lr: 0.000050 closs: 0.7600 (0.7738) grad_norm: 0.4286 (0.4379) time: 5.5771 data: 0.0001 max mem: 71357 +[03:32:22.493128] Epoch: [1] [50/6500] lr: 0.000050 closs: 0.7715 (0.7786) grad_norm: 0.4286 (0.4348) time: 5.5724 data: 0.0001 max mem: 71357 +[03:33:18.164718] Epoch: [1] [60/6500] lr: 0.000050 closs: 0.7715 (0.7727) grad_norm: 0.4141 (0.4459) time: 5.5692 data: 0.0001 max mem: 71357 +[03:34:13.800199] Epoch: [1] [70/6500] lr: 0.000050 closs: 0.7570 (0.7741) grad_norm: 0.4129 (0.4442) time: 5.5652 data: 0.0001 max mem: 71357 +[03:35:09.583280] Epoch: [1] [80/6500] lr: 0.000050 closs: 0.7335 (0.7625) grad_norm: 0.4129 (0.4435) time: 5.5708 data: 0.0001 max mem: 71357 +[03:36:05.465429] Epoch: [1] [90/6500] lr: 0.000050 closs: 0.7385 (0.7681) grad_norm: 0.4266 (0.4414) time: 5.5832 data: 0.0001 max mem: 71357 +[03:37:01.196399] Epoch: [1] [100/6500] lr: 0.000050 closs: 0.7711 (0.7741) grad_norm: 0.3775 (0.4462) time: 5.5806 data: 0.0001 max mem: 71357 +[03:37:56.957142] Epoch: [1] [110/6500] lr: 0.000050 closs: 0.7738 (0.7750) grad_norm: 0.3745 (0.4366) time: 5.5745 data: 0.0001 max mem: 71357 +[03:38:52.693091] Epoch: [1] [120/6500]
lr: 0.000050 closs: 0.7361 (0.7703) grad_norm: 0.3729 (0.4461) time: 5.5747 data: 0.0001 max mem: 71357 +[03:39:48.482751] Epoch: [1] [130/6500] lr: 0.000050 closs: 0.7085 (0.7608) grad_norm: 0.3710 (0.4497) time: 5.5762 data: 0.0001 max mem: 71357 +[03:40:44.069389] Epoch: [1] [140/6500] lr: 0.000050 closs: 0.6935 (0.7555) grad_norm: 0.3846 (0.4603) time: 5.5687 data: 0.0001 max mem: 71357 +[03:41:39.808776] Epoch: [1] [150/6500] lr: 0.000050 closs: 0.7341 (0.7603) grad_norm: 0.3978 (0.4595) time: 5.5662 data: 0.0001 max mem: 71357 +[03:42:35.446981] Epoch: [1] [160/6500] lr: 0.000050 closs: 0.8180 (0.7645) grad_norm: 0.4602 (0.4629) time: 5.5688 data: 0.0001 max mem: 71357 +[03:43:31.177187] Epoch: [1] [170/6500] lr: 0.000050 closs: 0.7881 (0.7646) grad_norm: 0.4495 (0.4614) time: 5.5683 data: 0.0001 max mem: 71357 +[03:44:26.897683] Epoch: [1] [180/6500] lr: 0.000050 closs: 0.8055 (0.7686) grad_norm: 0.4495 (0.4677) time: 5.5724 data: 0.0001 max mem: 71357 +[03:45:22.545885] Epoch: [1] [190/6500] lr: 0.000050 closs: 0.7508 (0.7662) grad_norm: 0.4817 (0.4761) time: 5.5683 data: 0.0001 max mem: 71357 +[03:46:18.218652] Epoch: [1] [200/6500] lr: 0.000050 closs: 0.7508 (0.7668) grad_norm: 0.4273 (0.4732) time: 5.5660 data: 0.0001 max mem: 71357 +[03:47:13.969467] Epoch: [1] [210/6500] lr: 0.000050 closs: 0.7564 (0.7684) grad_norm: 0.4665 (0.4759) time: 5.5711 data: 0.0001 max mem: 71357 +[03:48:09.716365] Epoch: [1] [220/6500] lr: 0.000050 closs: 0.7529 (0.7664) grad_norm: 0.4229 (0.4752) time: 5.5748 data: 0.0001 max mem: 71357 +[03:49:05.445622] Epoch: [1] [230/6500] lr: 0.000050 closs: 0.7457 (0.7659) grad_norm: 0.3906 (0.4711) time: 5.5737 data: 0.0001 max mem: 71357 +[03:50:01.108007] Epoch: [1] [240/6500] lr: 0.000050 closs: 0.7287 (0.7642) grad_norm: 0.4106 (0.4707) time: 5.5695 data: 0.0001 max mem: 71357 +[03:50:56.937160] Epoch: [1] [250/6500] lr: 0.000050 closs: 0.7201 (0.7644) grad_norm: 0.4106 (0.6624) time: 5.5745 data: 0.0001 max mem: 71357 +[03:51:52.784701] Epoch: [1] [260/6500] lr: 0.000050 closs: 0.7706 (0.7645) grad_norm: 0.4106 (0.6523) time: 5.5837 data: 0.0001 max mem: 71357 +[03:52:48.462413] Epoch: [1] [270/6500] lr: 0.000050 closs: 0.7976 (0.7668) grad_norm: 0.4267 (0.6459) time: 5.5761 data: 0.0001 max mem: 71357 +[03:53:44.119411] Epoch: [1] [280/6500] lr: 0.000050 closs: 0.7421 (0.7669) grad_norm: 0.4267 (0.6423) time: 5.5666 data: 0.0002 max mem: 71357 +[03:54:39.776151] Epoch: [1] [290/6500] lr: 0.000050 closs: 0.7414 (0.7677) grad_norm: 0.4362 (0.6379) time: 5.5656 data: 0.0002 max mem: 71357 +[03:55:35.536753] Epoch: [1] [300/6500] lr: 0.000050 closs: 0.7593 (0.7666) grad_norm: 0.5156 (0.6368) time: 5.5707 data: 0.0001 max mem: 71357 +[03:56:31.295194] Epoch: [1] [310/6500] lr: 0.000050 closs: 0.7351 (0.7641) grad_norm: 0.4603 (0.6320) time: 5.5758 data: 0.0001 max mem: 71357 +[03:57:26.899484] Epoch: [1] [320/6500] lr: 0.000050 closs: 0.7863 (0.7665) grad_norm: 0.4362 (0.6254) time: 5.5680 data: 0.0001 max mem: 71357 +[03:58:22.621113] Epoch: [1] [330/6500] lr: 0.000050 closs: 0.7725 (0.7660) grad_norm: 0.4302 (0.6213) time: 5.5662 data: 0.0002 max mem: 71357 +[03:59:18.439876] Epoch: [1] [340/6500] lr: 0.000050 closs: 0.7531 (0.7661) grad_norm: 0.3934 (0.6146) time: 5.5769 data: 0.0002 max mem: 71357 +[04:00:14.348413] Epoch: [1] [350/6500] lr: 0.000050 closs: 0.7826 (0.7659) grad_norm: 0.3847 (0.6065) time: 5.5862 data: 0.0001 max mem: 71357 +[04:01:09.980901] Epoch: [1] [360/6500] lr: 0.000050 closs: 0.8143 (0.7690) grad_norm: 0.3870 (0.6030) time: 
5.5769 data: 0.0001 max mem: 71357 +[04:02:05.648563] Epoch: [1] [370/6500] lr: 0.000050 closs: 0.7969 (0.7674) grad_norm: 0.3870 (0.6001) time: 5.5649 data: 0.0001 max mem: 71357 +[04:03:01.348936] Epoch: [1] [380/6500] lr: 0.000050 closs: 0.7290 (0.7683) grad_norm: 0.4082 (0.5945) time: 5.5683 data: 0.0001 max mem: 71357 +[04:03:57.052644] Epoch: [1] [390/6500] lr: 0.000050 closs: 0.7591 (0.7676) grad_norm: 0.4110 (0.5896) time: 5.5701 data: 0.0001 max mem: 71357 +[04:04:52.824329] Epoch: [1] [400/6500] lr: 0.000050 closs: 0.7321 (0.7674) grad_norm: 0.3754 (0.5860) time: 5.5736 data: 0.0001 max mem: 71357 +[04:05:48.442381] Epoch: [1] [410/6500] lr: 0.000050 closs: 0.7504 (0.7677) grad_norm: 0.4082 (0.5832) time: 5.5694 data: 0.0001 max mem: 71357 +[04:06:44.156813] Epoch: [1] [420/6500] lr: 0.000050 closs: 0.7621 (0.7684) grad_norm: 0.3807 (0.5788) time: 5.5665 data: 0.0001 max mem: 71357 +[04:07:39.800804] Epoch: [1] [430/6500] lr: 0.000050 closs: 0.7435 (0.7668) grad_norm: 0.4047 (0.5767) time: 5.5678 data: 0.0001 max mem: 71357 +[04:08:35.582184] Epoch: [1] [440/6500] lr: 0.000050 closs: 0.7382 (0.7663) grad_norm: 0.4047 (0.5735) time: 5.5712 data: 0.0001 max mem: 71357 +[04:09:31.256070] Epoch: [1] [450/6500] lr: 0.000050 closs: 0.7414 (0.7663) grad_norm: 0.3852 (0.5718) time: 5.5727 data: 0.0001 max mem: 71357 +[04:10:26.919187] Epoch: [1] [460/6500] lr: 0.000050 closs: 0.7273 (0.7660) grad_norm: 0.4047 (0.5691) time: 5.5667 data: 0.0001 max mem: 71357 +[04:11:22.572352] Epoch: [1] [470/6500] lr: 0.000050 closs: 0.7546 (0.7670) grad_norm: 0.4096 (0.5692) time: 5.5657 data: 0.0001 max mem: 71357 +[04:12:18.379459] Epoch: [1] [480/6500] lr: 0.000050 closs: 0.7764 (0.7670) grad_norm: 0.4096 (0.5649) time: 5.5729 data: 0.0001 max mem: 71357 +[04:13:14.102252] Epoch: [1] [490/6500] lr: 0.000050 closs: 0.7591 (0.7662) grad_norm: 0.4096 (0.5617) time: 5.5764 data: 0.0001 max mem: 71357 +[04:14:09.753488] Epoch: [1] [500/6500] lr: 0.000050 closs: 0.7008 (0.7658) grad_norm: 0.4468 (0.5610) time: 5.5686 data: 0.0001 max mem: 71357 +[04:15:05.414976] Epoch: [1] [510/6500] lr: 0.000050 closs: 0.7568 (0.7650) grad_norm: 0.3889 (0.5583) time: 5.5655 data: 0.0001 max mem: 71357 +[04:16:01.166383] Epoch: [1] [520/6500] lr: 0.000050 closs: 0.7262 (0.7643) grad_norm: 0.4508 (0.5579) time: 5.5705 data: 0.0001 max mem: 71357 +[04:16:56.919801] Epoch: [1] [530/6500] lr: 0.000050 closs: 0.6799 (0.7621) grad_norm: 0.4514 (0.5565) time: 5.5752 data: 0.0001 max mem: 71357 +[04:17:52.660053] Epoch: [1] [540/6500] lr: 0.000050 closs: 0.6093 (0.7599) grad_norm: 0.4378 (0.5536) time: 5.5746 data: 0.0001 max mem: 71357 +[04:18:48.328201] Epoch: [1] [550/6500] lr: 0.000050 closs: 0.6526 (0.7593) grad_norm: 0.4489 (0.5519) time: 5.5704 data: 0.0002 max mem: 71357 +[04:19:44.116985] Epoch: [1] [560/6500] lr: 0.000050 closs: 0.7426 (0.7588) grad_norm: 0.3843 (0.5489) time: 5.5728 data: 0.0002 max mem: 71357 +[04:20:39.824567] Epoch: [1] [570/6500] lr: 0.000050 closs: 0.7211 (0.7596) grad_norm: 0.3797 (0.5466) time: 5.5747 data: 0.0001 max mem: 71357 +[04:21:35.492317] Epoch: [1] [580/6500] lr: 0.000050 closs: 0.7623 (0.7591) grad_norm: 0.3837 (0.5454) time: 5.5687 data: 0.0001 max mem: 71357 +[04:22:31.256691] Epoch: [1] [590/6500] lr: 0.000050 closs: 0.7601 (0.7591) grad_norm: 0.3874 (0.5442) time: 5.5715 data: 0.0001 max mem: 71357 +[04:23:27.024625] Epoch: [1] [600/6500] lr: 0.000050 closs: 0.7601 (0.7599) grad_norm: 0.4280 (0.5442) time: 5.5765 data: 0.0001 max mem: 71357 +[04:24:22.754492] Epoch: [1] 
[610/6500] lr: 0.000050 closs: 0.7729 (0.7613) grad_norm: 0.4293 (0.5421) time: 5.5748 data: 0.0001 max mem: 71357 +[04:25:18.641839] Epoch: [1] [620/6500] lr: 0.000050 closs: 0.7729 (0.7617) grad_norm: 0.4281 (0.5403) time: 5.5808 data: 0.0001 max mem: 71357 +[04:26:14.504595] Epoch: [1] [630/6500] lr: 0.000050 closs: 0.7718 (0.7620) grad_norm: 0.4043 (0.5377) time: 5.5874 data: 0.0001 max mem: 71357 +[04:27:10.222475] Epoch: [1] [640/6500] lr: 0.000050 closs: 0.7718 (0.7624) grad_norm: 0.4043 (0.5370) time: 5.5789 data: 0.0001 max mem: 71357 +[04:28:05.954345] Epoch: [1] [650/6500] lr: 0.000050 closs: 0.7636 (0.7626) grad_norm: 0.3995 (0.5348) time: 5.5724 data: 0.0001 max mem: 71357 +[04:29:01.871861] Epoch: [1] [660/6500] lr: 0.000050 closs: 0.7671 (0.7637) grad_norm: 0.3870 (0.5328) time: 5.5824 data: 0.0002 max mem: 71357 +[04:29:57.593541] Epoch: [1] [670/6500] lr: 0.000050 closs: 0.7871 (0.7652) grad_norm: 0.4053 (0.5317) time: 5.5819 data: 0.0002 max mem: 71357 +[04:30:53.271954] Epoch: [1] [680/6500] lr: 0.000050 closs: 0.7538 (0.7653) grad_norm: 0.3957 (0.5294) time: 5.5699 data: 0.0001 max mem: 71357 +[04:31:49.060636] Epoch: [1] [690/6500] lr: 0.000050 closs: 0.7395 (0.7658) grad_norm: 0.3957 (0.5279) time: 5.5733 data: 0.0001 max mem: 71357 +[04:32:44.827788] Epoch: [1] [700/6500] lr: 0.000050 closs: 0.7529 (0.7662) grad_norm: 0.3957 (0.5283) time: 5.5777 data: 0.0001 max mem: 71357 +[04:33:40.570340] Epoch: [1] [710/6500] lr: 0.000050 closs: 0.7456 (0.7660) grad_norm: 0.3900 (0.5295) time: 5.5754 data: 0.0002 max mem: 71357 +[04:34:36.343032] Epoch: [1] [720/6500] lr: 0.000050 closs: 0.7456 (0.7663) grad_norm: 0.3888 (0.5283) time: 5.5756 data: 0.0002 max mem: 71357 +[04:35:32.039216] Epoch: [1] [730/6500] lr: 0.000050 closs: 0.7721 (0.7665) grad_norm: 0.4204 (0.5287) time: 5.5733 data: 0.0001 max mem: 71357 +[04:36:27.753192] Epoch: [1] [740/6500] lr: 0.000050 closs: 0.7721 (0.7673) grad_norm: 0.4113 (0.5270) time: 5.5704 data: 0.0001 max mem: 71357 +[04:37:23.585540] Epoch: [1] [750/6500] lr: 0.000050 closs: 0.7719 (0.7677) grad_norm: 0.3883 (0.5252) time: 5.5772 data: 0.0001 max mem: 71357 +[04:38:19.280542] Epoch: [1] [760/6500] lr: 0.000050 closs: 0.7400 (0.7679) grad_norm: 0.3817 (0.5245) time: 5.5763 data: 0.0001 max mem: 71357 +[04:39:14.994009] Epoch: [1] [770/6500] lr: 0.000050 closs: 0.7093 (0.7682) grad_norm: 0.3817 (0.5237) time: 5.5703 data: 0.0001 max mem: 71357 +[04:40:10.694587] Epoch: [1] [780/6500] lr: 0.000050 closs: 0.7811 (0.7686) grad_norm: 0.3984 (0.5226) time: 5.5706 data: 0.0001 max mem: 71357 +[04:41:06.527247] Epoch: [1] [790/6500] lr: 0.000050 closs: 0.7501 (0.7679) grad_norm: 0.4039 (0.5212) time: 5.5766 data: 0.0001 max mem: 71357 +[04:42:02.287497] Epoch: [1] [800/6500] lr: 0.000050 closs: 0.7254 (0.7673) grad_norm: 0.4044 (0.5203) time: 5.5795 data: 0.0001 max mem: 71357 +[04:42:57.958012] Epoch: [1] [810/6500] lr: 0.000050 closs: 0.6886 (0.7670) grad_norm: 0.4039 (0.5206) time: 5.5714 data: 0.0001 max mem: 71357 +[04:43:53.709271] Epoch: [1] [820/6500] lr: 0.000050 closs: 0.6873 (0.7668) grad_norm: 0.3714 (0.5228) time: 5.5710 data: 0.0001 max mem: 71357 +[04:44:49.456416] Epoch: [1] [830/6500] lr: 0.000050 closs: 0.7426 (0.7667) grad_norm: 0.4044 (0.5218) time: 5.5748 data: 0.0001 max mem: 71357 +[04:45:45.250753] Epoch: [1] [840/6500] lr: 0.000050 closs: 0.7789 (0.7668) grad_norm: 0.4001 (0.5215) time: 5.5770 data: 0.0001 max mem: 71357 +[04:46:40.949223] Epoch: [1] [850/6500] lr: 0.000050 closs: 0.7471 (0.7662) grad_norm: 0.3780 (0.5200) 
time: 5.5746 data: 0.0001 max mem: 71357 +[04:47:36.561157] Epoch: [1] [860/6500] lr: 0.000050 closs: 0.7247 (0.7658) grad_norm: 0.4001 (0.5197) time: 5.5654 data: 0.0001 max mem: 71357 +[04:48:32.308653] Epoch: [1] [870/6500] lr: 0.000050 closs: 0.7587 (0.7662) grad_norm: 0.4001 (0.5201) time: 5.5679 data: 0.0002 max mem: 71357 +[04:49:28.149376] Epoch: [1] [880/6500] lr: 0.000050 closs: 0.7531 (0.7667) grad_norm: 0.3780 (0.5181) time: 5.5793 data: 0.0002 max mem: 71357 +[04:50:23.950718] Epoch: [1] [890/6500] lr: 0.000050 closs: 0.7416 (0.7664) grad_norm: 0.3905 (0.5193) time: 5.5820 data: 0.0001 max mem: 71357 +[04:51:19.670794] Epoch: [1] [900/6500] lr: 0.000050 closs: 0.7291 (0.7655) grad_norm: 0.3578 (0.5184) time: 5.5760 data: 0.0001 max mem: 71357 +[04:52:15.464157] Epoch: [1] [910/6500] lr: 0.000050 closs: 0.7453 (0.7650) grad_norm: 0.3669 (0.5174) time: 5.5756 data: 0.0001 max mem: 71357 +[04:53:11.330205] Epoch: [1] [920/6500] lr: 0.000050 closs: 0.7024 (0.7651) grad_norm: 0.3843 (0.5163) time: 5.5828 data: 0.0001 max mem: 71357 +[04:54:07.010433] Epoch: [1] [930/6500] lr: 0.000050 closs: 0.7381 (0.7650) grad_norm: 0.3768 (0.5146) time: 5.5772 data: 0.0002 max mem: 71357 +[04:55:02.643380] Epoch: [1] [940/6500] lr: 0.000050 closs: 0.7571 (0.7650) grad_norm: 0.3843 (0.5142) time: 5.5656 data: 0.0002 max mem: 71357 +[04:55:58.347304] Epoch: [1] [950/6500] lr: 0.000050 closs: 0.7475 (0.7644) grad_norm: 0.3946 (0.5133) time: 5.5668 data: 0.0001 max mem: 71357 +[04:56:54.139194] Epoch: [1] [960/6500] lr: 0.000050 closs: 0.7475 (0.7651) grad_norm: 0.3654 (0.5122) time: 5.5747 data: 0.0001 max mem: 71357 +[04:57:50.039666] Epoch: [1] [970/6500] lr: 0.000050 closs: 0.7587 (0.7649) grad_norm: 0.3891 (0.5117) time: 5.5845 data: 0.0001 max mem: 71357 +[04:58:45.709094] Epoch: [1] [980/6500] lr: 0.000050 closs: 0.7098 (0.7645) grad_norm: 0.3654 (0.5107) time: 5.5784 data: 0.0001 max mem: 71357 +[04:59:41.325212] Epoch: [1] [990/6500] lr: 0.000050 closs: 0.7216 (0.7645) grad_norm: 0.3654 (0.5096) time: 5.5642 data: 0.0001 max mem: 71357 +[05:00:36.996785] Epoch: [1] [1000/6500] lr: 0.000050 closs: 0.7500 (0.7646) grad_norm: 0.3880 (0.5092) time: 5.5643 data: 0.0001 max mem: 71357 +[05:01:32.817052] Epoch: [1] [1010/6500] lr: 0.000050 closs: 0.7316 (0.7644) grad_norm: 0.3880 (0.5102) time: 5.5745 data: 0.0001 max mem: 71357 +[05:02:28.530120] Epoch: [1] [1020/6500] lr: 0.000050 closs: 0.7286 (0.7641) grad_norm: 0.4530 (0.5110) time: 5.5766 data: 0.0001 max mem: 71357 +[05:03:24.118826] Epoch: [1] [1030/6500] lr: 0.000050 closs: 0.7875 (0.7641) grad_norm: 0.4651 (0.5107) time: 5.5650 data: 0.0001 max mem: 71357 +[05:04:19.728286] Epoch: [1] [1040/6500] lr: 0.000050 closs: 0.7858 (0.7640) grad_norm: 0.5125 (0.5102) time: 5.5598 data: 0.0001 max mem: 71357 +[05:05:15.444436] Epoch: [1] [1050/6500] lr: 0.000050 closs: 0.7511 (0.7631) grad_norm: 0.4293 (0.5093) time: 5.5662 data: 0.0001 max mem: 71357 +[05:06:11.258329] Epoch: [1] [1060/6500] lr: 0.000050 closs: 0.6617 (0.7621) grad_norm: 0.4393 (0.5098) time: 5.5764 data: 0.0001 max mem: 71357 +[05:07:06.968795] Epoch: [1] [1070/6500] lr: 0.000050 closs: 0.6714 (0.7617) grad_norm: 0.4393 (0.5090) time: 5.5761 data: 0.0001 max mem: 71357 +[05:08:02.608905] Epoch: [1] [1080/6500] lr: 0.000050 closs: 0.6980 (0.7620) grad_norm: 0.4393 (0.5091) time: 5.5675 data: 0.0001 max mem: 71357 +[05:08:58.333156] Epoch: [1] [1090/6500] lr: 0.000050 closs: 0.7979 (0.7625) grad_norm: 0.4444 (0.5083) time: 5.5681 data: 0.0002 max mem: 71357 +[05:09:54.158949] 
Epoch: [1] [1100/6500] lr: 0.000050 closs: 0.7641 (0.7622) grad_norm: 0.4184 (0.5078) time: 5.5774 data: 0.0002 max mem: 71357 +[05:10:49.934142] Epoch: [1] [1110/6500] lr: 0.000050 closs: 0.7720 (0.7626) grad_norm: 0.4444 (0.5076) time: 5.5800 data: 0.0001 max mem: 71357 +[05:11:45.606270] Epoch: [1] [1120/6500] lr: 0.000050 closs: 0.7850 (0.7630) grad_norm: 0.4184 (0.5068) time: 5.5723 data: 0.0001 max mem: 71357 +[05:12:41.469541] Epoch: [1] [1130/6500] lr: 0.000050 closs: 0.7606 (0.7631) grad_norm: 0.4232 (0.5069) time: 5.5767 data: 0.0001 max mem: 71357 +[05:13:37.226239] Epoch: [1] [1140/6500] lr: 0.000050 closs: 0.7306 (0.7627) grad_norm: 0.4617 (0.5069) time: 5.5809 data: 0.0002 max mem: 71357 +[05:14:32.916965] Epoch: [1] [1150/6500] lr: 0.000050 closs: 0.7464 (0.7632) grad_norm: 0.4232 (0.5061) time: 5.5723 data: 0.0002 max mem: 71357 +[05:15:28.589173] Epoch: [1] [1160/6500] lr: 0.000050 closs: 0.8070 (0.7637) grad_norm: 0.4654 (0.5071) time: 5.5680 data: 0.0001 max mem: 71357 +[05:16:24.252719] Epoch: [1] [1170/6500] lr: 0.000050 closs: 0.7713 (0.7635) grad_norm: 0.4364 (0.5061) time: 5.5667 data: 0.0001 max mem: 71357 +[05:17:19.928322] Epoch: [1] [1180/6500] lr: 0.000050 closs: 0.6577 (0.7623) grad_norm: 0.4311 (0.5059) time: 5.5669 data: 0.0001 max mem: 71357 +[05:18:15.678514] Epoch: [1] [1190/6500] lr: 0.000050 closs: 0.6944 (0.7620) grad_norm: 0.4312 (0.5049) time: 5.5712 data: 0.0001 max mem: 71357 +[05:19:11.287718] Epoch: [1] [1200/6500] lr: 0.000050 closs: 0.7023 (0.7611) grad_norm: 0.4168 (0.5048) time: 5.5679 data: 0.0001 max mem: 71357 +[05:20:06.965210] Epoch: [1] [1210/6500] lr: 0.000050 closs: 0.7447 (0.7617) grad_norm: 0.4043 (0.5037) time: 5.5642 data: 0.0001 max mem: 71357 +[05:21:02.647004] Epoch: [1] [1220/6500] lr: 0.000050 closs: 0.8533 (0.7631) grad_norm: 0.4043 (0.5091) time: 5.5678 data: 0.0001 max mem: 71357 +[05:21:58.423989] Epoch: [1] [1230/6500] lr: 0.000050 closs: 0.8409 (0.7629) grad_norm: 0.4043 (0.5136) time: 5.5728 data: 0.0001 max mem: 71357 +[05:22:54.043709] Epoch: [1] [1240/6500] lr: 0.000050 closs: 0.7011 (0.7626) grad_norm: 0.4162 (0.5134) time: 5.5698 data: 0.0001 max mem: 71357 +[05:23:49.814775] Epoch: [1] [1250/6500] lr: 0.000050 closs: 0.7408 (0.7626) grad_norm: 0.4162 (0.5124) time: 5.5695 data: 0.0001 max mem: 71357 +[05:24:45.428712] Epoch: [1] [1260/6500] lr: 0.000050 closs: 0.7448 (0.7624) grad_norm: 0.4091 (0.5116) time: 5.5691 data: 0.0001 max mem: 71357 +[05:25:41.203614] Epoch: [1] [1270/6500] lr: 0.000050 closs: 0.7227 (0.7624) grad_norm: 0.4091 (0.5118) time: 5.5693 data: 0.0001 max mem: 71357 +[05:26:37.014189] Epoch: [1] [1280/6500] lr: 0.000050 closs: 0.7570 (0.7625) grad_norm: 0.4033 (0.5109) time: 5.5792 data: 0.0001 max mem: 71357 +[05:27:32.726225] Epoch: [1] [1290/6500] lr: 0.000050 closs: 0.7570 (0.7623) grad_norm: 0.4306 (0.5116) time: 5.5760 data: 0.0001 max mem: 71357 +[05:28:28.581411] Epoch: [1] [1300/6500] lr: 0.000050 closs: 0.7812 (0.7627) grad_norm: 0.4184 (0.5103) time: 5.5783 data: 0.0001 max mem: 71357 +[05:29:24.296988] Epoch: [1] [1310/6500] lr: 0.000050 closs: 0.7503 (0.7624) grad_norm: 0.4029 (0.5096) time: 5.5785 data: 0.0001 max mem: 71357 +[05:30:20.071193] Epoch: [1] [1320/6500] lr: 0.000049 closs: 0.7246 (0.7624) grad_norm: 0.4029 (0.5091) time: 5.5744 data: 0.0001 max mem: 71357 +[05:31:15.692148] Epoch: [1] [1330/6500] lr: 0.000049 closs: 0.7118 (0.7625) grad_norm: 0.3548 (0.5083) time: 5.5697 data: 0.0001 max mem: 71357 +[05:32:11.408467] Epoch: [1] [1340/6500] lr: 0.000049 closs: 0.7109 
(0.7621) grad_norm: 0.3788 (0.5074) time: 5.5667 data: 0.0001 max mem: 71357 +[05:33:07.091654] Epoch: [1] [1350/6500] lr: 0.000049 closs: 0.7109 (0.7621) grad_norm: 0.3984 (0.5070) time: 5.5699 data: 0.0001 max mem: 71357 +[05:34:02.874850] Epoch: [1] [1360/6500] lr: 0.000049 closs: 0.7131 (0.7629) grad_norm: 0.3984 (0.5061) time: 5.5732 data: 0.0001 max mem: 71357 +[05:34:58.543548] Epoch: [1] [1370/6500] lr: 0.000049 closs: 0.7400 (0.7626) grad_norm: 0.3952 (0.5052) time: 5.5725 data: 0.0001 max mem: 71357 +[05:35:54.208343] Epoch: [1] [1380/6500] lr: 0.000049 closs: 0.7499 (0.7630) grad_norm: 0.3952 (0.5043) time: 5.5666 data: 0.0001 max mem: 71357 +[05:36:49.991759] Epoch: [1] [1390/6500] lr: 0.000049 closs: 0.7824 (0.7628) grad_norm: 0.3701 (0.5033) time: 5.5723 data: 0.0001 max mem: 71357 +[05:37:45.788654] Epoch: [1] [1400/6500] lr: 0.000049 closs: 0.7824 (0.7628) grad_norm: 0.3727 (0.5032) time: 5.5789 data: 0.0001 max mem: 71357 +[05:38:41.516767] Epoch: [1] [1410/6500] lr: 0.000049 closs: 0.8001 (0.7629) grad_norm: 0.3895 (0.5029) time: 5.5761 data: 0.0001 max mem: 71357 +[05:39:37.272883] Epoch: [1] [1420/6500] lr: 0.000049 closs: 0.8019 (0.7634) grad_norm: 0.3895 (0.5022) time: 5.5741 data: 0.0001 max mem: 71357 +[05:40:32.946983] Epoch: [1] [1430/6500] lr: 0.000049 closs: 0.7728 (0.7634) grad_norm: 0.4048 (0.5015) time: 5.5714 data: 0.0001 max mem: 71357 +[05:41:28.638478] Epoch: [1] [1440/6500] lr: 0.000049 closs: 0.6953 (0.7633) grad_norm: 0.4000 (0.5010) time: 5.5682 data: 0.0001 max mem: 71357 +[05:42:24.408111] Epoch: [1] [1450/6500] lr: 0.000049 closs: 0.7377 (0.7630) grad_norm: 0.4000 (0.5005) time: 5.5729 data: 0.0001 max mem: 71357 +[05:43:20.139266] Epoch: [1] [1460/6500] lr: 0.000049 closs: 0.7676 (0.7636) grad_norm: 0.4100 (0.5000) time: 5.5749 data: 0.0001 max mem: 71357 +[05:44:15.852405] Epoch: [1] [1470/6500] lr: 0.000049 closs: 0.8084 (0.7637) grad_norm: 0.4047 (0.4994) time: 5.5721 data: 0.0001 max mem: 71357 +[05:45:11.499402] Epoch: [1] [1480/6500] lr: 0.000049 closs: 0.7897 (0.7633) grad_norm: 0.4047 (0.4993) time: 5.5679 data: 0.0001 max mem: 71357 +[05:46:07.251883] Epoch: [1] [1490/6500] lr: 0.000049 closs: 0.7280 (0.7636) grad_norm: 0.3998 (0.4990) time: 5.5699 data: 0.0001 max mem: 71357 +[05:47:03.095336] Epoch: [1] [1500/6500] lr: 0.000049 closs: 0.7501 (0.7636) grad_norm: 0.3807 (0.4985) time: 5.5797 data: 0.0001 max mem: 71357 +[05:47:58.748278] Epoch: [1] [1510/6500] lr: 0.000049 closs: 0.7772 (0.7638) grad_norm: 0.3998 (0.4986) time: 5.5747 data: 0.0001 max mem: 71357 +[05:48:54.481750] Epoch: [1] [1520/6500] lr: 0.000049 closs: 0.7816 (0.7640) grad_norm: 0.3807 (0.4983) time: 5.5692 data: 0.0002 max mem: 71357 +[05:49:50.180751] Epoch: [1] [1530/6500] lr: 0.000049 closs: 0.7623 (0.7641) grad_norm: 0.3790 (0.4979) time: 5.5715 data: 0.0002 max mem: 71357 +[05:50:45.971116] Epoch: [1] [1540/6500] lr: 0.000049 closs: 0.8359 (0.7645) grad_norm: 0.3935 (0.4972) time: 5.5743 data: 0.0001 max mem: 71357 +[05:51:41.648187] Epoch: [1] [1550/6500] lr: 0.000049 closs: 0.7772 (0.7649) grad_norm: 0.3974 (0.4970) time: 5.5733 data: 0.0001 max mem: 71357 +[05:52:37.329020] Epoch: [1] [1560/6500] lr: 0.000049 closs: 0.7288 (0.7646) grad_norm: 0.4314 (0.4972) time: 5.5678 data: 0.0001 max mem: 71357 +[05:53:33.021624] Epoch: [1] [1570/6500] lr: 0.000049 closs: 0.7566 (0.7647) grad_norm: 0.4203 (0.4978) time: 5.5686 data: 0.0001 max mem: 71357 +[05:54:28.849243] Epoch: [1] [1580/6500] lr: 0.000049 closs: 0.7969 (0.7649) grad_norm: 0.4503 (0.4972) time: 5.5759 
data: 0.0001 max mem: 71357 +[05:55:24.622675] Epoch: [1] [1590/6500] lr: 0.000049 closs: 0.7610 (0.7645) grad_norm: 0.4014 (0.4963) time: 5.5800 data: 0.0001 max mem: 71357 +[05:56:20.321965] Epoch: [1] [1600/6500] lr: 0.000049 closs: 0.7300 (0.7644) grad_norm: 0.3717 (0.4956) time: 5.5735 data: 0.0001 max mem: 71357 +[05:57:16.028460] Epoch: [1] [1610/6500] lr: 0.000049 closs: 0.7300 (0.7642) grad_norm: 0.3510 (0.4955) time: 5.5702 data: 0.0001 max mem: 71357 +[05:58:11.783706] Epoch: [1] [1620/6500] lr: 0.000049 closs: 0.7731 (0.7643) grad_norm: 0.3510 (0.4947) time: 5.5730 data: 0.0001 max mem: 71357 +[05:59:07.570790] Epoch: [1] [1630/6500] lr: 0.000049 closs: 0.8139 (0.7647) grad_norm: 0.4063 (0.4945) time: 5.5770 data: 0.0001 max mem: 71357 +[06:00:03.238802] Epoch: [1] [1640/6500] lr: 0.000049 closs: 0.8139 (0.7646) grad_norm: 0.4153 (0.4940) time: 5.5727 data: 0.0001 max mem: 71357 +[06:00:58.977098] Epoch: [1] [1650/6500] lr: 0.000049 closs: 0.7872 (0.7648) grad_norm: 0.4063 (0.4936) time: 5.5702 data: 0.0001 max mem: 71357 +[06:01:54.788329] Epoch: [1] [1660/6500] lr: 0.000049 closs: 0.7872 (0.7650) grad_norm: 0.4311 (0.4933) time: 5.5774 data: 0.0001 max mem: 71357 +[06:02:50.593146] Epoch: [1] [1670/6500] lr: 0.000049 closs: 0.7356 (0.7648) grad_norm: 0.4235 (0.4928) time: 5.5807 data: 0.0001 max mem: 71357 +[06:03:46.216024] Epoch: [1] [1680/6500] lr: 0.000049 closs: 0.7429 (0.7648) grad_norm: 0.4235 (0.4927) time: 5.5713 data: 0.0002 max mem: 71357 +[06:04:41.979289] Epoch: [1] [1690/6500] lr: 0.000049 closs: 0.7711 (0.7647) grad_norm: 0.4235 (0.4929) time: 5.5692 data: 0.0002 max mem: 71357 +[06:05:37.689767] Epoch: [1] [1700/6500] lr: 0.000049 closs: 0.7552 (0.7647) grad_norm: 0.4142 (0.4924) time: 5.5736 data: 0.0001 max mem: 71357 +[06:06:33.493941] Epoch: [1] [1710/6500] lr: 0.000049 closs: 0.6966 (0.7640) grad_norm: 0.3993 (0.4918) time: 5.5756 data: 0.0001 max mem: 71357 +[06:07:29.339116] Epoch: [1] [1720/6500] lr: 0.000049 closs: 0.6799 (0.7640) grad_norm: 0.3993 (0.4916) time: 5.5824 data: 0.0001 max mem: 71357 +[06:08:25.102712] Epoch: [1] [1730/6500] lr: 0.000049 closs: 0.6988 (0.7640) grad_norm: 0.3986 (0.4910) time: 5.5803 data: 0.0001 max mem: 71357 +[06:09:20.832118] Epoch: [1] [1740/6500] lr: 0.000049 closs: 0.6702 (0.7637) grad_norm: 0.3993 (0.4907) time: 5.5746 data: 0.0001 max mem: 71357 +[06:10:16.553919] Epoch: [1] [1750/6500] lr: 0.000049 closs: 0.6695 (0.7633) grad_norm: 0.4026 (0.4905) time: 5.5725 data: 0.0001 max mem: 71357 +[06:11:12.343083] Epoch: [1] [1760/6500] lr: 0.000049 closs: 0.7046 (0.7632) grad_norm: 0.3957 (0.4898) time: 5.5755 data: 0.0001 max mem: 71357 +[06:12:08.075303] Epoch: [1] [1770/6500] lr: 0.000049 closs: 0.7100 (0.7634) grad_norm: 0.3782 (0.4889) time: 5.5760 data: 0.0001 max mem: 71357 +[06:13:03.741874] Epoch: [1] [1780/6500] lr: 0.000049 closs: 0.7462 (0.7635) grad_norm: 0.3682 (0.4882) time: 5.5698 data: 0.0001 max mem: 71357 +[06:13:59.467131] Epoch: [1] [1790/6500] lr: 0.000049 closs: 0.7743 (0.7633) grad_norm: 0.3682 (0.4882) time: 5.5695 data: 0.0002 max mem: 71357 +[06:14:55.302278] Epoch: [1] [1800/6500] lr: 0.000049 closs: 0.7203 (0.7629) grad_norm: 0.3880 (0.4885) time: 5.5779 data: 0.0002 max mem: 71357 +[06:15:51.032856] Epoch: [1] [1810/6500] lr: 0.000049 closs: 0.7254 (0.7630) grad_norm: 0.4342 (0.4885) time: 5.5782 data: 0.0001 max mem: 71357 +[06:16:46.808428] Epoch: [1] [1820/6500] lr: 0.000049 closs: 0.7463 (0.7628) grad_norm: 0.4342 (0.4879) time: 5.5752 data: 0.0001 max mem: 71357 +[06:17:42.566412] 
Epoch: [1] [1830/6500] lr: 0.000049 closs: 0.7301 (0.7628) grad_norm: 0.4255 (0.4880) time: 5.5766 data: 0.0001 max mem: 71357 +[06:18:38.312144] Epoch: [1] [1840/6500] lr: 0.000049 closs: 0.8045 (0.7632) grad_norm: 0.3756 (0.4880) time: 5.5751 data: 0.0001 max mem: 71357 +[06:19:34.202602] Epoch: [1] [1850/6500] lr: 0.000049 closs: 0.7885 (0.7632) grad_norm: 0.3538 (0.4873) time: 5.5817 data: 0.0001 max mem: 71357 +[06:20:29.987714] Epoch: [1] [1860/6500] lr: 0.000049 closs: 0.7457 (0.7629) grad_norm: 0.3515 (0.4866) time: 5.5837 data: 0.0001 max mem: 71357 +[06:21:25.707631] Epoch: [1] [1870/6500] lr: 0.000049 closs: 0.7457 (0.7628) grad_norm: 0.3717 (0.4872) time: 5.5752 data: 0.0001 max mem: 71357 +[06:22:21.508496] Epoch: [1] [1880/6500] lr: 0.000049 closs: 0.7003 (0.7626) grad_norm: 0.3717 (0.4877) time: 5.5759 data: 0.0001 max mem: 71357 +[06:23:17.323008] Epoch: [1] [1890/6500] lr: 0.000049 closs: 0.7003 (0.7624) grad_norm: 0.3850 (0.4873) time: 5.5806 data: 0.0001 max mem: 71357 +[06:24:13.089745] Epoch: [1] [1900/6500] lr: 0.000049 closs: 0.7211 (0.7625) grad_norm: 0.4018 (0.4866) time: 5.5790 data: 0.0001 max mem: 71357 +[06:25:08.735665] Epoch: [1] [1910/6500] lr: 0.000049 closs: 0.7094 (0.7621) grad_norm: 0.4018 (0.4866) time: 5.5705 data: 0.0001 max mem: 71357 +[06:26:04.431078] Epoch: [1] [1920/6500] lr: 0.000049 closs: 0.6812 (0.7620) grad_norm: 0.4018 (0.4865) time: 5.5669 data: 0.0001 max mem: 71357 +[06:27:00.205152] Epoch: [1] [1930/6500] lr: 0.000049 closs: 0.7006 (0.7619) grad_norm: 0.4018 (0.4860) time: 5.5734 data: 0.0001 max mem: 71357 +[06:27:56.086508] Epoch: [1] [1940/6500] lr: 0.000049 closs: 0.7977 (0.7623) grad_norm: 0.3875 (0.4854) time: 5.5827 data: 0.0001 max mem: 71357 +[06:28:51.724118] Epoch: [1] [1950/6500] lr: 0.000049 closs: 0.7787 (0.7624) grad_norm: 0.3773 (0.4852) time: 5.5759 data: 0.0001 max mem: 71357 +[06:29:47.487192] Epoch: [1] [1960/6500] lr: 0.000049 closs: 0.7331 (0.7623) grad_norm: 0.3773 (0.4848) time: 5.5699 data: 0.0001 max mem: 71357 +[06:30:43.261286] Epoch: [1] [1970/6500] lr: 0.000049 closs: 0.7788 (0.7628) grad_norm: 0.3829 (0.4843) time: 5.5768 data: 0.0001 max mem: 71357 +[06:31:38.991880] Epoch: [1] [1980/6500] lr: 0.000049 closs: 0.7698 (0.7625) grad_norm: 0.3889 (0.4838) time: 5.5751 data: 0.0001 max mem: 71357 +[06:32:34.722420] Epoch: [1] [1990/6500] lr: 0.000049 closs: 0.7517 (0.7626) grad_norm: 0.3977 (0.4837) time: 5.5730 data: 0.0001 max mem: 71357 +[06:33:30.426934] Epoch: [1] [2000/6500] lr: 0.000049 closs: 0.7535 (0.7625) grad_norm: 0.3977 (0.4837) time: 5.5717 data: 0.0001 max mem: 71357 +[06:34:26.157107] Epoch: [1] [2010/6500] lr: 0.000049 closs: 0.7385 (0.7624) grad_norm: 0.3737 (0.4831) time: 5.5717 data: 0.0001 max mem: 71357 +[06:35:21.974735] Epoch: [1] [2020/6500] lr: 0.000049 closs: 0.7578 (0.7625) grad_norm: 0.4042 (0.4834) time: 5.5773 data: 0.0001 max mem: 71357 +[06:36:17.629504] Epoch: [1] [2030/6500] lr: 0.000049 closs: 0.8125 (0.7628) grad_norm: 0.3830 (0.4834) time: 5.5735 data: 0.0001 max mem: 71357 +[06:37:13.410785] Epoch: [1] [2040/6500] lr: 0.000049 closs: 0.8006 (0.7627) grad_norm: 0.3633 (0.4827) time: 5.5717 data: 0.0001 max mem: 71357 +[06:38:09.181666] Epoch: [1] [2050/6500] lr: 0.000049 closs: 0.7954 (0.7626) grad_norm: 0.3844 (0.4823) time: 5.5775 data: 0.0001 max mem: 71357 +[06:39:04.900937] Epoch: [1] [2060/6500] lr: 0.000049 closs: 0.7756 (0.7627) grad_norm: 0.3844 (0.4822) time: 5.5744 data: 0.0001 max mem: 71357 +[06:40:00.725173] Epoch: [1] [2070/6500] lr: 0.000049 closs: 0.7406 
(0.7625) grad_norm: 0.3707 (0.4816) time: 5.5771 data: 0.0001 max mem: 71357 +[06:40:56.513573] Epoch: [1] [2080/6500] lr: 0.000049 closs: 0.7405 (0.7626) grad_norm: 0.4113 (0.4810) time: 5.5806 data: 0.0001 max mem: 71357 +[06:41:52.319453] Epoch: [1] [2090/6500] lr: 0.000049 closs: 0.7236 (0.7623) grad_norm: 0.3818 (0.4805) time: 5.5796 data: 0.0001 max mem: 71357 +[06:42:48.076246] Epoch: [1] [2100/6500] lr: 0.000049 closs: 0.6560 (0.7622) grad_norm: 0.3610 (0.4800) time: 5.5781 data: 0.0001 max mem: 71357 +[06:43:43.875280] Epoch: [1] [2110/6500] lr: 0.000049 closs: 0.6950 (0.7623) grad_norm: 0.3698 (0.4796) time: 5.5777 data: 0.0001 max mem: 71357 +[06:44:39.638406] Epoch: [1] [2120/6500] lr: 0.000049 closs: 0.7491 (0.7621) grad_norm: 0.3982 (0.4794) time: 5.5780 data: 0.0001 max mem: 71357 +[06:45:35.343891] Epoch: [1] [2130/6500] lr: 0.000049 closs: 0.7413 (0.7621) grad_norm: 0.4028 (0.4792) time: 5.5733 data: 0.0001 max mem: 71357 +[06:46:31.017460] Epoch: [1] [2140/6500] lr: 0.000049 closs: 0.7413 (0.7618) grad_norm: 0.4288 (0.4796) time: 5.5688 data: 0.0001 max mem: 71357 +[06:47:26.831774] Epoch: [1] [2150/6500] lr: 0.000049 closs: 0.7122 (0.7617) grad_norm: 0.4262 (0.4791) time: 5.5743 data: 0.0001 max mem: 71357 +[06:48:22.566097] Epoch: [1] [2160/6500] lr: 0.000049 closs: 0.7195 (0.7616) grad_norm: 0.3987 (0.4787) time: 5.5774 data: 0.0001 max mem: 71357 +[06:49:18.229337] Epoch: [1] [2170/6500] lr: 0.000049 closs: 0.7297 (0.7615) grad_norm: 0.3768 (0.4784) time: 5.5698 data: 0.0001 max mem: 71357 +[06:50:13.979986] Epoch: [1] [2180/6500] lr: 0.000049 closs: 0.7356 (0.7615) grad_norm: 0.3603 (0.4779) time: 5.5706 data: 0.0001 max mem: 71357 +[06:51:09.697275] Epoch: [1] [2190/6500] lr: 0.000049 closs: 0.7507 (0.7619) grad_norm: 0.3768 (0.4775) time: 5.5733 data: 0.0001 max mem: 71357 +[06:52:05.550399] Epoch: [1] [2200/6500] lr: 0.000049 closs: 0.6988 (0.7616) grad_norm: 0.3747 (0.4779) time: 5.5784 data: 0.0001 max mem: 71357 +[06:53:01.293904] Epoch: [1] [2210/6500] lr: 0.000049 closs: 0.6853 (0.7614) grad_norm: 0.3885 (0.4777) time: 5.5797 data: 0.0001 max mem: 71357 +[06:53:56.969787] Epoch: [1] [2220/6500] lr: 0.000049 closs: 0.7125 (0.7612) grad_norm: 0.4290 (0.4780) time: 5.5709 data: 0.0002 max mem: 71357 +[06:54:52.673780] Epoch: [1] [2230/6500] lr: 0.000049 closs: 0.7342 (0.7614) grad_norm: 0.4588 (0.4781) time: 5.5689 data: 0.0002 max mem: 71357 +[06:55:48.376439] Epoch: [1] [2240/6500] lr: 0.000049 closs: 0.8235 (0.7617) grad_norm: 0.4588 (0.4780) time: 5.5702 data: 0.0001 max mem: 71357 +[06:56:44.026518] Epoch: [1] [2250/6500] lr: 0.000049 closs: 0.7678 (0.7616) grad_norm: 0.4054 (0.4783) time: 5.5675 data: 0.0001 max mem: 71357 +[06:57:39.613302] Epoch: [1] [2260/6500] lr: 0.000049 closs: 0.7678 (0.7619) grad_norm: 0.4079 (0.4790) time: 5.5618 data: 0.0001 max mem: 71357 +[06:58:35.230225] Epoch: [1] [2270/6500] lr: 0.000049 closs: 0.8237 (0.7621) grad_norm: 0.4522 (0.4794) time: 5.5601 data: 0.0001 max mem: 71357 +[06:59:30.944187] Epoch: [1] [2280/6500] lr: 0.000048 closs: 0.7225 (0.7620) grad_norm: 0.4522 (0.4798) time: 5.5664 data: 0.0001 max mem: 71357 +[07:00:26.770134] Epoch: [1] [2290/6500] lr: 0.000048 closs: 0.7159 (0.7618) grad_norm: 0.4822 (0.4797) time: 5.5769 data: 0.0001 max mem: 71357 +[07:01:22.490132] Epoch: [1] [2300/6500] lr: 0.000048 closs: 0.7169 (0.7616) grad_norm: 0.4143 (0.4794) time: 5.5772 data: 0.0001 max mem: 71357 +[07:02:18.117158] Epoch: [1] [2310/6500] lr: 0.000048 closs: 0.7632 (0.7617) grad_norm: 0.3907 (0.4794) time: 5.5673 
data: 0.0001 max mem: 71357 +[07:03:13.846062] Epoch: [1] [2320/6500] lr: 0.000048 closs: 0.7486 (0.7616) grad_norm: 0.3819 (0.4791) time: 5.5677 data: 0.0001 max mem: 71357 +[07:04:09.641871] Epoch: [1] [2330/6500] lr: 0.000048 closs: 0.7391 (0.7615) grad_norm: 0.4143 (0.4791) time: 5.5762 data: 0.0001 max mem: 71357 +[07:05:05.451518] Epoch: [1] [2340/6500] lr: 0.000048 closs: 0.7884 (0.7618) grad_norm: 0.3943 (0.4788) time: 5.5802 data: 0.0001 max mem: 71357 +[07:06:01.130149] Epoch: [1] [2350/6500] lr: 0.000048 closs: 0.7346 (0.7614) grad_norm: 0.4021 (0.4786) time: 5.5743 data: 0.0001 max mem: 71357 +[07:06:56.845866] Epoch: [1] [2360/6500] lr: 0.000048 closs: 0.7227 (0.7618) grad_norm: 0.4021 (0.4786) time: 5.5696 data: 0.0001 max mem: 71357 +[07:07:52.563701] Epoch: [1] [2370/6500] lr: 0.000048 closs: 0.8105 (0.7620) grad_norm: 0.3804 (0.4782) time: 5.5716 data: 0.0001 max mem: 71357 +[07:08:48.395607] Epoch: [1] [2380/6500] lr: 0.000048 closs: 0.7941 (0.7621) grad_norm: 0.4021 (0.4783) time: 5.5774 data: 0.0001 max mem: 71357 +[07:09:44.149078] Epoch: [1] [2390/6500] lr: 0.000048 closs: 0.7885 (0.7619) grad_norm: 0.3905 (0.4779) time: 5.5792 data: 0.0001 max mem: 71357 +[07:10:39.820251] Epoch: [1] [2400/6500] lr: 0.000048 closs: 0.6213 (0.7615) grad_norm: 0.4045 (0.4780) time: 5.5712 data: 0.0001 max mem: 71357 +[07:11:35.486229] Epoch: [1] [2410/6500] lr: 0.000048 closs: 0.6479 (0.7614) grad_norm: 0.3905 (0.4782) time: 5.5668 data: 0.0001 max mem: 71357 +[07:12:31.320418] Epoch: [1] [2420/6500] lr: 0.000048 closs: 0.7154 (0.7611) grad_norm: 0.3799 (0.4777) time: 5.5749 data: 0.0001 max mem: 71357 +[07:13:27.010058] Epoch: [1] [2430/6500] lr: 0.000048 closs: 0.7631 (0.7612) grad_norm: 0.3917 (0.4774) time: 5.5761 data: 0.0001 max mem: 71357 +[07:14:22.706755] Epoch: [1] [2440/6500] lr: 0.000048 closs: 0.7958 (0.7615) grad_norm: 0.3609 (0.4771) time: 5.5692 data: 0.0001 max mem: 71357 +[07:15:18.472441] Epoch: [1] [2450/6500] lr: 0.000048 closs: 0.7683 (0.7613) grad_norm: 0.3452 (0.4766) time: 5.5730 data: 0.0001 max mem: 71357 +[07:16:14.358302] Epoch: [1] [2460/6500] lr: 0.000048 closs: 0.7683 (0.7612) grad_norm: 0.3917 (0.4763) time: 5.5825 data: 0.0001 max mem: 71357 +[07:17:09.997614] Epoch: [1] [2470/6500] lr: 0.000048 closs: 0.8213 (0.7615) grad_norm: 0.3609 (0.4760) time: 5.5761 data: 0.0001 max mem: 71357 +[07:18:05.674747] Epoch: [1] [2480/6500] lr: 0.000048 closs: 0.7941 (0.7614) grad_norm: 0.4185 (0.4761) time: 5.5657 data: 0.0001 max mem: 71357 +[07:19:01.392136] Epoch: [1] [2490/6500] lr: 0.000048 closs: 0.7147 (0.7614) grad_norm: 0.4099 (0.4756) time: 5.5696 data: 0.0002 max mem: 71357 +[07:19:57.106876] Epoch: [1] [2500/6500] lr: 0.000048 closs: 0.7326 (0.7611) grad_norm: 0.3720 (0.4753) time: 5.5715 data: 0.0002 max mem: 71357 +[07:20:52.874861] Epoch: [1] [2510/6500] lr: 0.000048 closs: 0.6783 (0.7609) grad_norm: 0.3720 (0.4748) time: 5.5740 data: 0.0001 max mem: 71357 +[07:21:48.643566] Epoch: [1] [2520/6500] lr: 0.000048 closs: 0.6970 (0.7610) grad_norm: 0.3627 (0.4746) time: 5.5767 data: 0.0001 max mem: 71357 +[07:22:44.328112] Epoch: [1] [2530/6500] lr: 0.000048 closs: 0.7447 (0.7608) grad_norm: 0.3790 (0.4745) time: 5.5726 data: 0.0001 max mem: 71357 +[07:23:40.104620] Epoch: [1] [2540/6500] lr: 0.000048 closs: 0.7475 (0.7607) grad_norm: 0.3792 (0.4743) time: 5.5730 data: 0.0001 max mem: 71357 +[07:24:35.986148] Epoch: [1] [2550/6500] lr: 0.000048 closs: 0.8081 (0.7609) grad_norm: 0.3877 (0.4740) time: 5.5828 data: 0.0001 max mem: 71357 +[07:25:31.762648] 
Epoch: [1] [2560/6500] lr: 0.000048 closs: 0.8194 (0.7610) grad_norm: 0.4054 (0.4738) time: 5.5828 data: 0.0001 max mem: 71357 +[07:26:27.517291] Epoch: [1] [2570/6500] lr: 0.000048 closs: 0.8143 (0.7612) grad_norm: 0.4320 (0.4736) time: 5.5765 data: 0.0001 max mem: 71357 +[07:27:23.081161] Epoch: [1] [2580/6500] lr: 0.000048 closs: 0.8143 (0.7614) grad_norm: 0.4400 (0.4736) time: 5.5658 data: 0.0001 max mem: 71357 +[07:28:18.847361] Epoch: [1] [2590/6500] lr: 0.000048 closs: 0.7732 (0.7615) grad_norm: 0.4291 (0.4731) time: 5.5664 data: 0.0001 max mem: 71357 +[07:29:14.631754] Epoch: [1] [2600/6500] lr: 0.000048 closs: 0.7687 (0.7615) grad_norm: 0.4291 (0.4730) time: 5.5774 data: 0.0001 max mem: 71357 +[07:30:10.313627] Epoch: [1] [2610/6500] lr: 0.000048 closs: 0.7136 (0.7613) grad_norm: 0.4208 (0.4730) time: 5.5732 data: 0.0001 max mem: 71357 +[07:31:06.069722] Epoch: [1] [2620/6500] lr: 0.000048 closs: 0.7577 (0.7615) grad_norm: 0.3758 (0.4727) time: 5.5718 data: 0.0001 max mem: 71357 +[07:32:01.904851] Epoch: [1] [2630/6500] lr: 0.000048 closs: 0.7331 (0.7612) grad_norm: 0.3758 (0.4724) time: 5.5795 data: 0.0001 max mem: 71357 +[07:32:57.687644] Epoch: [1] [2640/6500] lr: 0.000048 closs: 0.7558 (0.7615) grad_norm: 0.3758 (0.4724) time: 5.5808 data: 0.0001 max mem: 71357 +[07:33:53.438636] Epoch: [1] [2650/6500] lr: 0.000048 closs: 0.7639 (0.7615) grad_norm: 0.3693 (0.4720) time: 5.5765 data: 0.0001 max mem: 71357 +[07:34:49.158394] Epoch: [1] [2660/6500] lr: 0.000048 closs: 0.7320 (0.7611) grad_norm: 0.3779 (0.4722) time: 5.5734 data: 0.0001 max mem: 71357 +[07:35:44.955768] Epoch: [1] [2670/6500] lr: 0.000048 closs: 0.7320 (0.7612) grad_norm: 0.3779 (0.4717) time: 5.5758 data: 0.0001 max mem: 71357 +[07:36:40.729888] Epoch: [1] [2680/6500] lr: 0.000048 closs: 0.7680 (0.7613) grad_norm: 0.3804 (0.4715) time: 5.5785 data: 0.0001 max mem: 71357 +[07:37:36.512085] Epoch: [1] [2690/6500] lr: 0.000048 closs: 0.7978 (0.7615) grad_norm: 0.4166 (0.4714) time: 5.5777 data: 0.0001 max mem: 71357 +[07:38:32.202375] Epoch: [1] [2700/6500] lr: 0.000048 closs: 0.7734 (0.7615) grad_norm: 0.3963 (0.4712) time: 5.5735 data: 0.0001 max mem: 71357 +[07:39:27.865502] Epoch: [1] [2710/6500] lr: 0.000048 closs: 0.7563 (0.7617) grad_norm: 0.4106 (0.4714) time: 5.5675 data: 0.0002 max mem: 71357 +[07:40:23.614087] Epoch: [1] [2720/6500] lr: 0.000048 closs: 0.8125 (0.7619) grad_norm: 0.3954 (0.4711) time: 5.5705 data: 0.0002 max mem: 71357 +[07:41:19.483972] Epoch: [1] [2730/6500] lr: 0.000048 closs: 0.8166 (0.7619) grad_norm: 0.3848 (0.4707) time: 5.5808 data: 0.0001 max mem: 71357 +[07:42:15.165152] Epoch: [1] [2740/6500] lr: 0.000048 closs: 0.7304 (0.7619) grad_norm: 0.3909 (0.4707) time: 5.5775 data: 0.0001 max mem: 71357 +[07:43:10.903891] Epoch: [1] [2750/6500] lr: 0.000048 closs: 0.7304 (0.7617) grad_norm: 0.3943 (0.4704) time: 5.5709 data: 0.0001 max mem: 71357 +[07:44:06.612623] Epoch: [1] [2760/6500] lr: 0.000048 closs: 0.6582 (0.7613) grad_norm: 0.3752 (0.4701) time: 5.5723 data: 0.0002 max mem: 71357 +[07:45:02.480213] Epoch: [1] [2770/6500] lr: 0.000048 closs: 0.6582 (0.7612) grad_norm: 0.3752 (0.4696) time: 5.5787 data: 0.0002 max mem: 71357 +[07:45:58.196941] Epoch: [1] [2780/6500] lr: 0.000048 closs: 0.7057 (0.7612) grad_norm: 0.3494 (0.4695) time: 5.5791 data: 0.0001 max mem: 71357 +[07:46:53.877620] Epoch: [1] [2790/6500] lr: 0.000048 closs: 0.7393 (0.7613) grad_norm: 0.3485 (0.4697) time: 5.5698 data: 0.0001 max mem: 71357 +[07:47:49.527855] Epoch: [1] [2800/6500] lr: 0.000048 closs: 0.7912 
(0.7615) grad_norm: 0.3532 (0.4696) time: 5.5664 data: 0.0001 max mem: 71357 +[07:48:45.322368] Epoch: [1] [2810/6500] lr: 0.000048 closs: 0.7889 (0.7616) grad_norm: 0.3971 (0.4696) time: 5.5721 data: 0.0001 max mem: 71357 +[07:49:41.244305] Epoch: [1] [2820/6500] lr: 0.000048 closs: 0.7473 (0.7618) grad_norm: 0.4032 (0.4694) time: 5.5857 data: 0.0002 max mem: 71357 +[07:50:37.037384] Epoch: [1] [2830/6500] lr: 0.000048 closs: 0.7493 (0.7616) grad_norm: 0.3973 (0.4690) time: 5.5857 data: 0.0002 max mem: 71357 +[07:51:32.889723] Epoch: [1] [2840/6500] lr: 0.000048 closs: 0.7518 (0.7616) grad_norm: 0.3720 (0.4686) time: 5.5822 data: 0.0001 max mem: 71357 +[07:52:28.686394] Epoch: [1] [2850/6500] lr: 0.000048 closs: 0.7385 (0.7615) grad_norm: 0.3720 (0.4684) time: 5.5823 data: 0.0001 max mem: 71357 +[07:53:24.563749] Epoch: [1] [2860/6500] lr: 0.000048 closs: 0.7385 (0.7615) grad_norm: 0.3721 (0.4692) time: 5.5836 data: 0.0001 max mem: 71357 +[07:54:20.254650] Epoch: [1] [2870/6500] lr: 0.000048 closs: 0.7785 (0.7615) grad_norm: 0.3818 (0.4690) time: 5.5783 data: 0.0002 max mem: 71357 +[07:55:15.940546] Epoch: [1] [2880/6500] lr: 0.000048 closs: 0.7861 (0.7616) grad_norm: 0.3890 (0.4688) time: 5.5687 data: 0.0002 max mem: 71357 +[07:56:11.671960] Epoch: [1] [2890/6500] lr: 0.000048 closs: 0.7440 (0.7616) grad_norm: 0.4069 (0.4688) time: 5.5708 data: 0.0001 max mem: 71357 +[07:57:07.459023] Epoch: [1] [2900/6500] lr: 0.000048 closs: 0.7440 (0.7617) grad_norm: 0.3818 (0.4685) time: 5.5759 data: 0.0001 max mem: 71357 +[07:58:03.203864] Epoch: [1] [2910/6500] lr: 0.000048 closs: 0.7739 (0.7616) grad_norm: 0.3697 (0.4684) time: 5.5765 data: 0.0001 max mem: 71357 +[07:58:58.923850] Epoch: [1] [2920/6500] lr: 0.000048 closs: 0.7745 (0.7616) grad_norm: 0.3614 (0.4680) time: 5.5731 data: 0.0001 max mem: 71357 +[07:59:54.665057] Epoch: [1] [2930/6500] lr: 0.000048 closs: 0.7605 (0.7615) grad_norm: 0.3855 (0.4680) time: 5.5729 data: 0.0001 max mem: 71357 +[08:00:50.398913] Epoch: [1] [2940/6500] lr: 0.000048 closs: 0.6680 (0.7614) grad_norm: 0.3870 (0.4680) time: 5.5737 data: 0.0001 max mem: 71357 +[08:01:46.151660] Epoch: [1] [2950/6500] lr: 0.000048 closs: 0.6983 (0.7613) grad_norm: 0.4192 (0.4678) time: 5.5743 data: 0.0001 max mem: 71357 +[08:02:41.874089] Epoch: [1] [2960/6500] lr: 0.000047 closs: 0.7362 (0.7612) grad_norm: 0.4303 (0.4679) time: 5.5737 data: 0.0001 max mem: 71357 +[08:03:37.531424] Epoch: [1] [2970/6500] lr: 0.000047 closs: 0.7607 (0.7614) grad_norm: 0.4208 (0.4677) time: 5.5689 data: 0.0001 max mem: 71357 +[08:04:33.308851] Epoch: [1] [2980/6500] lr: 0.000047 closs: 0.8016 (0.7617) grad_norm: 0.4293 (0.4677) time: 5.5716 data: 0.0002 max mem: 71357 +[08:05:29.081119] Epoch: [1] [2990/6500] lr: 0.000047 closs: 0.8433 (0.7620) grad_norm: 0.4644 (0.4678) time: 5.5774 data: 0.0002 max mem: 71357 +[08:06:24.770784] Epoch: [1] [3000/6500] lr: 0.000047 closs: 0.8140 (0.7621) grad_norm: 0.4623 (0.4678) time: 5.5730 data: 0.0001 max mem: 71357 +[08:07:20.456994] Epoch: [1] [3010/6500] lr: 0.000047 closs: 0.7919 (0.7624) grad_norm: 0.4056 (0.4676) time: 5.5687 data: 0.0001 max mem: 71357 +[08:08:16.206729] Epoch: [1] [3020/6500] lr: 0.000047 closs: 0.7767 (0.7623) grad_norm: 0.3780 (0.4676) time: 5.5717 data: 0.0001 max mem: 71357 +[08:09:11.944448] Epoch: [1] [3030/6500] lr: 0.000047 closs: 0.7767 (0.7623) grad_norm: 0.3780 (0.4674) time: 5.5743 data: 0.0002 max mem: 71357 +[08:10:07.833770] Epoch: [1] [3040/6500] lr: 0.000047 closs: 0.7765 (0.7625) grad_norm: 0.3748 (0.4672) time: 5.5812 
data: 0.0002 max mem: 71357 +[08:11:03.571764] Epoch: [1] [3050/6500] lr: 0.000047 closs: 0.7737 (0.7626) grad_norm: 0.4118 (0.4674) time: 5.5812 data: 0.0001 max mem: 71357 +[08:11:59.228717] Epoch: [1] [3060/6500] lr: 0.000047 closs: 0.7259 (0.7627) grad_norm: 0.4142 (0.4672) time: 5.5697 data: 0.0001 max mem: 71357 +[08:12:54.997556] Epoch: [1] [3070/6500] lr: 0.000047 closs: 0.7579 (0.7627) grad_norm: 0.4118 (0.4670) time: 5.5712 data: 0.0001 max mem: 71357 +[08:13:50.769938] Epoch: [1] [3080/6500] lr: 0.000047 closs: 0.7579 (0.7629) grad_norm: 0.4118 (0.4668) time: 5.5770 data: 0.0001 max mem: 71357 +[08:14:46.551482] Epoch: [1] [3090/6500] lr: 0.000047 closs: 0.8032 (0.7630) grad_norm: 0.3768 (0.4667) time: 5.5776 data: 0.0001 max mem: 71357 +[08:15:42.253189] Epoch: [1] [3100/6500] lr: 0.000047 closs: 0.7503 (0.7631) grad_norm: 0.3593 (0.4665) time: 5.5740 data: 0.0001 max mem: 71357 +[08:16:37.935608] Epoch: [1] [3110/6500] lr: 0.000047 closs: 0.7480 (0.7632) grad_norm: 0.3968 (0.4666) time: 5.5691 data: 0.0001 max mem: 71357 +[08:17:33.663599] Epoch: [1] [3120/6500] lr: 0.000047 closs: 0.7927 (0.7635) grad_norm: 0.4407 (0.4668) time: 5.5704 data: 0.0001 max mem: 71357 +[08:18:29.345056] Epoch: [1] [3130/6500] lr: 0.000047 closs: 0.6941 (0.7632) grad_norm: 0.3968 (0.4668) time: 5.5704 data: 0.0001 max mem: 71357 +[08:19:24.975538] Epoch: [1] [3140/6500] lr: 0.000047 closs: 0.6547 (0.7629) grad_norm: 0.4024 (0.4667) time: 5.5655 data: 0.0002 max mem: 71357 +[08:20:20.672689] Epoch: [1] [3150/6500] lr: 0.000047 closs: 0.7241 (0.7629) grad_norm: 0.4054 (0.4671) time: 5.5663 data: 0.0002 max mem: 71357 +[08:21:16.505928] Epoch: [1] [3160/6500] lr: 0.000047 closs: 0.7440 (0.7630) grad_norm: 0.3850 (0.4667) time: 5.5764 data: 0.0001 max mem: 71357 +[08:22:12.375786] Epoch: [1] [3170/6500] lr: 0.000047 closs: 0.8047 (0.7632) grad_norm: 0.4024 (0.4664) time: 5.5850 data: 0.0001 max mem: 71357 +[08:23:08.092208] Epoch: [1] [3180/6500] lr: 0.000047 closs: 0.7895 (0.7631) grad_norm: 0.4011 (0.4664) time: 5.5792 data: 0.0001 max mem: 71357 +[08:24:03.827649] Epoch: [1] [3190/6500] lr: 0.000047 closs: 0.7448 (0.7630) grad_norm: 0.4006 (0.4662) time: 5.5725 data: 0.0001 max mem: 71357 +[08:24:59.532202] Epoch: [1] [3200/6500] lr: 0.000047 closs: 0.7496 (0.7631) grad_norm: 0.4011 (0.4661) time: 5.5719 data: 0.0001 max mem: 71357 +[08:25:55.397211] Epoch: [1] [3210/6500] lr: 0.000047 closs: 0.7620 (0.7631) grad_norm: 0.4011 (0.4659) time: 5.5784 data: 0.0001 max mem: 71357 +[08:26:51.117745] Epoch: [1] [3220/6500] lr: 0.000047 closs: 0.7429 (0.7630) grad_norm: 0.4174 (0.4658) time: 5.5791 data: 0.0001 max mem: 71357 +[08:27:46.826662] Epoch: [1] [3230/6500] lr: 0.000047 closs: 0.6742 (0.7628) grad_norm: 0.4086 (0.4658) time: 5.5713 data: 0.0001 max mem: 71357 +[08:28:42.685834] Epoch: [1] [3240/6500] lr: 0.000047 closs: 0.6707 (0.7625) grad_norm: 0.4086 (0.4656) time: 5.5783 data: 0.0001 max mem: 71357 +[08:29:38.511456] Epoch: [1] [3250/6500] lr: 0.000047 closs: 0.6844 (0.7624) grad_norm: 0.3959 (0.4653) time: 5.5842 data: 0.0001 max mem: 71357 +[08:30:34.409341] Epoch: [1] [3260/6500] lr: 0.000047 closs: 0.7046 (0.7623) grad_norm: 0.3587 (0.4650) time: 5.5861 data: 0.0001 max mem: 71357 +[08:31:30.133837] Epoch: [1] [3270/6500] lr: 0.000047 closs: 0.7094 (0.7624) grad_norm: 0.3697 (0.4648) time: 5.5810 data: 0.0001 max mem: 71357 +[08:32:25.764744] Epoch: [1] [3280/6500] lr: 0.000047 closs: 0.8080 (0.7624) grad_norm: 0.3697 (0.4645) time: 5.5677 data: 0.0001 max mem: 71357 +[08:33:21.575850] 
Epoch: [1] [3290/6500] lr: 0.000047 closs: 0.7767 (0.7625) grad_norm: 0.3703 (0.4643) time: 5.5720 data: 0.0001 max mem: 71357 +[08:34:17.542015] Epoch: [1] [3300/6500] lr: 0.000047 closs: 0.7996 (0.7626) grad_norm: 0.3703 (0.4639) time: 5.5888 data: 0.0001 max mem: 71357 +[08:35:13.335207] Epoch: [1] [3310/6500] lr: 0.000047 closs: 0.8321 (0.7627) grad_norm: 0.3927 (0.4637) time: 5.5879 data: 0.0001 max mem: 71357 +[08:36:09.139010] Epoch: [1] [3320/6500] lr: 0.000047 closs: 0.8484 (0.7629) grad_norm: 0.3927 (0.4638) time: 5.5797 data: 0.0001 max mem: 71357 +[08:37:04.835975] Epoch: [1] [3330/6500] lr: 0.000047 closs: 0.7901 (0.7628) grad_norm: 0.4108 (0.4642) time: 5.5749 data: 0.0001 max mem: 71357 +[08:38:00.607556] Epoch: [1] [3340/6500] lr: 0.000047 closs: 0.7757 (0.7629) grad_norm: 0.4291 (0.4640) time: 5.5733 data: 0.0001 max mem: 71357 +[08:38:56.311132] Epoch: [1] [3350/6500] lr: 0.000047 closs: 0.7757 (0.7630) grad_norm: 0.4346 (0.4640) time: 5.5737 data: 0.0001 max mem: 71357 +[08:39:52.026895] Epoch: [1] [3360/6500] lr: 0.000047 closs: 0.7453 (0.7628) grad_norm: 0.4087 (0.4638) time: 5.5709 data: 0.0001 max mem: 71357 +[08:40:47.717482] Epoch: [1] [3370/6500] lr: 0.000047 closs: 0.7354 (0.7627) grad_norm: 0.3643 (0.4664) time: 5.5702 data: 0.0001 max mem: 71357 +[08:41:43.520242] Epoch: [1] [3380/6500] lr: 0.000047 closs: 0.7594 (0.7628) grad_norm: 0.4010 (0.4665) time: 5.5745 data: 0.0001 max mem: 71357 +[08:42:39.268585] Epoch: [1] [3390/6500] lr: 0.000047 closs: 0.7647 (0.7628) grad_norm: 0.4010 (0.4664) time: 5.5775 data: 0.0001 max mem: 71357 +[08:43:35.001439] Epoch: [1] [3400/6500] lr: 0.000047 closs: 0.7647 (0.7628) grad_norm: 0.3704 (0.4664) time: 5.5740 data: 0.0001 max mem: 71357 +[08:44:30.743501] Epoch: [1] [3410/6500] lr: 0.000047 closs: 0.7684 (0.7630) grad_norm: 0.3704 (0.4662) time: 5.5737 data: 0.0002 max mem: 71357 +[08:45:26.393462] Epoch: [1] [3420/6500] lr: 0.000047 closs: 0.7215 (0.7627) grad_norm: 0.3796 (0.4673) time: 5.5695 data: 0.0002 max mem: 71357 +[08:46:22.210157] Epoch: [1] [3430/6500] lr: 0.000047 closs: 0.6823 (0.7628) grad_norm: 0.4108 (0.4672) time: 5.5732 data: 0.0001 max mem: 71357 +[08:47:17.932445] Epoch: [1] [3440/6500] lr: 0.000047 closs: 0.7888 (0.7629) grad_norm: 0.4108 (0.4673) time: 5.5768 data: 0.0001 max mem: 71357 +[08:48:13.684592] Epoch: [1] [3450/6500] lr: 0.000047 closs: 0.7483 (0.7628) grad_norm: 0.4411 (0.4673) time: 5.5736 data: 0.0001 max mem: 71357 +[08:49:09.395292] Epoch: [1] [3460/6500] lr: 0.000047 closs: 0.7008 (0.7626) grad_norm: 0.4411 (0.4672) time: 5.5731 data: 0.0001 max mem: 71357 +[08:50:05.184879] Epoch: [1] [3470/6500] lr: 0.000047 closs: 0.6853 (0.7625) grad_norm: 0.4585 (0.4672) time: 5.5749 data: 0.0001 max mem: 71357 +[08:51:01.025092] Epoch: [1] [3480/6500] lr: 0.000047 closs: 0.7112 (0.7624) grad_norm: 0.4481 (0.4671) time: 5.5814 data: 0.0001 max mem: 71357 +[08:51:56.784945] Epoch: [1] [3490/6500] lr: 0.000047 closs: 0.7441 (0.7625) grad_norm: 0.4319 (0.4668) time: 5.5799 data: 0.0001 max mem: 71357 +[08:52:52.516000] Epoch: [1] [3500/6500] lr: 0.000047 closs: 0.7127 (0.7623) grad_norm: 0.4192 (0.4667) time: 5.5744 data: 0.0001 max mem: 71357 +[08:53:48.323634] Epoch: [1] [3510/6500] lr: 0.000046 closs: 0.6960 (0.7622) grad_norm: 0.3713 (0.4664) time: 5.5768 data: 0.0001 max mem: 71357 +[08:54:44.172207] Epoch: [1] [3520/6500] lr: 0.000046 closs: 0.7526 (0.7620) grad_norm: 0.3674 (0.4662) time: 5.5827 data: 0.0001 max mem: 71357 +[08:55:39.982733] Epoch: [1] [3530/6500] lr: 0.000046 closs: 0.7364 
(0.7620) grad_norm: 0.3693 (0.4663) time: 5.5829 data: 0.0001 max mem: 71357 +[08:56:35.769447] Epoch: [1] [3540/6500] lr: 0.000046 closs: 0.7020 (0.7619) grad_norm: 0.3710 (0.4665) time: 5.5798 data: 0.0001 max mem: 71357 +[08:57:31.531558] Epoch: [1] [3550/6500] lr: 0.000046 closs: 0.7080 (0.7618) grad_norm: 0.3710 (0.4661) time: 5.5773 data: 0.0001 max mem: 71357 +[08:58:27.330798] Epoch: [1] [3560/6500] lr: 0.000046 closs: 0.7660 (0.7619) grad_norm: 0.4092 (0.4664) time: 5.5780 data: 0.0001 max mem: 71357 +[08:59:23.008683] Epoch: [1] [3570/6500] lr: 0.000046 closs: 0.7951 (0.7619) grad_norm: 0.4092 (0.4664) time: 5.5738 data: 0.0002 max mem: 71357 +[09:00:18.802643] Epoch: [1] [3580/6500] lr: 0.000046 closs: 0.8291 (0.7622) grad_norm: 0.4014 (0.4662) time: 5.5735 data: 0.0002 max mem: 71357 +[09:01:14.550611] Epoch: [1] [3590/6500] lr: 0.000046 closs: 0.8291 (0.7623) grad_norm: 0.4089 (0.4660) time: 5.5770 data: 0.0001 max mem: 71357 +[09:02:10.271773] Epoch: [1] [3600/6500] lr: 0.000046 closs: 0.7837 (0.7624) grad_norm: 0.4011 (0.4662) time: 5.5734 data: 0.0001 max mem: 71357 +[09:03:06.083973] Epoch: [1] [3610/6500] lr: 0.000046 closs: 0.7482 (0.7622) grad_norm: 0.3935 (0.4660) time: 5.5765 data: 0.0001 max mem: 71357 +[09:04:01.836078] Epoch: [1] [3620/6500] lr: 0.000046 closs: 0.6927 (0.7623) grad_norm: 0.3726 (0.4658) time: 5.5781 data: 0.0001 max mem: 71357 +[09:04:57.478678] Epoch: [1] [3630/6500] lr: 0.000046 closs: 0.6874 (0.7621) grad_norm: 0.3935 (0.4657) time: 5.5697 data: 0.0001 max mem: 71357 +[09:05:53.278864] Epoch: [1] [3640/6500] lr: 0.000046 closs: 0.7235 (0.7621) grad_norm: 0.3875 (0.4655) time: 5.5721 data: 0.0001 max mem: 71357 +[09:06:48.995421] Epoch: [1] [3650/6500] lr: 0.000046 closs: 0.7066 (0.7619) grad_norm: 0.3875 (0.4654) time: 5.5757 data: 0.0001 max mem: 71357 +[09:07:44.777569] Epoch: [1] [3660/6500] lr: 0.000046 closs: 0.6981 (0.7619) grad_norm: 0.3875 (0.4651) time: 5.5748 data: 0.0001 max mem: 71357 +[09:08:40.434566] Epoch: [1] [3670/6500] lr: 0.000046 closs: 0.7989 (0.7620) grad_norm: 0.3847 (0.4649) time: 5.5718 data: 0.0001 max mem: 71357 +[09:09:36.095219] Epoch: [1] [3680/6500] lr: 0.000046 closs: 0.7827 (0.7620) grad_norm: 0.3847 (0.4649) time: 5.5658 data: 0.0002 max mem: 71357 +[09:10:31.882262] Epoch: [1] [3690/6500] lr: 0.000046 closs: 0.7264 (0.7618) grad_norm: 0.3847 (0.4648) time: 5.5723 data: 0.0002 max mem: 71357 +[09:11:27.755027] Epoch: [1] [3700/6500] lr: 0.000046 closs: 0.7427 (0.7620) grad_norm: 0.3847 (0.4646) time: 5.5829 data: 0.0001 max mem: 71357 +[09:12:23.464314] Epoch: [1] [3710/6500] lr: 0.000046 closs: 0.7427 (0.7619) grad_norm: 0.3666 (0.4645) time: 5.5790 data: 0.0001 max mem: 71357 +[09:13:19.156115] Epoch: [1] [3720/6500] lr: 0.000046 closs: 0.6591 (0.7618) grad_norm: 0.3587 (0.4642) time: 5.5700 data: 0.0001 max mem: 71357 +[09:14:14.876345] Epoch: [1] [3730/6500] lr: 0.000046 closs: 0.6957 (0.7617) grad_norm: 0.3575 (0.4639) time: 5.5705 data: 0.0001 max mem: 71357 +[09:15:10.762487] Epoch: [1] [3740/6500] lr: 0.000046 closs: 0.7069 (0.7615) grad_norm: 0.3504 (0.4636) time: 5.5802 data: 0.0001 max mem: 71357 +[09:16:06.591376] Epoch: [1] [3750/6500] lr: 0.000046 closs: 0.7085 (0.7616) grad_norm: 0.3295 (0.4632) time: 5.5856 data: 0.0001 max mem: 71357 +[09:17:02.377046] Epoch: [1] [3760/6500] lr: 0.000046 closs: 0.8005 (0.7617) grad_norm: 0.3338 (0.4631) time: 5.5806 data: 0.0001 max mem: 71357 +[09:17:58.234451] Epoch: [1] [3770/6500] lr: 0.000046 closs: 0.7918 (0.7618) grad_norm: 0.3461 (0.4631) time: 5.5821 
data: 0.0001 max mem: 71357 +[09:18:54.023163] Epoch: [1] [3780/6500] lr: 0.000046 closs: 0.7503 (0.7619) grad_norm: 0.3461 (0.4628) time: 5.5822 data: 0.0001 max mem: 71357 +[09:19:49.736021] Epoch: [1] [3790/6500] lr: 0.000046 closs: 0.7219 (0.7617) grad_norm: 0.3838 (0.4628) time: 5.5750 data: 0.0002 max mem: 71357 +[09:20:45.437282] Epoch: [1] [3800/6500] lr: 0.000046 closs: 0.7728 (0.7621) grad_norm: 0.3838 (0.4625) time: 5.5706 data: 0.0002 max mem: 71357 +[09:21:41.168373] Epoch: [1] [3810/6500] lr: 0.000046 closs: 0.8062 (0.7621) grad_norm: 0.3776 (0.4627) time: 5.5715 data: 0.0001 max mem: 71357 +[09:22:36.975415] Epoch: [1] [3820/6500] lr: 0.000046 closs: 0.7432 (0.7622) grad_norm: 0.3782 (0.4624) time: 5.5768 data: 0.0001 max mem: 71357 +[09:23:32.914290] Epoch: [1] [3830/6500] lr: 0.000046 closs: 0.7504 (0.7620) grad_norm: 0.3533 (0.4622) time: 5.5872 data: 0.0001 max mem: 71357 +[09:24:28.594812] Epoch: [1] [3840/6500] lr: 0.000046 closs: 0.7185 (0.7618) grad_norm: 0.3782 (0.4622) time: 5.5809 data: 0.0002 max mem: 71357 +[09:25:24.364082] Epoch: [1] [3850/6500] lr: 0.000046 closs: 0.7185 (0.7618) grad_norm: 0.3533 (0.4620) time: 5.5724 data: 0.0002 max mem: 71357 +[09:26:20.100663] Epoch: [1] [3860/6500] lr: 0.000046 closs: 0.7267 (0.7616) grad_norm: 0.3744 (0.4618) time: 5.5752 data: 0.0001 max mem: 71357 +[09:27:15.928339] Epoch: [1] [3870/6500] lr: 0.000046 closs: 0.7195 (0.7616) grad_norm: 0.3995 (0.4619) time: 5.5781 data: 0.0001 max mem: 71357 +[09:28:11.670865] Epoch: [1] [3880/6500] lr: 0.000046 closs: 0.7310 (0.7615) grad_norm: 0.3729 (0.4618) time: 5.5784 data: 0.0001 max mem: 71357 +[09:29:07.413787] Epoch: [1] [3890/6500] lr: 0.000046 closs: 0.7109 (0.7613) grad_norm: 0.3729 (0.4616) time: 5.5742 data: 0.0001 max mem: 71357 +[09:30:03.241491] Epoch: [1] [3900/6500] lr: 0.000046 closs: 0.7159 (0.7613) grad_norm: 0.3739 (0.4617) time: 5.5784 data: 0.0001 max mem: 71357 +[09:30:59.019159] Epoch: [1] [3910/6500] lr: 0.000046 closs: 0.7093 (0.7612) grad_norm: 0.3819 (0.4618) time: 5.5802 data: 0.0001 max mem: 71357 +[09:31:54.797486] Epoch: [1] [3920/6500] lr: 0.000046 closs: 0.6981 (0.7612) grad_norm: 0.4453 (0.4617) time: 5.5777 data: 0.0001 max mem: 71357 +[09:32:50.515822] Epoch: [1] [3930/6500] lr: 0.000046 closs: 0.7891 (0.7612) grad_norm: 0.4498 (0.4617) time: 5.5747 data: 0.0001 max mem: 71357 +[09:33:46.285993] Epoch: [1] [3940/6500] lr: 0.000046 closs: 0.8399 (0.7613) grad_norm: 0.4643 (0.4619) time: 5.5743 data: 0.0001 max mem: 71357 +[09:34:42.060196] Epoch: [1] [3950/6500] lr: 0.000046 closs: 0.7596 (0.7614) grad_norm: 0.4053 (0.4616) time: 5.5771 data: 0.0002 max mem: 71357 +[09:35:37.892560] Epoch: [1] [3960/6500] lr: 0.000046 closs: 0.7154 (0.7614) grad_norm: 0.4053 (0.4618) time: 5.5802 data: 0.0002 max mem: 71357 +[09:36:33.555038] Epoch: [1] [3970/6500] lr: 0.000046 closs: 0.7729 (0.7614) grad_norm: 0.3912 (0.4616) time: 5.5746 data: 0.0001 max mem: 71357 +[09:37:29.253407] Epoch: [1] [3980/6500] lr: 0.000046 closs: 0.7599 (0.7612) grad_norm: 0.3912 (0.4615) time: 5.5679 data: 0.0001 max mem: 71357 +[09:38:25.028340] Epoch: [1] [3990/6500] lr: 0.000046 closs: 0.7088 (0.7613) grad_norm: 0.3892 (0.4614) time: 5.5735 data: 0.0001 max mem: 71357 +[09:39:20.898508] Epoch: [1] [4000/6500] lr: 0.000045 closs: 0.7781 (0.7614) grad_norm: 0.3892 (0.4614) time: 5.5821 data: 0.0001 max mem: 71357 +[09:40:16.589778] Epoch: [1] [4010/6500] lr: 0.000045 closs: 0.7052 (0.7612) grad_norm: 0.3709 (0.4611) time: 5.5780 data: 0.0001 max mem: 71357 +[09:41:12.368366] 
Epoch: [1] [4020/6500] lr: 0.000045 closs: 0.7052 (0.7612) grad_norm: 0.3709 (0.4611) time: 5.5734 data: 0.0001 max mem: 71357 +[09:42:08.075783] Epoch: [1] [4030/6500] lr: 0.000045 closs: 0.7410 (0.7613) grad_norm: 0.3698 (0.4610) time: 5.5742 data: 0.0001 max mem: 71357 +[09:43:03.790892] Epoch: [1] [4040/6500] lr: 0.000045 closs: 0.7635 (0.7613) grad_norm: 0.3481 (0.4610) time: 5.5711 data: 0.0001 max mem: 71357 +[09:43:59.611304] Epoch: [1] [4050/6500] lr: 0.000045 closs: 0.7848 (0.7613) grad_norm: 0.3679 (0.4608) time: 5.5767 data: 0.0001 max mem: 71357 +[09:44:55.346989] Epoch: [1] [4060/6500] lr: 0.000045 closs: 0.8193 (0.7616) grad_norm: 0.3679 (0.4607) time: 5.5777 data: 0.0002 max mem: 71357 +[09:45:51.144585] Epoch: [1] [4070/6500] lr: 0.000045 closs: 0.8203 (0.7615) grad_norm: 0.3481 (0.4605) time: 5.5765 data: 0.0002 max mem: 71357 +[09:46:46.827827] Epoch: [1] [4080/6500] lr: 0.000045 closs: 0.7796 (0.7615) grad_norm: 0.3621 (0.4605) time: 5.5739 data: 0.0001 max mem: 71357 +[09:47:42.580540] Epoch: [1] [4090/6500] lr: 0.000045 closs: 0.7967 (0.7616) grad_norm: 0.3748 (0.4606) time: 5.5716 data: 0.0001 max mem: 71357 +[09:48:38.297486] Epoch: [1] [4100/6500] lr: 0.000045 closs: 0.7621 (0.7619) grad_norm: 0.4406 (0.4609) time: 5.5733 data: 0.0001 max mem: 71357 +[09:49:33.991922] Epoch: [1] [4110/6500] lr: 0.000045 closs: 0.7621 (0.7618) grad_norm: 0.5227 (0.4609) time: 5.5705 data: 0.0002 max mem: 71357 +[09:50:29.739373] Epoch: [1] [4120/6500] lr: 0.000045 closs: 0.6884 (0.7617) grad_norm: 0.4406 (0.4607) time: 5.5720 data: 0.0002 max mem: 71357 +[09:51:25.522880] Epoch: [1] [4130/6500] lr: 0.000045 closs: 0.7258 (0.7617) grad_norm: 0.4095 (0.4607) time: 5.5764 data: 0.0001 max mem: 71357 +[09:52:21.422843] Epoch: [1] [4140/6500] lr: 0.000045 closs: 0.7362 (0.7617) grad_norm: 0.3938 (0.4605) time: 5.5841 data: 0.0001 max mem: 71357 +[09:53:17.172143] Epoch: [1] [4150/6500] lr: 0.000045 closs: 0.7308 (0.7617) grad_norm: 0.3742 (0.4604) time: 5.5824 data: 0.0001 max mem: 71357 +[09:54:12.914122] Epoch: [1] [4160/6500] lr: 0.000045 closs: 0.7515 (0.7616) grad_norm: 0.3742 (0.4603) time: 5.5744 data: 0.0001 max mem: 71357 +[09:55:08.625285] Epoch: [1] [4170/6500] lr: 0.000045 closs: 0.7577 (0.7616) grad_norm: 0.3742 (0.4602) time: 5.5725 data: 0.0001 max mem: 71357 +[09:56:04.404235] Epoch: [1] [4180/6500] lr: 0.000045 closs: 0.7953 (0.7617) grad_norm: 0.3742 (0.4600) time: 5.5744 data: 0.0001 max mem: 71357 +[09:57:00.149637] Epoch: [1] [4190/6500] lr: 0.000045 closs: 0.7425 (0.7615) grad_norm: 0.3742 (0.4598) time: 5.5761 data: 0.0001 max mem: 71357 +[09:57:55.891705] Epoch: [1] [4200/6500] lr: 0.000045 closs: 0.7307 (0.7614) grad_norm: 0.3675 (0.4596) time: 5.5743 data: 0.0001 max mem: 71357 +[09:58:51.703348] Epoch: [1] [4210/6500] lr: 0.000045 closs: 0.7554 (0.7616) grad_norm: 0.3542 (0.4596) time: 5.5776 data: 0.0001 max mem: 71357 +[09:59:47.523751] Epoch: [1] [4220/6500] lr: 0.000045 closs: 0.7977 (0.7617) grad_norm: 0.3714 (0.4594) time: 5.5815 data: 0.0001 max mem: 71357 +[10:00:43.137897] Epoch: [1] [4230/6500] lr: 0.000045 closs: 0.7750 (0.7617) grad_norm: 0.3772 (0.4597) time: 5.5716 data: 0.0001 max mem: 71357 +[10:01:38.861888] Epoch: [1] [4240/6500] lr: 0.000045 closs: 0.7175 (0.7616) grad_norm: 0.3714 (0.4594) time: 5.5668 data: 0.0001 max mem: 71357 +[10:02:34.575388] Epoch: [1] [4250/6500] lr: 0.000045 closs: 0.7552 (0.7615) grad_norm: 0.3486 (0.4591) time: 5.5718 data: 0.0001 max mem: 71357 +[10:03:30.284625] Epoch: [1] [4260/6500] lr: 0.000045 closs: 0.8085 
(0.7617) grad_norm: 0.3462 (0.4591) time: 5.5710 data: 0.0001 max mem: 71357 +[10:04:26.115209] Epoch: [1] [4270/6500] lr: 0.000045 closs: 0.7622 (0.7617) grad_norm: 0.3462 (0.4591) time: 5.5769 data: 0.0001 max mem: 71357 +[10:05:21.870075] Epoch: [1] [4280/6500] lr: 0.000045 closs: 0.7565 (0.7615) grad_norm: 0.3631 (0.4589) time: 5.5792 data: 0.0001 max mem: 71357 +[10:06:17.586494] Epoch: [1] [4290/6500] lr: 0.000045 closs: 0.7211 (0.7615) grad_norm: 0.3673 (0.4587) time: 5.5734 data: 0.0001 max mem: 71357 +[10:07:13.372539] Epoch: [1] [4300/6500] lr: 0.000045 closs: 0.7198 (0.7613) grad_norm: 0.3553 (0.4586) time: 5.5750 data: 0.0001 max mem: 71357 +[10:08:09.184606] Epoch: [1] [4310/6500] lr: 0.000045 closs: 0.7264 (0.7612) grad_norm: 0.3453 (0.4589) time: 5.5798 data: 0.0001 max mem: 71357 +[10:09:04.953272] Epoch: [1] [4320/6500] lr: 0.000045 closs: 0.7640 (0.7613) grad_norm: 0.3516 (0.4588) time: 5.5789 data: 0.0001 max mem: 71357 +[10:10:00.689997] Epoch: [1] [4330/6500] lr: 0.000045 closs: 0.7334 (0.7610) grad_norm: 0.3783 (0.4586) time: 5.5752 data: 0.0001 max mem: 71357 +[10:10:56.358374] Epoch: [1] [4340/6500] lr: 0.000045 closs: 0.7335 (0.7611) grad_norm: 0.4012 (0.4585) time: 5.5702 data: 0.0001 max mem: 71357 +[10:11:52.042943] Epoch: [1] [4350/6500] lr: 0.000045 closs: 0.7335 (0.7611) grad_norm: 0.4101 (0.4585) time: 5.5675 data: 0.0001 max mem: 71357 +[10:12:47.814019] Epoch: [1] [4360/6500] lr: 0.000045 closs: 0.7259 (0.7610) grad_norm: 0.3995 (0.4584) time: 5.5727 data: 0.0001 max mem: 71357 +[10:13:43.550859] Epoch: [1] [4370/6500] lr: 0.000045 closs: 0.7493 (0.7610) grad_norm: 0.3995 (0.4584) time: 5.5753 data: 0.0001 max mem: 71357 +[10:14:39.253856] Epoch: [1] [4380/6500] lr: 0.000045 closs: 0.7821 (0.7611) grad_norm: 0.4144 (0.4587) time: 5.5719 data: 0.0001 max mem: 71357 +[10:15:35.182285] Epoch: [1] [4390/6500] lr: 0.000045 closs: 0.7537 (0.7612) grad_norm: 0.3995 (0.4589) time: 5.5815 data: 0.0001 max mem: 71357 +[10:16:30.985009] Epoch: [1] [4400/6500] lr: 0.000045 closs: 0.7537 (0.7611) grad_norm: 0.3660 (0.4588) time: 5.5864 data: 0.0001 max mem: 71357 +[10:17:26.686911] Epoch: [1] [4410/6500] lr: 0.000045 closs: 0.7020 (0.7611) grad_norm: 0.3723 (0.4587) time: 5.5751 data: 0.0001 max mem: 71357 +[10:18:22.372236] Epoch: [1] [4420/6500] lr: 0.000045 closs: 0.7378 (0.7611) grad_norm: 0.3589 (0.4585) time: 5.5692 data: 0.0001 max mem: 71357 +[10:19:18.157527] Epoch: [1] [4430/6500] lr: 0.000045 closs: 0.7378 (0.7611) grad_norm: 0.3699 (0.4583) time: 5.5734 data: 0.0001 max mem: 71357 +[10:20:14.052884] Epoch: [1] [4440/6500] lr: 0.000044 closs: 0.6983 (0.7610) grad_norm: 0.3723 (0.4585) time: 5.5839 data: 0.0002 max mem: 71357 +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
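(The torch.distributed.run warning above marks a second job launch appended to this log; its full argument dump follows below.) A note for reading the metric records: each `name: x (y)` pair prints a smoothed recent value followed by the running average in parentheses, so `closs: 0.6983 (0.7610)` means the loss around the current iteration is about 0.6983 while the epoch-wide average sits at 0.7610. Below is a minimal, illustrative sketch for turning this dump into plottable series. It is not part of the LLaMA2-Accessory repository; the names `LINE_RE` and `iter_metrics` are invented here, and the regex is merely inferred from the line format visible above.

import re
from typing import Iterator

# Record format inferred from the log above, e.g.:
# [10:21:09.915438] Epoch: [1] [4450/6500] lr: 0.000044 closs: 0.6983 (0.7610) grad_norm: 0.3699 (0.4583) ...
LINE_RE = re.compile(
    r"\[(?P<ts>[\d:.]+)\] Epoch: \[(?P<epoch>\d+)\] "
    r"\[(?P<step>\d+)/(?P<steps_total>\d+)\] "
    r"lr: (?P<lr>[\d.e+-]+) "
    r"closs: (?P<closs>[\d.]+) \((?P<closs_avg>[\d.]+)\) "
    r"grad_norm: (?P<gnorm>[\d.]+) \((?P<gnorm_avg>[\d.]+)\)"
)

def iter_metrics(text: str) -> Iterator[dict]:
    """Yield one dict per metric record found in a raw log string."""
    # finditer over the whole text (rather than matching line by line)
    # also copes with many records sharing one physical line, as in this diff.
    for m in LINE_RE.finditer(text):
        yield {
            "epoch": int(m["epoch"]),
            "step": int(m["step"]),
            "lr": float(m["lr"]),
            "closs": float(m["closs"]),
            "closs_avg": float(m["closs_avg"]),
            "grad_norm": float(m["gnorm"]),
            "grad_norm_avg": float(m["gnorm_avg"]),
        }

# Sample record copied verbatim from the log below.
sample = ("[10:21:09.915438] Epoch: [1] [4450/6500] lr: 0.000044 "
          "closs: 0.6983 (0.7610) grad_norm: 0.3699 (0.4583) "
          "time: 5.5878 data: 0.0002 max mem: 71357")
print(next(iter_metrics(sample)))
# -> {'epoch': 1, 'step': 4450, 'lr': 4.4e-05, 'closs': 0.6983,
#     'closs_avg': 0.761, 'grad_norm': 0.3699, 'grad_norm_avg': 0.4583}

One detail worth noting from the Namespace dump that follows: with batch_size=4, accum_iter=2, and two data-parallel ranks (world_size=2, model_parallel_size=1), the effective optimizer batch works out to 4 x 2 x 2 = 16 sequences per step, assuming batch_size is counted per data-parallel rank as is conventional; the ModelArgs line (dim=8192, n_layers=80, n_heads=64, n_kv_heads=8) matches the Llama-2-70B geometry.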
+*****************************************
+| distributed init (rank 1): env://, gpu 1
+| distributed init (rank 0): env://, gpu 0
+[10:21:09.403398] > initializing model parallel with size 1
+[10:21:09.403479] > initializing ddp with size 2
+[10:21:09.403488] > initializing pipeline with size 1
+[10:21:09.450720] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory
+[10:21:09.450818] Namespace(batch_size=4,
+accum_iter=2,
+llama_type='llama_peft',
+llama_config=['../checkpoints/llama2/Llama-2-70b/params.json'],
+no_visual=True,
+tokenizer_path='../checkpoints/llama2/Llama-2-70b/tokenizer.model',
+pretrained_path='../checkpoints/llama2/Llama-2-70b/',
+pretrained_type='meta_ori',
+weight_decay=0.02,
+lr=5e-05,
+min_lr=5e-06,
+epochs=4,
+warmup_epochs=1.0,
+clip_grad=2,
+max_words=512,
+dialog=False,
+data_config='configs/data/finetune/sg/alpaca.yaml',
+output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_70B',
+log_dir='./output_dir',
+save_interval=1,
+device='cuda',
+seed=0,
+resume='',
+num_workers=8,
+pin_mem=True,
+world_size=2,
+local_rank=-1,
+dist_on_itp=False,
+dist_url='env://',
+model_parallel_size=1,
+data_parallel='sdp',
+precision='bf16',
+checkpointing=True,
+quant=True,
+rank=0,
+gpu=0,
+distributed=True,
+dist_backend='nccl')
+[10:21:09.451703] Start initialization.
+[10:21:09.466825] Model Args:
+ ModelArgs(dim=8192, n_layers=80, n_heads=64, n_kv_heads=8, vocab_size=32000, multiple_of=4096, ffn_dim_multiplier=1.3, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True)
+[10:21:09.915438] Epoch: [1] [4450/6500] lr: 0.000044 closs: 0.6983 (0.7610) grad_norm: 0.3699 (0.4583) time: 5.5878 data: 0.0002 max mem: 71357 +[10:22:05.668596] Epoch: [1] [4460/6500] lr: 0.000044 closs: 0.7467 (0.7610) grad_norm: 0.3758 (0.4583) time: 5.5807 data: 0.0002 max mem: 71357 +[10:23:01.470643] Epoch: [1] [4470/6500] lr: 0.000044 closs: 0.7482 (0.7610) grad_norm: 0.3644 (0.4581) time: 5.5776 data: 0.0002 max mem: 71357 +[10:23:57.292617] Epoch: [1] [4480/6500] lr: 0.000044 closs: 0.7199 (0.7609) grad_norm: 0.3613 (0.4579) time: 5.5811 data: 0.0002 max mem: 71357 +[10:24:53.185106] Epoch: [1] [4490/6500] lr: 0.000044 closs: 0.8161 (0.7611) grad_norm: 0.3613 (0.4580) time: 5.5856 data: 0.0001 max mem: 71357 +[10:25:48.906594] Epoch: [1] [4500/6500] lr: 0.000044 closs: 0.8007 (0.7612) grad_norm: 0.3793 (0.4581) time: 5.5806 data: 0.0002 max mem: 71357 +[10:26:44.655847] Epoch: [1] [4510/6500] lr: 0.000044 closs: 0.7206 (0.7613) grad_norm: 0.3905 (0.4579) time: 5.5735 data: 0.0002 max mem: 71357 +[10:27:40.351074] Epoch: [1] [4520/6500] lr: 0.000044 closs: 0.7976 (0.7615) grad_norm: 0.4285 (0.4580) time: 5.5721 data: 0.0002 max mem: 71357 +[10:28:36.191258] Epoch: [1] [4530/6500] lr: 0.000044 closs: 0.7976 (0.7615) grad_norm: 0.4008 (0.4579) time: 5.5767 data: 0.0001 max mem: 71357 +[10:29:31.968038] Epoch: [1] [4540/6500] lr: 0.000044 closs: 0.7567 (0.7614) grad_norm: 0.3905 (0.4577) time: 5.5807 data: 0.0001 max mem: 71357 +[10:30:27.763567] Epoch: [1] [4550/6500] lr: 0.000044 closs: 0.7294 (0.7615) grad_norm: 0.3837 (0.4576) time: 5.5785 data: 0.0001 max mem: 71357 +[10:31:23.432711] Epoch: [1] [4560/6500] lr: 0.000044 closs: 0.7294 (0.7615) grad_norm: 0.3770 (0.4577) time: 5.5731 data: 0.0001 max mem: 71357 +[10:32:19.140978] Epoch: [1] [4570/6500] lr: 0.000044 closs: 0.7468 (0.7616) grad_norm: 0.3770 (0.4577) time: 5.5688 data: 0.0001 max mem: 71357 +[10:33:14.948775] Epoch: [1] [4580/6500] lr: 0.000044 closs: 0.7468 (0.7616) grad_norm: 
0.3815 (0.4577) time: 5.5757 data: 0.0001 max mem: 71357 +[10:34:10.772732] Epoch: [1] [4590/6500] lr: 0.000044 closs: 0.7741 (0.7616) grad_norm: 0.3815 (0.4577) time: 5.5815 data: 0.0001 max mem: 71357 +[10:35:06.473382] Epoch: [1] [4600/6500] lr: 0.000044 closs: 0.7606 (0.7615) grad_norm: 0.3815 (0.4576) time: 5.5761 data: 0.0002 max mem: 71357 +[10:36:02.235458] Epoch: [1] [4610/6500] lr: 0.000044 closs: 0.6930 (0.7615) grad_norm: 0.3815 (0.4577) time: 5.5730 data: 0.0002 max mem: 71357 +[10:36:58.036756] Epoch: [1] [4620/6500] lr: 0.000044 closs: 0.7670 (0.7616) grad_norm: 0.4095 (0.4576) time: 5.5780 data: 0.0001 max mem: 71357 +[10:37:53.792580] Epoch: [1] [4630/6500] lr: 0.000044 closs: 0.7398 (0.7616) grad_norm: 0.4037 (0.4574) time: 5.5778 data: 0.0001 max mem: 71357 +[10:38:49.606204] Epoch: [1] [4640/6500] lr: 0.000044 closs: 0.7056 (0.7615) grad_norm: 0.3881 (0.4572) time: 5.5784 data: 0.0001 max mem: 71357 +[10:39:45.383787] Epoch: [1] [4650/6500] lr: 0.000044 closs: 0.7542 (0.7616) grad_norm: 0.3881 (0.4574) time: 5.5794 data: 0.0002 max mem: 71357 +[10:40:41.177173] Epoch: [1] [4660/6500] lr: 0.000044 closs: 0.7175 (0.7616) grad_norm: 0.3625 (0.4574) time: 5.5785 data: 0.0001 max mem: 71357 +[10:41:36.903399] Epoch: [1] [4670/6500] lr: 0.000044 closs: 0.6899 (0.7614) grad_norm: 0.3625 (0.4574) time: 5.5759 data: 0.0001 max mem: 71357 +[10:42:32.710916] Epoch: [1] [4680/6500] lr: 0.000044 closs: 0.7280 (0.7615) grad_norm: 0.3585 (0.4571) time: 5.5765 data: 0.0001 max mem: 71357 +[10:43:28.526448] Epoch: [1] [4690/6500] lr: 0.000044 closs: 0.7319 (0.7615) grad_norm: 0.3625 (0.4570) time: 5.5810 data: 0.0001 max mem: 71357 +[10:44:24.274780] Epoch: [1] [4700/6500] lr: 0.000044 closs: 0.7385 (0.7614) grad_norm: 0.3723 (0.4571) time: 5.5781 data: 0.0001 max mem: 71357 +[10:45:20.118274] Epoch: [1] [4710/6500] lr: 0.000044 closs: 0.7702 (0.7616) grad_norm: 0.3723 (0.4570) time: 5.5795 data: 0.0002 max mem: 71357 +[10:46:15.876910] Epoch: [1] [4720/6500] lr: 0.000044 closs: 0.8632 (0.7617) grad_norm: 0.3725 (0.4569) time: 5.5800 data: 0.0002 max mem: 71357 +[10:47:11.670606] Epoch: [1] [4730/6500] lr: 0.000044 closs: 0.7829 (0.7617) grad_norm: 0.3538 (0.4566) time: 5.5775 data: 0.0001 max mem: 71357 +[10:48:07.502727] Epoch: [1] [4740/6500] lr: 0.000044 closs: 0.8662 (0.7620) grad_norm: 0.3451 (0.4564) time: 5.5812 data: 0.0001 max mem: 71357 +[10:49:03.350525] Epoch: [1] [4750/6500] lr: 0.000044 closs: 0.8316 (0.7620) grad_norm: 0.3455 (0.4564) time: 5.5839 data: 0.0001 max mem: 71357 +[10:49:59.088820] Epoch: [1] [4760/6500] lr: 0.000044 closs: 0.7709 (0.7620) grad_norm: 0.3764 (0.4564) time: 5.5792 data: 0.0002 max mem: 71357 +[10:50:54.918656] Epoch: [1] [4770/6500] lr: 0.000044 closs: 0.7709 (0.7620) grad_norm: 0.4139 (0.4562) time: 5.5783 data: 0.0002 max mem: 71357 +[10:51:50.661923] Epoch: [1] [4780/6500] lr: 0.000044 closs: 0.7886 (0.7621) grad_norm: 0.4284 (0.4565) time: 5.5786 data: 0.0001 max mem: 71357 +[10:52:46.468233] Epoch: [1] [4790/6500] lr: 0.000044 closs: 0.8177 (0.7621) grad_norm: 0.4017 (0.4562) time: 5.5774 data: 0.0001 max mem: 71357 +[10:53:42.298368] Epoch: [1] [4800/6500] lr: 0.000044 closs: 0.8177 (0.7623) grad_norm: 0.3854 (0.4563) time: 5.5817 data: 0.0001 max mem: 71357 +[10:54:37.963093] Epoch: [1] [4810/6500] lr: 0.000044 closs: 0.7960 (0.7625) grad_norm: 0.3854 (0.4572) time: 5.5746 data: 0.0001 max mem: 71357 +[10:55:33.724578] Epoch: [1] [4820/6500] lr: 0.000044 closs: 0.7233 (0.7623) grad_norm: 0.3701 (0.4571) time: 5.5712 data: 0.0001 max mem: 
71357 +[10:56:29.502489] Epoch: [1] [4830/6500] lr: 0.000044 closs: 0.7313 (0.7624) grad_norm: 0.3720 (0.4579) time: 5.5769 data: 0.0001 max mem: 71357 +[10:57:25.285468] Epoch: [1] [4840/6500] lr: 0.000043 closs: 0.7270 (0.7623) grad_norm: 0.3717 (0.4578) time: 5.5780 data: 0.0001 max mem: 71357 +[10:58:20.999566] Epoch: [1] [4850/6500] lr: 0.000043 closs: 0.7105 (0.7623) grad_norm: 0.4146 (0.4578) time: 5.5747 data: 0.0001 max mem: 71357 +[10:59:16.743808] Epoch: [1] [4860/6500] lr: 0.000043 closs: 0.7434 (0.7624) grad_norm: 0.4005 (0.4577) time: 5.5728 data: 0.0001 max mem: 71357 +[11:00:12.471000] Epoch: [1] [4870/6500] lr: 0.000043 closs: 0.8188 (0.7625) grad_norm: 0.4129 (0.4576) time: 5.5734 data: 0.0002 max mem: 71357 +[11:01:08.334659] Epoch: [1] [4880/6500] lr: 0.000043 closs: 0.7880 (0.7625) grad_norm: 0.3817 (0.4573) time: 5.5794 data: 0.0002 max mem: 71357 +[11:02:04.159214] Epoch: [1] [4890/6500] lr: 0.000043 closs: 0.7651 (0.7626) grad_norm: 0.3529 (0.4571) time: 5.5843 data: 0.0001 max mem: 71357 +[11:02:59.872216] Epoch: [1] [4900/6500] lr: 0.000043 closs: 0.7651 (0.7626) grad_norm: 0.3376 (0.4570) time: 5.5768 data: 0.0001 max mem: 71357 +[11:03:55.655000] Epoch: [1] [4910/6500] lr: 0.000043 closs: 0.7754 (0.7626) grad_norm: 0.3316 (0.4568) time: 5.5747 data: 0.0001 max mem: 71357 +[11:04:51.487749] Epoch: [1] [4920/6500] lr: 0.000043 closs: 0.7336 (0.7625) grad_norm: 0.3376 (0.4566) time: 5.5806 data: 0.0002 max mem: 71357 +[11:05:47.384044] Epoch: [1] [4930/6500] lr: 0.000043 closs: 0.7084 (0.7624) grad_norm: 0.3480 (0.4565) time: 5.5863 data: 0.0002 max mem: 71357 +[11:06:43.177011] Epoch: [1] [4940/6500] lr: 0.000043 closs: 0.7459 (0.7623) grad_norm: 0.3427 (0.4563) time: 5.5843 data: 0.0001 max mem: 71357 +[11:07:38.843228] Epoch: [1] [4950/6500] lr: 0.000043 closs: 0.7459 (0.7623) grad_norm: 0.3660 (0.4564) time: 5.5728 data: 0.0001 max mem: 71357 +[11:08:34.583423] Epoch: [1] [4960/6500] lr: 0.000043 closs: 0.7790 (0.7624) grad_norm: 0.4092 (0.4563) time: 5.5702 data: 0.0001 max mem: 71357 +[11:09:30.439676] Epoch: [1] [4970/6500] lr: 0.000043 closs: 0.7991 (0.7624) grad_norm: 0.4092 (0.4561) time: 5.5797 data: 0.0001 max mem: 71357 +[11:10:26.188771] Epoch: [1] [4980/6500] lr: 0.000043 closs: 0.7671 (0.7624) grad_norm: 0.3963 (0.4561) time: 5.5801 data: 0.0002 max mem: 71357 +[11:11:21.946425] Epoch: [1] [4990/6500] lr: 0.000043 closs: 0.7342 (0.7623) grad_norm: 0.3800 (0.4560) time: 5.5752 data: 0.0002 max mem: 71357 +[11:12:17.744689] Epoch: [1] [5000/6500] lr: 0.000043 closs: 0.7342 (0.7622) grad_norm: 0.3800 (0.4565) time: 5.5777 data: 0.0001 max mem: 71357 +[11:13:13.585475] Epoch: [1] [5010/6500] lr: 0.000043 closs: 0.7470 (0.7621) grad_norm: 0.3830 (0.4563) time: 5.5818 data: 0.0001 max mem: 71357 +[11:14:09.395279] Epoch: [1] [5020/6500] lr: 0.000043 closs: 0.7470 (0.7621) grad_norm: 0.3689 (0.4561) time: 5.5824 data: 0.0001 max mem: 71357 +[11:15:05.164880] Epoch: [1] [5030/6500] lr: 0.000043 closs: 0.7894 (0.7623) grad_norm: 0.3559 (0.4563) time: 5.5788 data: 0.0002 max mem: 71357 +[11:16:00.882700] Epoch: [1] [5040/6500] lr: 0.000043 closs: 0.7406 (0.7621) grad_norm: 0.3560 (0.4567) time: 5.5742 data: 0.0002 max mem: 71357 +[11:16:56.643127] Epoch: [1] [5050/6500] lr: 0.000043 closs: 0.7206 (0.7621) grad_norm: 0.3608 (0.4565) time: 5.5738 data: 0.0001 max mem: 71357 +[11:17:52.504961] Epoch: [1] [5060/6500] lr: 0.000043 closs: 0.7532 (0.7621) grad_norm: 0.3612 (0.4564) time: 5.5810 data: 0.0001 max mem: 71357 +[11:18:48.234806] Epoch: [1] [5070/6500] 
lr: 0.000043 closs: 0.7623 (0.7620) grad_norm: 0.3612 (0.4564) time: 5.5795 data: 0.0001 max mem: 71357 +[11:19:44.087526] Epoch: [1] [5080/6500] lr: 0.000043 closs: 0.6898 (0.7620) grad_norm: 0.3674 (0.4563) time: 5.5790 data: 0.0001 max mem: 71357 +[11:20:39.901787] Epoch: [1] [5090/6500] lr: 0.000043 closs: 0.6929 (0.7620) grad_norm: 0.3674 (0.4563) time: 5.5832 data: 0.0001 max mem: 71357 +[11:21:35.717532] Epoch: [1] [5100/6500] lr: 0.000043 closs: 0.7438 (0.7620) grad_norm: 0.3975 (0.4565) time: 5.5814 data: 0.0001 max mem: 71357 +[11:22:31.404559] Epoch: [1] [5110/6500] lr: 0.000043 closs: 0.7661 (0.7621) grad_norm: 0.4491 (0.4566) time: 5.5750 data: 0.0001 max mem: 71357 +[11:23:27.158982] Epoch: [1] [5120/6500] lr: 0.000043 closs: 0.8052 (0.7622) grad_norm: 0.4491 (0.4565) time: 5.5720 data: 0.0001 max mem: 71357 +[11:24:22.940301] Epoch: [1] [5130/6500] lr: 0.000043 closs: 0.7489 (0.7619) grad_norm: 0.4491 (0.4564) time: 5.5767 data: 0.0001 max mem: 71357 +[11:25:18.734962] Epoch: [1] [5140/6500] lr: 0.000043 closs: 0.7677 (0.7621) grad_norm: 0.4062 (0.4563) time: 5.5787 data: 0.0002 max mem: 71357 +[11:26:14.603090] Epoch: [1] [5150/6500] lr: 0.000043 closs: 0.7984 (0.7621) grad_norm: 0.3606 (0.4563) time: 5.5830 data: 0.0002 max mem: 71357 +[11:27:10.315042] Epoch: [1] [5160/6500] lr: 0.000043 closs: 0.7870 (0.7623) grad_norm: 0.3626 (0.4561) time: 5.5789 data: 0.0001 max mem: 71357 +[11:28:05.963749] Epoch: [1] [5170/6500] lr: 0.000043 closs: 0.8181 (0.7624) grad_norm: 0.3642 (0.4562) time: 5.5679 data: 0.0001 max mem: 71357 +[11:29:01.772870] Epoch: [1] [5180/6500] lr: 0.000043 closs: 0.7284 (0.7623) grad_norm: 0.3776 (0.4562) time: 5.5728 data: 0.0001 max mem: 71357 +[11:29:57.532695] Epoch: [1] [5190/6500] lr: 0.000043 closs: 0.7601 (0.7623) grad_norm: 0.3662 (0.4561) time: 5.5783 data: 0.0002 max mem: 71357 +[11:30:53.205321] Epoch: [1] [5200/6500] lr: 0.000043 closs: 0.7734 (0.7623) grad_norm: 0.4361 (0.4561) time: 5.5715 data: 0.0002 max mem: 71357 +[11:31:48.890650] Epoch: [1] [5210/6500] lr: 0.000043 closs: 0.7016 (0.7622) grad_norm: 0.4014 (0.4561) time: 5.5678 data: 0.0001 max mem: 71357 +[11:32:44.490303] Epoch: [1] [5220/6500] lr: 0.000043 closs: 0.7215 (0.7623) grad_norm: 0.4014 (0.4563) time: 5.5642 data: 0.0001 max mem: 71357 +[11:33:40.288737] Epoch: [1] [5230/6500] lr: 0.000042 closs: 0.6987 (0.7620) grad_norm: 0.4014 (0.4562) time: 5.5698 data: 0.0001 max mem: 71357 +[11:34:36.201586] Epoch: [1] [5240/6500] lr: 0.000042 closs: 0.7130 (0.7620) grad_norm: 0.3873 (0.4561) time: 5.5855 data: 0.0001 max mem: 71357 +[11:35:31.926282] Epoch: [1] [5250/6500] lr: 0.000042 closs: 0.7390 (0.7619) grad_norm: 0.3751 (0.4559) time: 5.5818 data: 0.0002 max mem: 71357 +[11:36:27.605601] Epoch: [1] [5260/6500] lr: 0.000042 closs: 0.6863 (0.7618) grad_norm: 0.3751 (0.4560) time: 5.5701 data: 0.0002 max mem: 71357 +[11:37:23.356717] Epoch: [1] [5270/6500] lr: 0.000042 closs: 0.7545 (0.7619) grad_norm: 0.3868 (0.4560) time: 5.5714 data: 0.0001 max mem: 71357 +[11:38:19.052455] Epoch: [1] [5280/6500] lr: 0.000042 closs: 0.7771 (0.7618) grad_norm: 0.4176 (0.4562) time: 5.5723 data: 0.0001 max mem: 71357 +[11:39:14.810553] Epoch: [1] [5290/6500] lr: 0.000042 closs: 0.7234 (0.7619) grad_norm: 0.4176 (0.4560) time: 5.5726 data: 0.0001 max mem: 71357 +[11:40:10.478168] Epoch: [1] [5300/6500] lr: 0.000042 closs: 0.7234 (0.7619) grad_norm: 0.3894 (0.4561) time: 5.5712 data: 0.0002 max mem: 71357 +[11:41:06.204522] Epoch: [1] [5310/6500] lr: 0.000042 closs: 0.8350 (0.7620) grad_norm: 
0.4485 (0.4564) time: 5.5696 data: 0.0002 max mem: 71357 +[11:42:02.028377] Epoch: [1] [5320/6500] lr: 0.000042 closs: 0.8021 (0.7620) grad_norm: 0.3894 (0.4565) time: 5.5774 data: 0.0001 max mem: 71357 +[11:42:57.762208] Epoch: [1] [5330/6500] lr: 0.000042 closs: 0.7210 (0.7620) grad_norm: 0.4442 (0.4564) time: 5.5778 data: 0.0001 max mem: 71357 +[11:43:53.467868] Epoch: [1] [5340/6500] lr: 0.000042 closs: 0.7326 (0.7621) grad_norm: 0.4205 (0.4564) time: 5.5719 data: 0.0001 max mem: 71357 +[11:44:49.249976] Epoch: [1] [5350/6500] lr: 0.000042 closs: 0.7308 (0.7620) grad_norm: 0.3894 (0.4562) time: 5.5743 data: 0.0001 max mem: 71357 +[11:45:44.957778] Epoch: [1] [5360/6500] lr: 0.000042 closs: 0.6752 (0.7620) grad_norm: 0.3891 (0.4562) time: 5.5744 data: 0.0001 max mem: 71357 +[11:46:40.748059] Epoch: [1] [5370/6500] lr: 0.000042 closs: 0.6930 (0.7618) grad_norm: 0.3779 (0.4562) time: 5.5748 data: 0.0001 max mem: 71357 +[11:47:36.454319] Epoch: [1] [5380/6500] lr: 0.000042 closs: 0.7712 (0.7620) grad_norm: 0.3879 (0.4562) time: 5.5747 data: 0.0001 max mem: 71357 +[11:48:32.242290] Epoch: [1] [5390/6500] lr: 0.000042 closs: 0.8651 (0.7621) grad_norm: 0.3879 (0.4562) time: 5.5746 data: 0.0001 max mem: 71357 +[11:49:27.957643] Epoch: [1] [5400/6500] lr: 0.000042 closs: 0.7200 (0.7620) grad_norm: 0.4342 (0.4562) time: 5.5751 data: 0.0001 max mem: 71357 +[11:50:23.766466] Epoch: [1] [5410/6500] lr: 0.000042 closs: 0.7646 (0.7621) grad_norm: 0.4439 (0.4564) time: 5.5761 data: 0.0001 max mem: 71357 +[11:51:19.479821] Epoch: [1] [5420/6500] lr: 0.000042 closs: 0.7639 (0.7620) grad_norm: 0.4558 (0.4567) time: 5.5760 data: 0.0001 max mem: 71357 +[11:52:15.244104] Epoch: [1] [5430/6500] lr: 0.000042 closs: 0.7082 (0.7620) grad_norm: 0.4558 (0.4565) time: 5.5738 data: 0.0001 max mem: 71357 +[11:53:10.897641] Epoch: [1] [5440/6500] lr: 0.000042 closs: 0.7920 (0.7621) grad_norm: 0.4017 (0.4564) time: 5.5708 data: 0.0001 max mem: 71357 +[11:54:06.694640] Epoch: [1] [5450/6500] lr: 0.000042 closs: 0.7965 (0.7622) grad_norm: 0.4201 (0.4564) time: 5.5724 data: 0.0001 max mem: 71357 +[11:55:02.484616] Epoch: [1] [5460/6500] lr: 0.000042 closs: 0.7598 (0.7622) grad_norm: 0.3805 (0.4563) time: 5.5793 data: 0.0001 max mem: 71357 +[11:55:58.257271] Epoch: [1] [5470/6500] lr: 0.000042 closs: 0.7568 (0.7622) grad_norm: 0.3925 (0.4562) time: 5.5781 data: 0.0001 max mem: 71357 +[11:56:53.980040] Epoch: [1] [5480/6500] lr: 0.000042 closs: 0.7271 (0.7621) grad_norm: 0.3925 (0.4563) time: 5.5747 data: 0.0001 max mem: 71357 +[11:57:49.775101] Epoch: [1] [5490/6500] lr: 0.000042 closs: 0.7271 (0.7620) grad_norm: 0.3860 (0.4561) time: 5.5758 data: 0.0001 max mem: 71357 +[11:58:45.515824] Epoch: [1] [5500/6500] lr: 0.000042 closs: 0.7487 (0.7620) grad_norm: 0.3860 (0.4560) time: 5.5767 data: 0.0001 max mem: 71357 +[11:59:41.216243] Epoch: [1] [5510/6500] lr: 0.000042 closs: 0.7174 (0.7619) grad_norm: 0.4166 (0.4561) time: 5.5720 data: 0.0001 max mem: 71357 +[12:00:36.966815] Epoch: [1] [5520/6500] lr: 0.000042 closs: 0.7540 (0.7621) grad_norm: 0.3887 (0.4560) time: 5.5725 data: 0.0001 max mem: 71357 +[12:01:32.831173] Epoch: [1] [5530/6500] lr: 0.000042 closs: 0.7546 (0.7621) grad_norm: 0.4038 (0.4560) time: 5.5807 data: 0.0001 max mem: 71357 +[12:02:28.647159] Epoch: [1] [5540/6500] lr: 0.000042 closs: 0.7516 (0.7621) grad_norm: 0.4075 (0.4560) time: 5.5839 data: 0.0001 max mem: 71357 +[12:03:24.364310] Epoch: [1] [5550/6500] lr: 0.000042 closs: 0.7670 (0.7621) grad_norm: 0.3887 (0.4558) time: 5.5766 data: 0.0001 max mem: 
71357 +[12:04:20.155378] Epoch: [1] [5560/6500] lr: 0.000042 closs: 0.7699 (0.7622) grad_norm: 0.3411 (0.4556) time: 5.5753 data: 0.0001 max mem: 71357 +[12:05:15.851908] Epoch: [1] [5570/6500] lr: 0.000042 closs: 0.7497 (0.7622) grad_norm: 0.3411 (0.4555) time: 5.5743 data: 0.0001 max mem: 71357 +[12:06:11.602612] Epoch: [1] [5580/6500] lr: 0.000042 closs: 0.7377 (0.7622) grad_norm: 0.3426 (0.4556) time: 5.5723 data: 0.0001 max mem: 71357 +[12:07:07.492817] Epoch: [1] [5590/6500] lr: 0.000041 closs: 0.7341 (0.7622) grad_norm: 0.3762 (0.4555) time: 5.5820 data: 0.0001 max mem: 71357 +[12:08:03.293439] Epoch: [1] [5600/6500] lr: 0.000041 closs: 0.7341 (0.7623) grad_norm: 0.4020 (0.4553) time: 5.5845 data: 0.0001 max mem: 71357 +[12:08:58.953646] Epoch: [1] [5610/6500] lr: 0.000041 closs: 0.7692 (0.7621) grad_norm: 0.4286 (0.4556) time: 5.5730 data: 0.0001 max mem: 71357 +[12:09:54.762282] Epoch: [1] [5620/6500] lr: 0.000041 closs: 0.7133 (0.7621) grad_norm: 0.3748 (0.4555) time: 5.5733 data: 0.0001 max mem: 71357 +[12:10:50.637061] Epoch: [1] [5630/6500] lr: 0.000041 closs: 0.7529 (0.7622) grad_norm: 0.3843 (0.4555) time: 5.5841 data: 0.0001 max mem: 71357 +[12:11:46.338076] Epoch: [1] [5640/6500] lr: 0.000041 closs: 0.8054 (0.7622) grad_norm: 0.3843 (0.4553) time: 5.5787 data: 0.0001 max mem: 71357 +[12:12:42.052979] Epoch: [1] [5650/6500] lr: 0.000041 closs: 0.8054 (0.7623) grad_norm: 0.3795 (0.4552) time: 5.5707 data: 0.0001 max mem: 71357 +[12:13:37.757138] Epoch: [1] [5660/6500] lr: 0.000041 closs: 0.7757 (0.7623) grad_norm: 0.3795 (0.4552) time: 5.5709 data: 0.0001 max mem: 71357 +[12:14:33.519114] Epoch: [1] [5670/6500] lr: 0.000041 closs: 0.7771 (0.7623) grad_norm: 0.3751 (0.4552) time: 5.5732 data: 0.0001 max mem: 71357 +[12:15:29.262342] Epoch: [1] [5680/6500] lr: 0.000041 closs: 0.8003 (0.7625) grad_norm: 0.3742 (0.4550) time: 5.5752 data: 0.0001 max mem: 71357 +[12:16:24.960305] Epoch: [1] [5690/6500] lr: 0.000041 closs: 0.7774 (0.7625) grad_norm: 0.3751 (0.4549) time: 5.5720 data: 0.0001 max mem: 71357 +[12:17:20.818451] Epoch: [1] [5700/6500] lr: 0.000041 closs: 0.7509 (0.7625) grad_norm: 0.3829 (0.4548) time: 5.5777 data: 0.0001 max mem: 71357 +[12:18:16.589827] Epoch: [1] [5710/6500] lr: 0.000041 closs: 0.7242 (0.7625) grad_norm: 0.3742 (0.4547) time: 5.5814 data: 0.0001 max mem: 71357 +[12:19:12.364366] Epoch: [1] [5720/6500] lr: 0.000041 closs: 0.7242 (0.7624) grad_norm: 0.4329 (0.4547) time: 5.5772 data: 0.0002 max mem: 71357 +[12:20:08.204706] Epoch: [1] [5730/6500] lr: 0.000041 closs: 0.7491 (0.7624) grad_norm: 0.3653 (0.4545) time: 5.5806 data: 0.0002 max mem: 71357 +[12:21:03.861751] Epoch: [1] [5740/6500] lr: 0.000041 closs: 0.7838 (0.7624) grad_norm: 0.3653 (0.4545) time: 5.5747 data: 0.0001 max mem: 71357 +[12:21:59.718259] Epoch: [1] [5750/6500] lr: 0.000041 closs: 0.7416 (0.7624) grad_norm: 0.3975 (0.4546) time: 5.5756 data: 0.0001 max mem: 71357 +[12:22:55.524356] Epoch: [1] [5760/6500] lr: 0.000041 closs: 0.7056 (0.7623) grad_norm: 0.3606 (0.4545) time: 5.5830 data: 0.0001 max mem: 71357 +[12:23:51.380407] Epoch: [1] [5770/6500] lr: 0.000041 closs: 0.7723 (0.7624) grad_norm: 0.3858 (0.4544) time: 5.5830 data: 0.0001 max mem: 71357 +[12:24:47.150261] Epoch: [1] [5780/6500] lr: 0.000041 closs: 0.7954 (0.7623) grad_norm: 0.4207 (0.4544) time: 5.5812 data: 0.0001 max mem: 71357 +[12:25:42.827316] Epoch: [1] [5790/6500] lr: 0.000041 closs: 0.6956 (0.7622) grad_norm: 0.3718 (0.4544) time: 5.5722 data: 0.0002 max mem: 71357 +[12:26:38.608622] Epoch: [1] [5800/6500] 
lr: 0.000041 closs: 0.6790 (0.7621) grad_norm: 0.3812 (0.4543) time: 5.5728 data: 0.0001 max mem: 71357 +[12:27:34.355703] Epoch: [1] [5810/6500] lr: 0.000041 closs: 0.6473 (0.7620) grad_norm: 0.4115 (0.4544) time: 5.5763 data: 0.0001 max mem: 71357 +[12:28:30.033082] Epoch: [1] [5820/6500] lr: 0.000041 closs: 0.6839 (0.7620) grad_norm: 0.4103 (0.4543) time: 5.5711 data: 0.0001 max mem: 71357 +[12:29:25.690915] Epoch: [1] [5830/6500] lr: 0.000041 closs: 0.7630 (0.7620) grad_norm: 0.4103 (0.4544) time: 5.5667 data: 0.0001 max mem: 71357 +[12:30:21.426603] Epoch: [1] [5840/6500] lr: 0.000041 closs: 0.7711 (0.7621) grad_norm: 0.4103 (0.4542) time: 5.5696 data: 0.0001 max mem: 71357 +[12:31:17.292434] Epoch: [1] [5850/6500] lr: 0.000041 closs: 0.8304 (0.7622) grad_norm: 0.3960 (0.4541) time: 5.5800 data: 0.0001 max mem: 71357 +[12:32:12.900412] Epoch: [1] [5860/6500] lr: 0.000041 closs: 0.7820 (0.7621) grad_norm: 0.3960 (0.4541) time: 5.5736 data: 0.0001 max mem: 71357 +[12:33:08.766672] Epoch: [1] [5870/6500] lr: 0.000041 closs: 0.6789 (0.7621) grad_norm: 0.3778 (0.4540) time: 5.5736 data: 0.0001 max mem: 71357 +[12:34:04.393916] Epoch: [1] [5880/6500] lr: 0.000041 closs: 0.7246 (0.7622) grad_norm: 0.3856 (0.4547) time: 5.5746 data: 0.0001 max mem: 71357 +[12:35:00.125761] Epoch: [1] [5890/6500] lr: 0.000041 closs: 0.7455 (0.7621) grad_norm: 0.3856 (0.4546) time: 5.5679 data: 0.0001 max mem: 71357 +[12:35:55.981470] Epoch: [1] [5900/6500] lr: 0.000041 closs: 0.7150 (0.7620) grad_norm: 0.3649 (0.4545) time: 5.5793 data: 0.0001 max mem: 71357 +[12:36:51.710443] Epoch: [1] [5910/6500] lr: 0.000041 closs: 0.7150 (0.7620) grad_norm: 0.3649 (0.4544) time: 5.5792 data: 0.0001 max mem: 71357 +[12:37:47.571872] Epoch: [1] [5920/6500] lr: 0.000041 closs: 0.6987 (0.7619) grad_norm: 0.3471 (0.4542) time: 5.5794 data: 0.0001 max mem: 71357 +[12:38:43.443859] Epoch: [1] [5930/6500] lr: 0.000040 closs: 0.6987 (0.7619) grad_norm: 0.3331 (0.4540) time: 5.5866 data: 0.0001 max mem: 71357 +[12:39:39.343282] Epoch: [1] [5940/6500] lr: 0.000040 closs: 0.7192 (0.7619) grad_norm: 0.3435 (0.4541) time: 5.5884 data: 0.0001 max mem: 71357 +[12:40:35.146723] Epoch: [1] [5950/6500] lr: 0.000040 closs: 0.7379 (0.7619) grad_norm: 0.3435 (0.4539) time: 5.5850 data: 0.0002 max mem: 71357 +[12:41:30.902370] Epoch: [1] [5960/6500] lr: 0.000040 closs: 0.7951 (0.7619) grad_norm: 0.3445 (0.4538) time: 5.5779 data: 0.0002 max mem: 71357 +[12:42:26.717992] Epoch: [1] [5970/6500] lr: 0.000040 closs: 0.7941 (0.7620) grad_norm: 0.4037 (0.4539) time: 5.5785 data: 0.0002 max mem: 71357 +[12:43:22.562358] Epoch: [1] [5980/6500] lr: 0.000040 closs: 0.7629 (0.7621) grad_norm: 0.3538 (0.4538) time: 5.5829 data: 0.0002 max mem: 71357 +[12:44:18.336240] Epoch: [1] [5990/6500] lr: 0.000040 closs: 0.7501 (0.7620) grad_norm: 0.3538 (0.4536) time: 5.5808 data: 0.0001 max mem: 71357 +[12:45:14.046587] Epoch: [1] [6000/6500] lr: 0.000040 closs: 0.6951 (0.7620) grad_norm: 0.3621 (0.4535) time: 5.5741 data: 0.0002 max mem: 71357 +[12:46:09.836934] Epoch: [1] [6010/6500] lr: 0.000040 closs: 0.6929 (0.7619) grad_norm: 0.3621 (0.4534) time: 5.5749 data: 0.0002 max mem: 71357 +[12:47:05.668098] Epoch: [1] [6020/6500] lr: 0.000040 closs: 0.7398 (0.7619) grad_norm: 0.3923 (0.4538) time: 5.5810 data: 0.0002 max mem: 71357 +[12:48:01.635878] Epoch: [1] [6030/6500] lr: 0.000040 closs: 0.7508 (0.7619) grad_norm: 0.4124 (0.4537) time: 5.5898 data: 0.0002 max mem: 71357 +[12:48:57.359248] Epoch: [1] [6040/6500] lr: 0.000040 closs: 0.7133 (0.7618) grad_norm: 
0.4416 (0.4538) time: 5.5845 data: 0.0002 max mem: 71357 +[12:49:53.165180] Epoch: [1] [6050/6500] lr: 0.000040 closs: 0.7493 (0.7620) grad_norm: 0.4416 (0.4538) time: 5.5764 data: 0.0001 max mem: 71357 +[12:50:49.040989] Epoch: [1] [6060/6500] lr: 0.000040 closs: 0.8315 (0.7620) grad_norm: 0.4096 (0.4538) time: 5.5840 data: 0.0002 max mem: 71357 +[12:51:44.970720] Epoch: [1] [6070/6500] lr: 0.000040 closs: 0.7805 (0.7621) grad_norm: 0.4065 (0.4536) time: 5.5902 data: 0.0002 max mem: 71357 +[12:52:40.669392] Epoch: [1] [6080/6500] lr: 0.000040 closs: 0.8480 (0.7623) grad_norm: 0.4065 (0.4539) time: 5.5813 data: 0.0001 max mem: 71357 +[12:53:36.427897] Epoch: [1] [6090/6500] lr: 0.000040 closs: 0.8226 (0.7624) grad_norm: 0.4065 (0.4540) time: 5.5728 data: 0.0001 max mem: 71357 +[12:54:32.195913] Epoch: [1] [6100/6500] lr: 0.000040 closs: 0.7264 (0.7623) grad_norm: 0.3978 (0.4539) time: 5.5763 data: 0.0001 max mem: 71357 +[12:55:27.987231] Epoch: [1] [6110/6500] lr: 0.000040 closs: 0.8074 (0.7625) grad_norm: 0.4201 (0.4538) time: 5.5779 data: 0.0001 max mem: 71357 +[12:56:23.829834] Epoch: [1] [6120/6500] lr: 0.000040 closs: 0.8245 (0.7626) grad_norm: 0.3929 (0.4537) time: 5.5816 data: 0.0001 max mem: 71357 +[12:57:19.521242] Epoch: [1] [6130/6500] lr: 0.000040 closs: 0.8077 (0.7628) grad_norm: 0.3743 (0.4536) time: 5.5766 data: 0.0001 max mem: 71357 +[12:58:15.203234] Epoch: [1] [6140/6500] lr: 0.000040 closs: 0.7957 (0.7627) grad_norm: 0.3747 (0.4556) time: 5.5686 data: 0.0001 max mem: 71357 +[12:59:10.946149] Epoch: [1] [6150/6500] lr: 0.000040 closs: 0.7957 (0.7628) grad_norm: 0.3743 (0.4555) time: 5.5712 data: 0.0001 max mem: 71357 +[13:00:06.841654] Epoch: [1] [6160/6500] lr: 0.000040 closs: 0.8200 (0.7629) grad_norm: 0.3747 (0.4554) time: 5.5818 data: 0.0001 max mem: 71357 +[13:01:02.559307] Epoch: [1] [6170/6500] lr: 0.000040 closs: 0.7666 (0.7628) grad_norm: 0.3837 (0.4553) time: 5.5806 data: 0.0001 max mem: 71357 +[13:01:58.277625] Epoch: [1] [6180/6500] lr: 0.000040 closs: 0.7439 (0.7629) grad_norm: 0.3926 (0.4553) time: 5.5717 data: 0.0001 max mem: 71357 +[13:02:54.029871] Epoch: [1] [6190/6500] lr: 0.000040 closs: 0.8000 (0.7629) grad_norm: 0.3897 (0.4552) time: 5.5734 data: 0.0001 max mem: 71357 +[13:03:49.873703] Epoch: [1] [6200/6500] lr: 0.000040 closs: 0.7667 (0.7628) grad_norm: 0.3926 (0.4552) time: 5.5797 data: 0.0001 max mem: 71357 +[13:04:45.563891] Epoch: [1] [6210/6500] lr: 0.000040 closs: 0.7751 (0.7629) grad_norm: 0.4210 (0.4552) time: 5.5766 data: 0.0001 max mem: 71357 +[13:05:41.287468] Epoch: [1] [6220/6500] lr: 0.000040 closs: 0.7010 (0.7627) grad_norm: 0.3957 (0.4551) time: 5.5706 data: 0.0002 max mem: 71357 +[13:06:37.089385] Epoch: [1] [6230/6500] lr: 0.000040 closs: 0.7090 (0.7628) grad_norm: 0.4210 (0.4550) time: 5.5762 data: 0.0002 max mem: 71357 +[13:07:32.868772] Epoch: [1] [6240/6500] lr: 0.000040 closs: 0.7651 (0.7628) grad_norm: 0.3920 (0.4550) time: 5.5790 data: 0.0001 max mem: 71357 +[13:08:28.652568] Epoch: [1] [6250/6500] lr: 0.000040 closs: 0.7620 (0.7629) grad_norm: 0.3703 (0.4549) time: 5.5781 data: 0.0001 max mem: 71357 +[13:09:24.362382] Epoch: [1] [6260/6500] lr: 0.000039 closs: 0.7858 (0.7629) grad_norm: 0.3703 (0.4548) time: 5.5746 data: 0.0001 max mem: 71357 +[13:10:20.032595] Epoch: [1] [6270/6500] lr: 0.000039 closs: 0.7305 (0.7628) grad_norm: 0.3763 (0.4549) time: 5.5689 data: 0.0001 max mem: 71357 +[13:11:15.731095] Epoch: [1] [6280/6500] lr: 0.000039 closs: 0.7532 (0.7629) grad_norm: 0.3879 (0.4550) time: 5.5683 data: 0.0001 max mem: 
71357
+[13:12:11.442048] Epoch: [1] [6290/6500] lr: 0.000039 closs: 0.7507 (0.7628) grad_norm: 0.4623 (0.4552) time: 5.5703 data: 0.0001 max mem: 71357
+[13:13:07.132957] Epoch: [1] [6300/6500] lr: 0.000039 closs: 0.7333 (0.7628) grad_norm: 0.3946 (0.4550) time: 5.5700 data: 0.0001 max mem: 71357
+[13:14:02.908037] Epoch: [1] [6310/6500] lr: 0.000039 closs: 0.7169 (0.7626) grad_norm: 0.4401 (0.4552) time: 5.5732 data: 0.0001 max mem: 71357
+[13:14:58.686738] Epoch: [1] [6320/6500] lr: 0.000039 closs: 0.7381 (0.7627) grad_norm: 0.3946 (0.4552) time: 5.5776 data: 0.0001 max mem: 71357
+[13:15:54.502734] Epoch: [1] [6330/6500] lr: 0.000039 closs: 0.7568 (0.7627) grad_norm: 0.3855 (0.4550) time: 5.5796 data: 0.0001 max mem: 71357
+[13:16:50.331408] Epoch: [1] [6340/6500] lr: 0.000039 closs: 0.7370 (0.7626) grad_norm: 0.3901 (0.4549) time: 5.5821 data: 0.0001 max mem: 71357
+[13:17:46.076366] Epoch: [1] [6350/6500] lr: 0.000039 closs: 0.7401 (0.7627) grad_norm: 0.3855 (0.4551) time: 5.5786 data: 0.0001 max mem: 71357
+[13:18:41.870339] Epoch: [1] [6360/6500] lr: 0.000039 closs: 0.8105 (0.7628) grad_norm: 0.3901 (0.4551) time: 5.5768 data: 0.0002 max mem: 71357
+[13:19:37.808359] Epoch: [1] [6370/6500] lr: 0.000039 closs: 0.7998 (0.7628) grad_norm: 0.4095 (0.4550) time: 5.5865 data: 0.0002 max mem: 71357
+[13:20:33.725798] Epoch: [1] [6380/6500] lr: 0.000039 closs: 0.7342 (0.7626) grad_norm: 0.3924 (0.4549) time: 5.5926 data: 0.0002 max mem: 71357
+[13:21:29.484753] Epoch: [1] [6390/6500] lr: 0.000039 closs: 0.6765 (0.7626) grad_norm: 0.3924 (0.4548) time: 5.5836 data: 0.0002 max mem: 71357
+[13:22:25.257086] Epoch: [1] [6400/6500] lr: 0.000039 closs: 0.7452 (0.7627) grad_norm: 0.3553 (0.4548) time: 5.5764 data: 0.0002 max mem: 71357
+[13:23:21.034494] Epoch: [1] [6410/6500] lr: 0.000039 closs: 0.7509 (0.7627) grad_norm: 0.3656 (0.4547) time: 5.5774 data: 0.0001 max mem: 71357
+[13:24:17.080944] Epoch: [1] [6420/6500] lr: 0.000039 closs: 0.7457 (0.7627) grad_norm: 0.3553 (0.4545) time: 5.5911 data: 0.0001 max mem: 71357
+[13:25:12.899426] Epoch: [1] [6430/6500] lr: 0.000039 closs: 0.7345 (0.7628) grad_norm: 0.3532 (0.4545) time: 5.5931 data: 0.0001 max mem: 71357
+[13:26:08.709394] Epoch: [1] [6440/6500] lr: 0.000039 closs: 0.7518 (0.7628) grad_norm: 0.3989 (0.4544) time: 5.5813 data: 0.0001 max mem: 71357
+[13:27:04.428882] Epoch: [1] [6450/6500] lr: 0.000039 closs: 0.7228 (0.7627) grad_norm: 0.3685 (0.4544) time: 5.5763 data: 0.0001 max mem: 71357
+[13:28:00.252225] Epoch: [1] [6460/6500] lr: 0.000039 closs: 0.7051 (0.7626) grad_norm: 0.3989 (0.4543) time: 5.5770 data: 0.0001 max mem: 71357
+[13:28:56.043953] Epoch: [1] [6470/6500] lr: 0.000039 closs: 0.7380 (0.7626) grad_norm: 0.4243 (0.4545) time: 5.5807 data: 0.0001 max mem: 71357
+[13:29:51.797569] Epoch: [1] [6480/6500] lr: 0.000039 closs: 0.7187 (0.7625) grad_norm: 0.3741 (0.4543) time: 5.5772 data: 0.0001 max mem: 71357
+[13:30:47.516773] Epoch: [1] [6490/6500] lr: 0.000039 closs: 0.6832 (0.7624) grad_norm: 0.4119 (0.4546) time: 5.5735 data: 0.0001 max mem: 71357
+[13:31:38.239874] Epoch: [1] Total time: 10:04:02
+[13:31:38.277561] Averaged stats: lr: 0.000039 closs: 0.6790 (0.7624) grad_norm: 0.3968 (0.4545)
+[13:31:38.437483] model saved
+[13:31:39.344162] optimizer saved
+[13:31:39.344627] other rank-common saved
+[13:31:39.347843] rank-specific saved
+[13:31:39.356891] log_dir: ./output_dir
+[13:31:47.519594] Epoch: [2] [0/6500] lr: 0.000039 closs: 0.8530 (0.8530) time: 8.1620 data: 2.5124 max mem: 71357
+[13:32:43.288252] Epoch:
[2] [10/6500] lr: 0.000039 closs: 0.7463 (0.7264) grad_norm: 0.3308 (0.3256) time: 5.8118 data: 0.2286 max mem: 71357 +[13:33:39.077409] Epoch: [2] [20/6500] lr: 0.000039 closs: 0.7463 (0.7621) grad_norm: 0.3325 (0.3434) time: 5.5778 data: 0.0001 max mem: 71357 +[13:34:34.867998] Epoch: [2] [30/6500] lr: 0.000039 closs: 0.7167 (0.7603) grad_norm: 0.3547 (0.4323) time: 5.5789 data: 0.0001 max mem: 71357 +[13:35:30.670234] Epoch: [2] [40/6500] lr: 0.000039 closs: 0.7167 (0.7590) grad_norm: 0.3576 (0.4284) time: 5.5796 data: 0.0001 max mem: 71357 +[13:36:26.526233] Epoch: [2] [50/6500] lr: 0.000039 closs: 0.7527 (0.7555) grad_norm: 0.3960 (0.4176) time: 5.5828 data: 0.0001 max mem: 71357 +[13:37:22.300149] Epoch: [2] [60/6500] lr: 0.000039 closs: 0.7448 (0.7551) grad_norm: 0.4199 (0.4239) time: 5.5814 data: 0.0001 max mem: 71357 +[13:38:18.028707] Epoch: [2] [70/6500] lr: 0.000039 closs: 0.7028 (0.7479) grad_norm: 0.3856 (0.4157) time: 5.5750 data: 0.0001 max mem: 71357 +[13:39:13.821819] Epoch: [2] [80/6500] lr: 0.000038 closs: 0.7377 (0.7468) grad_norm: 0.3691 (0.4100) time: 5.5760 data: 0.0001 max mem: 71357 +[13:40:09.683506] Epoch: [2] [90/6500] lr: 0.000038 closs: 0.7571 (0.7481) grad_norm: 0.3691 (0.4179) time: 5.5826 data: 0.0001 max mem: 71357 +[13:41:05.487770] Epoch: [2] [100/6500] lr: 0.000038 closs: 0.7571 (0.7515) grad_norm: 0.3682 (0.4140) time: 5.5832 data: 0.0001 max mem: 71357 +[13:42:01.187296] Epoch: [2] [110/6500] lr: 0.000038 closs: 0.7371 (0.7502) grad_norm: 0.3937 (0.4187) time: 5.5751 data: 0.0001 max mem: 71357 +[13:42:56.922471] Epoch: [2] [120/6500] lr: 0.000038 closs: 0.7371 (0.7516) grad_norm: 0.3937 (0.4227) time: 5.5716 data: 0.0002 max mem: 71357 +[13:43:52.820057] Epoch: [2] [130/6500] lr: 0.000038 closs: 0.7830 (0.7562) grad_norm: 0.4010 (0.4249) time: 5.5815 data: 0.0002 max mem: 71357 +[13:44:48.624177] Epoch: [2] [140/6500] lr: 0.000038 closs: 0.7541 (0.7543) grad_norm: 0.4010 (0.4211) time: 5.5850 data: 0.0001 max mem: 71357 +[13:45:44.395166] Epoch: [2] [150/6500] lr: 0.000038 closs: 0.6841 (0.7539) grad_norm: 0.3672 (0.4165) time: 5.5787 data: 0.0001 max mem: 71357 +[13:46:40.119569] Epoch: [2] [160/6500] lr: 0.000038 closs: 0.7270 (0.7544) grad_norm: 0.4010 (0.4255) time: 5.5747 data: 0.0001 max mem: 71357 +[13:47:35.890460] Epoch: [2] [170/6500] lr: 0.000038 closs: 0.7027 (0.7530) grad_norm: 0.3672 (0.4220) time: 5.5747 data: 0.0001 max mem: 71357 +[13:48:31.736860] Epoch: [2] [180/6500] lr: 0.000038 closs: 0.7138 (0.7518) grad_norm: 0.3619 (0.4208) time: 5.5808 data: 0.0001 max mem: 71357 +[13:49:27.392286] Epoch: [2] [190/6500] lr: 0.000038 closs: 0.7209 (0.7522) grad_norm: 0.3753 (0.4228) time: 5.5750 data: 0.0001 max mem: 71357 +[13:50:23.153601] Epoch: [2] [200/6500] lr: 0.000038 closs: 0.7229 (0.7506) grad_norm: 0.3753 (0.4260) time: 5.5707 data: 0.0001 max mem: 71357 +[13:51:18.968289] Epoch: [2] [210/6500] lr: 0.000038 closs: 0.7229 (0.7512) grad_norm: 0.4066 (0.4268) time: 5.5787 data: 0.0001 max mem: 71357 +[13:52:14.932241] Epoch: [2] [220/6500] lr: 0.000038 closs: 0.6851 (0.7505) grad_norm: 0.4066 (0.4231) time: 5.5888 data: 0.0001 max mem: 71357 +[13:53:10.610429] Epoch: [2] [230/6500] lr: 0.000038 closs: 0.7820 (0.7548) grad_norm: 0.3845 (0.4203) time: 5.5820 data: 0.0001 max mem: 71357 +[13:54:06.355460] Epoch: [2] [240/6500] lr: 0.000038 closs: 0.7820 (0.7562) grad_norm: 0.3753 (0.4202) time: 5.5711 data: 0.0001 max mem: 71357 +[13:55:02.130045] Epoch: [2] [250/6500] lr: 0.000038 closs: 0.7681 (0.7567) grad_norm: 0.3540 (0.4181) time: 
5.5759 data: 0.0001 max mem: 71357 +[13:55:57.941222] Epoch: [2] [260/6500] lr: 0.000038 closs: 0.7585 (0.7572) grad_norm: 0.3630 (0.4196) time: 5.5792 data: 0.0001 max mem: 71357 +[13:56:53.824737] Epoch: [2] [270/6500] lr: 0.000038 closs: 0.7585 (0.7570) grad_norm: 0.3699 (0.4187) time: 5.5847 data: 0.0001 max mem: 71357 +[13:57:49.697171] Epoch: [2] [280/6500] lr: 0.000038 closs: 0.7405 (0.7566) grad_norm: 0.3712 (0.4186) time: 5.5877 data: 0.0001 max mem: 71357 +[13:58:45.437495] Epoch: [2] [290/6500] lr: 0.000038 closs: 0.7335 (0.7546) grad_norm: 0.3789 (0.4163) time: 5.5806 data: 0.0001 max mem: 71357 +[13:59:41.195413] Epoch: [2] [300/6500] lr: 0.000038 closs: 0.6860 (0.7536) grad_norm: 0.3789 (0.4153) time: 5.5748 data: 0.0001 max mem: 71357 +[14:00:37.032270] Epoch: [2] [310/6500] lr: 0.000038 closs: 0.7333 (0.7531) grad_norm: 0.3733 (0.4166) time: 5.5796 data: 0.0001 max mem: 71357 +[14:01:32.744167] Epoch: [2] [320/6500] lr: 0.000038 closs: 0.7333 (0.7524) grad_norm: 0.3825 (0.4177) time: 5.5773 data: 0.0001 max mem: 71357 +[14:02:28.443547] Epoch: [2] [330/6500] lr: 0.000038 closs: 0.7277 (0.7520) grad_norm: 0.3988 (0.4181) time: 5.5705 data: 0.0001 max mem: 71357 +[14:03:24.231708] Epoch: [2] [340/6500] lr: 0.000038 closs: 0.7531 (0.7512) grad_norm: 0.3988 (0.4170) time: 5.5743 data: 0.0001 max mem: 71357 +[14:04:20.006444] Epoch: [2] [350/6500] lr: 0.000038 closs: 0.7179 (0.7498) grad_norm: 0.4000 (0.4180) time: 5.5780 data: 0.0001 max mem: 71357 +[14:05:15.765801] Epoch: [2] [360/6500] lr: 0.000038 closs: 0.6998 (0.7484) grad_norm: 0.3938 (0.4171) time: 5.5766 data: 0.0001 max mem: 71357 +[14:06:11.530058] Epoch: [2] [370/6500] lr: 0.000038 closs: 0.6998 (0.7490) grad_norm: 0.3695 (0.4163) time: 5.5761 data: 0.0001 max mem: 71357 +[14:07:07.345190] Epoch: [2] [380/6500] lr: 0.000038 closs: 0.6916 (0.7479) grad_norm: 0.3695 (0.4148) time: 5.5789 data: 0.0001 max mem: 71357 +[14:08:03.215899] Epoch: [2] [390/6500] lr: 0.000038 closs: 0.7538 (0.7490) grad_norm: 0.3527 (0.4131) time: 5.5842 data: 0.0001 max mem: 71357 +[14:08:59.012783] Epoch: [2] [400/6500] lr: 0.000037 closs: 0.7534 (0.7484) grad_norm: 0.3527 (0.4176) time: 5.5833 data: 0.0001 max mem: 71357 +[14:09:54.780467] Epoch: [2] [410/6500] lr: 0.000037 closs: 0.6848 (0.7468) grad_norm: 0.3885 (0.4227) time: 5.5781 data: 0.0001 max mem: 71357 +[14:10:50.583135] Epoch: [2] [420/6500] lr: 0.000037 closs: 0.7102 (0.7481) grad_norm: 0.4692 (0.4295) time: 5.5784 data: 0.0001 max mem: 71357 +[14:11:46.513039] Epoch: [2] [430/6500] lr: 0.000037 closs: 0.7994 (0.7501) grad_norm: 0.4802 (0.4279) time: 5.5865 data: 0.0002 max mem: 71357 +[14:12:42.399415] Epoch: [2] [440/6500] lr: 0.000037 closs: 0.7675 (0.7508) grad_norm: 0.3810 (0.4284) time: 5.5907 data: 0.0002 max mem: 71357 +[14:13:38.199675] Epoch: [2] [450/6500] lr: 0.000037 closs: 0.7309 (0.7514) grad_norm: 0.3810 (0.4298) time: 5.5842 data: 0.0002 max mem: 71357 +[14:14:34.022210] Epoch: [2] [460/6500] lr: 0.000037 closs: 0.7586 (0.7526) grad_norm: 0.3805 (0.4293) time: 5.5810 data: 0.0001 max mem: 71357 +[14:15:29.816050] Epoch: [2] [470/6500] lr: 0.000037 closs: 0.7586 (0.7531) grad_norm: 0.3805 (0.4281) time: 5.5807 data: 0.0001 max mem: 71357 +[14:16:25.599470] Epoch: [2] [480/6500] lr: 0.000037 closs: 0.7866 (0.7538) grad_norm: 0.3662 (0.4269) time: 5.5788 data: 0.0001 max mem: 71357 +[14:17:21.518261] Epoch: [2] [490/6500] lr: 0.000037 closs: 0.8141 (0.7544) grad_norm: 0.3628 (0.4270) time: 5.5850 data: 0.0001 max mem: 71357 +[14:18:17.320636] Epoch: [2] 
[500/6500] lr: 0.000037 closs: 0.8202 (0.7549) grad_norm: 0.3662 (0.4269) time: 5.5860 data: 0.0001 max mem: 71357 +[14:19:12.994870] Epoch: [2] [510/6500] lr: 0.000037 closs: 0.7792 (0.7557) grad_norm: 0.4105 (0.4281) time: 5.5737 data: 0.0001 max mem: 71357 +[14:20:08.814286] Epoch: [2] [520/6500] lr: 0.000037 closs: 0.7314 (0.7551) grad_norm: 0.4120 (0.4289) time: 5.5746 data: 0.0001 max mem: 71357 +[14:21:04.579633] Epoch: [2] [530/6500] lr: 0.000037 closs: 0.7105 (0.7545) grad_norm: 0.4370 (0.4294) time: 5.5791 data: 0.0001 max mem: 71357 +[14:22:00.313354] Epoch: [2] [540/6500] lr: 0.000037 closs: 0.6849 (0.7527) grad_norm: 0.4191 (0.4286) time: 5.5749 data: 0.0001 max mem: 71357 +[14:22:56.019516] Epoch: [2] [550/6500] lr: 0.000037 closs: 0.7390 (0.7546) grad_norm: 0.4063 (0.4291) time: 5.5719 data: 0.0001 max mem: 71357 +[14:23:51.755868] Epoch: [2] [560/6500] lr: 0.000037 closs: 0.8036 (0.7556) grad_norm: 0.4064 (0.4301) time: 5.5721 data: 0.0001 max mem: 71357 +[14:24:47.612236] Epoch: [2] [570/6500] lr: 0.000037 closs: 0.7781 (0.7556) grad_norm: 0.4063 (0.4297) time: 5.5795 data: 0.0001 max mem: 71357 +[14:25:43.321070] Epoch: [2] [580/6500] lr: 0.000037 closs: 0.7385 (0.7558) grad_norm: 0.4227 (0.4305) time: 5.5781 data: 0.0001 max mem: 71357 +[14:26:39.029217] Epoch: [2] [590/6500] lr: 0.000037 closs: 0.8222 (0.7575) grad_norm: 0.4322 (0.4309) time: 5.5707 data: 0.0001 max mem: 71357 +[14:27:34.840416] Epoch: [2] [600/6500] lr: 0.000037 closs: 0.8283 (0.7582) grad_norm: 0.4322 (0.4303) time: 5.5759 data: 0.0001 max mem: 71357 +[14:28:30.571734] Epoch: [2] [610/6500] lr: 0.000037 closs: 0.6999 (0.7561) grad_norm: 0.4594 (0.4356) time: 5.5771 data: 0.0001 max mem: 71357 +[14:29:26.385644] Epoch: [2] [620/6500] lr: 0.000037 closs: 0.6217 (0.7547) grad_norm: 0.4041 (0.4344) time: 5.5772 data: 0.0001 max mem: 71357 +[14:30:22.152023] Epoch: [2] [630/6500] lr: 0.000037 closs: 0.6924 (0.7546) grad_norm: 0.3903 (0.4342) time: 5.5789 data: 0.0001 max mem: 71357 +[14:31:17.977329] Epoch: [2] [640/6500] lr: 0.000037 closs: 0.7096 (0.7539) grad_norm: 0.3903 (0.4346) time: 5.5795 data: 0.0001 max mem: 71357 +[14:32:13.809917] Epoch: [2] [650/6500] lr: 0.000037 closs: 0.7096 (0.7538) grad_norm: 0.3726 (0.4336) time: 5.5828 data: 0.0001 max mem: 71357 +[14:33:09.675270] Epoch: [2] [660/6500] lr: 0.000037 closs: 0.7337 (0.7539) grad_norm: 0.3889 (0.4341) time: 5.5848 data: 0.0001 max mem: 71357 +[14:34:05.473554] Epoch: [2] [670/6500] lr: 0.000037 closs: 0.6978 (0.7532) grad_norm: 0.3686 (0.4333) time: 5.5831 data: 0.0001 max mem: 71357 +[14:35:01.331221] Epoch: [2] [680/6500] lr: 0.000037 closs: 0.7212 (0.7526) grad_norm: 0.3643 (0.4318) time: 5.5827 data: 0.0001 max mem: 71357 +[14:35:57.063267] Epoch: [2] [690/6500] lr: 0.000037 closs: 0.7305 (0.7527) grad_norm: 0.3668 (0.4318) time: 5.5794 data: 0.0001 max mem: 71357 +[14:36:52.859279] Epoch: [2] [700/6500] lr: 0.000036 closs: 0.7603 (0.7529) grad_norm: 0.3595 (0.4308) time: 5.5763 data: 0.0001 max mem: 71357 +[14:37:48.751222] Epoch: [2] [710/6500] lr: 0.000036 closs: 0.7380 (0.7525) grad_norm: 0.3590 (0.4302) time: 5.5843 data: 0.0002 max mem: 71357 +[14:38:44.494905] Epoch: [2] [720/6500] lr: 0.000036 closs: 0.7349 (0.7529) grad_norm: 0.3672 (0.4298) time: 5.5817 data: 0.0002 max mem: 71357 +[14:39:40.195246] Epoch: [2] [730/6500] lr: 0.000036 closs: 0.7399 (0.7520) grad_norm: 0.4055 (0.4299) time: 5.5721 data: 0.0001 max mem: 71357 +[14:40:36.044045] Epoch: [2] [740/6500] lr: 0.000036 closs: 0.7879 (0.7533) grad_norm: 0.3741 (0.4294) 
time: 5.5774 data: 0.0001 max mem: 71357 +[14:41:31.881779] Epoch: [2] [750/6500] lr: 0.000036 closs: 0.8348 (0.7536) grad_norm: 0.4074 (0.4296) time: 5.5843 data: 0.0001 max mem: 71357 +[14:42:27.626745] Epoch: [2] [760/6500] lr: 0.000036 closs: 0.7557 (0.7528) grad_norm: 0.4074 (0.4299) time: 5.5790 data: 0.0001 max mem: 71357 +[14:43:23.427493] Epoch: [2] [770/6500] lr: 0.000036 closs: 0.7330 (0.7528) grad_norm: 0.4074 (0.4302) time: 5.5772 data: 0.0001 max mem: 71357 +[14:44:19.252177] Epoch: [2] [780/6500] lr: 0.000036 closs: 0.7833 (0.7537) grad_norm: 0.4074 (0.4319) time: 5.5812 data: 0.0001 max mem: 71357 +[14:45:15.139740] Epoch: [2] [790/6500] lr: 0.000036 closs: 0.7768 (0.7542) grad_norm: 0.3707 (0.4311) time: 5.5855 data: 0.0001 max mem: 71357 +[14:46:10.918823] Epoch: [2] [800/6500] lr: 0.000036 closs: 0.7081 (0.7539) grad_norm: 0.3902 (0.4308) time: 5.5833 data: 0.0001 max mem: 71357 +[14:47:06.736186] Epoch: [2] [810/6500] lr: 0.000036 closs: 0.7008 (0.7532) grad_norm: 0.3791 (0.4306) time: 5.5797 data: 0.0001 max mem: 71357 +[14:48:02.479681] Epoch: [2] [820/6500] lr: 0.000036 closs: 0.7034 (0.7527) grad_norm: 0.3791 (0.4314) time: 5.5779 data: 0.0002 max mem: 71357 +[14:48:58.271678] Epoch: [2] [830/6500] lr: 0.000036 closs: 0.6858 (0.7525) grad_norm: 0.4105 (0.4314) time: 5.5766 data: 0.0002 max mem: 71357 +[14:49:54.210878] Epoch: [2] [840/6500] lr: 0.000036 closs: 0.7823 (0.7531) grad_norm: 0.4233 (0.4315) time: 5.5865 data: 0.0001 max mem: 71357 +[14:50:49.920972] Epoch: [2] [850/6500] lr: 0.000036 closs: 0.7854 (0.7535) grad_norm: 0.4315 (0.4319) time: 5.5824 data: 0.0001 max mem: 71357 +[14:51:45.745245] Epoch: [2] [860/6500] lr: 0.000036 closs: 0.7603 (0.7538) grad_norm: 0.4249 (0.4585) time: 5.5766 data: 0.0002 max mem: 71357 +[14:52:41.602665] Epoch: [2] [870/6500] lr: 0.000036 closs: 0.7513 (0.7540) grad_norm: 0.4205 (0.4580) time: 5.5840 data: 0.0002 max mem: 71357 +[14:53:37.485162] Epoch: [2] [880/6500] lr: 0.000036 closs: 0.7341 (0.7539) grad_norm: 0.3926 (0.4577) time: 5.5869 data: 0.0002 max mem: 71357 +[14:54:33.281177] Epoch: [2] [890/6500] lr: 0.000036 closs: 0.7098 (0.7535) grad_norm: 0.3598 (0.4576) time: 5.5838 data: 0.0002 max mem: 71357 +[14:55:29.092517] Epoch: [2] [900/6500] lr: 0.000036 closs: 0.7047 (0.7527) grad_norm: 0.3727 (0.4576) time: 5.5802 data: 0.0002 max mem: 71357 +[14:56:24.907964] Epoch: [2] [910/6500] lr: 0.000036 closs: 0.7047 (0.7532) grad_norm: 0.3870 (0.4592) time: 5.5812 data: 0.0002 max mem: 71357 +[14:57:20.783829] Epoch: [2] [920/6500] lr: 0.000036 closs: 0.7544 (0.7532) grad_norm: 0.3975 (0.4588) time: 5.5844 data: 0.0002 max mem: 71357 +[14:58:16.764824] Epoch: [2] [930/6500] lr: 0.000036 closs: 0.7854 (0.7545) grad_norm: 0.3975 (0.4591) time: 5.5927 data: 0.0002 max mem: 71357 +[14:59:12.669502] Epoch: [2] [940/6500] lr: 0.000036 closs: 0.8281 (0.7545) grad_norm: 0.3922 (0.4587) time: 5.5942 data: 0.0002 max mem: 71357 +[15:00:08.554641] Epoch: [2] [950/6500] lr: 0.000036 closs: 0.7504 (0.7542) grad_norm: 0.3922 (0.4584) time: 5.5894 data: 0.0002 max mem: 71357 +[15:01:04.431999] Epoch: [2] [960/6500] lr: 0.000036 closs: 0.7345 (0.7542) grad_norm: 0.3973 (0.4582) time: 5.5880 data: 0.0002 max mem: 71357 +[15:02:00.303487] Epoch: [2] [970/6500] lr: 0.000036 closs: 0.6935 (0.7539) grad_norm: 0.4130 (0.4579) time: 5.5873 data: 0.0002 max mem: 71357 +[15:02:56.106192] Epoch: [2] [980/6500] lr: 0.000036 closs: 0.6935 (0.7538) grad_norm: 0.3919 (0.4570) time: 5.5836 data: 0.0002 max mem: 71357 +[15:03:51.815680] Epoch: [2] 
[990/6500] lr: 0.000036 closs: 0.6868 (0.7532) grad_norm: 0.3815 (0.4564) time: 5.5755 data: 0.0002 max mem: 71357 +[15:04:47.621183] Epoch: [2] [1000/6500] lr: 0.000035 closs: 0.6118 (0.7523) grad_norm: 0.3674 (0.4557) time: 5.5756 data: 0.0002 max mem: 71357 +[15:05:43.550443] Epoch: [2] [1010/6500] lr: 0.000035 closs: 0.6951 (0.7526) grad_norm: 0.3764 (0.4553) time: 5.5866 data: 0.0002 max mem: 71357 +[15:06:39.399591] Epoch: [2] [1020/6500] lr: 0.000035 closs: 0.7234 (0.7525) grad_norm: 0.3825 (0.4549) time: 5.5888 data: 0.0002 max mem: 71357 +[15:07:35.244881] Epoch: [2] [1030/6500] lr: 0.000035 closs: 0.7264 (0.7526) grad_norm: 0.4198 (0.4550) time: 5.5846 data: 0.0002 max mem: 71357 +[15:08:31.032761] Epoch: [2] [1040/6500] lr: 0.000035 closs: 0.7264 (0.7522) grad_norm: 0.4198 (0.4551) time: 5.5815 data: 0.0002 max mem: 71357 +[15:09:26.908118] Epoch: [2] [1050/6500] lr: 0.000035 closs: 0.7246 (0.7525) grad_norm: 0.4107 (0.4542) time: 5.5830 data: 0.0002 max mem: 71357 +[15:10:22.920376] Epoch: [2] [1060/6500] lr: 0.000035 closs: 0.7357 (0.7525) grad_norm: 0.4051 (0.4537) time: 5.5943 data: 0.0002 max mem: 71357 +[15:11:18.721035] Epoch: [2] [1070/6500] lr: 0.000035 closs: 0.7357 (0.7528) grad_norm: 0.3653 (0.4529) time: 5.5905 data: 0.0002 max mem: 71357 +[15:12:14.492900] Epoch: [2] [1080/6500] lr: 0.000035 closs: 0.7322 (0.7529) grad_norm: 0.3654 (0.4585) time: 5.5785 data: 0.0002 max mem: 71357 +[15:13:10.357333] Epoch: [2] [1090/6500] lr: 0.000035 closs: 0.7251 (0.7528) grad_norm: 0.4259 (0.4585) time: 5.5817 data: 0.0002 max mem: 71357 +[15:14:06.180945] Epoch: [2] [1100/6500] lr: 0.000035 closs: 0.7086 (0.7525) grad_norm: 0.4282 (0.4583) time: 5.5843 data: 0.0002 max mem: 71357 +[15:15:01.964084] Epoch: [2] [1110/6500] lr: 0.000035 closs: 0.6976 (0.7523) grad_norm: 0.4664 (0.4576) time: 5.5802 data: 0.0002 max mem: 71357 +[15:15:57.685490] Epoch: [2] [1120/6500] lr: 0.000035 closs: 0.7384 (0.7523) grad_norm: 0.3756 (0.4574) time: 5.5751 data: 0.0002 max mem: 71357 +[15:16:53.523206] Epoch: [2] [1130/6500] lr: 0.000035 closs: 0.7384 (0.7522) grad_norm: 0.3756 (0.4573) time: 5.5779 data: 0.0002 max mem: 71357 +[15:17:49.360003] Epoch: [2] [1140/6500] lr: 0.000035 closs: 0.7554 (0.7525) grad_norm: 0.3694 (0.4568) time: 5.5836 data: 0.0002 max mem: 71357 +[15:18:45.316724] Epoch: [2] [1150/6500] lr: 0.000035 closs: 0.7964 (0.7526) grad_norm: 0.3718 (0.4563) time: 5.5895 data: 0.0002 max mem: 71357 +[15:19:41.125463] Epoch: [2] [1160/6500] lr: 0.000035 closs: 0.7983 (0.7529) grad_norm: 0.3694 (0.4561) time: 5.5881 data: 0.0002 max mem: 71357 +[15:20:36.889226] Epoch: [2] [1170/6500] lr: 0.000035 closs: 0.7632 (0.7523) grad_norm: 0.3718 (0.4555) time: 5.5785 data: 0.0002 max mem: 71357 +[15:21:32.660197] Epoch: [2] [1180/6500] lr: 0.000035 closs: 0.6888 (0.7523) grad_norm: 0.4002 (0.4550) time: 5.5766 data: 0.0002 max mem: 71357 +[15:22:28.516278] Epoch: [2] [1190/6500] lr: 0.000035 closs: 0.6888 (0.7523) grad_norm: 0.4274 (0.4553) time: 5.5813 data: 0.0002 max mem: 71357 +[15:23:24.346365] Epoch: [2] [1200/6500] lr: 0.000035 closs: 0.7446 (0.7525) grad_norm: 0.4099 (0.4549) time: 5.5842 data: 0.0002 max mem: 71357 +[15:24:20.134239] Epoch: [2] [1210/6500] lr: 0.000035 closs: 0.6904 (0.7520) grad_norm: 0.4104 (0.4544) time: 5.5807 data: 0.0002 max mem: 71357 +[15:25:15.925899] Epoch: [2] [1220/6500] lr: 0.000035 closs: 0.7181 (0.7522) grad_norm: 0.4152 (0.4552) time: 5.5788 data: 0.0002 max mem: 71357 +[15:26:11.888462] Epoch: [2] [1230/6500] lr: 0.000035 closs: 0.7267 (0.7519) 
grad_norm: 0.4083 (0.4548) time: 5.5876 data: 0.0002 max mem: 71357 +[15:27:07.646299] Epoch: [2] [1240/6500] lr: 0.000035 closs: 0.6987 (0.7515) grad_norm: 0.4104 (0.4549) time: 5.5859 data: 0.0002 max mem: 71357 +[15:28:03.389698] Epoch: [2] [1250/6500] lr: 0.000035 closs: 0.6987 (0.7520) grad_norm: 0.4244 (0.4545) time: 5.5750 data: 0.0002 max mem: 71357 +[15:28:59.282048] Epoch: [2] [1260/6500] lr: 0.000035 closs: 0.6953 (0.7516) grad_norm: 0.4244 (0.4546) time: 5.5816 data: 0.0002 max mem: 71357 +[15:29:55.061908] Epoch: [2] [1270/6500] lr: 0.000035 closs: 0.7070 (0.7511) grad_norm: 0.4485 (0.4550) time: 5.5834 data: 0.0002 max mem: 71357 +[15:30:50.944627] Epoch: [2] [1280/6500] lr: 0.000035 closs: 0.7370 (0.7518) grad_norm: 0.4485 (0.4555) time: 5.5830 data: 0.0002 max mem: 71357 +[15:31:46.666142] Epoch: [2] [1290/6500] lr: 0.000034 closs: 0.7087 (0.7517) grad_norm: 0.4504 (0.4558) time: 5.5801 data: 0.0001 max mem: 71357 +[15:32:42.433467] Epoch: [2] [1300/6500] lr: 0.000034 closs: 0.7037 (0.7513) grad_norm: 0.4323 (0.4552) time: 5.5744 data: 0.0001 max mem: 71357 +[15:33:38.181099] Epoch: [2] [1310/6500] lr: 0.000034 closs: 0.7091 (0.7517) grad_norm: 0.4323 (0.4556) time: 5.5757 data: 0.0002 max mem: 71357 +[15:34:33.994700] Epoch: [2] [1320/6500] lr: 0.000034 closs: 0.7841 (0.7518) grad_norm: 0.4367 (0.4562) time: 5.5780 data: 0.0002 max mem: 71357 +[15:35:29.692112] Epoch: [2] [1330/6500] lr: 0.000034 closs: 0.7856 (0.7518) grad_norm: 0.3952 (0.4559) time: 5.5754 data: 0.0002 max mem: 71357 +[15:36:25.439166] Epoch: [2] [1340/6500] lr: 0.000034 closs: 0.7897 (0.7518) grad_norm: 0.4837 (0.4569) time: 5.5721 data: 0.0001 max mem: 71357 +[15:37:21.352355] Epoch: [2] [1350/6500] lr: 0.000034 closs: 0.7514 (0.7517) grad_norm: 0.4169 (0.4564) time: 5.5829 data: 0.0001 max mem: 71357 +[15:38:17.111671] Epoch: [2] [1360/6500] lr: 0.000034 closs: 0.7093 (0.7515) grad_norm: 0.3850 (0.4564) time: 5.5835 data: 0.0002 max mem: 71357 +[15:39:12.952198] Epoch: [2] [1370/6500] lr: 0.000034 closs: 0.7113 (0.7511) grad_norm: 0.4112 (0.4626) time: 5.5799 data: 0.0002 max mem: 71357 +[15:40:08.662786] Epoch: [2] [1380/6500] lr: 0.000034 closs: 0.7377 (0.7512) grad_norm: 0.3917 (0.4633) time: 5.5775 data: 0.0001 max mem: 71357 +[15:41:04.467847] Epoch: [2] [1390/6500] lr: 0.000034 closs: 0.7426 (0.7512) grad_norm: 0.4009 (0.4635) time: 5.5756 data: 0.0002 max mem: 71357 +[15:42:00.167536] Epoch: [2] [1400/6500] lr: 0.000034 closs: 0.7377 (0.7511) grad_norm: 0.4512 (0.4636) time: 5.5751 data: 0.0002 max mem: 71357 +[15:42:56.023589] Epoch: [2] [1410/6500] lr: 0.000034 closs: 0.7182 (0.7510) grad_norm: 0.4512 (0.4638) time: 5.5777 data: 0.0002 max mem: 71357 +[15:43:51.710783] Epoch: [2] [1420/6500] lr: 0.000034 closs: 0.7622 (0.7516) grad_norm: 0.4390 (0.4632) time: 5.5771 data: 0.0002 max mem: 71357 +[15:44:47.432279] Epoch: [2] [1430/6500] lr: 0.000034 closs: 0.8510 (0.7520) grad_norm: 0.4332 (0.4633) time: 5.5703 data: 0.0002 max mem: 71357 +[15:45:43.110068] Epoch: [2] [1440/6500] lr: 0.000034 closs: 0.8121 (0.7523) grad_norm: 0.4311 (0.4631) time: 5.5698 data: 0.0002 max mem: 71357 +[15:46:39.076188] Epoch: [2] [1450/6500] lr: 0.000034 closs: 0.7827 (0.7524) grad_norm: 0.3811 (0.4628) time: 5.5821 data: 0.0002 max mem: 71357 +[15:47:34.786235] Epoch: [2] [1460/6500] lr: 0.000034 closs: 0.7050 (0.7523) grad_norm: 0.3816 (0.4624) time: 5.5837 data: 0.0002 max mem: 71357 +[15:48:30.542459] Epoch: [2] [1470/6500] lr: 0.000034 closs: 0.7427 (0.7526) grad_norm: 0.3891 (0.4620) time: 5.5732 data: 
0.0001 max mem: 71357 +[15:49:26.323343] Epoch: [2] [1480/6500] lr: 0.000034 closs: 0.7493 (0.7524) grad_norm: 0.3891 (0.4615) time: 5.5767 data: 0.0001 max mem: 71357 +[15:50:22.063916] Epoch: [2] [1490/6500] lr: 0.000034 closs: 0.7493 (0.7529) grad_norm: 0.4173 (0.4618) time: 5.5759 data: 0.0001 max mem: 71357 +[15:51:17.879657] Epoch: [2] [1500/6500] lr: 0.000034 closs: 0.7679 (0.7528) grad_norm: 0.4173 (0.4612) time: 5.5777 data: 0.0001 max mem: 71357 +[15:52:13.600143] Epoch: [2] [1510/6500] lr: 0.000034 closs: 0.7527 (0.7528) grad_norm: 0.4107 (0.4608) time: 5.5767 data: 0.0001 max mem: 71357 +[15:53:09.349932] Epoch: [2] [1520/6500] lr: 0.000034 closs: 0.7025 (0.7528) grad_norm: 0.4178 (0.4605) time: 5.5734 data: 0.0002 max mem: 71357 +[15:54:05.153207] Epoch: [2] [1530/6500] lr: 0.000034 closs: 0.6863 (0.7526) grad_norm: 0.3977 (0.4603) time: 5.5776 data: 0.0002 max mem: 71357 +[15:55:01.034381] Epoch: [2] [1540/6500] lr: 0.000034 closs: 0.6656 (0.7521) grad_norm: 0.3977 (0.4598) time: 5.5841 data: 0.0002 max mem: 71357 +[15:55:56.795048] Epoch: [2] [1550/6500] lr: 0.000034 closs: 0.6656 (0.7522) grad_norm: 0.3977 (0.4595) time: 5.5820 data: 0.0002 max mem: 71357 +[15:56:52.611293] Epoch: [2] [1560/6500] lr: 0.000034 closs: 0.7849 (0.7525) grad_norm: 0.3977 (0.4592) time: 5.5787 data: 0.0001 max mem: 71357 +[15:57:48.402966] Epoch: [2] [1570/6500] lr: 0.000034 closs: 0.7785 (0.7525) grad_norm: 0.4062 (0.4590) time: 5.5803 data: 0.0001 max mem: 71357 +[15:58:44.217639] Epoch: [2] [1580/6500] lr: 0.000033 closs: 0.7870 (0.7531) grad_norm: 0.4106 (0.4587) time: 5.5802 data: 0.0001 max mem: 71357 +[15:59:40.002405] Epoch: [2] [1590/6500] lr: 0.000033 closs: 0.7723 (0.7532) grad_norm: 0.4159 (0.4585) time: 5.5799 data: 0.0001 max mem: 71357 +[16:00:35.721907] Epoch: [2] [1600/6500] lr: 0.000033 closs: 0.6743 (0.7524) grad_norm: 0.4302 (0.4586) time: 5.5751 data: 0.0001 max mem: 71357 +[16:01:31.458032] Epoch: [2] [1610/6500] lr: 0.000033 closs: 0.6607 (0.7525) grad_norm: 0.4363 (0.4585) time: 5.5727 data: 0.0001 max mem: 71357 +[16:02:27.297980] Epoch: [2] [1620/6500] lr: 0.000033 closs: 0.7730 (0.7526) grad_norm: 0.4302 (0.4584) time: 5.5787 data: 0.0001 max mem: 71357 +[16:03:23.124468] Epoch: [2] [1630/6500] lr: 0.000033 closs: 0.7734 (0.7526) grad_norm: 0.4404 (0.4611) time: 5.5832 data: 0.0001 max mem: 71357 +[16:04:18.899383] Epoch: [2] [1640/6500] lr: 0.000033 closs: 0.7204 (0.7525) grad_norm: 0.4069 (0.4606) time: 5.5800 data: 0.0001 max mem: 71357 +[16:05:14.642459] Epoch: [2] [1650/6500] lr: 0.000033 closs: 0.7107 (0.7522) grad_norm: 0.3977 (0.4603) time: 5.5758 data: 0.0001 max mem: 71357 +[16:06:10.346118] Epoch: [2] [1660/6500] lr: 0.000033 closs: 0.6985 (0.7520) grad_norm: 0.3828 (0.4600) time: 5.5722 data: 0.0001 max mem: 71357 +[16:07:06.260484] Epoch: [2] [1670/6500] lr: 0.000033 closs: 0.7301 (0.7522) grad_norm: 0.3594 (0.4594) time: 5.5808 data: 0.0001 max mem: 71357 +[16:08:01.979004] Epoch: [2] [1680/6500] lr: 0.000033 closs: 0.7341 (0.7519) grad_norm: 0.3670 (0.4596) time: 5.5816 data: 0.0001 max mem: 71357 +[16:08:57.731380] Epoch: [2] [1690/6500] lr: 0.000033 closs: 0.7382 (0.7520) grad_norm: 0.3670 (0.4593) time: 5.5735 data: 0.0001 max mem: 71357 +[16:09:53.536263] Epoch: [2] [1700/6500] lr: 0.000033 closs: 0.7382 (0.7517) grad_norm: 0.3817 (0.4593) time: 5.5778 data: 0.0001 max mem: 71357 +[16:10:49.268849] Epoch: [2] [1710/6500] lr: 0.000033 closs: 0.7031 (0.7516) grad_norm: 0.4069 (0.4592) time: 5.5768 data: 0.0001 max mem: 71357 +[16:11:45.090787] Epoch: 
[2] [1720/6500] lr: 0.000033 closs: 0.7533 (0.7522) grad_norm: 0.3817 (0.4587) time: 5.5776 data: 0.0001 max mem: 71357
[ ... ~476 per-iteration log entries elided: Epoch 2, steps 1730/6500 through 6480/6500, logged every 10 steps from 16:12:40 to 23:35:15; lr annealed from 0.000033 to 0.000016, closs (avg) held near 0.75, grad_norm (avg) near 0.45-0.46, time ~5.58 s/step, data 0.0001-0.0003 s, max mem: 71357 throughout ... ]
+[23:35:15.552398] Epoch: [2] [6490/6500] lr: 0.000016 closs: 0.7060 (0.7532) grad_norm: 0.3979 (0.4619) time: 5.5681 data: 0.0001 max mem: 71357
+[23:36:06.410367] Epoch: [2] Total time: 10:04:27
+[23:36:06.411251] Averaged stats: lr: 0.000016 closs: 0.7366 (0.7503) grad_norm: 0.3976 (0.4619)
+[23:36:06.578363] model saved
+[23:36:07.582844] optimizer saved
+[23:36:07.583305] other rank-common saved
+[23:36:07.586540] rank-specific saved
+[23:36:07.595987] log_dir: ./output_dir
+[23:36:15.789670] Epoch: [3] [0/6500] lr: 0.000016 closs: 0.6146 (0.6146) time: 8.1929 data: 2.6079 max mem: 71357
[ ... ~29 per-iteration log entries elided: Epoch 3, steps 10/6500 through 290/6500, logged every 10 steps from 23:37:11 to 00:03:12; lr 0.000016 dropping to 0.000015 at step 250, closs (avg) ~0.71-0.74, grad_norm (avg) ~0.40-0.45, time ~5.58 s/step, max mem: 71357 ... ]
+[00:04:08.682771] Epoch: [3] [300/6500] lr: 0.000015 closs: 0.7557 (0.7407) grad_norm: 0.4064 (0.4482) time: 5.5737 data: 0.0001 max mem: 71357
+[00:05:04.350866] Epoch: [3] [310/6500]
lr: 0.000015 closs: 0.7481 (0.7401) grad_norm: 0.4064 (0.4465) time: 5.5713 data: 0.0001 max mem: 71357 +[00:06:00.086395] Epoch: [3] [320/6500] lr: 0.000015 closs: 0.7440 (0.7390) grad_norm: 0.3864 (0.4437) time: 5.5701 data: 0.0001 max mem: 71357 +[00:06:55.945079] Epoch: [3] [330/6500] lr: 0.000015 closs: 0.7742 (0.7414) grad_norm: 0.3987 (0.4443) time: 5.5796 data: 0.0001 max mem: 71357 +[00:07:51.657783] Epoch: [3] [340/6500] lr: 0.000015 closs: 0.7810 (0.7444) grad_norm: 0.3864 (0.4482) time: 5.5785 data: 0.0001 max mem: 71357 +[00:08:47.398301] Epoch: [3] [350/6500] lr: 0.000015 closs: 0.7696 (0.7429) grad_norm: 0.3989 (0.4466) time: 5.5725 data: 0.0001 max mem: 71357 +[00:09:43.161770] Epoch: [3] [360/6500] lr: 0.000015 closs: 0.7516 (0.7433) grad_norm: 0.4093 (0.4457) time: 5.5751 data: 0.0001 max mem: 71357 +[00:10:38.853854] Epoch: [3] [370/6500] lr: 0.000015 closs: 0.7676 (0.7444) grad_norm: 0.4122 (0.4464) time: 5.5727 data: 0.0001 max mem: 71357 +[00:11:34.646934] Epoch: [3] [380/6500] lr: 0.000015 closs: 0.7893 (0.7451) grad_norm: 0.3989 (0.4464) time: 5.5742 data: 0.0001 max mem: 71357 +[00:12:30.405911] Epoch: [3] [390/6500] lr: 0.000015 closs: 0.8063 (0.7453) grad_norm: 0.4085 (0.4461) time: 5.5775 data: 0.0001 max mem: 71357 +[00:13:26.046470] Epoch: [3] [400/6500] lr: 0.000015 closs: 0.7670 (0.7463) grad_norm: 0.4090 (0.4495) time: 5.5699 data: 0.0001 max mem: 71357 +[00:14:21.747482] Epoch: [3] [410/6500] lr: 0.000015 closs: 0.7218 (0.7445) grad_norm: 0.3901 (0.4504) time: 5.5670 data: 0.0001 max mem: 71357 +[00:15:17.668173] Epoch: [3] [420/6500] lr: 0.000015 closs: 0.6873 (0.7438) grad_norm: 0.3901 (0.4491) time: 5.5810 data: 0.0001 max mem: 71357 +[00:16:13.463723] Epoch: [3] [430/6500] lr: 0.000015 closs: 0.7053 (0.7430) grad_norm: 0.3729 (0.4473) time: 5.5857 data: 0.0001 max mem: 71357 +[00:17:09.143060] Epoch: [3] [440/6500] lr: 0.000015 closs: 0.7145 (0.7429) grad_norm: 0.3853 (0.4475) time: 5.5737 data: 0.0001 max mem: 71357 +[00:18:04.852034] Epoch: [3] [450/6500] lr: 0.000015 closs: 0.7834 (0.7436) grad_norm: 0.3907 (0.4470) time: 5.5693 data: 0.0001 max mem: 71357 +[00:19:00.721499] Epoch: [3] [460/6500] lr: 0.000015 closs: 0.7946 (0.7431) grad_norm: 0.3853 (0.4447) time: 5.5788 data: 0.0001 max mem: 71357 +[00:19:56.460552] Epoch: [3] [470/6500] lr: 0.000015 closs: 0.7314 (0.7425) grad_norm: 0.3587 (0.4440) time: 5.5803 data: 0.0001 max mem: 71357 +[00:20:52.121137] Epoch: [3] [480/6500] lr: 0.000015 closs: 0.7648 (0.7444) grad_norm: 0.3587 (0.4444) time: 5.5699 data: 0.0001 max mem: 71357 +[00:21:47.795418] Epoch: [3] [490/6500] lr: 0.000015 closs: 0.7545 (0.7415) grad_norm: 0.3587 (0.4470) time: 5.5667 data: 0.0001 max mem: 71357 +[00:22:43.610741] Epoch: [3] [500/6500] lr: 0.000015 closs: 0.6561 (0.7423) grad_norm: 0.3719 (0.4456) time: 5.5744 data: 0.0001 max mem: 71357 +[00:23:39.289884] Epoch: [3] [510/6500] lr: 0.000015 closs: 0.7800 (0.7433) grad_norm: 0.4030 (0.4467) time: 5.5746 data: 0.0001 max mem: 71357 +[00:24:34.955681] Epoch: [3] [520/6500] lr: 0.000015 closs: 0.7182 (0.7408) grad_norm: 0.3815 (0.4456) time: 5.5672 data: 0.0001 max mem: 71357 +[00:25:30.752698] Epoch: [3] [530/6500] lr: 0.000015 closs: 0.7124 (0.7414) grad_norm: 0.3815 (0.4451) time: 5.5730 data: 0.0001 max mem: 71357 +[00:26:26.458444] Epoch: [3] [540/6500] lr: 0.000015 closs: 0.7216 (0.7405) grad_norm: 0.3815 (0.4445) time: 5.5750 data: 0.0001 max mem: 71357 +[00:27:22.282296] Epoch: [3] [550/6500] lr: 0.000015 closs: 0.7084 (0.7406) grad_norm: 0.3815 (0.4461) time: 
5.5764 data: 0.0001 max mem: 71357 +[00:28:17.976928] Epoch: [3] [560/6500] lr: 0.000015 closs: 0.7254 (0.7403) grad_norm: 0.4211 (0.4493) time: 5.5759 data: 0.0001 max mem: 71357 +[00:29:13.811494] Epoch: [3] [570/6500] lr: 0.000015 closs: 0.6689 (0.7397) grad_norm: 0.3693 (0.4479) time: 5.5764 data: 0.0001 max mem: 71357 +[00:30:09.537691] Epoch: [3] [580/6500] lr: 0.000014 closs: 0.6779 (0.7394) grad_norm: 0.4004 (0.4494) time: 5.5779 data: 0.0001 max mem: 71357 +[00:31:05.357551] Epoch: [3] [590/6500] lr: 0.000014 closs: 0.7062 (0.7387) grad_norm: 0.4039 (0.4512) time: 5.5772 data: 0.0001 max mem: 71357 +[00:32:01.086703] Epoch: [3] [600/6500] lr: 0.000014 closs: 0.6869 (0.7392) grad_norm: 0.4087 (0.4528) time: 5.5774 data: 0.0001 max mem: 71357 +[00:32:56.729576] Epoch: [3] [610/6500] lr: 0.000014 closs: 0.6827 (0.7380) grad_norm: 0.5076 (0.4546) time: 5.5685 data: 0.0001 max mem: 71357 +[00:33:52.466776] Epoch: [3] [620/6500] lr: 0.000014 closs: 0.6962 (0.7388) grad_norm: 0.5076 (0.4552) time: 5.5689 data: 0.0001 max mem: 71357 +[00:34:48.197321] Epoch: [3] [630/6500] lr: 0.000014 closs: 0.7863 (0.7393) grad_norm: 0.5055 (0.4563) time: 5.5733 data: 0.0001 max mem: 71357 +[00:35:43.932519] Epoch: [3] [640/6500] lr: 0.000014 closs: 0.7812 (0.7391) grad_norm: 0.4378 (0.4563) time: 5.5732 data: 0.0001 max mem: 71357 +[00:36:39.649858] Epoch: [3] [650/6500] lr: 0.000014 closs: 0.7366 (0.7392) grad_norm: 0.4040 (0.4562) time: 5.5725 data: 0.0001 max mem: 71357 +[00:37:35.318846] Epoch: [3] [660/6500] lr: 0.000014 closs: 0.6963 (0.7391) grad_norm: 0.4330 (0.4559) time: 5.5692 data: 0.0001 max mem: 71357 +[00:38:30.992475] Epoch: [3] [670/6500] lr: 0.000014 closs: 0.7026 (0.7402) grad_norm: 0.4156 (0.4553) time: 5.5670 data: 0.0001 max mem: 71357 +[00:39:26.871070] Epoch: [3] [680/6500] lr: 0.000014 closs: 0.7783 (0.7407) grad_norm: 0.4330 (0.4568) time: 5.5775 data: 0.0001 max mem: 71357 +[00:40:22.585682] Epoch: [3] [690/6500] lr: 0.000014 closs: 0.7404 (0.7394) grad_norm: 0.4211 (0.4559) time: 5.5795 data: 0.0001 max mem: 71357 +[00:41:18.283696] Epoch: [3] [700/6500] lr: 0.000014 closs: 0.7099 (0.7398) grad_norm: 0.4050 (0.4552) time: 5.5705 data: 0.0001 max mem: 71357 +[00:42:14.020162] Epoch: [3] [710/6500] lr: 0.000014 closs: 0.7389 (0.7400) grad_norm: 0.4050 (0.4566) time: 5.5716 data: 0.0002 max mem: 71357 +[00:43:09.826888] Epoch: [3] [720/6500] lr: 0.000014 closs: 0.7389 (0.7399) grad_norm: 0.3828 (0.4553) time: 5.5771 data: 0.0002 max mem: 71357 +[00:44:05.589010] Epoch: [3] [730/6500] lr: 0.000014 closs: 0.7528 (0.7405) grad_norm: 0.4084 (0.4581) time: 5.5784 data: 0.0001 max mem: 71357 +[00:45:01.263164] Epoch: [3] [740/6500] lr: 0.000014 closs: 0.7142 (0.7401) grad_norm: 0.4105 (0.4583) time: 5.5717 data: 0.0001 max mem: 71357 +[00:45:57.041552] Epoch: [3] [750/6500] lr: 0.000014 closs: 0.7161 (0.7410) grad_norm: 0.4105 (0.4583) time: 5.5725 data: 0.0001 max mem: 71357 +[00:46:52.780365] Epoch: [3] [760/6500] lr: 0.000014 closs: 0.7962 (0.7404) grad_norm: 0.4450 (0.4576) time: 5.5758 data: 0.0001 max mem: 71357 +[00:47:48.538095] Epoch: [3] [770/6500] lr: 0.000014 closs: 0.7276 (0.7410) grad_norm: 0.4256 (0.4577) time: 5.5747 data: 0.0001 max mem: 71357 +[00:48:44.243706] Epoch: [3] [780/6500] lr: 0.000014 closs: 0.7184 (0.7401) grad_norm: 0.4427 (0.4580) time: 5.5731 data: 0.0001 max mem: 71357 +[00:49:39.903887] Epoch: [3] [790/6500] lr: 0.000014 closs: 0.6724 (0.7390) grad_norm: 0.4518 (0.4593) time: 5.5682 data: 0.0001 max mem: 71357 +[00:50:35.705245] Epoch: [3] 
[800/6500] lr: 0.000014 closs: 0.6724 (0.7385) grad_norm: 0.4518 (0.4585) time: 5.5730 data: 0.0001 max mem: 71357 +[00:51:31.514197] Epoch: [3] [810/6500] lr: 0.000014 closs: 0.6940 (0.7379) grad_norm: 0.4427 (0.4582) time: 5.5804 data: 0.0001 max mem: 71357 +[00:52:27.324671] Epoch: [3] [820/6500] lr: 0.000014 closs: 0.7296 (0.7380) grad_norm: 0.4293 (0.4602) time: 5.5809 data: 0.0001 max mem: 71357 +[00:53:23.091518] Epoch: [3] [830/6500] lr: 0.000014 closs: 0.6945 (0.7383) grad_norm: 0.4142 (0.4598) time: 5.5788 data: 0.0001 max mem: 71357 +[00:54:18.874645] Epoch: [3] [840/6500] lr: 0.000014 closs: 0.7030 (0.7381) grad_norm: 0.3834 (0.4588) time: 5.5774 data: 0.0001 max mem: 71357 +[00:55:14.607446] Epoch: [3] [850/6500] lr: 0.000014 closs: 0.7228 (0.7374) grad_norm: 0.3834 (0.4594) time: 5.5757 data: 0.0001 max mem: 71357 +[00:56:10.454988] Epoch: [3] [860/6500] lr: 0.000014 closs: 0.7877 (0.7379) grad_norm: 0.3806 (0.4584) time: 5.5789 data: 0.0001 max mem: 71357 +[00:57:06.203850] Epoch: [3] [870/6500] lr: 0.000014 closs: 0.7904 (0.7391) grad_norm: 0.3834 (0.4584) time: 5.5797 data: 0.0002 max mem: 71357 +[00:58:01.896877] Epoch: [3] [880/6500] lr: 0.000014 closs: 0.7406 (0.7385) grad_norm: 0.4107 (0.4582) time: 5.5720 data: 0.0002 max mem: 71357 +[00:58:57.712138] Epoch: [3] [890/6500] lr: 0.000014 closs: 0.7278 (0.7389) grad_norm: 0.3963 (0.4577) time: 5.5753 data: 0.0001 max mem: 71357 +[00:59:53.613174] Epoch: [3] [900/6500] lr: 0.000014 closs: 0.7661 (0.7382) grad_norm: 0.4107 (0.4574) time: 5.5857 data: 0.0001 max mem: 71357 +[01:00:49.265294] Epoch: [3] [910/6500] lr: 0.000014 closs: 0.7247 (0.7374) grad_norm: 0.4342 (0.4595) time: 5.5776 data: 0.0001 max mem: 71357 +[01:01:44.937344] Epoch: [3] [920/6500] lr: 0.000013 closs: 0.6816 (0.7375) grad_norm: 0.4522 (0.4598) time: 5.5661 data: 0.0001 max mem: 71357 +[01:02:40.641403] Epoch: [3] [930/6500] lr: 0.000013 closs: 0.7064 (0.7377) grad_norm: 0.4679 (0.4599) time: 5.5687 data: 0.0002 max mem: 71357 +[01:03:36.455103] Epoch: [3] [940/6500] lr: 0.000013 closs: 0.6902 (0.7376) grad_norm: 0.4658 (0.4596) time: 5.5758 data: 0.0002 max mem: 71357 +[01:04:32.206716] Epoch: [3] [950/6500] lr: 0.000013 closs: 0.7056 (0.7373) grad_norm: 0.4290 (0.4587) time: 5.5781 data: 0.0001 max mem: 71357 +[01:05:27.962419] Epoch: [3] [960/6500] lr: 0.000013 closs: 0.7121 (0.7371) grad_norm: 0.4165 (0.4583) time: 5.5753 data: 0.0001 max mem: 71357 +[01:06:23.683254] Epoch: [3] [970/6500] lr: 0.000013 closs: 0.7357 (0.7372) grad_norm: 0.3924 (0.4586) time: 5.5737 data: 0.0001 max mem: 71357 +[01:07:19.353981] Epoch: [3] [980/6500] lr: 0.000013 closs: 0.7258 (0.7370) grad_norm: 0.3924 (0.4584) time: 5.5695 data: 0.0002 max mem: 71357 +[01:08:15.129240] Epoch: [3] [990/6500] lr: 0.000013 closs: 0.7258 (0.7366) grad_norm: 0.4066 (0.4590) time: 5.5722 data: 0.0002 max mem: 71357 +[01:09:10.838789] Epoch: [3] [1000/6500] lr: 0.000013 closs: 0.7418 (0.7373) grad_norm: 0.3735 (0.4601) time: 5.5741 data: 0.0001 max mem: 71357 +[01:10:06.563829] Epoch: [3] [1010/6500] lr: 0.000013 closs: 0.7381 (0.7374) grad_norm: 0.3834 (0.4599) time: 5.5716 data: 0.0001 max mem: 71357 +[01:11:02.258175] Epoch: [3] [1020/6500] lr: 0.000013 closs: 0.7057 (0.7374) grad_norm: 0.4172 (0.4609) time: 5.5709 data: 0.0001 max mem: 71357 +[01:11:58.103183] Epoch: [3] [1030/6500] lr: 0.000013 closs: 0.7993 (0.7380) grad_norm: 0.4857 (0.4617) time: 5.5769 data: 0.0001 max mem: 71357 +[01:12:53.782290] Epoch: [3] [1040/6500] lr: 0.000013 closs: 0.7572 (0.7380) grad_norm: 0.4870 
(0.4617) time: 5.5761 data: 0.0001 max mem: 71357 +[01:13:49.475418] Epoch: [3] [1050/6500] lr: 0.000013 closs: 0.7557 (0.7383) grad_norm: 0.4931 (0.4618) time: 5.5685 data: 0.0001 max mem: 71357 +[01:14:45.267483] Epoch: [3] [1060/6500] lr: 0.000013 closs: 0.7168 (0.7382) grad_norm: 0.4857 (0.4613) time: 5.5742 data: 0.0001 max mem: 71357 +[01:15:41.058202] Epoch: [3] [1070/6500] lr: 0.000013 closs: 0.7480 (0.7385) grad_norm: 0.4244 (0.4607) time: 5.5791 data: 0.0001 max mem: 71357 +[01:16:36.972671] Epoch: [3] [1080/6500] lr: 0.000013 closs: 0.7299 (0.7387) grad_norm: 0.3912 (0.4601) time: 5.5852 data: 0.0001 max mem: 71357 +[01:17:32.590345] Epoch: [3] [1090/6500] lr: 0.000013 closs: 0.6990 (0.7382) grad_norm: 0.3655 (0.4591) time: 5.5765 data: 0.0001 max mem: 71357 +[01:18:28.253054] Epoch: [3] [1100/6500] lr: 0.000013 closs: 0.6832 (0.7377) grad_norm: 0.3655 (0.4589) time: 5.5639 data: 0.0001 max mem: 71357 +[01:19:23.978321] Epoch: [3] [1110/6500] lr: 0.000013 closs: 0.7671 (0.7381) grad_norm: 0.3668 (0.4587) time: 5.5693 data: 0.0001 max mem: 71357 +[01:20:19.761158] Epoch: [3] [1120/6500] lr: 0.000013 closs: 0.7495 (0.7377) grad_norm: 0.3668 (0.4587) time: 5.5753 data: 0.0001 max mem: 71357 +[01:21:15.337249] Epoch: [3] [1130/6500] lr: 0.000013 closs: 0.7288 (0.7378) grad_norm: 0.4154 (0.4584) time: 5.5679 data: 0.0001 max mem: 71357 +[01:22:11.125607] Epoch: [3] [1140/6500] lr: 0.000013 closs: 0.6722 (0.7376) grad_norm: 0.3753 (0.4582) time: 5.5682 data: 0.0001 max mem: 71357 +[01:23:06.785225] Epoch: [3] [1150/6500] lr: 0.000013 closs: 0.6987 (0.7377) grad_norm: 0.3753 (0.4577) time: 5.5723 data: 0.0001 max mem: 71357 +[01:24:02.527031] Epoch: [3] [1160/6500] lr: 0.000013 closs: 0.7018 (0.7375) grad_norm: 0.4078 (0.4575) time: 5.5700 data: 0.0001 max mem: 71357 +[01:24:58.344508] Epoch: [3] [1170/6500] lr: 0.000013 closs: 0.7331 (0.7380) grad_norm: 0.3751 (0.4578) time: 5.5779 data: 0.0001 max mem: 71357 +[01:25:53.986697] Epoch: [3] [1180/6500] lr: 0.000013 closs: 0.7686 (0.7380) grad_norm: 0.4078 (0.4579) time: 5.5729 data: 0.0001 max mem: 71357 +[01:26:49.704392] Epoch: [3] [1190/6500] lr: 0.000013 closs: 0.6941 (0.7379) grad_norm: 0.4060 (0.4574) time: 5.5679 data: 0.0001 max mem: 71357 +[01:27:45.361863] Epoch: [3] [1200/6500] lr: 0.000013 closs: 0.7082 (0.7385) grad_norm: 0.4060 (0.4577) time: 5.5686 data: 0.0001 max mem: 71357 +[01:28:41.177916] Epoch: [3] [1210/6500] lr: 0.000013 closs: 0.7895 (0.7388) grad_norm: 0.4182 (0.4589) time: 5.5736 data: 0.0001 max mem: 71357 +[01:29:36.933677] Epoch: [3] [1220/6500] lr: 0.000013 closs: 0.7895 (0.7386) grad_norm: 0.4473 (0.4588) time: 5.5785 data: 0.0001 max mem: 71357 +[01:30:32.611778] Epoch: [3] [1230/6500] lr: 0.000013 closs: 0.6651 (0.7379) grad_norm: 0.4473 (0.4584) time: 5.5716 data: 0.0001 max mem: 71357 +[01:31:28.246111] Epoch: [3] [1240/6500] lr: 0.000013 closs: 0.7076 (0.7379) grad_norm: 0.4307 (0.4586) time: 5.5656 data: 0.0001 max mem: 71357 +[01:32:24.161221] Epoch: [3] [1250/6500] lr: 0.000013 closs: 0.7297 (0.7382) grad_norm: 0.4281 (0.4581) time: 5.5774 data: 0.0001 max mem: 71357 +[01:33:19.939621] Epoch: [3] [1260/6500] lr: 0.000013 closs: 0.7090 (0.7379) grad_norm: 0.3789 (0.4573) time: 5.5846 data: 0.0001 max mem: 71357 +[01:34:15.687899] Epoch: [3] [1270/6500] lr: 0.000013 closs: 0.6752 (0.7378) grad_norm: 0.3819 (0.4575) time: 5.5762 data: 0.0001 max mem: 71357 +[01:35:11.467944] Epoch: [3] [1280/6500] lr: 0.000012 closs: 0.7566 (0.7384) grad_norm: 0.3789 (0.4571) time: 5.5763 data: 0.0001 max mem: 71357 
+[01:36:07.120143] Epoch: [3] [1290/6500] lr: 0.000012 closs: 0.8008 (0.7388) grad_norm: 0.3870 (0.4576) time: 5.5715 data: 0.0001 max mem: 71357 +[01:37:02.915605] Epoch: [3] [1300/6500] lr: 0.000012 closs: 0.7662 (0.7389) grad_norm: 0.4138 (0.4575) time: 5.5723 data: 0.0001 max mem: 71357 +[01:37:58.533136] Epoch: [3] [1310/6500] lr: 0.000012 closs: 0.7344 (0.7395) grad_norm: 0.4138 (0.4575) time: 5.5706 data: 0.0001 max mem: 71357 +[01:38:54.316913] Epoch: [3] [1320/6500] lr: 0.000012 closs: 0.7344 (0.7396) grad_norm: 0.4228 (0.4576) time: 5.5700 data: 0.0001 max mem: 71357 +[01:39:50.015096] Epoch: [3] [1330/6500] lr: 0.000012 closs: 0.7196 (0.7398) grad_norm: 0.4228 (0.4582) time: 5.5740 data: 0.0001 max mem: 71357 +[01:40:45.918757] Epoch: [3] [1340/6500] lr: 0.000012 closs: 0.7234 (0.7402) grad_norm: 0.4203 (0.4577) time: 5.5800 data: 0.0001 max mem: 71357 +[01:41:41.617278] Epoch: [3] [1350/6500] lr: 0.000012 closs: 0.7311 (0.7401) grad_norm: 0.4203 (0.4579) time: 5.5800 data: 0.0001 max mem: 71357 +[01:42:37.361572] Epoch: [3] [1360/6500] lr: 0.000012 closs: 0.6839 (0.7396) grad_norm: 0.4300 (0.4578) time: 5.5721 data: 0.0001 max mem: 71357 +[01:43:33.087776] Epoch: [3] [1370/6500] lr: 0.000012 closs: 0.7108 (0.7402) grad_norm: 0.4203 (0.4580) time: 5.5734 data: 0.0001 max mem: 71357 +[01:44:28.900787] Epoch: [3] [1380/6500] lr: 0.000012 closs: 0.8245 (0.7409) grad_norm: 0.4237 (0.4579) time: 5.5769 data: 0.0001 max mem: 71357 +[01:45:24.573697] Epoch: [3] [1390/6500] lr: 0.000012 closs: 0.7725 (0.7407) grad_norm: 0.4237 (0.4583) time: 5.5742 data: 0.0001 max mem: 71357 +[01:46:20.312427] Epoch: [3] [1400/6500] lr: 0.000012 closs: 0.7474 (0.7410) grad_norm: 0.4414 (0.4586) time: 5.5705 data: 0.0001 max mem: 71357 +[01:47:16.093990] Epoch: [3] [1410/6500] lr: 0.000012 closs: 0.7474 (0.7411) grad_norm: 0.4410 (0.4586) time: 5.5759 data: 0.0001 max mem: 71357 +[01:48:11.816746] Epoch: [3] [1420/6500] lr: 0.000012 closs: 0.7565 (0.7411) grad_norm: 0.4414 (0.4585) time: 5.5751 data: 0.0001 max mem: 71357 +[01:49:07.716266] Epoch: [3] [1430/6500] lr: 0.000012 closs: 0.7565 (0.7411) grad_norm: 0.4410 (0.4582) time: 5.5810 data: 0.0001 max mem: 71357 +[01:50:03.388723] Epoch: [3] [1440/6500] lr: 0.000012 closs: 0.7284 (0.7415) grad_norm: 0.4376 (0.4584) time: 5.5785 data: 0.0001 max mem: 71357 +[01:50:59.024566] Epoch: [3] [1450/6500] lr: 0.000012 closs: 0.7738 (0.7418) grad_norm: 0.4376 (0.4589) time: 5.5653 data: 0.0001 max mem: 71357 +[01:51:54.723070] Epoch: [3] [1460/6500] lr: 0.000012 closs: 0.7617 (0.7418) grad_norm: 0.4371 (0.4589) time: 5.5667 data: 0.0001 max mem: 71357 +[01:52:50.538357] Epoch: [3] [1470/6500] lr: 0.000012 closs: 0.6811 (0.7416) grad_norm: 0.4182 (0.4585) time: 5.5756 data: 0.0001 max mem: 71357 +[01:53:46.361051] Epoch: [3] [1480/6500] lr: 0.000012 closs: 0.7539 (0.7417) grad_norm: 0.4128 (0.4580) time: 5.5818 data: 0.0001 max mem: 71357 +[01:54:42.096154] Epoch: [3] [1490/6500] lr: 0.000012 closs: 0.7626 (0.7418) grad_norm: 0.3807 (0.4579) time: 5.5778 data: 0.0001 max mem: 71357 +[01:55:37.782445] Epoch: [3] [1500/6500] lr: 0.000012 closs: 0.7505 (0.7420) grad_norm: 0.3785 (0.4578) time: 5.5710 data: 0.0001 max mem: 71357 +[01:56:33.437642] Epoch: [3] [1510/6500] lr: 0.000012 closs: 0.7306 (0.7415) grad_norm: 0.3785 (0.4576) time: 5.5670 data: 0.0001 max mem: 71357 +[01:57:29.352279] Epoch: [3] [1520/6500] lr: 0.000012 closs: 0.6655 (0.7410) grad_norm: 0.3785 (0.4571) time: 5.5784 data: 0.0001 max mem: 71357 +[01:58:24.994944] Epoch: [3] [1530/6500] lr: 
0.000012 closs: 0.6983 (0.7410) grad_norm: 0.4127 (0.4581) time: 5.5778 data: 0.0001 max mem: 71357 +[01:59:20.733197] Epoch: [3] [1540/6500] lr: 0.000012 closs: 0.6983 (0.7407) grad_norm: 0.4055 (0.4574) time: 5.5690 data: 0.0001 max mem: 71357 +[02:00:16.503116] Epoch: [3] [1550/6500] lr: 0.000012 closs: 0.7581 (0.7409) grad_norm: 0.3975 (0.4673) time: 5.5753 data: 0.0001 max mem: 71357 +[02:01:12.253133] Epoch: [3] [1560/6500] lr: 0.000012 closs: 0.7520 (0.7410) grad_norm: 0.4127 (0.4677) time: 5.5759 data: 0.0001 max mem: 71357 +[02:02:08.003585] Epoch: [3] [1570/6500] lr: 0.000012 closs: 0.7520 (0.7412) grad_norm: 0.3975 (0.4677) time: 5.5749 data: 0.0001 max mem: 71357 +[02:03:03.622228] Epoch: [3] [1580/6500] lr: 0.000012 closs: 0.7429 (0.7413) grad_norm: 0.4225 (0.4673) time: 5.5684 data: 0.0001 max mem: 71357 +[02:03:59.278490] Epoch: [3] [1590/6500] lr: 0.000012 closs: 0.6785 (0.7412) grad_norm: 0.4574 (0.4672) time: 5.5637 data: 0.0001 max mem: 71357 +[02:04:54.995154] Epoch: [3] [1600/6500] lr: 0.000012 closs: 0.7179 (0.7411) grad_norm: 0.4226 (0.4683) time: 5.5686 data: 0.0001 max mem: 71357 +[02:05:50.719879] Epoch: [3] [1610/6500] lr: 0.000012 closs: 0.7179 (0.7411) grad_norm: 0.4166 (0.4696) time: 5.5720 data: 0.0001 max mem: 71357 +[02:06:46.372165] Epoch: [3] [1620/6500] lr: 0.000012 closs: 0.7214 (0.7412) grad_norm: 0.4342 (0.4695) time: 5.5688 data: 0.0001 max mem: 71357 +[02:07:42.042038] Epoch: [3] [1630/6500] lr: 0.000012 closs: 0.7214 (0.7415) grad_norm: 0.4226 (0.4692) time: 5.5660 data: 0.0001 max mem: 71357 +[02:08:37.707675] Epoch: [3] [1640/6500] lr: 0.000012 closs: 0.6832 (0.7413) grad_norm: 0.4152 (0.4688) time: 5.5667 data: 0.0001 max mem: 71357 +[02:09:33.583277] Epoch: [3] [1650/6500] lr: 0.000012 closs: 0.7236 (0.7416) grad_norm: 0.3965 (0.4683) time: 5.5770 data: 0.0001 max mem: 71357 +[02:10:29.261319] Epoch: [3] [1660/6500] lr: 0.000012 closs: 0.7659 (0.7420) grad_norm: 0.3933 (0.4688) time: 5.5776 data: 0.0001 max mem: 71357 +[02:11:25.002648] Epoch: [3] [1670/6500] lr: 0.000011 closs: 0.7812 (0.7422) grad_norm: 0.3842 (0.4682) time: 5.5709 data: 0.0001 max mem: 71357 +[02:12:20.775138] Epoch: [3] [1680/6500] lr: 0.000011 closs: 0.7292 (0.7417) grad_norm: 0.3713 (0.4677) time: 5.5756 data: 0.0001 max mem: 71357 +[02:13:16.590435] Epoch: [3] [1690/6500] lr: 0.000011 closs: 0.7292 (0.7418) grad_norm: 0.3856 (0.4674) time: 5.5793 data: 0.0001 max mem: 71357 +[02:14:12.309570] Epoch: [3] [1700/6500] lr: 0.000011 closs: 0.7039 (0.7418) grad_norm: 0.3970 (0.4672) time: 5.5766 data: 0.0001 max mem: 71357 +[02:15:08.139498] Epoch: [3] [1710/6500] lr: 0.000011 closs: 0.6996 (0.7415) grad_norm: 0.4101 (0.4673) time: 5.5774 data: 0.0001 max mem: 71357 +[02:16:03.905752] Epoch: [3] [1720/6500] lr: 0.000011 closs: 0.7542 (0.7416) grad_norm: 0.4146 (0.4669) time: 5.5797 data: 0.0001 max mem: 71357 +[02:16:59.594667] Epoch: [3] [1730/6500] lr: 0.000011 closs: 0.7506 (0.7417) grad_norm: 0.4146 (0.4666) time: 5.5726 data: 0.0001 max mem: 71357 +[02:17:55.268246] Epoch: [3] [1740/6500] lr: 0.000011 closs: 0.6567 (0.7410) grad_norm: 0.4084 (0.4667) time: 5.5680 data: 0.0001 max mem: 71357 +[02:18:50.907554] Epoch: [3] [1750/6500] lr: 0.000011 closs: 0.6691 (0.7413) grad_norm: 0.4212 (0.4668) time: 5.5656 data: 0.0001 max mem: 71357 +[02:19:46.589279] Epoch: [3] [1760/6500] lr: 0.000011 closs: 0.6691 (0.7410) grad_norm: 0.4253 (0.4670) time: 5.5660 data: 0.0001 max mem: 71357 +[02:20:42.306504] Epoch: [3] [1770/6500] lr: 0.000011 closs: 0.6992 (0.7412) grad_norm: 0.4253 
(0.4667) time: 5.5699 data: 0.0001 max mem: 71357 +[02:21:38.162516] Epoch: [3] [1780/6500] lr: 0.000011 closs: 0.7811 (0.7412) grad_norm: 0.4221 (0.4664) time: 5.5786 data: 0.0001 max mem: 71357 +[02:22:33.796796] Epoch: [3] [1790/6500] lr: 0.000011 closs: 0.8156 (0.7417) grad_norm: 0.4038 (0.4661) time: 5.5744 data: 0.0001 max mem: 71357 +[02:23:29.492190] Epoch: [3] [1800/6500] lr: 0.000011 closs: 0.8221 (0.7420) grad_norm: 0.4221 (0.4661) time: 5.5664 data: 0.0001 max mem: 71357 +[02:24:25.122788] Epoch: [3] [1810/6500] lr: 0.000011 closs: 0.7031 (0.7419) grad_norm: 0.4221 (0.4658) time: 5.5662 data: 0.0001 max mem: 71357 +[02:25:20.865166] Epoch: [3] [1820/6500] lr: 0.000011 closs: 0.7157 (0.7420) grad_norm: 0.4443 (0.4663) time: 5.5686 data: 0.0001 max mem: 71357 +[02:26:16.547329] Epoch: [3] [1830/6500] lr: 0.000011 closs: 0.7475 (0.7419) grad_norm: 0.4443 (0.4661) time: 5.5712 data: 0.0001 max mem: 71357 +[02:27:12.190688] Epoch: [3] [1840/6500] lr: 0.000011 closs: 0.7351 (0.7418) grad_norm: 0.4383 (0.4659) time: 5.5662 data: 0.0001 max mem: 71357 +[02:28:07.854923] Epoch: [3] [1850/6500] lr: 0.000011 closs: 0.7313 (0.7419) grad_norm: 0.4383 (0.4655) time: 5.5653 data: 0.0001 max mem: 71357 +[02:29:03.539524] Epoch: [3] [1860/6500] lr: 0.000011 closs: 0.7179 (0.7419) grad_norm: 0.3883 (0.4656) time: 5.5674 data: 0.0001 max mem: 71357 +[02:29:59.225284] Epoch: [3] [1870/6500] lr: 0.000011 closs: 0.7383 (0.7420) grad_norm: 0.4725 (0.4658) time: 5.5684 data: 0.0001 max mem: 71357 +[02:30:54.926822] Epoch: [3] [1880/6500] lr: 0.000011 closs: 0.7903 (0.7422) grad_norm: 0.4160 (0.4656) time: 5.5693 data: 0.0001 max mem: 71357 +[02:31:50.596193] Epoch: [3] [1890/6500] lr: 0.000011 closs: 0.7737 (0.7417) grad_norm: 0.4482 (0.4657) time: 5.5684 data: 0.0001 max mem: 71357 +[02:32:46.225288] Epoch: [3] [1900/6500] lr: 0.000011 closs: 0.7226 (0.7418) grad_norm: 0.4482 (0.4662) time: 5.5648 data: 0.0001 max mem: 71357 +[02:33:42.077006] Epoch: [3] [1910/6500] lr: 0.000011 closs: 0.7017 (0.7414) grad_norm: 0.4312 (0.4662) time: 5.5740 data: 0.0001 max mem: 71357 +[02:34:37.702178] Epoch: [3] [1920/6500] lr: 0.000011 closs: 0.6920 (0.7413) grad_norm: 0.4146 (0.4657) time: 5.5738 data: 0.0001 max mem: 71357 +[02:35:33.312850] Epoch: [3] [1930/6500] lr: 0.000011 closs: 0.7232 (0.7414) grad_norm: 0.4084 (0.4657) time: 5.5617 data: 0.0001 max mem: 71357 +[02:36:29.008470] Epoch: [3] [1940/6500] lr: 0.000011 closs: 0.7370 (0.7416) grad_norm: 0.3967 (0.4656) time: 5.5652 data: 0.0001 max mem: 71357 +[02:37:24.578544] Epoch: [3] [1950/6500] lr: 0.000011 closs: 0.6784 (0.7410) grad_norm: 0.4146 (0.4656) time: 5.5632 data: 0.0001 max mem: 71357 +[02:38:20.386332] Epoch: [3] [1960/6500] lr: 0.000011 closs: 0.7038 (0.7412) grad_norm: 0.4360 (0.4668) time: 5.5688 data: 0.0001 max mem: 71357 +[02:39:16.102321] Epoch: [3] [1970/6500] lr: 0.000011 closs: 0.7362 (0.7417) grad_norm: 0.4360 (0.4667) time: 5.5761 data: 0.0001 max mem: 71357 +[02:40:11.878609] Epoch: [3] [1980/6500] lr: 0.000011 closs: 0.7269 (0.7417) grad_norm: 0.4407 (0.4668) time: 5.5746 data: 0.0001 max mem: 71357 +[02:41:07.583458] Epoch: [3] [1990/6500] lr: 0.000011 closs: 0.7227 (0.7414) grad_norm: 0.3940 (0.4665) time: 5.5740 data: 0.0001 max mem: 71357 +[02:42:03.433265] Epoch: [3] [2000/6500] lr: 0.000011 closs: 0.6968 (0.7418) grad_norm: 0.3859 (0.4671) time: 5.5777 data: 0.0001 max mem: 71357 +[02:42:59.197575] Epoch: [3] [2010/6500] lr: 0.000011 closs: 0.7337 (0.7419) grad_norm: 0.3948 (0.4671) time: 5.5806 data: 0.0001 max mem: 71357 
+[02:43:54.917992] Epoch: [3] [2020/6500] lr: 0.000011 closs: 0.7337 (0.7419) grad_norm: 0.3868 (0.4670) time: 5.5741 data: 0.0001 max mem: 71357 +[02:44:50.528619] Epoch: [3] [2030/6500] lr: 0.000011 closs: 0.7078 (0.7416) grad_norm: 0.3948 (0.4672) time: 5.5665 data: 0.0001 max mem: 71357 +[02:45:46.347257] Epoch: [3] [2040/6500] lr: 0.000011 closs: 0.7078 (0.7418) grad_norm: 0.3819 (0.4667) time: 5.5714 data: 0.0001 max mem: 71357 +[02:46:42.143262] Epoch: [3] [2050/6500] lr: 0.000011 closs: 0.8076 (0.7420) grad_norm: 0.3655 (0.4664) time: 5.5807 data: 0.0001 max mem: 71357 +[02:47:37.791827] Epoch: [3] [2060/6500] lr: 0.000011 closs: 0.7841 (0.7419) grad_norm: 0.3819 (0.4661) time: 5.5721 data: 0.0001 max mem: 71357 +[02:48:33.510323] Epoch: [3] [2070/6500] lr: 0.000010 closs: 0.7400 (0.7420) grad_norm: 0.3773 (0.4659) time: 5.5683 data: 0.0001 max mem: 71357 +[02:49:29.250770] Epoch: [3] [2080/6500] lr: 0.000010 closs: 0.7386 (0.7420) grad_norm: 0.3972 (0.4665) time: 5.5729 data: 0.0001 max mem: 71357 +[02:50:25.143434] Epoch: [3] [2090/6500] lr: 0.000010 closs: 0.7386 (0.7423) grad_norm: 0.4165 (0.4708) time: 5.5816 data: 0.0001 max mem: 71357 +[02:51:20.673668] Epoch: [3] [2100/6500] lr: 0.000010 closs: 0.7314 (0.7420) grad_norm: 0.5068 (0.4721) time: 5.5711 data: 0.0001 max mem: 71357 +[02:52:16.282560] Epoch: [3] [2110/6500] lr: 0.000010 closs: 0.7098 (0.7423) grad_norm: 0.5068 (0.4721) time: 5.5569 data: 0.0001 max mem: 71357 +[02:53:11.994560] Epoch: [3] [2120/6500] lr: 0.000010 closs: 0.7516 (0.7426) grad_norm: 0.4518 (0.4720) time: 5.5660 data: 0.0001 max mem: 71357 +[02:54:07.738349] Epoch: [3] [2130/6500] lr: 0.000010 closs: 0.7340 (0.7425) grad_norm: 0.4506 (0.4718) time: 5.5727 data: 0.0001 max mem: 71357 +[02:55:03.531949] Epoch: [3] [2140/6500] lr: 0.000010 closs: 0.6876 (0.7423) grad_norm: 0.4036 (0.4718) time: 5.5768 data: 0.0001 max mem: 71357 +[02:55:59.106827] Epoch: [3] [2150/6500] lr: 0.000010 closs: 0.6876 (0.7424) grad_norm: 0.4139 (0.4721) time: 5.5683 data: 0.0001 max mem: 71357 +[02:56:54.769431] Epoch: [3] [2160/6500] lr: 0.000010 closs: 0.7490 (0.7425) grad_norm: 0.4139 (0.4718) time: 5.5618 data: 0.0001 max mem: 71357 +[02:57:50.424404] Epoch: [3] [2170/6500] lr: 0.000010 closs: 0.6904 (0.7421) grad_norm: 0.4139 (0.4724) time: 5.5658 data: 0.0001 max mem: 71357 +[02:58:46.204630] Epoch: [3] [2180/6500] lr: 0.000010 closs: 0.7001 (0.7419) grad_norm: 0.4364 (0.4725) time: 5.5717 data: 0.0001 max mem: 71357 +[02:59:41.913462] Epoch: [3] [2190/6500] lr: 0.000010 closs: 0.7383 (0.7419) grad_norm: 0.4115 (0.4721) time: 5.5744 data: 0.0001 max mem: 71357 +[03:00:37.544665] Epoch: [3] [2200/6500] lr: 0.000010 closs: 0.7552 (0.7420) grad_norm: 0.3997 (0.4716) time: 5.5669 data: 0.0001 max mem: 71357 +[03:01:33.065049] Epoch: [3] [2210/6500] lr: 0.000010 closs: 0.7129 (0.7416) grad_norm: 0.4201 (0.4722) time: 5.5575 data: 0.0001 max mem: 71357 +[03:02:28.964229] Epoch: [3] [2220/6500] lr: 0.000010 closs: 0.7129 (0.7415) grad_norm: 0.3805 (0.4718) time: 5.5709 data: 0.0001 max mem: 71357 +[03:03:24.711773] Epoch: [3] [2230/6500] lr: 0.000010 closs: 0.7841 (0.7417) grad_norm: 0.4201 (0.4720) time: 5.5823 data: 0.0001 max mem: 71357 +[03:04:20.376127] Epoch: [3] [2240/6500] lr: 0.000010 closs: 0.7152 (0.7416) grad_norm: 0.4451 (0.4735) time: 5.5705 data: 0.0001 max mem: 71357 +[03:05:16.094408] Epoch: [3] [2250/6500] lr: 0.000010 closs: 0.7152 (0.7417) grad_norm: 0.3921 (0.4732) time: 5.5690 data: 0.0001 max mem: 71357 +[03:06:11.929158] Epoch: [3] [2260/6500] lr: 
0.000010 closs: 0.7516 (0.7419) grad_norm: 0.4235 (0.4729) time: 5.5776 data: 0.0001 max mem: 71357 +[03:07:07.681225] Epoch: [3] [2270/6500] lr: 0.000010 closs: 0.7474 (0.7419) grad_norm: 0.4103 (0.4726) time: 5.5793 data: 0.0001 max mem: 71357 +[03:08:03.330780] Epoch: [3] [2280/6500] lr: 0.000010 closs: 0.7457 (0.7421) grad_norm: 0.4028 (0.4724) time: 5.5700 data: 0.0001 max mem: 71357 +[03:08:59.036717] Epoch: [3] [2290/6500] lr: 0.000010 closs: 0.7586 (0.7420) grad_norm: 0.4277 (0.4728) time: 5.5677 data: 0.0001 max mem: 71357 +[03:09:54.753564] Epoch: [3] [2300/6500] lr: 0.000010 closs: 0.7384 (0.7419) grad_norm: 0.4449 (0.4728) time: 5.5710 data: 0.0001 max mem: 71357 +[03:10:50.585338] Epoch: [3] [2310/6500] lr: 0.000010 closs: 0.6609 (0.7415) grad_norm: 0.4297 (0.4726) time: 5.5773 data: 0.0001 max mem: 71357 +[03:11:46.300428] Epoch: [3] [2320/6500] lr: 0.000010 closs: 0.6929 (0.7417) grad_norm: 0.4388 (0.4725) time: 5.5773 data: 0.0001 max mem: 71357 +[03:12:42.024279] Epoch: [3] [2330/6500] lr: 0.000010 closs: 0.7588 (0.7419) grad_norm: 0.4257 (0.4725) time: 5.5719 data: 0.0001 max mem: 71357 +[03:13:37.667847] Epoch: [3] [2340/6500] lr: 0.000010 closs: 0.7638 (0.7420) grad_norm: 0.4236 (0.4723) time: 5.5683 data: 0.0001 max mem: 71357 +[03:14:33.402863] Epoch: [3] [2350/6500] lr: 0.000010 closs: 0.7448 (0.7419) grad_norm: 0.4299 (0.4724) time: 5.5688 data: 0.0001 max mem: 71357 +[03:15:29.150906] Epoch: [3] [2360/6500] lr: 0.000010 closs: 0.6643 (0.7415) grad_norm: 0.4257 (0.4721) time: 5.5741 data: 0.0001 max mem: 71357 +[03:16:24.888869] Epoch: [3] [2370/6500] lr: 0.000010 closs: 0.6643 (0.7413) grad_norm: 0.3967 (0.4717) time: 5.5742 data: 0.0001 max mem: 71357 +[03:17:20.515563] Epoch: [3] [2380/6500] lr: 0.000010 closs: 0.6623 (0.7412) grad_norm: 0.4207 (0.4722) time: 5.5682 data: 0.0001 max mem: 71357 +[03:18:16.227993] Epoch: [3] [2390/6500] lr: 0.000010 closs: 0.7421 (0.7414) grad_norm: 0.4128 (0.4721) time: 5.5669 data: 0.0001 max mem: 71357 +[03:19:12.133308] Epoch: [3] [2400/6500] lr: 0.000010 closs: 0.6836 (0.7410) grad_norm: 0.3865 (0.4718) time: 5.5808 data: 0.0001 max mem: 71357 +[03:20:07.896174] Epoch: [3] [2410/6500] lr: 0.000010 closs: 0.6543 (0.7409) grad_norm: 0.4128 (0.4717) time: 5.5833 data: 0.0001 max mem: 71357 +[03:21:03.510971] Epoch: [3] [2420/6500] lr: 0.000010 closs: 0.7160 (0.7409) grad_norm: 0.4059 (0.4716) time: 5.5688 data: 0.0001 max mem: 71357 +[03:21:59.207139] Epoch: [3] [2430/6500] lr: 0.000010 closs: 0.7137 (0.7409) grad_norm: 0.4059 (0.4717) time: 5.5655 data: 0.0001 max mem: 71357 +[03:22:55.037062] Epoch: [3] [2440/6500] lr: 0.000010 closs: 0.6463 (0.7407) grad_norm: 0.4322 (0.4724) time: 5.5762 data: 0.0001 max mem: 71357 +[03:23:50.643702] Epoch: [3] [2450/6500] lr: 0.000010 closs: 0.6266 (0.7404) grad_norm: 0.4504 (0.4727) time: 5.5717 data: 0.0001 max mem: 71357 +[03:24:46.427959] Epoch: [3] [2460/6500] lr: 0.000010 closs: 0.6887 (0.7405) grad_norm: 0.4398 (0.4724) time: 5.5694 data: 0.0001 max mem: 71357 +[03:25:42.142791] Epoch: [3] [2470/6500] lr: 0.000010 closs: 0.7283 (0.7406) grad_norm: 0.4504 (0.4723) time: 5.5749 data: 0.0001 max mem: 71357 +[03:26:38.004975] Epoch: [3] [2480/6500] lr: 0.000010 closs: 0.6969 (0.7404) grad_norm: 0.4335 (0.4719) time: 5.5788 data: 0.0001 max mem: 71357 +[03:27:33.744560] Epoch: [3] [2490/6500] lr: 0.000010 closs: 0.6872 (0.7404) grad_norm: 0.3993 (0.4719) time: 5.5800 data: 0.0001 max mem: 71357 +[03:28:29.410295] Epoch: [3] [2500/6500] lr: 0.000010 closs: 0.6872 (0.7403) grad_norm: 0.4124 
(0.4718) time: 5.5702 data: 0.0001 max mem: 71357 +[03:29:25.128655] Epoch: [3] [2510/6500] lr: 0.000009 closs: 0.7039 (0.7404) grad_norm: 0.3874 (0.4718) time: 5.5691 data: 0.0001 max mem: 71357 +[03:30:20.824465] Epoch: [3] [2520/6500] lr: 0.000009 closs: 0.7661 (0.7407) grad_norm: 0.3943 (0.4717) time: 5.5706 data: 0.0001 max mem: 71357 +[03:31:16.630889] Epoch: [3] [2530/6500] lr: 0.000009 closs: 0.7887 (0.7411) grad_norm: 0.3874 (0.4713) time: 5.5750 data: 0.0001 max mem: 71357 +[03:32:12.221908] Epoch: [3] [2540/6500] lr: 0.000009 closs: 0.7857 (0.7410) grad_norm: 0.3943 (0.4717) time: 5.5698 data: 0.0001 max mem: 71357 +[03:33:07.870496] Epoch: [3] [2550/6500] lr: 0.000009 closs: 0.7204 (0.7411) grad_norm: 0.4102 (0.4718) time: 5.5619 data: 0.0001 max mem: 71357 +[03:34:03.554231] Epoch: [3] [2560/6500] lr: 0.000009 closs: 0.7516 (0.7415) grad_norm: 0.4711 (0.4719) time: 5.5666 data: 0.0001 max mem: 71357 +[03:34:59.248273] Epoch: [3] [2570/6500] lr: 0.000009 closs: 0.7628 (0.7414) grad_norm: 0.4896 (0.4722) time: 5.5688 data: 0.0001 max mem: 71357 +[03:35:54.952768] Epoch: [3] [2580/6500] lr: 0.000009 closs: 0.6777 (0.7414) grad_norm: 0.4469 (0.4719) time: 5.5698 data: 0.0001 max mem: 71357 +[03:36:50.647324] Epoch: [3] [2590/6500] lr: 0.000009 closs: 0.7527 (0.7414) grad_norm: 0.4279 (0.4720) time: 5.5699 data: 0.0001 max mem: 71357 +[03:37:46.354833] Epoch: [3] [2600/6500] lr: 0.000009 closs: 0.7692 (0.7415) grad_norm: 0.3699 (0.4717) time: 5.5700 data: 0.0001 max mem: 71357 +[03:38:41.964847] Epoch: [3] [2610/6500] lr: 0.000009 closs: 0.7800 (0.7416) grad_norm: 0.3615 (0.4716) time: 5.5658 data: 0.0001 max mem: 71357 +[03:39:37.792299] Epoch: [3] [2620/6500] lr: 0.000009 closs: 0.6826 (0.7413) grad_norm: 0.4102 (0.4716) time: 5.5718 data: 0.0001 max mem: 71357 +[03:40:33.423699] Epoch: [3] [2630/6500] lr: 0.000009 closs: 0.7121 (0.7411) grad_norm: 0.3668 (0.4712) time: 5.5729 data: 0.0001 max mem: 71357 +[03:41:29.121161] Epoch: [3] [2640/6500] lr: 0.000009 closs: 0.7084 (0.7409) grad_norm: 0.4153 (0.4710) time: 5.5663 data: 0.0001 max mem: 71357 +[03:42:24.835125] Epoch: [3] [2650/6500] lr: 0.000009 closs: 0.7084 (0.7410) grad_norm: 0.3918 (0.4706) time: 5.5705 data: 0.0001 max mem: 71357 +[03:43:20.618505] Epoch: [3] [2660/6500] lr: 0.000009 closs: 0.7455 (0.7410) grad_norm: 0.3730 (0.4704) time: 5.5748 data: 0.0001 max mem: 71357 +[03:44:16.323482] Epoch: [3] [2670/6500] lr: 0.000009 closs: 0.7791 (0.7412) grad_norm: 0.3868 (0.4702) time: 5.5743 data: 0.0001 max mem: 71357 +[03:45:12.012818] Epoch: [3] [2680/6500] lr: 0.000009 closs: 0.7665 (0.7411) grad_norm: 0.3774 (0.4701) time: 5.5696 data: 0.0001 max mem: 71357 +[03:46:07.663209] Epoch: [3] [2690/6500] lr: 0.000009 closs: 0.6634 (0.7410) grad_norm: 0.4084 (0.4700) time: 5.5669 data: 0.0001 max mem: 71357 +[03:47:03.385219] Epoch: [3] [2700/6500] lr: 0.000009 closs: 0.7106 (0.7409) grad_norm: 0.4270 (0.4699) time: 5.5685 data: 0.0001 max mem: 71357 +[03:47:59.094951] Epoch: [3] [2710/6500] lr: 0.000009 closs: 0.6913 (0.7409) grad_norm: 0.4162 (0.4697) time: 5.5715 data: 0.0001 max mem: 71357 +[03:48:54.689454] Epoch: [3] [2720/6500] lr: 0.000009 closs: 0.6913 (0.7408) grad_norm: 0.4328 (0.4699) time: 5.5651 data: 0.0001 max mem: 71357 +[03:49:50.392630] Epoch: [3] [2730/6500] lr: 0.000009 closs: 0.7583 (0.7411) grad_norm: 0.4328 (0.4698) time: 5.5648 data: 0.0001 max mem: 71357 +[03:50:46.064539] Epoch: [3] [2740/6500] lr: 0.000009 closs: 0.7172 (0.7408) grad_norm: 0.4328 (0.4697) time: 5.5687 data: 0.0001 max mem: 71357 
+[03:51:41.898318] Epoch: [3] [2750/6500] lr: 0.000009 closs: 0.7137 (0.7409) grad_norm: 0.4655 (0.4696) time: 5.5752 data: 0.0001 max mem: 71357 +[03:52:37.622614] Epoch: [3] [2760/6500] lr: 0.000009 closs: 0.7675 (0.7411) grad_norm: 0.4293 (0.4695) time: 5.5778 data: 0.0001 max mem: 71357 +[03:53:33.270434] Epoch: [3] [2770/6500] lr: 0.000009 closs: 0.7621 (0.7410) grad_norm: 0.4293 (0.4694) time: 5.5685 data: 0.0001 max mem: 71357 +[03:54:28.938310] Epoch: [3] [2780/6500] lr: 0.000009 closs: 0.7231 (0.7410) grad_norm: 0.4198 (0.4695) time: 5.5657 data: 0.0001 max mem: 71357 +[03:55:24.636676] Epoch: [3] [2790/6500] lr: 0.000009 closs: 0.7205 (0.7410) grad_norm: 0.4177 (0.4694) time: 5.5683 data: 0.0001 max mem: 71357 +[03:56:20.341616] Epoch: [3] [2800/6500] lr: 0.000009 closs: 0.7450 (0.7414) grad_norm: 0.4198 (0.4702) time: 5.5701 data: 0.0001 max mem: 71357 +[03:57:16.007696] Epoch: [3] [2810/6500] lr: 0.000009 closs: 0.7234 (0.7413) grad_norm: 0.4308 (0.4709) time: 5.5685 data: 0.0001 max mem: 71357 +[03:58:11.641875] Epoch: [3] [2820/6500] lr: 0.000009 closs: 0.6968 (0.7413) grad_norm: 0.4058 (0.4706) time: 5.5649 data: 0.0001 max mem: 71357 +[03:59:07.419506] Epoch: [3] [2830/6500] lr: 0.000009 closs: 0.7425 (0.7414) grad_norm: 0.4037 (0.4706) time: 5.5705 data: 0.0001 max mem: 71357 +[04:00:03.264856] Epoch: [3] [2840/6500] lr: 0.000009 closs: 0.7550 (0.7416) grad_norm: 0.4037 (0.4703) time: 5.5811 data: 0.0001 max mem: 71357 +[04:00:58.906002] Epoch: [3] [2850/6500] lr: 0.000009 closs: 0.7550 (0.7417) grad_norm: 0.3923 (0.4703) time: 5.5743 data: 0.0001 max mem: 71357 +[04:01:54.598558] Epoch: [3] [2860/6500] lr: 0.000009 closs: 0.7611 (0.7416) grad_norm: 0.4231 (0.4702) time: 5.5666 data: 0.0001 max mem: 71357 +[04:02:50.245016] Epoch: [3] [2870/6500] lr: 0.000009 closs: 0.7677 (0.7419) grad_norm: 0.4231 (0.4703) time: 5.5669 data: 0.0001 max mem: 71357 +[04:03:45.974594] Epoch: [3] [2880/6500] lr: 0.000009 closs: 0.7250 (0.7417) grad_norm: 0.4318 (0.4703) time: 5.5687 data: 0.0001 max mem: 71357 +[04:04:41.643615] Epoch: [3] [2890/6500] lr: 0.000009 closs: 0.7242 (0.7419) grad_norm: 0.4211 (0.4705) time: 5.5698 data: 0.0001 max mem: 71357 +[04:05:37.230252] Epoch: [3] [2900/6500] lr: 0.000009 closs: 0.7474 (0.7420) grad_norm: 0.4166 (0.4704) time: 5.5627 data: 0.0001 max mem: 71357 +[04:06:32.951079] Epoch: [3] [2910/6500] lr: 0.000009 closs: 0.7044 (0.7418) grad_norm: 0.4166 (0.4703) time: 5.5653 data: 0.0001 max mem: 71357 +[04:07:28.763809] Epoch: [3] [2920/6500] lr: 0.000009 closs: 0.6616 (0.7417) grad_norm: 0.4041 (0.4698) time: 5.5766 data: 0.0001 max mem: 71357 +[04:08:24.491781] Epoch: [3] [2930/6500] lr: 0.000009 closs: 0.6237 (0.7413) grad_norm: 0.4075 (0.4698) time: 5.5770 data: 0.0001 max mem: 71357 +[04:09:20.165615] Epoch: [3] [2940/6500] lr: 0.000009 closs: 0.6831 (0.7411) grad_norm: 0.4126 (0.4697) time: 5.5700 data: 0.0001 max mem: 71357 +[04:10:15.890629] Epoch: [3] [2950/6500] lr: 0.000009 closs: 0.6921 (0.7410) grad_norm: 0.4271 (0.4699) time: 5.5699 data: 0.0001 max mem: 71357 +[04:11:11.666969] Epoch: [3] [2960/6500] lr: 0.000009 closs: 0.6946 (0.7409) grad_norm: 0.4273 (0.4696) time: 5.5750 data: 0.0001 max mem: 71357 +[04:12:07.462625] Epoch: [3] [2970/6500] lr: 0.000009 closs: 0.6784 (0.7407) grad_norm: 0.3911 (0.4695) time: 5.5785 data: 0.0001 max mem: 71357 +[04:13:03.238329] Epoch: [3] [2980/6500] lr: 0.000009 closs: 0.7087 (0.7407) grad_norm: 0.3911 (0.4693) time: 5.5785 data: 0.0001 max mem: 71357 +[04:13:58.891238] Epoch: [3] [2990/6500] lr: 
0.000009 closs: 0.7087 (0.7404) grad_norm: 0.3911 (0.4692) time: 5.5713 data: 0.0001 max mem: 71357 +[04:14:54.493479] Epoch: [3] [3000/6500] lr: 0.000008 closs: 0.6673 (0.7403) grad_norm: 0.4436 (0.4696) time: 5.5627 data: 0.0001 max mem: 71357 +[04:15:50.321151] Epoch: [3] [3010/6500] lr: 0.000008 closs: 0.7433 (0.7404) grad_norm: 0.4436 (0.4694) time: 5.5714 data: 0.0001 max mem: 71357 +[04:16:46.107372] Epoch: [3] [3020/6500] lr: 0.000008 closs: 0.7592 (0.7406) grad_norm: 0.4476 (0.4693) time: 5.5806 data: 0.0001 max mem: 71357 +[04:17:41.774052] Epoch: [3] [3030/6500] lr: 0.000008 closs: 0.7118 (0.7405) grad_norm: 0.4055 (0.4690) time: 5.5725 data: 0.0001 max mem: 71357 +[04:18:37.357139] Epoch: [3] [3040/6500] lr: 0.000008 closs: 0.7045 (0.7403) grad_norm: 0.3980 (0.4691) time: 5.5624 data: 0.0001 max mem: 71357 +[04:19:33.179800] Epoch: [3] [3050/6500] lr: 0.000008 closs: 0.7116 (0.7405) grad_norm: 0.3708 (0.4687) time: 5.5702 data: 0.0001 max mem: 71357 +[04:20:29.004476] Epoch: [3] [3060/6500] lr: 0.000008 closs: 0.7054 (0.7404) grad_norm: 0.3656 (0.4685) time: 5.5823 data: 0.0001 max mem: 71357 +[04:21:24.716825] Epoch: [3] [3070/6500] lr: 0.000008 closs: 0.7146 (0.7405) grad_norm: 0.3758 (0.4684) time: 5.5768 data: 0.0001 max mem: 71357 +[04:22:20.360022] Epoch: [3] [3080/6500] lr: 0.000008 closs: 0.7148 (0.7403) grad_norm: 0.3667 (0.4683) time: 5.5677 data: 0.0001 max mem: 71357 +[04:23:16.035938] Epoch: [3] [3090/6500] lr: 0.000008 closs: 0.7148 (0.7403) grad_norm: 0.3956 (0.4683) time: 5.5659 data: 0.0001 max mem: 71357 +[04:24:11.851079] Epoch: [3] [3100/6500] lr: 0.000008 closs: 0.6789 (0.7400) grad_norm: 0.4137 (0.4680) time: 5.5745 data: 0.0001 max mem: 71357 +[04:25:07.496748] Epoch: [3] [3110/6500] lr: 0.000008 closs: 0.6925 (0.7402) grad_norm: 0.3955 (0.4684) time: 5.5730 data: 0.0001 max mem: 71357 +[04:26:03.184320] Epoch: [3] [3120/6500] lr: 0.000008 closs: 0.7409 (0.7402) grad_norm: 0.3955 (0.4682) time: 5.5666 data: 0.0001 max mem: 71357 +[04:26:58.826254] Epoch: [3] [3130/6500] lr: 0.000008 closs: 0.7133 (0.7402) grad_norm: 0.4034 (0.4684) time: 5.5664 data: 0.0001 max mem: 71357 +[04:27:54.567322] Epoch: [3] [3140/6500] lr: 0.000008 closs: 0.6893 (0.7399) grad_norm: 0.4219 (0.4689) time: 5.5691 data: 0.0001 max mem: 71357 +[04:28:50.349706] Epoch: [3] [3150/6500] lr: 0.000008 closs: 0.6893 (0.7402) grad_norm: 0.4405 (0.4689) time: 5.5761 data: 0.0001 max mem: 71357 +[04:29:46.023632] Epoch: [3] [3160/6500] lr: 0.000008 closs: 0.7973 (0.7404) grad_norm: 0.4484 (0.4690) time: 5.5727 data: 0.0001 max mem: 71357 +[04:30:41.687580] Epoch: [3] [3170/6500] lr: 0.000008 closs: 0.7909 (0.7406) grad_norm: 0.4616 (0.4695) time: 5.5668 data: 0.0001 max mem: 71357 +[04:31:37.298065] Epoch: [3] [3180/6500] lr: 0.000008 closs: 0.7909 (0.7408) grad_norm: 0.4616 (0.4695) time: 5.5636 data: 0.0001 max mem: 71357 +[04:32:33.030157] Epoch: [3] [3190/6500] lr: 0.000008 closs: 0.8091 (0.7411) grad_norm: 0.4743 (0.4695) time: 5.5670 data: 0.0001 max mem: 71357 +[04:33:28.658546] Epoch: [3] [3200/6500] lr: 0.000008 closs: 0.7844 (0.7411) grad_norm: 0.4590 (0.4698) time: 5.5679 data: 0.0001 max mem: 71357 +[04:34:24.435166] Epoch: [3] [3210/6500] lr: 0.000008 closs: 0.7599 (0.7413) grad_norm: 0.4360 (0.4700) time: 5.5702 data: 0.0001 max mem: 71357 +[04:35:20.170339] Epoch: [3] [3220/6500] lr: 0.000008 closs: 0.7599 (0.7411) grad_norm: 0.4270 (0.4699) time: 5.5755 data: 0.0001 max mem: 71357 +[04:36:15.966734] Epoch: [3] [3230/6500] lr: 0.000008 closs: 0.6817 (0.7410) grad_norm: 0.4210 
(0.4698) time: 5.5765 data: 0.0001 max mem: 71357 +[04:37:11.668373] Epoch: [3] [3240/6500] lr: 0.000008 closs: 0.6817 (0.7409) grad_norm: 0.4270 (0.4698) time: 5.5748 data: 0.0001 max mem: 71357 +[04:38:07.436973] Epoch: [3] [3250/6500] lr: 0.000008 closs: 0.7496 (0.7410) grad_norm: 0.4443 (0.4702) time: 5.5734 data: 0.0001 max mem: 71357 +[04:39:03.208428] Epoch: [3] [3260/6500] lr: 0.000008 closs: 0.6809 (0.7408) grad_norm: 0.4437 (0.4700) time: 5.5769 data: 0.0001 max mem: 71357 +[04:39:58.909951] Epoch: [3] [3270/6500] lr: 0.000008 closs: 0.6809 (0.7408) grad_norm: 0.4473 (0.4700) time: 5.5735 data: 0.0001 max mem: 71357 +[04:40:54.726343] Epoch: [3] [3280/6500] lr: 0.000008 closs: 0.7186 (0.7407) grad_norm: 0.4473 (0.4701) time: 5.5758 data: 0.0001 max mem: 71357 +[04:41:50.415725] Epoch: [3] [3290/6500] lr: 0.000008 closs: 0.7186 (0.7407) grad_norm: 0.4437 (0.4699) time: 5.5752 data: 0.0001 max mem: 71357 +[04:42:46.123758] Epoch: [3] [3300/6500] lr: 0.000008 closs: 0.7253 (0.7407) grad_norm: 0.4245 (0.4697) time: 5.5698 data: 0.0001 max mem: 71357 +[04:43:41.840664] Epoch: [3] [3310/6500] lr: 0.000008 closs: 0.6994 (0.7406) grad_norm: 0.4150 (0.4696) time: 5.5712 data: 0.0001 max mem: 71357 +[04:44:37.594666] Epoch: [3] [3320/6500] lr: 0.000008 closs: 0.7145 (0.7406) grad_norm: 0.4150 (0.4695) time: 5.5735 data: 0.0001 max mem: 71357 +[04:45:33.240389] Epoch: [3] [3330/6500] lr: 0.000008 closs: 0.7095 (0.7405) grad_norm: 0.4161 (0.4696) time: 5.5699 data: 0.0001 max mem: 71357 +[04:46:28.959317] Epoch: [3] [3340/6500] lr: 0.000008 closs: 0.7035 (0.7405) grad_norm: 0.4373 (0.4700) time: 5.5681 data: 0.0001 max mem: 71357 +[04:47:24.606653] Epoch: [3] [3350/6500] lr: 0.000008 closs: 0.7035 (0.7404) grad_norm: 0.4373 (0.4700) time: 5.5682 data: 0.0001 max mem: 71357 +[04:48:20.364255] Epoch: [3] [3360/6500] lr: 0.000008 closs: 0.7093 (0.7403) grad_norm: 0.4540 (0.4699) time: 5.5702 data: 0.0001 max mem: 71357 +[04:49:16.119586] Epoch: [3] [3370/6500] lr: 0.000008 closs: 0.7780 (0.7404) grad_norm: 0.4645 (0.4699) time: 5.5756 data: 0.0001 max mem: 71357 +[04:50:11.776158] Epoch: [3] [3380/6500] lr: 0.000008 closs: 0.8124 (0.7406) grad_norm: 0.4137 (0.4697) time: 5.5705 data: 0.0001 max mem: 71357 +[04:51:07.474741] Epoch: [3] [3390/6500] lr: 0.000008 closs: 0.7139 (0.7405) grad_norm: 0.4485 (0.4700) time: 5.5677 data: 0.0001 max mem: 71357 +[04:52:03.267438] Epoch: [3] [3400/6500] lr: 0.000008 closs: 0.6892 (0.7405) grad_norm: 0.3771 (0.4697) time: 5.5745 data: 0.0001 max mem: 71357 +[04:52:59.023647] Epoch: [3] [3410/6500] lr: 0.000008 closs: 0.7276 (0.7407) grad_norm: 0.3916 (0.4701) time: 5.5774 data: 0.0001 max mem: 71357 +[04:53:54.706683] Epoch: [3] [3420/6500] lr: 0.000008 closs: 0.6954 (0.7408) grad_norm: 0.3916 (0.4698) time: 5.5719 data: 0.0001 max mem: 71357 +[04:54:50.377473] Epoch: [3] [3430/6500] lr: 0.000008 closs: 0.6899 (0.7407) grad_norm: 0.4018 (0.4699) time: 5.5676 data: 0.0001 max mem: 71357 +[04:55:46.036905] Epoch: [3] [3440/6500] lr: 0.000008 closs: 0.7141 (0.7407) grad_norm: 0.4018 (0.4698) time: 5.5664 data: 0.0001 max mem: 71357 +[04:56:41.749545] Epoch: [3] [3450/6500] lr: 0.000008 closs: 0.7220 (0.7406) grad_norm: 0.3857 (0.4696) time: 5.5685 data: 0.0001 max mem: 71357 +[04:57:37.559081] Epoch: [3] [3460/6500] lr: 0.000008 closs: 0.7657 (0.7406) grad_norm: 0.3844 (0.4695) time: 5.5760 data: 0.0001 max mem: 71357 +[04:58:33.212538] Epoch: [3] [3470/6500] lr: 0.000008 closs: 0.7399 (0.7406) grad_norm: 0.3841 (0.4693) time: 5.5731 data: 0.0001 max mem: 71357 
+[04:59:28.895834] Epoch: [3] [3480/6500] lr: 0.000008 closs: 0.7152 (0.7408) grad_norm: 0.4047 (0.4693) time: 5.5668 data: 0.0001 max mem: 71357 +[05:00:24.526761] Epoch: [3] [3490/6500] lr: 0.000008 closs: 0.7117 (0.7407) grad_norm: 0.4259 (0.4694) time: 5.5656 data: 0.0001 max mem: 71357 +[05:01:20.288024] Epoch: [3] [3500/6500] lr: 0.000008 closs: 0.7111 (0.7405) grad_norm: 0.4359 (0.4695) time: 5.5695 data: 0.0001 max mem: 71357 +[05:02:15.993095] Epoch: [3] [3510/6500] lr: 0.000008 closs: 0.6038 (0.7403) grad_norm: 0.4570 (0.4696) time: 5.5733 data: 0.0001 max mem: 71357 +[05:03:11.767699] Epoch: [3] [3520/6500] lr: 0.000008 closs: 0.6809 (0.7404) grad_norm: 0.4570 (0.4696) time: 5.5739 data: 0.0001 max mem: 71357 +[05:04:07.414649] Epoch: [3] [3530/6500] lr: 0.000008 closs: 0.7217 (0.7403) grad_norm: 0.4787 (0.4703) time: 5.5710 data: 0.0001 max mem: 71357 +[05:05:03.283087] Epoch: [3] [3540/6500] lr: 0.000008 closs: 0.7217 (0.7402) grad_norm: 0.3929 (0.4701) time: 5.5757 data: 0.0001 max mem: 71357 +[05:05:58.925727] Epoch: [3] [3550/6500] lr: 0.000007 closs: 0.7480 (0.7403) grad_norm: 0.3891 (0.4698) time: 5.5754 data: 0.0001 max mem: 71357 +[05:06:54.610701] Epoch: [3] [3560/6500] lr: 0.000007 closs: 0.7596 (0.7405) grad_norm: 0.3929 (0.4698) time: 5.5663 data: 0.0001 max mem: 71357 +[05:07:50.323204] Epoch: [3] [3570/6500] lr: 0.000007 closs: 0.7339 (0.7403) grad_norm: 0.3710 (0.4695) time: 5.5698 data: 0.0001 max mem: 71357 +[05:08:46.047153] Epoch: [3] [3580/6500] lr: 0.000007 closs: 0.6691 (0.7402) grad_norm: 0.3913 (0.4694) time: 5.5718 data: 0.0001 max mem: 71357 +[05:09:41.796100] Epoch: [3] [3590/6500] lr: 0.000007 closs: 0.7296 (0.7402) grad_norm: 0.4097 (0.4695) time: 5.5736 data: 0.0001 max mem: 71357 +[05:10:37.416644] Epoch: [3] [3600/6500] lr: 0.000007 closs: 0.7296 (0.7400) grad_norm: 0.4042 (0.4696) time: 5.5684 data: 0.0001 max mem: 71357 +[05:11:33.195633] Epoch: [3] [3610/6500] lr: 0.000007 closs: 0.6830 (0.7399) grad_norm: 0.4097 (0.4693) time: 5.5699 data: 0.0001 max mem: 71357 +[05:12:28.863592] Epoch: [3] [3620/6500] lr: 0.000007 closs: 0.7189 (0.7399) grad_norm: 0.4191 (0.4693) time: 5.5723 data: 0.0001 max mem: 71357 +[05:13:24.628791] Epoch: [3] [3630/6500] lr: 0.000007 closs: 0.6991 (0.7398) grad_norm: 0.4191 (0.4692) time: 5.5716 data: 0.0001 max mem: 71357 +[05:14:20.229395] Epoch: [3] [3640/6500] lr: 0.000007 closs: 0.7327 (0.7402) grad_norm: 0.3995 (0.4690) time: 5.5682 data: 0.0001 max mem: 71357 +[05:15:15.944815] Epoch: [3] [3650/6500] lr: 0.000007 closs: 0.7718 (0.7402) grad_norm: 0.4191 (0.4692) time: 5.5657 data: 0.0001 max mem: 71357 +[05:16:11.642121] Epoch: [3] [3660/6500] lr: 0.000007 closs: 0.7550 (0.7403) grad_norm: 0.4011 (0.4695) time: 5.5705 data: 0.0001 max mem: 71357 +[05:17:07.479650] Epoch: [3] [3670/6500] lr: 0.000007 closs: 0.7673 (0.7402) grad_norm: 0.3984 (0.4693) time: 5.5767 data: 0.0001 max mem: 71357 +[05:18:03.202417] Epoch: [3] [3680/6500] lr: 0.000007 closs: 0.7821 (0.7403) grad_norm: 0.3894 (0.4694) time: 5.5780 data: 0.0001 max mem: 71357 +[05:18:58.921587] Epoch: [3] [3690/6500] lr: 0.000007 closs: 0.7572 (0.7405) grad_norm: 0.3894 (0.4693) time: 5.5720 data: 0.0001 max mem: 71357 +[05:19:54.690134] Epoch: [3] [3700/6500] lr: 0.000007 closs: 0.7508 (0.7406) grad_norm: 0.4002 (0.4692) time: 5.5743 data: 0.0001 max mem: 71357 +[05:20:50.328799] Epoch: [3] [3710/6500] lr: 0.000007 closs: 0.6726 (0.7405) grad_norm: 0.4002 (0.4689) time: 5.5703 data: 0.0001 max mem: 71357 +[05:21:46.081446] Epoch: [3] [3720/6500] lr: 
0.000007 closs: 0.6688 (0.7405) grad_norm: 0.4002 (0.4689) time: 5.5695 data: 0.0001 max mem: 71357 +[05:22:41.792831] Epoch: [3] [3730/6500] lr: 0.000007 closs: 0.7399 (0.7405) grad_norm: 0.4459 (0.4693) time: 5.5731 data: 0.0001 max mem: 71357 +[05:23:37.391272] Epoch: [3] [3740/6500] lr: 0.000007 closs: 0.7180 (0.7404) grad_norm: 0.4610 (0.4693) time: 5.5654 data: 0.0001 max mem: 71357 +[05:24:33.095997] Epoch: [3] [3750/6500] lr: 0.000007 closs: 0.6927 (0.7403) grad_norm: 0.4757 (0.4694) time: 5.5651 data: 0.0001 max mem: 71357 +[05:25:28.837747] Epoch: [3] [3760/6500] lr: 0.000007 closs: 0.7164 (0.7405) grad_norm: 0.4757 (0.4696) time: 5.5723 data: 0.0001 max mem: 71357 +[05:26:24.545897] Epoch: [3] [3770/6500] lr: 0.000007 closs: 0.7164 (0.7405) grad_norm: 0.4634 (0.4695) time: 5.5724 data: 0.0001 max mem: 71357 +[05:27:20.234683] Epoch: [3] [3780/6500] lr: 0.000007 closs: 0.7184 (0.7405) grad_norm: 0.4144 (0.4693) time: 5.5698 data: 0.0001 max mem: 71357 +[05:28:15.998002] Epoch: [3] [3790/6500] lr: 0.000007 closs: 0.7094 (0.7405) grad_norm: 0.3937 (0.4692) time: 5.5725 data: 0.0001 max mem: 71357 +[05:29:11.732433] Epoch: [3] [3800/6500] lr: 0.000007 closs: 0.6876 (0.7406) grad_norm: 0.3814 (0.4693) time: 5.5748 data: 0.0001 max mem: 71357 +[05:30:07.575713] Epoch: [3] [3810/6500] lr: 0.000007 closs: 0.7399 (0.7407) grad_norm: 0.3745 (0.4689) time: 5.5788 data: 0.0001 max mem: 71357 +[05:31:03.224368] Epoch: [3] [3820/6500] lr: 0.000007 closs: 0.7399 (0.7408) grad_norm: 0.3791 (0.4691) time: 5.5745 data: 0.0001 max mem: 71357 +[05:31:58.833849] Epoch: [3] [3830/6500] lr: 0.000007 closs: 0.7631 (0.7406) grad_norm: 0.4356 (0.4691) time: 5.5628 data: 0.0001 max mem: 71357 +[05:32:54.496305] Epoch: [3] [3840/6500] lr: 0.000007 closs: 0.6846 (0.7406) grad_norm: 0.4207 (0.4695) time: 5.5635 data: 0.0001 max mem: 71357 +[05:33:50.347789] Epoch: [3] [3850/6500] lr: 0.000007 closs: 0.7558 (0.7407) grad_norm: 0.5182 (0.4699) time: 5.5756 data: 0.0001 max mem: 71357 +[05:34:46.039279] Epoch: [3] [3860/6500] lr: 0.000007 closs: 0.7782 (0.7407) grad_norm: 0.4689 (0.4698) time: 5.5771 data: 0.0001 max mem: 71357 +[05:35:41.780086] Epoch: [3] [3870/6500] lr: 0.000007 closs: 0.7601 (0.7408) grad_norm: 0.4636 (0.4706) time: 5.5716 data: 0.0001 max mem: 71357 +[05:36:37.492954] Epoch: [3] [3880/6500] lr: 0.000007 closs: 0.7601 (0.7409) grad_norm: 0.4093 (0.4704) time: 5.5726 data: 0.0001 max mem: 71357 +[05:37:33.218084] Epoch: [3] [3890/6500] lr: 0.000007 closs: 0.7292 (0.7407) grad_norm: 0.3931 (0.4702) time: 5.5718 data: 0.0001 max mem: 71357 +[05:38:29.017484] Epoch: [3] [3900/6500] lr: 0.000007 closs: 0.7349 (0.7407) grad_norm: 0.4056 (0.4702) time: 5.5761 data: 0.0001 max mem: 71357 +[05:39:24.708152] Epoch: [3] [3910/6500] lr: 0.000007 closs: 0.8072 (0.7410) grad_norm: 0.3931 (0.4700) time: 5.5744 data: 0.0001 max mem: 71357 +[05:40:20.454098] Epoch: [3] [3920/6500] lr: 0.000007 closs: 0.8075 (0.7412) grad_norm: 0.3901 (0.4698) time: 5.5717 data: 0.0001 max mem: 71357 +[05:41:16.072069] Epoch: [3] [3930/6500] lr: 0.000007 closs: 0.7397 (0.7413) grad_norm: 0.3941 (0.4698) time: 5.5681 data: 0.0001 max mem: 71357 +[05:42:11.874456] Epoch: [3] [3940/6500] lr: 0.000007 closs: 0.7319 (0.7412) grad_norm: 0.3901 (0.4697) time: 5.5710 data: 0.0001 max mem: 71357 +[05:43:07.571147] Epoch: [3] [3950/6500] lr: 0.000007 closs: 0.7194 (0.7411) grad_norm: 0.4072 (0.4699) time: 5.5749 data: 0.0001 max mem: 71357 +[05:44:03.269451] Epoch: [3] [3960/6500] lr: 0.000007 closs: 0.6988 (0.7410) grad_norm: 0.4316 
(0.4697) time: 5.5697 data: 0.0001 max mem: 71357 +[05:44:58.911292] Epoch: [3] [3970/6500] lr: 0.000007 closs: 0.6769 (0.7411) grad_norm: 0.4400 (0.4698) time: 5.5669 data: 0.0001 max mem: 71357 +[05:45:54.762157] Epoch: [3] [3980/6500] lr: 0.000007 closs: 0.6727 (0.7409) grad_norm: 0.4316 (0.4695) time: 5.5746 data: 0.0001 max mem: 71357 +[05:46:50.420700] Epoch: [3] [3990/6500] lr: 0.000007 closs: 0.7395 (0.7412) grad_norm: 0.4151 (0.4694) time: 5.5754 data: 0.0001 max mem: 71357 +[05:47:46.149461] Epoch: [3] [4000/6500] lr: 0.000007 closs: 0.7470 (0.7409) grad_norm: 0.4080 (0.4692) time: 5.5693 data: 0.0001 max mem: 71357 +[05:48:41.905023] Epoch: [3] [4010/6500] lr: 0.000007 closs: 0.6684 (0.7408) grad_norm: 0.3969 (0.4693) time: 5.5741 data: 0.0001 max mem: 71357 +[05:49:37.663723] Epoch: [3] [4020/6500] lr: 0.000007 closs: 0.7380 (0.7409) grad_norm: 0.4290 (0.4692) time: 5.5756 data: 0.0001 max mem: 71357 +[05:50:33.353764] Epoch: [3] [4030/6500] lr: 0.000007 closs: 0.7211 (0.7407) grad_norm: 0.4290 (0.4699) time: 5.5724 data: 0.0001 max mem: 71357 +[05:51:29.001387] Epoch: [3] [4040/6500] lr: 0.000007 closs: 0.7211 (0.7408) grad_norm: 0.4670 (0.4702) time: 5.5668 data: 0.0001 max mem: 71357 +[05:52:24.657192] Epoch: [3] [4050/6500] lr: 0.000007 closs: 0.7172 (0.7407) grad_norm: 0.4290 (0.4704) time: 5.5651 data: 0.0001 max mem: 71357 +[05:53:20.370617] Epoch: [3] [4060/6500] lr: 0.000007 closs: 0.6402 (0.7406) grad_norm: 0.4262 (0.4703) time: 5.5684 data: 0.0001 max mem: 71357 +[05:54:16.087743] Epoch: [3] [4070/6500] lr: 0.000007 closs: 0.6991 (0.7406) grad_norm: 0.4298 (0.4702) time: 5.5714 data: 0.0001 max mem: 71357 +[05:55:11.720085] Epoch: [3] [4080/6500] lr: 0.000007 closs: 0.7579 (0.7408) grad_norm: 0.4249 (0.4702) time: 5.5674 data: 0.0001 max mem: 71357 +[05:56:07.403518] Epoch: [3] [4090/6500] lr: 0.000007 closs: 0.7559 (0.7408) grad_norm: 0.4282 (0.4702) time: 5.5657 data: 0.0001 max mem: 71357 +[05:57:03.082301] Epoch: [3] [4100/6500] lr: 0.000007 closs: 0.6847 (0.7406) grad_norm: 0.3763 (0.4701) time: 5.5680 data: 0.0001 max mem: 71357 +[05:57:58.773781] Epoch: [3] [4110/6500] lr: 0.000007 closs: 0.6676 (0.7406) grad_norm: 0.3763 (0.4703) time: 5.5684 data: 0.0001 max mem: 71357 +[05:58:54.385862] Epoch: [3] [4120/6500] lr: 0.000007 closs: 0.7262 (0.7404) grad_norm: 0.4959 (0.4706) time: 5.5651 data: 0.0001 max mem: 71357 +[05:59:50.104040] Epoch: [3] [4130/6500] lr: 0.000007 closs: 0.7262 (0.7403) grad_norm: 0.4959 (0.4705) time: 5.5664 data: 0.0001 max mem: 71357 +[06:00:45.692846] Epoch: [3] [4140/6500] lr: 0.000007 closs: 0.7231 (0.7404) grad_norm: 0.5343 (0.4706) time: 5.5653 data: 0.0001 max mem: 71357 +[06:01:41.355915] Epoch: [3] [4150/6500] lr: 0.000007 closs: 0.7379 (0.7404) grad_norm: 0.5177 (0.4707) time: 5.5625 data: 0.0001 max mem: 71357 +[06:02:37.089968] Epoch: [3] [4160/6500] lr: 0.000007 closs: 0.7380 (0.7404) grad_norm: 0.4467 (0.4707) time: 5.5698 data: 0.0001 max mem: 71357 +[06:03:32.774291] Epoch: [3] [4170/6500] lr: 0.000007 closs: 0.7332 (0.7404) grad_norm: 0.5048 (0.4711) time: 5.5708 data: 0.0001 max mem: 71357 +[06:04:28.497632] Epoch: [3] [4180/6500] lr: 0.000007 closs: 0.7578 (0.7405) grad_norm: 0.4494 (0.4712) time: 5.5703 data: 0.0001 max mem: 71357 +[06:05:24.216613] Epoch: [3] [4190/6500] lr: 0.000007 closs: 0.7601 (0.7405) grad_norm: 0.4129 (0.4710) time: 5.5720 data: 0.0001 max mem: 71357 +[06:06:20.098227] Epoch: [3] [4200/6500] lr: 0.000007 closs: 0.7665 (0.7406) grad_norm: 0.3923 (0.4710) time: 5.5800 data: 0.0001 max mem: 71357 
+[06:07:15.828052] Epoch: [3] [4210/6500] lr: 0.000007 closs: 0.7841 (0.7408) grad_norm: 0.3622 (0.4709) time: 5.5805 data: 0.0001 max mem: 71357 +[06:08:11.532027] Epoch: [3] [4220/6500] lr: 0.000007 closs: 0.7194 (0.7406) grad_norm: 0.3691 (0.4706) time: 5.5716 data: 0.0001 max mem: 71357 +[06:09:07.164298] Epoch: [3] [4230/6500] lr: 0.000006 closs: 0.6699 (0.7404) grad_norm: 0.3812 (0.4724) time: 5.5667 data: 0.0001 max mem: 71357 +[06:10:02.992629] Epoch: [3] [4240/6500] lr: 0.000006 closs: 0.6629 (0.7404) grad_norm: 0.4220 (0.4724) time: 5.5729 data: 0.0001 max mem: 71357 +[06:10:58.745878] Epoch: [3] [4250/6500] lr: 0.000006 closs: 0.7612 (0.7405) grad_norm: 0.3945 (0.4722) time: 5.5790 data: 0.0001 max mem: 71357 +[06:11:54.440900] Epoch: [3] [4260/6500] lr: 0.000006 closs: 0.7949 (0.7407) grad_norm: 0.4455 (0.4722) time: 5.5724 data: 0.0001 max mem: 71357 +[06:12:50.182793] Epoch: [3] [4270/6500] lr: 0.000006 closs: 0.8130 (0.7407) grad_norm: 0.4455 (0.4722) time: 5.5718 data: 0.0001 max mem: 71357 +[06:13:45.770945] Epoch: [3] [4280/6500] lr: 0.000006 closs: 0.6939 (0.7405) grad_norm: 0.4629 (0.4730) time: 5.5664 data: 0.0001 max mem: 71357 +[06:14:41.714399] Epoch: [3] [4290/6500] lr: 0.000006 closs: 0.7175 (0.7407) grad_norm: 0.4635 (0.4731) time: 5.5765 data: 0.0001 max mem: 71357 +[06:15:37.371600] Epoch: [3] [4300/6500] lr: 0.000006 closs: 0.8015 (0.7408) grad_norm: 0.4426 (0.4728) time: 5.5799 data: 0.0001 max mem: 71357 +[06:16:33.108011] Epoch: [3] [4310/6500] lr: 0.000006 closs: 0.7606 (0.7409) grad_norm: 0.4413 (0.4732) time: 5.5696 data: 0.0001 max mem: 71357 +[06:17:28.770795] Epoch: [3] [4320/6500] lr: 0.000006 closs: 0.7072 (0.7408) grad_norm: 0.3953 (0.4731) time: 5.5699 data: 0.0001 max mem: 71357 +[06:18:24.617908] Epoch: [3] [4330/6500] lr: 0.000006 closs: 0.7142 (0.7409) grad_norm: 0.3763 (0.4730) time: 5.5754 data: 0.0001 max mem: 71357 +[06:19:20.284502] Epoch: [3] [4340/6500] lr: 0.000006 closs: 0.7488 (0.7408) grad_norm: 0.4262 (0.4732) time: 5.5756 data: 0.0001 max mem: 71357 +[06:20:16.015270] Epoch: [3] [4350/6500] lr: 0.000006 closs: 0.6938 (0.7408) grad_norm: 0.4168 (0.4731) time: 5.5698 data: 0.0001 max mem: 71357 +[06:21:11.725133] Epoch: [3] [4360/6500] lr: 0.000006 closs: 0.7256 (0.7410) grad_norm: 0.4232 (0.4732) time: 5.5720 data: 0.0001 max mem: 71357 +[06:22:07.350207] Epoch: [3] [4370/6500] lr: 0.000006 closs: 0.7400 (0.7410) grad_norm: 0.4398 (0.4732) time: 5.5667 data: 0.0001 max mem: 71357 +[06:23:03.236264] Epoch: [3] [4380/6500] lr: 0.000006 closs: 0.7400 (0.7410) grad_norm: 0.4398 (0.4735) time: 5.5755 data: 0.0001 max mem: 71357 +[06:23:58.889034] Epoch: [3] [4390/6500] lr: 0.000006 closs: 0.7240 (0.7409) grad_norm: 0.4513 (0.4736) time: 5.5769 data: 0.0001 max mem: 71357 +[06:24:54.533006] Epoch: [3] [4400/6500] lr: 0.000006 closs: 0.7240 (0.7408) grad_norm: 0.4513 (0.4737) time: 5.5647 data: 0.0001 max mem: 71357 +[06:25:50.206903] Epoch: [3] [4410/6500] lr: 0.000006 closs: 0.7493 (0.7411) grad_norm: 0.4513 (0.4737) time: 5.5658 data: 0.0001 max mem: 71357 +[06:26:45.995600] Epoch: [3] [4420/6500] lr: 0.000006 closs: 0.7180 (0.7411) grad_norm: 0.4341 (0.4776) time: 5.5730 data: 0.0001 max mem: 71357 +[06:27:41.697454] Epoch: [3] [4430/6500] lr: 0.000006 closs: 0.6501 (0.7409) grad_norm: 0.4669 (0.4775) time: 5.5745 data: 0.0001 max mem: 71357 +[06:28:37.418266] Epoch: [3] [4440/6500] lr: 0.000006 closs: 0.6607 (0.7407) grad_norm: 0.4669 (0.4775) time: 5.5711 data: 0.0001 max mem: 71357 +[06:29:33.035519] Epoch: [3] [4450/6500] lr: 
0.000006 closs: 0.6990 (0.7406) grad_norm: 0.4688 (0.4777) time: 5.5668 data: 0.0001 max mem: 71357 +[06:30:28.813181] Epoch: [3] [4460/6500] lr: 0.000006 closs: 0.7062 (0.7406) grad_norm: 0.4669 (0.4776) time: 5.5697 data: 0.0001 max mem: 71357 +[06:31:24.518637] Epoch: [3] [4470/6500] lr: 0.000006 closs: 0.8034 (0.7408) grad_norm: 0.4726 (0.4779) time: 5.5741 data: 0.0001 max mem: 71357 +[06:32:20.176534] Epoch: [3] [4480/6500] lr: 0.000006 closs: 0.7781 (0.7407) grad_norm: 0.4556 (0.4779) time: 5.5681 data: 0.0001 max mem: 71357 +[06:33:15.934576] Epoch: [3] [4490/6500] lr: 0.000006 closs: 0.7453 (0.7408) grad_norm: 0.4051 (0.4776) time: 5.5707 data: 0.0001 max mem: 71357 +[06:34:11.633237] Epoch: [3] [4500/6500] lr: 0.000006 closs: 0.7150 (0.7406) grad_norm: 0.4099 (0.4775) time: 5.5728 data: 0.0001 max mem: 71357 +[06:35:07.409947] Epoch: [3] [4510/6500] lr: 0.000006 closs: 0.6874 (0.7405) grad_norm: 0.4041 (0.4775) time: 5.5737 data: 0.0001 max mem: 71357 +[06:36:03.057657] Epoch: [3] [4520/6500] lr: 0.000006 closs: 0.7244 (0.7407) grad_norm: 0.4633 (0.4778) time: 5.5711 data: 0.0001 max mem: 71357 +[06:36:58.751863] Epoch: [3] [4530/6500] lr: 0.000006 closs: 0.7346 (0.7407) grad_norm: 0.4917 (0.4777) time: 5.5670 data: 0.0001 max mem: 71357 +[06:37:54.453452] Epoch: [3] [4540/6500] lr: 0.000006 closs: 0.7567 (0.7407) grad_norm: 0.4633 (0.4776) time: 5.5697 data: 0.0001 max mem: 71357 +[06:38:50.277539] Epoch: [3] [4550/6500] lr: 0.000006 closs: 0.7567 (0.7407) grad_norm: 0.4308 (0.4778) time: 5.5762 data: 0.0001 max mem: 71357 +[06:39:45.982251] Epoch: [3] [4560/6500] lr: 0.000006 closs: 0.7651 (0.7409) grad_norm: 0.4175 (0.4777) time: 5.5764 data: 0.0001 max mem: 71357 +[06:40:41.651886] Epoch: [3] [4570/6500] lr: 0.000006 closs: 0.7811 (0.7410) grad_norm: 0.3881 (0.4776) time: 5.5686 data: 0.0001 max mem: 71357 +[06:41:37.315108] Epoch: [3] [4580/6500] lr: 0.000006 closs: 0.7358 (0.7410) grad_norm: 0.3934 (0.4776) time: 5.5666 data: 0.0001 max mem: 71357 +[06:42:33.052296] Epoch: [3] [4590/6500] lr: 0.000006 closs: 0.7234 (0.7410) grad_norm: 0.3934 (0.4775) time: 5.5700 data: 0.0001 max mem: 71357 +[06:43:28.866940] Epoch: [3] [4600/6500] lr: 0.000006 closs: 0.7615 (0.7411) grad_norm: 0.4202 (0.4775) time: 5.5775 data: 0.0001 max mem: 71357 +[06:44:24.515048] Epoch: [3] [4610/6500] lr: 0.000006 closs: 0.6976 (0.7410) grad_norm: 0.4237 (0.4774) time: 5.5731 data: 0.0001 max mem: 71357 +[06:45:20.203725] Epoch: [3] [4620/6500] lr: 0.000006 closs: 0.6506 (0.7410) grad_norm: 0.4171 (0.4772) time: 5.5668 data: 0.0001 max mem: 71357 +[06:46:15.879177] Epoch: [3] [4630/6500] lr: 0.000006 closs: 0.7314 (0.7410) grad_norm: 0.4004 (0.4769) time: 5.5681 data: 0.0001 max mem: 71357 +[06:47:11.707137] Epoch: [3] [4640/6500] lr: 0.000006 closs: 0.7390 (0.7410) grad_norm: 0.3716 (0.4768) time: 5.5751 data: 0.0001 max mem: 71357 +[06:48:07.323873] Epoch: [3] [4650/6500] lr: 0.000006 closs: 0.7443 (0.7409) grad_norm: 0.3923 (0.4769) time: 5.5722 data: 0.0001 max mem: 71357 +[06:49:03.064604] Epoch: [3] [4660/6500] lr: 0.000006 closs: 0.7443 (0.7409) grad_norm: 0.3923 (0.4767) time: 5.5678 data: 0.0001 max mem: 71357 +[06:49:58.767683] Epoch: [3] [4670/6500] lr: 0.000006 closs: 0.7634 (0.7410) grad_norm: 0.3923 (0.4766) time: 5.5721 data: 0.0001 max mem: 71357 +[06:50:54.566372] Epoch: [3] [4680/6500] lr: 0.000006 closs: 0.7191 (0.7408) grad_norm: 0.3871 (0.4765) time: 5.5750 data: 0.0001 max mem: 71357 +[06:51:50.323458] Epoch: [3] [4690/6500] lr: 0.000006 closs: 0.7153 (0.7407) grad_norm: 0.3820 
(0.4766) time: 5.5777 data: 0.0001 max mem: 71357 +[06:52:46.085085] Epoch: [3] [4700/6500] lr: 0.000006 closs: 0.6925 (0.7407) grad_norm: 0.3504 (0.4763) time: 5.5759 data: 0.0001 max mem: 71357 +[06:53:41.820424] Epoch: [3] [4710/6500] lr: 0.000006 closs: 0.6925 (0.7406) grad_norm: 0.3788 (0.4764) time: 5.5747 data: 0.0001 max mem: 71357 +[06:54:37.430020] Epoch: [3] [4720/6500] lr: 0.000006 closs: 0.7173 (0.7405) grad_norm: 0.3988 (0.4764) time: 5.5671 data: 0.0001 max mem: 71357 +[06:55:33.170954] Epoch: [3] [4730/6500] lr: 0.000006 closs: 0.7568 (0.7407) grad_norm: 0.4271 (0.4763) time: 5.5674 data: 0.0001 max mem: 71357 +[06:56:28.817636] Epoch: [3] [4740/6500] lr: 0.000006 closs: 0.7464 (0.7408) grad_norm: 0.4350 (0.4761) time: 5.5693 data: 0.0001 max mem: 71357 +[06:57:24.442554] Epoch: [3] [4750/6500] lr: 0.000006 closs: 0.7297 (0.7408) grad_norm: 0.4253 (0.4760) time: 5.5635 data: 0.0001 max mem: 71357 +[06:58:20.075994] Epoch: [3] [4760/6500] lr: 0.000006 closs: 0.7300 (0.7408) grad_norm: 0.4253 (0.4768) time: 5.5628 data: 0.0001 max mem: 71357 +[06:59:15.879354] Epoch: [3] [4770/6500] lr: 0.000006 closs: 0.6813 (0.7408) grad_norm: 0.4014 (0.4768) time: 5.5718 data: 0.0001 max mem: 71357 +[07:00:11.589236] Epoch: [3] [4780/6500] lr: 0.000006 closs: 0.6824 (0.7409) grad_norm: 0.4178 (0.4767) time: 5.5756 data: 0.0001 max mem: 71357 +[07:01:07.310329] Epoch: [3] [4790/6500] lr: 0.000006 closs: 0.7243 (0.7409) grad_norm: 0.4163 (0.4766) time: 5.5714 data: 0.0001 max mem: 71357 +[07:02:03.036933] Epoch: [3] [4800/6500] lr: 0.000006 closs: 0.7869 (0.7410) grad_norm: 0.4163 (0.4766) time: 5.5723 data: 0.0001 max mem: 71357 +[07:02:58.635514] Epoch: [3] [4810/6500] lr: 0.000006 closs: 0.7832 (0.7410) grad_norm: 0.4575 (0.4767) time: 5.5662 data: 0.0001 max mem: 71357 +[07:03:54.439261] Epoch: [3] [4820/6500] lr: 0.000006 closs: 0.7549 (0.7410) grad_norm: 0.4702 (0.4769) time: 5.5701 data: 0.0001 max mem: 71357 +[07:04:50.068310] Epoch: [3] [4830/6500] lr: 0.000006 closs: 0.7156 (0.7410) grad_norm: 0.4575 (0.4768) time: 5.5716 data: 0.0001 max mem: 71357 +[07:05:45.752970] Epoch: [3] [4840/6500] lr: 0.000006 closs: 0.7375 (0.7409) grad_norm: 0.4076 (0.4766) time: 5.5656 data: 0.0001 max mem: 71357 +[07:06:41.332388] Epoch: [3] [4850/6500] lr: 0.000006 closs: 0.7409 (0.7410) grad_norm: 0.4074 (0.4767) time: 5.5631 data: 0.0001 max mem: 71357 +[07:07:37.187318] Epoch: [3] [4860/6500] lr: 0.000006 closs: 0.6977 (0.7409) grad_norm: 0.4074 (0.4767) time: 5.5716 data: 0.0001 max mem: 71357 +[07:08:32.915254] Epoch: [3] [4870/6500] lr: 0.000006 closs: 0.6836 (0.7408) grad_norm: 0.4279 (0.4767) time: 5.5790 data: 0.0001 max mem: 71357 +[07:09:28.532950] Epoch: [3] [4880/6500] lr: 0.000006 closs: 0.7120 (0.7409) grad_norm: 0.4074 (0.4766) time: 5.5672 data: 0.0001 max mem: 71357 +[07:10:24.255320] Epoch: [3] [4890/6500] lr: 0.000006 closs: 0.7367 (0.7407) grad_norm: 0.4095 (0.4767) time: 5.5669 data: 0.0001 max mem: 71357 +[07:11:20.031107] Epoch: [3] [4900/6500] lr: 0.000006 closs: 0.7259 (0.7407) grad_norm: 0.4011 (0.4766) time: 5.5748 data: 0.0001 max mem: 71357 +[07:12:15.812216] Epoch: [3] [4910/6500] lr: 0.000006 closs: 0.7271 (0.7408) grad_norm: 0.4011 (0.4765) time: 5.5777 data: 0.0001 max mem: 71357 +[07:13:11.520623] Epoch: [3] [4920/6500] lr: 0.000006 closs: 0.7102 (0.7408) grad_norm: 0.4011 (0.4763) time: 5.5744 data: 0.0001 max mem: 71357 +[07:14:07.160205] Epoch: [3] [4930/6500] lr: 0.000006 closs: 0.7200 (0.7408) grad_norm: 0.3937 (0.4763) time: 5.5673 data: 0.0001 max mem: 71357 
+[07:15:02.721256] Epoch: [3] [4940/6500] lr: 0.000006 closs: 0.7026 (0.7407) grad_norm: 0.4128 (0.4766) time: 5.5599 data: 0.0001 max mem: 71357 +[07:15:58.537548] Epoch: [3] [4950/6500] lr: 0.000006 closs: 0.6585 (0.7407) grad_norm: 0.4183 (0.4768) time: 5.5688 data: 0.0001 max mem: 71357 +[07:16:54.210667] Epoch: [3] [4960/6500] lr: 0.000006 closs: 0.6897 (0.7407) grad_norm: 0.4334 (0.4766) time: 5.5744 data: 0.0001 max mem: 71357 +[07:17:49.890698] Epoch: [3] [4970/6500] lr: 0.000006 closs: 0.7234 (0.7405) grad_norm: 0.4334 (0.4766) time: 5.5676 data: 0.0001 max mem: 71357 +[07:18:45.562111] Epoch: [3] [4980/6500] lr: 0.000006 closs: 0.7334 (0.7404) grad_norm: 0.3593 (0.4764) time: 5.5675 data: 0.0001 max mem: 71357 +[07:19:41.353861] Epoch: [3] [4990/6500] lr: 0.000006 closs: 0.6968 (0.7404) grad_norm: 0.3642 (0.4765) time: 5.5731 data: 0.0001 max mem: 71357 +[07:20:37.076513] Epoch: [3] [5000/6500] lr: 0.000006 closs: 0.6968 (0.7405) grad_norm: 0.4023 (0.4765) time: 5.5757 data: 0.0001 max mem: 71357 +[07:21:32.711715] Epoch: [3] [5010/6500] lr: 0.000006 closs: 0.7888 (0.7405) grad_norm: 0.4166 (0.4765) time: 5.5678 data: 0.0001 max mem: 71357 +[07:22:28.392680] Epoch: [3] [5020/6500] lr: 0.000006 closs: 0.8213 (0.7407) grad_norm: 0.4392 (0.4764) time: 5.5657 data: 0.0001 max mem: 71357 +[07:23:24.053200] Epoch: [3] [5030/6500] lr: 0.000006 closs: 0.8213 (0.7408) grad_norm: 0.4247 (0.4763) time: 5.5670 data: 0.0001 max mem: 71357 +[07:24:19.885504] Epoch: [3] [5040/6500] lr: 0.000006 closs: 0.7937 (0.7409) grad_norm: 0.4176 (0.4762) time: 5.5745 data: 0.0001 max mem: 71357 +[07:25:15.541561] Epoch: [3] [5050/6500] lr: 0.000006 closs: 0.7597 (0.7410) grad_norm: 0.4224 (0.4763) time: 5.5743 data: 0.0001 max mem: 71357 +[07:26:11.374618] Epoch: [3] [5060/6500] lr: 0.000006 closs: 0.6788 (0.7409) grad_norm: 0.4224 (0.4761) time: 5.5744 data: 0.0001 max mem: 71357 +[07:27:07.078997] Epoch: [3] [5070/6500] lr: 0.000006 closs: 0.7492 (0.7409) grad_norm: 0.4120 (0.4759) time: 5.5768 data: 0.0001 max mem: 71357 +[07:28:02.865357] Epoch: [3] [5080/6500] lr: 0.000006 closs: 0.7492 (0.7410) grad_norm: 0.4302 (0.4762) time: 5.5744 data: 0.0001 max mem: 71357 +[07:28:58.502309] Epoch: [3] [5090/6500] lr: 0.000006 closs: 0.7938 (0.7412) grad_norm: 0.4302 (0.4761) time: 5.5711 data: 0.0001 max mem: 71357 +[07:29:54.168306] Epoch: [3] [5100/6500] lr: 0.000006 closs: 0.7356 (0.7412) grad_norm: 0.4987 (0.4762) time: 5.5650 data: 0.0001 max mem: 71357 +[07:30:49.827035] Epoch: [3] [5110/6500] lr: 0.000006 closs: 0.7003 (0.7412) grad_norm: 0.4987 (0.4761) time: 5.5661 data: 0.0001 max mem: 71357 +[07:31:45.641823] Epoch: [3] [5120/6500] lr: 0.000006 closs: 0.7438 (0.7412) grad_norm: 0.4244 (0.4760) time: 5.5736 data: 0.0001 max mem: 71357 +[07:32:41.455089] Epoch: [3] [5130/6500] lr: 0.000006 closs: 0.7517 (0.7413) grad_norm: 0.4237 (0.4758) time: 5.5813 data: 0.0001 max mem: 71357 +[07:33:37.179398] Epoch: [3] [5140/6500] lr: 0.000006 closs: 0.7771 (0.7413) grad_norm: 0.4015 (0.4757) time: 5.5768 data: 0.0001 max mem: 71357 +[07:34:32.952469] Epoch: [3] [5150/6500] lr: 0.000006 closs: 0.6882 (0.7412) grad_norm: 0.3916 (0.4756) time: 5.5748 data: 0.0001 max mem: 71357 +[07:35:28.690763] Epoch: [3] [5160/6500] lr: 0.000006 closs: 0.7365 (0.7415) grad_norm: 0.4015 (0.4758) time: 5.5755 data: 0.0001 max mem: 71357 +[07:36:24.470225] Epoch: [3] [5170/6500] lr: 0.000006 closs: 0.7372 (0.7414) grad_norm: 0.4123 (0.4757) time: 5.5758 data: 0.0001 max mem: 71357 +[07:37:20.151884] Epoch: [3] [5180/6500] lr: 
0.000006 closs: 0.7399 (0.7416) grad_norm: 0.4265 (0.4759) time: 5.5730 data: 0.0001 max mem: 71357 +[07:38:15.804126] Epoch: [3] [5190/6500] lr: 0.000005 closs: 0.7578 (0.7416) grad_norm: 0.4265 (0.4757) time: 5.5666 data: 0.0001 max mem: 71357 +[07:39:11.488282] Epoch: [3] [5200/6500] lr: 0.000005 closs: 0.6937 (0.7416) grad_norm: 0.4066 (0.4756) time: 5.5667 data: 0.0001 max mem: 71357 +[07:40:07.267455] Epoch: [3] [5210/6500] lr: 0.000005 closs: 0.7611 (0.7417) grad_norm: 0.3831 (0.4756) time: 5.5731 data: 0.0001 max mem: 71357 +[07:41:03.069520] Epoch: [3] [5220/6500] lr: 0.000005 closs: 0.7594 (0.7417) grad_norm: 0.3831 (0.4756) time: 5.5790 data: 0.0001 max mem: 71357 +[07:41:58.738721] Epoch: [3] [5230/6500] lr: 0.000005 closs: 0.6946 (0.7416) grad_norm: 0.4384 (0.4756) time: 5.5735 data: 0.0001 max mem: 71357 +[07:42:54.394666] Epoch: [3] [5240/6500] lr: 0.000005 closs: 0.5834 (0.7414) grad_norm: 0.4770 (0.4757) time: 5.5662 data: 0.0001 max mem: 71357 +[07:43:50.008917] Epoch: [3] [5250/6500] lr: 0.000005 closs: 0.6269 (0.7414) grad_norm: 0.4798 (0.4757) time: 5.5634 data: 0.0001 max mem: 71357 +[07:44:45.778422] Epoch: [3] [5260/6500] lr: 0.000005 closs: 0.7557 (0.7414) grad_norm: 0.4838 (0.4758) time: 5.5691 data: 0.0001 max mem: 71357 +[07:45:41.527673] Epoch: [3] [5270/6500] lr: 0.000005 closs: 0.7557 (0.7415) grad_norm: 0.5086 (0.4760) time: 5.5759 data: 0.0001 max mem: 71357 +[07:46:37.191214] Epoch: [3] [5280/6500] lr: 0.000005 closs: 0.7615 (0.7416) grad_norm: 0.4950 (0.4759) time: 5.5706 data: 0.0001 max mem: 71357 +[07:47:32.847414] Epoch: [3] [5290/6500] lr: 0.000005 closs: 0.7668 (0.7417) grad_norm: 0.4947 (0.4758) time: 5.5659 data: 0.0001 max mem: 71357 +[07:48:28.717965] Epoch: [3] [5300/6500] lr: 0.000005 closs: 0.7729 (0.7417) grad_norm: 0.4736 (0.4759) time: 5.5763 data: 0.0001 max mem: 71357 +[07:49:24.422934] Epoch: [3] [5310/6500] lr: 0.000005 closs: 0.7493 (0.7416) grad_norm: 0.4380 (0.4759) time: 5.5787 data: 0.0001 max mem: 71357 +[07:50:20.140110] Epoch: [3] [5320/6500] lr: 0.000005 closs: 0.7367 (0.7417) grad_norm: 0.4736 (0.4760) time: 5.5710 data: 0.0001 max mem: 71357 +[07:51:15.914706] Epoch: [3] [5330/6500] lr: 0.000005 closs: 0.7469 (0.7418) grad_norm: 0.4288 (0.4758) time: 5.5745 data: 0.0001 max mem: 71357 +[07:52:11.772834] Epoch: [3] [5340/6500] lr: 0.000005 closs: 0.7469 (0.7417) grad_norm: 0.3868 (0.4758) time: 5.5816 data: 0.0001 max mem: 71357 +[07:53:07.443542] Epoch: [3] [5350/6500] lr: 0.000005 closs: 0.7176 (0.7417) grad_norm: 0.3952 (0.4757) time: 5.5763 data: 0.0001 max mem: 71357 +[07:54:03.117070] Epoch: [3] [5360/6500] lr: 0.000005 closs: 0.7500 (0.7417) grad_norm: 0.3854 (0.4756) time: 5.5671 data: 0.0001 max mem: 71357 +[07:54:58.869745] Epoch: [3] [5370/6500] lr: 0.000005 closs: 0.7227 (0.7417) grad_norm: 0.3854 (0.4756) time: 5.5712 data: 0.0001 max mem: 71357 +[07:55:54.574182] Epoch: [3] [5380/6500] lr: 0.000005 closs: 0.7051 (0.7416) grad_norm: 0.3838 (0.4755) time: 5.5728 data: 0.0001 max mem: 71357 +[07:56:50.331719] Epoch: [3] [5390/6500] lr: 0.000005 closs: 0.7780 (0.7417) grad_norm: 0.4164 (0.4756) time: 5.5730 data: 0.0001 max mem: 71357 +[07:57:45.936766] Epoch: [3] [5400/6500] lr: 0.000005 closs: 0.7780 (0.7418) grad_norm: 0.4019 (0.4754) time: 5.5681 data: 0.0001 max mem: 71357 +[07:58:41.625507] Epoch: [3] [5410/6500] lr: 0.000005 closs: 0.7760 (0.7419) grad_norm: 0.4019 (0.4754) time: 5.5646 data: 0.0001 max mem: 71357 +[07:59:37.221626] Epoch: [3] [5420/6500] lr: 0.000005 closs: 0.7735 (0.7419) grad_norm: 0.4019 
(0.4753) time: 5.5641 data: 0.0001 max mem: 71357 +[08:00:32.942836] Epoch: [3] [5430/6500] lr: 0.000005 closs: 0.7487 (0.7418) grad_norm: 0.3855 (0.4752) time: 5.5658 data: 0.0001 max mem: 71357 +[08:01:28.737895] Epoch: [3] [5440/6500] lr: 0.000005 closs: 0.7487 (0.7419) grad_norm: 0.3855 (0.4751) time: 5.5757 data: 0.0001 max mem: 71357 +[08:02:24.357380] Epoch: [3] [5450/6500] lr: 0.000005 closs: 0.7460 (0.7419) grad_norm: 0.3975 (0.4751) time: 5.5707 data: 0.0001 max mem: 71357 +[08:03:20.013233] Epoch: [3] [5460/6500] lr: 0.000005 closs: 0.7460 (0.7419) grad_norm: 0.4548 (0.4752) time: 5.5637 data: 0.0001 max mem: 71357 +[08:04:15.677277] Epoch: [3] [5470/6500] lr: 0.000005 closs: 0.8010 (0.7420) grad_norm: 0.4553 (0.4751) time: 5.5659 data: 0.0001 max mem: 71357 +[08:05:11.472975] Epoch: [3] [5480/6500] lr: 0.000005 closs: 0.7603 (0.7419) grad_norm: 0.4553 (0.4752) time: 5.5729 data: 0.0001 max mem: 71357 +[08:06:07.124938] Epoch: [3] [5490/6500] lr: 0.000005 closs: 0.7009 (0.7418) grad_norm: 0.4387 (0.4750) time: 5.5723 data: 0.0001 max mem: 71357 +[08:07:02.722427] Epoch: [3] [5500/6500] lr: 0.000005 closs: 0.7009 (0.7417) grad_norm: 0.4303 (0.4751) time: 5.5624 data: 0.0001 max mem: 71357 +[08:07:58.375710] Epoch: [3] [5510/6500] lr: 0.000005 closs: 0.6630 (0.7415) grad_norm: 0.4247 (0.4758) time: 5.5625 data: 0.0001 max mem: 71357 +[08:08:54.297860] Epoch: [3] [5520/6500] lr: 0.000005 closs: 0.7243 (0.7415) grad_norm: 0.4047 (0.4758) time: 5.5787 data: 0.0001 max mem: 71357 +[08:09:49.997283] Epoch: [3] [5530/6500] lr: 0.000005 closs: 0.7034 (0.7413) grad_norm: 0.4303 (0.4758) time: 5.5810 data: 0.0001 max mem: 71357 +[08:10:45.674960] Epoch: [3] [5540/6500] lr: 0.000005 closs: 0.7039 (0.7413) grad_norm: 0.4179 (0.4757) time: 5.5688 data: 0.0001 max mem: 71357 +[08:11:41.302248] Epoch: [3] [5550/6500] lr: 0.000005 closs: 0.7121 (0.7414) grad_norm: 0.4179 (0.4758) time: 5.5652 data: 0.0001 max mem: 71357 +[08:12:37.151238] Epoch: [3] [5560/6500] lr: 0.000005 closs: 0.6410 (0.7412) grad_norm: 0.4524 (0.4762) time: 5.5737 data: 0.0001 max mem: 71357 +[08:13:32.880058] Epoch: [3] [5570/6500] lr: 0.000005 closs: 0.6737 (0.7413) grad_norm: 0.4524 (0.4761) time: 5.5788 data: 0.0001 max mem: 71357 +[08:14:28.497505] Epoch: [3] [5580/6500] lr: 0.000005 closs: 0.7277 (0.7412) grad_norm: 0.4449 (0.4760) time: 5.5673 data: 0.0001 max mem: 71357 +[08:15:24.123502] Epoch: [3] [5590/6500] lr: 0.000005 closs: 0.7041 (0.7412) grad_norm: 0.4449 (0.4761) time: 5.5621 data: 0.0001 max mem: 71357 +[08:16:19.713976] Epoch: [3] [5600/6500] lr: 0.000005 closs: 0.7092 (0.7412) grad_norm: 0.3970 (0.4760) time: 5.5607 data: 0.0001 max mem: 71357 +[08:17:15.578612] Epoch: [3] [5610/6500] lr: 0.000005 closs: 0.7257 (0.7411) grad_norm: 0.3970 (0.4760) time: 5.5727 data: 0.0001 max mem: 71357 +[08:18:11.270830] Epoch: [3] [5620/6500] lr: 0.000005 closs: 0.7130 (0.7411) grad_norm: 0.4082 (0.4759) time: 5.5778 data: 0.0001 max mem: 71357 +[08:19:07.047039] Epoch: [3] [5630/6500] lr: 0.000005 closs: 0.6523 (0.7411) grad_norm: 0.4421 (0.4759) time: 5.5734 data: 0.0001 max mem: 71357 +[08:20:02.685791] Epoch: [3] [5640/6500] lr: 0.000005 closs: 0.6917 (0.7410) grad_norm: 0.4677 (0.4758) time: 5.5707 data: 0.0001 max mem: 71357 +[08:20:58.580036] Epoch: [3] [5650/6500] lr: 0.000005 closs: 0.7371 (0.7411) grad_norm: 0.4498 (0.4758) time: 5.5766 data: 0.0001 max mem: 71357 +[08:21:54.213166] Epoch: [3] [5660/6500] lr: 0.000005 closs: 0.7371 (0.7412) grad_norm: 0.4677 (0.4759) time: 5.5763 data: 0.0001 max mem: 71357 
+[08:22:49.950783] Epoch: [3] [5670/6500] lr: 0.000005 closs: 0.7466 (0.7413) grad_norm: 0.3905 (0.4758) time: 5.5684 data: 0.0001 max mem: 71357 +[08:23:45.705757] Epoch: [3] [5680/6500] lr: 0.000005 closs: 0.7321 (0.7412) grad_norm: 0.4498 (0.4762) time: 5.5745 data: 0.0001 max mem: 71357 +[08:24:41.256843] Epoch: [3] [5690/6500] lr: 0.000005 closs: 0.6601 (0.7411) grad_norm: 0.4623 (0.4764) time: 5.5652 data: 0.0001 max mem: 71357 +[08:25:37.016730] Epoch: [3] [5700/6500] lr: 0.000005 closs: 0.6825 (0.7410) grad_norm: 0.4525 (0.4763) time: 5.5655 data: 0.0001 max mem: 71357 +[08:26:32.785310] Epoch: [3] [5710/6500] lr: 0.000005 closs: 0.6972 (0.7410) grad_norm: 0.4525 (0.4762) time: 5.5763 data: 0.0001 max mem: 71357 +[08:27:28.440180] Epoch: [3] [5720/6500] lr: 0.000005 closs: 0.7447 (0.7411) grad_norm: 0.4266 (0.4761) time: 5.5711 data: 0.0001 max mem: 71357 +[08:28:24.134165] Epoch: [3] [5730/6500] lr: 0.000005 closs: 0.7670 (0.7412) grad_norm: 0.4178 (0.4761) time: 5.5674 data: 0.0001 max mem: 71357 +[08:29:19.997220] Epoch: [3] [5740/6500] lr: 0.000005 closs: 0.7214 (0.7410) grad_norm: 0.4178 (0.4760) time: 5.5778 data: 0.0001 max mem: 71357 +[08:30:15.733399] Epoch: [3] [5750/6500] lr: 0.000005 closs: 0.6831 (0.7409) grad_norm: 0.4367 (0.4761) time: 5.5799 data: 0.0001 max mem: 71357 +[08:31:11.458578] Epoch: [3] [5760/6500] lr: 0.000005 closs: 0.6616 (0.7408) grad_norm: 0.4367 (0.4761) time: 5.5730 data: 0.0001 max mem: 71357 +[08:32:07.031492] Epoch: [3] [5770/6500] lr: 0.000005 closs: 0.7017 (0.7408) grad_norm: 0.5017 (0.4761) time: 5.5648 data: 0.0001 max mem: 71357 +[08:33:02.681051] Epoch: [3] [5780/6500] lr: 0.000005 closs: 0.7020 (0.7408) grad_norm: 0.4604 (0.4761) time: 5.5610 data: 0.0001 max mem: 71357 +[08:33:58.421784] Epoch: [3] [5790/6500] lr: 0.000005 closs: 0.7253 (0.7408) grad_norm: 0.4554 (0.4763) time: 5.5695 data: 0.0001 max mem: 71357 +[08:34:54.273999] Epoch: [3] [5800/6500] lr: 0.000005 closs: 0.7283 (0.7408) grad_norm: 0.4415 (0.4762) time: 5.5796 data: 0.0001 max mem: 71357 +[08:35:49.986871] Epoch: [3] [5810/6500] lr: 0.000005 closs: 0.7905 (0.7410) grad_norm: 0.4331 (0.4761) time: 5.5782 data: 0.0001 max mem: 71357 +[08:36:45.654555] Epoch: [3] [5820/6500] lr: 0.000005 closs: 0.7656 (0.7409) grad_norm: 0.4289 (0.4761) time: 5.5690 data: 0.0001 max mem: 71357 +[08:37:41.471240] Epoch: [3] [5830/6500] lr: 0.000005 closs: 0.6852 (0.7408) grad_norm: 0.3691 (0.4759) time: 5.5741 data: 0.0001 max mem: 71357 +[08:38:37.093389] Epoch: [3] [5840/6500] lr: 0.000005 closs: 0.6992 (0.7408) grad_norm: 0.3979 (0.4759) time: 5.5719 data: 0.0001 max mem: 71357 +[08:39:32.812807] Epoch: [3] [5850/6500] lr: 0.000005 closs: 0.7118 (0.7407) grad_norm: 0.3971 (0.4758) time: 5.5670 data: 0.0001 max mem: 71357 +[08:40:28.562085] Epoch: [3] [5860/6500] lr: 0.000005 closs: 0.7175 (0.7408) grad_norm: 0.3971 (0.4757) time: 5.5734 data: 0.0001 max mem: 71357 +[08:41:24.343575] Epoch: [3] [5870/6500] lr: 0.000005 closs: 0.7244 (0.7408) grad_norm: 0.4018 (0.4756) time: 5.5765 data: 0.0001 max mem: 71357 +[08:42:20.067075] Epoch: [3] [5880/6500] lr: 0.000005 closs: 0.7244 (0.7408) grad_norm: 0.4018 (0.4757) time: 5.5752 data: 0.0001 max mem: 71357 +[08:43:15.679788] Epoch: [3] [5890/6500] lr: 0.000005 closs: 0.7645 (0.7409) grad_norm: 0.4559 (0.4758) time: 5.5667 data: 0.0001 max mem: 71357 +[08:44:11.290632] Epoch: [3] [5900/6500] lr: 0.000005 closs: 0.7315 (0.7409) grad_norm: 0.4684 (0.4758) time: 5.5611 data: 0.0001 max mem: 71357 +[08:45:06.943686] Epoch: [3] [5910/6500] lr: 
0.000005 closs: 0.7315 (0.7409) grad_norm: 0.4615 (0.4757) time: 5.5631 data: 0.0001 max mem: 71357 +[08:46:02.654843] Epoch: [3] [5920/6500] lr: 0.000005 closs: 0.7520 (0.7409) grad_norm: 0.4684 (0.4758) time: 5.5681 data: 0.0001 max mem: 71357 +[08:46:58.263460] Epoch: [3] [5930/6500] lr: 0.000005 closs: 0.7815 (0.7410) grad_norm: 0.4616 (0.4759) time: 5.5659 data: 0.0001 max mem: 71357 +[08:47:53.923587] Epoch: [3] [5940/6500] lr: 0.000005 closs: 0.7815 (0.7410) grad_norm: 0.4616 (0.4763) time: 5.5633 data: 0.0001 max mem: 71357 +[08:48:49.647222] Epoch: [3] [5950/6500] lr: 0.000005 closs: 0.7723 (0.7411) grad_norm: 0.4731 (0.4763) time: 5.5691 data: 0.0001 max mem: 71357 +[08:49:45.438272] Epoch: [3] [5960/6500] lr: 0.000005 closs: 0.6831 (0.7408) grad_norm: 0.4702 (0.4763) time: 5.5757 data: 0.0001 max mem: 71357 +[08:50:41.080388] Epoch: [3] [5970/6500] lr: 0.000005 closs: 0.7002 (0.7408) grad_norm: 0.4085 (0.4761) time: 5.5716 data: 0.0001 max mem: 71357 +[08:51:36.713518] Epoch: [3] [5980/6500] lr: 0.000005 closs: 0.7981 (0.7411) grad_norm: 0.4085 (0.4763) time: 5.5637 data: 0.0001 max mem: 71357 +[08:52:32.360209] Epoch: [3] [5990/6500] lr: 0.000005 closs: 0.8063 (0.7411) grad_norm: 0.3989 (0.4761) time: 5.5639 data: 0.0001 max mem: 71357 +[08:53:28.046393] Epoch: [3] [6000/6500] lr: 0.000005 closs: 0.7429 (0.7410) grad_norm: 0.4288 (0.4762) time: 5.5666 data: 0.0001 max mem: 71357 +[08:54:23.776691] Epoch: [3] [6010/6500] lr: 0.000005 closs: 0.7170 (0.7410) grad_norm: 0.4823 (0.4762) time: 5.5707 data: 0.0001 max mem: 71357 +[08:55:19.330299] Epoch: [3] [6020/6500] lr: 0.000005 closs: 0.6639 (0.7408) grad_norm: 0.4531 (0.4762) time: 5.5641 data: 0.0001 max mem: 71357 +[08:56:15.081992] Epoch: [3] [6030/6500] lr: 0.000005 closs: 0.6893 (0.7408) grad_norm: 0.4349 (0.4761) time: 5.5652 data: 0.0001 max mem: 71357 +[08:57:10.718534] Epoch: [3] [6040/6500] lr: 0.000005 closs: 0.7329 (0.7409) grad_norm: 0.3941 (0.4765) time: 5.5693 data: 0.0001 max mem: 71357 +[08:58:06.494709] Epoch: [3] [6050/6500] lr: 0.000005 closs: 0.8222 (0.7411) grad_norm: 0.3941 (0.4766) time: 5.5705 data: 0.0001 max mem: 71357 +[08:59:02.188091] Epoch: [3] [6060/6500] lr: 0.000005 closs: 0.7643 (0.7410) grad_norm: 0.3920 (0.4764) time: 5.5734 data: 0.0001 max mem: 71357 +[08:59:57.800593] Epoch: [3] [6070/6500] lr: 0.000005 closs: 0.7363 (0.7410) grad_norm: 0.4194 (0.4764) time: 5.5652 data: 0.0001 max mem: 71357 +[09:00:53.339756] Epoch: [3] [6080/6500] lr: 0.000005 closs: 0.7633 (0.7411) grad_norm: 0.4194 (0.4763) time: 5.5575 data: 0.0001 max mem: 71357 +[09:01:49.113502] Epoch: [3] [6090/6500] lr: 0.000005 closs: 0.7977 (0.7411) grad_norm: 0.4062 (0.4764) time: 5.5656 data: 0.0001 max mem: 71357 +[09:02:44.848015] Epoch: [3] [6100/6500] lr: 0.000005 closs: 0.8059 (0.7412) grad_norm: 0.4194 (0.4763) time: 5.5754 data: 0.0001 max mem: 71357 +[09:03:40.394260] Epoch: [3] [6110/6500] lr: 0.000005 closs: 0.7635 (0.7411) grad_norm: 0.4153 (0.4764) time: 5.5640 data: 0.0001 max mem: 71357 +[09:04:36.100670] Epoch: [3] [6120/6500] lr: 0.000005 closs: 0.6694 (0.7411) grad_norm: 0.3990 (0.4763) time: 5.5625 data: 0.0001 max mem: 71357 +[09:05:31.751861] Epoch: [3] [6130/6500] lr: 0.000005 closs: 0.7213 (0.7412) grad_norm: 0.4238 (0.4762) time: 5.5678 data: 0.0001 max mem: 71357 +[09:06:27.636239] Epoch: [3] [6140/6500] lr: 0.000005 closs: 0.7295 (0.7412) grad_norm: 0.4095 (0.4760) time: 5.5767 data: 0.0001 max mem: 71357 +[09:07:23.235152] Epoch: [3] [6150/6500] lr: 0.000005 closs: 0.7065 (0.7412) grad_norm: 0.4102 
(0.4761) time: 5.5741 data: 0.0001 max mem: 71357 +[09:08:18.818036] Epoch: [3] [6160/6500] lr: 0.000005 closs: 0.7877 (0.7413) grad_norm: 0.4559 (0.4763) time: 5.5590 data: 0.0001 max mem: 71357 +[09:09:14.505759] Epoch: [3] [6170/6500] lr: 0.000005 closs: 0.7101 (0.7411) grad_norm: 0.4395 (0.4762) time: 5.5634 data: 0.0001 max mem: 71357 +[09:10:10.350422] Epoch: [3] [6180/6500] lr: 0.000005 closs: 0.6311 (0.7409) grad_norm: 0.5068 (0.4763) time: 5.5765 data: 0.0001 max mem: 71357 +[09:11:06.023015] Epoch: [3] [6190/6500] lr: 0.000005 closs: 0.7146 (0.7410) grad_norm: 0.4877 (0.4762) time: 5.5758 data: 0.0001 max mem: 71357 +[09:12:01.785870] Epoch: [3] [6200/6500] lr: 0.000005 closs: 0.7406 (0.7411) grad_norm: 0.4877 (0.4763) time: 5.5717 data: 0.0001 max mem: 71357 +[09:12:57.440583] Epoch: [3] [6210/6500] lr: 0.000005 closs: 0.7689 (0.7412) grad_norm: 0.4298 (0.4761) time: 5.5708 data: 0.0001 max mem: 71357 +[09:13:53.257089] Epoch: [3] [6220/6500] lr: 0.000005 closs: 0.7652 (0.7412) grad_norm: 0.4117 (0.4761) time: 5.5735 data: 0.0001 max mem: 71357 +[09:14:48.956788] Epoch: [3] [6230/6500] lr: 0.000005 closs: 0.7270 (0.7411) grad_norm: 0.4298 (0.4761) time: 5.5757 data: 0.0001 max mem: 71357 +[09:15:44.591134] Epoch: [3] [6240/6500] lr: 0.000005 closs: 0.7631 (0.7412) grad_norm: 0.4232 (0.4763) time: 5.5666 data: 0.0001 max mem: 71357 +[09:16:40.191312] Epoch: [3] [6250/6500] lr: 0.000005 closs: 0.8537 (0.7414) grad_norm: 0.4668 (0.4763) time: 5.5617 data: 0.0001 max mem: 71357 +[09:17:35.838897] Epoch: [3] [6260/6500] lr: 0.000005 closs: 0.8001 (0.7414) grad_norm: 0.4668 (0.4763) time: 5.5623 data: 0.0001 max mem: 71357 +[09:18:31.640816] Epoch: [3] [6270/6500] lr: 0.000005 closs: 0.7339 (0.7415) grad_norm: 0.4230 (0.4763) time: 5.5724 data: 0.0001 max mem: 71357 +[09:19:27.206966] Epoch: [3] [6280/6500] lr: 0.000005 closs: 0.7339 (0.7414) grad_norm: 0.4230 (0.4763) time: 5.5683 data: 0.0001 max mem: 71357 +[09:20:22.913908] Epoch: [3] [6290/6500] lr: 0.000005 closs: 0.7190 (0.7414) grad_norm: 0.4122 (0.4762) time: 5.5636 data: 0.0001 max mem: 71357 +[09:21:18.497214] Epoch: [3] [6300/6500] lr: 0.000005 closs: 0.7719 (0.7415) grad_norm: 0.4122 (0.4762) time: 5.5644 data: 0.0001 max mem: 71357 +[09:22:14.290866] Epoch: [3] [6310/6500] lr: 0.000005 closs: 0.7057 (0.7415) grad_norm: 0.4122 (0.4761) time: 5.5688 data: 0.0001 max mem: 71357 +[09:23:09.931856] Epoch: [3] [6320/6500] lr: 0.000005 closs: 0.6686 (0.7414) grad_norm: 0.4850 (0.4765) time: 5.5717 data: 0.0001 max mem: 71357 +[09:24:05.612801] Epoch: [3] [6330/6500] lr: 0.000005 closs: 0.6958 (0.7414) grad_norm: 0.4899 (0.4765) time: 5.5660 data: 0.0001 max mem: 71357 +[09:25:01.244089] Epoch: [3] [6340/6500] lr: 0.000005 closs: 0.7575 (0.7415) grad_norm: 0.4416 (0.4764) time: 5.5655 data: 0.0001 max mem: 71357 +[09:25:56.860737] Epoch: [3] [6350/6500] lr: 0.000005 closs: 0.7575 (0.7415) grad_norm: 0.4298 (0.4763) time: 5.5623 data: 0.0001 max mem: 71357 +[09:26:52.670083] Epoch: [3] [6360/6500] lr: 0.000005 closs: 0.7802 (0.7416) grad_norm: 0.4094 (0.4762) time: 5.5712 data: 0.0001 max mem: 71357 +[09:27:48.219140] Epoch: [3] [6370/6500] lr: 0.000005 closs: 0.7762 (0.7416) grad_norm: 0.4253 (0.4763) time: 5.5678 data: 0.0001 max mem: 71357 +[09:28:43.931513] Epoch: [3] [6380/6500] lr: 0.000005 closs: 0.7411 (0.7417) grad_norm: 0.4253 (0.4762) time: 5.5630 data: 0.0001 max mem: 71357 +[09:29:39.533688] Epoch: [3] [6390/6500] lr: 0.000005 closs: 0.7411 (0.7417) grad_norm: 0.4533 (0.4762) time: 5.5656 data: 0.0001 max mem: 71357 
+[09:30:35.257343] Epoch: [3] [6400/6500] lr: 0.000005 closs: 0.7405 (0.7417) grad_norm: 0.4680 (0.4762) time: 5.5662 data: 0.0001 max mem: 71357 +[09:31:30.993736] Epoch: [3] [6410/6500] lr: 0.000005 closs: 0.7159 (0.7417) grad_norm: 0.4596 (0.4761) time: 5.5729 data: 0.0001 max mem: 71357 +[09:32:26.749437] Epoch: [3] [6420/6500] lr: 0.000005 closs: 0.7601 (0.7419) grad_norm: 0.4596 (0.4761) time: 5.5745 data: 0.0001 max mem: 71357 +[09:33:22.454703] Epoch: [3] [6430/6500] lr: 0.000005 closs: 0.7906 (0.7420) grad_norm: 0.4588 (0.4761) time: 5.5730 data: 0.0001 max mem: 71357 +[09:34:18.165306] Epoch: [3] [6440/6500] lr: 0.000005 closs: 0.7978 (0.7421) grad_norm: 0.4588 (0.4762) time: 5.5707 data: 0.0001 max mem: 71357 +[09:35:13.814875] Epoch: [3] [6450/6500] lr: 0.000005 closs: 0.8280 (0.7422) grad_norm: 0.4473 (0.4763) time: 5.5679 data: 0.0001 max mem: 71357 +[09:36:09.491251] Epoch: [3] [6460/6500] lr: 0.000005 closs: 0.7346 (0.7422) grad_norm: 0.4588 (0.4768) time: 5.5662 data: 0.0001 max mem: 71357 +[09:37:05.187478] Epoch: [3] [6470/6500] lr: 0.000005 closs: 0.7346 (0.7421) grad_norm: 0.4473 (0.4767) time: 5.5685 data: 0.0001 max mem: 71357 +[09:38:00.919941] Epoch: [3] [6480/6500] lr: 0.000005 closs: 0.6989 (0.7421) grad_norm: 0.4163 (0.4766) time: 5.5713 data: 0.0001 max mem: 71357 +[09:38:56.716451] Epoch: [3] [6490/6500] lr: 0.000005 closs: 0.7505 (0.7422) grad_norm: 0.4275 (0.4767) time: 5.5763 data: 0.0001 max mem: 71357 +[09:39:47.308149] Epoch: [3] Total time: 10:03:39 +[09:39:47.344265] Averaged stats: lr: 0.000005 closs: 0.7505 (0.7420) grad_norm: 0.4643 (0.4769) +[09:39:47.504858] model saved +[09:39:48.501829] optimizer saved +[09:39:48.502389] other rank-common saved +[09:39:48.505739] rank-specific saved +[09:39:48.505904] Training time 1 day, 16:16:17 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..b38695565af72457e9f05c9e4ca22d0b989e9a8c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:313f5e81fea27d147cde97e00126a7f8601b23525257ebf7458c92eecf0cc970 +size 3346363 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..d4696f71df544953bf208fff77e98e6459d6f009 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11c2c3bf6d2f02c6c58b38aea73b6f05069f5cb244a15495d6d0fb38005daee7 +size 13213149 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..4f1c0fc1c5869d93ad63691f8b563ddfc94a491e --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bf124baa9c1335af5aea4800b2ecc4751c23fe1e00b1039e65300f3a8a604df +size 1687 diff --git 
a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00006-of-00008.pth new file mode 100644 index 
0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch0/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..56aae264f8bb29bb559083b58986da1e9fb59b05 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cc391cf566e6ffa902c1bce14c8e49dfd776d4a477e9db68ddb38586324df74 +size 3346363 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..3c80900041c5cb481f436a046d4228a25f84f81c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0c008837893b85ed8ec5956d6325baad2bbb37f2ff9292343f070f45c5a214e +size 13213149 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..5aedb034e6703bb173a5809ba3841b4aec77b84f --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66c2800e6ca30ebd241d0f9e1eb878c8cefbe4b64abe9193d161a46e70ab4a7f +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch1/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git 
a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..dc4d43c9d981f1347aacc56e7915534f880e916e --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15fc9177852922041cbd21983bf3035d7a369572d47f5d3c59a95a96157645a6 +size 3346363 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..e25b2de607363f557126bbe17c77377ff503e06d --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.optimizer.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba98b1d4711d2593bda512d80ff5dde0f7b0f59279b684d03bb9d176d451d23c +size 13213149 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..8dee9756db1f953b5909c16e290ec296695d713e --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:345734640b889b71c244859184f43ef9df7cdc2bb5b54df1b067c3eb5926cad0 +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00003-of-00008.pth new file mode 100644 
index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch2/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.model.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.model.pth new file mode 100644 index 0000000000000000000000000000000000000000..c41978c5774d772f60e84e9353d71223f4fe4cb1 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.model.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e603f33e4d2bb53c22e63aec2ab119a4eb5fa43229f5378fd59d317b09db95e +size 3346363 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.optimizer.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.optimizer.pth new file mode 100644 index 0000000000000000000000000000000000000000..001cee0a11609de544c4911f87ef6c06adfff2e6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.optimizer.pth @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:796eb49d927deacfcebdb9e6a7f7410c3c039a8d122dfb050a99bb689f20f1aa +size 13213149 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.other.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.other.pth new file mode 100644 index 0000000000000000000000000000000000000000..3eb817d0ed9c35f2a5b4064f75035754d00a8e8a --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/consolidated.00-of-01.other.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80dc16b24ea013668c3f644439b10bf3c06724b5ab5982e34f615f5742cbffc9 +size 1687 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00000-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00000-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..52b71af1a9ce3ed182e1185cac54dc42f12a5fb6 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00000-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2932635da1a4de71c34aa8fcbcba91dfb0ac1ddc7859f8f87280546b7e786a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00001-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00001-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..20d239dfd49c5dfac4b0e9262df10a199c383e22 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00001-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88973b3c418b507bcde1467ec3902218b83d95fe4e022aca11b09c3f86cde7ac +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00002-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00002-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..44d15a9615f46731b4d1be2302ed11c2e22c5889 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00002-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eee15a274ea5f27c0360c85bd878d6e0f2072076cae26311c52798f7d836643a +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00003-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00003-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..c02a05b764b46a3e2ea7f50bab8449d0128a76d9 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00003-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61651d612914693bf494e5609388a6f9239090c45b3abcc9c4fa5c7a814c7a7e +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00004-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00004-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9bdc7b095dfaed08b7ebb500fa76f2562a86c2c --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00004-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cd6ad8f3d2bcfa25c957717227143e64751970f9b367b28b205a5084a8f476a +size 537 diff --git 
a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00005-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00005-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..93470a083d27c6e079dfb735e0a4fa8b7f6b0249 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00005-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf049e1944a87da00e6860d1884d0eb312dc5a389a832a4e76a582493ec26972 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00006-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00006-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e3ca8659ab49b709193c41ea8923e9f7217d09 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00006-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8174e84cf8a0553f73baf42bd13d65974b85944a834fa7f75433c0be044e2f04 +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00007-of-00008.pth b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00007-of-00008.pth new file mode 100644 index 0000000000000000000000000000000000000000..6530350b10d02e206562d6d0b29a46a26d742899 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/epoch3/rank-specific-00007-of-00008.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6f9198ace60febfc0ad5d85588a3d4021799762f521c1a6b87adc99c8889ce +size 537 diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/log.txt b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/log.txt new file mode 100644 index 0000000000000000000000000000000000000000..abb77c76056fe3af1b2bf4deef0701efd4b3bf43 --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/log.txt @@ -0,0 +1,4 @@ +{"train_lr": 2.49692118226601e-05, "train_grad_norm": 0.5515950994171532, "train_closs": 0.9724007532174878, "epoch": 0, "val_lr": 2.49692118226601e-05, "val_grad_norm": 0.5515950994171532, "val_closs": 0.9724007532174878} +{"train_lr": 4.6114274981403966e-05, "train_grad_norm": 0.4230820783253374, "train_closs": 0.9194491385009709, "epoch": 1, "val_lr": 4.6114274981403966e-05, "val_grad_norm": 0.4230820783253374, "val_closs": 0.9194491385009709} +{"train_lr": 2.751385467980297e-05, "train_grad_norm": 0.43375423697415244, "train_closs": 0.9097992562197993, "epoch": 2, "val_lr": 2.751385467980297e-05, "val_grad_norm": 0.43375423697415244, "val_closs": 0.9097992562197993} +{"train_lr": 8.899579698398978e-06, "train_grad_norm": 0.4319171491881897, "train_closs": 0.9059112436445476, "epoch": 3, "val_lr": 8.899579698398978e-06, "val_grad_norm": 0.4319171491881897, "val_closs": 0.9059112436445476} diff --git a/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/output.log b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/output.log new file mode 100644 index 0000000000000000000000000000000000000000..2d61df6ec6a44103476fd8b35eaf94407dd1c73e --- /dev/null +++ b/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B/output.log @@ -0,0 +1,591 @@ +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+***************************************** +| distributed init (rank 2): env://, gpu 2 +| distributed init (rank 4): env://, gpu 4 +| distributed init (rank 6): env://, gpu 6 +| distributed init (rank 1): env://, gpu 1 +| distributed init (rank 7): env://, gpu 7 +| distributed init (rank 0): env://, gpu 0 +| distributed init (rank 5): env://, gpu 5 +| distributed init (rank 3): env://, gpu 3 +[01:06:34.716609] > initializing model parallel with size 1 +[01:06:34.716724] > initializing ddp with size 8 +[01:06:34.716732] > initializing pipeline with size 1 +[01:06:34.874304] job dir: /data/liuyijiang/mmlab/krisliu/LLaMA2-Accessory/accessory +[01:06:34.874416] Namespace(batch_size=8, +accum_iter=1, +llama_type='llama_peft', +llama_config=['../checkpoints/llama2/Llama-2-7b/params.json'], +no_visual=True, +tokenizer_path='../checkpoints/llama2/Llama-2-7b/tokenizer.model', +pretrained_path='../checkpoints/llama2/Llama-2-7b/', +pretrained_type='meta_ori', +weight_decay=0.02, +lr=5e-05, +min_lr=5e-06, +epochs=4, +warmup_epochs=1.0, +clip_grad=2, +max_words=512, +dialog=False, +data_config='configs/data/finetune/sg/alpaca.yaml', +output_dir='output/finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B', +log_dir='./output_dir', +save_interval=1, +only_save_trainable=True, +device='cuda', +seed=0, +resume='', +num_workers=24, +pin_mem=True, +world_size=8, +local_rank=-1, +dist_on_itp=False, +dist_url='env://', +model_parallel_size=1, +data_parallel='sdp', +precision='bf16', +checkpointing=True, +quant=True, +rank=0, +gpu=0, +distributed=True, +dist_backend='nccl') +[01:06:34.875336] Start initialization. +[01:06:34.875369] ## Processing on RANK 0. +[01:06:34.885417] Model Args: + ModelArgs(dim=4096, n_layers=32, n_heads=32, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=512, lora_rank=-1, bias_tuning=True) +[01:07:21.595514] Model is Peft: True +[01:07:21.598774] Trainable parameter count : 1626112 (local rank), 1626112 (all). +[01:07:21.606589] ## Load pretrained from ../checkpoints/llama2/Llama-2-7b/ +[01:07:36.364995] ## Quantizing model to 4bit! 
+ Qunatization Process: 0%| | 0/391 [00:00<?, ?it/s] +[01:22:05.499269] Start training for 4 epochs +[01:22:05.503408] log_dir: ./output_dir +[01:22:09.419465] Epoch: [0] [0/812] lr: 0.000000 grad_norm: 1.5851 (1.5851) closs: 1.1100 (1.1100) time: 3.9149 data: 1.4914 max mem: 11698 +[01:22:20.379352] Epoch: [0] [10/812] lr: 0.000001 grad_norm: 1.6298 (1.5954) closs: 1.1139 (1.1233) time: 1.3522 data: 0.1358 max mem: 17563 +[01:22:31.537117] Epoch: [0] [20/812] lr: 0.000001 grad_norm: 1.5263 (1.5275) closs: 1.0717 (1.1034) time: 1.1058 data: 0.0002 max mem: 17563 +[01:22:42.269605] Epoch: [0] [30/812] lr: 0.000002 grad_norm: 1.5396 (1.5436) closs: 1.1138 (1.1221) time: 1.0944 data: 0.0002 max mem: 17563 +[01:22:53.398607] Epoch: [0] [40/812] lr: 0.000002 grad_norm: 1.5046 (1.5073) closs: 1.1306 (1.1303) time: 1.0930 data: 0.0002 max mem: 17563 +[01:23:04.182090] Epoch: [0] [50/812] lr: 0.000003 grad_norm: 1.3218 (1.4667) closs: 1.1306 (1.1331) time: 1.0956 data: 0.0002 max mem: 17563 +[01:23:14.952358] Epoch: [0] [60/812] lr: 0.000004 grad_norm: 1.2240 (1.4199) closs: 1.1418 (1.1299) time: 1.0776 data: 0.0002 max mem: 17563 +[01:23:25.936660] Epoch: [0] [70/812] lr: 0.000004 grad_norm: 1.1201 (1.3708) closs: 1.0912 (1.1343) time: 1.0877 data: 0.0002 max mem: 17563 +[01:23:36.708068] Epoch: [0] [80/812] lr: 0.000005 grad_norm: 1.0076 (1.3227) closs: 1.0998 (1.1334) time: 1.0877 data: 0.0002 max mem: 17563 +[01:23:47.591464] Epoch: [0] [90/812] lr: 0.000006 grad_norm: 0.9119 (1.2713) closs: 1.0616 (1.1231) time: 1.0827 data: 0.0002 max mem: 17563 +[01:23:58.446457] Epoch: [0] [100/812] lr: 0.000006 grad_norm: 0.7848 (1.2187) closs: 1.0525 (1.1199) time: 1.0868 data: 0.0002 max mem: 17563 +[01:24:09.227248] Epoch: [0] [110/812] lr: 0.000007 grad_norm: 0.6812 (1.1715) closs: 1.0394 (1.1090) time: 1.0817 data: 0.0002 max mem: 17563 +[01:24:20.398593] Epoch: [0] [120/812] lr: 0.000007 grad_norm: 0.6384 (1.1269) closs: 1.0384 (1.1100) time: 1.0975 data: 0.0002 max mem: 17563 +[01:24:31.192852] Epoch: [0] [130/812] lr: 0.000008 grad_norm: 0.6113 (1.0871) closs: 1.0841 (1.1071) time: 1.0982 data: 0.0002 max mem: 17563 +[01:24:41.965595] Epoch: [0] [140/812] lr: 0.000009 grad_norm: 0.5842 (1.0530) closs: 0.9775 (1.0988) time: 1.0783 data: 0.0002 max mem: 17563 +[01:24:52.911946] Epoch: [0] [150/812] lr: 0.000009 grad_norm: 0.6050 (1.0261) closs: 0.9885 (1.0948) time: 1.0859 data: 0.0002 max mem: 17563 +[01:25:03.677533] Epoch: [0] [160/812] lr: 0.000010 grad_norm: 0.6036 (1.0009) closs: 1.0495 (1.0920) time: 1.0855 data: 0.0002 max mem: 17563 +[01:25:14.583376] Epoch: [0] [170/812] lr: 0.000010 grad_norm: 0.6036 (0.9806) closs: 0.9975 (1.0849) time: 1.0835 data: 0.0002 max mem: 17563 +[01:25:25.336144] Epoch: [0] [180/812] lr: 0.000011 grad_norm: 0.5856 (0.9554) closs: 0.9262 (1.0760) time: 1.0829 data: 0.0002 max mem: 17563 +[01:25:36.082435] Epoch: [0] [190/812] lr: 0.000012 grad_norm: 0.4878 (0.9305) closs: 0.9326 (1.0696) time: 1.0749 data: 0.0002 max mem: 17563 +[01:25:47.041738] Epoch: [0] [200/812] lr: 0.000012 grad_norm: 0.4558 (0.9080) closs: 0.9832 (1.0663) time: 1.0852 data: 0.0002 max mem: 17563 +[01:25:57.816030] Epoch: [0] [210/812] lr: 0.000013 grad_norm: 0.4558 (0.8868) closs: 0.9928 (1.0618) time: 1.0866 data: 0.0002 max mem: 17563 +[01:26:08.707981] Epoch: [0] [220/812] lr: 0.000014 grad_norm: 0.4294 (0.8654) closs: 0.9756 (1.0584) time: 1.0832 data: 0.0002 max mem: 17563 +[01:26:19.569916] Epoch: [0] [230/812] lr: 0.000014 grad_norm: 0.4231 (0.8473) closs: 0.9284 (1.0518) time: 1.0876 data:
0.0002 max mem: 17563 +[01:26:30.341740] Epoch: [0] [240/812] lr: 0.000015 grad_norm: 0.4198 (0.8302) closs: 0.9145 (1.0479) time: 1.0816 data: 0.0002 max mem: 17563 +[01:26:41.347965] Epoch: [0] [250/812] lr: 0.000015 grad_norm: 0.4445 (0.8150) closs: 0.9541 (1.0449) time: 1.0888 data: 0.0002 max mem: 17563 +[01:26:52.141904] Epoch: [0] [260/812] lr: 0.000016 grad_norm: 0.3947 (0.7985) closs: 0.9527 (1.0411) time: 1.0899 data: 0.0002 max mem: 17563 +[01:27:02.920099] Epoch: [0] [270/812] lr: 0.000017 grad_norm: 0.3933 (0.7847) closs: 0.9685 (1.0398) time: 1.0785 data: 0.0002 max mem: 17563 +[01:27:13.865000] Epoch: [0] [280/812] lr: 0.000017 grad_norm: 0.3972 (0.7720) closs: 0.9685 (1.0365) time: 1.0861 data: 0.0002 max mem: 17563 +[01:27:24.627033] Epoch: [0] [290/812] lr: 0.000018 grad_norm: 0.3972 (0.7602) closs: 0.9590 (1.0331) time: 1.0853 data: 0.0002 max mem: 17563 +[01:27:35.600123] Epoch: [0] [300/812] lr: 0.000018 grad_norm: 0.4177 (0.7492) closs: 0.9565 (1.0304) time: 1.0867 data: 0.0002 max mem: 17563 +[01:27:46.357189] Epoch: [0] [310/812] lr: 0.000019 grad_norm: 0.4166 (0.7387) closs: 0.9678 (1.0295) time: 1.0864 data: 0.0002 max mem: 17563 +[01:27:57.128434] Epoch: [0] [320/812] lr: 0.000020 grad_norm: 0.4347 (0.7308) closs: 0.9678 (1.0290) time: 1.0763 data: 0.0002 max mem: 17563 +[01:28:08.112145] Epoch: [0] [330/812] lr: 0.000020 grad_norm: 0.4021 (0.7216) closs: 0.9325 (1.0248) time: 1.0877 data: 0.0002 max mem: 17563 +[01:28:18.879864] Epoch: [0] [340/812] lr: 0.000021 grad_norm: 0.4021 (0.7125) closs: 0.9178 (1.0215) time: 1.0875 data: 0.0002 max mem: 17563 +[01:28:29.776770] Epoch: [0] [350/812] lr: 0.000022 grad_norm: 0.3938 (0.7043) closs: 0.9432 (1.0223) time: 1.0832 data: 0.0002 max mem: 17563 +[01:28:40.599074] Epoch: [0] [360/812] lr: 0.000022 grad_norm: 0.4132 (0.6974) closs: 0.9773 (1.0202) time: 1.0859 data: 0.0002 max mem: 17563 +[01:28:51.342835] Epoch: [0] [370/812] lr: 0.000023 grad_norm: 0.4203 (0.6895) closs: 0.9131 (1.0175) time: 1.0782 data: 0.0002 max mem: 17563 +[01:29:02.284783] Epoch: [0] [380/812] lr: 0.000023 grad_norm: 0.3938 (0.6827) closs: 0.9131 (1.0160) time: 1.0842 data: 0.0002 max mem: 17563 +[01:29:13.036230] Epoch: [0] [390/812] lr: 0.000024 grad_norm: 0.4150 (0.6784) closs: 0.9211 (1.0143) time: 1.0846 data: 0.0002 max mem: 17563 +[01:29:23.775505] Epoch: [0] [400/812] lr: 0.000025 grad_norm: 0.4409 (0.6724) closs: 0.9129 (1.0127) time: 1.0745 data: 0.0002 max mem: 17563 +[01:29:34.715185] Epoch: [0] [410/812] lr: 0.000025 grad_norm: 0.4361 (0.6671) closs: 0.9129 (1.0112) time: 1.0839 data: 0.0002 max mem: 17563 +[01:29:45.495129] Epoch: [0] [420/812] lr: 0.000026 grad_norm: 0.4243 (0.6609) closs: 0.8944 (1.0089) time: 1.0859 data: 0.0002 max mem: 17563 +[01:29:56.420098] Epoch: [0] [430/812] lr: 0.000026 grad_norm: 0.4060 (0.6558) closs: 0.8753 (1.0060) time: 1.0852 data: 0.0002 max mem: 17563 +[01:30:07.187456] Epoch: [0] [440/812] lr: 0.000027 grad_norm: 0.4138 (0.6505) closs: 0.8546 (1.0032) time: 1.0845 data: 0.0002 max mem: 17563 +[01:30:17.943263] Epoch: [0] [450/812] lr: 0.000028 grad_norm: 0.4181 (0.6457) closs: 0.8741 (1.0003) time: 1.0761 data: 0.0002 max mem: 17563 +[01:30:28.879556] Epoch: [0] [460/812] lr: 0.000028 grad_norm: 0.4111 (0.6407) closs: 0.9011 (0.9989) time: 1.0845 data: 0.0002 max mem: 17563 +[01:30:39.634616] Epoch: [0] [470/812] lr: 0.000029 grad_norm: 0.4111 (0.6368) closs: 0.9094 (0.9983) time: 1.0845 data: 0.0002 max mem: 17563 +[01:30:50.485964] Epoch: [0] [480/812] lr: 0.000030 grad_norm: 0.4260 
(0.6326) closs: 0.9646 (0.9977) time: 1.0803 data: 0.0002 max mem: 17563 +[01:31:01.334825] Epoch: [0] [490/812] lr: 0.000030 grad_norm: 0.4045 (0.6274) closs: 0.9550 (0.9968) time: 1.0849 data: 0.0002 max mem: 17563 +[01:31:12.096791] Epoch: [0] [500/812] lr: 0.000031 grad_norm: 0.3751 (0.6228) closs: 0.9518 (0.9954) time: 1.0805 data: 0.0002 max mem: 17563 +[01:31:23.034985] Epoch: [0] [510/812] lr: 0.000031 grad_norm: 0.4087 (0.6203) closs: 0.9159 (0.9931) time: 1.0849 data: 0.0002 max mem: 17563 +[01:31:33.781764] Epoch: [0] [520/812] lr: 0.000032 grad_norm: 0.4179 (0.6162) closs: 0.8666 (0.9919) time: 1.0842 data: 0.0002 max mem: 17563 +[01:31:44.525309] Epoch: [0] [530/812] lr: 0.000033 grad_norm: 0.4241 (0.6133) closs: 0.9274 (0.9909) time: 1.0744 data: 0.0002 max mem: 17563 +[01:31:55.482190] Epoch: [0] [540/812] lr: 0.000033 grad_norm: 0.4228 (0.6100) closs: 0.9681 (0.9901) time: 1.0849 data: 0.0002 max mem: 17563 +[01:32:06.249000] Epoch: [0] [550/812] lr: 0.000034 grad_norm: 0.4076 (0.6068) closs: 0.9330 (0.9886) time: 1.0861 data: 0.0002 max mem: 17563 +[01:32:17.186588] Epoch: [0] [560/812] lr: 0.000034 grad_norm: 0.4216 (0.6043) closs: 0.8816 (0.9879) time: 1.0851 data: 0.0002 max mem: 17563 +[01:32:27.934998] Epoch: [0] [570/812] lr: 0.000035 grad_norm: 0.4354 (0.6016) closs: 0.9062 (0.9869) time: 1.0842 data: 0.0002 max mem: 17563 +[01:32:38.690637] Epoch: [0] [580/812] lr: 0.000036 grad_norm: 0.4128 (0.5987) closs: 0.9111 (0.9867) time: 1.0751 data: 0.0002 max mem: 17563 +[01:32:49.647407] Epoch: [0] [590/812] lr: 0.000036 grad_norm: 0.4128 (0.5956) closs: 0.9117 (0.9857) time: 1.0855 data: 0.0002 max mem: 17563 +[01:33:00.403073] Epoch: [0] [600/812] lr: 0.000037 grad_norm: 0.4093 (0.5925) closs: 0.9308 (0.9848) time: 1.0856 data: 0.0002 max mem: 17563 +[01:33:11.264501] Epoch: [0] [610/812] lr: 0.000038 grad_norm: 0.4305 (0.5901) closs: 0.8741 (0.9826) time: 1.0808 data: 0.0003 max mem: 17563 +[01:33:22.096122] Epoch: [0] [620/812] lr: 0.000038 grad_norm: 0.4412 (0.5876) closs: 0.8519 (0.9818) time: 1.0846 data: 0.0003 max mem: 17563 +[01:33:32.848945] Epoch: [0] [630/812] lr: 0.000039 grad_norm: 0.4384 (0.5857) closs: 0.9008 (0.9806) time: 1.0792 data: 0.0002 max mem: 17563 +[01:33:43.792068] Epoch: [0] [640/812] lr: 0.000039 grad_norm: 0.4455 (0.5836) closs: 0.9237 (0.9800) time: 1.0847 data: 0.0002 max mem: 17563 +[01:33:54.556290] Epoch: [0] [650/812] lr: 0.000040 grad_norm: 0.4246 (0.5814) closs: 0.9106 (0.9786) time: 1.0853 data: 0.0002 max mem: 17563 +[01:34:05.298115] Epoch: [0] [660/812] lr: 0.000041 grad_norm: 0.4145 (0.5796) closs: 0.8797 (0.9780) time: 1.0752 data: 0.0002 max mem: 17563 +[01:34:16.274488] Epoch: [0] [670/812] lr: 0.000041 grad_norm: 0.3915 (0.5766) closs: 0.8957 (0.9773) time: 1.0858 data: 0.0002 max mem: 17563 +[01:34:27.009081] Epoch: [0] [680/812] lr: 0.000042 grad_norm: 0.3898 (0.5742) closs: 0.9229 (0.9768) time: 1.0855 data: 0.0002 max mem: 17563 +[01:34:37.956230] Epoch: [0] [690/812] lr: 0.000042 grad_norm: 0.4105 (0.5721) closs: 0.9277 (0.9763) time: 1.0840 data: 0.0002 max mem: 17563 +[01:34:48.711827] Epoch: [0] [700/812] lr: 0.000043 grad_norm: 0.4030 (0.5699) closs: 0.9277 (0.9760) time: 1.0851 data: 0.0002 max mem: 17563 +[01:34:59.483778] Epoch: [0] [710/812] lr: 0.000044 grad_norm: 0.3941 (0.5679) closs: 0.8992 (0.9747) time: 1.0763 data: 0.0002 max mem: 17563 +[01:35:10.426566] Epoch: [0] [720/812] lr: 0.000044 grad_norm: 0.4368 (0.5662) closs: 0.8889 (0.9741) time: 1.0857 data: 0.0002 max mem: 17563 +[01:35:21.173421] Epoch: 
[0] [730/812] lr: 0.000045 grad_norm: 0.4336 (0.5642) closs: 0.9001 (0.9732) time: 1.0844 data: 0.0002 max mem: 17563 +[01:35:32.063229] Epoch: [0] [740/812] lr: 0.000046 grad_norm: 0.4336 (0.5623) closs: 0.8885 (0.9718) time: 1.0818 data: 0.0002 max mem: 17563 +[01:35:42.886626] Epoch: [0] [750/812] lr: 0.000046 grad_norm: 0.4241 (0.5608) closs: 0.9257 (0.9726) time: 1.0856 data: 0.0002 max mem: 17563 +[01:35:53.664183] Epoch: [0] [760/812] lr: 0.000047 grad_norm: 0.4208 (0.5596) closs: 0.9473 (0.9717) time: 1.0800 data: 0.0002 max mem: 17563 +[01:36:04.654574] Epoch: [0] [770/812] lr: 0.000047 grad_norm: 0.4265 (0.5578) closs: 0.9173 (0.9707) time: 1.0883 data: 0.0002 max mem: 17563 +[01:36:15.394778] Epoch: [0] [780/812] lr: 0.000048 grad_norm: 0.4275 (0.5565) closs: 0.9173 (0.9703) time: 1.0865 data: 0.0002 max mem: 17563 +[01:36:26.176020] Epoch: [0] [790/812] lr: 0.000049 grad_norm: 0.4331 (0.5550) closs: 0.8900 (0.9693) time: 1.0760 data: 0.0002 max mem: 17563 +[01:36:37.110017] Epoch: [0] [800/812] lr: 0.000049 grad_norm: 0.4156 (0.5532) closs: 0.8900 (0.9687) time: 1.0857 data: 0.0002 max mem: 17563 +[01:36:47.887354] Epoch: [0] [810/812] lr: 0.000050 grad_norm: 0.3976 (0.5518) closs: 0.8777 (0.9673) time: 1.0855 data: 0.0002 max mem: 17563 +[01:36:49.163189] Epoch: [0] Total time: 0:14:43 +[01:36:49.176568] Averaged stats: lr: 0.000050 grad_norm: 0.3924 (0.5516) closs: 0.8777 (0.9724) +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. 
``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[01:36:49.242844] model saved +/root/anaconda3/envs/accessory/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:1110: UserWarning: ``FullyShardedDataParallel.full_optim_state_dict``is being deprecated and is replaced by ``FullyShardedDataParallel.optim_state_dict``. ``FullyShardedDataParallel.full_optim_state_dict`` may be removed after PyTorch 2.2. + warnings.warn( +[01:36:49.850475] optimizer saved +[01:36:49.850895] other rank-common saved +[01:36:49.852471] rank-specific saved +[01:36:49.856696] log_dir: ./output_dir +[01:36:52.189948] Epoch: [1] [0/812] lr: 0.000050 grad_norm: 0.4764 (0.4764) closs: 0.7142 (0.7142) time: 2.3324 data: 1.2396 max mem: 17563 +[01:37:03.067048] Epoch: [1] [10/812] lr: 0.000050 grad_norm: 0.4408 (0.4522) closs: 0.8549 (0.8482) time: 1.2008 data: 0.1129 max mem: 17563 +[01:37:13.868785] Epoch: [1] [20/812] lr: 0.000050 grad_norm: 0.4408 (0.4571) closs: 0.8781 (0.9186) time: 1.0839 data: 0.0002 max mem: 17563 +[01:37:24.639698] Epoch: [1] [30/812] lr: 0.000050 grad_norm: 0.4399 (0.4536) closs: 0.9031 (0.9141) time: 1.0786 data: 0.0002 max mem: 17563 +[01:37:35.491675] Epoch: [1] [40/812] lr: 0.000050 grad_norm: 0.4093 (0.4483) closs: 0.9425 (0.9326) time: 1.0811 data: 0.0002 max mem: 17563 +[01:37:46.229510] Epoch: [1] [50/812] lr: 0.000050 grad_norm: 0.3888 (0.4407) closs: 0.9432 (0.9284) time: 1.0794 data: 0.0002 max mem: 17563 +[01:37:56.982923] Epoch: [1] [60/812] lr: 0.000050 grad_norm: 0.3976 (0.4343) closs: 0.9081 (0.9261) time: 1.0745 data: 0.0002 max mem: 17563 +[01:38:07.884887] Epoch: [1] [70/812] lr: 0.000050 grad_norm: 0.3992 (0.4349) closs: 0.9186 (0.9288) time: 1.0827 data: 0.0002 max mem: 17563 +[01:38:18.645889] Epoch: [1] [80/812] lr: 0.000050 grad_norm: 0.4137 (0.4339) closs: 0.9349 (0.9248) time: 1.0831 data: 0.0002 max mem: 17563 +[01:38:29.481375] Epoch: [1] [90/812] lr: 0.000050 grad_norm: 0.3875 (0.4270) closs: 0.9349 (0.9295) time: 1.0797 data: 0.0002 max mem: 17563 +[01:38:40.338975] Epoch: [1] [100/812] lr: 0.000050 grad_norm: 0.3933 (0.4267) closs: 0.9616 (0.9318) time: 1.0846 data: 0.0002 max mem: 17563 +[01:38:51.268582] Epoch: [1] [110/812] lr: 0.000050 grad_norm: 0.4030 (0.4267) closs: 0.9432 (0.9282) time: 1.0893 data: 0.0002 max mem: 17563 +[01:39:02.289923] Epoch: [1] [120/812] lr: 0.000050 grad_norm: 0.4101 (0.4278) closs: 0.9432 (0.9294) time: 1.0975 data: 0.0002 max mem: 17563 +[01:39:13.138448] Epoch: [1] [130/812] lr: 0.000050 grad_norm: 0.4101 (0.4280) closs: 0.9284 (0.9292) time: 1.0934 data: 0.0002 max mem: 17563 +[01:39:24.044720] Epoch: [1] [140/812] lr: 0.000050 grad_norm: 0.4075 (0.4287) closs: 0.9221 (0.9280) time: 1.0877 data: 0.0002 max mem: 17563 +[01:39:34.896000] Epoch: [1] [150/812] lr: 0.000050 grad_norm: 0.4166 (0.4289) closs: 0.9403 (0.9278) time: 1.0878 data: 0.0002 max mem: 17563 +[01:39:45.869857] Epoch: [1] [160/812] lr: 0.000050 grad_norm: 0.4266 (0.4286) closs: 0.9551 (0.9301) time: 1.0912 data: 0.0002 max mem: 17563 +[01:39:56.708956] Epoch: [1] [170/812] lr: 0.000049 grad_norm: 0.4038 (0.4293) 
closs: 0.9698 (0.9322) time: 1.0906 data: 0.0002 max mem: 17563 +[01:40:07.568446] Epoch: [1] [180/812] lr: 0.000049 grad_norm: 0.3849 (0.4277) closs: 0.9374 (0.9326) time: 1.0849 data: 0.0003 max mem: 17563 +[01:40:18.500307] Epoch: [1] [190/812] lr: 0.000049 grad_norm: 0.3905 (0.4259) closs: 0.8962 (0.9308) time: 1.0895 data: 0.0003 max mem: 17563 +[01:40:29.409889] Epoch: [1] [200/812] lr: 0.000049 grad_norm: 0.3924 (0.4243) closs: 0.9011 (0.9293) time: 1.0920 data: 0.0002 max mem: 17563 +[01:40:40.336146] Epoch: [1] [210/812] lr: 0.000049 grad_norm: 0.3986 (0.4233) closs: 0.9540 (0.9315) time: 1.0917 data: 0.0002 max mem: 17563 +[01:40:51.171698] Epoch: [1] [220/812] lr: 0.000049 grad_norm: 0.3885 (0.4233) closs: 0.9565 (0.9301) time: 1.0880 data: 0.0002 max mem: 17563 +[01:41:02.091692] Epoch: [1] [230/812] lr: 0.000049 grad_norm: 0.3872 (0.4221) closs: 0.8715 (0.9287) time: 1.0877 data: 0.0002 max mem: 17563 +[01:41:13.045652] Epoch: [1] [240/812] lr: 0.000049 grad_norm: 0.3892 (0.4213) closs: 0.9001 (0.9298) time: 1.0936 data: 0.0002 max mem: 17563 +[01:41:24.014771] Epoch: [1] [250/812] lr: 0.000049 grad_norm: 0.4036 (0.4215) closs: 0.9496 (0.9289) time: 1.0961 data: 0.0002 max mem: 17563 +[01:41:34.910457] Epoch: [1] [260/812] lr: 0.000049 grad_norm: 0.4114 (0.4219) closs: 0.9459 (0.9293) time: 1.0932 data: 0.0002 max mem: 17563 +[01:41:45.860800] Epoch: [1] [270/812] lr: 0.000049 grad_norm: 0.4219 (0.4221) closs: 0.9364 (0.9278) time: 1.0922 data: 0.0002 max mem: 17563 +[01:41:56.732678] Epoch: [1] [280/812] lr: 0.000049 grad_norm: 0.4219 (0.4224) closs: 0.9236 (0.9279) time: 1.0910 data: 0.0002 max mem: 17563 +[01:42:07.667184] Epoch: [1] [290/812] lr: 0.000048 grad_norm: 0.4094 (0.4220) closs: 0.9319 (0.9285) time: 1.0902 data: 0.0002 max mem: 17563 +[01:42:18.499974] Epoch: [1] [300/812] lr: 0.000048 grad_norm: 0.4094 (0.4215) closs: 0.9029 (0.9271) time: 1.0883 data: 0.0002 max mem: 17563 +[01:42:29.390912] Epoch: [1] [310/812] lr: 0.000048 grad_norm: 0.4156 (0.4225) closs: 0.8847 (0.9262) time: 1.0861 data: 0.0002 max mem: 17563 +[01:42:40.312982] Epoch: [1] [320/812] lr: 0.000048 grad_norm: 0.4155 (0.4227) closs: 0.8956 (0.9267) time: 1.0906 data: 0.0002 max mem: 17563 +[01:42:51.239384] Epoch: [1] [330/812] lr: 0.000048 grad_norm: 0.3998 (0.4223) closs: 0.8956 (0.9259) time: 1.0923 data: 0.0002 max mem: 17563 +[01:43:02.180532] Epoch: [1] [340/812] lr: 0.000048 grad_norm: 0.4095 (0.4231) closs: 0.8600 (0.9239) time: 1.0933 data: 0.0002 max mem: 17563 +[01:43:13.068593] Epoch: [1] [350/812] lr: 0.000048 grad_norm: 0.4290 (0.4233) closs: 0.8778 (0.9234) time: 1.0914 data: 0.0002 max mem: 17563 +[01:43:23.907152] Epoch: [1] [360/812] lr: 0.000048 grad_norm: 0.4217 (0.4236) closs: 0.8525 (0.9212) time: 1.0863 data: 0.0002 max mem: 17563 +[01:43:34.848401] Epoch: [1] [370/812] lr: 0.000047 grad_norm: 0.4361 (0.4246) closs: 0.8350 (0.9202) time: 1.0889 data: 0.0002 max mem: 17563 +[01:43:45.784776] Epoch: [1] [380/812] lr: 0.000047 grad_norm: 0.4361 (0.4240) closs: 0.8984 (0.9204) time: 1.0938 data: 0.0002 max mem: 17563 +[01:43:56.644693] Epoch: [1] [390/812] lr: 0.000047 grad_norm: 0.4039 (0.4239) closs: 0.9187 (0.9209) time: 1.0897 data: 0.0002 max mem: 17563 +[01:44:07.528074] Epoch: [1] [400/812] lr: 0.000047 grad_norm: 0.4019 (0.4235) closs: 0.9184 (0.9206) time: 1.0871 data: 0.0002 max mem: 17563 +[01:44:18.390371] Epoch: [1] [410/812] lr: 0.000047 grad_norm: 0.4019 (0.4239) closs: 0.9184 (0.9220) time: 1.0872 data: 0.0002 max mem: 17563 +[01:44:29.324907] Epoch: [1] 
[420/812] lr: 0.000047 grad_norm: 0.4286 (0.4237) closs: 0.9624 (0.9224) time: 1.0898 data: 0.0002 max mem: 17563 +[01:44:40.135621] Epoch: [1] [430/812] lr: 0.000047 grad_norm: 0.4213 (0.4255) closs: 0.9003 (0.9216) time: 1.0872 data: 0.0002 max mem: 17563 +[01:44:50.992615] Epoch: [1] [440/812] lr: 0.000046 grad_norm: 0.4268 (0.4257) closs: 0.9141 (0.9217) time: 1.0833 data: 0.0002 max mem: 17563 +[01:45:01.952874] Epoch: [1] [450/812] lr: 0.000046 grad_norm: 0.4268 (0.4260) closs: 0.9244 (0.9213) time: 1.0908 data: 0.0002 max mem: 17563 +[01:45:12.875704] Epoch: [1] [460/812] lr: 0.000046 grad_norm: 0.4164 (0.4255) closs: 0.8638 (0.9207) time: 1.0941 data: 0.0002 max mem: 17563 +[01:45:23.810878] Epoch: [1] [470/812] lr: 0.000046 grad_norm: 0.4022 (0.4252) closs: 0.8908 (0.9207) time: 1.0928 data: 0.0002 max mem: 17563 +[01:45:34.685211] Epoch: [1] [480/812] lr: 0.000046 grad_norm: 0.4066 (0.4260) closs: 0.9035 (0.9213) time: 1.0904 data: 0.0002 max mem: 17563 +[01:45:45.553666] Epoch: [1] [490/812] lr: 0.000046 grad_norm: 0.4311 (0.4269) closs: 0.9485 (0.9221) time: 1.0871 data: 0.0002 max mem: 17563 +[01:45:56.494991] Epoch: [1] [500/812] lr: 0.000045 grad_norm: 0.4354 (0.4271) closs: 0.9109 (0.9218) time: 1.0904 data: 0.0002 max mem: 17563 +[01:46:07.415542] Epoch: [1] [510/812] lr: 0.000045 grad_norm: 0.3986 (0.4267) closs: 0.8969 (0.9214) time: 1.0930 data: 0.0002 max mem: 17563 +[01:46:18.280334] Epoch: [1] [520/812] lr: 0.000045 grad_norm: 0.4179 (0.4272) closs: 0.9120 (0.9214) time: 1.0892 data: 0.0002 max mem: 17563 +[01:46:29.242265] Epoch: [1] [530/812] lr: 0.000045 grad_norm: 0.4093 (0.4266) closs: 0.9045 (0.9215) time: 1.0913 data: 0.0002 max mem: 17563 +[01:46:40.097372] Epoch: [1] [540/812] lr: 0.000045 grad_norm: 0.4093 (0.4266) closs: 0.8894 (0.9216) time: 1.0908 data: 0.0002 max mem: 17563 +[01:46:51.082535] Epoch: [1] [550/812] lr: 0.000045 grad_norm: 0.3950 (0.4261) closs: 0.8833 (0.9210) time: 1.0919 data: 0.0002 max mem: 17563 +[01:47:01.956871] Epoch: [1] [560/812] lr: 0.000044 grad_norm: 0.3946 (0.4259) closs: 0.9061 (0.9212) time: 1.0929 data: 0.0002 max mem: 17563 +[01:47:12.829557] Epoch: [1] [570/812] lr: 0.000044 grad_norm: 0.4047 (0.4257) closs: 0.9278 (0.9216) time: 1.0873 data: 0.0002 max mem: 17563 +[01:47:23.771261] Epoch: [1] [580/812] lr: 0.000044 grad_norm: 0.4115 (0.4255) closs: 0.9212 (0.9207) time: 1.0906 data: 0.0002 max mem: 17563 +[01:47:34.689636] Epoch: [1] [590/812] lr: 0.000044 grad_norm: 0.4115 (0.4260) closs: 0.9243 (0.9210) time: 1.0929 data: 0.0002 max mem: 17563 +[01:47:45.757877] Epoch: [1] [600/812] lr: 0.000044 grad_norm: 0.3990 (0.4256) closs: 0.9293 (0.9212) time: 1.0993 data: 0.0002 max mem: 17563 +[01:47:56.585342] Epoch: [1] [610/812] lr: 0.000043 grad_norm: 0.3955 (0.4253) closs: 0.9080 (0.9213) time: 1.0947 data: 0.0002 max mem: 17563 +[01:48:07.414290] Epoch: [1] [620/812] lr: 0.000043 grad_norm: 0.4127 (0.4249) closs: 0.9134 (0.9214) time: 1.0828 data: 0.0002 max mem: 17563 +[01:48:18.313386] Epoch: [1] [630/812] lr: 0.000043 grad_norm: 0.3864 (0.4243) closs: 0.9365 (0.9217) time: 1.0863 data: 0.0002 max mem: 17563 +[01:48:29.236580] Epoch: [1] [640/812] lr: 0.000043 grad_norm: 0.3864 (0.4241) closs: 0.9075 (0.9217) time: 1.0910 data: 0.0002 max mem: 17563 +[01:48:40.090161] Epoch: [1] [650/812] lr: 0.000043 grad_norm: 0.3873 (0.4239) closs: 0.8929 (0.9215) time: 1.0888 data: 0.0002 max mem: 17563 +[01:48:51.000883] Epoch: [1] [660/812] lr: 0.000042 grad_norm: 0.3823 (0.4236) closs: 0.9560 (0.9225) time: 1.0881 data: 0.0002 
max mem: 17563 +[01:49:01.838795] Epoch: [1] [670/812] lr: 0.000042 grad_norm: 0.3860 (0.4235) closs: 0.9866 (0.9232) time: 1.0874 data: 0.0002 max mem: 17563 +[01:49:12.727690] Epoch: [1] [680/812] lr: 0.000042 grad_norm: 0.3983 (0.4238) closs: 0.9759 (0.9238) time: 1.0863 data: 0.0002 max mem: 17563 +[01:49:23.555512] Epoch: [1] [690/812] lr: 0.000042 grad_norm: 0.3988 (0.4241) closs: 0.9759 (0.9243) time: 1.0858 data: 0.0002 max mem: 17563 +[01:49:34.420950] Epoch: [1] [700/812] lr: 0.000041 grad_norm: 0.4054 (0.4239) closs: 0.9389 (0.9243) time: 1.0846 data: 0.0002 max mem: 17563 +[01:49:45.387096] Epoch: [1] [710/812] lr: 0.000041 grad_norm: 0.4068 (0.4237) closs: 0.8852 (0.9239) time: 1.0915 data: 0.0002 max mem: 17563 +[01:49:56.367505] Epoch: [1] [720/812] lr: 0.000041 grad_norm: 0.4067 (0.4235) closs: 0.8749 (0.9235) time: 1.0972 data: 0.0004 max mem: 17563 +[01:50:07.313072] Epoch: [1] [730/812] lr: 0.000041 grad_norm: 0.4023 (0.4233) closs: 0.9136 (0.9235) time: 1.0962 data: 0.0003 max mem: 17563 +[01:50:18.143294] Epoch: [1] [740/812] lr: 0.000041 grad_norm: 0.4113 (0.4235) closs: 0.9239 (0.9234) time: 1.0887 data: 0.0002 max mem: 17563 +[01:50:28.998868] Epoch: [1] [750/812] lr: 0.000040 grad_norm: 0.4113 (0.4232) closs: 0.8806 (0.9231) time: 1.0842 data: 0.0002 max mem: 17563 +[01:50:39.933717] Epoch: [1] [760/812] lr: 0.000040 grad_norm: 0.3932 (0.4227) closs: 0.8906 (0.9235) time: 1.0895 data: 0.0002 max mem: 17563 +[01:50:50.842088] Epoch: [1] [770/812] lr: 0.000040 grad_norm: 0.3787 (0.4221) closs: 0.9384 (0.9230) time: 1.0921 data: 0.0002 max mem: 17563 +[01:51:01.700960] Epoch: [1] [780/812] lr: 0.000040 grad_norm: 0.3964 (0.4226) closs: 0.8583 (0.9227) time: 1.0883 data: 0.0002 max mem: 17563 +[01:51:12.543296] Epoch: [1] [790/812] lr: 0.000039 grad_norm: 0.4172 (0.4225) closs: 0.9070 (0.9229) time: 1.0850 data: 0.0001 max mem: 17563 +[01:51:23.433881] Epoch: [1] [800/812] lr: 0.000039 grad_norm: 0.4172 (0.4227) closs: 0.9150 (0.9225) time: 1.0866 data: 0.0001 max mem: 17563 +[01:51:34.266174] Epoch: [1] [810/812] lr: 0.000039 grad_norm: 0.4238 (0.4232) closs: 0.9150 (0.9225) time: 1.0861 data: 0.0001 max mem: 17563 +[01:51:35.625120] Epoch: [1] Total time: 0:14:45 +[01:51:35.641527] Averaged stats: lr: 0.000039 grad_norm: 0.4238 (0.4231) closs: 0.9164 (0.9194) +[01:51:35.700824] model saved +[01:51:36.391314] optimizer saved +[01:51:36.391767] other rank-common saved +[01:51:36.393341] rank-specific saved +[01:51:36.397531] log_dir: ./output_dir +[01:51:38.765850] Epoch: [2] [0/812] lr: 0.000039 grad_norm: 0.4308 (0.4308) closs: 0.8720 (0.8720) time: 2.3674 data: 1.2568 max mem: 17563 +[01:51:49.643979] Epoch: [2] [10/812] lr: 0.000038 grad_norm: 0.4273 (0.4116) closs: 0.8873 (0.9060) time: 1.2041 data: 0.1144 max mem: 17563 +[01:52:00.424263] Epoch: [2] [20/812] lr: 0.000038 grad_norm: 0.4013 (0.4131) closs: 0.8921 (0.8999) time: 1.0829 data: 0.0002 max mem: 17563 +[01:52:11.196775] Epoch: [2] [30/812] lr: 0.000038 grad_norm: 0.4013 (0.4098) closs: 0.8966 (0.9069) time: 1.0776 data: 0.0002 max mem: 17563 +[01:52:22.046734] Epoch: [2] [40/812] lr: 0.000038 grad_norm: 0.4087 (0.4220) closs: 0.9032 (0.9065) time: 1.0811 data: 0.0002 max mem: 17563 +[01:52:32.833783] Epoch: [2] [50/812] lr: 0.000037 grad_norm: 0.4287 (0.4248) closs: 0.9085 (0.9040) time: 1.0818 data: 0.0002 max mem: 17563 +[01:52:43.693301] Epoch: [2] [60/812] lr: 0.000037 grad_norm: 0.4213 (0.4241) closs: 0.9320 (0.9076) time: 1.0823 data: 0.0002 max mem: 17563 +[01:52:54.454634] Epoch: [2] [70/812] lr: 
0.000037 grad_norm: 0.4030 (0.4262) closs: 0.9246 (0.9125) time: 1.0810 data: 0.0002 max mem: 17563 +[01:53:05.308252] Epoch: [2] [80/812] lr: 0.000037 grad_norm: 0.4046 (0.4238) closs: 0.9040 (0.9099) time: 1.0807 data: 0.0002 max mem: 17563 +[01:53:16.131876] Epoch: [2] [90/812] lr: 0.000036 grad_norm: 0.4062 (0.4232) closs: 0.8718 (0.9071) time: 1.0838 data: 0.0002 max mem: 17563 +[01:53:27.101904] Epoch: [2] [100/812] lr: 0.000036 grad_norm: 0.4088 (0.4233) closs: 0.8718 (0.9085) time: 1.0896 data: 0.0002 max mem: 17563 +[01:53:37.951020] Epoch: [2] [110/812] lr: 0.000036 grad_norm: 0.4034 (0.4227) closs: 0.8835 (0.9034) time: 1.0909 data: 0.0002 max mem: 17563 +[01:53:48.791842] Epoch: [2] [120/812] lr: 0.000036 grad_norm: 0.3949 (0.4206) closs: 0.8952 (0.9047) time: 1.0844 data: 0.0002 max mem: 17563 +[01:53:59.632784] Epoch: [2] [130/812] lr: 0.000035 grad_norm: 0.3865 (0.4186) closs: 0.8820 (0.9033) time: 1.0840 data: 0.0002 max mem: 17563 +[01:54:10.462481] Epoch: [2] [140/812] lr: 0.000035 grad_norm: 0.3865 (0.4176) closs: 0.8681 (0.9026) time: 1.0835 data: 0.0002 max mem: 17563 +[01:54:21.299678] Epoch: [2] [150/812] lr: 0.000035 grad_norm: 0.3883 (0.4179) closs: 0.8821 (0.9037) time: 1.0833 data: 0.0002 max mem: 17563 +[01:54:32.133168] Epoch: [2] [160/812] lr: 0.000035 grad_norm: 0.3883 (0.4174) closs: 0.9362 (0.9048) time: 1.0835 data: 0.0002 max mem: 17563 +[01:54:43.049102] Epoch: [2] [170/812] lr: 0.000034 grad_norm: 0.3878 (0.4167) closs: 0.8898 (0.9036) time: 1.0874 data: 0.0002 max mem: 17563 +[01:54:53.915941] Epoch: [2] [180/812] lr: 0.000034 grad_norm: 0.4010 (0.4158) closs: 0.8534 (0.9029) time: 1.0891 data: 0.0002 max mem: 17563 +[01:55:04.769991] Epoch: [2] [190/812] lr: 0.000034 grad_norm: 0.4093 (0.4189) closs: 0.9012 (0.9044) time: 1.0860 data: 0.0002 max mem: 17563 +[01:55:15.615925] Epoch: [2] [200/812] lr: 0.000033 grad_norm: 0.4076 (0.4203) closs: 0.9158 (0.9061) time: 1.0849 data: 0.0002 max mem: 17563 +[01:55:26.497841] Epoch: [2] [210/812] lr: 0.000033 grad_norm: 0.4065 (0.4216) closs: 0.9073 (0.9066) time: 1.0863 data: 0.0002 max mem: 17563 +[01:55:37.343158] Epoch: [2] [220/812] lr: 0.000033 grad_norm: 0.4200 (0.4232) closs: 0.9341 (0.9079) time: 1.0863 data: 0.0002 max mem: 17563 +[01:55:48.278472] Epoch: [2] [230/812] lr: 0.000033 grad_norm: 0.4200 (0.4235) closs: 0.9294 (0.9084) time: 1.0890 data: 0.0002 max mem: 17563 +[01:55:59.129091] Epoch: [2] [240/812] lr: 0.000032 grad_norm: 0.4072 (0.4236) closs: 0.8965 (0.9083) time: 1.0892 data: 0.0002 max mem: 17563 +[01:56:10.001149] Epoch: [2] [250/812] lr: 0.000032 grad_norm: 0.3972 (0.4229) closs: 0.8913 (0.9088) time: 1.0861 data: 0.0002 max mem: 17563 +[01:56:20.856739] Epoch: [2] [260/812] lr: 0.000032 grad_norm: 0.4202 (0.4234) closs: 0.8602 (0.9085) time: 1.0863 data: 0.0002 max mem: 17563 +[01:56:31.687201] Epoch: [2] [270/812] lr: 0.000031 grad_norm: 0.4245 (0.4227) closs: 0.8610 (0.9098) time: 1.0842 data: 0.0002 max mem: 17563 +[01:56:42.523986] Epoch: [2] [280/812] lr: 0.000031 grad_norm: 0.4059 (0.4224) closs: 0.9101 (0.9114) time: 1.0833 data: 0.0002 max mem: 17563 +[01:56:53.373471] Epoch: [2] [290/812] lr: 0.000031 grad_norm: 0.4080 (0.4223) closs: 0.8769 (0.9081) time: 1.0842 data: 0.0002 max mem: 17563 +[01:57:04.298846] Epoch: [2] [300/812] lr: 0.000031 grad_norm: 0.4092 (0.4218) closs: 0.8855 (0.9079) time: 1.0887 data: 0.0002 max mem: 17563 +[01:57:15.135896] Epoch: [2] [310/812] lr: 0.000030 grad_norm: 0.4092 (0.4234) closs: 0.9208 (0.9080) time: 1.0880 data: 0.0002 max mem: 17563 
+[01:57:25.967555] Epoch: [2] [320/812] lr: 0.000030 grad_norm: 0.3988 (0.4233) closs: 0.9238 (0.9095) time: 1.0834 data: 0.0002 max mem: 17563 +[01:57:36.782449] Epoch: [2] [330/812] lr: 0.000030 grad_norm: 0.4307 (0.4249) closs: 0.9413 (0.9110) time: 1.0823 data: 0.0002 max mem: 17563 +[01:57:47.653331] Epoch: [2] [340/812] lr: 0.000029 grad_norm: 0.4349 (0.4252) closs: 0.9600 (0.9121) time: 1.0842 data: 0.0002 max mem: 17563 +[01:57:58.502839] Epoch: [2] [350/812] lr: 0.000029 grad_norm: 0.4147 (0.4250) closs: 0.9340 (0.9119) time: 1.0859 data: 0.0002 max mem: 17563 +[01:58:09.451401] Epoch: [2] [360/812] lr: 0.000029 grad_norm: 0.4077 (0.4246) closs: 0.9024 (0.9128) time: 1.0898 data: 0.0002 max mem: 17563 +[01:58:20.271015] Epoch: [2] [370/812] lr: 0.000029 grad_norm: 0.4077 (0.4243) closs: 0.9013 (0.9127) time: 1.0883 data: 0.0002 max mem: 17563 +[01:58:31.122124] Epoch: [2] [380/812] lr: 0.000028 grad_norm: 0.4193 (0.4244) closs: 0.9082 (0.9120) time: 1.0835 data: 0.0002 max mem: 17563 +[01:58:41.965781] Epoch: [2] [390/812] lr: 0.000028 grad_norm: 0.4238 (0.4249) closs: 0.9083 (0.9105) time: 1.0847 data: 0.0002 max mem: 17563 +[01:58:52.791949] Epoch: [2] [400/812] lr: 0.000028 grad_norm: 0.4226 (0.4252) closs: 0.8808 (0.9103) time: 1.0834 data: 0.0002 max mem: 17563 +[01:59:03.649565] Epoch: [2] [410/812] lr: 0.000027 grad_norm: 0.4266 (0.4251) closs: 0.8843 (0.9101) time: 1.0841 data: 0.0002 max mem: 17563 +[01:59:14.539085] Epoch: [2] [420/812] lr: 0.000027 grad_norm: 0.4235 (0.4253) closs: 0.8703 (0.9101) time: 1.0873 data: 0.0002 max mem: 17563 +[01:59:25.436742] Epoch: [2] [430/812] lr: 0.000027 grad_norm: 0.4027 (0.4252) closs: 0.8717 (0.9107) time: 1.0893 data: 0.0002 max mem: 17563 +[01:59:36.307301] Epoch: [2] [440/812] lr: 0.000027 grad_norm: 0.4181 (0.4259) closs: 0.9164 (0.9105) time: 1.0883 data: 0.0002 max mem: 17563 +[01:59:47.106471] Epoch: [2] [450/812] lr: 0.000026 grad_norm: 0.4380 (0.4258) closs: 0.9270 (0.9118) time: 1.0834 data: 0.0002 max mem: 17563 +[01:59:57.969495] Epoch: [2] [460/812] lr: 0.000026 grad_norm: 0.4109 (0.4264) closs: 0.9670 (0.9138) time: 1.0830 data: 0.0002 max mem: 17563 +[02:00:08.857151] Epoch: [2] [470/812] lr: 0.000026 grad_norm: 0.4159 (0.4406) closs: 0.9521 (0.9133) time: 1.0875 data: 0.0002 max mem: 17563 +[02:00:19.720928] Epoch: [2] [480/812] lr: 0.000025 grad_norm: 0.4153 (0.4403) closs: 0.9305 (0.9144) time: 1.0875 data: 0.0002 max mem: 17563 +[02:00:30.663962] Epoch: [2] [490/812] lr: 0.000025 grad_norm: 0.3940 (0.4397) closs: 0.9178 (0.9137) time: 1.0903 data: 0.0002 max mem: 17563 +[02:00:41.489372] Epoch: [2] [500/812] lr: 0.000025 grad_norm: 0.3940 (0.4394) closs: 0.8950 (0.9133) time: 1.0883 data: 0.0002 max mem: 17563 +[02:00:52.333520] Epoch: [2] [510/812] lr: 0.000024 grad_norm: 0.4045 (0.4389) closs: 0.8950 (0.9133) time: 1.0834 data: 0.0002 max mem: 17563 +[02:01:03.175443] Epoch: [2] [520/812] lr: 0.000024 grad_norm: 0.4021 (0.4383) closs: 0.8764 (0.9133) time: 1.0842 data: 0.0002 max mem: 17563 +[02:01:14.038903] Epoch: [2] [530/812] lr: 0.000024 grad_norm: 0.4054 (0.4382) closs: 0.9012 (0.9132) time: 1.0852 data: 0.0002 max mem: 17563 +[02:01:24.892673] Epoch: [2] [540/812] lr: 0.000024 grad_norm: 0.4148 (0.4375) closs: 0.8972 (0.9131) time: 1.0858 data: 0.0002 max mem: 17563 +[02:01:35.779226] Epoch: [2] [550/812] lr: 0.000023 grad_norm: 0.3733 (0.4364) closs: 0.8972 (0.9130) time: 1.0869 data: 0.0002 max mem: 17563 +[02:01:46.730439] Epoch: [2] [560/812] lr: 0.000023 grad_norm: 0.3960 (0.4364) closs: 0.8940 
(0.9132) time: 1.0918 data: 0.0002 max mem: 17563 +[02:01:57.589054] Epoch: [2] [570/812] lr: 0.000023 grad_norm: 0.4338 (0.4367) closs: 0.8873 (0.9125) time: 1.0904 data: 0.0002 max mem: 17563 +[02:02:08.438216] Epoch: [2] [580/812] lr: 0.000022 grad_norm: 0.4207 (0.4365) closs: 0.8640 (0.9114) time: 1.0853 data: 0.0002 max mem: 17563 +[02:02:19.300511] Epoch: [2] [590/812] lr: 0.000022 grad_norm: 0.4164 (0.4366) closs: 0.8945 (0.9105) time: 1.0855 data: 0.0002 max mem: 17563 +[02:02:30.163210] Epoch: [2] [600/812] lr: 0.000022 grad_norm: 0.4080 (0.4360) closs: 0.8987 (0.9106) time: 1.0862 data: 0.0002 max mem: 17563 +[02:02:40.965411] Epoch: [2] [610/812] lr: 0.000022 grad_norm: 0.4122 (0.4359) closs: 0.9033 (0.9105) time: 1.0832 data: 0.0002 max mem: 17563 +[02:02:51.875860] Epoch: [2] [620/812] lr: 0.000021 grad_norm: 0.4149 (0.4355) closs: 0.9033 (0.9103) time: 1.0856 data: 0.0002 max mem: 17563 +[02:03:02.716853] Epoch: [2] [630/812] lr: 0.000021 grad_norm: 0.3996 (0.4353) closs: 0.8798 (0.9102) time: 1.0875 data: 0.0002 max mem: 17563 +[02:03:13.555206] Epoch: [2] [640/812] lr: 0.000021 grad_norm: 0.4151 (0.4353) closs: 0.8893 (0.9102) time: 1.0839 data: 0.0002 max mem: 17563 +[02:03:24.404228] Epoch: [2] [650/812] lr: 0.000021 grad_norm: 0.4151 (0.4359) closs: 0.8497 (0.9087) time: 1.0843 data: 0.0002 max mem: 17563 +[02:03:35.236727] Epoch: [2] [660/812] lr: 0.000020 grad_norm: 0.3994 (0.4354) closs: 0.8437 (0.9080) time: 1.0840 data: 0.0002 max mem: 17563 +[02:03:46.089429] Epoch: [2] [670/812] lr: 0.000020 grad_norm: 0.3948 (0.4353) closs: 0.8686 (0.9075) time: 1.0842 data: 0.0002 max mem: 17563 +[02:03:56.967232] Epoch: [2] [680/812] lr: 0.000020 grad_norm: 0.3992 (0.4350) closs: 0.9080 (0.9081) time: 1.0865 data: 0.0002 max mem: 17563 +[02:04:07.889221] Epoch: [2] [690/812] lr: 0.000019 grad_norm: 0.4095 (0.4347) closs: 0.9402 (0.9083) time: 1.0899 data: 0.0002 max mem: 17563 +[02:04:18.752487] Epoch: [2] [700/812] lr: 0.000019 grad_norm: 0.4252 (0.4347) closs: 0.8932 (0.9080) time: 1.0892 data: 0.0002 max mem: 17563 +[02:04:29.575645] Epoch: [2] [710/812] lr: 0.000019 grad_norm: 0.4252 (0.4346) closs: 0.8679 (0.9075) time: 1.0842 data: 0.0002 max mem: 17563 +[02:04:40.433907] Epoch: [2] [720/812] lr: 0.000019 grad_norm: 0.4004 (0.4343) closs: 0.8577 (0.9075) time: 1.0840 data: 0.0002 max mem: 17563 +[02:04:51.273096] Epoch: [2] [730/812] lr: 0.000018 grad_norm: 0.4196 (0.4342) closs: 0.8849 (0.9083) time: 1.0848 data: 0.0002 max mem: 17563 +[02:05:02.094971] Epoch: [2] [740/812] lr: 0.000018 grad_norm: 0.4285 (0.4345) closs: 0.8671 (0.9082) time: 1.0830 data: 0.0001 max mem: 17563 +[02:05:13.024661] Epoch: [2] [750/812] lr: 0.000018 grad_norm: 0.4137 (0.4345) closs: 0.8708 (0.9081) time: 1.0875 data: 0.0002 max mem: 17563 +[02:05:23.832680] Epoch: [2] [760/812] lr: 0.000018 grad_norm: 0.4091 (0.4344) closs: 0.8956 (0.9075) time: 1.0868 data: 0.0002 max mem: 17563 +[02:05:34.663264] Epoch: [2] [770/812] lr: 0.000017 grad_norm: 0.4201 (0.4345) closs: 0.9148 (0.9082) time: 1.0819 data: 0.0002 max mem: 17563 +[02:05:45.512165] Epoch: [2] [780/812] lr: 0.000017 grad_norm: 0.4217 (0.4345) closs: 0.9572 (0.9087) time: 1.0839 data: 0.0002 max mem: 17563 +[02:05:56.436831] Epoch: [2] [790/812] lr: 0.000017 grad_norm: 0.3903 (0.4340) closs: 0.9227 (0.9091) time: 1.0886 data: 0.0001 max mem: 17563 +[02:06:07.316986] Epoch: [2] [800/812] lr: 0.000017 grad_norm: 0.4042 (0.4339) closs: 0.9078 (0.9094) time: 1.0902 data: 0.0001 max mem: 17563 +[02:06:18.230131] Epoch: [2] [810/812] lr: 
0.000016 grad_norm: 0.4042 (0.4338) closs: 0.9269 (0.9098) time: 1.0896 data: 0.0001 max mem: 17563 +[02:06:19.593496] Epoch: [2] Total time: 0:14:43 +[02:06:19.597975] Averaged stats: lr: 0.000016 grad_norm: 0.4042 (0.4338) closs: 0.9269 (0.9098) +[02:06:19.659399] model saved +[02:06:20.282609] optimizer saved +[02:06:20.283084] other rank-common saved +[02:06:20.284711] rank-specific saved +[02:06:20.288956] log_dir: ./output_dir +[02:06:22.710850] Epoch: [3] [0/812] lr: 0.000016 grad_norm: 0.4176 (0.4176) closs: 0.6748 (0.6748) time: 2.4210 data: 1.3135 max mem: 17563 +[02:06:33.519859] Epoch: [3] [10/812] lr: 0.000016 grad_norm: 0.4176 (0.4223) closs: 0.9734 (0.9533) time: 1.2027 data: 0.1195 max mem: 17563 +[02:06:44.266283] Epoch: [3] [20/812] lr: 0.000016 grad_norm: 0.3989 (0.4313) closs: 0.9278 (0.9248) time: 1.0777 data: 0.0001 max mem: 17563 +[02:06:55.019207] Epoch: [3] [30/812] lr: 0.000016 grad_norm: 0.3960 (0.4187) closs: 0.9133 (0.9231) time: 1.0749 data: 0.0002 max mem: 17563 +[02:07:05.786964] Epoch: [3] [40/812] lr: 0.000015 grad_norm: 0.4012 (0.4199) closs: 0.9042 (0.9126) time: 1.0760 data: 0.0002 max mem: 17563 +[02:07:16.545258] Epoch: [3] [50/812] lr: 0.000015 grad_norm: 0.4237 (0.4220) closs: 0.9111 (0.9140) time: 1.0762 data: 0.0002 max mem: 17563 +[02:07:27.329602] Epoch: [3] [60/812] lr: 0.000015 grad_norm: 0.4194 (0.4208) closs: 0.9097 (0.9181) time: 1.0771 data: 0.0002 max mem: 17563 +[02:07:38.193921] Epoch: [3] [70/812] lr: 0.000015 grad_norm: 0.4148 (0.4200) closs: 0.8976 (0.9157) time: 1.0824 data: 0.0003 max mem: 17563 +[02:07:49.051932] Epoch: [3] [80/812] lr: 0.000014 grad_norm: 0.4067 (0.4203) closs: 0.9098 (0.9146) time: 1.0860 data: 0.0003 max mem: 17563 +[02:07:59.806022] Epoch: [3] [90/812] lr: 0.000014 grad_norm: 0.4169 (0.4256) closs: 0.9106 (0.9134) time: 1.0805 data: 0.0002 max mem: 17563 +[02:08:10.644450] Epoch: [3] [100/812] lr: 0.000014 grad_norm: 0.4417 (0.4269) closs: 0.9029 (0.9120) time: 1.0796 data: 0.0002 max mem: 17563 +[02:08:21.501936] Epoch: [3] [110/812] lr: 0.000014 grad_norm: 0.4361 (0.4298) closs: 0.8539 (0.9054) time: 1.0847 data: 0.0002 max mem: 17563 +[02:08:32.378222] Epoch: [3] [120/812] lr: 0.000013 grad_norm: 0.4314 (0.4294) closs: 0.8574 (0.9092) time: 1.0866 data: 0.0002 max mem: 17563 +[02:08:43.200717] Epoch: [3] [130/812] lr: 0.000013 grad_norm: 0.4217 (0.4289) closs: 0.8647 (0.9055) time: 1.0849 data: 0.0002 max mem: 17563 +[02:08:54.073480] Epoch: [3] [140/812] lr: 0.000013 grad_norm: 0.4077 (0.4268) closs: 0.8831 (0.9097) time: 1.0847 data: 0.0002 max mem: 17563 +[02:09:04.989287] Epoch: [3] [150/812] lr: 0.000013 grad_norm: 0.4010 (0.4264) closs: 0.9527 (0.9097) time: 1.0894 data: 0.0002 max mem: 17563 +[02:09:15.881253] Epoch: [3] [160/812] lr: 0.000012 grad_norm: 0.4010 (0.4247) closs: 0.8847 (0.9097) time: 1.0903 data: 0.0002 max mem: 17563 +[02:09:26.738896] Epoch: [3] [170/812] lr: 0.000012 grad_norm: 0.4141 (0.4242) closs: 0.8847 (0.9086) time: 1.0874 data: 0.0002 max mem: 17563 +[02:09:37.549785] Epoch: [3] [180/812] lr: 0.000012 grad_norm: 0.4163 (0.4252) closs: 0.8829 (0.9072) time: 1.0833 data: 0.0002 max mem: 17563 +[02:09:48.429478] Epoch: [3] [190/812] lr: 0.000012 grad_norm: 0.4051 (0.4250) closs: 0.8713 (0.9066) time: 1.0844 data: 0.0002 max mem: 17563 +[02:09:59.361112] Epoch: [3] [200/812] lr: 0.000012 grad_norm: 0.3976 (0.4232) closs: 0.9178 (0.9084) time: 1.0905 data: 0.0002 max mem: 17563 +[02:10:10.221693] Epoch: [3] [210/812] lr: 0.000011 grad_norm: 0.3976 (0.4228) closs: 0.9029 (0.9076) 
time: 1.0895 data: 0.0002 max mem: 17563 +[02:10:20.991480] Epoch: [3] [220/812] lr: 0.000011 grad_norm: 0.4122 (0.4218) closs: 0.8810 (0.9092) time: 1.0814 data: 0.0002 max mem: 17563 +[02:10:31.909511] Epoch: [3] [230/812] lr: 0.000011 grad_norm: 0.3948 (0.4208) closs: 0.9004 (0.9098) time: 1.0843 data: 0.0002 max mem: 17563 +[02:10:42.753050] Epoch: [3] [240/812] lr: 0.000011 grad_norm: 0.4029 (0.4285) closs: 0.8996 (0.9098) time: 1.0880 data: 0.0002 max mem: 17563 +[02:10:53.652353] Epoch: [3] [250/812] lr: 0.000011 grad_norm: 0.4152 (0.4287) closs: 0.8878 (0.9093) time: 1.0871 data: 0.0002 max mem: 17563 +[02:11:04.489927] Epoch: [3] [260/812] lr: 0.000010 grad_norm: 0.4357 (0.4288) closs: 0.8396 (0.9074) time: 1.0868 data: 0.0002 max mem: 17563 +[02:11:15.354091] Epoch: [3] [270/812] lr: 0.000010 grad_norm: 0.4116 (0.4280) closs: 0.8460 (0.9079) time: 1.0850 data: 0.0002 max mem: 17563 +[02:11:26.291518] Epoch: [3] [280/812] lr: 0.000010 grad_norm: 0.4071 (0.4279) closs: 0.8640 (0.9096) time: 1.0900 data: 0.0002 max mem: 17563 +[02:11:37.145372] Epoch: [3] [290/812] lr: 0.000010 grad_norm: 0.4271 (0.4289) closs: 0.8646 (0.9088) time: 1.0895 data: 0.0002 max mem: 17563 +[02:11:47.992029] Epoch: [3] [300/812] lr: 0.000010 grad_norm: 0.4576 (0.4295) closs: 0.8555 (0.9060) time: 1.0849 data: 0.0002 max mem: 17563 +[02:11:58.846182] Epoch: [3] [310/812] lr: 0.000010 grad_norm: 0.4213 (0.4291) closs: 0.8791 (0.9076) time: 1.0850 data: 0.0004 max mem: 17563 +[02:12:09.703196] Epoch: [3] [320/812] lr: 0.000009 grad_norm: 0.4203 (0.4295) closs: 0.9359 (0.9080) time: 1.0855 data: 0.0004 max mem: 17563 +[02:12:20.668253] Epoch: [3] [330/812] lr: 0.000009 grad_norm: 0.4150 (0.4288) closs: 0.9336 (0.9078) time: 1.0910 data: 0.0002 max mem: 17563 +[02:12:31.500169] Epoch: [3] [340/812] lr: 0.000009 grad_norm: 0.4150 (0.4283) closs: 0.9348 (0.9082) time: 1.0898 data: 0.0002 max mem: 17563 +[02:12:42.261952] Epoch: [3] [350/812] lr: 0.000009 grad_norm: 0.4078 (0.4279) closs: 0.9334 (0.9088) time: 1.0796 data: 0.0002 max mem: 17563 +[02:12:53.174413] Epoch: [3] [360/812] lr: 0.000009 grad_norm: 0.4009 (0.4282) closs: 0.9129 (0.9094) time: 1.0836 data: 0.0002 max mem: 17563 +[02:13:04.026514] Epoch: [3] [370/812] lr: 0.000009 grad_norm: 0.4036 (0.4283) closs: 0.9507 (0.9097) time: 1.0882 data: 0.0002 max mem: 17563 +[02:13:14.864316] Epoch: [3] [380/812] lr: 0.000008 grad_norm: 0.4236 (0.4280) closs: 0.8826 (0.9087) time: 1.0844 data: 0.0002 max mem: 17563 +[02:13:25.695888] Epoch: [3] [390/812] lr: 0.000008 grad_norm: 0.4236 (0.4280) closs: 0.8234 (0.9077) time: 1.0834 data: 0.0002 max mem: 17563 +[02:13:36.522555] Epoch: [3] [400/812] lr: 0.000008 grad_norm: 0.4061 (0.4279) closs: 0.8981 (0.9082) time: 1.0828 data: 0.0002 max mem: 17563 +[02:13:47.446926] Epoch: [3] [410/812] lr: 0.000008 grad_norm: 0.4061 (0.4276) closs: 0.8877 (0.9069) time: 1.0875 data: 0.0002 max mem: 17563 +[02:13:58.297640] Epoch: [3] [420/812] lr: 0.000008 grad_norm: 0.4293 (0.4275) closs: 0.8711 (0.9062) time: 1.0887 data: 0.0002 max mem: 17563 +[02:14:09.250109] Epoch: [3] [430/812] lr: 0.000008 grad_norm: 0.4153 (0.4273) closs: 0.8631 (0.9048) time: 1.0901 data: 0.0002 max mem: 17563 +[02:14:20.064220] Epoch: [3] [440/812] lr: 0.000008 grad_norm: 0.4153 (0.4271) closs: 0.8631 (0.9042) time: 1.0883 data: 0.0002 max mem: 17563 +[02:14:30.897325] Epoch: [3] [450/812] lr: 0.000007 grad_norm: 0.4201 (0.4277) closs: 0.8829 (0.9038) time: 1.0823 data: 0.0002 max mem: 17563 +[02:14:41.784349] Epoch: [3] [460/812] lr: 0.000007 
grad_norm: 0.4253 (0.4280) closs: 0.8706 (0.9030) time: 1.0859 data: 0.0002 max mem: 17563 +[02:14:52.658859] Epoch: [3] [470/812] lr: 0.000007 grad_norm: 0.4084 (0.4283) closs: 0.8706 (0.9027) time: 1.0880 data: 0.0002 max mem: 17563 +[02:15:03.399053] Epoch: [3] [480/812] lr: 0.000007 grad_norm: 0.4084 (0.4292) closs: 0.8693 (0.9023) time: 1.0807 data: 0.0002 max mem: 17563 +[02:15:14.332241] Epoch: [3] [490/812] lr: 0.000007 grad_norm: 0.4107 (0.4287) closs: 0.8744 (0.9026) time: 1.0836 data: 0.0002 max mem: 17563 +[02:15:25.185289] Epoch: [3] [500/812] lr: 0.000007 grad_norm: 0.3996 (0.4288) closs: 0.8780 (0.9021) time: 1.0892 data: 0.0002 max mem: 17563 +[02:15:36.055738] Epoch: [3] [510/812] lr: 0.000007 grad_norm: 0.3932 (0.4287) closs: 0.9032 (0.9035) time: 1.0861 data: 0.0002 max mem: 17563 +[02:15:46.875809] Epoch: [3] [520/812] lr: 0.000007 grad_norm: 0.4266 (0.4289) closs: 0.9807 (0.9043) time: 1.0845 data: 0.0002 max mem: 17563 +[02:15:57.742641] Epoch: [3] [530/812] lr: 0.000006 grad_norm: 0.4378 (0.4295) closs: 0.8574 (0.9035) time: 1.0843 data: 0.0002 max mem: 17563 +[02:16:08.662372] Epoch: [3] [540/812] lr: 0.000006 grad_norm: 0.4378 (0.4297) closs: 0.8314 (0.9034) time: 1.0893 data: 0.0002 max mem: 17563 +[02:16:19.574885] Epoch: [3] [550/812] lr: 0.000006 grad_norm: 0.4160 (0.4300) closs: 0.8906 (0.9025) time: 1.0915 data: 0.0002 max mem: 17563 +[02:16:30.444433] Epoch: [3] [560/812] lr: 0.000006 grad_norm: 0.4111 (0.4297) closs: 0.8671 (0.9019) time: 1.0890 data: 0.0002 max mem: 17563 +[02:16:41.256177] Epoch: [3] [570/812] lr: 0.000006 grad_norm: 0.4156 (0.4300) closs: 0.8896 (0.9025) time: 1.0840 data: 0.0002 max mem: 17563 +[02:16:52.063035] Epoch: [3] [580/812] lr: 0.000006 grad_norm: 0.4278 (0.4300) closs: 0.9103 (0.9030) time: 1.0809 data: 0.0002 max mem: 17563 +[02:17:02.975320] Epoch: [3] [590/812] lr: 0.000006 grad_norm: 0.4202 (0.4300) closs: 0.8944 (0.9028) time: 1.0859 data: 0.0002 max mem: 17563 +[02:17:13.808127] Epoch: [3] [600/812] lr: 0.000006 grad_norm: 0.4237 (0.4300) closs: 0.8329 (0.9024) time: 1.0872 data: 0.0002 max mem: 17563 +[02:17:24.593205] Epoch: [3] [610/812] lr: 0.000006 grad_norm: 0.4493 (0.4302) closs: 0.9092 (0.9032) time: 1.0808 data: 0.0002 max mem: 17563 +[02:17:35.534419] Epoch: [3] [620/812] lr: 0.000006 grad_norm: 0.4336 (0.4304) closs: 0.8930 (0.9035) time: 1.0862 data: 0.0002 max mem: 17563 +[02:17:46.375586] Epoch: [3] [630/812] lr: 0.000006 grad_norm: 0.4340 (0.4311) closs: 0.8824 (0.9027) time: 1.0890 data: 0.0002 max mem: 17563 +[02:17:57.232787] Epoch: [3] [640/812] lr: 0.000006 grad_norm: 0.4289 (0.4309) closs: 0.8533 (0.9024) time: 1.0848 data: 0.0002 max mem: 17563 +[02:18:08.059916] Epoch: [3] [650/812] lr: 0.000005 grad_norm: 0.4289 (0.4312) closs: 0.9116 (0.9035) time: 1.0841 data: 0.0002 max mem: 17563 +[02:18:18.919927] Epoch: [3] [660/812] lr: 0.000005 grad_norm: 0.4250 (0.4311) closs: 0.9397 (0.9042) time: 1.0843 data: 0.0002 max mem: 17563 +[02:18:29.845536] Epoch: [3] [670/812] lr: 0.000005 grad_norm: 0.4118 (0.4311) closs: 0.8926 (0.9034) time: 1.0892 data: 0.0002 max mem: 17563 +[02:18:40.681588] Epoch: [3] [680/812] lr: 0.000005 grad_norm: 0.4088 (0.4308) closs: 0.8901 (0.9040) time: 1.0880 data: 0.0002 max mem: 17563 +[02:18:51.531737] Epoch: [3] [690/812] lr: 0.000005 grad_norm: 0.3880 (0.4305) closs: 0.8910 (0.9038) time: 1.0842 data: 0.0002 max mem: 17563 +[02:19:02.397259] Epoch: [3] [700/812] lr: 0.000005 grad_norm: 0.4102 (0.4306) closs: 0.8910 (0.9042) time: 1.0857 data: 0.0002 max mem: 17563 
+[02:19:13.242914] Epoch: [3] [710/812] lr: 0.000005 grad_norm: 0.4370 (0.4310) closs: 0.9252 (0.9046) time: 1.0855 data: 0.0002 max mem: 17563 +[02:19:24.166388] Epoch: [3] [720/812] lr: 0.000005 grad_norm: 0.4094 (0.4308) closs: 0.9252 (0.9048) time: 1.0884 data: 0.0002 max mem: 17563 +[02:19:35.054899] Epoch: [3] [730/812] lr: 0.000005 grad_norm: 0.4150 (0.4309) closs: 0.8867 (0.9040) time: 1.0905 data: 0.0003 max mem: 17563 +[02:19:45.825930] Epoch: [3] [740/812] lr: 0.000005 grad_norm: 0.4151 (0.4305) closs: 0.8867 (0.9045) time: 1.0829 data: 0.0003 max mem: 17563 +[02:19:56.768129] Epoch: [3] [750/812] lr: 0.000005 grad_norm: 0.4089 (0.4305) closs: 0.8809 (0.9040) time: 1.0856 data: 0.0002 max mem: 17563 +[02:20:07.605720] Epoch: [3] [760/812] lr: 0.000005 grad_norm: 0.4198 (0.4309) closs: 0.8766 (0.9045) time: 1.0889 data: 0.0002 max mem: 17563 +[02:20:18.461056] Epoch: [3] [770/812] lr: 0.000005 grad_norm: 0.4191 (0.4308) closs: 0.9031 (0.9046) time: 1.0846 data: 0.0002 max mem: 17563 +[02:20:29.271451] Epoch: [3] [780/812] lr: 0.000005 grad_norm: 0.4191 (0.4312) closs: 0.8966 (0.9044) time: 1.0832 data: 0.0002 max mem: 17563 +[02:20:40.125154] Epoch: [3] [790/812] lr: 0.000005 grad_norm: 0.4262 (0.4314) closs: 0.8927 (0.9045) time: 1.0831 data: 0.0001 max mem: 17563 +[02:20:51.038156] Epoch: [3] [800/812] lr: 0.000005 grad_norm: 0.4216 (0.4313) closs: 0.9266 (0.9048) time: 1.0883 data: 0.0001 max mem: 17563 +[02:21:01.797044] Epoch: [3] [810/812] lr: 0.000005 grad_norm: 0.4304 (0.4317) closs: 0.9818 (0.9056) time: 1.0835 data: 0.0001 max mem: 17563 +[02:21:03.159536] Epoch: [3] Total time: 0:14:42 +[02:21:03.170294] Averaged stats: lr: 0.000005 grad_norm: 0.4321 (0.4319) closs: 0.9721 (0.9059) +[02:21:03.235300] model saved +[02:21:03.979453] optimizer saved +[02:21:03.980150] other rank-common saved +[02:21:03.983130] rank-specific saved +[02:21:03.983442] Training time 0:58:58
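
Editor's note: besides the binary checkpoints (committed here only as Git LFS pointer stubs), the run leaves two small machine-readable artifacts: the per-epoch JSON lines in log.txt and the three-field pointer files themselves (version / oid / size). The following is a minimal sketch, not part of the training code, assuming the paths added in this diff relative to the repository root:

    import json
    from pathlib import Path

    BASE = Path("finetune/sg/alpaca_llamaPeft_normBias_QF_512_7B")

    def read_metrics(log_path: Path) -> list[dict]:
        # log.txt holds one JSON object per line, one line per epoch.
        with open(log_path) as f:
            return [json.loads(line) for line in f if line.strip()]

    def read_lfs_pointer(pointer_path: Path) -> dict:
        # A Git LFS pointer stub is three "key value" lines: version, oid, size.
        fields = {}
        for line in pointer_path.read_text().splitlines():
            key, _, value = line.partition(" ")
            fields[key] = value
        return fields

    if __name__ == "__main__":
        for row in read_metrics(BASE / "log.txt"):
            print(f"epoch {row['epoch']}: closs={row['train_closs']:.4f} lr={row['train_lr']:.2e}")
        ptr = read_lfs_pointer(BASE / "epoch3" / "consolidated.00-of-01.model.pth")
        print("epoch3 model:", ptr["oid"], f"{int(ptr['size'])} bytes")

The four closs values this prints (0.9724, 0.9194, 0.9098, 0.9059) match the "Averaged stats" lines in output.log above, and the lr column traces the schedule declared in the Namespace: linear warmup across epoch 0 (warmup_epochs=1.0, peaking at lr=5e-05) followed by decay toward min_lr=5e-06 by the end of epoch 3.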